deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_relevancy/turn_relevancy.py CHANGED
@@ -11,15 +11,19 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     get_turns_in_sliding_window,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.turn_relevancy.schema import *
+from deepeval.metrics.turn_relevancy.schema import (
+    TurnRelevancyVerdict,
+    TurnRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager


@@ -53,7 +57,12 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -108,7 +117,12 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -148,7 +162,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         )
         return self.score

-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -162,24 +176,19 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         prompt = TurnRelevancyTemplate.generate_reason(
             score=self.score, irrelevancies=irrelevancies
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TurnRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: TurnRelevancyScoreReason = await self.model.a_generate(
-                    prompt, schema=TurnRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]

-    def _generate_reason(self) -> str:
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self) -> Optional[str]:
+        if self.include_reason is False:
+            return None
+
         irrelevancies: List[Dict[str, str]] = []
         for index, verdict in enumerate(self.verdicts):
             if verdict.verdict.strip().lower() == "no":
@@ -190,22 +199,14 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         prompt = TurnRelevancyTemplate.generate_reason(
             score=self.score, irrelevancies=irrelevancies
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=TurnRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: TurnRelevancyScoreReason = self.model.generate(
-                    prompt, schema=TurnRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdict(
         self, turns_sliding_window: List[Turn]
@@ -215,22 +216,14 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                 convert_turn_to_dict(turn) for turn in turns_sliding_window
             ]
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TurnRelevancyVerdict
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: TurnRelevancyVerdict = await self.model.a_generate(
-                    prompt, schema=TurnRelevancyVerdict
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return TurnRelevancyVerdict(**data)
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TurnRelevancyVerdict(**data),
+        )

     def _generate_verdict(
         self, turns_sliding_window: List[Turn]
@@ -240,20 +233,14 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                 convert_turn_to_dict(turn) for turn in turns_sliding_window
             ]
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=TurnRelevancyVerdict)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: TurnRelevancyVerdict = self.model.generate(
-                    prompt, schema=TurnRelevancyVerdict
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return TurnRelevancyVerdict(**data)
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TurnRelevancyVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TurnRelevancyVerdict(**data),
+        )

     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -274,7 +261,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         else:
             try:
                 self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

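The narrowed handler above is not cosmetic: when an earlier step has already failed, self.score is still None, and comparing None against the float threshold raises TypeError, which is the only case the original bare except was absorbing. A minimal sketch of the failure it catches:

# `None >= 0.5` is the comparison is_successful attempts when the
# metric errored before a score was computed.
score = None
try:
    score >= 0.5
except TypeError:
    success = False  # mirrors `self.success = False` in the hunk above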
deepeval/metrics/utils.py CHANGED
@@ -2,16 +2,24 @@ import inspect
 import json
 import re
 import sys
-import itertools
-from typing import Any, Dict, Optional, List, Union, Tuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)

 from deepeval.errors import (
     MissingTestCaseParamsError,
-    MismatchedTestCaseInputsError,
 )
+from deepeval.utils import convert_to_multi_modal_array
 from deepeval.models import (
     DeepEvalBaseLLM,
-    DeepEvalBaseMLLM,
     GPTModel,
     AnthropicModel,
     AzureOpenAIModel,
@@ -22,16 +30,20 @@ from deepeval.models import (
     OllamaEmbeddingModel,
     LocalEmbeddingModel,
     GeminiModel,
-    MultimodalOpenAIModel,
-    MultimodalGeminiModel,
-    MultimodalOllamaModel,
-    MultimodalAzureOpenAIMLLMModel,
     AmazonBedrockModel,
     LiteLLMModel,
     KimiModel,
     GrokModel,
     DeepSeekModel,
 )
+from deepeval.models.llms.constants import (
+    OPENAI_MODELS_DATA,
+    GEMINI_MODELS_DATA,
+    OLLAMA_MODELS_DATA,
+    ANTHROPIC_MODELS_DATA,
+    GROK_MODELS_DATA,
+    KIMI_MODELS_DATA,
+)
 from deepeval.key_handler import (
     ModelKeyValues,
     EmbeddingKeyValues,
@@ -40,16 +52,12 @@ from deepeval.key_handler import (
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
 from deepeval.models.base_model import DeepEvalBaseEmbeddingModel
 from deepeval.test_case import (
-    Turn,
     LLMTestCase,
     LLMTestCaseParams,
-    MLLMTestCase,
-    MLLMTestCaseParams,
     ConversationalTestCase,
     MLLMImage,
     Turn,
@@ -58,12 +66,20 @@ from deepeval.test_case import (
     TurnParams,
 )

+MULTIMODAL_SUPPORTED_MODELS = {
+    GPTModel: OPENAI_MODELS_DATA,
+    GeminiModel: GEMINI_MODELS_DATA,
+    OllamaModel: OLLAMA_MODELS_DATA,
+    AzureOpenAIModel: OPENAI_MODELS_DATA,
+    KimiModel: KIMI_MODELS_DATA,
+    AnthropicModel: ANTHROPIC_MODELS_DATA,
+    GrokModel: GROK_MODELS_DATA,
+}
+

 def copy_metrics(
-    metrics: List[
-        Union[BaseMetric, BaseConversationalMetric, BaseMultimodalMetric]
-    ],
-) -> List[Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]]:
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
+) -> List[Union[BaseMetric, BaseConversationalMetric]]:
     copied_metrics = []
     for metric in metrics:
         metric_class = type(metric)
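The new MULTIMODAL_SUPPORTED_MODELS table keys each provider class to its model-metadata map from deepeval/models/llms/constants.py. The validation helpers below consult it roughly as in this sketch (judge_model is a hypothetical model instance; the lazy-factory probe mirrors the callable() check in the hunks that follow):

# Collect the provider's model names whose metadata advertises
# multimodal support, as the error messages below enumerate them.
provider_data = MULTIMODAL_SUPPORTED_MODELS.get(type(judge_model))
if provider_data is not None:
    multimodal_names = [
        name
        for name, data in provider_data.items()
        if (data() if callable(data) else data).supports_multimodal
    ]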
@@ -201,7 +217,28 @@ def check_conversational_test_case_params(
     test_case_params: List[TurnParams],
     metric: BaseConversationalMetric,
     require_chatbot_role: bool = False,
+    model: Optional[DeepEvalBaseLLM] = None,
+    multimodal: Optional[bool] = False,
 ):
+    if multimodal:
+        if not model or not model.supports_multimodal():
+            if model and type(model) in MULTIMODAL_SUPPORTED_MODELS.keys():
+                valid_multimodal_models = []
+                for model_name, model_data in MULTIMODAL_SUPPORTED_MODELS.get(
+                    type(model)
+                ).items():
+                    if callable(model_data):
+                        model_data = model_data()
+                    if model_data.supports_multimodal:
+                        valid_multimodal_models.append(model_name)
+                raise ValueError(
+                    f"The evaluation model {model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {model.__class__.__name__} provider includes {', '.join(valid_multimodal_models)}."
+                )
+            else:
+                raise ValueError(
+                    f"The evaluation model {model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS.keys()])}"
+                )
+
     if isinstance(test_case, ConversationalTestCase) is False:
         error_str = f"Unable to evaluate test cases that are not of type 'ConversationalTestCase' using the conversational '{metric.__name__}' metric."
         metric.error = error_str
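check_conversational_test_case_params thus gains two trailing parameters, matching the TurnRelevancyMetric call sites earlier in this diff. A sketch of the new call shape (metric, test_case, and judge_model are assumed to exist; the positional order is taken from the hunks above):

check_conversational_test_case_params(
    test_case,
    metric._required_test_case_params,
    metric,
    False,                 # require_chatbot_role, unchanged default
    judge_model,           # new: the evaluation model to capability-check
    test_case.multimodal,  # new: opts the test case into the multimodal gate
)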
@@ -234,8 +271,49 @@
 def check_llm_test_case_params(
     test_case: LLMTestCase,
     test_case_params: List[LLMTestCaseParams],
+    input_image_count: Optional[int],
+    actual_output_image_count: Optional[int],
     metric: Union[BaseMetric, BaseArenaMetric],
+    model: Optional[DeepEvalBaseLLM] = None,
+    multimodal: Optional[bool] = False,
 ):
+    if multimodal:
+        if not model or not model.supports_multimodal():
+            if model and type(model) in MULTIMODAL_SUPPORTED_MODELS.keys():
+                valid_multimodal_models = []
+                for model_name, model_data in MULTIMODAL_SUPPORTED_MODELS.get(
+                    type(model)
+                ).items():
+                    if callable(model_data):
+                        model_data = model_data()
+                    if model_data.supports_multimodal:
+                        valid_multimodal_models.append(model_name)
+                raise ValueError(
+                    f"The evaluation model {model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {model.__class__.__name__} provider includes {', '.join(valid_multimodal_models)}."
+                )
+            else:
+                raise ValueError(
+                    f"The evaluation model {model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS.keys()])}"
+                )
+
+    if input_image_count:
+        count = 0
+        for ele in convert_to_multi_modal_array(test_case.input):
+            if isinstance(ele, MLLMImage):
+                count += 1
+        if count != input_image_count:
+            error_str = f"Can only evaluate test cases with '{input_image_count}' input images using the '{metric.__name__}' metric. `{count}` found."
+            raise ValueError(error_str)
+
+    if actual_output_image_count:
+        count = 0
+        for ele in convert_to_multi_modal_array(test_case.actual_output):
+            if isinstance(ele, MLLMImage):
+                count += 1
+        if count != actual_output_image_count:
+            error_str = f"Unable to evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
+            raise ValueError(error_str)
+
     if isinstance(test_case, LLMTestCase) is False:
         error_str = f"Unable to evaluate test cases that are not of type 'LLMTestCase' using the non-conversational '{metric.__name__}' metric."
         metric.error = error_str
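The image-count checks ported here from check_mllm_test_case_params now pass values through convert_to_multi_modal_array first, which presumably lets a plain-string input fall through with zero MLLMImage elements. The counting rule reduces to this sketch (count_images is a hypothetical helper, not part of the package):

from deepeval.test_case import MLLMImage
from deepeval.utils import convert_to_multi_modal_array

def count_images(value) -> int:
    # Count MLLMImage elements after normalizing str-or-list inputs.
    return sum(
        isinstance(ele, MLLMImage)
        for ele in convert_to_multi_modal_array(value)
    )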
@@ -265,6 +343,8 @@ def check_arena_test_case_params(
     arena_test_case: ArenaTestCase,
     test_case_params: List[LLMTestCaseParams],
     metric: BaseArenaMetric,
+    model: Optional[DeepEvalBaseLLM] = None,
+    multimodal: Optional[bool] = False,
 ):
     if not isinstance(arena_test_case, ArenaTestCase):
         raise ValueError(
@@ -285,73 +365,8 @@
     )

     for test_case in cases:
-        check_llm_test_case_params(test_case, test_case_params, metric)
-
-
-def check_mllm_test_case_params(
-    test_case: MLLMTestCase,
-    test_case_params: List[MLLMTestCaseParams],
-    input_image_count: Optional[int],
-    actual_output_image_count: Optional[int],
-    metric: BaseMetric,
-):
-    if input_image_count:
-        count = 0
-        for ele in test_case.input:
-            if isinstance(ele, MLLMImage):
-                count += 1
-        if count != input_image_count:
-            error_str = f"Can only evaluate test cases with '{input_image_count}' input images using the '{metric.__name__}' metric. `{count}` found."
-            raise ValueError(error_str)
-
-    if actual_output_image_count:
-        count = 0
-        for ele in test_case.actual_output:
-            if isinstance(ele, MLLMImage):
-                count += 1
-        if count != actual_output_image_count:
-            error_str = f"Unable to evaluate test cases with '{actual_output_image_count}' output images using the '{metric.__name__}' metric. `{count}` found."
-            raise ValueError(error_str)
-
-    if isinstance(test_case, MLLMTestCase) is False:
-        error_str = f"Unable to evaluate test cases that are not of type 'MLLMTestCase' using the '{metric.__name__}' metric."
-        metric.error = error_str
-        raise ValueError(error_str)
-
-    missing_params = []
-    for param in test_case_params:
-        if getattr(test_case, param.value) is None:
-            missing_params.append(f"'{param.value}'")
-
-    if missing_params:
-        if len(missing_params) == 1:
-            missing_params_str = missing_params[0]
-        elif len(missing_params) == 2:
-            missing_params_str = " and ".join(missing_params)
-        else:
-            missing_params_str = (
-                ", ".join(missing_params[:-1]) + ", and " + missing_params[-1]
-            )
-
-        error_str = f"{missing_params_str} cannot be None for the '{metric.__name__}' metric"
-        metric.error = error_str
-        raise MissingTestCaseParamsError(error_str)
-
-
-def check_mllm_test_cases_params(
-    test_cases: List[MLLMTestCase],
-    test_case_params: List[MLLMTestCaseParams],
-    input_image_count: Optional[int],
-    actual_output_image_count: Optional[int],
-    metric: BaseMetric,
-):
-    for test_case in test_cases:
-        check_mllm_test_case_params(
-            test_case,
-            test_case_params,
-            input_image_count,
-            actual_output_image_count,
-            metric,
+        check_llm_test_case_params(
+            test_case, test_case_params, None, None, metric, model, multimodal
         )


@@ -381,6 +396,63 @@ def trimAndLoadJson(
         raise Exception(f"An unexpected error occurred: {str(e)}")


+SchemaType = TypeVar("SchemaType")
+ReturnType = TypeVar("ReturnType")
+
+
+def generate_with_schema_and_extract(
+    metric: Union[BaseMetric, BaseArenaMetric, BaseConversationalMetric],
+    prompt: Any,
+    schema_cls: Type[SchemaType],
+    *,
+    extract_schema: Callable[[SchemaType], ReturnType],
+    extract_json: Callable[[Dict[str, Any]], ReturnType],
+) -> ReturnType:
+    """
+    Synchronous wrapper:
+      - calls model.generate_with_schema(...)
+      - accrues cost if applicable
+      - if schema instance -> extract_schema
+        else parse JSON -> extract_json
+    """
+    if metric.using_native_model:
+        result, cost = metric.model.generate_with_schema(
+            prompt, schema=schema_cls
+        )
+        metric._accrue_cost(cost)
+    else:
+        result = metric.model.generate_with_schema(prompt, schema=schema_cls)
+    if isinstance(result, schema_cls):
+        return extract_schema(result)
+    data = trimAndLoadJson(result, metric)
+    return extract_json(data)
+
+
+async def a_generate_with_schema_and_extract(
+    metric: Union[BaseMetric, BaseArenaMetric, BaseConversationalMetric],
+    prompt: Any,
+    schema_cls: Type[SchemaType],
+    *,
+    extract_schema: Callable[[SchemaType], ReturnType],
+    extract_json: Callable[[Dict[str, Any]], ReturnType],
+) -> ReturnType:
+    if metric.using_native_model:
+        result, cost = await metric.model.a_generate_with_schema(
+            prompt, schema=schema_cls
+        )
+        metric._accrue_cost(cost)
+    else:
+        result = await metric.model.a_generate_with_schema(
+            prompt, schema=schema_cls
+        )
+
+    if isinstance(result, schema_cls):
+        return extract_schema(result)
+
+    data = trimAndLoadJson(result, metric)
+    return extract_json(data)
+
+
 ###############################################
 # Default Model Providers
 ###############################################
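These two helpers centralize the native-model cost accrual and the schema-instance-versus-raw-JSON fallback that each metric previously hand-rolled. A usage sketch that mirrors the TurnRelevancyMetric hunks above (self is a metric instance with model and using_native_model set):

verdict = generate_with_schema_and_extract(
    metric=self,
    prompt=prompt,
    schema_cls=TurnRelevancyVerdict,
    extract_schema=lambda s: s,  # model already returned a schema instance
    extract_json=lambda data: TurnRelevancyVerdict(**data),  # raw-JSON fallback
)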
@@ -397,8 +469,8 @@ def should_use_local_model():


 def should_use_ollama_model():
-    base_url = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.LOCAL_MODEL_API_KEY)
-    return base_url == "ollama"
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.LOCAL_MODEL_API_KEY)
+    return value == "ollama"


 def should_use_gemini_model():
@@ -459,7 +531,7 @@
     elif should_use_local_model():
         return LocalModel(), True
     elif should_use_azure_openai():
-        return AzureOpenAIModel(model_name=model), True
+        return AzureOpenAIModel(model=model), True
     elif should_use_moonshot_model():
         return KimiModel(model=model), True
     elif should_use_grok_model():
@@ -501,42 +573,6 @@
 ###############################################


-def initialize_multimodal_model(
-    model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-) -> Tuple[DeepEvalBaseLLM, bool]:
-    """
-    Returns a tuple of (initialized DeepEvalBaseMLLM, using_native_model boolean)
-    """
-    if is_native_mllm(model):
-        return model, True
-    if isinstance(model, DeepEvalBaseMLLM):
-        return model, False
-    if should_use_gemini_model():
-        return MultimodalGeminiModel(), True
-    if should_use_ollama_model():
-        return MultimodalOllamaModel(), True
-    elif should_use_azure_openai():
-        return MultimodalAzureOpenAIMLLMModel(model_name=model), True
-    elif isinstance(model, str) or model is None:
-        return MultimodalOpenAIModel(model=model), True
-    raise TypeError(
-        f"Unsupported type for model: {type(model)}. Expected None, str, DeepEvalBaseMLLM, MultimodalOpenAIModel, MultimodalOllamaModel."
-    )
-
-
-def is_native_mllm(
-    model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-) -> bool:
-    if (
-        isinstance(model, MultimodalOpenAIModel)
-        or isinstance(model, MultimodalOllamaModel)
-        or isinstance(model, MultimodalGeminiModel)
-    ):
-        return True
-    else:
-        return False
-
-
 ###############################################
 # Embedding Model
 ###############################################
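With initialize_multimodal_model and is_native_mllm removed, multimodal judges appear to flow through the ordinary initialize_model path and are vetted by the supports_multimodal() probe used in the new checks. A hedged migration sketch (the model name is illustrative only):

from deepeval.metrics.utils import initialize_model

model, using_native_model = initialize_model("gpt-4.1")
if not model.supports_multimodal():  # capability probe added in this release
    raise ValueError("Pick a multimodal-capable judge for image test cases.")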
deepeval/models/__init__.py CHANGED
@@ -1,7 +1,6 @@
 from deepeval.models.base_model import (
     DeepEvalBaseModel,
     DeepEvalBaseLLM,
-    DeepEvalBaseMLLM,
     DeepEvalBaseEmbeddingModel,
 )
 from deepeval.models.llms import (
@@ -17,12 +16,6 @@ from deepeval.models.llms import (
     GrokModel,
     DeepSeekModel,
 )
-from deepeval.models.mlllms import (
-    MultimodalOpenAIModel,
-    MultimodalOllamaModel,
-    MultimodalGeminiModel,
-    MultimodalAzureOpenAIMLLMModel,
-)
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
     AzureOpenAIEmbeddingModel,
@@ -33,7 +26,6 @@ from deepeval.models.embedding_models import (
 __all__ = [
     "DeepEvalBaseModel",
     "DeepEvalBaseLLM",
-    "DeepEvalBaseMLLM",
     "DeepEvalBaseEmbeddingModel",
     "GPTModel",
     "AzureOpenAIModel",
@@ -46,10 +38,6 @@ __all__ = [
     "KimiModel",
     "GrokModel",
     "DeepSeekModel",
-    "MultimodalOpenAIModel",
-    "MultimodalOllamaModel",
-    "MultimodalGeminiModel",
-    "MultimodalAzureOpenAIMLLMModel",
    "OpenAIEmbeddingModel",
    "AzureOpenAIEmbeddingModel",
    "LocalEmbeddingModel",