deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
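The largest structural changes in the manifest above are the move of the prompt-optimization code from deepeval/optimization/ to deepeval/optimizer/ (with the per-algorithm loop.py modules renamed to algorithm modules) and the removal of the multimodal_metrics and mlllms packages in favour of the new turn_* metrics. A minimal sketch of the module-path implications, inferred only from the file moves listed above; the packages' __init__.py files may re-export names under shorter public paths, so treat these module paths as assumptions rather than the documented API:

# Module paths inferred from the file list above (assumptions, not documented API).
#
# 3.7.4 layout (removed)                      3.7.6 layout (added/moved)
# deepeval/optimization/prompt_optimizer.py -> deepeval/optimizer/prompt_optimizer.py
# deepeval/optimization/gepa/loop.py        -> deepeval/optimizer/algorithms/gepa/gepa.py
# deepeval/optimization/copro/loop.py       -> deepeval/optimizer/algorithms/copro/copro.py
# deepeval/optimization/simba/loop.py       -> deepeval/optimizer/algorithms/simba/simba.py
# deepeval/optimization/miprov2/loop.py     -> deepeval/optimizer/algorithms/miprov2/miprov2.py

import importlib

for module_path in (
    "deepeval.optimizer.prompt_optimizer",
    "deepeval.optimizer.algorithms.gepa.gepa",
    "deepeval.optimizer.algorithms.copro.copro",
    "deepeval.optimizer.algorithms.simba.simba",
    "deepeval.optimizer.algorithms.miprov2.miprov2",
):
    # Importable on 3.7.6 per the file list; raises ModuleNotFoundError on 3.7.4.
    importlib.import_module(module_path)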
deepeval/metrics/task_completion/task_completion.py

@@ -1,11 +1,12 @@
 from typing import Optional, List, Tuple, Union, Dict
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -15,7 +16,11 @@ from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.task_completion.template import TaskCompletionTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.task_completion.schema import *
+from deepeval.metrics.task_completion.schema import (
+    TaskAndOutcome,
+    TaskCompletionVerdict,
+)
+from deepeval.metrics.api import metric_data_manager
 
 
 class TaskCompletionMetric(BaseMetric):
@@ -23,7 +28,6 @@ class TaskCompletionMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]
 
     def __init__(
@@ -58,9 +62,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -91,6 +101,12 @@ class TaskCompletionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
             return self.score
 
     async def a_measure(
@@ -100,9 +116,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -127,6 +149,12 @@ class TaskCompletionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
            return self.score
 
     async def _a_generate_verdicts(self) -> Tuple:
@@ -134,44 +162,26 @@ class TaskCompletionMetric(BaseMetric):
             task=self.task,
             actual_outcome=self.outcome,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            self.evaluation_cost += cost
-            return res.verdict, res.reason
-        else:
-            try:
-                res: TaskCompletionVerdict = await self.model.a_generate(
-                    prompt, schema=TaskCompletionVerdict
-                )
-                return res.verdict, res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["verdict"], data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     def _generate_verdicts(self) -> Tuple:
         prompt = TaskCompletionTemplate.generate_verdict(
             task=self.task,
             actual_outcome=self.outcome,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            self.evaluation_cost += cost
-            return res.verdict, res.reason
-        else:
-            try:
-                res: TaskCompletionVerdict = self.model.generate(
-                    prompt, schema=TaskCompletionVerdict
-                )
-                return res.verdict, res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["verdict"], data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     async def _a_extract_task_and_outcome(
         self,
@@ -189,22 +199,13 @@ class TaskCompletionMetric(BaseMetric):
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TaskAndOutcome
-            )
-            self.evaluation_cost += cost
-            return res.task, res.outcome
-        else:
-            try:
-                res: TaskAndOutcome = await self.model.a_generate(
-                    prompt, schema=TaskAndOutcome
-                )
-                return res.task, res.outcome
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"], data["outcome"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _extract_task_and_outcome(
         self,
@@ -222,20 +223,13 @@ class TaskCompletionMetric(BaseMetric):
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=TaskAndOutcome)
-            self.evaluation_cost += cost
-            return res.task, res.outcome
-        else:
-            try:
-                res: TaskAndOutcome = self.model.generate(
-                    prompt, schema=TaskAndOutcome
-                )
-                return res.task, res.outcome
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"], data["outcome"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _calculate_score(self):
         return (
@@ -250,7 +244,7 @@ class TaskCompletionMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/metrics/tool_correctness/tool_correctness.py

@@ -1,13 +1,14 @@
-from typing import List, Dict, Optional, Union
+from typing import List, Dict, Optional, Union, Tuple
 
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
-    trimAndLoadJson,
     initialize_model,
     print_tools_called,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
@@ -62,7 +63,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
         self.test_case = test_case
         self.evaluation_cost = 0 if self.using_native_model else None
 
@@ -83,18 +92,16 @@ class ToolCorrectnessMetric(BaseMetric):
             self.tools_called: List[ToolCall] = test_case.tools_called
             self.expected_tools: List[ToolCall] = test_case.expected_tools
             tool_calling_score = self._calculate_score()
-            if self.available_tools:
+            if self.available_tools and not test_case.multimodal:
                 tool_selection_score = self._get_tool_selection_score(
                     test_case.input,
                     test_case.tools_called,
                     self.available_tools,
                 )
             else:
-                tool_selection_score = tool_selection_score = (
-                    ToolSelectionScore(
-                        score=1,
-                        reason="No available tools were provided to assess tool selection criteria",
-                    )
+                tool_selection_score = ToolSelectionScore(
+                    score=1,
+                    reason="No available tools were provided to assess tool selection criteria",
                 )
             score = min(tool_calling_score, tool_selection_score.score)
             self.score = (
@@ -165,7 +172,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -177,7 +192,7 @@ class ToolCorrectnessMetric(BaseMetric):
             self.tools_called: List[ToolCall] = test_case.tools_called
             self.expected_tools: List[ToolCall] = test_case.expected_tools
             tool_calling_score = self._calculate_score()
-            if self.available_tools:
+            if self.available_tools and not test_case.multimodal:
                 tool_selection_score = await self._a_get_tool_selection_score(
                     test_case.input,
                     test_case.tools_called,
@@ -324,18 +339,13 @@ class ToolCorrectnessMetric(BaseMetric):
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = self.model.generate(prompt, schema=ToolSelectionScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self, user_input, tools_called, available_tools
@@ -345,25 +355,16 @@ class ToolCorrectnessMetric(BaseMetric):
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = await self.model.a_generate(
-                    prompt, schema=ToolSelectionScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     # Calculate score
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         if self.should_exact_match:
             score = self._calculate_exact_match_score()
         elif self.should_consider_ordering:
@@ -382,7 +383,7 @@ class ToolCorrectnessMetric(BaseMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     # Exact matching score
-    def _calculate_exact_match_score(self):
+    def _calculate_exact_match_score(self) -> float:
         if len(self.tools_called) != len(self.expected_tools):
             return 0.0
         if (
@@ -405,7 +406,7 @@ class ToolCorrectnessMetric(BaseMetric):
         return 1.0
 
     # Non exact matching score
-    def _calculate_non_exact_match_score(self):
+    def _calculate_non_exact_match_score(self) -> float:
         total_score = 0.0
         matched_called_tools = set()
         for expected_tool in self.expected_tools:
@@ -445,7 +446,7 @@ class ToolCorrectnessMetric(BaseMetric):
         )
 
     # Consider ordering score
-    def _compute_weighted_lcs(self):
+    def _compute_weighted_lcs(self) -> Tuple[List[ToolCall], float]:
         m, n = len(self.expected_tools), len(self.tools_called)
         dp = [[0.0] * (n + 1) for _ in range(m + 1)]
         for i in range(1, m + 1):
deepeval/metrics/tool_use/tool_use.py

@@ -3,11 +3,11 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
-    print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     ConversationalTestCase,
@@ -61,7 +61,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -136,7 +141,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -206,22 +216,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ArgumentCorrectnessScore = self.model.generate(
-                    prompt, schema=ArgumentCorrectnessScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgumentCorrectnessScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     async def _a_get_argument_correctness_score(
         self,
@@ -233,22 +234,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ArgumentCorrectnessScore = await self.model.a_generate(
-                    prompt, schema=ArgumentCorrectnessScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgumentCorrectnessScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     def _get_tool_selection_score(
         self,
@@ -260,20 +252,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ToolSelectionScore = self.model.generate(
-                    prompt, schema=ToolSelectionScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self,
@@ -285,22 +270,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ToolSelectionScore = await self.model.a_generate(
-                    prompt, schema=ToolSelectionScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     def _get_user_input_and_turns(
         self,
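All three metric files above replace the same inline block (native model returning a (result, cost) tuple, custom model with a schema attempt and a trimAndLoadJson fallback) with the two helpers now imported from deepeval.metrics.utils. Their bodies are not shown in these hunks (deepeval/metrics/utils.py changes by +158 -122 in the manifest); the following is a minimal sketch of what the synchronous variant presumably consolidates, reconstructed only from the call sites and the inline code it replaces, so everything beyond the keyword arguments visible above is an assumption:

# Sketch only - not the actual deepeval.metrics.utils implementation.
from typing import Any, Callable, Dict, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # existing 3.7.4 helper, assumed still available


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[Dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost); the cost is accumulated on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: fall back to raw text plus JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

The async variant would mirror this with await metric.model.a_generate(...); each metric then supplies only the prompt, the schema class, and the two extraction lambdas, which is exactly what the rewritten call sites in the hunks above do.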
deepeval/metrics/topic_adherence/template.py

@@ -3,6 +3,13 @@ import textwrap
 
 
 class TopicAdherenceTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def get_qa_pairs(
@@ -19,6 +26,8 @@ class TopicAdherenceTemplate:
         Do not infer information beyond what is stated. Ignore irrelevant or conversational turns (e.g. greetings, affirmations) that do not constitute clear QA pairs.
         If there are multiple questions and multiple answers in a single sentence, break them into separate pairs. Each pair must be standalone, and should not contain more than one question or response.
 
+        {TopicAdherenceTemplate.multimodal_rules}
+
         OUTPUT Format:
         Return a **JSON object** with a single 2 keys:
         - `"question"`: the user's question
@@ -82,6 +91,8 @@ class TopicAdherenceTemplate:
         3. Based on both relevance and correctness, assign one of four possible verdicts.
         4. Give a simple, comprehensive reason explaining why this question-answer pair was assigned this verdict
 
+        {TopicAdherenceTemplate.multimodal_rules}
+
         VERDICTS:
         - `"TP"` (True Positive): Question is relevant and the response correctly answers it.
         - `"FN"` (False Negative): Question is relevant, but the assistant refused to answer or gave an irrelevant response.
@@ -138,6 +149,8 @@ class TopicAdherenceTemplate:
 
         Your task is to go through these reasons and give a single final explaination that clearly explains why this metric has failed or passed.
 
+        {TopicAdherenceTemplate.multimodal_rules}
+
         Pass: {success}
         Score: {score}
         Threshold: {threshold}
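The {TopicAdherenceTemplate.multimodal_rules} placeholders added above are resolved only because these prompt bodies are f-strings, the same mechanism that fills {success}, {score}, and {threshold}. A tiny illustrative sketch of the pattern; the class and method names below are made up for illustration and are not the real template API:

# Illustrative pattern only; ExampleTemplate is hypothetical, not deepeval's API.
class ExampleTemplate:
    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Treat image content as factual evidence.
    """

    @staticmethod
    def generate_final_reason(success: bool, score: float, threshold: float) -> str:
        # In an f-string, {ExampleTemplate.multimodal_rules} is substituted at call
        # time, exactly like the {success}/{score}/{threshold} placeholders.
        return f"""
        {ExampleTemplate.multimodal_rules}

        Pass: {success}
        Score: {score}
        Threshold: {threshold}
        """


print(ExampleTemplate.generate_final_reason(True, 0.9, 0.5))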