deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/indicator.py

@@ -10,10 +10,9 @@ from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
-from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase
 from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
@@ -74,8 +73,8 @@ def metric_progress_indicator(
 async def measure_metric_task(
     task_id,
     progress,
-    metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
-    test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -156,10 +155,8 @@ async def measure_metric_task(


 async def measure_metrics_with_indicator(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
-    test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
+    test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -238,8 +235,8 @@ async def measure_metrics_with_indicator(


 async def safe_a_measure(
-    metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
-    tc: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    tc: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     ignore_errors: bool,
     skip_on_missing_params: bool,
     progress: Optional[Progress] = None,
deepeval/metrics/json_correctness/json_correctness.py

@@ -11,7 +11,8 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
     initialize_model,
-    trimAndLoadJson,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -46,6 +47,7 @@ class JsonCorrectnessMetric(BaseMetric):
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.expected_schema = expected_schema
+        self.evaluation_model = self.model.get_model_name()

     def measure(
         self,
@@ -55,7 +57,16 @@ class JsonCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -77,7 +88,7 @@ class JsonCorrectnessMetric(BaseMetric):
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
-            except ValidationError as e:
+            except ValidationError:
                 valid_json = False

             self.score = 1 if valid_json else 0
@@ -106,7 +117,16 @@ class JsonCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -120,7 +140,7 @@ class JsonCorrectnessMetric(BaseMetric):
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
-            except ValidationError as e:
+            except ValidationError:
                 valid_json = False

             self.score = 1 if valid_json else 0
@@ -156,22 +176,13 @@ class JsonCorrectnessMetric(BaseMetric):
             is_valid_json=is_valid_json,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=JsonCorrectnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: JsonCorrectnessScoreReason = await self.model.a_generate(
-                    prompt, schema=JsonCorrectnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=JsonCorrectnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def generate_reason(self, actual_output: str) -> str:
         if self.include_reason is False:
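The refactor above repeats across nearly every metric in this release: the inline `using_native_model` / `trimAndLoadJson` branching is replaced by two shared helpers imported from `deepeval.metrics.utils`. Their implementation is not part of this diff; the sketch below reconstructs what the synchronous helper plausibly does, based only on the removed inline code and the call sites (the parameter names come from the call sites, everything else is an assumption):

# Hedged sketch of generate_with_schema_and_extract, inferred from the
# branches it replaces; the real helper lives in deepeval/metrics/utils.py
# and may differ in detail.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Bundled models return (parsed result, cost); cost is accumulated on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed Pydantic object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: fall back to raw text plus JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

The async variant, `a_generate_with_schema_and_extract`, presumably mirrors this with `await metric.model.a_generate(...)`.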
@@ -189,22 +200,13 @@ class JsonCorrectnessMetric(BaseMetric):
             is_valid_json=is_valid_json,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=JsonCorrectnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: JsonCorrectnessScoreReason = self.model.generate(
-                    prompt, schema=JsonCorrectnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=JsonCorrectnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def is_successful(self) -> bool:
         if self.error is not None:
@@ -212,7 +214,7 @@ class JsonCorrectnessMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

deepeval/metrics/json_correctness/template.py

@@ -2,12 +2,22 @@ from typing import Optional


 class JsonCorrectnessTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_reason(
         actual_output: str, expected_schema: str, is_valid_json: bool
     ):
         return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.

+{JsonCorrectnessTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
deepeval/metrics/knowledge_retention/knowledge_retention.py

@@ -5,9 +5,10 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.knowledge_retention.template import (
@@ -51,7 +52,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -101,7 +107,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -147,23 +158,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
-        else:
-            try:
-                res: KnowledgeRetentionScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=KnowledgeRetentionScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason(self) -> str:
         if self.include_reason is False:
@@ -178,21 +179,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
-        else:
-            try:
-                res: KnowledgeRetentionScoreReason = self.model.generate(
-                    prompt, schema=KnowledgeRetentionScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, turns: List[Turn]
@@ -205,7 +198,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -214,22 +207,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 llm_message=turns[i].content,
                 accumulated_knowledge=accumulated_knowledge,
             )
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                verdict = KnowledgeRetentionVerdict(**data)
-            else:
-                try:
-                    verdict: KnowledgeRetentionVerdict = (
-                        await self.model.a_generate(
-                            prompt, schema=KnowledgeRetentionVerdict
-                        )
-                    )
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    verdict = KnowledgeRetentionVerdict(**data)
+            verdict = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -244,7 +228,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -254,20 +238,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 accumulated_knowledge=accumulated_knowledge,
             )

-            if self.using_native_model:
-                res, cost = self.model.generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                verdict = KnowledgeRetentionVerdict(**data)
-            else:
-                try:
-                    verdict: KnowledgeRetentionVerdict = self.model.generate(
-                        prompt, schema=KnowledgeRetentionVerdict
-                    )
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    verdict = KnowledgeRetentionVerdict(**data)
+            verdict = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -289,20 +266,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                     convert_turn_to_dict(turn) for turn in previous_turns
                 ],
             )
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                knowledges[i] = Knowledge(data=data)
-            else:
-                try:
-                    knowledges[i] = await self.model.a_generate(
-                        prompt, schema=Knowledge
-                    )
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    knowledges[i] = Knowledge(data=data)
+            knowledges[i] = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -325,20 +295,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 ],
             )

-            if self.using_native_model:
-                res, cost = self.model.generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                knowledges[i] = Knowledge(data=data)
-            else:
-                try:
-                    knowledges[i] = self.model.generate(
-                        prompt, schema=Knowledge
-                    )
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    knowledges[i] = Knowledge(data=data)
+            knowledges[i] = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -361,8 +324,8 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

deepeval/metrics/knowledge_retention/schema.py

@@ -1,15 +1,21 @@
-from typing import Dict, Optional, Any
-from pydantic import BaseModel
+from typing import Dict, Optional, Union, List
+from pydantic import BaseModel, ConfigDict


 class Knowledge(BaseModel):
-    data: Dict[str, Any]
+    # Each fact’s value is either a string or a list of strings
+    # data: Dict[str, Union[str, List[str]]]
+    data: Dict[str, Union[str, List[str]]] | None = None
+    # Forbid extra top-level fields to satisfy OpenAI’s schema requirements
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionVerdict(BaseModel):
     verdict: str
     reason: Optional[str] = None
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionScoreReason(BaseModel):
     reason: str
+    model_config = ConfigDict(extra="forbid")
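The `model_config = ConfigDict(extra="forbid")` additions line up with the comment in the diff: OpenAI's strict structured-output mode rejects object schemas that allow extra properties. In Pydantic v2, `extra="forbid"` is what puts `"additionalProperties": false` into the generated JSON schema, which this small standalone check (not deepeval code) illustrates:

from typing import Dict, List, Optional, Union

from pydantic import BaseModel, ConfigDict


class Knowledge(BaseModel):
    # Mirrors the schema above: each value is a string or a list of strings.
    data: Optional[Dict[str, Union[str, List[str]]]] = None
    model_config = ConfigDict(extra="forbid")


# extra="forbid" emits "additionalProperties": false, which strict
# structured-output APIs require for every object in the schema.
print(Knowledge.model_json_schema()["additionalProperties"])  # False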
deepeval/metrics/knowledge_retention/template.py

@@ -2,10 +2,20 @@ from typing import List, Dict, Any


 class KnowledgeRetentionTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_reason(attritions, score):
         return f"""Given a list of attritions, which highlights forgetfulness in the LLM response and knowledge established previously in the conversation, use it to CONCISELY provide a reason for the knowledge retention score. Note that The knowledge retention score ranges from 0 - 1, and the higher the better.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
@@ -33,6 +43,8 @@ JSON:

 Your task is to determine whether the LLM message **contradicts** or **forgets** any of the known facts.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 ---
 **Output format:**

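Both template files gain the same `multimodal_rules` class attribute and splice it into their prompts through the f-strings shown above. A toy illustration of the pattern (the surrounding prompt text here is invented; only the rules lines mirror the diff):

class ExampleTemplate:
    # Shared rules block reused by every prompt this template builds.
    multimodal_rules = """
--- MULTIMODAL INPUT RULES ---
- Treat image content as factual evidence.
- Only reference visual details that are explicitly and clearly visible.
"""

    @staticmethod
    def generate_reason(attritions, score):
        # The class attribute is interpolated once per prompt, so the rules
        # live in a single place and every template renders identical wording.
        return f"""Given a list of attritions, provide a reason for the score.

{ExampleTemplate.multimodal_rules}
Attritions: {attritions}
Score: {score}

JSON:"""


print(ExampleTemplate.generate_reason(attritions=[], score="1.00"))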
deepeval/metrics/mcp/mcp_task_completion.py

@@ -7,8 +7,9 @@ from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
@@ -50,7 +51,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -107,7 +113,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -149,48 +160,67 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):

             return self.score

-    def _generate_reason(self, task_scores: List[TaskScore]) -> str:
-        reason = "["
+    def _generate_reason(self, task_scores: List[TaskScore]) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
         for task_score in task_scores:
-            if task_score.score < self.threshold:
-                reason += (
-                    f"\nScore: {task_score.score}\n"
-                    f"Reason: {task_score.reason}\n"
-                )
-        reason += "]"
-        return reason
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )

-    def _get_task_score(self, task: Task) -> TaskScore:
-        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=TaskScore)
+            res, cost = self.model.generate(prompt)
             self.evaluation_cost += cost
             return res
         else:
-            try:
-                res: TaskScore = self.model.generate(prompt, schema=TaskScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return TaskScore(**data)
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, task_scores: List[TaskScore]
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
+        for task_score in task_scores:
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )

-    async def _a_get_task_score(self, task: Task) -> TaskScore:
-        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=TaskScore)
+            res, cost = await self.model.a_generate(prompt)
            self.evaluation_cost += cost
            return res
         else:
-            try:
-                res: TaskScore = await self.model.a_generate(
-                    prompt, schema=TaskScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return TaskScore(**data)
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _get_task_score(self, task: Task) -> TaskScore:
+        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )
+
+    async def _a_get_task_score(self, task: Task) -> TaskScore:
+        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )

     def _get_tasks(self, unit_interactions: List) -> List[Task]:
         tasks = []
@@ -244,9 +274,9 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         return tasks

     def _calculate_score(self, scores: List[TaskScore]) -> float:
-        score_divsor = len(scores) if len(scores) > 0 else 1
+        score_divisor = len(scores) if len(scores) > 0 else 1
         total_score = sum(score.score for score in scores)
-        score = total_score / score_divsor
+        score = total_score / score_divisor
         return 0 if self.strict_mode and score < self.threshold else score

     def is_successful(self) -> bool:
@@ -254,8 +284,8 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
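The `using_native_model` branches that survive the refactor (here in `_generate_reason` / `_a_generate_reason`, and inside the shared helpers) exist because deepeval's bundled judge models return a `(result, cost)` tuple from `generate`, while user-supplied `DeepEvalBaseLLM` subclasses return the result alone. A minimal custom-judge sketch showing that second shape (the wrapped client and its `complete` call are placeholders, not a real SDK):

# Sketch of a custom evaluation model; DeepEvalBaseLLM subclasses implement
# load_model, generate, a_generate and get_model_name, and return the result
# directly rather than a (result, cost) tuple.
from deepeval.models import DeepEvalBaseLLM


class MyCustomJudge(DeepEvalBaseLLM):
    def __init__(self, client):
        self.client = client  # placeholder for whatever LLM SDK you wrap

    def load_model(self):
        return self.client

    def generate(self, prompt: str) -> str:
        # Hypothetical client call; substitute your provider's API here.
        return self.client.complete(prompt)

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return "my-custom-judge"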