deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
--- a/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
+++ b/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
@@ -1,7 +1,7 @@
 from typing import Optional, List, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import MLLMTestCase
 from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
     MultiModalContextualPrecisionTemplate,
 )
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.test_case import LLMTestCaseParams
 from deepeval.models import DeepEvalBaseMLLM
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema import *
+import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
 from deepeval.metrics.indicator import metric_progress_indicator
 
 
@@ -49,6 +49,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -56,7 +57,9 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -65,10 +68,11 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
-                self.verdicts: List[ContextualPrecisionVerdict] = (
+                self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                     self._generate_verdicts(
                         test_case.input,
                         test_case.expected_output,
@@ -93,6 +97,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -105,7 +110,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -125,12 +130,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         return self.score
 
-    async def _a_generate_reason(self, input: str):
+    async def _a_generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(
@@ -141,15 +146,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=MultimodelContextualPrecisionScoreReason
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt, schema=MultimodelContextualPrecisionScoreReason
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason
@@ -158,12 +165,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 data = trimAndLoadJson(res, self)
                 return data["reason"]
 
-    def _generate_reason(self, input: str):
+    def _generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(
@@ -174,15 +181,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=MultimodelContextualPrecisionScoreReason
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     self.model.generate(
-                        prompt, schema=MultimodelContextualPrecisionScoreReason
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason
@@ -193,21 +202,23 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            res, cost = await self.model.a_generate(
+                prompt, schema=mcpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: mcpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=mcpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
@@ -215,34 +226,36 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
 
     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                res: mcpschema.Verdicts = self.model.generate(
+                    prompt, schema=mcpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
@@ -279,7 +292,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
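Two changes recur throughout this file and the rest of the 3.6.8 diff: schema models are now reached through a module alias (mcpschema.Verdicts instead of a wildcard-imported Verdicts, so identically named models in other metrics' schema modules can no longer shadow each other), and a private _log_metric_to_confident flag is threaded from measure() into a_measure(). A minimal, self-contained sketch of that forwarding pattern; the class and names below are illustrative, not deepeval's actual implementation (deepeval reuses an event loop via get_or_create_event_loop() where this sketch uses asyncio.run):

import asyncio


class SketchMetric:
    def __init__(self, async_mode: bool = True):
        self.async_mode = async_mode
        self.score = None

    def measure(self, test_case, _log_metric_to_confident: bool = True) -> float:
        if self.async_mode:
            # Forward every private flag into the async path; otherwise
            # a_measure silently falls back to its default and the caller's
            # choice is lost, which is the regression the hunks above avoid.
            return asyncio.run(
                self.a_measure(
                    test_case,
                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        return self._score(test_case, _log_metric_to_confident)

    async def a_measure(
        self, test_case, _log_metric_to_confident: bool = True
    ) -> float:
        return self._score(test_case, _log_metric_to_confident)

    def _score(self, test_case, log: bool) -> float:
        self.score = 1.0 if test_case else 0.0
        if log:
            print("would post metric data here")
        return self.score


print(SketchMetric().measure("hello"))  # 1.0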
--- a/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
+++ b/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
@@ -48,6 +48,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
--- a/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
+++ b/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
@@ -49,6 +49,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -65,6 +66,7 @@
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
--- a/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py
+++ b/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py
@@ -53,6 +53,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -71,6 +72,7 @@
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -97,6 +99,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
--- a/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
+++ b/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
@@ -78,6 +78,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
 
@@ -96,6 +97,7 @@
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                         _additional_context=_additional_context,
                     )
                 )
@@ -132,6 +134,7 @@
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_mllm_test_case_params(
--- a/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
+++ b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
@@ -3,7 +3,7 @@ from typing import List, Dict
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    check_llm_test_case_params,
+    check_mllm_test_case_params,
 )
 from deepeval.test_case import (
     MLLMTestCase,
@@ -11,10 +11,10 @@ from deepeval.test_case import (
     ToolCallParams,
     ToolCall,
 )
-from deepeval.metrics import BaseMetric
+from deepeval.metrics import BaseMultimodalMetric
 
 
-class MultimodalToolCorrectnessMetric(BaseMetric):
+class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):
 
     _required_params: List[MLLMTestCaseParams] = [
         MLLMTestCaseParams.INPUT,
@@ -46,8 +46,11 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_mllm_test_case_params(
+            test_case, self._required_params, None, None, self
+        )
         self.test_case = test_case
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -90,11 +93,13 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,
             _show_indicator=_show_indicator,
             _in_component=_in_component,
+            _log_metric_to_confident=_log_metric_to_confident,
         )
 
     ##################################################
@@ -278,7 +283,7 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Multi Modal Tool Correctness"
 
     def indent_multiline_string(self, s, indent_level=4):
         indent = " " * indent_level
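The substantive fix in this file: the metric inherited from BaseMetric and validated its MLLMTestCase with check_llm_test_case_params, the checker meant for plain LLMTestCase; both are switched to their multimodal counterparts. A rough, hypothetical simplification of what a required-params check of this shape does (deepeval's real check_mllm_test_case_params takes additional image-related arguments, passed as None in the hunks above):

from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional


class MLLMTestCaseParams(Enum):
    INPUT = "input"
    ACTUAL_OUTPUT = "actual_output"
    TOOLS_CALLED = "tools_called"


@dataclass
class MLLMTestCase:
    input: Optional[str] = None
    actual_output: Optional[str] = None
    tools_called: List[str] = field(default_factory=list)


def check_mllm_test_case_params(
    test_case: MLLMTestCase, required: List[MLLMTestCaseParams]
) -> None:
    # Reject the test case if any required attribute is unset or empty.
    missing = [p.value for p in required if not getattr(test_case, p.value)]
    if missing:
        raise ValueError(f"MLLMTestCase is missing required params: {missing}")


# Passes; an LLM-oriented checker would reject the MLLMTestCase type outright.
check_mllm_test_case_params(
    MLLMTestCase(input="q", actual_output="a", tools_called=["search"]),
    [MLLMTestCaseParams.INPUT, MLLMTestCaseParams.TOOLS_CALLED],
)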
--- a/deepeval/metrics/non_advice/non_advice.py
+++ b/deepeval/metrics/non_advice/non_advice.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.non_advice.template import NonAdviceTemplate
 from deepeval.metrics.non_advice.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class NonAdviceMetric(BaseMetric):
@@ -58,6 +59,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -93,6 +96,10 @@
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -101,6 +108,7 @@
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -129,6 +137,10 @@
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
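Both added if _log_metric_to_confident: blocks call into deepeval/metrics/api.py, a file this diff introduces (entry 24 in the list above) but does not display. A hedged sketch of what a "post if enabled" gate of this shape typically looks like; only the manager name and the call signature come from the hunks, while the body and the environment variable are assumptions:

import os


class MetricDataManager:
    def post_metric_if_enabled(self, metric, test_case=None) -> None:
        # Assumed opt-in condition; the real gate lives in deepeval/metrics/api.py.
        if not os.getenv("CONFIDENT_API_KEY"):
            return
        # The real implementation would send score/reason to Confident AI.
        print(f"posting {type(metric).__name__}: score={metric.score}")


metric_data_manager = MetricDataManager()

Callers that want a score without the upload can pass metric.measure(test_case, _log_metric_to_confident=False); that per-call opt-out is exactly the flag these hunks thread through every metric.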
--- a/deepeval/metrics/pii_leakage/pii_leakage.py
+++ b/deepeval/metrics/pii_leakage/pii_leakage.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.pii_leakage.template import PIILeakageTemplate
 from deepeval.metrics.pii_leakage.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class PIILeakageMetric(BaseMetric):
@@ -49,6 +50,7 @@
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -64,6 +66,7 @@
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -92,6 +99,7 @@
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -120,7 +128,10 @@
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
--- /dev/null
+++ b/deepeval/metrics/plan_adherence/__init__.py
@@ -0,0 +1 @@
+from .plan_adherence import PlanAdherenceMetric
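This one-line __init__.py exposes the new metric at its package path (requires deepeval >= 3.6.8):

# Import path introduced by the hunk above. The +13 lines in
# deepeval/metrics/__init__.py (entry 22 in the list) suggest a top-level
# re-export as well, but only this path is visible in the diff.
from deepeval.metrics.plan_adherence import PlanAdherenceMetric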