deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0

deepeval/metrics/indicator.py

@@ -100,6 +100,7 @@ async def measure_metric_task(
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=False,
  )
  finish_text = "Done"
  except MissingTestCaseParamsError as e:
@@ -116,7 +117,9 @@ async def measure_metric_task(
  except TypeError:
  try:
  await metric.a_measure(
- test_case, _in_component=_in_component
+ test_case,
+ _in_component=_in_component,
+ _log_metric_to_confident=False,
  )
  finish_text = "Done"
  except MissingTestCaseParamsError as e:
@@ -241,7 +244,10 @@ async def safe_a_measure(
  ):
  try:
  await metric.a_measure(
- tc, _show_indicator=False, _in_component=_in_component
+ tc,
+ _show_indicator=False,
+ _in_component=_in_component,
+ _log_metric_to_confident=False,
  )
  update_pbar(progress, pbar_eval_id)
  except MissingTestCaseParamsError as e:
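
The indicator.py hunks above have deepeval's internal runners pass _log_metric_to_confident=False so that metrics scored inside a batched evaluation are not posted twice, while the per-metric hunks below add the same keyword (defaulting to True) and call metric_data_manager.post_metric_if_enabled(...) when it stays on. A minimal sketch of what this surface looks like from user code, assuming AnswerRelevancyMetric in 3.6.8 follows the same pattern as the metrics shown in this diff and that an evaluation model is already configured:

```python
# Sketch only: assumes AnswerRelevancyMetric in deepeval 3.6.8 accepts the
# _log_metric_to_confident keyword added to the metrics shown in this diff,
# and that an evaluation model (e.g. OPENAI_API_KEY) is already configured.
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

metric = AnswerRelevancyMetric(threshold=0.7)

# Default (True): after scoring, the result may be posted via
# metric_data_manager.post_metric_if_enabled(...) when Confident logging is enabled.
metric.measure(test_case)

# Opt out for a single standalone call, mirroring what indicator.py now does
# internally for metrics that run inside a batched evaluation.
metric.measure(test_case, _log_metric_to_confident=False)

print(metric.score, metric.reason)
```
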
deepeval/metrics/json_correctness/json_correctness.py

@@ -18,6 +18,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
  from deepeval.metrics.json_correctness.schema import JsonCorrectnessScoreReason
  from deepeval.utils import get_or_create_event_loop
+ from deepeval.metrics.api import metric_data_manager

  DEFAULT_CORRECT_REASON = "The generated Json matches and is syntactically correct to the expected schema."

@@ -51,6 +52,7 @@ class JsonCorrectnessMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -66,6 +68,7 @@ class JsonCorrectnessMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -88,6 +91,10 @@ class JsonCorrectnessMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -96,6 +103,7 @@ class JsonCorrectnessMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -126,7 +134,10 @@ class JsonCorrectnessMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_generate_reason(self, actual_output: str) -> str:

deepeval/metrics/knowledge_retention/knowledge_retention.py

@@ -20,6 +20,7 @@ from deepeval.metrics.knowledge_retention.schema import (
  KnowledgeRetentionScoreReason,
  )
  from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.api import metric_data_manager


  class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -47,6 +48,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -63,6 +65,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -84,6 +87,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_measure(
@@ -91,6 +98,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -120,6 +128,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self) -> str:

deepeval/metrics/mcp/mcp_task_completion.py

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.mcp.schema import Task, TaskScore
  from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
  from deepeval.errors import MissingTestCaseParamsError
+ from deepeval.metrics.api import metric_data_manager


  class MCPTaskCompletionMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -90,6 +93,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def a_measure(
@@ -97,6 +104,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -104,7 +112,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, async_mode=True, _show_indicator=_show_indicator
+ self,
+ async_mode=True,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if not test_case.mcp_servers:
  error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MCPTaskCompletionMetric' metric."
@@ -131,6 +142,11 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+
  return self.score

  def _generate_reason(self, task_scores: List[TaskScore]) -> str:
@@ -228,8 +244,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  return tasks

  def _calculate_score(self, scores: List[TaskScore]) -> float:
+ score_divsor = len(scores) if len(scores) > 0 else 1
  total_score = sum(score.score for score in scores)
- return total_score / len(scores)
+ score = total_score / score_divsor
+ return 0 if self.strict_mode and score < self.threshold else score

  def is_successful(self) -> bool:
  if self.error is not None:
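
The _calculate_score hunk above now guards against an empty task list and, in strict mode, clamps a sub-threshold average to 0. A self-contained sketch of that arithmetic, using a simplified stand-in for TaskScore rather than the real deepeval classes:

```python
# Illustration only: mirrors the updated MCPTaskCompletionMetric._calculate_score
# logic with simplified stand-ins (not the real deepeval classes).
from dataclasses import dataclass
from typing import List


@dataclass
class TaskScore:  # stand-in for deepeval.metrics.mcp.schema.TaskScore
    score: float


def calculate_score(scores: List[TaskScore], strict_mode: bool, threshold: float) -> float:
    divisor = len(scores) if len(scores) > 0 else 1  # avoids ZeroDivisionError on an empty list
    score = sum(s.score for s in scores) / divisor
    # strict mode now clamps a sub-threshold average to 0 instead of returning the raw mean
    return 0 if strict_mode and score < threshold else score


print(calculate_score([], strict_mode=False, threshold=0.5))                               # 0.0, no crash
print(calculate_score([TaskScore(0.5), TaskScore(0.5)], strict_mode=True, threshold=0.7))  # 0
print(calculate_score([TaskScore(1.0), TaskScore(0.5)], strict_mode=True, threshold=0.7))  # 0.75
```
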
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
  from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
  from deepeval.errors import MissingTestCaseParamsError
+ from deepeval.metrics.api import metric_data_manager


  class MultiTurnMCPUseMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -102,6 +105,11 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+
  return self.score

  async def a_measure(
@@ -109,6 +117,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  test_case: ConversationalTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ):
  check_conversational_test_case_params(
  test_case, self._required_test_case_params, self
@@ -116,7 +125,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, async_mode=True, _show_indicator=_show_indicator
+ self,
+ async_mode=True,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if not test_case.mcp_servers:
  error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MultiTurnMCPUseMetric' metric."
@@ -161,6 +173,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  f"Score: {self.score}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  def _get_tool_accuracy_score(
@@ -299,13 +315,20 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  tool_accuracy_score: List[ToolScore],
  args_accuracy_score: List[ArgsScore],
  ) -> float:
- tool_score = sum(score.score for score in tool_accuracy_score) / len(
- tool_accuracy_score
+ tool_divisor = (
+ len(tool_accuracy_score) if len(tool_accuracy_score) > 0 else 1
+ )
+ args_divisor = (
+ len(args_accuracy_score) if len(args_accuracy_score) > 0 else 1
+ )
+ tool_score = (
+ sum(score.score for score in tool_accuracy_score) / tool_divisor
  )
- args_score = sum(score.score for score in args_accuracy_score) / len(
- args_accuracy_score
+ args_score = (
+ sum(score.score for score in args_accuracy_score) / args_divisor
  )
- return min(tool_score, args_score)
+ score = min(tool_score, args_score)
+ return 0 if self.strict_mode and score < self.threshold else score

  def _generate_reason(
  self,
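
The scoring hunk above applies the same pattern to MultiTurnMCPUseMetric: both divisors are guarded, the final score is the minimum of the averaged tool-accuracy and argument-accuracy scores, and strict mode clamps sub-threshold results to 0. A small worked example with made-up per-turn scores; the Score dataclass here is a hypothetical stand-in for ToolScore/ArgsScore:

```python
# Worked example only: reproduces the combination logic from the hunk above
# with made-up scores; Score stands in for the real ToolScore/ArgsScore classes.
from dataclasses import dataclass
from typing import List


@dataclass
class Score:
    score: float


def combine(tool_scores: List[Score], args_scores: List[Score],
            strict_mode: bool = False, threshold: float = 0.5) -> float:
    tool_divisor = len(tool_scores) if len(tool_scores) > 0 else 1
    args_divisor = len(args_scores) if len(args_scores) > 0 else 1
    tool_score = sum(s.score for s in tool_scores) / tool_divisor
    args_score = sum(s.score for s in args_scores) / args_divisor
    score = min(tool_score, args_score)  # the weaker dimension decides the final score
    return 0 if strict_mode and score < threshold else score


# Tools were mostly right (avg 0.75) but arguments were poor (avg 0.25),
# so the combined score is the lower of the two: 0.25.
print(combine([Score(1.0), Score(0.5)], [Score(0.0), Score(0.5)]))
```
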
deepeval/metrics/mcp_use_metric/mcp_use_metric.py

@@ -20,6 +20,7 @@ from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
  from .template import MCPUseMetricTemplate
  from .schema import MCPPrimitivesScore, MCPArgsScore
+ from deepeval.metrics.api import metric_data_manager


  class MCPUseMetric(BaseMetric):
@@ -51,6 +52,7 @@ class MCPUseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_llm_test_case_params(test_case, self._required_params, self)

@@ -65,6 +67,7 @@ class MCPUseMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -104,6 +107,10 @@ class MCPUseMetric(BaseMetric):
  self,
  steps=steps,
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -112,6 +119,7 @@ class MCPUseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_llm_test_case_params(test_case, self._required_params, self)

@@ -154,7 +162,10 @@ class MCPUseMetric(BaseMetric):
  self,
  steps=steps,
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  def _get_primitives_used_score(
@@ -260,9 +271,10 @@ class MCPUseMetric(BaseMetric):
  primitives_used_score: MCPPrimitivesScore,
  argument_correctness_score: MCPArgsScore,
  ) -> float:
- return min(
+ score = min(
  primitives_used_score.score, argument_correctness_score.score
  )
+ return 0 if self.strict_mode and score < self.threshold else score

  def _get_reason(
  self,

deepeval/metrics/misuse/misuse.py

@@ -16,6 +16,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.metrics.misuse.template import MisuseTemplate
  from deepeval.metrics.misuse.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class MisuseMetric(BaseMetric):
@@ -53,6 +54,7 @@ class MisuseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class MisuseMetric(BaseMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -86,6 +89,10 @@ class MisuseMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )

  return self.score

@@ -94,6 +101,7 @@ class MisuseMetric(BaseMetric):
  test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:

  check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class MisuseMetric(BaseMetric):
  f"Score: {self.score}\nReason: {self.reason}",
  ],
  )
-
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
  return self.score

  async def _a_generate_reason(self) -> str:

deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py

@@ -48,6 +48,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -63,6 +64,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -146,6 +148,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/image_editing/image_editing.py

@@ -47,6 +47,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, 1, 1, self
@@ -63,6 +64,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -108,6 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, 1, 1, self

deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -49,6 +49,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -147,6 +149,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -49,6 +49,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -147,6 +149,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py

@@ -46,13 +46,16 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self
  )
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, _show_indicator=_show_indicator, _in_component=_in_component
+ self,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if self.async_mode:
  loop = get_or_create_event_loop()
@@ -61,6 +64,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
  test_case,
  _show_indicator=False,
  _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
  )
  )
  else:
@@ -89,6 +93,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
  test_case: MLLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
  test_case, self._required_params, None, None, self