deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0

deepeval/metrics/g_eval/g_eval.py

@@ -31,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
     number_evaluation_steps,
     get_score_range,
 )
+from deepeval.metrics.api import metric_data_manager
 from deepeval.config.settings import get_settings


@@ -74,6 +75,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -122,6 +124,10 @@ class GEval(BaseMetric):
                         f"Reason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -130,6 +136,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -165,6 +172,10 @@ class GEval(BaseMetric):
                 f"Reason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_evaluation_steps(self) -> List[str]:
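
The change above is the template for most of this release: measure() and a_measure() grow a private _log_metric_to_confident flag (defaulting to True), and once verbose logs are constructed the result is posted through metric_data_manager.post_metric_if_enabled() from the new deepeval/metrics/api.py. A minimal sketch of the caller-visible effect, assuming deepeval's public GEval and LLMTestCase APIs (the flag is private; it appears here only to illustrate the new behavior):

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

correctness = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)
tc = LLMTestCase(input="Where is the Eiffel Tower?", actual_output="In Paris.")

# As of 3.6.7, a standalone measure() call may also post the metric result
# to Confident AI when posting is enabled; deepeval's own internals pass
# the private flag as False to suppress that side effect.
correctness.measure(tc)
correctness.measure(tc, _log_metric_to_confident=False)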

deepeval/metrics/hallucination/hallucination.py

@@ -16,6 +16,7 @@ from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.hallucination.schema import *
+from deepeval.metrics.api import metric_data_manager

 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -51,6 +52,7 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, required_params, self)
@@ -66,6 +68,7 @@ class HallucinationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class HallucinationMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -92,6 +99,7 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, required_params, self)
@@ -118,7 +126,10 @@ class HallucinationMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self):

deepeval/metrics/indicator.py

@@ -100,6 +100,7 @@ async def measure_metric_task(
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=False,
                 )
                 finish_text = "Done"
             except MissingTestCaseParamsError as e:
@@ -116,7 +117,9 @@ async def measure_metric_task(
             except TypeError:
                 try:
                     await metric.a_measure(
-                        test_case, _in_component=_in_component
+                        test_case,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=False,
                     )
                     finish_text = "Done"
                 except MissingTestCaseParamsError as e:
@@ -241,7 +244,10 @@ async def safe_a_measure(
 ):
     try:
         await metric.a_measure(
-            tc, _show_indicator=False, _in_component=_in_component
+            tc,
+            _show_indicator=False,
+            _in_component=_in_component,
+            _log_metric_to_confident=False,
         )
         update_pbar(progress, pbar_eval_id)
     except MissingTestCaseParamsError as e:
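
Note the division of labor in the indicator.py hunks above: deepeval's evaluation loops always pass _log_metric_to_confident=False, so metrics measured as part of a test run are not posted a second time, and the pre-existing except TypeError fallback keeps custom metrics working when their a_measure() signatures lack the newer private kwargs. Condensed from the hunks above (not a verbatim copy of the file):

try:
    # Preferred call: forward every private kwarg.
    await metric.a_measure(
        test_case,
        _show_indicator=False,
        _in_component=_in_component,
        _log_metric_to_confident=False,
    )
except TypeError:
    # Fallback mirrors the hunk: retry without _show_indicator for custom
    # metrics whose a_measure() does not accept it.
    await metric.a_measure(
        test_case,
        _in_component=_in_component,
        _log_metric_to_confident=False,
    )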

deepeval/metrics/json_correctness/json_correctness.py

@@ -18,6 +18,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
 from deepeval.metrics.json_correctness.schema import JsonCorrectnessScoreReason
 from deepeval.utils import get_or_create_event_loop
+from deepeval.metrics.api import metric_data_manager

 DEFAULT_CORRECT_REASON = "The generated Json matches and is syntactically correct to the expected schema."

@@ -51,6 +52,7 @@ class JsonCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -66,6 +68,7 @@ class JsonCorrectnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -88,6 +91,10 @@ class JsonCorrectnessMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -96,6 +103,7 @@ class JsonCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -126,7 +134,10 @@ class JsonCorrectnessMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def a_generate_reason(self, actual_output: str) -> str:

deepeval/metrics/knowledge_retention/knowledge_retention.py

@@ -20,6 +20,7 @@ from deepeval.metrics.knowledge_retention.schema import (
     KnowledgeRetentionScoreReason,
 )
 from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.api import metric_data_manager


 class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -47,6 +48,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -63,6 +65,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -91,6 +98,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -120,6 +128,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self) -> str:

deepeval/metrics/mcp/mcp_task_completion.py

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.mcp.schema import Task, TaskScore
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager


 class MCPTaskCompletionMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +93,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                         f"Score: {self.score}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -97,6 +104,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -131,6 +139,11 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                 f"Score: {self.score}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
+
         return self.score

     def _generate_reason(self, task_scores: List[TaskScore]) -> str:

deepeval/metrics/mcp/multi_turn_mcp_use_metric.py

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager


 class MultiTurnMCPUseMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -102,6 +105,11 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                         f"Score: {self.score}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
             return self.score

     async def a_measure(
@@ -109,6 +117,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -161,6 +170,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                 f"Score: {self.score}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     def _get_tool_accuracy_score(

deepeval/metrics/mcp_use_metric/mcp_use_metric.py

@@ -20,6 +20,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from .template import MCPUseMetricTemplate
 from .schema import MCPPrimitivesScore, MCPArgsScore
+from deepeval.metrics.api import metric_data_manager


 class MCPUseMetric(BaseMetric):
@@ -51,6 +52,7 @@ class MCPUseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(test_case, self._required_params, self)

@@ -65,6 +67,7 @@ class MCPUseMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -104,6 +107,10 @@ class MCPUseMetric(BaseMetric):
                     self,
                     steps=steps,
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -112,6 +119,7 @@ class MCPUseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(test_case, self._required_params, self)

@@ -154,7 +162,10 @@ class MCPUseMetric(BaseMetric):
             self,
             steps=steps,
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     def _get_primitives_used_score(

deepeval/metrics/misuse/misuse.py

@@ -16,6 +16,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
 from deepeval.metrics.misuse.schema import *
+from deepeval.metrics.api import metric_data_manager


 class MisuseMetric(BaseMetric):
@@ -53,6 +54,7 @@ class MisuseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class MisuseMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -86,6 +89,10 @@ class MisuseMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -94,6 +101,7 @@ class MisuseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class MisuseMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self) -> str:
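
The remaining hunks cover the multimodal metrics. In the fragments shown here only the signature threading is visible: measure() accepts the new flag and forwards it into the async path, with no post_metric_if_enabled() call appearing inside these particular hunks. Condensed from the hunks below:

# Inside measure(), when async_mode is on, the flag is forwarded unchanged:
loop = get_or_create_event_loop()
loop.run_until_complete(
    self.a_measure(
        test_case,
        _show_indicator=False,
        _in_component=_in_component,
        _log_metric_to_confident=_log_metric_to_confident,
    )
)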

deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py

@@ -48,6 +48,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -63,6 +64,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -146,6 +148,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/image_editing/image_editing.py

@@ -47,6 +47,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, 1, 1, self
@@ -63,6 +64,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -108,6 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, 1, 1, self

deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -49,6 +49,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -147,6 +149,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -49,6 +49,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -147,6 +149,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py

@@ -46,13 +46,16 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -61,6 +64,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +93,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py

@@ -49,6 +49,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -56,7 +57,9 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -65,6 +68,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -93,6 +97,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py

@@ -48,6 +48,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py

@@ -49,6 +49,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -65,6 +66,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self