deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/indicator.py
CHANGED
@@ -100,6 +100,7 @@ async def measure_metric_task(
                test_case,
                _show_indicator=False,
                _in_component=_in_component,
+                _log_metric_to_confident=False,
            )
            finish_text = "Done"
        except MissingTestCaseParamsError as e:
@@ -116,7 +117,9 @@ async def measure_metric_task(
        except TypeError:
            try:
                await metric.a_measure(
-                    test_case,
+                    test_case,
+                    _in_component=_in_component,
+                    _log_metric_to_confident=False,
                )
                finish_text = "Done"
            except MissingTestCaseParamsError as e:
@@ -241,7 +244,10 @@ async def safe_a_measure(
    ):
        try:
            await metric.a_measure(
-                tc,
+                tc,
+                _show_indicator=False,
+                _in_component=_in_component,
+                _log_metric_to_confident=False,
            )
            update_pbar(progress, pbar_eval_id)
        except MissingTestCaseParamsError as e:
deepeval/metrics/json_correctness/json_correctness.py
CHANGED
@@ -18,6 +18,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
from deepeval.metrics.json_correctness.schema import JsonCorrectnessScoreReason
from deepeval.utils import get_or_create_event_loop
+from deepeval.metrics.api import metric_data_manager

DEFAULT_CORRECT_REASON = "The generated Json matches and is syntactically correct to the expected schema."

@@ -51,6 +52,7 @@ class JsonCorrectnessMetric(BaseMetric):
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:

        check_llm_test_case_params(test_case, self._required_params, self)
@@ -66,6 +68,7 @@ class JsonCorrectnessMetric(BaseMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -88,6 +91,10 @@ class JsonCorrectnessMetric(BaseMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

            return self.score

@@ -96,6 +103,7 @@ class JsonCorrectnessMetric(BaseMetric):
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:

        check_llm_test_case_params(test_case, self._required_params, self)
@@ -126,7 +134,10 @@ class JsonCorrectnessMetric(BaseMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    async def a_generate_reason(self, actual_output: str) -> str:
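The json_correctness hunks above show the pattern that repeats through the rest of this diff: measure() and a_measure() gain a private _log_metric_to_confident keyword (defaulting to True), and the metric only posts its result through metric_data_manager.post_metric_if_enabled(...) when that flag is left on. A minimal runnable sketch of the gating, using a made-up ToyMetric, ToyTestCase and a stub manager rather than deepeval's real classes:

# Minimal sketch of the new gating pattern; ToyMetric, ToyTestCase and the
# stub manager are illustrative stand-ins, not deepeval's actual classes.
from dataclasses import dataclass


class _StubMetricDataManager:
    def post_metric_if_enabled(self, metric, test_case):
        # The real manager decides whether to send the result to Confident AI;
        # this stub only records that a post was attempted.
        print(f"posted score={metric.score} for input={test_case.input!r}")


metric_data_manager = _StubMetricDataManager()


@dataclass
class ToyTestCase:
    input: str
    actual_output: str


class ToyMetric:
    def measure(
        self, test_case: ToyTestCase, _log_metric_to_confident: bool = True
    ) -> float:
        self.score = 1.0 if test_case.actual_output else 0.0
        # The post is gated behind the flag, so callers that already report
        # results elsewhere (the indicator.py wrappers above pass False) can
        # suppress the per-metric post.
        if _log_metric_to_confident:
            metric_data_manager.post_metric_if_enabled(self, test_case=test_case)
        return self.score


ToyMetric().measure(ToyTestCase(input="hi", actual_output="hello"))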
deepeval/metrics/knowledge_retention/knowledge_retention.py
CHANGED
@@ -20,6 +20,7 @@ from deepeval.metrics.knowledge_retention.schema import (
    KnowledgeRetentionScoreReason,
)
from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.api import metric_data_manager


class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -47,6 +48,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case, self._required_test_case_params, self
@@ -63,6 +65,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -84,6 +87,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    async def a_measure(
@@ -91,6 +98,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_conversational_test_case_params(
            test_case, self._required_test_case_params, self
@@ -120,6 +128,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    async def _a_generate_reason(self) -> str:
deepeval/metrics/mcp/mcp_task_completion.py
CHANGED
@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.mcp.schema import Task, TaskScore
from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager


class MCPTaskCompletionMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -90,6 +93,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                    f"Score: {self.score}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    async def a_measure(
@@ -97,6 +104,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case, self._required_test_case_params, self
@@ -104,7 +112,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
        ):
            if not test_case.mcp_servers:
                error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MCPTaskCompletionMetric' metric."
@@ -131,6 +142,11 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                    f"Score: {self.score}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
            return self.score

    def _generate_reason(self, task_scores: List[TaskScore]) -> str:
@@ -228,8 +244,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
        return tasks

    def _calculate_score(self, scores: List[TaskScore]) -> float:
+        score_divsor = len(scores) if len(scores) > 0 else 1
        total_score = sum(score.score for score in scores)
-
+        score = total_score / score_divsor
+        return 0 if self.strict_mode and score < self.threshold else score

    def is_successful(self) -> bool:
        if self.error is not None:
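The _calculate_score change in the hunk above does two things: it guards the average against an empty score list and clamps sub-threshold results to 0 under strict mode. A standalone sketch of that arithmetic, where ScoreLike is a hypothetical stand-in for deepeval's TaskScore objects (which expose a .score attribute):

# Sketch of the guarded averaging and strict-mode clamp added above;
# ScoreLike is a hypothetical stand-in for TaskScore.
from dataclasses import dataclass
from typing import List


@dataclass
class ScoreLike:
    score: float


def calculate_score(
    scores: List[ScoreLike], strict_mode: bool, threshold: float
) -> float:
    # Dividing by 1 when the list is empty avoids a ZeroDivisionError.
    divisor = len(scores) if len(scores) > 0 else 1
    score = sum(s.score for s in scores) / divisor
    # Strict mode collapses anything below the threshold to 0.
    return 0 if strict_mode and score < threshold else score


print(calculate_score([ScoreLike(1.0), ScoreLike(0.5)], False, 0.5))  # 0.75
print(calculate_score([], True, 0.5))  # 0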
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
CHANGED
@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager


class MultiTurnMCPUseMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -102,6 +105,11 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                    f"Score: {self.score}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
            return self.score

    async def a_measure(
@@ -109,6 +117,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case, self._required_test_case_params, self
@@ -116,7 +125,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
        ):
            if not test_case.mcp_servers:
                error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MultiTurnMCPUseMetric' metric."
@@ -161,6 +173,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                    f"Score: {self.score}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    def _get_tool_accuracy_score(
@@ -299,13 +315,20 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
        tool_accuracy_score: List[ToolScore],
        args_accuracy_score: List[ArgsScore],
    ) -> float:
-
-            tool_accuracy_score
+        tool_divisor = (
+            len(tool_accuracy_score) if len(tool_accuracy_score) > 0 else 1
+        )
+        args_divisor = (
+            len(args_accuracy_score) if len(args_accuracy_score) > 0 else 1
+        )
+        tool_score = (
+            sum(score.score for score in tool_accuracy_score) / tool_divisor
        )
-        args_score =
-            args_accuracy_score
+        args_score = (
+            sum(score.score for score in args_accuracy_score) / args_divisor
        )
-
+        score = min(tool_score, args_score)
+        return 0 if self.strict_mode and score < self.threshold else score

    def _generate_reason(
        self,
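MultiTurnMCPUseMetric._calculate_score now averages tool scores and argument scores separately, each with its own empty-list guard, then takes the minimum of the two averages before the strict-mode clamp. A hedged sketch of that aggregation with plain floats standing in for deepeval's ToolScore/ArgsScore objects:

# Illustrative only: plain floats replace ToolScore/ArgsScore, but the
# min-of-averages aggregation mirrors the reconstructed hunk above.
from typing import List


def aggregate(
    tool_scores: List[float],
    args_scores: List[float],
    strict_mode: bool = False,
    threshold: float = 0.5,
) -> float:
    tool_divisor = len(tool_scores) if len(tool_scores) > 0 else 1
    args_divisor = len(args_scores) if len(args_scores) > 0 else 1
    tool_score = sum(tool_scores) / tool_divisor
    args_score = sum(args_scores) / args_divisor
    # The overall score is capped by the weaker of the two dimensions.
    score = min(tool_score, args_score)
    return 0 if strict_mode and score < threshold else score


print(aggregate([1.0, 1.0], [0.5, 1.0]))          # 0.75: argument accuracy drags the score down
print(aggregate([1.0], [0.2], strict_mode=True))  # 0: below threshold under strict mode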
deepeval/metrics/mcp_use_metric/mcp_use_metric.py
CHANGED
@@ -20,6 +20,7 @@ from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
from .template import MCPUseMetricTemplate
from .schema import MCPPrimitivesScore, MCPArgsScore
+from deepeval.metrics.api import metric_data_manager


class MCPUseMetric(BaseMetric):
@@ -51,6 +52,7 @@ class MCPUseMetric(BaseMetric):
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_llm_test_case_params(test_case, self._required_params, self)

@@ -65,6 +67,7 @@ class MCPUseMetric(BaseMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -104,6 +107,10 @@ class MCPUseMetric(BaseMetric):
                self,
                steps=steps,
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

            return self.score

@@ -112,6 +119,7 @@ class MCPUseMetric(BaseMetric):
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_llm_test_case_params(test_case, self._required_params, self)

@@ -154,7 +162,10 @@ class MCPUseMetric(BaseMetric):
                self,
                steps=steps,
            )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    def _get_primitives_used_score(
@@ -260,9 +271,10 @@ class MCPUseMetric(BaseMetric):
        primitives_used_score: MCPPrimitivesScore,
        argument_correctness_score: MCPArgsScore,
    ) -> float:
-
+        score = min(
            primitives_used_score.score, argument_correctness_score.score
        )
+        return 0 if self.strict_mode and score < self.threshold else score

    def _get_reason(
        self,
deepeval/metrics/misuse/misuse.py
CHANGED
@@ -16,6 +16,7 @@ from deepeval.metrics.utils import (
)
from deepeval.metrics.misuse.template import MisuseTemplate
from deepeval.metrics.misuse.schema import *
+from deepeval.metrics.api import metric_data_manager


class MisuseMetric(BaseMetric):
@@ -53,6 +54,7 @@ class MisuseMetric(BaseMetric):
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:

        check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class MisuseMetric(BaseMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -86,6 +89,10 @@ class MisuseMetric(BaseMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

            return self.score

@@ -94,6 +101,7 @@ class MisuseMetric(BaseMetric):
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:

        check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class MisuseMetric(BaseMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

    async def _a_generate_reason(self) -> str:
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
CHANGED
@@ -48,6 +48,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
@@ -63,6 +64,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -146,6 +148,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
CHANGED
@@ -47,6 +47,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, 1, 1, self
@@ -63,6 +64,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -108,6 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, 1, 1, self
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
CHANGED
@@ -49,6 +49,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -147,6 +149,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py
CHANGED
@@ -49,6 +49,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                    test_case,
                    _show_indicator=False,
                    _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                )
            )
        else:
@@ -147,6 +149,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py
CHANGED
@@ -46,13 +46,16 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
        )
        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
-            self,
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
        ):
            if self.async_mode:
                loop = get_or_create_event_loop()
@@ -61,6 +64,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
                        test_case,
                        _show_indicator=False,
                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                    )
                )
            else:
@@ -89,6 +93,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
        test_case: MLLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
    ) -> float:
        check_mllm_test_case_params(
            test_case, self._required_params, None, None, self
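The synchronous measure() here previously passed only self to metric_progress_indicator; it now forwards _show_indicator and _in_component as well. A toy stand-in for that context manager, assuming (this is an assumption about its behavior, not deepeval's actual implementation) the indicator stays silent when hidden or when the metric runs inside a larger component:

# Toy stand-in for metric_progress_indicator; the real one lives in
# deepeval.metrics.indicator and may differ in what it checks.
from contextlib import contextmanager


@contextmanager
def toy_progress_indicator(
    metric_name: str, _show_indicator: bool = True, _in_component: bool = False
):
    # Assumption: only a top-level, user-facing call should print progress.
    show = _show_indicator and not _in_component
    if show:
        print(f"[running {metric_name} ...]")
    try:
        yield
    finally:
        if show:
            print(f"[{metric_name} done]")


with toy_progress_indicator("MultimodalAnswerRelevancy"):
    pass  # metric work would happen here

with toy_progress_indicator("MultimodalAnswerRelevancy", _in_component=True):
    pass  # silent: the enclosing component owns the progress display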