deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +97 -42
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/utils.py +1 -1
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/tracing.py +51 -3
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py
CHANGED

@@ -31,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
     number_evaluation_steps,
     get_score_range,
 )
+from deepeval.metrics.api import metric_data_manager
 from deepeval.config.settings import get_settings


@@ -74,6 +75,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -122,6 +124,10 @@ class GEval(BaseMetric):
                     f"Reason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -130,6 +136,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -165,6 +172,10 @@ class GEval(BaseMetric):
                 f"Reason: {self.reason}",
            ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_evaluation_steps(self) -> List[str]:
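Taken together, these hunks add a per-call opt-out for posting metric results to Confident AI. A minimal usage sketch, using only names visible in the diff plus deepeval's public GEval/LLMTestCase API; the criteria and test data are illustrative:

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

metric = GEval(
    name="Correctness",  # illustrative metric definition
    criteria="Does the actual output answer the input correctly?",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)
test_case = LLMTestCase(input="What is 2 + 2?", actual_output="4")

# Default: after scoring, the result may be posted via
# metric_data_manager.post_metric_if_enabled(...).
metric.measure(test_case)

# New in 3.6.7: suppress the post for this call only.
metric.measure(test_case, _log_metric_to_confident=False)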
deepeval/metrics/hallucination/hallucination.py
CHANGED

@@ -16,6 +16,7 @@ from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.hallucination.schema import *
+from deepeval.metrics.api import metric_data_manager

 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -51,6 +52,7 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, required_params, self)
@@ -66,6 +68,7 @@ class HallucinationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class HallucinationMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -92,6 +99,7 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, required_params, self)
@@ -118,7 +126,10 @@ class HallucinationMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self):
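The async path gains the same keyword. A small sketch, assuming HallucinationMetric's usual required fields (input, actual_output, context); the test data is made up:

import asyncio

from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

metric = HallucinationMetric(threshold=0.5)
tc = LLMTestCase(
    input="Where is the Eiffel Tower?",
    actual_output="The Eiffel Tower is in Paris.",
    context=["The Eiffel Tower is located in Paris, France."],
)

# a_measure() accepts the same private keyword; False skips the
# post_metric_if_enabled() call added in the hunks above.
score = asyncio.run(metric.a_measure(tc, _log_metric_to_confident=False))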
deepeval/metrics/indicator.py
CHANGED
@@ -100,6 +100,7 @@ async def measure_metric_task(
             test_case,
             _show_indicator=False,
             _in_component=_in_component,
+            _log_metric_to_confident=False,
         )
         finish_text = "Done"
     except MissingTestCaseParamsError as e:
@@ -116,7 +117,9 @@ async def measure_metric_task(
     except TypeError:
         try:
             await metric.a_measure(
-                test_case,
+                test_case,
+                _in_component=_in_component,
+                _log_metric_to_confident=False,
             )
             finish_text = "Done"
         except MissingTestCaseParamsError as e:
@@ -241,7 +244,10 @@ async def safe_a_measure(
     ):
         try:
             await metric.a_measure(
-                tc,
+                tc,
+                _show_indicator=False,
+                _in_component=_in_component,
+                _log_metric_to_confident=False,
             )
             update_pbar(progress, pbar_eval_id)
         except MissingTestCaseParamsError as e:
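These indicator changes explain why the keyword defaults to True everywhere else: when a runner drives metrics itself (measure_metric_task, safe_a_measure), it passes _log_metric_to_confident=False so each inner a_measure() call does not post a duplicate result. A sketch of the same pattern in user code; run_all is a hypothetical helper, not a deepeval API:

import asyncio

async def run_all(metrics, test_case):
    # Hypothetical helper mirroring safe_a_measure: suppress per-metric
    # progress bars and per-metric posting; the caller decides what to log.
    await asyncio.gather(
        *(
            m.a_measure(
                test_case,
                _show_indicator=False,
                _log_metric_to_confident=False,
            )
            for m in metrics
        )
    )
    return {type(m).__name__: m.score for m in metrics}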
deepeval/metrics/json_correctness/json_correctness.py
CHANGED

@@ -18,6 +18,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
 from deepeval.metrics.json_correctness.schema import JsonCorrectnessScoreReason
 from deepeval.utils import get_or_create_event_loop
+from deepeval.metrics.api import metric_data_manager

 DEFAULT_CORRECT_REASON = "The generated Json matches and is syntactically correct to the expected schema."

@@ -51,6 +52,7 @@ class JsonCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -66,6 +68,7 @@ class JsonCorrectnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -88,6 +91,10 @@ class JsonCorrectnessMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -96,6 +103,7 @@ class JsonCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -126,7 +134,10 @@ class JsonCorrectnessMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def a_generate_reason(self, actual_output: str) -> str:
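For context, a sketch of exercising this metric end to end; JsonCorrectnessMetric takes a pydantic model as its expected schema, and the new keyword is the only 3.6.7 addition here:

from pydantic import BaseModel

from deepeval.metrics import JsonCorrectnessMetric
from deepeval.test_case import LLMTestCase

class Person(BaseModel):
    name: str
    age: int

metric = JsonCorrectnessMetric(expected_schema=Person)
tc = LLMTestCase(
    input="Return a person as JSON.",
    actual_output='{"name": "Ada", "age": 36}',
)

# Scores schema conformance; pass the new flag to keep the result local.
metric.measure(tc, _log_metric_to_confident=False)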
deepeval/metrics/knowledge_retention/knowledge_retention.py
CHANGED

@@ -20,6 +20,7 @@ from deepeval.metrics.knowledge_retention.schema import (
     KnowledgeRetentionScoreReason,
 )
 from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.api import metric_data_manager


 class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -47,6 +48,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -63,6 +65,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -91,6 +98,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -120,6 +128,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self) -> str:
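The conversational metrics thread the keyword through a ConversationalTestCase instead. A sketch assuming deepeval 3.x's Turn-based test case API; the dialogue is illustrative:

from deepeval.metrics import KnowledgeRetentionMetric
from deepeval.test_case import ConversationalTestCase, Turn

metric = KnowledgeRetentionMetric(threshold=0.5)
tc = ConversationalTestCase(
    turns=[
        Turn(role="user", content="My order number is 4521."),
        Turn(role="assistant", content="Thanks! What is your order number?"),
    ]
)

# Same opt-out as the single-turn metrics.
metric.measure(tc, _log_metric_to_confident=False)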
deepeval/metrics/mcp/mcp_task_completion.py
CHANGED

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.mcp.schema import Task, TaskScore
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager


 class MCPTaskCompletionMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +93,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                         f"Score: {self.score}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -97,6 +104,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -131,6 +139,11 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                 f"Score: {self.score}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
+
         return self.score

     def _generate_reason(self, task_scores: List[TaskScore]) -> str:
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
CHANGED

@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager


 class MultiTurnMCPUseMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -102,6 +105,11 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                         f"Score: {self.score}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
             return self.score

     async def a_measure(
@@ -109,6 +117,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -161,6 +170,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                 f"Score: {self.score}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     def _get_tool_accuracy_score(
deepeval/metrics/mcp_use_metric/mcp_use_metric.py
CHANGED

@@ -20,6 +20,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from .template import MCPUseMetricTemplate
 from .schema import MCPPrimitivesScore, MCPArgsScore
+from deepeval.metrics.api import metric_data_manager


 class MCPUseMetric(BaseMetric):
@@ -51,6 +52,7 @@ class MCPUseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(test_case, self._required_params, self)

@@ -65,6 +67,7 @@ class MCPUseMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -104,6 +107,10 @@ class MCPUseMetric(BaseMetric):
                     self,
                     steps=steps,
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -112,6 +119,7 @@ class MCPUseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(test_case, self._required_params, self)

@@ -154,7 +162,10 @@ class MCPUseMetric(BaseMetric):
             self,
             steps=steps,
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     def _get_primitives_used_score(
deepeval/metrics/misuse/misuse.py
CHANGED

@@ -16,6 +16,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
 from deepeval.metrics.misuse.schema import *
+from deepeval.metrics.api import metric_data_manager


 class MisuseMetric(BaseMetric):
@@ -53,6 +54,7 @@ class MisuseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class MisuseMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -86,6 +89,10 @@ class MisuseMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -94,6 +101,7 @@ class MisuseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class MisuseMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self) -> str:
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
CHANGED

@@ -48,6 +48,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -63,6 +64,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -146,6 +148,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
CHANGED

@@ -47,6 +47,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, 1, 1, self
@@ -63,6 +64,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -108,6 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, 1, 1, self
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
CHANGED

@@ -49,6 +49,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -147,6 +149,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py
CHANGED

@@ -49,6 +49,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -147,6 +149,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py
CHANGED
@@ -46,13 +46,16 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -61,6 +64,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +93,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
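Besides threading the logging keyword, this file (and the contextual-precision one below) fixes a progress-bar bug: the multimodal metrics previously called metric_progress_indicator(self) without forwarding _show_indicator and _in_component, so those flags were silently ignored. After the fix, a call such as

metric.measure(test_case, _show_indicator=False)  # metric/test_case as in the sketches above

no longer renders a bar for nested or component-level runs, matching the non-multimodal metrics.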
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
CHANGED

@@ -49,6 +49,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -56,7 +57,9 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -65,6 +68,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -93,6 +97,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
CHANGED
@@ -48,6 +48,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
CHANGED

@@ -49,6 +49,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -65,6 +66,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self