deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
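The dominant change in this release, visible across nearly every metric file below, is a new private `_log_metric_to_confident` keyword threaded through each metric's `measure()`/`a_measure()`, gating whether standalone metric results are posted to Confident AI via the new `deepeval/metrics/api.py` module. A minimal sketch of the resulting call pattern (hedged: the keyword is internal API and may change without notice; `AnswerRelevancyMetric` is just one of the affected metrics):

```python
# Sketch only: `_log_metric_to_confident` is a private keyword added in 3.6.8,
# threaded through measure()/a_measure() as the hunks below show.
# Pass False to skip posting the metric result to Confident AI.
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

metric = AnswerRelevancyMetric()
test_case = LLMTestCase(
    input="What is deepeval?",
    actual_output="An open-source framework for evaluating LLM applications.",
)
metric.measure(test_case, _log_metric_to_confident=False)  # opt out of logging
print(metric.score, metric.reason)
```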
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
CHANGED

```diff
@@ -1,7 +1,7 @@
 from typing import Optional, List, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import MLLMTestCase
 from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
     MultiModalContextualPrecisionTemplate,
 )
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.test_case import LLMTestCaseParams
 from deepeval.models import DeepEvalBaseMLLM
-
+import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
 from deepeval.metrics.indicator import metric_progress_indicator
 
 
@@ -49,6 +49,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -56,7 +57,9 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -65,10 +68,11 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
-                self.verdicts: List[ContextualPrecisionVerdict] = (
+                self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                     self._generate_verdicts(
                         test_case.input,
                         test_case.expected_output,
@@ -93,6 +97,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -105,7 +110,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -125,12 +130,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         return self.score
 
-    async def _a_generate_reason(self, input: str):
+    async def _a_generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(
@@ -141,15 +146,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt,
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt,
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason
@@ -158,12 +165,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 data = trimAndLoadJson(res, self)
                 return data["reason"]
 
-    def _generate_reason(self, input: str):
+    def _generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(
@@ -174,15 +181,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt,
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     self.model.generate(
-                        prompt,
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason
@@ -193,21 +202,23 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(
+            res, cost = await self.model.a_generate(
+                prompt, schema=mcpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: mcpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=mcpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
@@ -215,34 +226,36 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
 
     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(
+                res: mcpschema.Verdicts = self.model.generate(
+                    prompt, schema=mcpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
@@ -279,7 +292,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
```
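Worth noting in the hunks above: rather than relying on bare names pulled in by `from ...schema import *`, the metric now imports its schema module under the `mcpschema` alias and qualifies every reference (`mcpschema.ContextualPrecisionVerdict`, `mcpschema.Verdicts`), so identically named schema classes from different metrics can no longer shadow each other. A small sketch of the pattern, using only names that appear in this diff:

```python
# Namespacing pattern adopted in 3.6.8 (module path and class/field names
# taken from the diff above; the literal values are illustrative).
import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema

# Each metric's verdict/score-reason models now resolve unambiguously:
verdict = mcpschema.ContextualPrecisionVerdict(
    verdict="yes", reason="The first-ranked node answers the input."
)
```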
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
CHANGED
```diff
@@ -48,6 +48,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
CHANGED

```diff
@@ -49,6 +49,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -65,6 +66,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py
CHANGED

```diff
@@ -53,6 +53,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -71,6 +72,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -97,6 +99,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
CHANGED

```diff
@@ -78,6 +78,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
 
@@ -96,6 +97,7 @@ class MultimodalGEval(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                         _additional_context=_additional_context,
                     )
                 )
@@ -132,6 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_mllm_test_case_params(
```
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
CHANGED
```diff
@@ -3,7 +3,7 @@ from typing import List, Dict
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
+    check_mllm_test_case_params,
 )
 from deepeval.test_case import (
     MLLMTestCase,
@@ -11,10 +11,10 @@ from deepeval.test_case import (
     ToolCallParams,
     ToolCall,
 )
-from deepeval.metrics import
+from deepeval.metrics import BaseMultimodalMetric
 
 
-class MultimodalToolCorrectnessMetric(
+class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):
 
     _required_params: List[MLLMTestCaseParams] = [
         MLLMTestCaseParams.INPUT,
@@ -46,8 +46,11 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
+        check_mllm_test_case_params(
+            test_case, self._required_params, None, None, self
+        )
         self.test_case = test_case
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -90,11 +93,13 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,
             _show_indicator=_show_indicator,
             _in_component=_in_component,
+            _log_metric_to_confident=_log_metric_to_confident,
        )
 
     ##################################################
@@ -278,7 +283,7 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Multi Modal Tool Correctness"
 
     def indent_multiline_string(self, s, indent_level=4):
         indent = " " * indent_level
```
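Besides threading the new keyword, `MultimodalToolCorrectnessMetric` now validates its test case up front via `check_mllm_test_case_params` and reports a distinct display name, which matters if you key results or dashboards by metric name. A sketch (assumes the class is importable from `deepeval.metrics` as in 3.6.6; only the renamed `__name__` comes from this diff):

```python
# Sketch: the __name__ rename is from the hunk above; the import location
# is an assumption based on how deepeval exposes its other metrics.
from deepeval.metrics import MultimodalToolCorrectnessMetric

metric = MultimodalToolCorrectnessMetric()
print(metric.__name__)  # "Multi Modal Tool Correctness" (was "Tool Correctness")
```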
deepeval/metrics/non_advice/non_advice.py
CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.non_advice.template import NonAdviceTemplate
 from deepeval.metrics.non_advice.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class NonAdviceMetric(BaseMetric):
@@ -58,6 +59,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class NonAdviceMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -93,6 +96,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -101,6 +108,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -129,6 +137,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
```
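The `NonAdviceMetric` hunks above show the full shape of the logging hook this release stitches into each metric's success path: after the verbose logs are built, the result is posted only when the caller has not opted out, and `post_metric_if_enabled` applies its own configuration-level gate on top. Hoisted into a standalone helper purely for illustration (in deepeval the check lives inline in each `measure()`; `_finalize` is a hypothetical name):

```python
from deepeval.metrics.api import metric_data_manager

def _finalize(metric, test_case, _log_metric_to_confident: bool = True) -> float:
    # Sketch of the pattern from the hunks above: caller-level opt-out first,
    # then post_metric_if_enabled decides based on its own configuration.
    if _log_metric_to_confident:
        metric_data_manager.post_metric_if_enabled(metric, test_case=test_case)
    return metric.score
```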
deepeval/metrics/pii_leakage/pii_leakage.py
CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.pii_leakage.template import PIILeakageTemplate
 from deepeval.metrics.pii_leakage.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class PIILeakageMetric(BaseMetric):
@@ -49,6 +50,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -64,6 +66,7 @@ class PIILeakageMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -92,6 +99,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -120,7 +128,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
```
deepeval/metrics/plan_adherence/__init__.py
CHANGED

```diff
@@ -0,0 +1 @@
+from .plan_adherence import PlanAdherenceMetric
```
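This last hunk introduces the public export for the new `plan_adherence` package (one of several new metric packages in 3.6.8, alongside `goal_accuracy`, `plan_quality`, `step_efficiency`, `tool_use`, and `topic_adherence`). Nothing in this diff shows the metric's constructor, so the following is a hypothetical usage sketch assuming it follows the `BaseMetric` pattern used by every other metric in this release (`measure()` on an `LLMTestCase`, then `score`/`reason`); the zero-argument constructor and test-case contents are assumptions:

```python
# Hypothetical sketch: the import path comes from the __init__.py hunk above;
# the constructor signature and LLMTestCase fields are assumptions.
from deepeval.metrics.plan_adherence import PlanAdherenceMetric
from deepeval.test_case import LLMTestCase

metric = PlanAdherenceMetric()
test_case = LLMTestCase(
    input="Book me a flight to Tokyo and add it to my calendar.",
    actual_output="Booked flight NH106 and created a calendar event.",
)
metric.measure(test_case)
print(metric.score, metric.reason)
```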