deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
```diff
--- a/deepeval/metrics/task_completion/task_completion.py
+++ b/deepeval/metrics/task_completion/task_completion.py
@@ -1,11 +1,12 @@
 from typing import Optional, List, Tuple, Union, Dict
 
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -15,7 +16,11 @@ from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.task_completion.template import TaskCompletionTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.task_completion.schema import
+from deepeval.metrics.task_completion.schema import (
+    TaskAndOutcome,
+    TaskCompletionVerdict,
+)
+from deepeval.metrics.api import metric_data_manager
 
 
 class TaskCompletionMetric(BaseMetric):
@@ -23,7 +28,6 @@ class TaskCompletionMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]
 
     def __init__(
@@ -58,9 +62,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-
-
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -91,6 +101,12 @@ class TaskCompletionMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
+
         return self.score
 
     async def a_measure(
@@ -100,9 +116,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-
-
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -127,6 +149,12 @@ class TaskCompletionMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
+
         return self.score
 
     async def _a_generate_verdicts(self) -> Tuple:
@@ -134,44 +162,26 @@ class TaskCompletionMetric(BaseMetric):
             task=self.task,
             actual_outcome=self.outcome,
         )
-
-
-
-
-
-
-
-        try:
-            res: TaskCompletionVerdict = await self.model.a_generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            return res.verdict, res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["verdict"], data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     def _generate_verdicts(self) -> Tuple:
         prompt = TaskCompletionTemplate.generate_verdict(
             task=self.task,
             actual_outcome=self.outcome,
         )
-
-
-
-
-
-
-
-        try:
-            res: TaskCompletionVerdict = self.model.generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            return res.verdict, res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["verdict"], data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     async def _a_extract_task_and_outcome(
         self,
@@ -189,22 +199,13 @@ class TaskCompletionMetric(BaseMetric):
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-
-
-
-
-
-
-
-        try:
-            res: TaskAndOutcome = await self.model.a_generate(
-                prompt, schema=TaskAndOutcome
-            )
-            return res.task, res.outcome
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["task"], data["outcome"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _extract_task_and_outcome(
         self,
@@ -222,20 +223,13 @@ class TaskCompletionMetric(BaseMetric):
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-
-
-
-
-
-
-
-                prompt, schema=TaskAndOutcome
-            )
-            return res.task, res.outcome
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["task"], data["outcome"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _calculate_score(self):
         return (
@@ -250,7 +244,7 @@ class TaskCompletionMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
```
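Every hunk above swaps the same hand-rolled pattern — attempt structured generation against a schema, and on `TypeError` fall back to free-form generation plus `trimAndLoadJson` — for two shared helpers imported from `deepeval.metrics.utils`. Their implementation is not part of this diff; the sketch below is inferred purely from the call sites (signature names from the diff, body logic from the deleted code) and omits the cost tracking and markdown-fence trimming the real helper presumably performs.

```python
import json
from typing import Any, Callable, Type


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[Any],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    try:
        # Preferred path: the model supports structured output and returns
        # a parsed schema_cls instance; pull out the fields the caller wants.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback path: free-form generation, parsed as JSON by hand.
        # (The real deepeval helper presumably also strips markdown fences
        # and accumulates metric.evaluation_cost, omitted here.)
        res = metric.model.generate(prompt)
        return extract_json(json.loads(res))
```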
```diff
--- a/deepeval/metrics/tool_correctness/tool_correctness.py
+++ b/deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,13 +1,14 @@
-from typing import List, Dict, Optional, Union
+from typing import List, Dict, Optional, Union, Tuple
 
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
-    trimAndLoadJson,
     initialize_model,
     print_tools_called,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
@@ -62,7 +63,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
         self.test_case = test_case
         self.evaluation_cost = 0 if self.using_native_model else None
 
@@ -83,18 +92,16 @@ class ToolCorrectnessMetric(BaseMetric):
         self.tools_called: List[ToolCall] = test_case.tools_called
         self.expected_tools: List[ToolCall] = test_case.expected_tools
         tool_calling_score = self._calculate_score()
-        if self.available_tools:
+        if self.available_tools and not test_case.multimodal:
             tool_selection_score = self._get_tool_selection_score(
                 test_case.input,
                 test_case.tools_called,
                 self.available_tools,
             )
         else:
-            tool_selection_score =
-
-
-                reason="No available tools were provided to assess tool selection criteria",
-            )
+            tool_selection_score = ToolSelectionScore(
+                score=1,
+                reason="No available tools were provided to assess tool selection criteria",
             )
         score = min(tool_calling_score, tool_selection_score.score)
         self.score = (
@@ -165,7 +172,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -177,7 +192,7 @@ class ToolCorrectnessMetric(BaseMetric):
         self.tools_called: List[ToolCall] = test_case.tools_called
         self.expected_tools: List[ToolCall] = test_case.expected_tools
         tool_calling_score = self._calculate_score()
-        if self.available_tools:
+        if self.available_tools and not test_case.multimodal:
             tool_selection_score = await self._a_get_tool_selection_score(
                 test_case.input,
                 test_case.tools_called,
@@ -324,18 +339,13 @@ class ToolCorrectnessMetric(BaseMetric):
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-
-
-
-
-
-
-
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self, user_input, tools_called, available_tools
@@ -345,25 +355,16 @@ class ToolCorrectnessMetric(BaseMetric):
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-
-
-
-
-
-
-
-        try:
-            res = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     # Calculate score
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         if self.should_exact_match:
             score = self._calculate_exact_match_score()
         elif self.should_consider_ordering:
@@ -382,7 +383,7 @@ class ToolCorrectnessMetric(BaseMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     # Exact matching score
-    def _calculate_exact_match_score(self):
+    def _calculate_exact_match_score(self) -> float:
         if len(self.tools_called) != len(self.expected_tools):
             return 0.0
         if (
@@ -405,7 +406,7 @@ class ToolCorrectnessMetric(BaseMetric):
             return 1.0
 
     # Non exact matching score
-    def _calculate_non_exact_match_score(self):
+    def _calculate_non_exact_match_score(self) -> float:
         total_score = 0.0
         matched_called_tools = set()
         for expected_tool in self.expected_tools:
@@ -445,7 +446,7 @@ class ToolCorrectnessMetric(BaseMetric):
         )
 
     # Consider ordering score
-    def _compute_weighted_lcs(self):
+    def _compute_weighted_lcs(self) -> Tuple[List[ToolCall], float]:
         m, n = len(self.expected_tools), len(self.tools_called)
         dp = [[0.0] * (n + 1) for _ in range(m + 1)]
         for i in range(1, m + 1):
```
```diff
--- a/deepeval/metrics/tool_use/tool_use.py
+++ b/deepeval/metrics/tool_use/tool_use.py
@@ -3,11 +3,11 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
-    print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     ConversationalTestCase,
@@ -61,7 +61,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -136,7 +141,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -206,22 +216,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-        try:
-            res: ArgumentCorrectnessScore = self.model.generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ArgumentCorrectnessScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     async def _a_get_argument_correctness_score(
         self,
@@ -233,22 +234,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-        try:
-            res: ArgumentCorrectnessScore = await self.model.a_generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ArgumentCorrectnessScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     def _get_tool_selection_score(
         self,
@@ -260,20 +252,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-                prompt, schema=ToolSelectionScore
-            )
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self,
@@ -285,22 +270,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-        try:
-            res: ToolSelectionScore = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     def _get_user_input_and_turns(
         self,
```
```diff
--- a/deepeval/metrics/topic_adherence/template.py
+++ b/deepeval/metrics/topic_adherence/template.py
@@ -3,6 +3,13 @@ import textwrap
 
 
 class TopicAdherenceTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def get_qa_pairs(
@@ -19,6 +26,8 @@ class TopicAdherenceTemplate:
     Do not infer information beyond what is stated. Ignore irrelevant or conversational turns (e.g. greetings, affirmations) that do not constitute clear QA pairs.
     If there are multiple questions and multiple answers in a single sentence, break them into separate pairs. Each pair must be standalone, and should not contain more than one question or response.
 
+    {TopicAdherenceTemplate.multimodal_rules}
+
     OUTPUT Format:
     Return a **JSON object** with a single 2 keys:
     - `"question"`: the user's question
@@ -82,6 +91,8 @@ class TopicAdherenceTemplate:
     3. Based on both relevance and correctness, assign one of four possible verdicts.
     4. Give a simple, comprehensive reason explaining why this question-answer pair was assigned this verdict
 
+    {TopicAdherenceTemplate.multimodal_rules}
+
     VERDICTS:
     - `"TP"` (True Positive): Question is relevant and the response correctly answers it.
     - `"FN"` (False Negative): Question is relevant, but the assistant refused to answer or gave an irrelevant response.
@@ -138,6 +149,8 @@ class TopicAdherenceTemplate:
 
     Your task is to go through these reasons and give a single final explaination that clearly explains why this metric has failed or passed.
 
+    {TopicAdherenceTemplate.multimodal_rules}
+
     Pass: {success}
     Score: {score}
     Threshold: {threshold}
```
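The new `multimodal_rules` block is spliced into three prompt bodies via `{TopicAdherenceTemplate.multimodal_rules}`. Because the prompt builders are `@staticmethod`s, the shared block has to be referenced through the class name rather than `cls` or `self`; a minimal sketch of the pattern with a hypothetical `ExampleTemplate` (not deepeval's class):

```python
import textwrap


class ExampleTemplate:
    # Class-level rules block, analogous to TopicAdherenceTemplate.multimodal_rules.
    multimodal_rules = textwrap.dedent(
        """\
        --- MULTIMODAL INPUT RULES ---
        - Treat image content as factual evidence.
        """
    )

    @staticmethod
    def get_prompt(question: str) -> str:
        # No cls/self inside a @staticmethod, so the attribute is referenced
        # through the class name, exactly as the diff above does.
        return (
            "Answer the question below.\n"
            f"{ExampleTemplate.multimodal_rules}"
            f"Question: {question}"
        )


print(ExampleTemplate.get_prompt("What is in the image?"))
```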