deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/goal_accuracy/template.py
ADDED

@@ -0,0 +1,235 @@
+from typing import List
+import textwrap
+
+
+class GoalAccuracyTemplate:
+    @staticmethod
+    def get_accuracy_score(task, steps_taken):
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
+
+            PURPOSE:
+
+            Evaluate whether the assistant's **visible output** (what the user actually saw) **fully and correctly achieved the user's stated goal.
+            Ignore internal reasoning, hidden tool calls, or retriever outputs unless their results were explicitly surfaced to the user.
+
+            The evaluation must be **strict and adversarial** — if the goal is not *clearly, fully, and correctly achieved*, assign a low score.
+
+            EVALUATION RULES
+
+            1. **User-visible fulfillment only**
+                - Base your judgment solely on what the user would see in the assistant's message.
+                - Ignore hidden or internal steps unless their results were explicitly communicated.
+
+            2. **Goal completion**
+                - The assistant must explicitly provide everything the user asked for.
+                - If even one subpart of the task is missing, incomplete, or vague, the score must be **≤ 0.5**.
+
+            3. **Correctness and relevance**
+                - The information provided must be factually correct and directly relevant to the task.
+                - Hallucinated or unrelated content automatically lowers the score.
+
+            4. **Self-sufficiency**
+                - The visible response must stand on its own; the user should not need prior context or follow-up clarification.
+
+            5. **Strict bias toward failure**
+                - When uncertain, assume the goal was **not achieved**.
+                - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
+
+            SCORING GUIDE:
+
+            - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
+            - **0.75** → Mostly achieved; minor omissions or trivial inaccuracies.
+            - **0.5** → Partially achieved; core goal addressed, but key parts missing or incorrect.
+            - **0.25** → Weak attempt; loosely related but fails to satisfy the user’s request.
+            - **0.0** → Goal not achieved at all; irrelevant, wrong, or missing answer.
+
+            *When in doubt, choose the lower score.*
+
+            OUTPUT FORMAT:
+
+            Return only a valid JSON object with this structure:
+
+            {{
+                "score": 0.0,
+                "reason": "1-3 factual sentences explaining what parts of the user's goal were or were not achieved."
+            }}
+
+            The reason must:
+            - Be objective and concise.
+            - Refer to **specific missing or incorrect elements**.
+            - Avoid vague language (“somewhat correct”, “pretty accurate”).
+
+            EXAMPLES:
+
+            **Example 1**
+            Task: "Translate 'good night' into French."
+            Assistant Reply: "Bonne nuit."
+            →
+            {{
+                "score": 1.0,
+                "reason": "The assistant provided the exact, correct translation requested by the user."
+            }}
+
+            **Example 2**
+            Task: "List three renewable energy sources."
+            Assistant Reply: "Solar and wind energy."
+            →
+            {{
+                "score": 0.5,
+                "reason": "The assistant only listed two sources instead of three, so the goal was partially achieved."
+            }}
+
+            **Example 3**
+            Task: "Summarize this paragraph."
+            Assistant Reply: "It talks about technology."
+            →
+            {{
+                "score": 0.25,
+                "reason": "The summary is too vague and fails to convey key information from the text."
+            }}
+
+            *** END OF EXAMPLES ***
+
+            USER TASK:
+            {task}
+
+            AGENT STEPS:
+            {steps_taken}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_plan_evaluation_score(task, steps_taken):
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
+
+            OBJECTIVE:
+
+            Evaluate:
+
+            1. **Plan Quality** — Was the agent's plan clear, complete, and logically structured to fully address the user's task?
+            2. **Plan Adherence** — Did the agent consistently follow that plan without unjustified deviations, omissions, or extraneous steps?
+
+            Your judgment must be strict: a plan must be well-formed and execution must align with it for a high score.
+
+            EVALUATION CRITERIA
+
+            - Plan Quality:
+                - The plan should explicitly or implicitly outline all necessary steps to fulfill the user's task.
+                - It must be logically ordered, neither vague nor overly generic.
+                - Missing critical components or unclear structuring lowers the score drastically.
+
+            - Plan Adherence:
+                - Execution must closely match the planned steps.
+                - Any skipped, added, or rearranged steps without clear justification count as plan deviations.
+                - Minor, justified variations are acceptable but reduce the score slightly.
+
+            - General Rules:
+                - If no discernible plan exists, score ≤ 0.5 regardless of task completion.
+                - Tool use should be coherent within the plan, not ad hoc or speculative.
+                - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
+
+            SCORING GUIDE:
+
+            - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
+            - **0.75** → Mostly clear plan with minor omissions or small execution deviations that do not impact the overall strategy.
+            - **0.5** → Partial plan exists but is incomplete, vague, or only partially followed; notable deviations present.
+            - **0.25** → Weak or fragmented plan; execution frequently diverges or lacks coherence with any strategy.
+            - **0.0** → No evidence of a plan; execution appears random or unrelated to the user's task.
+
+            INSTRUCTIONS:
+
+            1. Identify the agent's plan from the steps taken (explicit plans stated or implicit structure).
+            2. Assess plan completeness and logical order relative to the user's task.
+            3. Compare execution steps against the plan to check for adherence, noting any unjustified deviations.
+            4. Deduct points for vagueness, missing critical steps, or inconsistent execution.
+
+            OUTPUT FORMAT:
+
+            Return only a valid JSON object with exactly two fields:
+
+            {{
+                "score": 0.0,
+                "reason": "1-3 concise sentences explaining the quality of the plan and how well execution matched it. Specify missing or extra steps, plan clarity, and adherence issues."
+            }}
+
+            EXAMPLE:
+
+            User Task: "Plan a business trip including booking a flight, hotel, and preparing an agenda."
+
+            Agent Steps include:
+            - Outlined flight, hotel, and agenda steps explicitly.
+            - Executed flight and hotel booking steps.
+            - Skipped agenda preparation despite mentioning it in the plan.
+
+            Example JSON:
+
+            {{
+                "score": 0.75,
+                "reason": "The agent formed a clear plan covering flights, hotel, and agenda, but failed to execute the agenda preparation step, reducing adherence."
+            }}
+
+            **** END OF EXAMPLE ****
+
+            INPUTS:
+
+            USER TASK:
+            {task}
+
+            AGENT STEPS:
+            {steps_taken}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_final_reason(
+        final_score, threshold, goal_evaluations, plan_evalautions
+    ):
+        return textwrap.dedent(
+            f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
+
+            You are given:
+            - An agent's goal execution scores and reasons.
+            - The agent's plan evaluation scores and reasons.
+            - The **final combined score**.
+            - The **threshold** required to pass.
+            - Whether the result is a **pass** or **fail**.
+
+            Your job is to write a short, precise explanation of **why** the agent passed or failed — taking into account the quality of execution and planning, and the threshold.
+
+            ---
+
+            INSTRUCTIONS:
+
+            - Write 2-4 clear, objective sentences explaining the overall result.
+            - Explicitly reference both the task and plan performance — **both must be addressed**.
+            - Mention how the final score compares to the threshold.
+            - If the agent **passed**, highlight how both task execution and planning were sufficient to meet the goal.
+            - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
+            - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
+
+            ---
+
+            FORMAT:
+            Return only a single string. Do **not** include JSON or any extra formatting.
+
+            ---
+
+            Goal evaluations:
+            {goal_evaluations}
+
+            Plan evaluations:
+            {plan_evalautions}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
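Note: GoalAccuracyTemplate only builds the judge prompts; the metric that consumes them is the new deepeval/metrics/goal_accuracy/goal_accuracy.py listed above. A minimal sketch of rendering one prompt directly (the task and steps values are illustrative):

    from deepeval.metrics.goal_accuracy.template import GoalAccuracyTemplate

    # Render the goal-accuracy judge prompt for a single interaction.
    prompt = GoalAccuracyTemplate.get_accuracy_score(
        task="Translate 'good night' into French.",
        steps_taken="1. Assistant replied: 'Bonne nuit.'",
    )
    # The prompt ends with "JSON:", so the judge model is expected to reply with
    # an object of the form {"score": <0.0-1.0>, "reason": "..."}.
    print(prompt)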
deepeval/metrics/hallucination/hallucination.py
CHANGED

@@ -18,14 +18,14 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.hallucination.schema import *
 from deepeval.metrics.api import metric_data_manager
 
-required_params: List[LLMTestCaseParams] = [
-    LLMTestCaseParams.INPUT,
-    LLMTestCaseParams.ACTUAL_OUTPUT,
-    LLMTestCaseParams.CONTEXT,
-]
-
 
 class HallucinationMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.CONTEXT,
+    ]
+
     def __init__(
         self,
         threshold: float = 0.5,

@@ -55,7 +55,7 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case,
+        check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -102,7 +102,7 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case,
+        check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
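With required_params relocated onto the class as _required_params, the parameter check at the start of measure() and a_measure() is now scoped per metric. A minimal usage sketch of the code path that runs this check; the test-case values are illustrative, and actually calling measure() assumes a configured judge model (for example an OpenAI key):

    from deepeval.metrics import HallucinationMetric
    from deepeval.test_case import LLMTestCase

    metric = HallucinationMetric(threshold=0.5)
    test_case = LLMTestCase(
        input="Who wrote Hamlet?",
        actual_output="Hamlet was written by Christopher Marlowe.",
        context=["Hamlet is a tragedy written by William Shakespeare."],
    )
    # check_llm_test_case_params(test_case, self._required_params, self) runs first;
    # omitting context would raise a missing-parameters error for this metric.
    metric.measure(test_case)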
deepeval/metrics/indicator.py
CHANGED

@@ -18,6 +18,10 @@ from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 def format_metric_description(
     metric: Union[BaseMetric, BaseConversationalMetric, BaseArenaMetric],

@@ -43,7 +47,7 @@ def metric_progress_indicator(
     _show_indicator: bool = True,
     _in_component: bool = False,
 ):
-    captured_async_mode = False if async_mode
+    captured_async_mode = False if async_mode is None else async_mode
     with capture_metric_type(
         metric.__name__,
         async_mode=captured_async_mode,

@@ -250,6 +254,21 @@ async def safe_a_measure(
                 _log_metric_to_confident=False,
             )
         update_pbar(progress, pbar_eval_id)
+
+    except asyncio.CancelledError:
+        logger.info("caught asyncio.CancelledError")
+
+        # treat cancellation as a timeout so we still emit a MetricData
+        metric.error = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        metric.success = False
+
+        if not ignore_errors:
+            raise
+
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params:
             metric.skipped = True

@@ -277,5 +296,6 @@ async def safe_a_measure(
         if ignore_errors:
             metric.error = str(e)
             metric.success = False  # Assuming you want to set success to False
+            logger.info("a metric was marked as errored")
         else:
             raise
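The new except asyncio.CancelledError branch records a cancellation (typically caused by a per-task timeout) on the metric as an error so a MetricData entry can still be emitted, and re-raises unless ignore_errors is set. A standalone sketch of that pattern under illustrative names, not deepeval's actual helpers:

    import asyncio


    async def run_metric(measure_coro, metric, ignore_errors: bool = False):
        # If the surrounding task is cancelled (for example by an outer timeout),
        # record the cancellation on the metric object before letting it
        # propagate, so an errored result can still be reported for it.
        try:
            return await measure_coro
        except asyncio.CancelledError:
            metric.error = "Timed out/cancelled while evaluating metric."
            metric.success = False
            if not ignore_errors:
                raise

When ignore_errors is set, the cancellation is swallowed and the metric simply surfaces as errored in the results, matching how the other error branches in safe_a_measure behave.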
deepeval/metrics/mcp/mcp_task_completion.py
CHANGED

@@ -112,7 +112,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if not test_case.mcp_servers:
                 error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MCPTaskCompletionMetric' metric."

@@ -241,8 +244,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         return tasks
 
     def _calculate_score(self, scores: List[TaskScore]) -> float:
+        score_divsor = len(scores) if len(scores) > 0 else 1
         total_score = sum(score.score for score in scores)
-
+        score = total_score / score_divsor
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def is_successful(self) -> bool:
         if self.error is not None:
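The score_divsor guard prevents a ZeroDivisionError when a conversation yields no scorable tasks, so an empty score list now averages to 0.0 rather than raising. A standalone sketch of the pattern (names illustrative):

    from typing import List


    def safe_average(scores: List[float]) -> float:
        # Fall back to a divisor of 1 so an empty list yields 0.0 instead of raising.
        divisor = len(scores) if len(scores) > 0 else 1
        return sum(scores) / divisor


    print(safe_average([]))          # 0.0
    print(safe_average([0.5, 1.0]))  # 0.75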
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
CHANGED

@@ -125,7 +125,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if not test_case.mcp_servers:
                 error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MultiTurnMCPUseMetric' metric."

@@ -312,13 +315,20 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         tool_accuracy_score: List[ToolScore],
         args_accuracy_score: List[ArgsScore],
     ) -> float:
-
-            tool_accuracy_score
+        tool_divisor = (
+            len(tool_accuracy_score) if len(tool_accuracy_score) > 0 else 1
         )
-
-            args_accuracy_score
+        args_divisor = (
+            len(args_accuracy_score) if len(args_accuracy_score) > 0 else 1
         )
-
+        tool_score = (
+            sum(score.score for score in tool_accuracy_score) / tool_divisor
+        )
+        args_score = (
+            sum(score.score for score in args_accuracy_score) / args_divisor
+        )
+        score = min(tool_score, args_score)
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def _generate_reason(
         self,
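The multi-turn MCP score is now the weaker of the tool-accuracy and argument-accuracy averages, and strict_mode collapses any below-threshold score to 0. A worked example with illustrative verdict values:

    tool_scores = [1.0, 0.5]  # per-tool-call accuracy verdicts
    args_scores = [1.0]       # per-argument accuracy verdicts

    tool_score = sum(tool_scores) / (len(tool_scores) if len(tool_scores) > 0 else 1)  # 0.75
    args_score = sum(args_scores) / (len(args_scores) if len(args_scores) > 0 else 1)  # 1.0
    score = min(tool_score, args_score)  # the weaker dimension wins: 0.75

    strict_mode, threshold = True, 0.8
    final_score = 0 if strict_mode and score < threshold else score
    print(score, final_score)  # 0.75 0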
deepeval/metrics/mcp_use_metric/mcp_use_metric.py
CHANGED

@@ -271,9 +271,10 @@ class MCPUseMetric(BaseMetric):
         primitives_used_score: MCPPrimitivesScore,
         argument_correctness_score: MCPArgsScore,
     ) -> float:
-
+        score = min(
             primitives_used_score.score, argument_correctness_score.score
         )
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def _get_reason(
         self,
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import Optional, List, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import MLLMTestCase
 from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
     MultiModalContextualPrecisionTemplate,
 )

@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.test_case import LLMTestCaseParams
 from deepeval.models import DeepEvalBaseMLLM
-
+import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
 from deepeval.metrics.indicator import metric_progress_indicator
 
 

@@ -72,7 +72,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 )
             )
         else:
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                 self._generate_verdicts(
                     test_case.input,
                     test_case.expected_output,

@@ -110,7 +110,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,

@@ -130,12 +130,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         return self.score
 
-    async def _a_generate_reason(self, input: str):
+    async def _a_generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(

@@ -146,15 +146,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt,
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt,
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason

@@ -163,12 +165,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 data = trimAndLoadJson(res, self)
                 return data["reason"]
 
-    def _generate_reason(self, input: str):
+    def _generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(

@@ -179,15 +181,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt,
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     self.model.generate(
-                        prompt,
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason

@@ -198,21 +202,23 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(
+            res, cost = await self.model.a_generate(
+                prompt, schema=mcpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: mcpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=mcpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts

@@ -220,34 +226,36 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
 
     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(
+                res: mcpschema.Verdicts = self.model.generate(
+                    prompt, schema=mcpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
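In the hunks above, every judge-model call now passes an explicit schema= Pydantic model from the metric's own schema module (imported as mcpschema), while the existing except TypeError fallback keeps supporting custom models whose generate() does not accept a schema keyword by re-parsing raw JSON. A standalone sketch of that call pattern with a hypothetical model object, not deepeval's actual classes:

    import json

    from pydantic import BaseModel


    class ScoreReason(BaseModel):
        reason: str


    def generate_reason(model, prompt: str) -> str:
        # Preferred path: a model whose generate() accepts a schema argument and
        # returns a parsed ScoreReason instance.
        try:
            res: ScoreReason = model.generate(prompt, schema=ScoreReason)
            return res.reason
        except TypeError:
            # Fallback for custom models without a schema parameter: parse the
            # raw JSON string returned by generate() instead.
            return json.loads(model.generate(prompt))["reason"]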
@@ -284,7 +292,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 

deepeval/metrics/plan_adherence/__init__.py
ADDED

@@ -0,0 +1 @@
+from .plan_adherence import PlanAdherenceMetric
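The one-line __init__ makes the new metric importable from its subpackage; the other metric packages added in this release (goal_accuracy, plan_quality, step_efficiency, tool_use, topic_adherence) each gain a similar one-line __init__ per the file list above. A minimal import sketch; the constructor call is an assumption modeled on the other deepeval metrics, not verified against the new code:

    from deepeval.metrics.plan_adherence import PlanAdherenceMetric

    # Assumption: constructed like most deepeval metrics, with a default threshold.
    metric = PlanAdherenceMetric()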
|