deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/goal_accuracy/template.py (new file)
@@ -0,0 +1,235 @@
+ from typing import List
+ import textwrap
+
+
+ class GoalAccuracyTemplate:
+ @staticmethod
+ def get_accuracy_score(task, steps_taken):
+ return textwrap.dedent(
+ f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
+
+ PURPOSE:
+
+ Evaluate whether the assistant's **visible output** (what the user actually saw) **fully and correctly achieved the user's stated goal.
+ Ignore internal reasoning, hidden tool calls, or retriever outputs unless their results were explicitly surfaced to the user.
+
+ The evaluation must be **strict and adversarial** — if the goal is not *clearly, fully, and correctly achieved*, assign a low score.
+
+ EVALUATION RULES
+
+ 1. **User-visible fulfillment only**
+ - Base your judgment solely on what the user would see in the assistant's message.
+ - Ignore hidden or internal steps unless their results were explicitly communicated.
+
+ 2. **Goal completion**
+ - The assistant must explicitly provide everything the user asked for.
+ - If even one subpart of the task is missing, incomplete, or vague, the score must be **≤ 0.5**.
+
+ 3. **Correctness and relevance**
+ - The information provided must be factually correct and directly relevant to the task.
+ - Hallucinated or unrelated content automatically lowers the score.
+
+ 4. **Self-sufficiency**
+ - The visible response must stand on its own; the user should not need prior context or follow-up clarification.
+
+ 5. **Strict bias toward failure**
+ - When uncertain, assume the goal was **not achieved**.
+ - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
+
+ SCORING GUIDE:
+
+ - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
+ - **0.75** → Mostly achieved; minor omissions or trivial inaccuracies.
+ - **0.5** → Partially achieved; core goal addressed, but key parts missing or incorrect.
+ - **0.25** → Weak attempt; loosely related but fails to satisfy the user’s request.
+ - **0.0** → Goal not achieved at all; irrelevant, wrong, or missing answer.
+
+ *When in doubt, choose the lower score.*
+
+ OUTPUT FORMAT:
+
+ Return only a valid JSON object with this structure:
+
+ {{
+ "score": 0.0,
+ "reason": "1-3 factual sentences explaining what parts of the user's goal were or were not achieved."
+ }}
+
+ The reason must:
+ - Be objective and concise.
+ - Refer to **specific missing or incorrect elements**.
+ - Avoid vague language (“somewhat correct”, “pretty accurate”).
+
+ EXAMPLES:
+
+ **Example 1**
+ Task: "Translate 'good night' into French."
+ Assistant Reply: "Bonne nuit."
+
+ {{
+ "score": 1.0,
+ "reason": "The assistant provided the exact, correct translation requested by the user."
+ }}
+
+ **Example 2**
+ Task: "List three renewable energy sources."
+ Assistant Reply: "Solar and wind energy."
+
+ {{
+ "score": 0.5,
+ "reason": "The assistant only listed two sources instead of three, so the goal was partially achieved."
+ }}
+
+ **Example 3**
+ Task: "Summarize this paragraph."
+ Assistant Reply: "It talks about technology."
+
+ {{
+ "score": 0.25,
+ "reason": "The summary is too vague and fails to convey key information from the text."
+ }}
+
+ *** END OF EXAMPLES ***
+
+ USER TASK:
+ {task}
+
+ AGENT STEPS:
+ {steps_taken}
+
+ JSON:
+ """
+ )
+
+ @staticmethod
+ def get_plan_evaluation_score(task, steps_taken):
+ return textwrap.dedent(
+ f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
+
+ OBJECTIVE:
+
+ Evaluate:
+
+ 1. **Plan Quality** — Was the agent's plan clear, complete, and logically structured to fully address the user's task?
+ 2. **Plan Adherence** — Did the agent consistently follow that plan without unjustified deviations, omissions, or extraneous steps?
+
+ Your judgment must be strict: a plan must be well-formed and execution must align with it for a high score.
+
+ EVALUATION CRITERIA
+
+ - Plan Quality:
+ - The plan should explicitly or implicitly outline all necessary steps to fulfill the user's task.
+ - It must be logically ordered, neither vague nor overly generic.
+ - Missing critical components or unclear structuring lowers the score drastically.
+
+ - Plan Adherence:
+ - Execution must closely match the planned steps.
+ - Any skipped, added, or rearranged steps without clear justification count as plan deviations.
+ - Minor, justified variations are acceptable but reduce the score slightly.
+
+ - General Rules:
+ - If no discernible plan exists, score ≤ 0.5 regardless of task completion.
+ - Tool use should be coherent within the plan, not ad hoc or speculative.
+ - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
+
+ SCORING GUIDE:
+
+ - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
+ - **0.75** → Mostly clear plan with minor omissions or small execution deviations that do not impact the overall strategy.
+ - **0.5** → Partial plan exists but is incomplete, vague, or only partially followed; notable deviations present.
+ - **0.25** → Weak or fragmented plan; execution frequently diverges or lacks coherence with any strategy.
+ - **0.0** → No evidence of a plan; execution appears random or unrelated to the user's task.
+
+ INSTRUCTIONS:
+
+ 1. Identify the agent's plan from the steps taken (explicit plans stated or implicit structure).
+ 2. Assess plan completeness and logical order relative to the user's task.
+ 3. Compare execution steps against the plan to check for adherence, noting any unjustified deviations.
+ 4. Deduct points for vagueness, missing critical steps, or inconsistent execution.
+
+ OUTPUT FORMAT:
+
+ Return only a valid JSON object with exactly two fields:
+
+ {{
+ "score": 0.0,
+ "reason": "1-3 concise sentences explaining the quality of the plan and how well execution matched it. Specify missing or extra steps, plan clarity, and adherence issues."
+ }}
+
+ EXAMPLE:
+
+ User Task: "Plan a business trip including booking a flight, hotel, and preparing an agenda."
+
+ Agent Steps include:
+ - Outlined flight, hotel, and agenda steps explicitly.
+ - Executed flight and hotel booking steps.
+ - Skipped agenda preparation despite mentioning it in the plan.
+
+ Example JSON:
+
+ {{
+ "score": 0.75,
+ "reason": "The agent formed a clear plan covering flights, hotel, and agenda, but failed to execute the agenda preparation step, reducing adherence."
+ }}
+
+ **** END OF EXAMPLE ****
+
+ INPUTS:
+
+ USER TASK:
+ {task}
+
+ AGENT STEPS:
+ {steps_taken}
+
+ JSON:
+ """
+ )
+
+ @staticmethod
+ def get_final_reason(
+ final_score, threshold, goal_evaluations, plan_evalautions
+ ):
+ return textwrap.dedent(
+ f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
+
+ You are given:
+ - An agent's goal execution scores and reasons.
+ - The agent's plan evaluation scores and reasons.
+ - The **final combined score**.
+ - The **threshold** required to pass.
+ - Whether the result is a **pass** or **fail**.
+
+ Your job is to write a short, precise explanation of **why** the agent passed or failed — taking into account the quality of execution and planning, and the threshold.
+
+ ---
+
+ INSTRUCTIONS:
+
+ - Write 2-4 clear, objective sentences explaining the overall result.
+ - Explicitly reference both the task and plan performance — **both must be addressed**.
+ - Mention how the final score compares to the threshold.
+ - If the agent **passed**, highlight how both task execution and planning were sufficient to meet the goal.
+ - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
+ - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
+
+ ---
+
+ FORMAT:
+ Return only a single string. Do **not** include JSON or any extra formatting.
+
+ ---
+
+ Goal evaluations:
+ {goal_evaluations}
+
+ Plan evaluations:
+ {plan_evalautions}
+
+ Final Score: {final_score}
+ Threshold: {threshold}
+ Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+ Final Reason:
+ """
+ )
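
The new GoalAccuracyTemplate only assembles judge prompts; the scoring logic lives in deepeval/metrics/goal_accuracy/goal_accuracy.py (also added in this release, not expanded here). A minimal sketch of rendering one of these prompts, using made-up task and step values:

```python
# Minimal sketch: render the goal-accuracy judge prompt introduced in 3.6.9.
# The task/steps strings are illustrative examples; only the class and method
# names come from the diff above.
from deepeval.metrics.goal_accuracy.template import GoalAccuracyTemplate

task = "List three renewable energy sources."
steps_taken = "Assistant replied: 'Solar and wind energy.'"

prompt = GoalAccuracyTemplate.get_accuracy_score(task, steps_taken)
print(prompt)  # ends with "JSON:"; the judge model is expected to reply
               # with {"score": ..., "reason": "..."}
```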
deepeval/metrics/hallucination/hallucination.py
@@ -18,14 +18,14 @@ from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.hallucination.schema import *
  from deepeval.metrics.api import metric_data_manager
 
- required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.CONTEXT,
- ]
-
 
  class HallucinationMetric(BaseMetric):
+ _required_params: List[LLMTestCaseParams] = [
+ LLMTestCaseParams.INPUT,
+ LLMTestCaseParams.ACTUAL_OUTPUT,
+ LLMTestCaseParams.CONTEXT,
+ ]
+
  def __init__(
  self,
  threshold: float = 0.5,
@@ -55,7 +55,7 @@ class HallucinationMetric(BaseMetric):
  _log_metric_to_confident: bool = True,
  ) -> float:
 
- check_llm_test_case_params(test_case, required_params, self)
+ check_llm_test_case_params(test_case, self._required_params, self)
 
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -102,7 +102,7 @@ class HallucinationMetric(BaseMetric):
  _log_metric_to_confident: bool = True,
  ) -> float:
 
- check_llm_test_case_params(test_case, required_params, self)
+ check_llm_test_case_params(test_case, self._required_params, self)
 
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
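
The hallucination change moves the module-level required_params list onto the class as _required_params, so each metric carries its own validation list instead of sharing a module-level global. A rough sketch of the resulting pattern, assuming (as elsewhere in deepeval) that the LLMTestCaseParams enum values mirror LLMTestCase attribute names:

```python
# Sketch of the class-attribute pattern 3.6.9 adopts for required test case
# params. MyMetric is an illustrative stand-in, not a deepeval class; the
# validation below only approximates what check_llm_test_case_params does.
from typing import List
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

class MyMetric:
    _required_params: List[LLMTestCaseParams] = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.CONTEXT,
    ]

    def measure(self, test_case: LLMTestCase) -> None:
        # assumes enum values ("input", "actual_output", ...) match attribute names
        missing = [
            p.value for p in self._required_params
            if getattr(test_case, p.value, None) is None
        ]
        if missing:
            raise ValueError(f"Missing required test case params: {missing}")
```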
deepeval/metrics/indicator.py
@@ -18,6 +18,10 @@ from deepeval.test_run.cache import CachedTestCase, Cache
  from deepeval.telemetry import capture_metric_type
  from deepeval.utils import update_pbar
 
+ import logging
+
+ logger = logging.getLogger(__name__)
+
 
  def format_metric_description(
  metric: Union[BaseMetric, BaseConversationalMetric, BaseArenaMetric],
@@ -43,7 +47,7 @@ def metric_progress_indicator(
  _show_indicator: bool = True,
  _in_component: bool = False,
  ):
- captured_async_mode = False if async_mode == None else async_mode
+ captured_async_mode = False if async_mode is None else async_mode
  with capture_metric_type(
  metric.__name__,
  async_mode=captured_async_mode,
@@ -250,6 +254,21 @@ async def safe_a_measure(
  _log_metric_to_confident=False,
  )
  update_pbar(progress, pbar_eval_id)
+
+ except asyncio.CancelledError:
+ logger.info("caught asyncio.CancelledError")
+
+ # treat cancellation as a timeout so we still emit a MetricData
+ metric.error = (
+ "Timed out/cancelled while evaluating metric. "
+ "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+ "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ )
+ metric.success = False
+
+ if not ignore_errors:
+ raise
+
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  metric.skipped = True
@@ -277,5 +296,6 @@ async def safe_a_measure(
  if ignore_errors:
  metric.error = str(e)
  metric.success = False # Assuming you want to set success to False
+ logger.info("a metric was marked as errored")
  else:
  raise
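
The safe_a_measure change catches asyncio.CancelledError (typically raised when a per-task timeout cancels the evaluation coroutine), records it on the metric so a MetricData entry is still emitted, and re-raises unless ignore_errors is set. A stripped-down sketch of that pattern outside deepeval; DummyMetric and run_metric are illustrative stand-ins, not deepeval APIs:

```python
# Generic sketch of the cancellation-as-timeout handling added to safe_a_measure.
import asyncio
from typing import Optional

class DummyMetric:
    error: Optional[str] = None
    success: Optional[bool] = None

async def run_metric(metric: DummyMetric, coro, ignore_errors: bool = False):
    try:
        await coro
        metric.success = True
    except asyncio.CancelledError:
        # record the cancellation instead of letting the metric silently vanish
        metric.error = "Timed out/cancelled while evaluating metric."
        metric.success = False
        if not ignore_errors:
            raise

async def main():
    metric = DummyMetric()
    task = asyncio.create_task(
        run_metric(metric, asyncio.sleep(10), ignore_errors=True)
    )
    await asyncio.sleep(0.1)
    task.cancel()
    await task  # cancellation was absorbed; the metric still reports an error
    print(metric.error, metric.success)

asyncio.run(main())
```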
deepeval/metrics/mcp/mcp_task_completion.py
@@ -112,7 +112,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
 
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, async_mode=True, _show_indicator=_show_indicator
+ self,
+ async_mode=True,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if not test_case.mcp_servers:
  error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MCPTaskCompletionMetric' metric."
@@ -241,8 +244,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
  return tasks
 
  def _calculate_score(self, scores: List[TaskScore]) -> float:
+ score_divsor = len(scores) if len(scores) > 0 else 1
  total_score = sum(score.score for score in scores)
- return total_score / len(scores)
+ score = total_score / score_divsor
+ return 0 if self.strict_mode and score < self.threshold else score
 
  def is_successful(self) -> bool:
  if self.error is not None:
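
This metric and MultiTurnMCPUseMetric below gain the same two scoring guards: an empty score list now divides by 1 instead of raising ZeroDivisionError, and in strict mode any score under the threshold is clamped to 0. MCPUseMetric further down picks up the strict-mode clamp as well. The real methods operate on TaskScore/ToolScore/ArgsScore objects; the plain-float sketch below (a hypothetical calculate_score helper) shows just the logic:

```python
# Plain-float sketch of the scoring guards added to the MCP metrics' _calculate_score.
from typing import List

def calculate_score(scores: List[float], strict_mode: bool, threshold: float) -> float:
    divisor = len(scores) if len(scores) > 0 else 1  # empty list -> 0.0, not a crash
    score = sum(scores) / divisor
    # strict mode turns any sub-threshold score into a hard 0
    return 0 if strict_mode and score < threshold else score

assert calculate_score([], strict_mode=False, threshold=0.5) == 0
assert calculate_score([0.25, 0.25], strict_mode=True, threshold=0.5) == 0
assert calculate_score([0.5, 1.0], strict_mode=True, threshold=0.5) == 0.75
```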
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
@@ -125,7 +125,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
 
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
- self, async_mode=True, _show_indicator=_show_indicator
+ self,
+ async_mode=True,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  if not test_case.mcp_servers:
  error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MultiTurnMCPUseMetric' metric."
@@ -312,13 +315,20 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
  tool_accuracy_score: List[ToolScore],
  args_accuracy_score: List[ArgsScore],
  ) -> float:
- tool_score = sum(score.score for score in tool_accuracy_score) / len(
- tool_accuracy_score
+ tool_divisor = (
+ len(tool_accuracy_score) if len(tool_accuracy_score) > 0 else 1
  )
- args_score = sum(score.score for score in args_accuracy_score) / len(
- args_accuracy_score
+ args_divisor = (
+ len(args_accuracy_score) if len(args_accuracy_score) > 0 else 1
  )
- return min(tool_score, args_score)
+ tool_score = (
+ sum(score.score for score in tool_accuracy_score) / tool_divisor
+ )
+ args_score = (
+ sum(score.score for score in args_accuracy_score) / args_divisor
+ )
+ score = min(tool_score, args_score)
+ return 0 if self.strict_mode and score < self.threshold else score
 
  def _generate_reason(
  self,
deepeval/metrics/mcp_use_metric/mcp_use_metric.py
@@ -271,9 +271,10 @@ class MCPUseMetric(BaseMetric):
  primitives_used_score: MCPPrimitivesScore,
  argument_correctness_score: MCPArgsScore,
  ) -> float:
- return min(
+ score = min(
  primitives_used_score.score, argument_correctness_score.score
  )
+ return 0 if self.strict_mode and score < self.threshold else score
 
  def _get_reason(
  self,
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
@@ -1,7 +1,7 @@
  from typing import Optional, List, Union
 
  from deepeval.metrics import BaseMultimodalMetric
- from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+ from deepeval.test_case import MLLMTestCase
  from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
  MultiModalContextualPrecisionTemplate,
  )
@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.test_case import LLMTestCaseParams
  from deepeval.models import DeepEvalBaseMLLM
- from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema import *
+ import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
  from deepeval.metrics.indicator import metric_progress_indicator
 
 
@@ -72,7 +72,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  )
  )
  else:
- self.verdicts: List[ContextualPrecisionVerdict] = (
+ self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
  self._generate_verdicts(
  test_case.input,
  test_case.expected_output,
@@ -110,7 +110,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  _show_indicator=_show_indicator,
  _in_component=_in_component,
  ):
- self.verdicts: List[ContextualPrecisionVerdict] = (
+ self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
  await self._a_generate_verdicts(
  test_case.input,
  test_case.expected_output,
@@ -130,12 +130,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
  return self.score
 
- async def _a_generate_reason(self, input: str):
+ async def _a_generate_reason(self, input: str) -> Optional[str]:
  if self.include_reason is False:
  return None
 
  retrieval_contexts_verdicts = [
- {"verdict": verdict.verdict, "reasons": verdict.reason}
+ {"verdict": verdict.verdict, "reason": verdict.reason}
  for verdict in self.verdicts
  ]
  prompt = MultiModalContextualPrecisionTemplate.generate_reason(
@@ -146,15 +146,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
  if self.using_native_model:
  res, cost = await self.model.a_generate(
- prompt, schema=MultimodelContextualPrecisionScoreReason
+ prompt,
+ schema=mcpschema.MultimodelContextualPrecisionScoreReason,
  )
  self.evaluation_cost += cost
  return res.reason
  else:
  try:
- res: MultimodelContextualPrecisionScoreReason = (
+ res: mcpschema.MultimodelContextualPrecisionScoreReason = (
  await self.model.a_generate(
- prompt, schema=MultimodelContextualPrecisionScoreReason
+ prompt,
+ schema=mcpschema.MultimodelContextualPrecisionScoreReason,
  )
  )
  return res.reason
@@ -163,12 +165,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  data = trimAndLoadJson(res, self)
  return data["reason"]
 
- def _generate_reason(self, input: str):
+ def _generate_reason(self, input: str) -> Optional[str]:
  if self.include_reason is False:
  return None
 
  retrieval_contexts_verdicts = [
- {"verdict": verdict.verdict, "reasons": verdict.reason}
+ {"verdict": verdict.verdict, "reason": verdict.reason}
  for verdict in self.verdicts
  ]
  prompt = MultiModalContextualPrecisionTemplate.generate_reason(
@@ -179,15 +181,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
  if self.using_native_model:
  res, cost = self.model.generate(
- prompt, schema=MultimodelContextualPrecisionScoreReason
+ prompt,
+ schema=mcpschema.MultimodelContextualPrecisionScoreReason,
  )
  self.evaluation_cost += cost
  return res.reason
  else:
  try:
- res: MultimodelContextualPrecisionScoreReason = (
+ res: mcpschema.MultimodelContextualPrecisionScoreReason = (
  self.model.generate(
- prompt, schema=MultimodelContextualPrecisionScoreReason
+ prompt,
+ schema=mcpschema.MultimodelContextualPrecisionScoreReason,
  )
  )
  return res.reason
@@ -198,21 +202,23 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
  async def _a_generate_verdicts(
  self, input: str, expected_output: str, retrieval_context: List[str]
- ) -> List[ContextualPrecisionVerdict]:
+ ) -> List[mcpschema.ContextualPrecisionVerdict]:
  prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
  input=input,
  expected_output=expected_output,
  retrieval_context=retrieval_context,
  )
  if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+ res, cost = await self.model.a_generate(
+ prompt, schema=mcpschema.Verdicts
+ )
  self.evaluation_cost += cost
  verdicts = [item for item in res.verdicts]
  return verdicts
  else:
  try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
+ res: mcpschema.Verdicts = await self.model.a_generate(
+ prompt, schema=mcpschema.Verdicts
  )
  verdicts = [item for item in res.verdicts]
  return verdicts
@@ -220,34 +226,36 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  res = await self.model.a_generate(prompt)
  data = trimAndLoadJson(res, self)
  verdicts = [
- ContextualPrecisionVerdict(**item)
+ mcpschema.ContextualPrecisionVerdict(**item)
  for item in data["verdicts"]
  ]
  return verdicts
 
  def _generate_verdicts(
  self, input: str, expected_output: str, retrieval_context: List[str]
- ) -> List[ContextualPrecisionVerdict]:
+ ) -> List[mcpschema.ContextualPrecisionVerdict]:
  prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
  input=input,
  expected_output=expected_output,
  retrieval_context=retrieval_context,
  )
  if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Verdicts)
+ res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
  self.evaluation_cost += cost
  verdicts = [item for item in res.verdicts]
  return verdicts
  else:
  try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+ res: mcpschema.Verdicts = self.model.generate(
+ prompt, schema=mcpschema.Verdicts
+ )
  verdicts = [item for item in res.verdicts]
  return verdicts
  except TypeError:
  res = self.model.generate(prompt)
  data = trimAndLoadJson(res, self)
  verdicts = [
- ContextualPrecisionVerdict(**item)
+ mcpschema.ContextualPrecisionVerdict(**item)
  for item in data["verdicts"]
  ]
  return verdicts
@@ -284,7 +292,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success
 
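The multimodal contextual precision changes are mostly mechanical: the wildcard schema import becomes a module alias (mcpschema), so names like Verdicts and ContextualPrecisionVerdict can no longer be shadowed by another module's star import; the "reasons" key typo becomes "reason"; and the bare except narrows to except TypeError. A small sketch of the aliased-schema pattern; generate_verdicts below is a hypothetical helper, and the (result, cost) return shape follows the native-model branch shown in the diff:

```python
# Sketch of referencing schema models through a module alias instead of a
# star import (the 3.6.9 pattern). generate_verdicts is illustrative only.
import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema

def generate_verdicts(model, prompt: str):
    # native-model branch per the diff: generate() returns (result, cost)
    res, cost = model.generate(prompt, schema=mcpschema.Verdicts)
    return [item for item in res.verdicts], cost
```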
deepeval/metrics/plan_adherence/__init__.py (new file)
@@ -0,0 +1 @@
+ from .plan_adherence import PlanAdherenceMetric
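
The one-line __init__.py re-exports the new PlanAdherenceMetric (defined in plan_adherence/plan_adherence.py, +292 lines in the file list above). A hypothetical usage sketch, assuming the metric follows deepeval's usual convention of a threshold keyword and a measure(test_case) entry point; neither the constructor signature nor the expected test case type is shown in this diff:

```python
# Hypothetical usage of the re-exported PlanAdherenceMetric; the constructor
# signature and expected test case type are assumptions, not confirmed here.
from deepeval.metrics.plan_adherence import PlanAdherenceMetric
from deepeval.test_case import LLMTestCase

metric = PlanAdherenceMetric(threshold=0.5)  # assumed signature
test_case = LLMTestCase(
    input="Plan a business trip: book a flight, a hotel, and prepare an agenda.",
    actual_output="Booked the flight and hotel, then drafted a day-by-day agenda.",
)
metric.measure(test_case)  # assumed entry point
print(metric.score, metric.reason)
```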