deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/goal_accuracy/template.py
ADDED

@@ -0,0 +1,235 @@
+from typing import List
+import textwrap
+
+
+class GoalAccuracyTemplate:
+    @staticmethod
+    def get_accuracy_score(task, steps_taken):
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
+
+            PURPOSE:
+
+            Evaluate whether the assistant's **visible output** (what the user actually saw) **fully and correctly achieved the user's stated goal.
+            Ignore internal reasoning, hidden tool calls, or retriever outputs unless their results were explicitly surfaced to the user.
+
+            The evaluation must be **strict and adversarial** — if the goal is not *clearly, fully, and correctly achieved*, assign a low score.
+
+            EVALUATION RULES
+
+            1. **User-visible fulfillment only**
+                - Base your judgment solely on what the user would see in the assistant's message.
+                - Ignore hidden or internal steps unless their results were explicitly communicated.
+
+            2. **Goal completion**
+                - The assistant must explicitly provide everything the user asked for.
+                - If even one subpart of the task is missing, incomplete, or vague, the score must be **≤ 0.5**.
+
+            3. **Correctness and relevance**
+                - The information provided must be factually correct and directly relevant to the task.
+                - Hallucinated or unrelated content automatically lowers the score.
+
+            4. **Self-sufficiency**
+                - The visible response must stand on its own; the user should not need prior context or follow-up clarification.
+
+            5. **Strict bias toward failure**
+                - When uncertain, assume the goal was **not achieved**.
+                - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
+
+            SCORING GUIDE:
+
+            - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
+            - **0.75** → Mostly achieved; minor omissions or trivial inaccuracies.
+            - **0.5** → Partially achieved; core goal addressed, but key parts missing or incorrect.
+            - **0.25** → Weak attempt; loosely related but fails to satisfy the user’s request.
+            - **0.0** → Goal not achieved at all; irrelevant, wrong, or missing answer.
+
+            *When in doubt, choose the lower score.*
+
+            OUTPUT FORMAT:
+
+            Return only a valid JSON object with this structure:
+
+            {{
+                "score": 0.0,
+                "reason": "1-3 factual sentences explaining what parts of the user's goal were or were not achieved."
+            }}
+
+            The reason must:
+            - Be objective and concise.
+            - Refer to **specific missing or incorrect elements**.
+            - Avoid vague language (“somewhat correct”, “pretty accurate”).
+
+            EXAMPLES:
+
+            **Example 1**
+            Task: "Translate 'good night' into French."
+            Assistant Reply: "Bonne nuit."
+            →
+            {{
+                "score": 1.0,
+                "reason": "The assistant provided the exact, correct translation requested by the user."
+            }}
+
+            **Example 2**
+            Task: "List three renewable energy sources."
+            Assistant Reply: "Solar and wind energy."
+            →
+            {{
+                "score": 0.5,
+                "reason": "The assistant only listed two sources instead of three, so the goal was partially achieved."
+            }}
+
+            **Example 3**
+            Task: "Summarize this paragraph."
+            Assistant Reply: "It talks about technology."
+            →
+            {{
+                "score": 0.25,
+                "reason": "The summary is too vague and fails to convey key information from the text."
+            }}
+
+            *** END OF EXAMPLES ***
+
+            USER TASK:
+            {task}
+
+            AGENT STEPS:
+            {steps_taken}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_plan_evaluation_score(task, steps_taken):
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
+
+            OBJECTIVE:
+
+            Evaluate:
+
+            1. **Plan Quality** — Was the agent's plan clear, complete, and logically structured to fully address the user's task?
+            2. **Plan Adherence** — Did the agent consistently follow that plan without unjustified deviations, omissions, or extraneous steps?
+
+            Your judgment must be strict: a plan must be well-formed and execution must align with it for a high score.
+
+            EVALUATION CRITERIA
+
+            - Plan Quality:
+                - The plan should explicitly or implicitly outline all necessary steps to fulfill the user's task.
+                - It must be logically ordered, neither vague nor overly generic.
+                - Missing critical components or unclear structuring lowers the score drastically.
+
+            - Plan Adherence:
+                - Execution must closely match the planned steps.
+                - Any skipped, added, or rearranged steps without clear justification count as plan deviations.
+                - Minor, justified variations are acceptable but reduce the score slightly.
+
+            - General Rules:
+                - If no discernible plan exists, score ≤ 0.5 regardless of task completion.
+                - Tool use should be coherent within the plan, not ad hoc or speculative.
+                - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
+
+            SCORING GUIDE:
+
+            - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
+            - **0.75** → Mostly clear plan with minor omissions or small execution deviations that do not impact the overall strategy.
+            - **0.5** → Partial plan exists but is incomplete, vague, or only partially followed; notable deviations present.
+            - **0.25** → Weak or fragmented plan; execution frequently diverges or lacks coherence with any strategy.
+            - **0.0** → No evidence of a plan; execution appears random or unrelated to the user's task.
+
+            INSTRUCTIONS:
+
+            1. Identify the agent's plan from the steps taken (explicit plans stated or implicit structure).
+            2. Assess plan completeness and logical order relative to the user's task.
+            3. Compare execution steps against the plan to check for adherence, noting any unjustified deviations.
+            4. Deduct points for vagueness, missing critical steps, or inconsistent execution.
+
+            OUTPUT FORMAT:
+
+            Return only a valid JSON object with exactly two fields:
+
+            {{
+                "score": 0.0,
+                "reason": "1-3 concise sentences explaining the quality of the plan and how well execution matched it. Specify missing or extra steps, plan clarity, and adherence issues."
+            }}
+
+            EXAMPLE:
+
+            User Task: "Plan a business trip including booking a flight, hotel, and preparing an agenda."
+
+            Agent Steps include:
+            - Outlined flight, hotel, and agenda steps explicitly.
+            - Executed flight and hotel booking steps.
+            - Skipped agenda preparation despite mentioning it in the plan.
+
+            Example JSON:
+
+            {{
+                "score": 0.75,
+                "reason": "The agent formed a clear plan covering flights, hotel, and agenda, but failed to execute the agenda preparation step, reducing adherence."
+            }}
+
+            **** END OF EXAMPLE ****
+
+            INPUTS:
+
+            USER TASK:
+            {task}
+
+            AGENT STEPS:
+            {steps_taken}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_final_reason(
+        final_score, threshold, goal_evaluations, plan_evalautions
+    ):
+        return textwrap.dedent(
+            f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
+
+            You are given:
+            - An agent's goal execution scores and reasons.
+            - The agent's plan evaluation scores and reasons.
+            - The **final combined score**.
+            - The **threshold** required to pass.
+            - Whether the result is a **pass** or **fail**.
+
+            Your job is to write a short, precise explanation of **why** the agent passed or failed — taking into account the quality of execution and planning, and the threshold.
+
+            ---
+
+            INSTRUCTIONS:
+
+            - Write 2-4 clear, objective sentences explaining the overall result.
+            - Explicitly reference both the task and plan performance — **both must be addressed**.
+            - Mention how the final score compares to the threshold.
+            - If the agent **passed**, highlight how both task execution and planning were sufficient to meet the goal.
+            - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
+            - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
+
+            ---
+
+            FORMAT:
+            Return only a single string. Do **not** include JSON or any extra formatting.
+
+            ---
+
+            Goal evaluations:
+            {goal_evaluations}
+
+            Plan evaluations:
+            {plan_evalautions}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
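Note: GoalAccuracyTemplate only builds the judge prompts; the metric that consumes them is the new deepeval/metrics/goal_accuracy/goal_accuracy.py listed above. A minimal sketch of rendering one prompt directly (the task and steps values are illustrative):

    from deepeval.metrics.goal_accuracy.template import GoalAccuracyTemplate

    # Render the goal-accuracy judge prompt for a single interaction.
    prompt = GoalAccuracyTemplate.get_accuracy_score(
        task="Translate 'good night' into French.",
        steps_taken="1. Assistant replied: 'Bonne nuit.'",
    )
    # The prompt ends with "JSON:", so the judge model is expected to reply with
    # an object of the form {"score": <0.0-1.0>, "reason": "..."}.
    print(prompt)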
deepeval/metrics/hallucination/hallucination.py
CHANGED

@@ -18,14 +18,14 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.hallucination.schema import *
 from deepeval.metrics.api import metric_data_manager
 
-required_params: List[LLMTestCaseParams] = [
-    LLMTestCaseParams.INPUT,
-    LLMTestCaseParams.ACTUAL_OUTPUT,
-    LLMTestCaseParams.CONTEXT,
-]
-
 
 class HallucinationMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.CONTEXT,
+    ]
+
     def __init__(
         self,
         threshold: float = 0.5,

@@ -55,7 +55,7 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case,
+        check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -102,7 +102,7 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case,
+        check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
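With required_params relocated onto the class as _required_params, the parameter check at the start of measure() and a_measure() is now scoped per metric. A minimal usage sketch of the code path that runs this check; the test-case values are illustrative, and actually calling measure() assumes a configured judge model (for example an OpenAI key):

    from deepeval.metrics import HallucinationMetric
    from deepeval.test_case import LLMTestCase

    metric = HallucinationMetric(threshold=0.5)
    test_case = LLMTestCase(
        input="Who wrote Hamlet?",
        actual_output="Hamlet was written by Christopher Marlowe.",
        context=["Hamlet is a tragedy written by William Shakespeare."],
    )
    # check_llm_test_case_params(test_case, self._required_params, self) runs first;
    # omitting context would raise a missing-parameters error for this metric.
    metric.measure(test_case)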
deepeval/metrics/indicator.py
CHANGED

@@ -18,6 +18,10 @@ from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 def format_metric_description(
     metric: Union[BaseMetric, BaseConversationalMetric, BaseArenaMetric],

@@ -43,7 +47,7 @@ def metric_progress_indicator(
     _show_indicator: bool = True,
     _in_component: bool = False,
 ):
-    captured_async_mode = False if async_mode
+    captured_async_mode = False if async_mode is None else async_mode
     with capture_metric_type(
         metric.__name__,
         async_mode=captured_async_mode,

@@ -250,6 +254,21 @@ async def safe_a_measure(
                 _log_metric_to_confident=False,
             )
         update_pbar(progress, pbar_eval_id)
+
+    except asyncio.CancelledError:
+        logger.info("caught asyncio.CancelledError")
+
+        # treat cancellation as a timeout so we still emit a MetricData
+        metric.error = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        metric.success = False
+
+        if not ignore_errors:
+            raise
+
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params:
             metric.skipped = True

@@ -277,5 +296,6 @@ async def safe_a_measure(
         if ignore_errors:
             metric.error = str(e)
             metric.success = False  # Assuming you want to set success to False
+            logger.info("a metric was marked as errored")
         else:
             raise
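The new except asyncio.CancelledError branch records a cancellation (typically caused by a per-task timeout) on the metric as an error so a MetricData entry can still be emitted, and re-raises unless ignore_errors is set. A standalone sketch of that pattern under illustrative names, not deepeval's actual helpers:

    import asyncio


    async def run_metric(measure_coro, metric, ignore_errors: bool = False):
        # If the surrounding task is cancelled (for example by an outer timeout),
        # record the cancellation on the metric object before letting it
        # propagate, so an errored result can still be reported for it.
        try:
            return await measure_coro
        except asyncio.CancelledError:
            metric.error = "Timed out/cancelled while evaluating metric."
            metric.success = False
            if not ignore_errors:
                raise

When ignore_errors is set, the cancellation is swallowed and the metric simply surfaces as errored in the results, matching how the other error branches in safe_a_measure behave.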
deepeval/metrics/mcp/mcp_task_completion.py
CHANGED

@@ -112,7 +112,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if not test_case.mcp_servers:
                 error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MCPTaskCompletionMetric' metric."

@@ -241,8 +244,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         return tasks
 
     def _calculate_score(self, scores: List[TaskScore]) -> float:
+        score_divsor = len(scores) if len(scores) > 0 else 1
         total_score = sum(score.score for score in scores)
-
+        score = total_score / score_divsor
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def is_successful(self) -> bool:
         if self.error is not None:
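The score_divsor guard prevents a ZeroDivisionError when a conversation yields no scorable tasks, so an empty score list now averages to 0.0 rather than raising. A standalone sketch of the pattern (names illustrative):

    from typing import List


    def safe_average(scores: List[float]) -> float:
        # Fall back to a divisor of 1 so an empty list yields 0.0 instead of raising.
        divisor = len(scores) if len(scores) > 0 else 1
        return sum(scores) / divisor


    print(safe_average([]))          # 0.0
    print(safe_average([0.5, 1.0]))  # 0.75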
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
CHANGED

@@ -125,7 +125,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if not test_case.mcp_servers:
                 error_str = "'mcp_servers' in a conversational test case cannot be empty for the 'MultiTurnMCPUseMetric' metric."

@@ -312,13 +315,20 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         tool_accuracy_score: List[ToolScore],
         args_accuracy_score: List[ArgsScore],
     ) -> float:
-
-            tool_accuracy_score
+        tool_divisor = (
+            len(tool_accuracy_score) if len(tool_accuracy_score) > 0 else 1
         )
-
-            args_accuracy_score
+        args_divisor = (
+            len(args_accuracy_score) if len(args_accuracy_score) > 0 else 1
         )
-
+        tool_score = (
+            sum(score.score for score in tool_accuracy_score) / tool_divisor
+        )
+        args_score = (
+            sum(score.score for score in args_accuracy_score) / args_divisor
+        )
+        score = min(tool_score, args_score)
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def _generate_reason(
         self,
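The multi-turn MCP score is now the weaker of the tool-accuracy and argument-accuracy averages, and strict_mode collapses any below-threshold score to 0. A worked example with illustrative verdict values:

    tool_scores = [1.0, 0.5]  # per-tool-call accuracy verdicts
    args_scores = [1.0]       # per-argument accuracy verdicts

    tool_score = sum(tool_scores) / (len(tool_scores) if len(tool_scores) > 0 else 1)  # 0.75
    args_score = sum(args_scores) / (len(args_scores) if len(args_scores) > 0 else 1)  # 1.0
    score = min(tool_score, args_score)  # the weaker dimension wins: 0.75

    strict_mode, threshold = True, 0.8
    final_score = 0 if strict_mode and score < threshold else score
    print(score, final_score)  # 0.75 0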
deepeval/metrics/mcp_use_metric/mcp_use_metric.py
CHANGED

@@ -271,9 +271,10 @@ class MCPUseMetric(BaseMetric):
         primitives_used_score: MCPPrimitivesScore,
         argument_correctness_score: MCPArgsScore,
     ) -> float:
-
+        score = min(
             primitives_used_score.score, argument_correctness_score.score
         )
+        return 0 if self.strict_mode and score < self.threshold else score
 
     def _get_reason(
         self,
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import Optional, List, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import MLLMTestCase
 from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
     MultiModalContextualPrecisionTemplate,
 )

@@ -14,7 +14,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.test_case import LLMTestCaseParams
 from deepeval.models import DeepEvalBaseMLLM
-
+import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
 from deepeval.metrics.indicator import metric_progress_indicator
 
 

@@ -72,7 +72,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 )
             )
         else:
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                 self._generate_verdicts(
                     test_case.input,
                     test_case.expected_output,

@@ -110,7 +110,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,

@@ -130,12 +130,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         return self.score
 
-    async def _a_generate_reason(self, input: str):
+    async def _a_generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(

@@ -146,15 +146,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt,
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt,
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason

@@ -163,12 +165,12 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 data = trimAndLoadJson(res, self)
                 return data["reason"]
 
-    def _generate_reason(self, input: str):
+    def _generate_reason(self, input: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = MultiModalContextualPrecisionTemplate.generate_reason(

@@ -179,15 +181,17 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt,
+                prompt,
+                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: MultimodelContextualPrecisionScoreReason = (
+                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
                     self.model.generate(
-                        prompt,
+                        prompt,
+                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
                     )
                 )
                 return res.reason

@@ -198,21 +202,23 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(
+            res, cost = await self.model.a_generate(
+                prompt, schema=mcpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: mcpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=mcpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts

@@ -220,34 +226,36 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
 
     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[mcpschema.ContextualPrecisionVerdict]:
         prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(
+                res: mcpschema.Verdicts = self.model.generate(
+                    prompt, schema=mcpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    mcpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
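In the hunks above, every judge-model call now passes an explicit schema= Pydantic model from the metric's own schema module (imported as mcpschema), while the existing except TypeError fallback keeps supporting custom models whose generate() does not accept a schema keyword by re-parsing raw JSON. A standalone sketch of that call pattern with a hypothetical model object, not deepeval's actual classes:

    import json

    from pydantic import BaseModel


    class ScoreReason(BaseModel):
        reason: str


    def generate_reason(model, prompt: str) -> str:
        # Preferred path: a model whose generate() accepts a schema argument and
        # returns a parsed ScoreReason instance.
        try:
            res: ScoreReason = model.generate(prompt, schema=ScoreReason)
            return res.reason
        except TypeError:
            # Fallback for custom models without a schema parameter: parse the
            # raw JSON string returned by generate() instead.
            return json.loads(model.generate(prompt))["reason"]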
@@ -284,7 +292,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 

deepeval/metrics/plan_adherence/__init__.py
ADDED

@@ -0,0 +1 @@
+from .plan_adherence import PlanAdherenceMetric
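The one-line __init__ makes the new metric importable from its subpackage; the other metric packages added in this release (goal_accuracy, plan_quality, step_efficiency, tool_use, topic_adherence) each gain a similar one-line __init__ per the file list above. A minimal import sketch; the constructor call is an assumption modeled on the other deepeval metrics, not verified against the new code:

    from deepeval.metrics.plan_adherence import PlanAdherenceMetric

    # Assumption: constructed like most deepeval metrics, with a default threshold.
    metric = PlanAdherenceMetric()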
|