deepeval-3.6.6-py3-none-any.whl → deepeval-3.6.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,10 +1,15 @@
-from typing import List, Dict
+from typing import List, Dict, Optional, Union
 
 from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
+    trimAndLoadJson,
+    initialize_model,
+    print_tools_called,
 )
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -12,6 +17,9 @@ from deepeval.test_case import (
     ToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate
+from deepeval.metrics.tool_correctness.schema import ToolSelectionScore
 
 
 class ToolCorrectnessMetric(BaseMetric):
@@ -24,15 +32,21 @@ class ToolCorrectnessMetric(BaseMetric):
 
     def __init__(
         self,
+        available_tools: List[ToolCall] = None,
         threshold: float = 0.5,
        evaluation_params: List[ToolCallParams] = [],
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         include_reason: bool = True,
+        async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         should_exact_match: bool = False,
         should_consider_ordering: bool = False,
     ):
+        self.available_tools = available_tools
         self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.async_mode = async_mode
         self.include_reason = include_reason
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
@@ -45,18 +59,145 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
         self.test_case = test_case
+        self.evaluation_cost = 0 if self.using_native_model else None
+
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                self.tools_called: List[ToolCall] = test_case.tools_called
+                self.expected_tools: List[ToolCall] = test_case.expected_tools
+                tool_calling_score = self._calculate_score()
+                if self.available_tools:
+                    tool_selection_score = self._get_tool_selection_score(
+                        test_case.input,
+                        test_case.tools_called,
+                        self.available_tools,
+                    )
+                else:
+                    tool_selection_score = tool_selection_score = (
+                        ToolSelectionScore(
+                            score=1,
+                            reason="No available tools were provided to assess tool selection criteria",
+                        )
+                    )
+                score = min(tool_calling_score, tool_selection_score.score)
+                self.score = (
+                    0 if self.strict_mode and score < self.threshold else score
+                )
+                tool_calling_reason = self._generate_reason()
+                self.reason = self._construct_final_reason(
+                    tool_calling_reason, tool_selection_score.reason
+                )
+                self.success = self.score >= self.threshold
+
+                expected_tools_formatted = (
+                    "Expected Tools:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.expected_tools
+                    )
+                    + "\n]"
+                )
+                tools_called_formatted = (
+                    "Tools Called:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.tools_called
+                    )
+                    + "\n]"
+                )
+                available_tools_formatted = (
+                    (
+                        "Available Tools:\n[\n"
+                        + ",\n".join(
+                            self.indent_multiline_string(
+                                repr(tool_call), indent_level=4
+                            )
+                            for tool_call in self.available_tools
+                        )
+                        + "\n]"
+                    )
+                    if self.available_tools
+                    else "Available Tools: []"
+                )
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"{expected_tools_formatted}",
+                        f"{tools_called_formatted}",
+                        f"{available_tools_formatted}",
+                        f"Tool Selection Score: {tool_selection_score.score}",
+                        f"Tool Selection Reason: {tool_selection_score.reason}",
+                        f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             self.tools_called: List[ToolCall] = test_case.tools_called
             self.expected_tools: List[ToolCall] = test_case.expected_tools
-            self.score = self._calculate_score()
-            self.reason = self._generate_reason()
+            tool_calling_score = self._calculate_score()
+            if self.available_tools:
+                tool_selection_score = await self._a_get_tool_selection_score(
+                    test_case.input,
+                    test_case.tools_called,
+                    self.available_tools,
+                )
+            else:
+                tool_selection_score = ToolSelectionScore(
+                    score=1,
+                    reason="No available tools were provided to assess tool selection criteria",
+                )
+            score = min(tool_calling_score, tool_selection_score.score)
+            self.score = (
+                0 if self.strict_mode and score < self.threshold else score
+            )
+            tool_calling_reason = self._generate_reason()
+            self.reason = self._construct_final_reason(
+                tool_calling_reason, tool_selection_score.reason
+            )
             self.success = self.score >= self.threshold
+
             expected_tools_formatted = (
                 "Expected Tools:\n[\n"
                 + ",\n".join(
@@ -77,25 +218,37 @@ class ToolCorrectnessMetric(BaseMetric):
                 )
                 + "\n]"
             )
-            steps = [
-                f"{expected_tools_formatted}",
-                f"{tools_called_formatted}",
-            ]
-            steps.append(f"Score: {self.score}\nReason: {self.reason}")
-            self.verbose_logs = construct_verbose_logs(self, steps=steps)
-            return self.score
+            available_tools_formatted = (
+                (
+                    "Available Tools:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.available_tools
+                    )
+                    + "\n]"
+                )
+                if self.available_tools
+                else "Available Tools: []"
+            )
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"{expected_tools_formatted}",
+                    f"{tools_called_formatted}",
+                    f"{available_tools_formatted}",
+                    f"Tool Selection Score: {tool_selection_score.score}",
+                    f"Tool Selection Reason: {tool_selection_score.reason}",
+                    f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+                ],
+            )
 
-    async def a_measure(
-        self,
-        test_case: LLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-    ) -> float:
-        return self.measure(
-            test_case,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+            return self.score
 
     ##################################################
     ### Tool Correctness (Tool) ######################
@@ -146,10 +299,69 @@
         else:
             return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
 
+    def _construct_final_reason(
+        self,
+        tool_calling_reason,
+        tool_selection_reason,
+    ):
+        final_reason = "[\n"
+        final_reason += "\t Tool Calling Reason: " + tool_calling_reason + "\n"
+        final_reason += (
+            "\t Tool Selection Reason: " + tool_selection_reason + "\n"
+        )
+        final_reason += "]\n"
+        return final_reason
+
     ##################################################
     ### Score Helper Functions #######################
     ##################################################
 
+    def _get_tool_selection_score(
+        self, user_input, tools_called, available_tools
+    ):
+        tools_called_formatted = print_tools_called(tools_called)
+        available_tools_formatted = print_tools_called(available_tools)
+        prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+            user_input, tools_called_formatted, available_tools_formatted
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res = self.model.generate(prompt, schema=ToolSelectionScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    async def _a_get_tool_selection_score(
+        self, user_input, tools_called, available_tools
+    ):
+        tools_called_formatted = print_tools_called(tools_called)
+        available_tools_formatted = print_tools_called(available_tools)
+        prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+            user_input, tools_called_formatted, available_tools_formatted
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ToolSelectionScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res = await self.model.a_generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
     # Calculate score
     def _calculate_score(self):
         if self.should_exact_match:
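
The new constructor parameters above (available_tools, model, async_mode) plus the LLM-judged tool selection score change how the metric is typically invoked. The following is a minimal usage sketch, not taken from this release's documentation; the test-case values and judge model string are illustrative only.

from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.metrics import ToolCorrectnessMetric

# Illustrative test case; the field values are made up for this sketch.
test_case = LLMTestCase(
    input="What's the weather in Paris?",
    actual_output="It is 18°C and sunny in Paris.",
    tools_called=[ToolCall(name="get_weather")],
    expected_tools=[ToolCall(name="get_weather")],
)

metric = ToolCorrectnessMetric(
    # New in 3.6.8: the candidate tool set used for the LLM-judged selection score.
    available_tools=[ToolCall(name="get_weather"), ToolCall(name="web_search")],
    model="gpt-4o-mini",  # assumed judge model string; any supported model should work
    async_mode=True,      # new in 3.6.8: measure() now delegates to a_measure()
    threshold=0.5,
)
metric.measure(test_case)
print(metric.score, metric.reason)

When available_tools is omitted, the diff shows the selection score defaulting to 1, so the final score (the min of both sub-scores) reduces to the original expected-vs-called comparison.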
deepeval/metrics/tool_use/__init__.py
@@ -0,0 +1 @@
+from .tool_use import ToolUseMetric
deepeval/metrics/tool_use/schema.py
@@ -0,0 +1,19 @@
+from pydantic import BaseModel
+
+
+class UserInputAndTools(BaseModel):
+    user_messages: str
+    assistant_messages: str
+    tools_called: str
+    available_tools: str
+    tools_used: bool
+
+
+class ToolSelectionScore(BaseModel):
+    score: float
+    reason: str
+
+
+class ArgumentCorrectnessScore(BaseModel):
+    score: float
+    reason: str
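
These schemas are the structured outputs the judge model is asked to produce; the tool_correctness diff above shows the same pattern of falling back to trimAndLoadJson and rebuilding the model from a dict. A small illustrative sketch follows (the JSON string is made up for the example).

import json
from deepeval.metrics.tool_use.schema import ToolSelectionScore

# Made-up judge output, used only to illustrate parsing into the schema.
raw = '{"score": 0.75, "reason": "search_flights matched the task; get_weather was unnecessary."}'
parsed = ToolSelectionScore(**json.loads(raw))
print(parsed.score, parsed.reason)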
deepeval/metrics/tool_use/template.py
@@ -0,0 +1,220 @@
+import textwrap
+import json
+
+
+class ToolUseTemplate:
+
+    @staticmethod
+    def get_tool_selection_score(
+        user_input: str,
+        assistant_messages: str,
+        tools_called: str,
+        available_tools: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Selection Quality** of an AI agent.
+
+            OBJECTIVE
+            Evaluate whether the agent **selected the most appropriate tools** for completing the user's task, given a list of available tools.
+
+            This metric focuses **only** on which tools were chosen — **not** how they were used or whether they succeeded.
+
+            EVALUATION RULES
+
+            1. Relevance
+            - Each tool used must directly support the user's stated goal or a clear sub-task derived from it.
+            - Tools unrelated to the goal lower the score sharply.
+
+            2. Appropriateness
+            - The chosen tools must match their described purpose.
+            - If a more suitable tool existed and was ignored, score ≤ 0.5.
+
+            3. Necessity
+            - Every tool call must be justified by clear need.
+            - Redundant or speculative tool use (e.g., calling multiple tools that overlap) reduces the score.
+
+            4. Strictness
+            - When uncertain if a tool was required or correctly chosen, assume it was **not** appropriate.
+            - Only perfect alignment between the task and tool choice earns a high score.
+
+            SCORING GUIDE:
+
+            - **1.0** → Every tool used was necessary and perfectly matched to the task; no better alternative ignored.
+            - **0.75** → Tool selection was mostly correct, with only minor redundancy or a small omission.
+            - **0.5** → Mixed quality; some appropriate selections, but others questionable or missing.
+            - **0.25** → Poor selection; major mismatches or misuse of available tools.
+            - **0.0** → Tool selection irrelevant, random, or unjustified.
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with:
+
+            {{
+            "score": float between 0.0 and 1.0,
+            "reason": "1-3 factual sentences explaining which tools were appropriate or inappropriate for the task, referencing specific tool names."
+            }}
+
+            USER INPUT:
+            {user_input}
+
+            ASSISTANT MESSAGES:
+            {assistant_messages}
+
+            TOOLS CALLED:
+            {tools_called}
+
+            AVAILABLE TOOLS:
+            {available_tools}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_argument_correctness_score(
+        user_input: str,
+        assistant_messages: str,
+        tools_called: str,
+        available_tools: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Argument Quality** of an AI agent.
+
+            OBJECTIVE:
+
+            Evaluate whether the **arguments and parameters** passed to each tool were:
+            - Correctly structured and complete.
+            - Contextually appropriate for the user's goal.
+            - Compatible with each tool's intended purpose.
+
+            This metric focuses **only** on argument-level correctness and relevance — not which tools were chosen.
+
+            EVALUATION RULES
+
+            1. Relevance
+            - Each argument must align with the task and the tool's documented input fields.
+            - Unrelated, empty, or default arguments reduce the score sharply.
+
+            2. **Completeness**
+            - All required parameters must be provided.
+            - Missing or malformed arguments (e.g., wrong data types or incomplete context) lower the score.
+
+            3. **Specificity**
+            - Arguments should reflect task-specific values, not generic placeholders.
+            - Overly vague or default arguments are penalized.
+
+            4. **Justification**
+            - Each argument must make sense in context.
+            - If it doesn't clearly derive from the user's request, assume it's incorrect.
+
+            5. **Strict Bias**
+            - When uncertain whether arguments fit the tool or task, assume they were **incorrect**.
+
+            SCORING GUIDE:
+
+            - **1.0** → All arguments are accurate, specific, and fully aligned with both the task and tool requirements.
+            - **0.75** → Mostly correct; minor omissions or small mismatches.
+            - **0.5** → Partial correctness; some valid parameters, but key ones missing or off-target.
+            - **0.25** → Poor argument quality; several invalid or irrelevant fields.
+            - **0.0** → Arguments nonsensical, generic, or unrelated to task/tool intent.
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with:
+            {{
+            "score": float between 0.0 and 1.0,
+            "reason": "1-3 sentences explaining argument alignment or issues, referencing specific parameter names or values when possible."
+            }}
+
+            ---
+
+            USER INPUT:
+            {user_input}
+
+            ASSISTANT MESSAGES:
+            {assistant_messages}
+
+            TOOLS CALLED (with arguments):
+            {tools_called}
+
+            AVAILABLE TOOLS:
+            {available_tools}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_tool_selection_final_reason(
+        all_scores_and_reasons: str, final_score: float, threshold: float
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator summarizing the outcome of a **Tool Selection** evaluation.
+
+            You are given:
+            - A list of **tool selection sub-scores and reasons**, each describing how appropriately the agent chose tools for its task.
+            - The **final aggregated score** across all sub-evaluations.
+            - A **threshold** representing the minimum passing score.
+
+            Your task is to write a **single concise explanation (1-3 sentences)** that captures:
+            - Why the agent **passed or failed** based on tool choice quality.
+            - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
+            - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
+
+            RULES:
+            - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
+            - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
+            - Avoid vague or subjective language such as “pretty good” or “reasonable”.
+            - Do **not** reference argument-level details; this summary is only for tool choice quality.
+            - The result must read as a self-contained, factual justification.
+
+            FORMAT:
+            Return only a single plain-text string. Do **not** include JSON or other formatting.
+
+            All Tool Selection Sub-Scores and Reasons:
+            {all_scores_and_reasons}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
+
+    @staticmethod
+    def get_tool_argument_final_reason(
+        all_scores_and_reasons: str, final_score: float, threshold: float
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator summarizing the outcome of a **Tool Argument Quality** evaluation.
+
+            You are given:
+            - A list of **argument-level sub-scores and reasons**, each evaluating whether the arguments passed to tools were accurate, complete, and contextually appropriate.
+            - The **final aggregated score** across all argument evaluations.
+            - A **threshold** representing the minimum passing score.
+
+            Your task is to write a **single concise explanation (1-3 sentences)** that clearly states:
+            - Why the agent **passed or failed** in its use of tool arguments.
+            - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
+            - Whether the agent met or fell short of the threshold and why.
+
+            RULES:
+            - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
+            - Reference specific argument-level problems or successes where helpful.
+            - Keep language objective and factual; avoid speculation or vague phrasing.
+            - The summary must stand alone as a clear explanation of the final result.
+
+            FORMAT:
+            Return only a single plain-text string. Do **not** include JSON or any extra formatting.
+
+            All Tool Argument Sub-Scores and Reasons:
+            {all_scores_and_reasons}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
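
Each ToolUseTemplate method only renders a prompt string for the judge model. A quick sketch of calling the first template; the argument strings are placeholders, since in practice they are presumably pre-formatted from the test case by ToolUseMetric.

from deepeval.metrics.tool_use.template import ToolUseTemplate

# Placeholder inputs for illustration only.
prompt = ToolUseTemplate.get_tool_selection_score(
    user_input="Book me a flight to Tokyo next Friday.",
    assistant_messages="I searched for flights and booked the cheapest option.",
    tools_called='[{"name": "search_flights"}, {"name": "book_flight"}]',
    available_tools='[{"name": "search_flights"}, {"name": "book_flight"}, {"name": "get_weather"}]',
)
print(prompt)  # send to the judge LLM, then parse the JSON reply into ToolSelectionScore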