deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- deepeval/__init__.py +8 -7
- deepeval/_version.py +1 -1
- deepeval/cli/dotenv_handler.py +71 -0
- deepeval/cli/main.py +1021 -280
- deepeval/cli/utils.py +116 -2
- deepeval/confident/api.py +29 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/key_handler.py +64 -2
- deepeval/metrics/__init__.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/conversational_dag/__init__.py +7 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
- deepeval/metrics/conversational_dag/nodes.py +931 -0
- deepeval/metrics/conversational_dag/templates.py +117 -0
- deepeval/metrics/dag/dag.py +13 -4
- deepeval/metrics/dag/graph.py +47 -15
- deepeval/metrics/dag/utils.py +103 -38
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/synthesizer/chunking/doc_chunker.py +87 -51
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
- deepeval/env.py +0 -35
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_dag/templates.py
ADDED
@@ -0,0 +1,117 @@
+from typing import List
+from textwrap import dedent
+
+
+class ConversationalVerdictNodeTemplate:
+    @staticmethod
+    def generate_reason(verbose_steps: List[str], score: float, name: str):
+        return dedent(
+            f"""You are given a metric name, its score, and a traversal path through a conversational evaluation DAG (Directed Acyclic Graph).
+This DAG reflects step-by-step reasoning over a dialogue to arrive at the final verdict.
+
+Each step in the DAG represents a judgment based on parts of the conversation — including roles and the contents they spoke of.
+
+Your task is to explain **why the score was assigned**, using the traversal steps to justify the reasoning.
+
+Metric Name:
+{name}
+
+Score:
+{score}
+
+DAG Traversal:
+{verbose_steps}
+
+**
+IMPORTANT: Only return JSON with a 'reason' key.
+Example:
+{{
+    "reason": "The score is {score} because the assistant repeatedly failed to clarify the user's ambiguous statements, as shown in the DAG traversal path."
+}}
+**
+JSON:
+"""
+        )
+
+
+class ConversationalTaskNodeTemplate:
+    @staticmethod
+    def generate_task_output(instructions: str, text: str):
+        return dedent(
+            f"""You are given a set of task instructions and a full conversation between a user and an assistant.
+
+Instructions:
+{instructions}
+
+{text}
+
+===END OF INPUT===
+
+**
+IMPORTANT: Only return a JSON with the 'output' key containing the result of applying the instructions to the conversation.
+Example:
+{{
+    "output": "..."
+}}
+**
+JSON:
+"""
+        )
+
+
+class ConversationalBinaryJudgementTemplate:
+    @staticmethod
+    def generate_binary_verdict(criteria: str, text: str):
+        return dedent(
+            f"""{criteria}
+
+Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.
+
+Full Conversation:
+{text}
+
+**
+IMPORTANT: Only return JSON with two keys:
+- 'verdict': true or false
+- 'reason': justification based on specific parts of the conversation
+
+Example:
+{{
+    "verdict": true,
+    "reason": "The assistant provided a clear and direct answer in response to every user query."
+}}
+**
+JSON:
+"""
+        )
+
+
+class ConversationalNonBinaryJudgementTemplate:
+    @staticmethod
+    def generate_non_binary_verdict(
+        criteria: str, text: str, options: List[str]
+    ):
+        return dedent(
+            f"""{criteria}
+
+You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.
+
+Options: {options}
+
+Full Conversation:
+{text}
+
+**
+IMPORTANT: Only return JSON with two keys:
+- 'verdict': one of the listed options
+- 'reason': explanation referencing specific conversation points
+
+Example:
+{{
+    "verdict": "{options[1]}",
+    "reason": "The assistant partially addressed the user’s issue but missed clarifying their follow-up question."
+}}
+**
+JSON:
+"""
+        )
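
Note: the four template classes above only build judge prompts; nothing here calls an LLM. A minimal sketch of how they might be exercised directly. The transcript string and criteria are invented for illustration, not deepeval's own formatting:

# Illustrative only: renders the new conversational judge prompts as strings.
from deepeval.metrics.conversational_dag.templates import (
    ConversationalBinaryJudgementTemplate,
    ConversationalNonBinaryJudgementTemplate,
)

transcript = (
    "user: My order never arrived.\n"
    "assistant: Sorry about that - let me pull up the tracking details."
)

binary_prompt = ConversationalBinaryJudgementTemplate.generate_binary_verdict(
    criteria="Determine whether the assistant acknowledged the user's problem.",
    text=transcript,
)

graded_prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
    criteria="Judge how well the assistant handled the complaint.",
    text=transcript,
    options=["poorly", "partially", "fully"],
)

# Both prompts instruct the judge LLM to answer as JSON:
# {"verdict": ..., "reason": ...}. Note that generate_non_binary_verdict
# indexes options[1] in its example, so it expects at least two options.
print(binary_prompt)
print(graded_prompt)
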
deepeval/metrics/dag/dag.py
CHANGED
@@ -13,8 +13,8 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.g_eval.schema import *
 from deepeval.metrics.dag.graph import DeepAcyclicGraph
-from deepeval.metrics.dag.utils import copy_graph
 from deepeval.metrics.dag.utils import (
+    copy_graph,
     is_valid_dag_from_roots,
     extract_required_params,
 )
@@ -34,7 +34,12 @@ class DAGMetric(BaseMetric):
         verbose_mode: bool = False,
         _include_dag_suffix: bool = True,
     ):
-        if is_valid_dag_from_roots(root_nodes=dag.root_nodes) == False:
+        if (
+            is_valid_dag_from_roots(
+                root_nodes=dag.root_nodes, multiturn=dag.multiturn
+            )
+            == False
+        ):
             raise ValueError("Cycle detected in DAG graph.")
 
         self._verbose_steps: List[str] = []
@@ -56,7 +61,9 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
     ) -> float:
         check_llm_test_case_params(
-            test_case,
+            test_case,
+            extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            self,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -91,7 +98,9 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
     ) -> float:
         check_llm_test_case_params(
-            test_case,
+            test_case,
+            extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            self,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
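
Note: with `multiturn` now threaded through validation, `DAGMetric` rejects cyclic graphs at construction time. A sketch of the single-turn path, assuming the node constructor signatures from deepeval's public DAG API (`TaskNode`, `BinaryJudgementNode`, `VerdictNode`):

# Sketch, assuming deepeval's documented single-turn DAG node constructors.
from deepeval.metrics import DAGMetric
from deepeval.metrics.dag import (
    DeepAcyclicGraph,
    TaskNode,
    BinaryJudgementNode,
    VerdictNode,
)
from deepeval.test_case import LLMTestCaseParams

judgement = BinaryJudgementNode(
    criteria="Does the extracted summary mention a refund?",
    children=[
        VerdictNode(verdict=True, score=10),
        VerdictNode(verdict=False, score=0),
    ],
)
root = TaskNode(
    instructions="Summarize the actual output in one sentence.",
    output_label="summary",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    children=[judgement],
)

dag = DeepAcyclicGraph(root_nodes=[root])  # multiturn inferred as False
# __init__ now calls is_valid_dag_from_roots(root_nodes=..., multiturn=...)
# and raises ValueError("Cycle detected in DAG graph.") on a back edge.
metric = DAGMetric(name="Refund summary", dag=dag)
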
deepeval/metrics/dag/graph.py
CHANGED
@@ -1,39 +1,71 @@
 import asyncio
-from typing import List
+from typing import List, Union
 
 from deepeval.metrics.dag import (
     BaseNode,
     NonBinaryJudgementNode,
     BinaryJudgementNode,
 )
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics import BaseMetric
+from deepeval.metrics.conversational_dag import (
+    ConversationalBaseNode,
+    ConversationalBinaryJudgementNode,
+    ConversationalNonBinaryJudgementNode,
+)
+from deepeval.test_case import LLMTestCase, ConversationalTestCase
+from deepeval.metrics import BaseMetric, BaseConversationalMetric
+
+
+def validate_root_nodes(
+    root_nodes: Union[List[BaseNode], List[ConversationalBaseNode]],
+):
+    # all root nodes must be of the same type: you cannot mix multi- and single-turn nodes
+    if not all(isinstance(node, type(root_nodes[0])) for node in root_nodes):
+        raise ValueError("You cannot mix multi and single turn nodes")
+    return True
 
 
 class DeepAcyclicGraph:
+    multiturn: bool
+
     def __init__(
         self,
-        root_nodes: List[BaseNode],
+        root_nodes: Union[List[BaseNode], List[ConversationalBaseNode]],
     ):
-        for root_node in root_nodes:
-            if isinstance(root_node, NonBinaryJudgementNode) or isinstance(
-                root_node, BinaryJudgementNode
-            ):
-                if len(root_nodes) > 1:
-                    raise ValueError(
-                        "You cannot provide more than one root node when using 'BinaryJudgementNode' or 'NonBinaryJudgementNode' in root_nodes."
-                    )
+        validate_root_nodes(root_nodes)
+        self.multiturn = isinstance(root_nodes[0], ConversationalBaseNode)
 
+        if not self.multiturn:
+            for root_node in root_nodes:
+                if isinstance(root_node, NonBinaryJudgementNode) or isinstance(
+                    root_node, BinaryJudgementNode
+                ):
+                    if len(root_nodes) > 1:
+                        raise ValueError(
+                            "You cannot provide more than one root node when using 'BinaryJudgementNode' or 'NonBinaryJudgementNode' in root_nodes."
+                        )
+        else:
+            for root_node in root_nodes:
+                if isinstance(
+                    root_node, ConversationalNonBinaryJudgementNode
+                ) or isinstance(root_node, ConversationalBinaryJudgementNode):
+                    if len(root_nodes) > 1:
+                        raise ValueError(
+                            "You cannot provide more than one root node when using 'ConversationalBinaryJudgementNode' or 'ConversationalNonBinaryJudgementNode' in root_nodes."
+                        )
         self.root_nodes = root_nodes
 
-    def _execute(self, metric: BaseMetric, test_case: LLMTestCase) -> None:
+    def _execute(
+        self,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+    ) -> None:
         for root_node in self.root_nodes:
            root_node._execute(metric=metric, test_case=test_case, depth=0)
 
     async def _a_execute(
         self,
-        metric: BaseMetric,
-        test_case: LLMTestCase,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Union[LLMTestCase, ConversationalTestCase],
     ) -> None:
         await asyncio.gather(
             *(
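
Note: the new `validate_root_nodes` check is worth a close read. It compares every root against the concrete type of the first root, so as written it rejects not just single/multiturn mixes but any later root that is not an instance of the first root's class. A distilled, self-contained sketch of that behavior (stand-in classes, not deepeval's):

# Stand-in classes to isolate the type check from the diff above.
class BaseNode: ...
class ConversationalBaseNode: ...

def validate_root_nodes(root_nodes):
    # Same logic as the diff: every root must be an instance of the
    # first root's concrete type.
    if not all(isinstance(node, type(root_nodes[0])) for node in root_nodes):
        raise ValueError("You cannot mix multi and single turn nodes")
    return True

print(validate_root_nodes([BaseNode(), BaseNode()]))  # True
try:
    validate_root_nodes([BaseNode(), ConversationalBaseNode()])
except ValueError as err:
    print(err)  # You cannot mix multi and single turn nodes

Since `DeepAcyclicGraph` then sets `self.multiturn = isinstance(root_nodes[0], ConversationalBaseNode)`, the first root alone decides which branch of every downstream helper runs.
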
deepeval/metrics/dag/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Optional
+from typing import Set, Dict, Optional, Union
 import inspect
 
 from deepeval.metrics.dag import (
@@ -9,18 +9,33 @@ from deepeval.metrics.dag import (
     TaskNode,
     DeepAcyclicGraph,
 )
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.metrics.conversational_dag import (
+    ConversationalBaseNode,
+    ConversationalBinaryJudgementNode,
+    ConversationalNonBinaryJudgementNode,
+    ConversationalTaskNode,
+    ConversationalVerdictNode,
+)
+from deepeval.test_case import LLMTestCaseParams, TurnParams
 
 
-def is_valid_dag_from_roots(root_nodes: list[BaseNode]) -> bool:
+def is_valid_dag_from_roots(
+    root_nodes: Union[list[BaseNode], list[ConversationalBaseNode]],
+    multiturn: bool,
+) -> bool:
     visited = set()
     for root in root_nodes:
-        if not is_valid_dag(root, visited, set()):
+        if not is_valid_dag(root, multiturn, visited, set()):
             return False
     return True
 
 
-def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
+def is_valid_dag(
+    node: Union[BaseNode, ConversationalBaseNode],
+    multiturn: bool,
+    visited=None,
+    stack=None,
+) -> bool:
     if visited is None:
         visited = set()
     if stack is None:
@@ -33,14 +48,24 @@ def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
 
     visited.add(node)
     stack.add(node)
-    if (
-        isinstance(node, TaskNode)
-        or isinstance(node, BinaryJudgementNode)
-        or isinstance(node, NonBinaryJudgementNode)
-    ):
-        for child in node.children:
-            if not is_valid_dag(child, visited, stack):
-                return False
+    if not multiturn:
+        if (
+            isinstance(node, TaskNode)
+            or isinstance(node, BinaryJudgementNode)
+            or isinstance(node, NonBinaryJudgementNode)
+        ):
+            for child in node.children:
+                if not is_valid_dag(child, multiturn, visited, stack):
+                    return False
+    else:
+        if (
+            isinstance(node, ConversationalTaskNode)
+            or isinstance(node, ConversationalBinaryJudgementNode)
+            or isinstance(node, ConversationalNonBinaryJudgementNode)
+        ):
+            for child in node.children:
+                if not is_valid_dag(child, multiturn, visited, stack):
+                    return False
 
     stack.remove(node)
     return True
@@ -48,29 +73,51 @@ def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
 
 def extract_required_params(
     nodes: list[BaseNode],
-    required_params: Optional[Set[LLMTestCaseParams]] = None,
-) -> Set[LLMTestCaseParams]:
+    multiturn: bool,
+    required_params: Optional[
+        Union[Set[LLMTestCaseParams], Set[TurnParams]]
+    ] = None,
+) -> Union[Set[LLMTestCaseParams], Set[TurnParams]]:
     if required_params is None:
         required_params = set()
 
     for node in nodes:
-        if (
-            isinstance(node, TaskNode)
-            or isinstance(node, BinaryJudgementNode)
-            or isinstance(node, NonBinaryJudgementNode)
-        ):
-            if node.evaluation_params is not None:
-                required_params.update(node.evaluation_params)
-            extract_required_params(node.children, required_params)
+        if not multiturn:
+            if (
+                isinstance(node, TaskNode)
+                or isinstance(node, BinaryJudgementNode)
+                or isinstance(node, NonBinaryJudgementNode)
+            ):
+                if node.evaluation_params is not None:
+                    required_params.update(node.evaluation_params)
+                extract_required_params(
+                    node.children, multiturn, required_params
+                )
+        else:
+            if (
+                isinstance(node, ConversationalTaskNode)
+                or isinstance(node, ConversationalBinaryJudgementNode)
+                or isinstance(node, ConversationalNonBinaryJudgementNode)
+            ):
+                if node.evaluation_params is not None:
+                    required_params.update(node.evaluation_params)
+                extract_required_params(
+                    node.children, multiturn, required_params
+                )
 
     return required_params
 
 
 def copy_graph(original_dag: DeepAcyclicGraph) -> DeepAcyclicGraph:
     # This mapping avoids re-copying nodes that appear in multiple places.
-    visited: Dict[BaseNode, BaseNode] = {}
-
-    def copy_node(node: BaseNode) -> BaseNode:
+    visited: Union[
+        Dict[BaseNode, BaseNode],
+        Dict[ConversationalBaseNode, ConversationalBaseNode],
+    ] = {}
+
+    def copy_node(
+        node: Union[BaseNode, ConversationalBaseNode],
+    ) -> Union[BaseNode, ConversationalBaseNode]:
         if node in visited:
             return visited[node]
 
@@ -98,22 +145,40 @@ def copy_graph(original_dag: DeepAcyclicGraph) -> DeepAcyclicGraph:
             "_depth",
         ]
     }
-    if (
-        isinstance(node, TaskNode)
-        or isinstance(node, BinaryJudgementNode)
-        or isinstance(node, NonBinaryJudgementNode)
-    ):
-        copied_node = node_class(
-            **valid_args,
-            children=[copy_node(child) for child in node.children]
-        )
+    if not original_dag.multiturn:
+        if (
+            isinstance(node, TaskNode)
+            or isinstance(node, BinaryJudgementNode)
+            or isinstance(node, NonBinaryJudgementNode)
+        ):
+            copied_node = node_class(
+                **valid_args,
+                children=[copy_node(child) for child in node.children]
+            )
+        else:
+            if isinstance(node, VerdictNode) and node.child:
+                copied_node = node_class(
+                    **valid_args, child=copy_node(node.child)
+                )
+            else:
+                copied_node = node_class(**valid_args)
     else:
-        if isinstance(node, VerdictNode) and node.child:
+        if (
+            isinstance(node, ConversationalTaskNode)
+            or isinstance(node, ConversationalBinaryJudgementNode)
+            or isinstance(node, ConversationalNonBinaryJudgementNode)
+        ):
             copied_node = node_class(
-                **valid_args, child=copy_node(node.child)
+                **valid_args,
+                children=[copy_node(child) for child in node.children]
             )
         else:
-            copied_node = node_class(**valid_args)
+            if isinstance(node, ConversationalVerdictNode) and node.child:
+                copied_node = node_class(
+                    **valid_args, child=copy_node(node.child)
+                )
+            else:
+                copied_node = node_class(**valid_args)
 
     visited[node] = copied_node
     return copied_node
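
Note: the `visited`/`stack` pair in `is_valid_dag` is a standard DFS cycle check: `stack` holds the current path, so meeting a node already on the path means a back edge. A distilled sketch under that reading. The early-return guards are assumed (the diff only shows the `add`/`remove` bookkeeping), and the real function recurses only through task and judgement nodes:

# Distilled DFS cycle check; Node is a stand-in, not a deepeval class.
class Node:
    def __init__(self):
        self.children = []

def is_valid_dag(node, visited=None, stack=None) -> bool:
    if visited is None:
        visited = set()
    if stack is None:
        stack = set()
    if node in stack:
        return False  # back edge: the "cycle" the DAGMetric error reports
    if node in visited:
        return True   # already fully explored via another path
    visited.add(node)
    stack.add(node)
    for child in node.children:
        if not is_valid_dag(child, visited, stack):
            return False
    stack.remove(node)
    return True

a, b = Node(), Node()
a.children.append(b)
b.children.append(a)        # deliberately introduce a cycle
print(is_valid_dag(a))      # False
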
deepeval/metrics/faithfulness/template.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Optional, List
 class FaithfulnessTemplate:
     @staticmethod
     def generate_claims(actual_output: str):
-        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided
+        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
 These truths, MUST BE COHERENT, and CANNOT be taken out of context.
 
 Example:
@@ -24,9 +24,10 @@ Example JSON:
 IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
 Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
 You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+You should be aware that it is an AI that is outputting these claims.
 **
 
-Text:
+AI Output:
 {actual_output}
 
 JSON:
@@ -72,7 +73,7 @@ JSON:
     def generate_verdicts(claims: List[str], retrieval_context: str):
         return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
 **
@@ -84,28 +85,30 @@ Example:
 {{
     "verdicts": [
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Barack Obama is although incorrect, it is not directly addressed in the retrieval context, and so poses no contradiction."
         }},
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
         }},
         {{
            "verdict": "yes"
         }},
         {{
            "verdict": "no",
-            "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
+            "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead. This contradicts the retrieval context."
         }},
         {{
            "verdict": "no",
-            "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead."
+            "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead. This contradicts the retrieval context."
        }},
     ]
 }}
 ===== END OF EXAMPLE ======
 
 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'
+You DON'T have to provide a reason if the answer is 'yes'.
 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
 Claims that are not backed up by the retrieval context or are not mentioned in it MUST be answered 'idk'.
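
Note: the template change means 'idk' verdicts now carry reasons too, but scoring is unchanged. In deepeval's documented faithfulness metric only 'no' (a direct contradiction) counts against the output, while 'yes' and 'idk' do not. A sketch of that roll-up, stated as an assumption about the metric rather than code from this diff:

from typing import Dict, List

def faithfulness_score(verdicts: List[Dict[str, str]]) -> float:
    # Assumed aggregation: fraction of claims that are NOT contradicted.
    if not verdicts:
        return 1.0
    contradicted = sum(1 for v in verdicts if v["verdict"].strip().lower() == "no")
    return (len(verdicts) - contradicted) / len(verdicts)

verdicts = [
    {"verdict": "idk", "reason": "Claim is not addressed by the retrieval context."},
    {"verdict": "yes"},
    {"verdict": "no", "reason": "Directly contradicts the retrieval context."},
]
print(round(faithfulness_score(verdicts), 3))  # 0.667
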
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py
CHANGED
@@ -39,7 +39,7 @@ class MultimodalAnswerRelevancyTemplate:
 Please generate a list of JSON with two keys: `verdict` and `reason`.
 The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement or image is relevant to addressing the original input, 'no' if the statement or image is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
 The 'reason' is the reason for the verdict.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided statements are statements and images generated in the actual output.
 
 **
@@ -54,13 +54,15 @@ class MultimodalAnswerRelevancyTemplate:
         "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake."
     }},
     {{
-        "verdict": "idk"
+        "verdict": "idk",
+        "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant."
     }},
     {{
-        "verdict": "idk"
+        "verdict": "idk",
+        "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant."
     }},
     {{
-        "verdict": "yes"
+        "verdict": "yes",
     }}
 ]
 }}
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py
CHANGED
@@ -95,7 +95,7 @@ class MultimodalFaithfulnessTemplate:
         return textwrap.dedent(
             f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
 **
@@ -107,10 +107,12 @@ class MultimodalFaithfulnessTemplate:
 {{
     "verdicts": [
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction."
         }},
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
         }},
         {{
             "verdict": "yes"
@@ -128,7 +130,7 @@ class MultimodalFaithfulnessTemplate:
 ===== END OF EXAMPLE ======
 
 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'
+You DON'T have to provide a reason if the answer is 'yes'.
 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
 Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
deepeval/metrics/tool_correctness/tool_correctness.py
CHANGED
@@ -223,9 +223,13 @@ class ToolCorrectnessMetric(BaseMetric):
             total_score += best_score
             matched_called_tools.add(best_called_tool)
         return (
-            total_score / len(self.expected_tools)
-            if self.expected_tools
-            else
+            1.0
+            if not self.expected_tools and not self.tools_called
+            else (
+                0.0
+                if not self.expected_tools
+                else total_score / len(self.expected_tools)
+            )
         )
 
         # Consider ordering score
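
Note: the rewritten return expression fixes two edge cases: no expected tools and no called tools is now a perfect score, while calling tools when none were expected scores zero. Distilled into a standalone function (names are illustrative; `total_score` is the summed per-tool match score from the loop above):

from typing import List

def tool_correctness(expected_tools: List[str],
                     tools_called: List[str],
                     total_score: float) -> float:
    # Mirrors the nested conditional expression in the diff.
    return (
        1.0
        if not expected_tools and not tools_called   # nothing expected, nothing called
        else (
            0.0
            if not expected_tools                    # unexpected tool calls
            else total_score / len(expected_tools)   # average over expected tools
        )
    )

print(tool_correctness([], [], 0.0))                                 # 1.0
print(tool_correctness([], ["web_search"], 0.0))                     # 0.0
print(tool_correctness(["web_search", "sql"], ["web_search"], 1.0))  # 0.5
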
deepeval/models/llms/amazon_bedrock_model.py
CHANGED
@@ -115,13 +115,34 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     ###############################################
 
     def get_converse_request_body(self, prompt: str) -> dict:
+        # Inline parameter translation with defaults
+        param_mapping = {
+            "max_tokens": "maxTokens",
+            "top_p": "topP",
+            "top_k": "topK",
+            "stop_sequences": "stopSequences",
+        }
+
+        # Start with defaults for required parameters
+        translated_kwargs = {
+            "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
+            "topP": self.generation_kwargs.get("top_p", 0),
+        }
+
+        # Add any other parameters from generation_kwargs
+        for key, value in self.generation_kwargs.items():
+            if key not in [
+                "max_tokens",
+                "top_p",
+            ]:  # Skip already handled defaults
+                aws_key = param_mapping.get(key, key)
+                translated_kwargs[aws_key] = value
+
         return {
             "messages": [{"role": "user", "content": [{"text": prompt}]}],
             "inferenceConfig": {
                 "temperature": self.temperature,
-                "topP": self.generation_kwargs.get("top_p", 0),
-                "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-                **self.generation_kwargs,
+                **translated_kwargs,
             },
         }
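
Note: previously the raw `generation_kwargs` were spread straight into `inferenceConfig`, so snake_case keys like `top_k` reached the Bedrock Converse API unchanged. The new code maps them to the camelCase names Converse expects. A standalone sketch of the same translation (the function name is illustrative):

def translate_generation_kwargs(generation_kwargs: dict) -> dict:
    # snake_case -> camelCase names used by the Bedrock Converse API.
    param_mapping = {
        "max_tokens": "maxTokens",
        "top_p": "topP",
        "top_k": "topK",
        "stop_sequences": "stopSequences",
    }
    # Defaults taken from the diff: maxTokens=1000, topP=0.
    translated = {
        "maxTokens": generation_kwargs.get("max_tokens", 1000),
        "topP": generation_kwargs.get("top_p", 0),
    }
    for key, value in generation_kwargs.items():
        if key not in ("max_tokens", "top_p"):  # defaults already applied
            translated[param_mapping.get(key, key)] = value
    return translated

print(translate_generation_kwargs(
    {"max_tokens": 512, "top_k": 40, "stop_sequences": ["###"]}
))
# {'maxTokens': 512, 'topP': 0, 'topK': 40, 'stopSequences': ['###']}
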