deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deepeval/__init__.py +8 -7
  2. deepeval/_version.py +1 -1
  3. deepeval/cli/dotenv_handler.py +71 -0
  4. deepeval/cli/main.py +1021 -280
  5. deepeval/cli/utils.py +116 -2
  6. deepeval/confident/api.py +29 -14
  7. deepeval/config/__init__.py +0 -0
  8. deepeval/config/settings.py +565 -0
  9. deepeval/config/settings_manager.py +133 -0
  10. deepeval/config/utils.py +86 -0
  11. deepeval/dataset/__init__.py +1 -0
  12. deepeval/dataset/dataset.py +70 -10
  13. deepeval/dataset/test_run_tracer.py +82 -0
  14. deepeval/dataset/utils.py +23 -0
  15. deepeval/key_handler.py +64 -2
  16. deepeval/metrics/__init__.py +4 -1
  17. deepeval/metrics/answer_relevancy/template.py +7 -2
  18. deepeval/metrics/conversational_dag/__init__.py +7 -0
  19. deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
  20. deepeval/metrics/conversational_dag/nodes.py +931 -0
  21. deepeval/metrics/conversational_dag/templates.py +117 -0
  22. deepeval/metrics/dag/dag.py +13 -4
  23. deepeval/metrics/dag/graph.py +47 -15
  24. deepeval/metrics/dag/utils.py +103 -38
  25. deepeval/metrics/faithfulness/template.py +11 -8
  26. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
  27. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
  28. deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
  29. deepeval/models/llms/amazon_bedrock_model.py +24 -3
  30. deepeval/models/llms/openai_model.py +37 -41
  31. deepeval/models/retry_policy.py +280 -0
  32. deepeval/openai_agents/agent.py +4 -2
  33. deepeval/synthesizer/chunking/doc_chunker.py +87 -51
  34. deepeval/test_run/api.py +1 -0
  35. deepeval/tracing/otel/exporter.py +20 -8
  36. deepeval/tracing/otel/utils.py +57 -0
  37. deepeval/tracing/tracing.py +37 -16
  38. deepeval/tracing/utils.py +98 -1
  39. deepeval/utils.py +111 -70
  40. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
  41. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
  42. deepeval/env.py +0 -35
  43. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
  44. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
  45. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_dag/templates.py (new file)
@@ -0,0 +1,117 @@
+ from typing import List
+ from textwrap import dedent
+
+
+ class ConversationalVerdictNodeTemplate:
+     @staticmethod
+     def generate_reason(verbose_steps: List[str], score: float, name: str):
+         return dedent(
+             f"""You are given a metric name, its score, and a traversal path through a conversational evaluation DAG (Directed Acyclic Graph).
+ This DAG reflects step-by-step reasoning over a dialogue to arrive at the final verdict.
+
+ Each step in the DAG represents a judgment based on parts of the conversation — including roles and the contents they spoke of.
+
+ Your task is to explain **why the score was assigned**, using the traversal steps to justify the reasoning.
+
+ Metric Name:
+ {name}
+
+ Score:
+ {score}
+
+ DAG Traversal:
+ {verbose_steps}
+
+ **
+ IMPORTANT: Only return JSON with a 'reason' key.
+ Example:
+ {{
+ "reason": "The score is {score} because the assistant repeatedly failed to clarify the user's ambiguous statements, as shown in the DAG traversal path."
+ }}
+ **
+ JSON:
+ """
+         )
+
+
+ class ConversationalTaskNodeTemplate:
+     @staticmethod
+     def generate_task_output(instructions: str, text: str):
+         return dedent(
+             f"""You are given a set of task instructions and a full conversation between a user and an assistant.
+
+ Instructions:
+ {instructions}
+
+ {text}
+
+ ===END OF INPUT===
+
+ **
+ IMPORTANT: Only return a JSON with the 'output' key containing the result of applying the instructions to the conversation.
+ Example:
+ {{
+ "output": "..."
+ }}
+ **
+ JSON:
+ """
+         )
+
+
+ class ConversationalBinaryJudgementTemplate:
+     @staticmethod
+     def generate_binary_verdict(criteria: str, text: str):
+         return dedent(
+             f"""{criteria}
+
+ Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.
+
+ Full Conversation:
+ {text}
+
+ **
+ IMPORTANT: Only return JSON with two keys:
+ - 'verdict': true or false
+ - 'reason': justification based on specific parts of the conversation
+
+ Example:
+ {{
+ "verdict": true,
+ "reason": "The assistant provided a clear and direct answer in response to every user query."
+ }}
+ **
+ JSON:
+ """
+         )
+
+
+ class ConversationalNonBinaryJudgementTemplate:
+     @staticmethod
+     def generate_non_binary_verdict(
+         criteria: str, text: str, options: List[str]
+     ):
+         return dedent(
+             f"""{criteria}
+
+ You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.
+
+ Options: {options}
+
+ Full Conversation:
+ {text}
+
+ **
+ IMPORTANT: Only return JSON with two keys:
+ - 'verdict': one of the listed options
+ - 'reason': explanation referencing specific conversation points
+
+ Example:
+ {{
+ "verdict": "{options[1]}",
+ "reason": "The assistant partially addressed the user’s issue but missed clarifying their follow-up question."
+ }}
+ **
+ JSON:
+ """
+         )
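These four classes mirror the single-turn DAG templates for whole conversations. A rough usage sketch of one prompt builder follows; the import path matches the new file's location in the list above, while the criteria, text, and options values are invented for illustration:

from deepeval.metrics.conversational_dag.templates import (
    ConversationalNonBinaryJudgementTemplate,
)

# Build a judge prompt for a three-way verdict over a short dialogue.
prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
    criteria="Decide how well the assistant resolved the user's problem.",
    text="user: My order never arrived.\nassistant: I have issued a refund.",
    options=["resolved", "partially resolved", "unresolved"],
)
print(prompt)  # dedented prompt string, ready to send to the judge model

Note that generate_non_binary_verdict interpolates options[1] into its example JSON, so it implicitly assumes at least two options are supplied.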
deepeval/metrics/dag/dag.py
@@ -13,8 +13,8 @@ from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.g_eval.schema import *
  from deepeval.metrics.dag.graph import DeepAcyclicGraph
- from deepeval.metrics.dag.utils import copy_graph
  from deepeval.metrics.dag.utils import (
+     copy_graph,
      is_valid_dag_from_roots,
      extract_required_params,
  )
@@ -34,7 +34,12 @@ class DAGMetric(BaseMetric):
          verbose_mode: bool = False,
          _include_dag_suffix: bool = True,
      ):
-         if is_valid_dag_from_roots(dag.root_nodes) == False:
+         if (
+             is_valid_dag_from_roots(
+                 root_nodes=dag.root_nodes, multiturn=dag.multiturn
+             )
+             == False
+         ):
              raise ValueError("Cycle detected in DAG graph.")

          self._verbose_steps: List[str] = []
@@ -56,7 +61,9 @@
          _in_component: bool = False,
      ) -> float:
          check_llm_test_case_params(
-             test_case, extract_required_params(self.dag.root_nodes), self
+             test_case,
+             extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+             self,
          )

          self.evaluation_cost = 0 if self.using_native_model else None
@@ -91,7 +98,9 @@
          _in_component: bool = False,
      ) -> float:
          check_llm_test_case_params(
-             test_case, extract_required_params(self.dag.root_nodes), self
+             test_case,
+             extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+             self,
          )

          self.evaluation_cost = 0 if self.using_native_model else None
deepeval/metrics/dag/graph.py
@@ -1,39 +1,71 @@
  import asyncio
- from typing import List
+ from typing import List, Union

  from deepeval.metrics.dag import (
      BaseNode,
      NonBinaryJudgementNode,
      BinaryJudgementNode,
  )
- from deepeval.test_case import LLMTestCase
- from deepeval.metrics import BaseMetric
+ from deepeval.metrics.conversational_dag import (
+     ConversationalBaseNode,
+     ConversationalBinaryJudgementNode,
+     ConversationalNonBinaryJudgementNode,
+ )
+ from deepeval.test_case import LLMTestCase, ConversationalTestCase
+ from deepeval.metrics import BaseMetric, BaseConversationalMetric
+
+
+ def validate_root_nodes(
+     root_nodes: Union[List[BaseNode], List[ConversationalBaseNode]],
+ ):
+     # see if all root nodes are of the same type, more verbose error message, actualy we should say we cannot mix multi and single turn nodes
+     if not all(isinstance(node, type(root_nodes[0])) for node in root_nodes):
+         raise ValueError("You cannot mix multi and single turn nodes")
+     return True


  class DeepAcyclicGraph:
+     multiturn: bool
+
      def __init__(
          self,
-         root_nodes: List[BaseNode],
+         root_nodes: Union[List[BaseNode], List[ConversationalBaseNode]],
      ):
-         for root_node in root_nodes:
-             if isinstance(root_node, NonBinaryJudgementNode) or isinstance(
-                 root_node, BinaryJudgementNode
-             ):
-                 if len(root_nodes) > 1:
-                     raise ValueError(
-                         "You cannot provide more than one root node when using 'BinaryJudgementNode' or 'NonBinaryJudgementNode' in root_nodes."
-                     )
+         validate_root_nodes(root_nodes)
+         self.multiturn = isinstance(root_nodes[0], ConversationalBaseNode)

+         if not self.multiturn:
+             for root_node in root_nodes:
+                 if isinstance(root_node, NonBinaryJudgementNode) or isinstance(
+                     root_node, BinaryJudgementNode
+                 ):
+                     if len(root_nodes) > 1:
+                         raise ValueError(
+                             "You cannot provide more than one root node when using 'BinaryJudgementNode' or 'NonBinaryJudgementNode' in root_nodes."
+                         )
+         else:
+             for root_node in root_nodes:
+                 if isinstance(
+                     root_node, ConversationalNonBinaryJudgementNode
+                 ) or isinstance(root_node, ConversationalBinaryJudgementNode):
+                     if len(root_nodes) > 1:
+                         raise ValueError(
+                             "You cannot provide more than one root node when using 'ConversationalBinaryJudgementNode' or 'ConversationalNonBinaryJudgementNode' in root_nodes."
+                         )
          self.root_nodes = root_nodes

-     def _execute(self, metric: BaseMetric, test_case: LLMTestCase) -> None:
+     def _execute(
+         self,
+         metric: Union[BaseMetric, BaseConversationalMetric],
+         test_case: Union[LLMTestCase, ConversationalTestCase],
+     ) -> None:
          for root_node in self.root_nodes:
              root_node._execute(metric=metric, test_case=test_case, depth=0)

      async def _a_execute(
          self,
-         metric: BaseMetric,
-         test_case: LLMTestCase,
+         metric: Union[BaseMetric, BaseConversationalMetric],
+         test_case: Union[LLMTestCase, ConversationalTestCase],
      ) -> None:
          await asyncio.gather(
              *(
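Note that validate_root_nodes compares every root against the concrete type of the first root, so homogeneity is judged relative to whichever node happens to come first (subclasses of the first root's class pass, but not the reverse). A standalone illustration with toy classes, not deepeval imports:

class _Node: ...
class SingleTurnNode(_Node): ...
class MultiTurnNode(_Node): ...

roots = [SingleTurnNode(), MultiTurnNode()]
homogeneous = all(isinstance(n, type(roots[0])) for n in roots)
print(homogeneous)  # False: DeepAcyclicGraph would raise
                    # "You cannot mix multi and single turn nodes"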
deepeval/metrics/dag/utils.py
@@ -1,4 +1,4 @@
- from typing import Set, Dict, Optional
+ from typing import Set, Dict, Optional, Union
  import inspect

  from deepeval.metrics.dag import (
@@ -9,18 +9,33 @@ from deepeval.metrics.dag import (
      TaskNode,
      DeepAcyclicGraph,
  )
- from deepeval.test_case import LLMTestCaseParams
+ from deepeval.metrics.conversational_dag import (
+     ConversationalBaseNode,
+     ConversationalBinaryJudgementNode,
+     ConversationalNonBinaryJudgementNode,
+     ConversationalTaskNode,
+     ConversationalVerdictNode,
+ )
+ from deepeval.test_case import LLMTestCaseParams, TurnParams


- def is_valid_dag_from_roots(root_nodes: list[BaseNode]) -> bool:
+ def is_valid_dag_from_roots(
+     root_nodes: Union[list[BaseNode], list[ConversationalBaseNode]],
+     multiturn: bool,
+ ) -> bool:
      visited = set()
      for root in root_nodes:
-         if not is_valid_dag(root, visited, set()):
+         if not is_valid_dag(root, multiturn, visited, set()):
              return False
      return True


- def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
+ def is_valid_dag(
+     node: Union[BaseNode, ConversationalBaseNode],
+     multiturn: bool,
+     visited=None,
+     stack=None,
+ ) -> bool:
      if visited is None:
          visited = set()
      if stack is None:
@@ -33,14 +48,24 @@ def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:

      visited.add(node)
      stack.add(node)
-     if (
-         isinstance(node, TaskNode)
-         or isinstance(node, BinaryJudgementNode)
-         or isinstance(node, NonBinaryJudgementNode)
-     ):
-         for child in node.children:
-             if not is_valid_dag(child, visited, stack):
-                 return False
+     if not multiturn:
+         if (
+             isinstance(node, TaskNode)
+             or isinstance(node, BinaryJudgementNode)
+             or isinstance(node, NonBinaryJudgementNode)
+         ):
+             for child in node.children:
+                 if not is_valid_dag(child, multiturn, visited, stack):
+                     return False
+     else:
+         if (
+             isinstance(node, ConversationalTaskNode)
+             or isinstance(node, ConversationalBinaryJudgementNode)
+             or isinstance(node, ConversationalNonBinaryJudgementNode)
+         ):
+             for child in node.children:
+                 if not is_valid_dag(child, multiturn, visited, stack):
+                     return False

      stack.remove(node)
      return True
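The multiturn flag only selects which node types get traversed; the cycle check itself is an ordinary depth-first search in which finding a node on the current recursion stack signals a back edge. A self-contained restatement of that logic:

class Node:
    def __init__(self):
        self.children = []

def acyclic(node, visited=None, stack=None):
    visited = set() if visited is None else visited
    stack = set() if stack is None else stack
    if node in stack:      # back edge: node is its own ancestor
        return False
    if node in visited:    # already proven acyclic via another path
        return True
    visited.add(node)
    stack.add(node)
    for child in node.children:
        if not acyclic(child, visited, stack):
            return False
    stack.remove(node)
    return True

a, b = Node(), Node()
a.children = [b]
b.children = [a]           # deliberately introduce a cycle
print(acyclic(a))          # False: DAGMetric would raise "Cycle detected in DAG graph."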
@@ -48,29 +73,51 @@

  def extract_required_params(
      nodes: list[BaseNode],
-     required_params: Optional[Set[LLMTestCaseParams]] = None,
- ) -> Set[LLMTestCaseParams]:
+     multiturn: bool,
+     required_params: Optional[
+         Union[Set[LLMTestCaseParams], Set[TurnParams]]
+     ] = None,
+ ) -> Union[Set[LLMTestCaseParams], Set[TurnParams]]:
      if required_params is None:
          required_params = set()

      for node in nodes:
-         if (
-             isinstance(node, TaskNode)
-             or isinstance(node, BinaryJudgementNode)
-             or isinstance(node, NonBinaryJudgementNode)
-         ):
-             if node.evaluation_params is not None:
-                 required_params.update(node.evaluation_params)
-             extract_required_params(node.children, required_params)
+         if not multiturn:
+             if (
+                 isinstance(node, TaskNode)
+                 or isinstance(node, BinaryJudgementNode)
+                 or isinstance(node, NonBinaryJudgementNode)
+             ):
+                 if node.evaluation_params is not None:
+                     required_params.update(node.evaluation_params)
+                 extract_required_params(
+                     node.children, multiturn, required_params
+                 )
+         else:
+             if (
+                 isinstance(node, ConversationalTaskNode)
+                 or isinstance(node, ConversationalBinaryJudgementNode)
+                 or isinstance(node, ConversationalNonBinaryJudgementNode)
+             ):
+                 if node.evaluation_params is not None:
+                     required_params.update(node.evaluation_params)
+                 extract_required_params(
+                     node.children, multiturn, required_params
+                 )

      return required_params


  def copy_graph(original_dag: DeepAcyclicGraph) -> DeepAcyclicGraph:
      # This mapping avoids re-copying nodes that appear in multiple places.
-     visited: Dict[BaseNode, BaseNode] = {}
-
-     def copy_node(node: BaseNode) -> BaseNode:
+     visited: Union[
+         Dict[BaseNode, BaseNode],
+         Dict[ConversationalBaseNode, ConversationalBaseNode],
+     ] = {}
+
+     def copy_node(
+         node: Union[BaseNode, ConversationalBaseNode],
+     ) -> Union[BaseNode, ConversationalBaseNode]:
          if node in visited:
              return visited[node]

@@ -98,22 +145,40 @@ def copy_graph(original_dag: DeepAcyclicGraph) -> DeepAcyclicGraph:
                  "_depth",
              ]
          }
-         if (
-             isinstance(node, TaskNode)
-             or isinstance(node, BinaryJudgementNode)
-             or isinstance(node, NonBinaryJudgementNode)
-         ):
-             copied_node = node_class(
-                 **valid_args,
-                 children=[copy_node(child) for child in node.children]
-             )
+         if not original_dag.multiturn:
+             if (
+                 isinstance(node, TaskNode)
+                 or isinstance(node, BinaryJudgementNode)
+                 or isinstance(node, NonBinaryJudgementNode)
+             ):
+                 copied_node = node_class(
+                     **valid_args,
+                     children=[copy_node(child) for child in node.children]
+                 )
+             else:
+                 if isinstance(node, VerdictNode) and node.child:
+                     copied_node = node_class(
+                         **valid_args, child=copy_node(node.child)
+                     )
+                 else:
+                     copied_node = node_class(**valid_args)
          else:
-             if isinstance(node, VerdictNode) and node.child:
+             if (
+                 isinstance(node, ConversationalTaskNode)
+                 or isinstance(node, ConversationalBinaryJudgementNode)
+                 or isinstance(node, ConversationalNonBinaryJudgementNode)
+             ):
                  copied_node = node_class(
-                     **valid_args, child=copy_node(node.child)
+                     **valid_args,
+                     children=[copy_node(child) for child in node.children]
                  )
              else:
-                 copied_node = node_class(**valid_args)
+                 if isinstance(node, ConversationalVerdictNode) and node.child:
+                     copied_node = node_class(
+                         **valid_args, child=copy_node(node.child)
+                     )
+                 else:
+                     copied_node = node_class(**valid_args)

          visited[node] = copied_node
          return copied_node
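The visited dict makes copy_node a memoized deep copy: a node reachable along several paths is copied exactly once, so shared subgraphs stay shared in the clone rather than being duplicated. A toy demonstration of the pattern, not the deepeval implementation itself:

class Node:
    def __init__(self, children=None):
        self.children = children or []

def copy_node(node, visited):
    if node in visited:          # reuse the copy made on an earlier path
        return visited[node]
    copied = Node([copy_node(c, visited) for c in node.children])
    visited[node] = copied
    return copied

shared = Node()
root = Node([Node([shared]), shared])
clone = copy_node(root, {})
# Both paths in the clone still lead to one shared object:
print(clone.children[0].children[0] is clone.children[1])  # True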
deepeval/metrics/faithfulness/template.py
@@ -4,7 +4,7 @@ from typing import Optional, List
  class FaithfulnessTemplate:
      @staticmethod
      def generate_claims(actual_output: str):
-         return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided text.
+         return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
  These truths, MUST BE COHERENT, and CANNOT be taken out of context.

  Example:
@@ -24,9 +24,10 @@ Example JSON:
  IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
  Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
  You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+ You should be aware that it is an AI that is outputting these claims.
  **

- Text:
+ AI Output:
  {actual_output}

  JSON:
@@ -72,7 +73,7 @@ JSON:
      def generate_verdicts(claims: List[str], retrieval_context: str):
          return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
  The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.

  **
@@ -84,28 +85,30 @@ Example:
  {{
  "verdicts": [
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Barack Obama is although incorrect, it is not directly addressed in the retrieval context, and so poses no contradiction."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
  }},
  {{
  "verdict": "yes"
  }},
  {{
  "verdict": "no",
- "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
+ "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead. This contradicts the retrieval context."
  }},
  {{
  "verdict": "no",
- "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead."
+ "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead. This contradicts the retrieval context."
  }},
  ]
  }}
  ===== END OF EXAMPLE ======

  The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
- You DON'T have to provide a reason if the answer is 'yes' or 'idk'.
+ You DON'T have to provide a reason if the answer is 'yes'.
  ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
  Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
  Claims that are not backed up by the retrieval context or are not mentioned in it MUST be answered 'idk'.
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py
@@ -39,7 +39,7 @@ class MultimodalAnswerRelevancyTemplate:
  Please generate a list of JSON with two keys: `verdict` and `reason`.
  The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement or image is relevant to addressing the original input, 'no' if the statement or image is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
  The 'reason' is the reason for the verdict.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided statements are statements and images generated in the actual output.

  **
@@ -54,13 +54,15 @@ class MultimodalAnswerRelevancyTemplate:
  "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant."
  }},
  {{
- "verdict": "yes"
+ "verdict": "yes",
  }}
  ]
  }}
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py
@@ -95,7 +95,7 @@ class MultimodalFaithfulnessTemplate:
          return textwrap.dedent(
              f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
  The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
- Provide a 'reason' ONLY if the answer is 'no'.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
  The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.

  **
@@ -107,10 +107,12 @@ class MultimodalFaithfulnessTemplate:
  {{
  "verdicts": [
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction."
  }},
  {{
- "verdict": "idk"
+ "verdict": "idk",
+ "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
  }},
  {{
  "verdict": "yes"
@@ -128,7 +130,7 @@ class MultimodalFaithfulnessTemplate:
  ===== END OF EXAMPLE ======

  The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
- You DON'T have to provide a reason if the answer is 'yes' or 'idk'.
+ You DON'T have to provide a reason if the answer is 'yes'.
  ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
  Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
  Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
deepeval/metrics/tool_correctness/tool_correctness.py
@@ -223,9 +223,13 @@ class ToolCorrectnessMetric(BaseMetric):
              total_score += best_score
              matched_called_tools.add(best_called_tool)
          return (
-             total_score / len(self.expected_tools)
-             if self.expected_tools
-             else 0.0
+             1.0
+             if not self.expected_tools and not self.tools_called
+             else (
+                 0.0
+                 if not self.expected_tools
+                 else total_score / len(self.expected_tools)
+             )
          )

      # Consider ordering score
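The rewritten return value adds an edge case: when no tools were expected and none were called, the metric now scores 1.0 (vacuously correct) instead of 0.0, while calling tools when none were expected still scores 0.0. The nested conditional expression, restated as a plain function for readability:

def tool_correctness_score(total_score, expected_tools, tools_called):
    if not expected_tools and not tools_called:
        return 1.0   # nothing expected, nothing called
    if not expected_tools:
        return 0.0   # tools were called although none were expected
    return total_score / len(expected_tools)

print(tool_correctness_score(0.0, [], []))                  # 1.0
print(tool_correctness_score(0.0, [], ["web_search"]))      # 0.0
print(tool_correctness_score(1.5, ["a", "b"], ["a", "b"]))  # 0.75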
deepeval/models/llms/amazon_bedrock_model.py
@@ -115,13 +115,34 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
      ###############################################

      def get_converse_request_body(self, prompt: str) -> dict:
+         # Inline parameter translation with defaults
+         param_mapping = {
+             "max_tokens": "maxTokens",
+             "top_p": "topP",
+             "top_k": "topK",
+             "stop_sequences": "stopSequences",
+         }
+
+         # Start with defaults for required parameters
+         translated_kwargs = {
+             "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
+             "topP": self.generation_kwargs.get("top_p", 0),
+         }
+
+         # Add any other parameters from generation_kwargs
+         for key, value in self.generation_kwargs.items():
+             if key not in [
+                 "max_tokens",
+                 "top_p",
+             ]:  # Skip already handled defaults
+                 aws_key = param_mapping.get(key, key)
+                 translated_kwargs[aws_key] = value
+
          return {
              "messages": [{"role": "user", "content": [{"text": prompt}]}],
              "inferenceConfig": {
                  "temperature": self.temperature,
-                 "topP": self.generation_kwargs.get("top_p", 0),
-                 "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-                 **self.generation_kwargs,
+                 **translated_kwargs,
              },
          }
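Previously **self.generation_kwargs spliced user-supplied snake_case keys such as max_tokens straight into inferenceConfig, where the Bedrock Converse API expects camelCase names; the new step translates known keys first and passes unknown ones through unchanged. A standalone sketch of the resulting behavior, with illustrative values:

param_mapping = {
    "max_tokens": "maxTokens",
    "top_p": "topP",
    "top_k": "topK",
    "stop_sequences": "stopSequences",
}
generation_kwargs = {"max_tokens": 512, "top_k": 40}

translated = {
    "maxTokens": generation_kwargs.get("max_tokens", 1000),
    "topP": generation_kwargs.get("top_p", 0),
}
for key, value in generation_kwargs.items():
    if key not in ("max_tokens", "top_p"):
        translated[param_mapping.get(key, key)] = value

print(translated)  # {'maxTokens': 512, 'topP': 0, 'topK': 40}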