deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- deepeval/__init__.py +8 -7
- deepeval/_version.py +1 -1
- deepeval/cli/dotenv_handler.py +71 -0
- deepeval/cli/main.py +1021 -280
- deepeval/cli/utils.py +116 -2
- deepeval/confident/api.py +29 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/key_handler.py +64 -2
- deepeval/metrics/__init__.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/conversational_dag/__init__.py +7 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
- deepeval/metrics/conversational_dag/nodes.py +931 -0
- deepeval/metrics/conversational_dag/templates.py +117 -0
- deepeval/metrics/dag/dag.py +13 -4
- deepeval/metrics/dag/graph.py +47 -15
- deepeval/metrics/dag/utils.py +103 -38
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/synthesizer/chunking/doc_chunker.py +87 -51
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
- deepeval/env.py +0 -35
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_dag/templates.py
ADDED
@@ -0,0 +1,117 @@
+from typing import List
+from textwrap import dedent
+
+
+class ConversationalVerdictNodeTemplate:
+    @staticmethod
+    def generate_reason(verbose_steps: List[str], score: float, name: str):
+        return dedent(
+            f"""You are given a metric name, its score, and a traversal path through a conversational evaluation DAG (Directed Acyclic Graph).
+This DAG reflects step-by-step reasoning over a dialogue to arrive at the final verdict.
+
+Each step in the DAG represents a judgment based on parts of the conversation — including roles and the contents they spoke of.
+
+Your task is to explain **why the score was assigned**, using the traversal steps to justify the reasoning.
+
+Metric Name:
+{name}
+
+Score:
+{score}
+
+DAG Traversal:
+{verbose_steps}
+
+**
+IMPORTANT: Only return JSON with a 'reason' key.
+Example:
+{{
+    "reason": "The score is {score} because the assistant repeatedly failed to clarify the user's ambiguous statements, as shown in the DAG traversal path."
+}}
+**
+JSON:
+"""
+        )
+
+
+class ConversationalTaskNodeTemplate:
+    @staticmethod
+    def generate_task_output(instructions: str, text: str):
+        return dedent(
+            f"""You are given a set of task instructions and a full conversation between a user and an assistant.
+
+Instructions:
+{instructions}
+
+{text}
+
+===END OF INPUT===
+
+**
+IMPORTANT: Only return a JSON with the 'output' key containing the result of applying the instructions to the conversation.
+Example:
+{{
+    "output": "..."
+}}
+**
+JSON:
+"""
+        )
+
+
+class ConversationalBinaryJudgementTemplate:
+    @staticmethod
+    def generate_binary_verdict(criteria: str, text: str):
+        return dedent(
+            f"""{criteria}
+
+Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.
+
+Full Conversation:
+{text}
+
+**
+IMPORTANT: Only return JSON with two keys:
+- 'verdict': true or false
+- 'reason': justification based on specific parts of the conversation
+
+Example:
+{{
+    "verdict": true,
+    "reason": "The assistant provided a clear and direct answer in response to every user query."
+}}
+**
+JSON:
+"""
+        )
+
+
+class ConversationalNonBinaryJudgementTemplate:
+    @staticmethod
+    def generate_non_binary_verdict(
+        criteria: str, text: str, options: List[str]
+    ):
+        return dedent(
+            f"""{criteria}
+
+You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.
+
+Options: {options}
+
+Full Conversation:
+{text}
+
+**
+IMPORTANT: Only return JSON with two keys:
+- 'verdict': one of the listed options
+- 'reason': explanation referencing specific conversation points
+
+Example:
+{{
+    "verdict": "{options[1]}",
+    "reason": "The assistant partially addressed the user’s issue but missed clarifying their follow-up question."
+}}
+**
+JSON:
+"""
+        )
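
Note: the four template classes above only build judge prompts; nothing here calls an LLM. A minimal sketch of how they might be exercised directly. The transcript string and criteria are invented for illustration, not deepeval's own formatting:

# Illustrative only: renders the new conversational judge prompts as strings.
from deepeval.metrics.conversational_dag.templates import (
    ConversationalBinaryJudgementTemplate,
    ConversationalNonBinaryJudgementTemplate,
)

transcript = (
    "user: My order never arrived.\n"
    "assistant: Sorry about that - let me pull up the tracking details."
)

binary_prompt = ConversationalBinaryJudgementTemplate.generate_binary_verdict(
    criteria="Determine whether the assistant acknowledged the user's problem.",
    text=transcript,
)

graded_prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
    criteria="Judge how well the assistant handled the complaint.",
    text=transcript,
    options=["poorly", "partially", "fully"],
)

# Both prompts instruct the judge LLM to answer as JSON:
# {"verdict": ..., "reason": ...}. Note that generate_non_binary_verdict
# indexes options[1] in its example, so it expects at least two options.
print(binary_prompt)
print(graded_prompt)
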
deepeval/metrics/dag/dag.py
CHANGED
@@ -13,8 +13,8 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.g_eval.schema import *
 from deepeval.metrics.dag.graph import DeepAcyclicGraph
-from deepeval.metrics.dag.utils import copy_graph
 from deepeval.metrics.dag.utils import (
+    copy_graph,
     is_valid_dag_from_roots,
     extract_required_params,
 )
@@ -34,7 +34,12 @@ class DAGMetric(BaseMetric):
         verbose_mode: bool = False,
         _include_dag_suffix: bool = True,
     ):
-        if is_valid_dag_from_roots(root_nodes=dag.root_nodes) == False:
+        if (
+            is_valid_dag_from_roots(
+                root_nodes=dag.root_nodes, multiturn=dag.multiturn
+            )
+            == False
+        ):
             raise ValueError("Cycle detected in DAG graph.")
 
         self._verbose_steps: List[str] = []
@@ -56,7 +61,9 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
     ) -> float:
         check_llm_test_case_params(
-            test_case,
+            test_case,
+            extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            self,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -91,7 +98,9 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
     ) -> float:
         check_llm_test_case_params(
-            test_case,
+            test_case,
+            extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            self,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
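
Note: with `multiturn` now threaded through validation, `DAGMetric` rejects cyclic graphs at construction time. A sketch of the single-turn path, assuming the node constructor signatures from deepeval's public DAG API (`TaskNode`, `BinaryJudgementNode`, `VerdictNode`):

# Sketch, assuming deepeval's documented single-turn DAG node constructors.
from deepeval.metrics import DAGMetric
from deepeval.metrics.dag import (
    DeepAcyclicGraph,
    TaskNode,
    BinaryJudgementNode,
    VerdictNode,
)
from deepeval.test_case import LLMTestCaseParams

judgement = BinaryJudgementNode(
    criteria="Does the extracted summary mention a refund?",
    children=[
        VerdictNode(verdict=True, score=10),
        VerdictNode(verdict=False, score=0),
    ],
)
root = TaskNode(
    instructions="Summarize the actual output in one sentence.",
    output_label="summary",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    children=[judgement],
)

dag = DeepAcyclicGraph(root_nodes=[root])  # multiturn inferred as False
# __init__ now calls is_valid_dag_from_roots(root_nodes=..., multiturn=...)
# and raises ValueError("Cycle detected in DAG graph.") on a back edge.
metric = DAGMetric(name="Refund summary", dag=dag)
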
deepeval/metrics/dag/graph.py
CHANGED
@@ -1,39 +1,71 @@
 import asyncio
-from typing import List
+from typing import List, Union
 
 from deepeval.metrics.dag import (
     BaseNode,
     NonBinaryJudgementNode,
     BinaryJudgementNode,
 )
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics import BaseMetric
+from deepeval.metrics.conversational_dag import (
+    ConversationalBaseNode,
+    ConversationalBinaryJudgementNode,
+    ConversationalNonBinaryJudgementNode,
+)
+from deepeval.test_case import LLMTestCase, ConversationalTestCase
+from deepeval.metrics import BaseMetric, BaseConversationalMetric
+
+
+def validate_root_nodes(
+    root_nodes: Union[List[BaseNode], List[ConversationalBaseNode]],
+):
+    # all root nodes must be of the same type: you cannot mix multi- and single-turn nodes
+    if not all(isinstance(node, type(root_nodes[0])) for node in root_nodes):
+        raise ValueError("You cannot mix multi and single turn nodes")
+    return True
 
 
 class DeepAcyclicGraph:
+    multiturn: bool
+
     def __init__(
         self,
-        root_nodes: List[BaseNode],
+        root_nodes: Union[List[BaseNode], List[ConversationalBaseNode]],
     ):
-        for root_node in root_nodes:
-            if isinstance(root_node, NonBinaryJudgementNode) or isinstance(
-                root_node, BinaryJudgementNode
-            ):
-                if len(root_nodes) > 1:
-                    raise ValueError(
-                        "You cannot provide more than one root node when using 'BinaryJudgementNode' or 'NonBinaryJudgementNode' in root_nodes."
-                    )
+        validate_root_nodes(root_nodes)
+        self.multiturn = isinstance(root_nodes[0], ConversationalBaseNode)
 
+        if not self.multiturn:
+            for root_node in root_nodes:
+                if isinstance(root_node, NonBinaryJudgementNode) or isinstance(
+                    root_node, BinaryJudgementNode
+                ):
+                    if len(root_nodes) > 1:
+                        raise ValueError(
+                            "You cannot provide more than one root node when using 'BinaryJudgementNode' or 'NonBinaryJudgementNode' in root_nodes."
+                        )
+        else:
+            for root_node in root_nodes:
+                if isinstance(
+                    root_node, ConversationalNonBinaryJudgementNode
+                ) or isinstance(root_node, ConversationalBinaryJudgementNode):
+                    if len(root_nodes) > 1:
+                        raise ValueError(
+                            "You cannot provide more than one root node when using 'ConversationalBinaryJudgementNode' or 'ConversationalNonBinaryJudgementNode' in root_nodes."
+                        )
         self.root_nodes = root_nodes
 
-    def _execute(self, metric: BaseMetric, test_case: LLMTestCase) -> None:
+    def _execute(
+        self,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Union[LLMTestCase, ConversationalTestCase],
+    ) -> None:
         for root_node in self.root_nodes:
            root_node._execute(metric=metric, test_case=test_case, depth=0)
 
     async def _a_execute(
         self,
-        metric: BaseMetric,
-        test_case: LLMTestCase,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Union[LLMTestCase, ConversationalTestCase],
     ) -> None:
         await asyncio.gather(
             *(
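
Note: the new `validate_root_nodes` check is worth a close read. It compares every root against the concrete type of the first root, so as written it rejects not just single/multiturn mixes but any later root that is not an instance of the first root's class. A distilled, self-contained sketch of that behavior (stand-in classes, not deepeval's):

# Stand-in classes to isolate the type check from the diff above.
class BaseNode: ...
class ConversationalBaseNode: ...

def validate_root_nodes(root_nodes):
    # Same logic as the diff: every root must be an instance of the
    # first root's concrete type.
    if not all(isinstance(node, type(root_nodes[0])) for node in root_nodes):
        raise ValueError("You cannot mix multi and single turn nodes")
    return True

print(validate_root_nodes([BaseNode(), BaseNode()]))  # True
try:
    validate_root_nodes([BaseNode(), ConversationalBaseNode()])
except ValueError as err:
    print(err)  # You cannot mix multi and single turn nodes

Since `DeepAcyclicGraph` then sets `self.multiturn = isinstance(root_nodes[0], ConversationalBaseNode)`, the first root alone decides which branch of every downstream helper runs.
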
deepeval/metrics/dag/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Optional
+from typing import Set, Dict, Optional, Union
 import inspect
 
 from deepeval.metrics.dag import (
@@ -9,18 +9,33 @@ from deepeval.metrics.dag import (
     TaskNode,
     DeepAcyclicGraph,
 )
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.metrics.conversational_dag import (
+    ConversationalBaseNode,
+    ConversationalBinaryJudgementNode,
+    ConversationalNonBinaryJudgementNode,
+    ConversationalTaskNode,
+    ConversationalVerdictNode,
+)
+from deepeval.test_case import LLMTestCaseParams, TurnParams
 
 
-def is_valid_dag_from_roots(root_nodes: list[BaseNode]) -> bool:
+def is_valid_dag_from_roots(
+    root_nodes: Union[list[BaseNode], list[ConversationalBaseNode]],
+    multiturn: bool,
+) -> bool:
     visited = set()
     for root in root_nodes:
-        if not is_valid_dag(root, visited, set()):
+        if not is_valid_dag(root, multiturn, visited, set()):
             return False
     return True
 
 
-def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
+def is_valid_dag(
+    node: Union[BaseNode, ConversationalBaseNode],
+    multiturn: bool,
+    visited=None,
+    stack=None,
+) -> bool:
     if visited is None:
         visited = set()
     if stack is None:
@@ -33,14 +48,24 @@ def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
 
     visited.add(node)
     stack.add(node)
-    if (
-        isinstance(node, TaskNode)
-        or isinstance(node, BinaryJudgementNode)
-        or isinstance(node, NonBinaryJudgementNode)
-    ):
-        for child in node.children:
-            if not is_valid_dag(child, visited, stack):
-                return False
+    if not multiturn:
+        if (
+            isinstance(node, TaskNode)
+            or isinstance(node, BinaryJudgementNode)
+            or isinstance(node, NonBinaryJudgementNode)
+        ):
+            for child in node.children:
+                if not is_valid_dag(child, multiturn, visited, stack):
+                    return False
+    else:
+        if (
+            isinstance(node, ConversationalTaskNode)
+            or isinstance(node, ConversationalBinaryJudgementNode)
+            or isinstance(node, ConversationalNonBinaryJudgementNode)
+        ):
+            for child in node.children:
+                if not is_valid_dag(child, multiturn, visited, stack):
+                    return False
 
     stack.remove(node)
     return True
@@ -48,29 +73,51 @@ def is_valid_dag(node: BaseNode, visited=None, stack=None) -> bool:
 
 def extract_required_params(
     nodes: list[BaseNode],
-    required_params: Optional[Set[LLMTestCaseParams]] = None,
-) -> Set[LLMTestCaseParams]:
+    multiturn: bool,
+    required_params: Optional[
+        Union[Set[LLMTestCaseParams], Set[TurnParams]]
+    ] = None,
+) -> Union[Set[LLMTestCaseParams], Set[TurnParams]]:
     if required_params is None:
         required_params = set()
 
     for node in nodes:
-        if (
-            isinstance(node, TaskNode)
-            or isinstance(node, BinaryJudgementNode)
-            or isinstance(node, NonBinaryJudgementNode)
-        ):
-            if node.evaluation_params is not None:
-                required_params.update(node.evaluation_params)
-            extract_required_params(node.children, required_params)
+        if not multiturn:
+            if (
+                isinstance(node, TaskNode)
+                or isinstance(node, BinaryJudgementNode)
+                or isinstance(node, NonBinaryJudgementNode)
+            ):
+                if node.evaluation_params is not None:
+                    required_params.update(node.evaluation_params)
+                extract_required_params(
+                    node.children, multiturn, required_params
+                )
+        else:
+            if (
+                isinstance(node, ConversationalTaskNode)
+                or isinstance(node, ConversationalBinaryJudgementNode)
+                or isinstance(node, ConversationalNonBinaryJudgementNode)
+            ):
+                if node.evaluation_params is not None:
+                    required_params.update(node.evaluation_params)
+                extract_required_params(
+                    node.children, multiturn, required_params
+                )
 
     return required_params
 
 
 def copy_graph(original_dag: DeepAcyclicGraph) -> DeepAcyclicGraph:
     # This mapping avoids re-copying nodes that appear in multiple places.
-    visited: Dict[BaseNode, BaseNode] = {}
-
-    def copy_node(node: BaseNode) -> BaseNode:
+    visited: Union[
+        Dict[BaseNode, BaseNode],
+        Dict[ConversationalBaseNode, ConversationalBaseNode],
+    ] = {}
+
+    def copy_node(
+        node: Union[BaseNode, ConversationalBaseNode],
+    ) -> Union[BaseNode, ConversationalBaseNode]:
         if node in visited:
             return visited[node]
 
@@ -98,22 +145,40 @@ def copy_graph(original_dag: DeepAcyclicGraph) -> DeepAcyclicGraph:
             "_depth",
         ]
     }
-    if (
-        isinstance(node, TaskNode)
-        or isinstance(node, BinaryJudgementNode)
-        or isinstance(node, NonBinaryJudgementNode)
-    ):
-        copied_node = node_class(
-            **valid_args,
-            children=[copy_node(child) for child in node.children]
-        )
+    if not original_dag.multiturn:
+        if (
+            isinstance(node, TaskNode)
+            or isinstance(node, BinaryJudgementNode)
+            or isinstance(node, NonBinaryJudgementNode)
+        ):
+            copied_node = node_class(
+                **valid_args,
+                children=[copy_node(child) for child in node.children]
+            )
+        else:
+            if isinstance(node, VerdictNode) and node.child:
+                copied_node = node_class(
+                    **valid_args, child=copy_node(node.child)
+                )
+            else:
+                copied_node = node_class(**valid_args)
     else:
-        if isinstance(node, VerdictNode) and node.child:
+        if (
+            isinstance(node, ConversationalTaskNode)
+            or isinstance(node, ConversationalBinaryJudgementNode)
+            or isinstance(node, ConversationalNonBinaryJudgementNode)
+        ):
             copied_node = node_class(
-                **valid_args, child=copy_node(node.child)
+                **valid_args,
+                children=[copy_node(child) for child in node.children]
             )
         else:
-            copied_node = node_class(**valid_args)
+            if isinstance(node, ConversationalVerdictNode) and node.child:
+                copied_node = node_class(
+                    **valid_args, child=copy_node(node.child)
+                )
+            else:
+                copied_node = node_class(**valid_args)
 
     visited[node] = copied_node
     return copied_node
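
Note: the `visited`/`stack` pair in `is_valid_dag` is a standard DFS cycle check: `stack` holds the current path, so meeting a node already on the path means a back edge. A distilled sketch under that reading. The early-return guards are assumed (the diff only shows the `add`/`remove` bookkeeping), and the real function recurses only through task and judgement nodes:

# Distilled DFS cycle check; Node is a stand-in, not a deepeval class.
class Node:
    def __init__(self):
        self.children = []

def is_valid_dag(node, visited=None, stack=None) -> bool:
    if visited is None:
        visited = set()
    if stack is None:
        stack = set()
    if node in stack:
        return False  # back edge: the "cycle" the DAGMetric error reports
    if node in visited:
        return True   # already fully explored via another path
    visited.add(node)
    stack.add(node)
    for child in node.children:
        if not is_valid_dag(child, visited, stack):
            return False
    stack.remove(node)
    return True

a, b = Node(), Node()
a.children.append(b)
b.children.append(a)        # deliberately introduce a cycle
print(is_valid_dag(a))      # False
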
deepeval/metrics/faithfulness/template.py
CHANGED
@@ -4,7 +4,7 @@ from typing import Optional, List
 class FaithfulnessTemplate:
     @staticmethod
     def generate_claims(actual_output: str):
-        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided
+        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
 These truths, MUST BE COHERENT, and CANNOT be taken out of context.
 
 Example:
@@ -24,9 +24,10 @@ Example JSON:
 IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
 Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
 You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+You should be aware that it is an AI that is outputting these claims.
 **
 
-Text:
+AI Output:
 {actual_output}
 
 JSON:
@@ -72,7 +73,7 @@ JSON:
     def generate_verdicts(claims: List[str], retrieval_context: str):
         return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
 **
@@ -84,28 +85,30 @@ Example:
 {{
     "verdicts": [
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Barack Obama is although incorrect, it is not directly addressed in the retrieval context, and so poses no contradiction."
         }},
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
         }},
         {{
            "verdict": "yes"
         }},
         {{
            "verdict": "no",
-            "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
+            "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead. This contradicts the retrieval context."
         }},
         {{
            "verdict": "no",
-            "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead."
+            "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead. This contradicts the retrieval context."
        }},
     ]
 }}
 ===== END OF EXAMPLE ======
 
 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'
+You DON'T have to provide a reason if the answer is 'yes'.
 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
 Claims that are not backed up by the retrieval context or are not mentioned in it MUST be answered 'idk'.
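
Note: the template change means 'idk' verdicts now carry reasons too, but scoring is unchanged. In deepeval's documented faithfulness metric only 'no' (a direct contradiction) counts against the output, while 'yes' and 'idk' do not. A sketch of that roll-up, stated as an assumption about the metric rather than code from this diff:

from typing import Dict, List

def faithfulness_score(verdicts: List[Dict[str, str]]) -> float:
    # Assumed aggregation: fraction of claims that are NOT contradicted.
    if not verdicts:
        return 1.0
    contradicted = sum(1 for v in verdicts if v["verdict"].strip().lower() == "no")
    return (len(verdicts) - contradicted) / len(verdicts)

verdicts = [
    {"verdict": "idk", "reason": "Claim is not addressed by the retrieval context."},
    {"verdict": "yes"},
    {"verdict": "no", "reason": "Directly contradicts the retrieval context."},
]
print(round(faithfulness_score(verdicts), 3))  # 0.667
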
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py
CHANGED
@@ -39,7 +39,7 @@ class MultimodalAnswerRelevancyTemplate:
 Please generate a list of JSON with two keys: `verdict` and `reason`.
 The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement or image is relevant to addressing the original input, 'no' if the statement or image is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
 The 'reason' is the reason for the verdict.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided statements are statements and images generated in the actual output.
 
 **
@@ -54,13 +54,15 @@ class MultimodalAnswerRelevancyTemplate:
         "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake."
     }},
     {{
-        "verdict": "idk"
+        "verdict": "idk",
+        "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant."
     }},
     {{
-        "verdict": "idk"
+        "verdict": "idk",
+        "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant."
     }},
     {{
-        "verdict": "yes"
+        "verdict": "yes",
     }}
 ]
 }}
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py
CHANGED
@@ -95,7 +95,7 @@ class MultimodalFaithfulnessTemplate:
         return textwrap.dedent(
             f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no'.
+Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
 The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
 
 **
@@ -107,10 +107,12 @@ class MultimodalFaithfulnessTemplate:
 {{
     "verdicts": [
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction."
         }},
         {{
-            "verdict": "idk"
+            "verdict": "idk",
+            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
         }},
         {{
             "verdict": "yes"
@@ -128,7 +130,7 @@ class MultimodalFaithfulnessTemplate:
 ===== END OF EXAMPLE ======
 
 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'
+You DON'T have to provide a reason if the answer is 'yes'.
 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
 Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
deepeval/metrics/tool_correctness/tool_correctness.py
CHANGED
@@ -223,9 +223,13 @@ class ToolCorrectnessMetric(BaseMetric):
             total_score += best_score
             matched_called_tools.add(best_called_tool)
         return (
-            total_score / len(self.expected_tools)
-            if self.expected_tools
-            else
+            1.0
+            if not self.expected_tools and not self.tools_called
+            else (
+                0.0
+                if not self.expected_tools
+                else total_score / len(self.expected_tools)
+            )
         )
 
         # Consider ordering score
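
Note: the rewritten return expression fixes two edge cases: no expected tools and no called tools is now a perfect score, while calling tools when none were expected scores zero. Distilled into a standalone function (names are illustrative; `total_score` is the summed per-tool match score from the loop above):

from typing import List

def tool_correctness(expected_tools: List[str],
                     tools_called: List[str],
                     total_score: float) -> float:
    # Mirrors the nested conditional expression in the diff.
    return (
        1.0
        if not expected_tools and not tools_called   # nothing expected, nothing called
        else (
            0.0
            if not expected_tools                    # unexpected tool calls
            else total_score / len(expected_tools)   # average over expected tools
        )
    )

print(tool_correctness([], [], 0.0))                                 # 1.0
print(tool_correctness([], ["web_search"], 0.0))                     # 0.0
print(tool_correctness(["web_search", "sql"], ["web_search"], 1.0))  # 0.5
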
deepeval/models/llms/amazon_bedrock_model.py
CHANGED
@@ -115,13 +115,34 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     ###############################################
 
     def get_converse_request_body(self, prompt: str) -> dict:
+        # Inline parameter translation with defaults
+        param_mapping = {
+            "max_tokens": "maxTokens",
+            "top_p": "topP",
+            "top_k": "topK",
+            "stop_sequences": "stopSequences",
+        }
+
+        # Start with defaults for required parameters
+        translated_kwargs = {
+            "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
+            "topP": self.generation_kwargs.get("top_p", 0),
+        }
+
+        # Add any other parameters from generation_kwargs
+        for key, value in self.generation_kwargs.items():
+            if key not in [
+                "max_tokens",
+                "top_p",
+            ]:  # Skip already handled defaults
+                aws_key = param_mapping.get(key, key)
+                translated_kwargs[aws_key] = value
+
         return {
             "messages": [{"role": "user", "content": [{"text": prompt}]}],
             "inferenceConfig": {
                 "temperature": self.temperature,
-                "topP": self.generation_kwargs.get("top_p", 0),
-                "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-                **self.generation_kwargs,
+                **translated_kwargs,
             },
         }
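
Note: previously the raw `generation_kwargs` were spread straight into `inferenceConfig`, so snake_case keys like `top_k` reached the Bedrock Converse API unchanged. The new code maps them to the camelCase names Converse expects. A standalone sketch of the same translation (the function name is illustrative):

def translate_generation_kwargs(generation_kwargs: dict) -> dict:
    # snake_case -> camelCase names used by the Bedrock Converse API.
    param_mapping = {
        "max_tokens": "maxTokens",
        "top_p": "topP",
        "top_k": "topK",
        "stop_sequences": "stopSequences",
    }
    # Defaults taken from the diff: maxTokens=1000, topP=0.
    translated = {
        "maxTokens": generation_kwargs.get("max_tokens", 1000),
        "topP": generation_kwargs.get("top_p", 0),
    }
    for key, value in generation_kwargs.items():
        if key not in ("max_tokens", "top_p"):  # defaults already applied
            translated[param_mapping.get(key, key)] = value
    return translated

print(translate_generation_kwargs(
    {"max_tokens": 512, "top_k": 40, "stop_sequences": ["###"]}
))
# {'maxTokens': 512, 'topP': 0, 'topK': 40, 'stopSequences': ['###']}
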