deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/tool_use/tool_use.py
@@ -0,0 +1,458 @@
+from typing import Optional, List, Union
+import asyncio
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    get_unit_interactions,
+    print_tools_called,
+    check_conversational_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import (
+    ConversationalTestCase,
+    TurnParams,
+    ToolCall,
+    Turn,
+)
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.tool_use.template import ToolUseTemplate
+from deepeval.metrics.tool_use.schema import (
+    ToolSelectionScore,
+    UserInputAndTools,
+    ArgumentCorrectnessScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class ToolUseMetric(BaseConversationalMetric):
+
+    _required_test_case_params = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+    ]
+
+    def __init__(
+        self,
+        available_tools: List[ToolCall],
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.available_tools = available_tools
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                user_input_and_tools = self._get_user_input_and_turns(
+                    unit_interactions
+                )
+                tool_selection_scores = [
+                    self._get_tool_selection_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                ]
+                argument_correctness_scores = [
+                    self._get_argument_correctness_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                    if user_and_tools.tools_used
+                ]
+                self.score = self._calculate_score(
+                    tool_selection_scores, argument_correctness_scores
+                )
+                tool_selection_reason = (
+                    self._generate_reason_for_tool_selection(
+                        tool_selection_scores
+                    )
+                )
+                argument_correctness_reason = (
+                    self._generate_reason_for_argument_correctness(
+                        argument_correctness_scores
+                    )
+                )
+                self.reason = str(
+                    "\n".join(
+                        [tool_selection_reason, argument_correctness_reason]
+                    )
+                )
+
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
+                        f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
+                        f"Final Score: {self.score}",
+                        f"Final Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            user_input_and_tools = self._get_user_input_and_turns(
+                unit_interactions
+            )
+            tool_selection_scores = await asyncio.gather(
+                *[
+                    self._a_get_tool_selection_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                ]
+            )
+            argument_correctness_scores = await asyncio.gather(
+                *[
+                    self._a_get_argument_correctness_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                    if user_and_tools.tools_used
+                ]
+            )
+            self.score = self._calculate_score(
+                tool_selection_scores, argument_correctness_scores
+            )
+            tool_selection_reason = (
+                await self._a_generate_reason_for_tool_selection(
+                    tool_selection_scores
+                )
+            )
+            argument_correctness_reason = (
+                await self._a_generate_reason_for_argument_correctness(
+                    argument_correctness_scores
+                )
+            )
+            self.reason = str(
+                "\n".join([tool_selection_reason, argument_correctness_reason])
+            )
+
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
+                    f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
+                    f"Final Score: {self.score}",
+                    f"Final Reason: {self.reason}",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _get_argument_correctness_score(
+        self, user_and_tools: UserInputAndTools
+    ):
+        prompt = ToolUseTemplate.get_argument_correctness_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ArgumentCorrectnessScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ArgumentCorrectnessScore = self.model.generate(
+                    prompt, schema=ArgumentCorrectnessScore
+                )
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ArgumentCorrectnessScore(**data)
+
+    async def _a_get_argument_correctness_score(
+        self,
+        user_and_tools: UserInputAndTools,
+    ):
+        prompt = ToolUseTemplate.get_argument_correctness_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ArgumentCorrectnessScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ArgumentCorrectnessScore = await self.model.a_generate(
+                    prompt, schema=ArgumentCorrectnessScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ArgumentCorrectnessScore(**data)
+
+    def _get_tool_selection_score(
+        self,
+        user_and_tools: UserInputAndTools,
+    ):
+        prompt = ToolUseTemplate.get_tool_selection_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ToolSelectionScore = self.model.generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    async def _a_get_tool_selection_score(
+        self,
+        user_and_tools: UserInputAndTools,
+    ):
+        prompt = ToolUseTemplate.get_tool_selection_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ToolSelectionScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ToolSelectionScore = await self.model.a_generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    def _get_user_input_and_turns(
+        self,
+        unit_interactions: List[List[Turn]],
+    ) -> List[UserInputAndTools]:
+        user_inputs_and_tools = []
+        available_tools = ",".join(
+            [repr(tool) for tool in self.available_tools]
+        )
+        for unit_interaction in unit_interactions:
+            if len(unit_interaction) < 2:
+                continue
+            user_messages = ""
+            assistant_messages = ""
+            tools_called = []
+            tools_used = False
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_messages += f"{turn.content} \n"
+                else:
+                    break
+            for turn in unit_interaction[1:]:
+                if turn.role == "assistant":
+                    assistant_messages += f"{turn.content} \n"
+                    if turn.tools_called:
+                        tools_called.extend(turn.tools_called)
+                        tools_used = True
+            tools_called = ",".join([repr(tool) for tool in tools_called])
+            new_user_input_tools = UserInputAndTools(
+                user_messages=user_messages,
+                assistant_messages=assistant_messages,
+                tools_called=tools_called,
+                available_tools=available_tools,
+                tools_used=tools_used,
+            )
+            user_inputs_and_tools.append(new_user_input_tools)
+        return user_inputs_and_tools
+
+    def _calculate_score(
+        self,
+        tool_use_scores: List[ToolSelectionScore],
+        argument_correctness_scores: List[ArgumentCorrectnessScore],
+    ):
+        tools_scores_sum = sum(
+            [tool_use_score.score for tool_use_score in tool_use_scores]
+        )
+        arguments_scores_sum = sum(
+            [
+                argument_correctness_score.score
+                for argument_correctness_score in argument_correctness_scores
+            ]
+        )
+        tool_selections_scores_divisor = (
+            len(tool_use_scores) if len(tool_use_scores) > 0 else 1
+        )
+        argument_correctness_score_divisor = (
+            len(argument_correctness_scores)
+            if len(argument_correctness_scores) > 0
+            else 1
+        )
+        tools_selction_score = tools_scores_sum / tool_selections_scores_divisor
+        argument_correctness_score = (
+            arguments_scores_sum / argument_correctness_score_divisor
+        )
+        score = min(tools_selction_score, argument_correctness_score)
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    def _generate_reason_for_tool_selection(
+        self,
+        tool_use_scores: List[ToolSelectionScore],
+    ):
+        scores_and_reasons = ""
+        for tool_use in tool_use_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    def _generate_reason_for_argument_correctness(
+        self,
+        argument_correctness_scores: List[ArgumentCorrectnessScore],
+    ):
+        scores_and_reasons = ""
+        for tool_use in argument_correctness_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason_for_tool_selection(
+        self, tool_use_scores: List[ToolSelectionScore]
+    ):
+        scores_and_reasons = ""
+        for tool_use in tool_use_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    async def _a_generate_reason_for_argument_correctness(
+        self, argument_correctness_scores: List[ArgumentCorrectnessScore]
+    ):
+        scores_and_reasons = ""
+        for tool_use in argument_correctness_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def is_successful(self) -> bool:
+        try:
+            self.success = self.score >= self.threshold
+        except (AttributeError, TypeError):
+            self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Tool Use"
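For orientation, here is a minimal usage sketch of the new ToolUseMetric, based only on the constructor and measure() signature in the hunk above. The tool catalog and turns are illustrative placeholders, and the top-level import assumes the metric is re-exported from deepeval.metrics (as the +13 lines in deepeval/metrics/__init__.py suggest); otherwise import it from deepeval.metrics.tool_use.tool_use.

from deepeval.metrics import ToolUseMetric
from deepeval.test_case import ConversationalTestCase, ToolCall, Turn

# Illustrative tool catalog and transcript; any real agent conversation works.
available_tools = [
    ToolCall(name="get_weather", description="Returns current weather for a city."),
]
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What's the weather in Paris?"),
        Turn(
            role="assistant",
            content="It is 18°C and sunny in Paris.",
            tools_called=[ToolCall(name="get_weather")],
        ),
    ]
)

metric = ToolUseMetric(available_tools=available_tools, threshold=0.5)
metric.measure(test_case)  # runs the judge LLM; async_mode=True is the default path
print(metric.score, metric.reason)

Running this requires a judge model to be configured (for the default native model, an OpenAI API key).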
deepeval/metrics/topic_adherence/__init__.py
@@ -0,0 +1 @@
+from .topic_adherence import TopicAdherenceMetric
deepeval/metrics/topic_adherence/schema.py
@@ -0,0 +1,16 @@
+from pydantic import BaseModel
+from typing import List, Dict, Literal
+
+
+class QAPair(BaseModel):
+    question: str
+    response: str
+
+
+class QAPairs(BaseModel):
+    qa_pairs: List[QAPair]
+
+
+class RelevancyVerdict(BaseModel):
+    verdict: Literal["TP", "TN", "FP", "FN"]
+    reason: str
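A small sketch of how these schemas are typically consumed, mirroring the generate(..., schema=...) / trimAndLoadJson fallback pattern visible in tool_use.py above; the JSON payload and verdict values below are illustrative, not taken from the diff.

import json
from deepeval.metrics.topic_adherence.schema import QAPairs, RelevancyVerdict

# Illustrative judge output for the QA-pair extraction step.
raw = (
    '{"qa_pairs": [{"question": "Which food is best for diabetic patients?", '
    '"response": "Steel-cut oats are good for diabetic patients"}]}'
)
qa_pairs = QAPairs(**json.loads(raw))

# Each extracted pair is then judged against the allowed topics.
verdict = RelevancyVerdict(
    verdict="TP",
    reason="Question and response both fall under the allowed nutrition topics.",
)
print(len(qa_pairs.qa_pairs), verdict.verdict)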
deepeval/metrics/topic_adherence/template.py
@@ -0,0 +1,162 @@
+from typing import List
+import textwrap
+
+
+class TopicAdherenceTemplate:
+
+    @staticmethod
+    def get_qa_pairs(
+        conversation: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""Your task is to extract question-answer (QA) pairs from a multi-turn conversation between a `user` and an `assistant`.
+
+            You must return only valid pairs where:
+            - The **question** comes from the `user`.
+            - The **response** comes from the `assistant`.
+            - Both question and response must appear **explicitly** in the conversation.
+
+            Do not infer information beyond what is stated. Ignore irrelevant or conversational turns (e.g. greetings, affirmations) that do not constitute clear QA pairs.
+            If there are multiple questions and multiple answers in a single sentence, break them into separate pairs. Each pair must be standalone, and should not contain more than one question or response.
+
+            OUTPUT Format:
+            Return a **JSON object** with a single 2 keys:
+            - `"question"`: the user's question
+            - `"response"`: the assistant's direct response
+
+            If no valid QA pairs are found, return:
+            ```json
+            {{
+                question: "",
+                response: ""
+            }}
+
+            CHAIN OF THOUGHT:
+            - Read the full conversation sequentially.
+            - Identify user turns that clearly ask a question (explicit or strongly implied).
+            - Match each question with the immediate assistant response.
+            - Only include pairs where the assistant's reply directly addresses the user's question.
+            - Do not include incomplete, ambiguous, or out-of-context entries.
+
+            EXAMPLE:
+
+            Conversation:
+
+            user: Which food is best for diabetic patients?
+            assistant: Steel-cut oats are good for diabetic patients
+            user: Is it better if I eat muesli instead of oats?
+            assistant: While muesli is good for diabetic people, steel-cut oats are preferred. Refer to your nutritionist for better guidance.
+
+            Example JSON:
+            {{
+                "question": "Which food is best for diabetic patients?",
+                "response": "Steel-cut oats are good for diabetic patients"
+            }}
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format with one key: 'qa_pairs' and the value MUST be a list of dictionaries
+            **
+
+            Conversation:
+            {conversation}
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_qa_pair_verdict(
+        relevant_topics: List[str],
+        question: str,
+        response: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are given:
+            - A list of **relevant topics**
+            - A **user question**
+            - An **assistant response**
+
+            Your task is to:
+            1. Determine if the question is relevant to the list of topics.
+            2. If it is relevant, evaluate whether the response properly answers the question.
+            3. Based on both relevance and correctness, assign one of four possible verdicts.
+            4. Give a simple, comprehensive reason explaining why this question-answer pair was assigned this verdict
+
+            VERDICTS:
+            - `"TP"` (True Positive): Question is relevant and the response correctly answers it.
+            - `"FN"` (False Negative): Question is relevant, but the assistant refused to answer or gave an irrelevant response.
+            - `"FP"` (False Positive): Question is NOT relevant, but the assistant still gave an answer (based on general/training knowledge).
+            - `"TN"` (True Negative): Question is NOT relevant, and the assistant correctly refused to answer.
+
+            OUTPUT FORMAT:
+            Return only a **JSON object** with one key:
+            ```json
+            {{
+                "verdict": "TP" // or TN, FP, FN
+                "reason": "Reason why the verdict is 'TP'"
+            }}
+
+            CHAIN OF THOUGHT:
+            - Check if the question aligns with any of the relevant topics.
+            - If yes:
+            - Assess if the response is correct, complete, and directly answers the question.
+            - If no:
+            - Check if the assistant refused appropriately or gave an unwarranted answer.
+            - Choose the correct verdict using the definitions above.
+
+            EXAMPLE:
+
+            Relevant topics: ["heath nutrition", "food and their benefits"]
+            Question: "Which food is best for diabetic patients?"
+            Response: "Steel-cut oats are good for diabetic patients"
+
+            Example JSON:
+            {{
+                "verdict": "TP",
+                "reason": The question asks about food for diabetic patients and the response clearly answers that oats are good for diabetic patients. Both align with the relevant topics of heath nutrition and food and their benefits...
+            }}
+
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format with two keys: 'verdict' and 'reason'
+            **
+
+            Relevant topics: {relevant_topics}
+            Question: {question}
+            Response: {response}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def generate_reason(success, score, threshold, TP, TN, FP, FN) -> str:
+        return textwrap.dedent(
+            f"""You are given a score for a metric that calculates whether an agent has adhered to it's topics.
+            You are also given a list of reasons for the truth table values that were used to calculate final score.
+
+            Your task is to go through these reasons and give a single final explaination that clearly explains why this metric has failed or passed.
+
+            Pass: {success}
+            Score: {score}
+            Threshold: {threshold}
+
+            Here are the reasons for all truth table entries:
+
+            True positive reasons: {TP[1]}
+            True negative reasons: {TN[1]}
+            False positives reasons: {FP[1]}
+            False negatives reasons: {FN[1]}
+
+            Score calculation = Number of True Positives + Number of True Negatives / Total number of table entries
+
+            **
+            IMPORTANT: Now generate a comprehensive reason that explains why this metric failed. You MUST output only the reason as a string and nothing else.
+            **
+
+            Output ONLY the reason, DON"T output anything else.
+
+            Reason:
+            """
+        )
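The generate_reason prompt above quotes the scoring rule as true positives plus true negatives divided by the total number of truth-table entries. The actual aggregation lives in deepeval/metrics/topic_adherence/topic_adherence.py (+355), which is not shown in this section, so the helper below is only a hypothetical sketch of that stated formula.

from typing import List

def topic_adherence_score(verdicts: List[str]) -> float:
    # Hypothetical illustration of the formula quoted in generate_reason:
    # score = (number of "TP" + number of "TN") / total number of verdicts.
    if not verdicts:
        return 0.0
    favorable = sum(1 for v in verdicts if v in ("TP", "TN"))
    return favorable / len(verdicts)

print(topic_adherence_score(["TP", "TP", "TN", "FP", "FN"]))  # 0.6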