deepeval-3.6.6-py3-none-any.whl → deepeval-3.6.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,10 +1,15 @@
-from typing import List, Dict
+from typing import List, Dict, Optional, Union
 
 from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
+    trimAndLoadJson,
+    initialize_model,
+    print_tools_called,
 )
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -12,6 +17,9 @@ from deepeval.test_case import (
     ToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate
+from deepeval.metrics.tool_correctness.schema import ToolSelectionScore
 
 
 class ToolCorrectnessMetric(BaseMetric):
@@ -24,15 +32,21 @@ class ToolCorrectnessMetric(BaseMetric):
 
     def __init__(
         self,
+        available_tools: List[ToolCall] = None,
         threshold: float = 0.5,
        evaluation_params: List[ToolCallParams] = [],
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         include_reason: bool = True,
+        async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         should_exact_match: bool = False,
         should_consider_ordering: bool = False,
     ):
+        self.available_tools = available_tools
         self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.async_mode = async_mode
         self.include_reason = include_reason
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
@@ -45,18 +59,145 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
         self.test_case = test_case
+        self.evaluation_cost = 0 if self.using_native_model else None
+
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                self.tools_called: List[ToolCall] = test_case.tools_called
+                self.expected_tools: List[ToolCall] = test_case.expected_tools
+                tool_calling_score = self._calculate_score()
+                if self.available_tools:
+                    tool_selection_score = self._get_tool_selection_score(
+                        test_case.input,
+                        test_case.tools_called,
+                        self.available_tools,
+                    )
+                else:
+                    tool_selection_score = tool_selection_score = (
+                        ToolSelectionScore(
+                            score=1,
+                            reason="No available tools were provided to assess tool selection criteria",
+                        )
+                    )
+                score = min(tool_calling_score, tool_selection_score.score)
+                self.score = (
+                    0 if self.strict_mode and score < self.threshold else score
+                )
+                tool_calling_reason = self._generate_reason()
+                self.reason = self._construct_final_reason(
+                    tool_calling_reason, tool_selection_score.reason
+                )
+                self.success = self.score >= self.threshold
+
+                expected_tools_formatted = (
+                    "Expected Tools:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.expected_tools
+                    )
+                    + "\n]"
+                )
+                tools_called_formatted = (
+                    "Tools Called:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.tools_called
+                    )
+                    + "\n]"
+                )
+                available_tools_formatted = (
+                    (
+                        "Available Tools:\n[\n"
+                        + ",\n".join(
+                            self.indent_multiline_string(
+                                repr(tool_call), indent_level=4
+                            )
+                            for tool_call in self.available_tools
+                        )
+                        + "\n]"
+                    )
+                    if self.available_tools
+                    else "Available Tools: []"
+                )
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"{expected_tools_formatted}",
+                        f"{tools_called_formatted}",
+                        f"{available_tools_formatted}",
+                        f"Tool Selection Score: {tool_selection_score.score}",
+                        f"Tool Selection Reason: {tool_selection_score.reason}",
+                        f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             self.tools_called: List[ToolCall] = test_case.tools_called
             self.expected_tools: List[ToolCall] = test_case.expected_tools
-            self.score = self._calculate_score()
-            self.reason = self._generate_reason()
+            tool_calling_score = self._calculate_score()
+            if self.available_tools:
+                tool_selection_score = await self._a_get_tool_selection_score(
+                    test_case.input,
+                    test_case.tools_called,
+                    self.available_tools,
+                )
+            else:
+                tool_selection_score = ToolSelectionScore(
+                    score=1,
+                    reason="No available tools were provided to assess tool selection criteria",
+                )
+            score = min(tool_calling_score, tool_selection_score.score)
+            self.score = (
+                0 if self.strict_mode and score < self.threshold else score
+            )
+            tool_calling_reason = self._generate_reason()
+            self.reason = self._construct_final_reason(
+                tool_calling_reason, tool_selection_score.reason
+            )
             self.success = self.score >= self.threshold
+
             expected_tools_formatted = (
                 "Expected Tools:\n[\n"
                 + ",\n".join(
@@ -77,25 +218,37 @@ class ToolCorrectnessMetric(BaseMetric):
                 )
                 + "\n]"
             )
-            steps = [
-                f"{expected_tools_formatted}",
-                f"{tools_called_formatted}",
-            ]
-            steps.append(f"Score: {self.score}\nReason: {self.reason}")
-            self.verbose_logs = construct_verbose_logs(self, steps=steps)
-            return self.score
+            available_tools_formatted = (
+                (
+                    "Available Tools:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.available_tools
+                    )
+                    + "\n]"
+                )
+                if self.available_tools
+                else "Available Tools: []"
+            )
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"{expected_tools_formatted}",
+                    f"{tools_called_formatted}",
+                    f"{available_tools_formatted}",
+                    f"Tool Selection Score: {tool_selection_score.score}",
+                    f"Tool Selection Reason: {tool_selection_score.reason}",
+                    f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+                ],
+            )
 
-    async def a_measure(
-        self,
-        test_case: LLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-    ) -> float:
-        return self.measure(
-            test_case,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+            return self.score
 
     ##################################################
     ### Tool Correctness (Tool) ######################
@@ -146,10 +299,69 @@
         else:
             return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
 
+    def _construct_final_reason(
+        self,
+        tool_calling_reason,
+        tool_selection_reason,
+    ):
+        final_reason = "[\n"
+        final_reason += "\t Tool Calling Reason: " + tool_calling_reason + "\n"
+        final_reason += (
+            "\t Tool Selection Reason: " + tool_selection_reason + "\n"
+        )
+        final_reason += "]\n"
+        return final_reason
+
     ##################################################
     ### Score Helper Functions #######################
     ##################################################
 
+    def _get_tool_selection_score(
+        self, user_input, tools_called, available_tools
+    ):
+        tools_called_formatted = print_tools_called(tools_called)
+        available_tools_formatted = print_tools_called(available_tools)
+        prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+            user_input, tools_called_formatted, available_tools_formatted
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res = self.model.generate(prompt, schema=ToolSelectionScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    async def _a_get_tool_selection_score(
+        self, user_input, tools_called, available_tools
+    ):
+        tools_called_formatted = print_tools_called(tools_called)
+        available_tools_formatted = print_tools_called(available_tools)
+        prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+            user_input, tools_called_formatted, available_tools_formatted
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ToolSelectionScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res = await self.model.a_generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
     # Calculate score
     def _calculate_score(self):
         if self.should_exact_match:
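
The new constructor parameters above (available_tools, model, async_mode) plus the LLM-judged tool selection score change how the metric is typically invoked. The following is a minimal usage sketch, not taken from this release's documentation; the test-case values and judge model string are illustrative only.

from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.metrics import ToolCorrectnessMetric

# Illustrative test case; the field values are made up for this sketch.
test_case = LLMTestCase(
    input="What's the weather in Paris?",
    actual_output="It is 18°C and sunny in Paris.",
    tools_called=[ToolCall(name="get_weather")],
    expected_tools=[ToolCall(name="get_weather")],
)

metric = ToolCorrectnessMetric(
    # New in 3.6.8: the candidate tool set used for the LLM-judged selection score.
    available_tools=[ToolCall(name="get_weather"), ToolCall(name="web_search")],
    model="gpt-4o-mini",  # assumed judge model string; any supported model should work
    async_mode=True,      # new in 3.6.8: measure() now delegates to a_measure()
    threshold=0.5,
)
metric.measure(test_case)
print(metric.score, metric.reason)

When available_tools is omitted, the diff shows the selection score defaulting to 1, so the final score (the min of both sub-scores) reduces to the original expected-vs-called comparison.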
deepeval/metrics/tool_use/__init__.py
@@ -0,0 +1 @@
+from .tool_use import ToolUseMetric
deepeval/metrics/tool_use/schema.py
@@ -0,0 +1,19 @@
+from pydantic import BaseModel
+
+
+class UserInputAndTools(BaseModel):
+    user_messages: str
+    assistant_messages: str
+    tools_called: str
+    available_tools: str
+    tools_used: bool
+
+
+class ToolSelectionScore(BaseModel):
+    score: float
+    reason: str
+
+
+class ArgumentCorrectnessScore(BaseModel):
+    score: float
+    reason: str
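
These schemas are the structured outputs the judge model is asked to produce; the tool_correctness diff above shows the same pattern of falling back to trimAndLoadJson and rebuilding the model from a dict. A small illustrative sketch follows (the JSON string is made up for the example).

import json
from deepeval.metrics.tool_use.schema import ToolSelectionScore

# Made-up judge output, used only to illustrate parsing into the schema.
raw = '{"score": 0.75, "reason": "search_flights matched the task; get_weather was unnecessary."}'
parsed = ToolSelectionScore(**json.loads(raw))
print(parsed.score, parsed.reason)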
deepeval/metrics/tool_use/template.py
@@ -0,0 +1,220 @@
+import textwrap
+import json
+
+
+class ToolUseTemplate:
+
+    @staticmethod
+    def get_tool_selection_score(
+        user_input: str,
+        assistant_messages: str,
+        tools_called: str,
+        available_tools: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Selection Quality** of an AI agent.
+
+            OBJECTIVE
+            Evaluate whether the agent **selected the most appropriate tools** for completing the user's task, given a list of available tools.
+
+            This metric focuses **only** on which tools were chosen — **not** how they were used or whether they succeeded.
+
+            EVALUATION RULES
+
+            1. Relevance
+            - Each tool used must directly support the user's stated goal or a clear sub-task derived from it.
+            - Tools unrelated to the goal lower the score sharply.
+
+            2. Appropriateness
+            - The chosen tools must match their described purpose.
+            - If a more suitable tool existed and was ignored, score ≤ 0.5.
+
+            3. Necessity
+            - Every tool call must be justified by clear need.
+            - Redundant or speculative tool use (e.g., calling multiple tools that overlap) reduces the score.
+
+            4. Strictness
+            - When uncertain if a tool was required or correctly chosen, assume it was **not** appropriate.
+            - Only perfect alignment between the task and tool choice earns a high score.
+
+            SCORING GUIDE:
+
+            - **1.0** → Every tool used was necessary and perfectly matched to the task; no better alternative ignored.
+            - **0.75** → Tool selection was mostly correct, with only minor redundancy or a small omission.
+            - **0.5** → Mixed quality; some appropriate selections, but others questionable or missing.
+            - **0.25** → Poor selection; major mismatches or misuse of available tools.
+            - **0.0** → Tool selection irrelevant, random, or unjustified.
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with:
+
+            {{
+            "score": float between 0.0 and 1.0,
+            "reason": "1-3 factual sentences explaining which tools were appropriate or inappropriate for the task, referencing specific tool names."
+            }}
+
+            USER INPUT:
+            {user_input}
+
+            ASSISTANT MESSAGES:
+            {assistant_messages}
+
+            TOOLS CALLED:
+            {tools_called}
+
+            AVAILABLE TOOLS:
+            {available_tools}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_argument_correctness_score(
+        user_input: str,
+        assistant_messages: str,
+        tools_called: str,
+        available_tools: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Argument Quality** of an AI agent.
+
+            OBJECTIVE:
+
+            Evaluate whether the **arguments and parameters** passed to each tool were:
+            - Correctly structured and complete.
+            - Contextually appropriate for the user's goal.
+            - Compatible with each tool's intended purpose.
+
+            This metric focuses **only** on argument-level correctness and relevance — not which tools were chosen.
+
+            EVALUATION RULES
+
+            1. Relevance
+            - Each argument must align with the task and the tool's documented input fields.
+            - Unrelated, empty, or default arguments reduce the score sharply.
+
+            2. **Completeness**
+            - All required parameters must be provided.
+            - Missing or malformed arguments (e.g., wrong data types or incomplete context) lower the score.
+
+            3. **Specificity**
+            - Arguments should reflect task-specific values, not generic placeholders.
+            - Overly vague or default arguments are penalized.
+
+            4. **Justification**
+            - Each argument must make sense in context.
+            - If it doesn't clearly derive from the user's request, assume it's incorrect.
+
+            5. **Strict Bias**
+            - When uncertain whether arguments fit the tool or task, assume they were **incorrect**.
+
+            SCORING GUIDE:
+
+            - **1.0** → All arguments are accurate, specific, and fully aligned with both the task and tool requirements.
+            - **0.75** → Mostly correct; minor omissions or small mismatches.
+            - **0.5** → Partial correctness; some valid parameters, but key ones missing or off-target.
+            - **0.25** → Poor argument quality; several invalid or irrelevant fields.
+            - **0.0** → Arguments nonsensical, generic, or unrelated to task/tool intent.
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with:
+            {{
+            "score": float between 0.0 and 1.0,
+            "reason": "1-3 sentences explaining argument alignment or issues, referencing specific parameter names or values when possible."
+            }}
+
+            ---
+
+            USER INPUT:
+            {user_input}
+
+            ASSISTANT MESSAGES:
+            {assistant_messages}
+
+            TOOLS CALLED (with arguments):
+            {tools_called}
+
+            AVAILABLE TOOLS:
+            {available_tools}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_tool_selection_final_reason(
+        all_scores_and_reasons: str, final_score: float, threshold: float
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator summarizing the outcome of a **Tool Selection** evaluation.
+
+            You are given:
+            - A list of **tool selection sub-scores and reasons**, each describing how appropriately the agent chose tools for its task.
+            - The **final aggregated score** across all sub-evaluations.
+            - A **threshold** representing the minimum passing score.
+
+            Your task is to write a **single concise explanation (1-3 sentences)** that captures:
+            - Why the agent **passed or failed** based on tool choice quality.
+            - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
+            - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
+
+            RULES:
+            - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
+            - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
+            - Avoid vague or subjective language such as “pretty good” or “reasonable”.
+            - Do **not** reference argument-level details; this summary is only for tool choice quality.
+            - The result must read as a self-contained, factual justification.
+
+            FORMAT:
+            Return only a single plain-text string. Do **not** include JSON or other formatting.
+
+            All Tool Selection Sub-Scores and Reasons:
+            {all_scores_and_reasons}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
+
+    @staticmethod
+    def get_tool_argument_final_reason(
+        all_scores_and_reasons: str, final_score: float, threshold: float
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator summarizing the outcome of a **Tool Argument Quality** evaluation.
+
+            You are given:
+            - A list of **argument-level sub-scores and reasons**, each evaluating whether the arguments passed to tools were accurate, complete, and contextually appropriate.
+            - The **final aggregated score** across all argument evaluations.
+            - A **threshold** representing the minimum passing score.
+
+            Your task is to write a **single concise explanation (1-3 sentences)** that clearly states:
+            - Why the agent **passed or failed** in its use of tool arguments.
+            - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
+            - Whether the agent met or fell short of the threshold and why.
+
+            RULES:
+            - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
+            - Reference specific argument-level problems or successes where helpful.
+            - Keep language objective and factual; avoid speculation or vague phrasing.
+            - The summary must stand alone as a clear explanation of the final result.
+
+            FORMAT:
+            Return only a single plain-text string. Do **not** include JSON or any extra formatting.
+
+            All Tool Argument Sub-Scores and Reasons:
+            {all_scores_and_reasons}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
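
Each ToolUseTemplate method only renders a prompt string for the judge model. A quick sketch of calling the first template; the argument strings are placeholders, since in practice they are presumably pre-formatted from the test case by ToolUseMetric.

from deepeval.metrics.tool_use.template import ToolUseTemplate

# Placeholder inputs for illustration only.
prompt = ToolUseTemplate.get_tool_selection_score(
    user_input="Book me a flight to Tokyo next Friday.",
    assistant_messages="I searched for flights and booked the cheapest option.",
    tools_called='[{"name": "search_flights"}, {"name": "book_flight"}]',
    available_tools='[{"name": "search_flights"}, {"name": "book_flight"}, {"name": "get_weather"}]',
)
print(prompt)  # send to the judge LLM, then parse the JSON reply into ToolSelectionScore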