ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
- wxo_agentic_evaluation/analyze_run.py +822 -344
- wxo_agentic_evaluation/arg_configs.py +39 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +29 -4
- wxo_agentic_evaluation/evaluation_package.py +197 -18
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +105 -108
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -0
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +64 -1
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +20 -2
- wxo_agentic_evaluation/quick_eval.py +23 -11
- wxo_agentic_evaluation/record_chat.py +18 -10
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
- wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +12 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +15 -5
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +140 -20
- wxo_agentic_evaluation/wxo_client.py +81 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"name": "function_selection_appropriateness",
|
|
4
|
+
"task_description": "You are an evaluator assessing whether a specific function is an appropriate next step in a user-assistant conversation.\n\nTASK: Determine if the selected function aligns with the user's current request and is justified by the conversation history and tool inventory.\nIMPORTANT: Evaluate only the function selection appropriateness. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\nEVALUATION CRITERIA:\n\n1. EVIDENCE SOURCES:\n- Use only the conversation history (system prompts, user and assistant turns, prior tool calls and outputs).\n- Use only the tool inventory, which includes function names and descriptions.\n- Do not infer capabilities, tool logic, or user goals beyond what is explicitly provided.\n\n2. DIALOGUE CONTEXT:\n- Conversations alternate between user and assistant.\n- Assistant turns may include multiple tool calls.\n- Tool calls may be:\n - INDEPENDENT: order does not matter\n - CHAINED: later calls depend on earlier ones\n- Consider the evolving context of the user's goal.\n\n3. APPROPRIATENESS DEFINITION:\nA function is appropriate if:\n- It addresses the user's stated or implied request.\n- It contributes meaningfully to the immediate goal.\n- It aligns with the documented function description.\n- It is not redundant with earlier tool calls (e.g. a duplication of the same tool call with the same parameters that is unnecessary).\n\nNote: More than one function may be appropriate. If this call satisfies a relevant part of the user's request, it is valid.\n\n4. WHEN INAPPROPRIATE:\nFlag the function as inappropriate only if:\n- A better function exists in the inventory\n- No valid function matches the request\n- The call is redundant or unnecessary\n\nProvide a corrected tool call if applicable, or use a placeholder such as:\n- {\"name\": \"no_function\"}\n- {\"name\": \"no_function_redundant\"}\n\n5. CONSERVATIVE JUDGMENT:\n- Do not assume tool behavior not stated in descriptions.\n- Do not reference tools or logic outside the inventory.\n- Do not invent user intent.\n- Do not penalize a valid call just because alternatives exist.\n\nOnly mark the function as inappropriate if clear, explicit evidence from the conversation or tool descriptions supports that judgment.\nOnly evaluate the function selection - e.g. if the function selection is appropriate, but the parameter values are incorrect, mark it as a correct function selection with a maximum score - you MUST ignore parameter values mistakes.",
|
|
5
|
+
"jsonschema": {
|
|
6
|
+
"title": "function_selection_appropriateness",
|
|
7
|
+
"description": "Assessment of whether the selected function is an appropriate next step based on the user's request and the provided tool descriptions.",
|
|
8
|
+
"type": "object",
|
|
9
|
+
"additionalProperties": false,
|
|
10
|
+
"properties": {
|
|
11
|
+
"evidence": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"description": "Cite exact quotes from the conversation history or tool inventory that support your rating. Do not paraphrase or interpret; include only directly quoted material that supports or challenges the function's appropriateness."
|
|
14
|
+
},
|
|
15
|
+
"explanation": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"description": "Explain why this function call does or does not match the user's immediate intent. Refer to specific conversation turns and tool descriptions. If inappropriate, state whether a better function exists, the function is unnecessary, or none in the inventory are suitable. Please keep your explanation as BRIEF and to the point as possible."
|
|
18
|
+
},
|
|
19
|
+
"output": {
|
|
20
|
+
"type": "integer",
|
|
21
|
+
"minimum": 1,
|
|
22
|
+
"maximum": 5,
|
|
23
|
+
"threshold_low": 3,
|
|
24
|
+
"threshold_high": 5,
|
|
25
|
+
"description": "Function Selection Appropriateness Score (1-5):\n\n- 5: Clearly appropriate and well-justified\n The selected function directly aligns with the user's request, is grounded in the conversation history, and matches the documented purpose in the tool inventory.\n Example:\n User: \"Can you check the weather in Rome tomorrow?\"\nSelected function: get_weather(city=\"Rome\", date=\"2025-08-06\")\n\n- 4: Reasonable choice with minor gaps\n The function is a plausible and helpful next step based on the conversation, though some details (e.g., goal inference or chaining rationale) may not be fully explicit. Example:\n User: \"What's my email address?\"\n Selected function: get_user_id()\n (A precondition for get_user_email(user_id=...) is this get_user_id())\n\n- 3: Ambiguous or weakly justified\n The function may relate to the user's goal, but there is not enough evidence in the conversation or function description to clearly support it.\n Example:\n User: \"I'm planning a trip to Tokyo.\"\n Selected function: get_weather(city=\"Tokyo\")\n (User didn't request weather, but it's a useful proactive step)\n\n- 2: Likely inappropriate\n The function does not clearly support the user's request or appears redundant given earlier tool calls.\n Example:\n User: \"Book me a restaurant in Berlin.\"\n Selected function: get_weather(city=\"Berlin\")\n\n- 1: Clearly inappropriate or unjustified\n The function is unrelated to the conversation, violates tool constraints, or invents a user goal.\n Example:\n User: \"Remind me to call Alex.\"\n Selected function: get_weather(city=\"New York\")"
|
|
26
|
+
},
|
|
27
|
+
"confidence": {
|
|
28
|
+
"type": "number",
|
|
29
|
+
"minimum": 0,
|
|
30
|
+
"maximum": 1,
|
|
31
|
+
"threshold_low": 0,
|
|
32
|
+
"threshold_high": 1,
|
|
33
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
34
|
+
},
|
|
35
|
+
"correction": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"description": "Required when there are issues. Leave empty ({}) when there are no issues. For invalid function calls, specify the issue(s), explanation, and corrected tool call or placeholder.",
|
|
38
|
+
"properties": {
|
|
39
|
+
"reason_types": {
|
|
40
|
+
"type": "array",
|
|
41
|
+
"description": "Categories of issues with this function selection. Use one or more of: IRRELEVANT_FUNCTION, BETTER_FUNCTION_EXISTS, MISSING_FUNCTION, REDUNDANT_CALL, OTHER.",
|
|
42
|
+
"items": {
|
|
43
|
+
"type": "string",
|
|
44
|
+
"enum": [
|
|
45
|
+
"IRRELEVANT_FUNCTION",
|
|
46
|
+
"BETTER_FUNCTION_EXISTS",
|
|
47
|
+
"MISSING_FUNCTION",
|
|
48
|
+
"REDUNDANT_CALL",
|
|
49
|
+
"OTHER"
|
|
50
|
+
]
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"reasons": {
|
|
54
|
+
"type": "string",
|
|
55
|
+
"description": "Brief justification for why this function is inappropriate. State which criteria it violates and why."
|
|
56
|
+
},
|
|
57
|
+
"corrected_function_name": {
|
|
58
|
+
"type": "string",
|
|
59
|
+
"description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary."
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"required": []
|
|
63
|
+
},
|
|
64
|
+
"actionable_recommendations": {
|
|
65
|
+
"type": "array",
|
|
66
|
+
"description": "Provide actionable recommendations to help the agent developer prevent function selection issues. Only include recommendations when the function selection was incorrect and the advice can genuinely help. Recommend only items that are important and can significantly improve the agent, such as:\n\n1. TOOL_DOCUMENTATION: Update the tool description to clarify requirements, usage details, connections to other tools (e.g., prerequisites or authentication steps that must occur before this tool call), edge cases, and other relevant information. Suggest the exact fix—addition, deletion, or adjustment.\n\n2. TOOL_USAGE_EXAMPLES: Add concrete usage examples to the tool description to help the agent understand when and how to use this function correctly. Include examples of user queries and, if relevant, prerequisite tool calls.\n\n3. INSTRUCTIONS_ADDITIONS: Add general instructions that can help the agent avoid such mistakes. Only include instructions that are essential and do not repeat existing ones. If an existing instruction (usually in the system prompt) can be improved, recommend updating it. Use this recommendation only if a clear, general instruction will significantly improve the agent in preventing this specific type of mistake.\n\n4. OTHER: If none of the above categories apply or if you have a unique recommendation, use this category. Be specific and actionable in your suggestions.",
|
|
67
|
+
"items": {
|
|
68
|
+
"type": "object",
|
|
69
|
+
"properties": {
|
|
70
|
+
"recommendation": {
|
|
71
|
+
"type": "string",
|
|
72
|
+
"description": "A specific, actionable recommendation to improve the agent's function selection process.",
|
|
73
|
+
"enum": [
|
|
74
|
+
"TOOL_DOCUMENTATION",
|
|
75
|
+
"TOOL_USAGE_EXAMPLES",
|
|
76
|
+
"INSTRUCTIONS_ADDITIONS",
|
|
77
|
+
"OTHER"
|
|
78
|
+
]
|
|
79
|
+
},
|
|
80
|
+
"tool_name": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"description": "The name of the tool to which the recommendation applies."
|
|
83
|
+
},
|
|
84
|
+
"details": {
|
|
85
|
+
"type": "string",
|
|
86
|
+
"description": "A detailed explanation of the recommendation, including what specific changes should be made, how they will improve function selection, and any relevant examples or best practices."
|
|
87
|
+
},
|
|
88
|
+
"quote": {
|
|
89
|
+
"type": "string",
|
|
90
|
+
"description": "The specific quote of the additions made to the tool documentation (description in the tool specification), examples (that will be added to the tool documentation), or instructions that will be added to the system prompt of the agent."
|
|
91
|
+
}
|
|
92
|
+
},
|
|
93
|
+
"required": [
|
|
94
|
+
"recommendation",
|
|
95
|
+
"tool_name",
|
|
96
|
+
"details",
|
|
97
|
+
"quote"
|
|
98
|
+
]
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
},
|
|
102
|
+
"required": [
|
|
103
|
+
"explanation",
|
|
104
|
+
"evidence",
|
|
105
|
+
"output",
|
|
106
|
+
"confidence",
|
|
107
|
+
"correction",
|
|
108
|
+
"actionable_recommendations"
|
|
109
|
+
]
|
|
110
|
+
},
|
|
111
|
+
"examples": [
|
|
112
|
+
{
|
|
113
|
+
"user_kwargs": {
|
|
114
|
+
"conversation_context": [
|
|
115
|
+
{
|
|
116
|
+
"role": "user",
|
|
117
|
+
"content": "What time is it in Tokyo?"
|
|
118
|
+
}
|
|
119
|
+
],
|
|
120
|
+
"tools_inventory": [
|
|
121
|
+
{
|
|
122
|
+
"name": "translate_text",
|
|
123
|
+
"description": "Translate text to a target language",
|
|
124
|
+
"parameters": {
|
|
125
|
+
"text": "string",
|
|
126
|
+
"target": "string"
|
|
127
|
+
}
|
|
128
|
+
},
|
|
129
|
+
{
|
|
130
|
+
"name": "get_time",
|
|
131
|
+
"description": "Retrieve current local time",
|
|
132
|
+
"parameters": {
|
|
133
|
+
"timezone": "string"
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
],
|
|
137
|
+
"selected_function": "translate_text",
|
|
138
|
+
"proposed_tool_call": {
|
|
139
|
+
"id": "call_001",
|
|
140
|
+
"type": "function",
|
|
141
|
+
"function": {
|
|
142
|
+
"name": "translate_text",
|
|
143
|
+
"arguments": "{ \"text\": \"What time is it in Tokyo?\" }"
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
},
|
|
147
|
+
"output": {
|
|
148
|
+
"evidence": "User request: \"What time is it in Tokyo?\"\nFunction called: translate_text\nBetter tool available: get_time",
|
|
149
|
+
"explanation": "The function translate_text is completely inappropriate for the user's request. The user clearly asked for current time information, which should be handled by get_time.",
|
|
150
|
+
"output": 1,
|
|
151
|
+
"confidence": 0.95,
|
|
152
|
+
"correction": {
|
|
153
|
+
"reason_types": [
|
|
154
|
+
"IRRELEVANT_FUNCTION",
|
|
155
|
+
"BETTER_FUNCTION_EXISTS"
|
|
156
|
+
],
|
|
157
|
+
"reasons": "translate_text translates text, not time. get_time directly answers the request.",
|
|
158
|
+
"corrected_function_name": "get_time"
|
|
159
|
+
},
|
|
160
|
+
"actionable_recommendations": [
|
|
161
|
+
{
|
|
162
|
+
"recommendation": "TOOL_DOCUMENTATION",
|
|
163
|
+
"tool_name": "get_time",
|
|
164
|
+
"details": "Update the get_time description to make it clear it handles city names as well as timezones.",
|
|
165
|
+
"quote": "Retrieve the current local time for a given timezone or city. Use this for all time-related queries."
|
|
166
|
+
},
|
|
167
|
+
{
|
|
168
|
+
"recommendation": "OTHER",
|
|
169
|
+
"tool_name": "get_time",
|
|
170
|
+
"details": "Consider renaming get_time to get_local_time to make its purpose clearer and reduce confusion with other potential time-related tools.",
|
|
171
|
+
"quote": "Rename get_time to get_local_time."
|
|
172
|
+
}
|
|
173
|
+
]
|
|
174
|
+
}
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"user_kwargs": {
|
|
178
|
+
"conversation_context": [
|
|
179
|
+
{
|
|
180
|
+
"role": "user",
|
|
181
|
+
"content": "What is the weather like in Paris?"
|
|
182
|
+
}
|
|
183
|
+
],
|
|
184
|
+
"tools_inventory": [
|
|
185
|
+
{
|
|
186
|
+
"name": "get_weather",
|
|
187
|
+
"description": "Retrieve current weather conditions for a specified city.",
|
|
188
|
+
"parameters": {
|
|
189
|
+
"city": "string"
|
|
190
|
+
}
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"name": "get_time",
|
|
194
|
+
"description": "Retrieve current local time for a specified timezone.",
|
|
195
|
+
"parameters": {
|
|
196
|
+
"timezone": "string"
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
],
|
|
200
|
+
"selected_function": "get_weather",
|
|
201
|
+
"proposed_tool_call": {
|
|
202
|
+
"id": "call_001",
|
|
203
|
+
"type": "function",
|
|
204
|
+
"function": {
|
|
205
|
+
"name": "get_weather",
|
|
206
|
+
"arguments": "{ \"city\": \"London\" }"
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
},
|
|
210
|
+
"output": {
|
|
211
|
+
"evidence": "User request for weather in Paris is clear and unambiguous.",
|
|
212
|
+
"explanation": "The function selection is appropriate as it directly addresses the user's request for weather information. Note: The mistake in the 'city' parameter is not relevant to the selection of the function.",
|
|
213
|
+
"output": 5,
|
|
214
|
+
"confidence": 0.95,
|
|
215
|
+
"correction": {},
|
|
216
|
+
"actionable_recommendations": []
|
|
217
|
+
}
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"user_kwargs": {
|
|
221
|
+
"conversation_context": [
|
|
222
|
+
{
|
|
223
|
+
"role": "user",
|
|
224
|
+
"content": "Can you help me understand what 'Bonjour' means?"
|
|
225
|
+
}
|
|
226
|
+
],
|
|
227
|
+
"tools_inventory": [
|
|
228
|
+
{
|
|
229
|
+
"name": "detect_language",
|
|
230
|
+
"description": "Detect the language of the input text",
|
|
231
|
+
"parameters": {
|
|
232
|
+
"text": "string"
|
|
233
|
+
}
|
|
234
|
+
},
|
|
235
|
+
{
|
|
236
|
+
"name": "translate_text",
|
|
237
|
+
"description": "Translate text to a target language. If the source language is not specified or unclear, use detect_language first.",
|
|
238
|
+
"parameters": {
|
|
239
|
+
"text": "string",
|
|
240
|
+
"target": "string"
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
],
|
|
244
|
+
"selected_function": "translate_text",
|
|
245
|
+
"proposed_tool_call": {
|
|
246
|
+
"id": "call_003",
|
|
247
|
+
"type": "function",
|
|
248
|
+
"function": {
|
|
249
|
+
"name": "translate_text",
|
|
250
|
+
"arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }"
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
},
|
|
254
|
+
"output": {
|
|
255
|
+
"evidence": "User request: \"Can you help me understand what 'Bonjour' means?\"\nFunction called: translate_text\nAmbiguity: user might want translation OR language detection.",
|
|
256
|
+
"explanation": "The assistant chose translate_text, which is plausible, but the user might also be asking for language identification. A better approach would be to clarify intent or first run detect_language.",
|
|
257
|
+
"output": 2,
|
|
258
|
+
"confidence": 0.94,
|
|
259
|
+
"correction": {
|
|
260
|
+
"reason_types": [
|
|
261
|
+
"BETTER_FUNCTION_EXISTS"
|
|
262
|
+
],
|
|
263
|
+
"reasons": "The user may want language identification, not just translation.",
|
|
264
|
+
"corrected_function_name": "detect_language"
|
|
265
|
+
},
|
|
266
|
+
"actionable_recommendations": [
|
|
267
|
+
{
|
|
268
|
+
"recommendation": "TOOL_DOCUMENTATION",
|
|
269
|
+
"tool_name": "translate_text",
|
|
270
|
+
"details": "Clarify in translate_text description that it requires knowing the source language, and if unknown, detect_language must be called first.",
|
|
271
|
+
"quote": "This tool needs to know the source language of the text; if you don't know the source language, you must use detect_language first."
|
|
272
|
+
},
|
|
273
|
+
{
|
|
274
|
+
"recommendation": "TOOL_USAGE_EXAMPLES",
|
|
275
|
+
"tool_name": "translate_text",
|
|
276
|
+
"details": "Add example to translate_text usage that demonstrates the need for detect_language.",
|
|
277
|
+
"quote": "Example: User: \"What does 'Bonjour' mean?\" -> Tool call: detect_language with text: \"Bonjour\" and then translate_text with text: \"Bonjour\" and target: \"en\"."
|
|
278
|
+
},
|
|
279
|
+
{
|
|
280
|
+
"recommendation": "INSTRUCTIONS_ADDITIONS",
|
|
281
|
+
"tool_name": "translate_text",
|
|
282
|
+
"details": "Add an instruction to clarify the use of detect_language in ambiguous cases.",
|
|
283
|
+
"quote": "When you encounter a phrase in an unknown language, you must first use detect_language to identify the language. In ambiguous user requests, always clarify with the user before proceeding with the task."
|
|
284
|
+
}
|
|
285
|
+
]
|
|
286
|
+
}
|
|
287
|
+
},
|
|
288
|
+
{
|
|
289
|
+
"user_kwargs": {
|
|
290
|
+
"conversation_context": [
|
|
291
|
+
{
|
|
292
|
+
"role": "user",
|
|
293
|
+
"content": "What is my email address?"
|
|
294
|
+
},
|
|
295
|
+
{
|
|
296
|
+
"role": "assistant",
|
|
297
|
+
"tool_call": {
|
|
298
|
+
"name": "get_user_email",
|
|
299
|
+
"arguments": {
|
|
300
|
+
"user_id": "your_user_id"
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
{
|
|
305
|
+
"role": "assistant",
|
|
306
|
+
"tool_response": {"error": "User ID not found"}
|
|
307
|
+
}
|
|
308
|
+
],
|
|
309
|
+
"tools_inventory": [
|
|
310
|
+
{
|
|
311
|
+
"name": "get_user_id",
|
|
312
|
+
"description": "Retrieve the user's unique identifier",
|
|
313
|
+
"parameters": {}
|
|
314
|
+
},
|
|
315
|
+
{
|
|
316
|
+
"name": "get_user_email",
|
|
317
|
+
"description": "Retrieve the user's email address using their user ID",
|
|
318
|
+
"parameters": {
|
|
319
|
+
"user_id": "string"
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
],
|
|
323
|
+
"selected_function": "get_user_email",
|
|
324
|
+
"proposed_tool_call": {
|
|
325
|
+
"name": "get_user_email",
|
|
326
|
+
"arguments": {
|
|
327
|
+
"user_id": "your_user_id"
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
},
|
|
331
|
+
"output": {
|
|
332
|
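+
"evidence": "Previous tool call: get_user_email with \"user_id\": \"your_user_id\"\nTool response: {\"error\": \"User ID not found\"}\nTool description for get_user_id: \"Retrieve the user's unique identifier\"",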
|
|
333
|
+
"explanation": "The function get_user_email is inappropriate because the previous attempt to call it failed due to a missing user ID. The correct next step is to call get_user_id to obtain the necessary identifier before retrying get_user_email.",
|
|
334
|
+
"output": 1,
|
|
335
|
+
"confidence": 0.96,
|
|
336
|
+
"correction": {
|
|
337
|
+
"reason_types": [
|
|
338
|
+
"REDUNDANT_CALL"
|
|
339
|
+
],
|
|
340
|
+
"reasons": "This is a redundant call to get_user_email without having the user_id and is a duplication of the previous incorrect call.",
|
|
341
|
+
"corrected_function_name": "no_function_redundant"
|
|
342
|
+
},
|
|
343
|
+
"actionable_recommendations": [
|
|
344
|
+
{
|
|
345
|
+
"recommendation": "INSTRUCTIONS_ADDITIONS",
|
|
346
|
+
"tool_name": "get_user_id",
|
|
347
|
+
"details": "Add instructions to always retrieve necessary identifiers before dependent calls, and do not repeat tool calls that were previously unsuccessful - instead, try to obtain the missing information first.",
|
|
348
|
+
"quote": "Always retrieve necessary values before dependent calls. Do not repeat tool calls that were previously unsuccessful - instead, try to obtain the missing information first from other tools or by asking the user clarifying questions."
|
|
349
|
+
}
|
|
350
|
+
]
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
]
|
|
354
|
+
},
|
|
355
|
+
{
|
|
356
|
+
"name": "agentic_constraints_satisfaction",
|
|
357
|
+
"task_description": "Evaluate whether the proposed tool call satisfies all agentic constraints required for execution.\n\nYour judgment must be based strictly on the provided conversation history and tools inventory. Do not infer or assume any constraints or workflows not explicitly documented.\nIMPORTANT: Evaluate only the agentic constraints satisfaction. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\n---\n\n### 1. Explicit Prerequisites\n- Confirm that required prior actions (e.g., authentication, file upload) have been completed.\n- Accept only direct evidence such as: \"Auth token acquired\" or \"File uploaded successfully.\"\n\n---\n\n### 2. Tool Sequencing\n- Enforce sequencing only when the tool description explicitly states a required order.\n- If no such requirement is documented, assume calls can be executed independently.\n\n---\n\n### 3. Call Independence\n- Calls that are not explicitly marked as dependent may be treated as parallel or order-independent.\n- Do not impose workflow constraints unless specified.\n\n---\n\n### 4. Redundancy\n- Flag the tool call as redundant only if the same function with the same parameters was already executed and no contextual change justifies repetition.\n\n---\n\n### Final Rule\nYour evaluation must be conservative: flag constraint violations only when there is clear, explicit evidence. Do not assume or invent requirements not documented in the conversation or tool descriptions.",
|
|
358
|
+
"jsonschema": {
|
|
359
|
+
"title": "agentic_constraints_satisfaction",
|
|
360
|
+
"type": "object",
|
|
361
|
+
"additionalProperties": false,
|
|
362
|
+
"properties": {
|
|
363
|
+
"evidence": {
|
|
364
|
+
"type": "string",
|
|
365
|
+
"description": "Quote SPECIFIC lines from the conversation history or tool specifications that demonstrate whether prerequisites are satisfied or violated. Include exact text showing completions or absences of required steps."
|
|
366
|
+
},
|
|
367
|
+
"explanation": {
|
|
368
|
+
"type": "string",
|
|
369
|
+
"description": "Provide a BRIEF explanation of whether all prerequisite constraints are satisfied for this tool call. Address explicit dependencies, required sequence, parameter relationships, and potential redundancy. Base your explanation SOLELY on evidence from the provided conversation and tool specifications. Please keep your explanation as BRIEF and to the point as possible."
|
|
370
|
+
},
|
|
371
|
+
"output": {
|
|
372
|
+
"type": "integer",
|
|
373
|
+
"minimum": 1,
|
|
374
|
+
"maximum": 5,
|
|
375
|
+
"threshold_low": 3,
|
|
376
|
+
"threshold_high": 5,
|
|
377
|
+
"description": "Agentic Constraints Satisfaction Score (1-5):\n\n- 5: Fully compliant with all known constraints\n The tool call satisfies all documented prerequisites, sequencing rules, and redundancy checks based on explicit evidence in the conversation and tool inventory.\n Example:\n User: \"Here is my file.\"\n Assistant: (File upload confirmed)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 4: Likely compliant, with minor uncertainty\n All major constraints are satisfied, but some minor dependency or precondition may not be fully confirmed, though it is likely met based on adjacent context.\n Example:\n User: \"I've uploaded the file already.\"\n (No explicit upload confirmation, but no contradiction)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 3: Ambiguous or undetermined\n It is unclear whether all required conditions or dependencies are met. The call might be valid, but confirmation is lacking.\n Example:\n User: \"Can you summarize the document?\"\n (No evidence of file upload or confirmation)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 2: Likely violates one or more constraints\n At least one key constraint (such as missing authentication, a required prior step, or improper sequencing) appears unsatisfied.\n Example:\n User: \"Please delete my account.\"\n (No authentication step documented)\n -> Tool call: delete_account(user_id=\"456\")\n\n- 1: Clearly violates agentic constraints\n The tool call ignores an explicit precondition, repeats a redundant action, or breaks an ordering rule described in the tool documentation.\n Example:\n Assistant already called: upload_file(name=\"report.pdf\")\n -> Immediately repeats: upload_file(name=\"report.pdf\") without any contextual change"
|
|
378
|
+
},
|
|
379
|
+
"confidence": {
|
|
380
|
+
"type": "number",
|
|
381
|
+
"minimum": 0,
|
|
382
|
+
"maximum": 1,
|
|
383
|
+
"threshold_low": 0,
|
|
384
|
+
"threshold_high": 1,
|
|
385
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
386
|
+
},
|
|
387
|
+
"correction": {
|
|
388
|
+
"type": "object",
|
|
389
|
+
"description": "For satisfied constraints: Provide an empty object {}. For constraint violations: Provide an object with reason_types, reasons, and either prerequisite_tool_calls (for missing prerequisites) or corrected_function (for parameter issues).",
|
|
390
|
+
"properties": {
|
|
391
|
+
"reason_types": {
|
|
392
|
+
"type": "array",
|
|
393
|
+
"description": "Types of constraint violations identified",
|
|
394
|
+
"items": {
|
|
395
|
+
"type": "string",
|
|
396
|
+
"enum": [
|
|
397
|
+
"MISSING_PREREQUISITE",
|
|
398
|
+
"REDUNDANT_CALL",
|
|
399
|
+
"OTHER"
|
|
400
|
+
]
|
|
401
|
+
}
|
|
402
|
+
},
|
|
403
|
+
"reasons": {
|
|
404
|
+
"type": "string",
|
|
405
|
+
"description": "Brief explanation of the specific constraint violations"
|
|
406
|
+
},
|
|
407
|
+
"prerequisite_tool_calls": {
|
|
408
|
+
"type": "array",
|
|
409
|
+
"description": "List of prerequisite tool calls that must be executed before this one",
|
|
410
|
+
"items": {
|
|
411
|
+
"type": "object",
|
|
412
|
+
"properties": {
|
|
413
|
+
"name": {
|
|
414
|
+
"type": "string",
|
|
415
|
+
"description": "Name of the prerequisite function"
|
|
416
|
+
}
|
|
417
|
+
},
|
|
418
|
+
"required": []
|
|
419
|
+
}
|
|
420
|
+
},
|
|
421
|
+
"corrected_function_name": {
|
|
422
|
+
"type": "string",
|
|
423
|
+
"description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary."
|
|
424
|
+
}
|
|
425
|
+
},
|
|
426
|
+
"required": []
|
|
427
|
+
},
|
|
428
|
+
"actionable_recommendations": {
|
|
429
|
+
"type": "array",
|
|
430
|
+
"description": "Provide specific, actionable recommendations for the agent developer to address identified prerequisite or constraint satisfaction issues. Only provide recommendations when problems are detected. Include multiple detailed suggestions where appropriate, such as:\n\n1. PREREQUISITE_TRACKING: Suggest specific mechanisms for tracking and verifying completion of prerequisite steps, such as 'Implement a prerequisite tracking system that maintains a state object {\"authentication\": true, \"language_detected\": false, etc.} and checks this before making dependent calls'.\n\n2. TOOL_DEPENDENCY_DOCUMENTATION: Recommend explicit documentation of tool dependencies in function descriptions, like 'Update the translate_text function description to state: \"REQUIRES: Prior successful call to detect_language for the same text input. Example workflow: 1) detect_language -> 2) translate_text\"'.\n\n3. TOOL_CALL_SEQUENCING: Propose flowchart or decision tree implementations for proper tool sequencing, like 'Create a predefined workflow for file operations that enforces the sequence: 1) check_permissions -> 2) upload_file -> 3) process_file'.\n\n4. SYSTEM_PROMPT_ADDITIONS: Suggest specific instructions in the system prompt to enforce dependencies, such as 'Add to system prompt: \"Always perform language detection before translation. Never skip prerequisite steps, even when the source language seems obvious\"'.\n\n5. TOOL_SPECIFICATION_IMPROVEMENTS: Recommend enhancements to tool descriptions that highlight dependencies, like 'Add a \"Prerequisites\" section to each tool's description that lists required prior successful calls and their parameters'.\n\n6. ERROR_HANDLING_IMPROVEMENTS: Suggest specific error recovery procedures, such as 'If authentication fails, implement an automatic retry with explicit user notification: \"I'm having trouble authenticating. Let me try again with [authentication method]\"'.\n\n7. REDUNDANCY_DETECTION: Recommend logic for preventing redundant calls, like 'Implement a result caching mechanism that stores outputs from previous calls and checks if an identical call has already been made in the current session'.\n\n8. CONTEXTUAL_AWARENESS: Suggest improvements for maintaining awareness of conversation state, such as 'Create a context tracker that monitors which entities (files, queries, data objects) have been successfully processed and are available for subsequent operations'.\n\n9. MULTI_STEP_OPERATION_DOCUMENTATION: Recommend clearer documentation for operations requiring multiple steps, like 'Create a developer guide section on \"Chained Operations\" that details common sequences of tool calls with sample code and conversation examples'.\n\n10. AGENT_ARCHITECTURE_IMPROVEMENTS: Suggest architectural changes to better support dependencies, such as 'Implement a prerequisite validation middleware layer that intercepts all tool calls and verifies that required prior calls exist in the conversation history before proceeding'.\n\n11. OTHER: If none of the above categories fit or if you have a unique recommendation, use this category. Be specific and actionable in your suggestions.\n\nEnsure recommendations are thorough, specific, and directly address the root cause of the constraint satisfaction issue.",
|
|
431
|
+
"items": {
|
|
432
|
+
"type": "object",
|
|
433
|
+
"properties": {
|
|
434
|
+
"recommendation": {
|
|
435
|
+
"type": "string",
|
|
436
|
+
"description": "A specific, actionable recommendation to improve the agent's ability to satisfy agentic constraints in future tool calls.",
|
|
437
|
+
"enum": [
|
|
438
|
+
"PREREQUISITE_TRACKING",
|
|
439
|
+
"TOOL_DEPENDENCY_DOCUMENTATION",
|
|
440
|
+
"TOOL_CALL_SEQUENCING",
|
|
441
|
+
"SYSTEM_PROMPT_ADDITIONS",
|
|
442
|
+
"TOOL_SPECIFICATION_IMPROVEMENTS",
|
|
443
|
+
"ERROR_HANDLING_IMPROVEMENTS",
|
|
444
|
+
"REDUNDANCY_DETECTION",
|
|
445
|
+
"CONTEXTUAL_AWARENESS",
|
|
446
|
+
"MULTI_STEP_OPERATION_DOCUMENTATION",
|
|
447
|
+
"AGENT_ARCHITECTURE_IMPROVEMENTS",
|
|
448
|
+
"OTHER"
|
|
449
|
+
]
|
|
450
|
+
},
|
|
451
|
+
"tool_name": {
|
|
452
|
+
"type": "string",
|
|
453
|
+
"description": "The name of the tool to which the recommendation applies."
|
|
454
|
+
},
|
|
455
|
+
"details": {
|
|
456
|
+
"type": "string",
|
|
457
|
+
"description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve the agent's ability to satisfy agentic constraints, and any relevant examples or best practices."
|
|
458
|
+
},
|
|
459
|
+
"quote": {
|
|
460
|
+
"type": "string",
|
|
461
|
+
"description": "The specific quote of the additions made to the documentation, examples, or instructions."
|
|
462
|
+
}
|
|
463
|
+
},
|
|
464
|
+
"required": [
|
|
465
|
+
"recommendation",
|
|
466
|
+
"tool_name",
|
|
467
|
+
"details",
|
|
468
|
+
"quote"
|
|
469
|
+
]
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
},
|
|
473
|
+
"required": [
|
|
474
|
+
"explanation",
|
|
475
|
+
"evidence",
|
|
476
|
+
"output",
|
|
477
|
+
"confidence",
|
|
478
|
+
"correction",
|
|
479
|
+
"actionable_recommendations"
|
|
480
|
+
]
|
|
481
|
+
},
|
|
482
|
+
"examples": [
|
|
483
|
+
{
|
|
484
|
+
"user_kwargs": {
|
|
485
|
+
"conversation_context": [
|
|
486
|
+
{
|
|
487
|
+
"role": "assistant",
|
|
488
|
+
"content": "Auth token acquired for user 42."
|
|
489
|
+
}
|
|
490
|
+
],
|
|
491
|
+
"tools_inventory": [
|
|
492
|
+
{
|
|
493
|
+
"name": "get_order_history",
|
|
494
|
+
"description": "Retrieve past orders",
|
|
495
|
+
"parameters": {
|
|
496
|
+
"user_id": "integer"
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
],
|
|
500
|
+
"selected_function": "get_order_history",
|
|
501
|
+
"proposed_tool_call": {
|
|
502
|
+
"id": "call_002",
|
|
503
|
+
"type": "function",
|
|
504
|
+
"function": {
|
|
505
|
+
"name": "get_order_history",
|
|
506
|
+
"arguments": "{ \"user_id\": 42 }"
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
},
|
|
510
|
+
"output": {
|
|
511
|
+
"evidence": "Assistant message explicitly states: \"Auth token acquired for user 42.\" The tool specification for get_order_history does not mention any additional prerequisites beyond authentication.",
|
|
512
|
+
"explanation": "This tool call satisfies all agentic constraints. The prerequisite of authentication is explicitly confirmed in the conversation history with \"Auth token acquired for user 42.\" No additional sequencing requirements or redundant calls are indicated.",
|
|
513
|
+
"output": 5,
|
|
514
|
+
"confidence": 0.95,
|
|
515
|
+
"correction": {},
|
|
516
|
+
"actionable_recommendations": []
|
|
517
|
+
}
|
|
518
|
+
},
|
|
519
|
+
{
|
|
520
|
+
"user_kwargs": {
|
|
521
|
+
"conversation_context": [
|
|
522
|
+
{
|
|
523
|
+
"role": "user",
|
|
524
|
+
"content": "Translate 'Hola' to English."
|
|
525
|
+
}
|
|
526
|
+
],
|
|
527
|
+
"tools_inventory": [
|
|
528
|
+
{
|
|
529
|
+
"name": "translate_text",
|
|
530
|
+
"description": "Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.",
|
|
531
|
+
"parameters": {
|
|
532
|
+
"text": "string",
|
|
533
|
+
"target": "string"
|
|
534
|
+
}
|
|
535
|
+
},
|
|
536
|
+
{
|
|
537
|
+
"name": "detect_language",
|
|
538
|
+
"description": "Detect the language of the input text",
|
|
539
|
+
"parameters": {
|
|
540
|
+
"text": "string"
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
],
|
|
544
|
+
"selected_function": "translate_text",
|
|
545
|
+
"proposed_tool_call": {
|
|
546
|
+
"id": "call_002",
|
|
547
|
+
"type": "function",
|
|
548
|
+
"function": {
|
|
549
|
+
"name": "translate_text",
|
|
550
|
+
"arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }"
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
},
|
|
554
|
+
"output": {
|
|
555
|
+
"evidence": "Tool description explicitly states: \"translate_text: Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.\" The conversation history contains no detect_language call or language detection result.",
|
|
556
|
+
"explanation": "This tool call violates an agentic constraint clearly stated in the tool description. The translate_text function explicitly requires a prior successful call to detect_language for the input text, as stated in its description. The conversation history shows no evidence of any detect_language call being made before this translate_text call.",
|
|
557
|
+
"output": 1,
|
|
558
|
+
"confidence": 0.95,
|
|
559
|
+
"correction": {
|
|
560
|
+
"reason_types": [
|
|
561
|
+
"MISSING_PREREQUISITE"
|
|
562
|
+
],
|
|
563
|
+
"reasons": "Missing required prerequisite detect_language call as explicitly stated in translate_text description",
|
|
564
|
+
"prerequisite_tool_calls": [
|
|
565
|
+
{
|
|
566
|
+
"name": "detect_language"
|
|
567
|
+
}
|
|
568
|
+
]
|
|
569
|
+
},
|
|
570
|
+
"actionable_recommendations": [
|
|
571
|
+
{
|
|
572
|
+
"recommendation": "PREREQUISITE_TRACKING",
|
|
573
|
+
"tool_name": "translate_text",
|
|
574
|
+
"details": "Implement a prerequisite tracking system that maintains a state object {\"language_detected\": false} and checks this before making dependent calls like translate_text.",
|
|
575
|
+
"quote": "Implement a prerequisite tracking system that maintains a state object {\"language_detected\": false} and checks this before making dependent calls."
|
|
576
|
+
},
|
|
577
|
+
{
|
|
578
|
+
"recommendation": "TOOL_DEPENDENCY_DOCUMENTATION",
|
|
579
|
+
"tool_name": "translate_text",
|
|
580
|
+
"details": "Update the translate_text function description to state: \"REQUIRES: Prior successful call to detect_language for the same text input. Example workflow: 1) detect_language -> 2) translate_text\".",
|
|
581
|
+
"quote": "Update the translate_text function description to state: \"REQUIRES: Prior successful call to detect_language for the same text input. Example workflow: 1) detect_language -> 2) translate_text\"."
|
|
582
|
+
},
|
|
583
|
+
{
|
|
584
|
+
"recommendation": "SYSTEM_PROMPT_ADDITIONS",
|
|
585
|
+
"tool_name": "translate_text",
|
|
586
|
+
"details": "Add to system prompt an explicit requirement for language detection before translation.",
|
|
587
|
+
"quote": "Add to system prompt: \"Always perform language detection before translation. Never skip prerequisite steps, even when the source language seems obvious.\""
|
|
588
|
+
},
|
|
589
|
+
{
|
|
590
|
+
"recommendation": "TOOL_CALL_SEQUENCING",
|
|
591
|
+
"tool_name": "translate_text",
|
|
592
|
+
"details": "Propose flowchart or decision tree implementations for proper tool sequencing, like 'Create a predefined workflow for translation that enforces the sequence: 1) detect_language -> 2) translate_text'.",
|
|
593
|
+
"quote": "Propose flowchart or decision tree implementations for proper tool sequencing, like 'Create a predefined workflow for translation that enforces the sequence: 1) detect_language -> 2) translate_text'."
|
|
594
|
+
}
|
|
595
|
+
]
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
]
|
|
599
|
+
}
|
|
600
|
+
]
|