ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (35) hide show
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +35 -31
  3. wxo_agentic_evaluation/analyze_run.py +805 -344
  4. wxo_agentic_evaluation/arg_configs.py +10 -1
  5. wxo_agentic_evaluation/description_quality_checker.py +11 -2
  6. wxo_agentic_evaluation/evaluation_package.py +8 -3
  7. wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
  8. wxo_agentic_evaluation/external_agent/types.py +3 -9
  9. wxo_agentic_evaluation/inference_backend.py +46 -79
  10. wxo_agentic_evaluation/llm_matching.py +14 -2
  11. wxo_agentic_evaluation/main.py +1 -1
  12. wxo_agentic_evaluation/metrics/__init__.py +1 -0
  13. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  14. wxo_agentic_evaluation/metrics/metrics.py +43 -1
  15. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  16. wxo_agentic_evaluation/prompt/template_render.py +4 -2
  17. wxo_agentic_evaluation/quick_eval.py +7 -9
  18. wxo_agentic_evaluation/record_chat.py +22 -29
  19. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
  20. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
  21. wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
  22. wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  26. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
  27. wxo_agentic_evaluation/resource_map.py +3 -1
  28. wxo_agentic_evaluation/service_instance.py +7 -0
  29. wxo_agentic_evaluation/type.py +1 -1
  30. wxo_agentic_evaluation/utils/__init__.py +3 -0
  31. wxo_agentic_evaluation/utils/parsers.py +71 -0
  32. wxo_agentic_evaluation/utils/utils.py +131 -16
  33. wxo_agentic_evaluation/wxo_client.py +80 -0
  34. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
  35. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,783 @@
1
+ [
2
+ {
3
+ "name": "general_hallucination_check",
4
+ "task_description": "Evaluate whether each parameter value in the function call is correct and **directly supported** by the provided conversation history and adheres to the tool specifications. Your assessment must be based **strictly on explicit evidence** from these sources and correctly formatted based on the API specifications. Do **not** assume or hallucinate any information that is not clearly documented and provided.\n\n---\n\n#### 1. Grounding Sources\n\nA parameter value is considered grounded if it originates from one of the following:\n\n- An explicit user message in the conversation \n- An assistant message that the user confirmed or acknowledged \n- The output of a previous tool call \n- A documented default value in the API specification \n\n---\n\n#### 2. Parameter Value Classification\n\nEach parameter value must be labeled using one of the following categories:\n\n- **CORRECT** \n The value is explicitly mentioned, clearly implied in the conversation, or matches a documented default.\n\n- **MISSING_INFORMATION** \n The value is underspecified or incomplete given the current context (e.g., '05/06' when a full date is required).\n\n- **FORMAT_ERROR** \n The value is conceptually correct but incorrectly formatted (e.g., wrong units, casing, or structure) based on the API specifications and descriptions.\n\n- **CONTRADICTORY_VALUES** \n The value violates documented constraints or logical relationships between parameters (e.g., `start_time` should be < `end_time`).\n\n- **DEFAULT_ISSUE** \n The default value of the parameter is not the correct value based on the conversation history.\n\n---\n\n#### 3. Handling Default Values\n\n- Optional parameters may only use default values that are **explicitly documented** in the API specification. \n- Defaults that are assumed or undocumented count as hallucinations. \n- If no value is provided and no default exists, the parameter should be omitted.\n\n---\n\n#### 4. 
Acceptable Transformations\n\nSome transformations are permitted, but only when grounded:\n\n- **Synonyms** (e.g., \"NYC\" -> \"New York\") are allowed only when normalized in the conversation. \n- **Unit conversions** are valid only if the conversation explicitly mentions the conversion. \n- **Format changes** (e.g., date or time reformatting) are acceptable only if acknowledged by the assistant or supported by the specification.\n\n---\n\n#### 5. Multi-Call Context\n\nIf the tool call appears in a sequence:\n\n- You may use outputs from earlier tool calls to justify parameter values. \n- Consider dependencies and ordering between calls when assessing grounding. \n\n---\n\n#### Conservative Judgment Principle\n\nWhen in doubt, err on the side of caution. If grounding cannot be clearly established, treat the parameter as incorrect.\n",
5
+ "jsonschema": {
6
+ "title": "general_hallucination_check",
7
+ "description": "Assessment of tool call grounding accuracy, following the rubric defined above.",
8
+ "type": "object",
9
+ "additionalProperties": false,
10
+ "properties": {
11
+ "evidence": {
12
+ "type": "string",
13
+ "description": "Provide EXACT quotes from user messages, assistant responses, prior tool outputs, or API specifications that demonstrate whether each parameter value is grounded. Do not paraphrase or interpret; use only direct evidence."
14
+ },
15
+ "explanation": {
16
+ "type": "string",
17
+ "description": "For EACH parameter value, explain BRIEFLY its grounding status by citing specific evidence from the conversation history or tool specification. Address whether the value is correctly grounded, hallucinated, needs more information, has format errors, or contradicts other parameters. Include direct quotes that support or contradict the value. Please keep your explanation as BRIEF and to the point as possible."
18
+ },
19
+ "output": {
20
+ "type": "integer",
21
+ "minimum": 1,
22
+ "maximum": 5,
23
+ "threshold_low": 4,
24
+ "threshold_high": 5,
25
+ "description": "Reflection Quality Score (1-5):\n\n- 5: All values are correct - fully grounded and well-formatted.\n All parameter values are explicitly grounded in the conversation or tool output, and exactly match the expected types, formats, and units.\n Example:\n User: \"Book a hotel in Paris for two adults from August 10th to 12th.\"\n \u2192 Tool call includes: location: \"Paris\", guests: 2, check_in: \"2025-08-10\", check_out: \"2025-08-12\"\n\n- 4: All values are correct based on the conversation history and tool specifications - can be logically inferred (can be implicitly grounded).\n Some parameter values are not directly mentioned but can be logically inferred from the context. No evidence of hallucination.\n Example:\n User: \"I want to book a hotel in Tokyo for me and my wife.\"\n \u2192 Tool call includes: location: \"Tokyo\", guests: 2\n\n- 3: One or more values are probably incorrect or ungrounded - ambiguous or undetermined\n It's unclear whether some parameter values are grounded. They might be inferred, hallucinated, or drawn from partial context, or some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing.\n Example:\n User: \"Book a hotel in Rome.\"\n \u2192 Tool call includes: location: \"Rome\", guests: 2\n\n- 2: One or more values are probably incorrect - likely hallucinated or incorrect format.\n Several parameter values do not align with the conversation context and are likely hallucinated, or have format errors.\n Example:\n User: \"Find a hotel.\"\n \u2192 Tool call includes: location: \"New York\", guests: 4\n\n- 1: One or more values are clearly incorrect - certainly hallucinated or fabricated.\n Some parameter values are clearly invented, irrelevant, contradict known context, or do not follow the API specifications.\n Example:\n User: \"Remind me to call John.\"\n \u2192 Tool call includes: location: \"Barcelona\", guests: 6, check_in: \"2025-07-01\""
26
+ },
27
+ "confidence": {
28
+ "type": "number",
29
+ "minimum": 0,
30
+ "maximum": 1,
31
+ "threshold_low": 0,
32
+ "threshold_high": 1,
33
+ "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
34
+ },
35
+ "correction": {
36
+ "type": "object",
37
+ "description": "Corrections to the tool call. If there are no issues, provide an empty object {}. If there are issues, include parameter_issues and optionally a corrected tool_call if a full reconstruction is possible.",
38
+ "properties": {
39
+ "parameter_issues": {
40
+ "type": "array",
41
+ "description": "List of parameters with issues. Leave empty if no issues were found.",
42
+ "items": {
43
+ "type": "object",
44
+ "properties": {
45
+ "parameter_name": {
46
+ "type": "string",
47
+ "description": "Name of the parameter with an issue."
48
+ },
49
+ "reason_types": {
50
+ "type": "array",
51
+ "description": "List of issue categories affecting this parameter.",
52
+ "items": {
53
+ "type": "string",
54
+ "enum": [
55
+ "MISSING_INFORMATION",
56
+ "FORMAT_ERROR",
57
+ "CONTRADICTORY_VALUES",
58
+ "DEFAULT_ISSUE",
59
+ "OTHER"
60
+ ]
61
+ }
62
+ },
63
+ "reasons": {
64
+ "type": "string",
65
+ "description": "Brief explanation of the issue(s) affecting this parameter."
66
+ },
67
+ "corrected_value": {
68
+ "type": "object",
69
+ "description": "Object containing the parameter name as key and the corrected value as value. If additional information is required, use 'need_more_information' as the key and provide a clarification question for the user as the value. Use 'need_more_tool_calls' as the key if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.",
70
+ "additionalProperties": true
71
+ }
72
+ },
73
+ "required": []
74
+ }
75
+ },
76
+ "tool_call": {
77
+ "type": "object",
78
+ "description": "Optional complete corrected tool call. Include only if all corrected values are available and no values require further user input.",
79
+ "properties": {
80
+ "name": {
81
+ "type": "string",
82
+ "description": "Name of the function to call."
83
+ },
84
+ "arguments": {
85
+ "type": "object",
86
+ "description": "Full set of corrected arguments for the function call, including both modified and retained values.",
87
+ "additionalProperties": true
88
+ }
89
+ },
90
+ "required": []
91
+ }
92
+ },
93
+ "required": []
94
+ },
95
+ "actionable_recommendations": {
96
+ "type": "array",
97
+ "description": "Provide actionable recommendations to help the agent developer prevent parameter hallucinations and similar mistakes in future tool calls. Include multiple detailed suggestions only when there is a mistake and the recommendation can genuinely help. Recommend only items that are important and can significantly improve the agent, such as:\n\n1. PARAMETER_DOCUMENTATION: Update the API parameter specification to clarify parameter requirements, formats, units, edge cases, and other relevant information. Suggest the exact fix—addition, deletion, or adjustment.\n2. PARAMETER_EXAMPLES: Add examples to the parameter documentation that will help prevent similar mistakes in the future.\n3. INSTRUCTIONS_ADDITIONS: Add general instructions that can help the agent handle such mistakes - DO NOT include the specific parameter name, but give a general instruction for this specific mistake. Only include instructions that are essential for the agent and do not repeat existing ones. If an existing instruction (usually in the system prompt) can be improved, recommend updating it. Use this recommendation only if a clear, general instruction will significantly improve the agent in this kind of specific mistake.\n4. OTHER: Any other relevant recommendations that can help improve the agent's performance in future tool calls.",
98
+ "items": {
99
+ "type": "object",
100
+ "properties": {
101
+ "recommendation": {
102
+ "type": "string",
103
+ "description": "A specific, actionable recommendation to reduce parameter hallucinations.",
104
+ "enum": [
105
+ "PARAMETER_DOCUMENTATION",
106
+ "PARAMETER_EXAMPLES",
107
+ "INSTRUCTIONS_ADDITIONS",
108
+ "OTHER"
109
+ ]
110
+ },
111
+ "parameter_name": {
112
+ "type": "string",
113
+ "description": "The name of the parameter to which the recommendation applies."
114
+ },
115
+ "details": {
116
+ "type": "string",
117
+ "description": "An explanation of the recommendation, including what specific changes should be made, how they will prevent parameter hallucinations, and any relevant examples or best practices."
118
+ },
119
+ "quote": {
120
+ "type": "string",
121
+ "description": "The specific quote of the additions made to the parameter documentation (description in the tool specification), examples (that will be added to the tool parameter documentation), or instructions that will be added to the system prompt of the agent."
122
+ }
123
+ },
124
+ "required": [
125
+ "recommendation",
126
+ "parameter_name",
127
+ "details",
128
+ "quote"
129
+ ]
130
+ }
131
+ }
132
+ },
133
+ "required": [
134
+ "evidence",
135
+ "explanation",
136
+ "output",
137
+ "confidence",
138
+ "correction",
139
+ "actionable_recommendations"
140
+ ]
141
+ },
142
+ "examples": [
143
+ {
144
+ "user_kwargs": {
145
+ "conversation_context": [
146
+ {
147
+ "role": "system",
148
+ "content": "You are a helpful assistant."
149
+ },
150
+ {
151
+ "role": "user",
152
+ "content": "Can you show me my profile details?"
153
+ },
154
+ {
155
+ "role": "assistant",
156
+ "content": "Logged in as user 55; auth token acquired."
157
+ },
158
+ {
159
+ "role": "assistant",
160
+ "content": "Ready to fetch your profile."
161
+ }
162
+ ],
163
+ "tool_inventory": [
164
+ {
165
+ "type": "function",
166
+ "function": {
167
+ "name": "get_user_profile",
168
+ "description": "Retrieves a user's profile details",
169
+ "parameters": {
170
+ "type": "object",
171
+ "properties": {
172
+ "user_id": {
173
+ "type": "integer",
174
+ "description": "The ID of the user whose profile is being retrieved"
175
+ }
176
+ },
177
+ "required": [
178
+ "user_id"
179
+ ]
180
+ }
181
+ }
182
+ }
183
+ ],
184
+ "tool_call": {
185
+ "id": "call_001",
186
+ "type": "function",
187
+ "function": {
188
+ "name": "get_user_profile",
189
+ "arguments": "{ \"user_id\": 55 }"
190
+ }
191
+ }
192
+ },
193
+ "output": {
194
+ "evidence": "Assistant message: \"Logged in as user 55; auth token acquired.\"",
195
+ "explanation": "The parameter value user_id=55 is directly grounded in the conversation history. The assistant's message explicitly states \"Logged in as user 55\", providing the exact user_id that is used in the tool call. This is a clear example of proper grounding from conversation context.",
196
+ "output": 5,
197
+ "confidence": 0.99,
198
+ "correction": {},
199
+ "actionable_recommendations": []
200
+ }
201
+ },
202
+ {
203
+ "user_kwargs": {
204
+ "conversation_context": [
205
+ {
206
+ "role": "system",
207
+ "content": "You are a helpful assistant."
208
+ },
209
+ {
210
+ "role": "user",
211
+ "content": "Can you show me my profile details?"
212
+ }
213
+ ],
214
+ "tool_inventory": [
215
+ {
216
+ "type": "function",
217
+ "function": {
218
+ "name": "get_user_profile",
219
+ "description": "Retrieves a user's profile details",
220
+ "parameters": {
221
+ "type": "object",
222
+ "properties": {
223
+ "user_id": {
224
+ "type": "integer",
225
+ "description": "The ID of the user whose profile is being retrieved"
226
+ }
227
+ },
228
+ "required": ["user_id"]
229
+ }
230
+ }
231
+ }
232
+ ],
233
+ "tool_call": {
234
+ "id": "call_001",
235
+ "type": "function",
236
+ "function": {
237
+ "name": "get_user_profile",
238
+ "arguments": "{ \"user_id\": 12345 }"
239
+ }
240
+ }
241
+ },
242
+ "output": {
243
+ "evidence": "User never mentioned an ID. Tool spec requires user_id but does not define a default. The assistant fabricated '12345'.",
244
+ "explanation": "The assistant hallucinated a user_id value with no grounding in conversation or tool specification.",
245
+ "output": 1,
246
+ "confidence": 1,
247
+ "correction": {
248
+ "parameter_issues": [
249
+ {
250
+ "parameter_name": "user_id",
251
+ "reason_types": ["MISSING_INFORMATION"],
252
+ "reasons": "No user ID in context or conversation. The assistant invented '12345'.",
253
+ "corrected_value": {
254
+ "need_more_information": "Please provide your user ID so I can fetch the profile."
255
+ }
256
+ }
257
+ ]
258
+ },
259
+ "actionable_recommendations": [
260
+ {
261
+ "recommendation": "PARAMETER_DOCUMENTATION",
262
+ "parameter_name": "user_id",
263
+ "details": "Add to the documentation of the user_id parameter to clarify its required nature.",
264
+ "quote": "user_id must be explicitly provided by the user or derived from authentication tool response. No default should ever be assumed."
265
+ },
266
+ {
267
+ "recommendation": "INSTRUCTIONS_ADDITIONS",
268
+ "parameter_name": "user_id",
269
+ "details": "Add instructions to never invent parameter values, and if needed, ask the user for clarification.",
270
+ "quote": "Never invent parameter values. If missing, explicitly ask the user to provide it."
271
+ }
272
+ ]
273
+ }
274
+ },
275
+ {
276
+ "user_kwargs": {
277
+ "conversation_context": [
278
+ {
279
+ "role": "system",
280
+ "content": "You are a travel booking assistant."
281
+ },
282
+ {
283
+ "role": "user",
284
+ "content": "Book me a flight to Paris on September 15th."
285
+ }
286
+ ],
287
+ "tool_inventory": [
288
+ {
289
+ "type": "function",
290
+ "function": {
291
+ "name": "book_flight",
292
+ "description": "Books a flight to a specified destination on a given date.",
293
+ "parameters": {
294
+ "type": "object",
295
+ "properties": {
296
+ "destination": {
297
+ "type": "string",
298
+ "description": "The destination city or airport for the flight."
299
+ },
300
+ "departure_date": {
301
+ "type": "string",
302
+ "description": "The date of departure, in YYYY-MM-DD format."
303
+ },
304
+ "seat_class": {
305
+ "type": "string",
306
+ "description": "Seat class: economy, business, or first.",
307
+ "enum": ["economy", "business", "first"]
308
+ }
309
+ },
310
+ "required": ["destination", "departure_date", "seat_class"]
311
+ }
312
+ }
313
+ }
314
+ ],
315
+ "tool_call": {
316
+ "id": "call_002",
317
+ "type": "function",
318
+ "function": {
319
+ "name": "book_flight",
320
+ "arguments": "{ \"destination\": \"Paris\", \"departure_date\": \"2025-09-15\", \"seat_class\": \"first\" }"
321
+ }
322
+ }
323
+ },
324
+ "output": {
325
+ "evidence": "User asked for flight to Paris on 2025-09-15 but never specified seat_class. Assistant hallucinated 'first'.",
326
+ "explanation": "The destination and date are grounded. The seat_class parameter is hallucinated because the user never mentioned it.",
327
+ "output": 1,
328
+ "confidence": 0.92,
329
+ "correction": {
330
+ "parameter_issues": [
331
+ {
332
+ "parameter_name": "seat_class",
333
+ "reason_types": ["MISSING_INFORMATION"],
334
+ "reasons": "Seat class was never provided by the user or tool defaults.",
335
+ "corrected_value": {
336
+ "need_more_information": "Which seat class would you like: economy, business, or first?"
337
+ }
338
+ }
339
+ ]
340
+ },
341
+ "actionable_recommendations": [
342
+ {
343
+ "recommendation": "PARAMETER_DOCUMENTATION",
344
+ "parameter_name": "seat_class",
345
+ "details": "Add to the documentation of seat_class to clarify that it is a required parameter.",
346
+ "quote": "Required. Must be explicitly provided by user. No default assumed."
347
+ },
348
+ {
349
+ "recommendation": "INSTRUCTIONS_ADDITIONS",
350
+ "parameter_name": "seat_class",
351
+ "details": "Add instructions to always confirm required parameters with the user if not provided.",
352
+ "quote": "Always confirm all required details with the user before calling tools."
353
+ }
354
+ ]
355
+ }
356
+ },
357
+ {
358
+ "user_kwargs": {
359
+ "conversation_context": [
360
+ {
361
+ "role": "system",
362
+ "content": "You are a flight booking assistant."
363
+ },
364
+ {
365
+ "role": "user",
366
+ "content": "I want to fly to Rome on 15/09/2025 in economy class."
367
+ }
368
+ ],
369
+ "tool_inventory": [
370
+ {
371
+ "type": "function",
372
+ "function": {
373
+ "name": "book_flight",
374
+ "description": "Books a flight to a specified destination on a given date.",
375
+ "parameters": {
376
+ "type": "object",
377
+ "properties": {
378
+ "destination": { "type": "string" },
379
+ "departure_date": { "type": "string", "description": "Date in YYYY-MM-DD format" },
380
+ "seat_class": {
381
+ "type": "string",
382
+ "enum": ["economy", "business", "first"]
383
+ }
384
+ },
385
+ "required": ["destination", "departure_date", "seat_class"]
386
+ }
387
+ }
388
+ }
389
+ ],
390
+ "tool_call": {
391
+ "id": "call_003",
392
+ "type": "function",
393
+ "function": {
394
+ "name": "book_flight",
395
+ "arguments": "{ \"destination\": \"Rome\", \"departure_date\": \"15/09/2025\", \"seat_class\": \"economy\" }"
396
+ }
397
+ }
398
+ },
399
+ "output": {
400
+ "evidence": "User provided date as '15/09/2025'. Tool spec requires YYYY-MM-DD. Assistant forwarded incorrect format directly.",
401
+ "explanation": "All parameters are grounded, but departure_date is in the wrong format.",
402
+ "output": 2,
403
+ "confidence": 0.95,
404
+ "correction": {
405
+ "parameter_issues": [
406
+ {
407
+ "parameter_name": "departure_date",
408
+ "reason_types": ["FORMAT_ERROR"],
409
+ "reasons": "Expected 'YYYY-MM-DD'. Received '15/09/2025'.",
410
+ "corrected_value": { "departure_date": "2025-09-15" }
411
+ }
412
+ ]
413
+ },
414
+ "actionable_recommendations": [
415
+ {
416
+ "recommendation": "PARAMETER_DOCUMENTATION",
417
+ "parameter_name": "departure_date",
418
+ "details": "Clarify the required date format in the parameter documentation.",
419
+ "quote": "YYYY-MM-DD only. No other formats accepted. If a different format is given in the conversation, reformat it to the required format before calling the tool."
420
+ },
421
+ {
422
+ "recommendation": "PARAMETER_EXAMPLES",
423
+ "parameter_name": "departure_date",
424
+ "details": "Add an example of a departure date that was reformatted correctly from 'DD/MM/YYYY' to 'YYYY-MM-DD'.",
425
+ "quote": "Example: given a user input with a departure date of '15/09/2025', it should be reformatted to '2025-09-15' before calling the tool."
426
+ },
427
+ {
428
+ "recommendation": "INSTRUCTIONS_ADDITIONS",
429
+ "parameter_name": "departure_date",
430
+ "details": "Add instructions to always reformat dates to the required format before calling the tool.",
431
+ "quote": "Make sure you adhere to the required formats and units as specified in the API documentation, and if a different format is given in the conversation, reformat it to the required format before calling the tool."
432
+ }
433
+ ]
434
+ }
435
+ }
436
+ ]
437
+ },
438
+ {
439
+ "name": "general_value_format_alignment",
440
+ "task_description": "Evaluate whether ALL parameter values in the function call conform to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter values. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ALL parameter values in the tool call to ensure they meet the exact type, format, and unit requirements. Consider the collective format compliance of the entire function call.\n---\n\n### 1. Data Type Compliance\n- Check that each value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints for each parameter, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. 
Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility between parameter values in the same call\n- Check that parameter relationships don't violate constraints\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: All parameter values exactly match required types, formats, and units\n- **4 - Minor Deviation**: All values are fundamentally correct but some have minor formatting issues that are still likely to be accepted by the API or parser unless strict validation is enforced\n- **3 - Moderate Deviation**: Some parameter types or formats are partially incorrect; may cause issues\n- **2 - Major Deviation**: Multiple parameters have significant type and format violations; likely to fail\n- **1 - Complete Mismatch**: One or more parameter values are entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and values seem plausible, prefer a higher score (4-5)\n- If the spec is clear and values deviate, assign a lower score (1-3)\n- Consider the cumulative impact of all format issues across parameters\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.",
441
+ "jsonschema": {
442
+ "title": "general_value_format_alignment",
443
+ "description": "Assessment of all parameter values' compliance with required type, format, and unit specifications, based on the rubric above.",
444
+ "type": "object",
445
+ "additionalProperties": false,
446
+ "properties": {
447
+ "evidence": {
448
+ "type": "string",
449
+ "description": "Quote the specification's type/format definitions for each parameter and include the actual parameter values provided. Cite EXACT text from the tool specification that supports your judgment."
450
+ },
451
+ "explanation": {
452
+ "type": "string",
453
+ "description": "For each parameter value, explain why it conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong and how it affects the overall tool call."
454
+ },
455
+ "output": {
456
+ "type": "integer",
457
+ "minimum": 1,
458
+ "maximum": 5,
459
+ "threshold_low": 4,
460
+ "threshold_high": 5,
461
+ "description": "An integer from 1 to 5 indicating how well all parameter values conform to the required types, formats, and units as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: All parameter values exactly match the expected types, formats, and units. Example: all dates in 'YYYY-MM-DD', all numbers as integers, all strings properly formatted.\n\n4 - Minor Deviation: All values are fundamentally correct but some have small formatting issues (e.g., missing leading zeros) that are still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: Some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing. Example: mix of correct and incorrectly formatted dates.\n\n2 - Major Deviation: Multiple parameter values significantly violate the expected type, format, or unit and are very likely to fail. Example: wrong data types, completely wrong formats.\n\n1 - Complete Mismatch: One or more parameter values are entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'; expected date, value is random text."
462
+ },
463
+ "confidence": {
464
+ "type": "number",
465
+ "minimum": 0,
466
+ "maximum": 1,
467
+ "threshold_low": 0,
468
+ "threshold_high": 1,
469
+ "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
470
+ },
471
+ "correction": {
472
+ "type": "object",
473
+ "description": "If output >= 4, return {}. If output <= 3, provide parameter_issues and a corrected tool_call.",
474
+ "properties": {
475
+ "parameter_issues": {
476
+ "type": "array",
477
+ "description": "List of parameters with format issues. Leave empty if no issues were found.",
478
+ "items": {
479
+ "type": "object",
480
+ "properties": {
481
+ "parameter_name": {
482
+ "type": "string",
483
+ "description": "Name of the parameter with a format issue."
484
+ },
485
+ "reason_types": {
486
+ "type": "array",
487
+ "description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.",
488
+ "items": {
489
+ "type": "string",
490
+ "enum": [
491
+ "TYPE_ERROR",
492
+ "FORMAT_ERROR",
493
+ "UNIT_ERROR",
494
+ "PATTERN_ERROR",
495
+ "CONSISTENCY_ERROR",
496
+ "OTHER"
497
+ ]
498
+ }
499
+ },
500
+ "reasons": {
501
+ "type": "string",
502
+ "description": "Short explanation of the specific issue(s) with the parameter's format, type, or unit."
503
+ },
504
+ "corrected_value": {
505
+ "type": "object",
506
+ "description": "An object containing the corrected parameter value in the form: { \"<parameter_name>\": <corrected_value> }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.",
507
+ "additionalProperties": true
508
+ }
509
+ },
510
+ "required": [
511
+ "parameter_name",
512
+ "reason_types",
513
+ "reasons",
514
+ "corrected_value"
515
+ ]
516
+ }
517
+ },
518
+ "tool_call": {
519
+ "type": "object",
520
+ "description": "Complete corrected tool call with all format issues fixed, including both modified and retained parameter values. Must be included if a full reconstruction is possible.",
521
+ "properties": {
522
+ "name": {
523
+ "type": "string",
524
+ "description": "Name of the function to call."
525
+ },
526
+ "arguments": {
527
+ "type": "object",
528
+ "description": "Set of corrected arguments for the function call, with all format issues resolved.",
529
+ "additionalProperties": true
530
+ }
531
+ },
532
+ "required": [
533
+ "name",
534
+ "arguments"
535
+ ]
536
+ }
537
+ },
538
+ "required": []
539
+ },
540
+ "actionable_recommendations": {
541
+ "type": "array",
542
+ "description": "Provide actionable suggestions for avoiding format errors in future tool calls, only if there are issues. Address root causes like unclear format specs, missing examples, or lack of validation. Include suggestions such as:\n\n1. PARAMETER_FORMAT_DOCUMENTATION: Clarify or expand parameter format expectations in the API documentation.\n2. PARAMETER_EXAMPLES: Add example values with correct formatting in the parameter documentation.\n3. PARAMETER_VALIDATION: Introduce regex or rule-based format validation in the API to catch errors before tool calls.\n4. PARAMETER_CONVERSION: Add conversion helpers for user input to ensure correct formatting.\n5. UNIT_STANDARDS: Standardize unit expectations and flag missing or incorrect units in the tools implementation.\n6. PARAMETER_NAMING_CONVENTIONS: Suggest clearer parameter names that indicate expected formats or units.\n7. SYSTEM_PROMPT_ADDITIONS: Recommend specific additions to the system prompt that would help the agent avoid similar format issues in the future.\n8. PRE_CALL_REFLECTION_STRATEGIES: Propose strategies for the agent to reflect on parameter values before making tool calls, such as checking format compliance.\n9. PARAMETER_SOURCE_TRACKING: Suggest implementing a mechanism to track where each parameter value originated from to improve transparency and debugging.\n10. CLARIFICATION_PROMPTS: Recommend adding clarification prompts when parameter values are ambiguous or missing expected formats.\n11. OTHER: Any other specific recommendations that would help prevent similar format issues in the future.",
543
+ "items": {
544
+ "type": "object",
545
+ "properties": {
546
+ "recommendation": {
547
+ "type": "string",
548
+ "description": "A specific, actionable recommendation to improve the agent's parameter formatting process.",
549
+ "enum": [
550
+ "PARAMETER_FORMAT_DOCUMENTATION",
551
+ "PARAMETER_EXAMPLES",
552
+ "PARAMETER_VALIDATION",
553
+ "PARAMETER_CONVERSION",
554
+ "UNIT_STANDARDS",
555
+ "PARAMETER_NAMING_CONVENTIONS",
556
+ "SYSTEM_PROMPT_ADDITIONS",
557
+ "PRE_CALL_REFLECTION_STRATEGIES",
558
+ "PARAMETER_SOURCE_TRACKING",
559
+ "CLARIFICATION_PROMPTS",
560
+ "OTHER"
561
+ ]
562
+ },
563
+ "parameter_name": {
564
+ "type": "string",
565
+ "description": "The name of the parameter to which the recommendation applies."
566
+ },
567
+ "details": {
568
+ "type": "string",
569
+ "description": "Detailed explanation of the recommendation, including what specific changes should be made, how they will improve parameter formatting, and any relevant examples or best practices."
570
+ },
571
+ "quote": {
572
+ "type": "string",
573
+ "description": "The specific quote of the additions made to the documentation, examples, or instructions."
574
+ }
575
+ },
576
+ "required": [
577
+ "recommendation",
578
+ "parameter_name",
579
+ "details",
580
+ "quote"
581
+ ]
582
+ }
583
+ }
584
+ },
585
+ "required": [
586
+ "evidence",
587
+ "explanation",
588
+ "output",
589
+ "confidence",
590
+ "correction",
591
+ "actionable_recommendations"
592
+ ]
593
+ },
594
+ "examples": [
595
+ {
596
+ "user_kwargs": {
597
+ "conversation_context": [
598
+ {
599
+ "role": "user",
600
+ "content": "Book a flight to Paris on 2025-08-15 for 2 adults in economy class."
601
+ }
602
+ ],
603
+ "tool_inventory": [
604
+ {
605
+ "type": "function",
606
+ "function": {
607
+ "name": "book_flight",
608
+ "description": "Books a flight with specified parameters",
609
+ "parameters": {
610
+ "type": "object",
611
+ "properties": {
612
+ "destination": {
613
+ "type": "string",
614
+ "description": "Destination city"
615
+ },
616
+ "departure_date": {
617
+ "type": "string",
618
+ "format": "date",
619
+ "description": "Date in YYYY-MM-DD format"
620
+ },
621
+ "passengers": {
622
+ "type": "integer",
623
+ "description": "Number of passengers"
624
+ },
625
+ "class": {
626
+ "type": "string",
627
+ "enum": [
628
+ "economy",
629
+ "business",
630
+ "first"
631
+ ],
632
+ "description": "Flight class"
633
+ }
634
+ },
635
+ "required": [
636
+ "destination",
637
+ "departure_date",
638
+ "passengers",
639
+ "class"
640
+ ]
641
+ }
642
+ }
643
+ }
644
+ ],
645
+ "tool_call": {
646
+ "id": "call_001",
647
+ "type": "function",
648
+ "function": {
649
+ "name": "book_flight",
650
+ "arguments": "{\"destination\": \"Paris\", \"departure_date\": \"2025-08-15\", \"passengers\": 2, \"class\": \"economy\"}"
651
+ }
652
+ }
653
+ },
654
+ "output": {
655
+ "evidence": "Tool spec requires: destination (string), departure_date (string, YYYY-MM-DD format), passengers (integer), class (enum: economy/business/first). Values provided: 'Paris' (string), '2025-08-15' (string, YYYY-MM-DD), 2 (integer), 'economy' (valid enum).",
656
+ "explanation": "All parameter values perfectly conform to their specifications: destination is a proper string, departure_date follows the exact YYYY-MM-DD format, passengers is provided as an integer (not string), and class matches one of the required enum values. No format issues detected.",
657
+ "output": 5,
658
+ "confidence": 0.98,
659
+ "correction": {},
660
+ "actionable_recommendations": []
661
+ }
662
+ },
663
+ {
664
+ "user_kwargs": {
665
+ "conversation_context": [
666
+ {
667
+ "role": "user",
668
+ "content": "Set a timer for 5 minutes and schedule a meeting for tomorrow at 3 PM."
669
+ }
670
+ ],
671
+ "tool_inventory": [
672
+ {
673
+ "type": "function",
674
+ "function": {
675
+ "name": "set_timer_and_meeting",
676
+ "description": "Sets a timer and schedules a meeting",
677
+ "parameters": {
678
+ "type": "object",
679
+ "properties": {
680
+ "timer_duration": {
681
+ "type": "string",
682
+ "description": "Duration in format 'number unit' (e.g., '5 minutes')"
683
+ },
684
+ "meeting_date": {
685
+ "type": "string",
686
+ "description": "Date in YYYY-MM-DD format"
687
+ },
688
+ "meeting_time": {
689
+ "type": "string",
690
+ "description": "Time in HH:MM 24-hour format"
691
+ }
692
+ },
693
+ "required": [
694
+ "timer_duration",
695
+ "meeting_date",
696
+ "meeting_time"
697
+ ]
698
+ }
699
+ }
700
+ }
701
+ ],
702
+ "tool_call": {
703
+ "id": "call_002",
704
+ "type": "function",
705
+ "function": {
706
+ "name": "set_timer_and_meeting",
707
+ "arguments": "{\"timer_duration\": \"300\", \"meeting_date\": \"tomorrow\", \"meeting_time\": \"3 PM\"}"
708
+ }
709
+ }
710
+ },
711
+ "output": {
712
+ "evidence": "Tool spec requires: timer_duration (string matching pattern '^\\d+\\s+(minutes?|seconds?|hours?)$'), meeting_date (string, YYYY-MM-DD format), meeting_time (string, HH:MM 24-hour format). Values provided: '300' (no unit, violates pattern), 'tomorrow' (not YYYY-MM-DD), '3 PM' (not 24-hour format).",
713
+ "explanation": "All three parameters have significant format violations: timer_duration '300' lacks the required unit and space (should be '5 minutes'), meeting_date 'tomorrow' doesn't follow YYYY-MM-DD format (should be '2025-08-07'), and meeting_time '3 PM' uses 12-hour format instead of required 24-hour HH:MM (should be '15:00'). These errors will likely cause API rejection.",
714
+ "output": 2,
715
+ "confidence": 0.92,
716
+ "correction": {
717
+ "parameter_issues": [
718
+ {
719
+ "parameter_name": "timer_duration",
720
+ "reason_types": [
721
+ "FORMAT_ERROR",
722
+ "UNIT_ERROR"
723
+ ],
724
+ "reasons": "Value '300' lacks required unit and space format, should be '5 minutes'",
725
+ "corrected_value": {
726
+ "timer_duration": "5 minutes"
727
+ }
728
+ },
729
+ {
730
+ "parameter_name": "meeting_date",
731
+ "reason_types": [
732
+ "FORMAT_ERROR"
733
+ ],
734
+ "reasons": "Value 'tomorrow' doesn't follow required YYYY-MM-DD format",
735
+ "corrected_value": {
736
+ "meeting_date": "2025-08-07"
737
+ }
738
+ },
739
+ {
740
+ "parameter_name": "meeting_time",
741
+ "reason_types": [
742
+ "FORMAT_ERROR"
743
+ ],
744
+ "reasons": "Value '3 PM' uses 12-hour format instead of required 24-hour HH:MM",
745
+ "corrected_value": {
746
+ "meeting_time": "15:00"
747
+ }
748
+ }
749
+ ],
750
+ "tool_call": {
751
+ "name": "set_timer_and_meeting",
752
+ "arguments": {
753
+ "timer_duration": "5 minutes",
754
+ "meeting_date": "2025-08-07",
755
+ "meeting_time": "15:00"
756
+ }
757
+ }
758
+ },
759
+ "actionable_recommendations": [
760
+ {
761
+ "recommendation": "PARAMETER_EXAMPLES",
762
+ "parameter_name": "timer_duration",
763
+ "details": "Add clear examples in the API documentation showing timer_duration as '5 minutes', meeting_date as '2025-08-07', and meeting_time as '15:00' to prevent format confusion.",
764
+ "quote": "Examples: `{ \"timer_duration\": \"5 minutes\" }`, `{ \"timer_duration\": \"10 seconds\" }`"
765
+ },
766
+ {
767
+ "recommendation": "SYSTEM_PROMPT_ADDITIONS",
768
+ "parameter_name": "timer_duration",
769
+ "details": "Add to system prompt to always convert relative dates, 12-hour times, and ensure timer durations include both number and unit with a space before calling tools.",
770
+ "quote": "Always convert relative dates like 'tomorrow' to YYYY-MM-DD format, convert 12-hour time to 24-hour HH:MM format, and ensure timer durations include both number and unit with a space."
771
+ },
772
+ {
773
+ "recommendation": "PARAMETER_VALIDATION",
774
+ "parameter_name": "timer_duration",
775
+ "details": "Implement pre-call validation to check timer_duration pattern, date format compliance, and time format before making tool calls.",
776
+ "quote": "Validate 'timer_duration' matches pattern '^\\d+\\s+(minutes?|seconds?|hours?)$', 'meeting_date' is in 'YYYY-MM-DD', and 'meeting_time' is in 'HH:MM' 24-hour format before calling the tool."
777
+ }
778
+ ]
779
+ }
780
+ }
781
+ ]
782
+ }
783
+ ]