ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD +97 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +58 -17
- wxo_agentic_evaluation/inference_backend.py +32 -17
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +114 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/__init__.py +15 -6
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -3
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +138 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +11 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"name": "general_hallucination_check",
|
|
4
|
+
"task_description": "Evaluate whether each parameter value in the function call is correct and **directly supported** by the provided conversation history and adhere the API specification. Your assessment must be based **strictly on explicit evidence** from these sources and correctly formatted based on the API specifications. Do **not** assume or hallucinate any information that is not clearly documented and provided.\n\n---\n\n#### 1. Grounding Sources\n\nA parameter value is considered grounded if it originates from one of the following:\n\n- An explicit user message in the conversation \n- An assistant message that the user confirmed or acknowledged \n- The output of a previous tool call \n- A documented default value in the API specification \n\n---\n\n#### 2. Parameter Value Classification\n\nEach parameter value must be labeled using one of the following categories:\n\n- **CORRECT** \n The value is explicitly mentioned, clearly implied in the conversation, or matches a documented default.\n\n- **MISSING_INFORMATION** \n The value is underspecified or incomplete given the current context (e.g., '05/06' when a full date is required).\n\n- **FORMAT ERROR** \n The value is conceptually correct but incorrectly formatted (e.g., wrong units, casing, or structure) based on the API specifications and descriptions.\n\n- **CONTRADICTORY_VALUES** \n The value violates documented constraints or logical relationships between parameters (e.g., `start_time` should be < `end_time`).\n\n- **DEFAULT_ISSUE** \n The default value of the parameter is not the correct value based on the conversation history.\n\n---\n\n#### 3. Handling Default Values\n\n- Optional parameters may only use default values that are **explicitly documented** in the API specification. \n- Defaults that are assumed or undocumented count as hallucinations. \n- If no value is provided and no default exists, the parameter should be omitted.\n\n---\n\n#### 4. 
Acceptable Transformations\n\nSome transformations are permitted, but only when grounded:\n\n- **Synonyms** (e.g., \"NYC\" -> \"New York\") are allowed only when normalized in the conversation. \n- **Unit conversions** are valid only if the conversation explicitly mentions the conversion. \n- **Format changes** (e.g., date or time reformatting) are acceptable only if acknowledged by the assistant or supported by the specification.\n\n---\n\n#### 5. Multi-Call Context\n\nIf the tool call appears in a sequence:\n\n- You may use outputs from earlier tool calls to justify parameter values. \n- Consider dependencies and ordering between calls when assessing grounding. \n\n---\n\n#### Conservative Judgment Principle\n\nWhen in doubt, err on the side of caution. If grounding cannot be clearly established, treat the parameter as incorrect.\n",
|
|
5
|
+
"jsonschema": {
|
|
6
|
+
"title": "general_hallucination_check",
|
|
7
|
+
"description": "Assessment of tool call grounding accuracy, following the rubric defined in the task description.",
|
|
8
|
+
"type": "object",
|
|
9
|
+
"additionalProperties": false,
|
|
10
|
+
"properties": {
|
|
11
|
+
"evidence": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"description": "Provide EXACT quotes from user messages, assistant responses, prior tool outputs, or API specifications that demonstrate whether each parameter value is grounded. Do not paraphrase or interpret-use only direct evidence."
|
|
14
|
+
},
|
|
15
|
+
"explanation": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"description": "For EACH parameter value, explain its grounding status by referencing specific evidence from the conversation or tool specification. Indicate whether the value is correctly grounded, hallucinated, needs more information, has format errors, or contradicts other parameters."
|
|
18
|
+
},
|
|
19
|
+
"output": {
|
|
20
|
+
"type": "integer",
|
|
21
|
+
"minimum": 1,
|
|
22
|
+
"maximum": 5,
|
|
23
|
+
"threshold_low": 4,
|
|
24
|
+
"threshold_high": 5,
|
|
25
|
+
"description": "Reflection Quality Score (1-5):\n\n- 5: All values are correct - fully grounded and well-formatted.\n All parameter values are explicitly grounded in the conversation or tool output, and exactly match the expected types, formats, and units.\n Example:\n User: \"Book a hotel in Paris for two adults from August 10th to 12th.\"\n \u2192 Tool call includes: location: \"Paris\", guests: 2, check_in: \"2025-08-10\", check_out: \"2025-08-12\"\n\n- 4: All values are correct based on the conversation history and tool specifications - can be logically inferred (can be implicitly grounded).\n Some parameter values are not directly mentioned but can be logically inferred from the context. No evidence of hallucination.\n Example:\n User: \"I want to book a hotel in Tokyo for me and my wife.\"\n \u2192 Tool call includes: location: \"Tokyo\", guests: 2\n\n- 3: One or more values are probably incorrect or ungrounded - ambiguous or undetermined\n It's unclear whether some parameter values are grounded. They might be inferred, hallucinated, or drawn from partial context, or some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing.\n Example:\n User: \"Book a hotel in Rome.\"\n \u2192 Tool call includes: location: \"Rome\", guests: 2\n\n- 2: One or more values are probably incorrect - likely hallucinated or incorrect format.\n Several parameter values do not align with the conversation context and are likely hallucinated, or have format errors.\n Example:\n User: \"Find a hotel.\"\n \u2192 Tool call includes: location: \"New York\", guests: 4\n\n- 1: One or more values are clearly incorrect - certainly hallucinated or fabricated.\n Some parameter values are clearly invented, irrelevant, contradict known context, or do not follow the API specifications.\n Example:\n User: \"Remind me to call John.\"\n \u2192 Tool call includes: location: \"Barcelona\", guests: 6, check_in: \"2025-07-01\""
|
|
26
|
+
},
|
|
27
|
+
"confidence": {
|
|
28
|
+
"type": "number",
|
|
29
|
+
"minimum": 0,
|
|
30
|
+
"maximum": 1,
|
|
31
|
+
"threshold_low": 0,
|
|
32
|
+
"threshold_high": 1,
|
|
33
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
34
|
+
},
|
|
35
|
+
"correction": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"description": "Corrections to the tool call. For correctly grounded calls, use an empty object {}. For flawed calls, include parameter_issues and optionally a corrected tool_call if the full correction can be reconstructed.",
|
|
38
|
+
"properties": {
|
|
39
|
+
"parameter_issues": {
|
|
40
|
+
"type": "array",
|
|
41
|
+
"description": "List of parameters with issues. Leave empty if no issues were found.",
|
|
42
|
+
"items": {
|
|
43
|
+
"type": "object",
|
|
44
|
+
"properties": {
|
|
45
|
+
"parameter_name": {
|
|
46
|
+
"type": "string",
|
|
47
|
+
"description": "Name of the parameter with an issue."
|
|
48
|
+
},
|
|
49
|
+
"reason_types": {
|
|
50
|
+
"type": "array",
|
|
51
|
+
"description": "One or more categories explaining the issue with this parameter value.",
|
|
52
|
+
"items": {
|
|
53
|
+
"type": "string",
|
|
54
|
+
"enum": [
|
|
55
|
+
"MISSING_INFORMATION",
|
|
56
|
+
"FORMAT_ERROR",
|
|
57
|
+
"CONTRADICTORY_VALUES",
|
|
58
|
+
"DEFAULT_ISSUE",
|
|
59
|
+
"OTHER"
|
|
60
|
+
]
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
"reasons": {
|
|
64
|
+
"type": "string",
|
|
65
|
+
"description": "Brief explanation of the issue(s) for this parameter."
|
|
66
|
+
},
|
|
67
|
+
"corrected_value": {
|
|
68
|
+
"type": "object",
|
|
69
|
+
"description": "Object containing parameter name as key and corrected value as value. Use 'need_more_information' as the key and clarification question for the user if additional information is required. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-grounded, return an empty object {}.",
|
|
70
|
+
"additionalProperties": true
|
|
71
|
+
}
|
|
72
|
+
},
|
|
73
|
+
"required": []
|
|
74
|
+
}
|
|
75
|
+
},
|
|
76
|
+
"tool_call": {
|
|
77
|
+
"type": "object",
|
|
78
|
+
"description": "Complete corrected tool call, including all fixed arguments. Must be included if a full reconstruction is possible.",
|
|
79
|
+
"properties": {
|
|
80
|
+
"name": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"description": "Name of the function to call."
|
|
83
|
+
},
|
|
84
|
+
"arguments": {
|
|
85
|
+
"type": "object",
|
|
86
|
+
"description": "Set of corrected arguments for the function call, including both modified and retained values.",
|
|
87
|
+
"additionalProperties": true
|
|
88
|
+
}
|
|
89
|
+
},
|
|
90
|
+
"required": []
|
|
91
|
+
}
|
|
92
|
+
},
|
|
93
|
+
"required": []
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
"required": [
|
|
97
|
+
"evidence",
|
|
98
|
+
"explanation",
|
|
99
|
+
"output",
|
|
100
|
+
"confidence",
|
|
101
|
+
"correction"
|
|
102
|
+
]
|
|
103
|
+
},
|
|
104
|
+
"examples": [
|
|
105
|
+
{
|
|
106
|
+
"user_kwargs": {
|
|
107
|
+
"conversation_context": [
|
|
108
|
+
{
|
|
109
|
+
"role": "assistant",
|
|
110
|
+
"content": "Logged in as user 55; auth token acquired."
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"role": "assistant",
|
|
114
|
+
"content": "Ready to fetch your profile."
|
|
115
|
+
}
|
|
116
|
+
],
|
|
117
|
+
"tool_inventory": [
|
|
118
|
+
{
|
|
119
|
+
"type": "function",
|
|
120
|
+
"function": {
|
|
121
|
+
"name": "get_user_profile",
|
|
122
|
+
"description": "Retrieves a user's profile details",
|
|
123
|
+
"parameters": {
|
|
124
|
+
"type": "object",
|
|
125
|
+
"properties": {
|
|
126
|
+
"user_id": {
|
|
127
|
+
"type": "integer",
|
|
128
|
+
"description": "The ID of the user whose profile is being retrieved"
|
|
129
|
+
}
|
|
130
|
+
},
|
|
131
|
+
"required": [
|
|
132
|
+
"user_id"
|
|
133
|
+
]
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
],
|
|
138
|
+
"tool_call": {
|
|
139
|
+
"id": "call_001",
|
|
140
|
+
"type": "function",
|
|
141
|
+
"function": {
|
|
142
|
+
"name": "get_user_profile",
|
|
143
|
+
"arguments": "{ \"user_id\": 55 }"
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
},
|
|
147
|
+
"output": {
|
|
148
|
+
"evidence": "Assistant message: \"Logged in as user 55; auth token acquired.\"",
|
|
149
|
+
"explanation": "The user_id=55 parameter is properly grounded in the conversation history, as evidenced by the assistant's explicit statement \"Logged in as user 55.\"",
|
|
150
|
+
"output": 5,
|
|
151
|
+
"confidence": 0.95,
|
|
152
|
+
"correction": {}
|
|
153
|
+
}
|
|
154
|
+
},
|
|
155
|
+
{
|
|
156
|
+
"user_kwargs": {
|
|
157
|
+
"conversation_context": [
|
|
158
|
+
{
|
|
159
|
+
"role": "assistant",
|
|
160
|
+
"content": "Ready to fetch your profile."
|
|
161
|
+
}
|
|
162
|
+
],
|
|
163
|
+
"tool_inventory": [
|
|
164
|
+
{
|
|
165
|
+
"type": "function",
|
|
166
|
+
"function": {
|
|
167
|
+
"name": "get_user_profile",
|
|
168
|
+
"description": "Retrieves a user's profile details",
|
|
169
|
+
"parameters": {
|
|
170
|
+
"type": "object",
|
|
171
|
+
"properties": {
|
|
172
|
+
"user_id": {
|
|
173
|
+
"type": "integer",
|
|
174
|
+
"description": "The ID of the user whose profile is being retrieved"
|
|
175
|
+
}
|
|
176
|
+
},
|
|
177
|
+
"required": [
|
|
178
|
+
"user_id"
|
|
179
|
+
]
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
],
|
|
184
|
+
"tool_call": {
|
|
185
|
+
"id": "call_001",
|
|
186
|
+
"type": "function",
|
|
187
|
+
"function": {
|
|
188
|
+
"name": "get_user_profile",
|
|
189
|
+
"arguments": "{ \"user_id\": 42 }"
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
},
|
|
193
|
+
"output": {
|
|
194
|
+
"evidence": "Complete conversation history contains no mention of user ID 42. Tool specification for get_user_profile does not specify any default value for the user_id parameter.",
|
|
195
|
+
"explanation": "The user_id=42 parameter is a hallucinated value with no grounding in the conversation or tool specification.",
|
|
196
|
+
"output": 1,
|
|
197
|
+
"confidence": 0.91,
|
|
198
|
+
"correction": {
|
|
199
|
+
"parameter_issues": [
|
|
200
|
+
{
|
|
201
|
+
"parameter_name": "user_id",
|
|
202
|
+
"reason_types": [
|
|
203
|
+
"MISSING_INFORMATION"
|
|
204
|
+
],
|
|
205
|
+
"reasons": "Value 42 is invented without any basis in conversation or specification",
|
|
206
|
+
"corrected_value": {
|
|
207
|
+
"need_more_information": "Please provide the user ID to retrieve the profile."
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
]
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
},
|
|
214
|
+
{
|
|
215
|
+
"user_kwargs": {
|
|
216
|
+
"conversation_context": [
|
|
217
|
+
{
|
|
218
|
+
"role": "user",
|
|
219
|
+
"content": "I want to book a flight to Rome on 19-07-2024. I want to fly business class."
|
|
220
|
+
}
|
|
221
|
+
],
|
|
222
|
+
"tool_inventory": [
|
|
223
|
+
{
|
|
224
|
+
"type": "function",
|
|
225
|
+
"function": {
|
|
226
|
+
"name": "book_flight",
|
|
227
|
+
"description": "Books a flight to a specified destination on a given date.",
|
|
228
|
+
"parameters": {
|
|
229
|
+
"type": "object",
|
|
230
|
+
"properties": {
|
|
231
|
+
"destination": {
|
|
232
|
+
"type": "string",
|
|
233
|
+
"description": "The destination city or airport for the flight"
|
|
234
|
+
},
|
|
235
|
+
"departure_date": {
|
|
236
|
+
"type": "string",
|
|
237
|
+
"description": "The date of departure, in YYYY-MM-DD format, e.g., '2024-07-19'"
|
|
238
|
+
},
|
|
239
|
+
"seat_class": {
|
|
240
|
+
"type": "string",
|
|
241
|
+
"description": "The class of the seat (must be one of: economy, business, or first)",
|
|
242
|
+
"enum": [
|
|
243
|
+
"economy",
|
|
244
|
+
"business",
|
|
245
|
+
"first"
|
|
246
|
+
]
|
|
247
|
+
}
|
|
248
|
+
},
|
|
249
|
+
"required": [
|
|
250
|
+
"destination",
|
|
251
|
+
"departure_date",
|
|
252
|
+
"seat_class"
|
|
253
|
+
]
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
],
|
|
258
|
+
"tool_call": {
|
|
259
|
+
"id": "call_001",
|
|
260
|
+
"type": "function",
|
|
261
|
+
"function": {
|
|
262
|
+
"name": "book_flight",
|
|
263
|
+
"arguments": "{ \"destination\": \"Rome\", \"departure_date\": \"19-07-2024\", \"seat_class\": \"business\" }"
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
},
|
|
267
|
+
"output": {
|
|
268
|
+
"evidence": "- User message: \"I want to book a flight to Rome on 19-07-2024. I want to fly business class.\"\n- Tool spec requires 'departure_date' in YYYY-MM-DD format.\n- 'seat_class' is correctly grounded as 'business'.",
|
|
269
|
+
"explanation": "The 'destination' parameter is correctly grounded as 'Rome'. The 'departure_date' is incorrectly formatted as '19-07-2024' instead of the required 'YYYY-MM-DD' format. The 'seat_class' is correctly grounded as 'business', but the date format error needs to be addressed.",
|
|
270
|
+
"output": 2,
|
|
271
|
+
"confidence": 0.95,
|
|
272
|
+
"correction": {
|
|
273
|
+
"parameter_issues": [
|
|
274
|
+
{
|
|
275
|
+
"parameter_name": "departure_date",
|
|
276
|
+
"reason_types": [
|
|
277
|
+
"FORMAT_ERROR"
|
|
278
|
+
],
|
|
279
|
+
"reasons": "The date format '19-07-2024' does not conform to the required 'YYYY-MM-DD' format.",
|
|
280
|
+
"corrected_value": {
|
|
281
|
+
"departure_date": "2024-07-19"
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
],
|
|
285
|
+
"tool_call": {
|
|
286
|
+
"name": "book_flight",
|
|
287
|
+
"arguments": {
|
|
288
|
+
"destination": "Rome",
|
|
289
|
+
"departure_date": "2024-07-19",
|
|
290
|
+
"seat_class": "business"
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
]
|
|
297
|
+
},
|
|
298
|
+
{
|
|
299
|
+
"name": "general_value_format_alignment",
|
|
300
|
+
"task_description": "Evaluate whether ALL parameter values in the function call conform to the required type, format, and unit conventions defined in the API specification.\n\nYour judgment must be strictly evidence-based: rely exclusively on the provided API documentation and the parameter values. Do not assume or infer formatting rules that are not explicitly documented.\n\nIMPORTANT: Evaluate ALL parameter values in the tool call to ensure they meet the exact type, format, and unit requirements. Consider the collective format compliance of the entire function call.\n---\n\n### 1. Data Type Compliance\n- Check that each value matches the required data type (e.g., string, integer, boolean, object, array)\n- Verify that numbers are represented as numeric types, not strings\n- Ensure strings are quoted correctly if required\n- Confirm booleans are true/false (not \"true\"/\"false\") when the type is boolean\n\n---\n\n### 2. Format Specification Compliance\n- Validate adherence to documented format constraints for each parameter, including:\n - Date and time formats (e.g., ISO 8601, YYYY-MM-DD, timezone presence)\n - Currency (e.g., $USD, symbol placement, decimal precision)\n - Special patterns (e.g., phone numbers, email addresses, postal codes)\n- Enforce regular expressions or formatting rules defined in the schema\n\n---\n\n### 3. Unit Compliance\n- Verify presence or absence of unit suffixes/prefixes as specified (e.g., \"kg\", \"seconds\")\n- Confirm unit types are consistent with spec (e.g., Celsius vs Fahrenheit)\n- Reject any extra units when raw values are required\n\n---\n\n### 4. 
Consistency Requirements\n- Ensure values are consistent with related parameters (e.g., both timestamps in UTC)\n- Validate compatibility between parameter values in the same call\n- Check that parameter relationships don't violate constraints\n\n---\n\n### Format Alignment Rating Scale\n- **5 - Perfect Alignment**: All parameter values exactly match required types, formats, and units\n- **4 - Minor Deviation**: All values are fundamentally correct but some have minor formatting issues that are still likely to be accepted by the API or parser unless strict validation is enforced\n- **3 - Moderate Deviation**: Some parameter types or formats are partially incorrect; may cause issues\n- **2 - Major Deviation**: Multiple parameters have significant type and format violations; likely to fail\n- **1 - Complete Mismatch**: One or more parameter values are entirely incompatible with specification\n\n---\n\n### Final Rule\nBe conservative in your assessment:\n- If the spec is ambiguous and values seem plausible, prefer a higher score (4-5)\n- If the spec is clear and values deviate, assign a lower score (1-3)\n- Consider the cumulative impact of all format issues across parameters\n\nYour evaluation is critical: even minor formatting issues can lead to tool failures or incorrect outputs. Careful review ensures reliability in downstream tool use.",
|
|
301
|
+
"jsonschema": {
|
|
302
|
+
"title": "general_value_format_alignment",
|
|
303
|
+
"description": "Assessment of all parameter values' compliance with required type, format, and unit specifications, based on the rubric above.",
|
|
304
|
+
"type": "object",
|
|
305
|
+
"additionalProperties": false,
|
|
306
|
+
"properties": {
|
|
307
|
+
"evidence": {
|
|
308
|
+
"type": "string",
|
|
309
|
+
"description": "Quote the specification's type/format definitions for each parameter and include the actual parameter values provided. Cite EXACT text from the tool specification that supports your judgment."
|
|
310
|
+
},
|
|
311
|
+
"explanation": {
|
|
312
|
+
"type": "string",
|
|
313
|
+
"description": "For each parameter value, explain why it conforms or does not conform to the specification's requirements. Address data type, format, units, and any pattern or constraint violations. For incorrect values, describe exactly what is wrong and how it affects the overall tool call."
|
|
314
|
+
},
|
|
315
|
+
"output": {
|
|
316
|
+
"type": "integer",
|
|
317
|
+
"minimum": 1,
|
|
318
|
+
"maximum": 5,
|
|
319
|
+
"threshold_low": 4,
|
|
320
|
+
"threshold_high": 5,
|
|
321
|
+
"description": "An integer from 1 to 5 indicating how well all parameter values conform to the required types, formats, and units as defined in the API specification.\n\nScore meanings:\n\n5 - Perfect Alignment: All parameter values exactly match the expected types, formats, and units. Example: all dates in 'YYYY-MM-DD', all numbers as integers, all strings properly formatted.\n\n4 - Minor Deviation: All values are fundamentally correct but some have small formatting issues (e.g., missing leading zeros) that are still likely to be accepted by the API or parser unless strict validation is enforced. Example: expected '2025-08-05', value is '2025-8-5'.\n\n3 - Moderate Deviation: Some parameter values partially match the expected format but are likely to be rejected or misinterpreted by automatic processing. Example: mix of correct and incorrectly formatted dates.\n\n2 - Major Deviation: Multiple parameter values significantly violate the expected type, format, or unit and are very likely to fail. Example: wrong data types, completely wrong formats.\n\n1 - Complete Mismatch: One or more parameter values are entirely incompatible with the required format or type. Example: expected boolean, value is 'maybe'; expected date, value is random text."
|
|
322
|
+
},
|
|
323
|
+
"confidence": {
|
|
324
|
+
"type": "number",
|
|
325
|
+
"minimum": 0,
|
|
326
|
+
"maximum": 1,
|
|
327
|
+
"threshold_low": 0,
|
|
328
|
+
"threshold_high": 1,
|
|
329
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
330
|
+
},
|
|
331
|
+
"correction": {
|
|
332
|
+
"type": "object",
|
|
333
|
+
"description": "If output >= 4, return {}. If output <= 3, provide parameter_issues and a corrected tool_call.",
|
|
334
|
+
"properties": {
|
|
335
|
+
"parameter_issues": {
|
|
336
|
+
"type": "array",
|
|
337
|
+
"description": "List of parameters with format issues. Leave empty if no issues were found.",
|
|
338
|
+
"items": {
|
|
339
|
+
"type": "object",
|
|
340
|
+
"properties": {
|
|
341
|
+
"parameter_name": {
|
|
342
|
+
"type": "string",
|
|
343
|
+
"description": "Name of the parameter with a format issue."
|
|
344
|
+
},
|
|
345
|
+
"reason_types": {
|
|
346
|
+
"type": "array",
|
|
347
|
+
"description": "List of format issues identified. Use one or more of: TYPE_ERROR, FORMAT_ERROR, UNIT_ERROR, PATTERN_ERROR, CONSISTENCY_ERROR, OTHER.",
|
|
348
|
+
"items": {
|
|
349
|
+
"type": "string",
|
|
350
|
+
"enum": [
|
|
351
|
+
"TYPE_ERROR",
|
|
352
|
+
"FORMAT_ERROR",
|
|
353
|
+
"UNIT_ERROR",
|
|
354
|
+
"PATTERN_ERROR",
|
|
355
|
+
"CONSISTENCY_ERROR",
|
|
356
|
+
"OTHER"
|
|
357
|
+
]
|
|
358
|
+
}
|
|
359
|
+
},
|
|
360
|
+
"reasons": {
|
|
361
|
+
"type": "string",
|
|
362
|
+
"description": "Short explanation of the specific issue(s) with the parameter's format, type, or unit."
|
|
363
|
+
},
|
|
364
|
+
"corrected_value": {
|
|
365
|
+
"type": "object",
|
|
366
|
+
"description": "An object containing the corrected parameter value in the form: { \"<parameter_name>\": <corrected_value> }. If you cannot correct the value, use 'need_more_information' as the key and provide a clarification question for the user. Use 'need_more_tool_calls' if the value cannot be corrected without additional tool calls. If the value is well-formatted, return an empty object {}.",
|
|
367
|
+
"additionalProperties": true
|
|
368
|
+
}
|
|
369
|
+
},
|
|
370
|
+
"required": [
|
|
371
|
+
"parameter_name",
|
|
372
|
+
"reason_types",
|
|
373
|
+
"reasons",
|
|
374
|
+
"corrected_value"
|
|
375
|
+
]
|
|
376
|
+
}
|
|
377
|
+
},
|
|
378
|
+
"tool_call": {
|
|
379
|
+
"type": "object",
|
|
380
|
+
"description": "Complete corrected tool call with all format issues fixed, including both modified and retained parameter values. Must be included if a full reconstruction is possible.",
|
|
381
|
+
"properties": {
|
|
382
|
+
"name": {
|
|
383
|
+
"type": "string",
|
|
384
|
+
"description": "Name of the function to call."
|
|
385
|
+
},
|
|
386
|
+
"arguments": {
|
|
387
|
+
"type": "object",
|
|
388
|
+
"description": "Set of corrected arguments for the function call, with all format issues resolved.",
|
|
389
|
+
"additionalProperties": true
|
|
390
|
+
}
|
|
391
|
+
},
|
|
392
|
+
"required": [
|
|
393
|
+
"name",
|
|
394
|
+
"arguments"
|
|
395
|
+
]
|
|
396
|
+
}
|
|
397
|
+
},
|
|
398
|
+
"required": []
|
|
399
|
+
}
|
|
400
|
+
},
|
|
401
|
+
"required": [
|
|
402
|
+
"evidence",
|
|
403
|
+
"explanation",
|
|
404
|
+
"output",
|
|
405
|
+
"confidence",
|
|
406
|
+
"correction"
|
|
407
|
+
]
|
|
408
|
+
},
|
|
409
|
+
"examples": [
|
|
410
|
+
{
|
|
411
|
+
"user_kwargs": {
|
|
412
|
+
"conversation_context": [
|
|
413
|
+
{
|
|
414
|
+
"role": "user",
|
|
415
|
+
"content": "Book a flight to Paris on 2025-08-15 for 2 adults in economy class."
|
|
416
|
+
}
|
|
417
|
+
],
|
|
418
|
+
"tool_inventory": [
|
|
419
|
+
{
|
|
420
|
+
"type": "function",
|
|
421
|
+
"function": {
|
|
422
|
+
"name": "book_flight",
|
|
423
|
+
"description": "Books a flight with specified parameters",
|
|
424
|
+
"parameters": {
|
|
425
|
+
"type": "object",
|
|
426
|
+
"properties": {
|
|
427
|
+
"destination": {
|
|
428
|
+
"type": "string",
|
|
429
|
+
"description": "Destination city"
|
|
430
|
+
},
|
|
431
|
+
"departure_date": {
|
|
432
|
+
"type": "string",
|
|
433
|
+
"format": "date",
|
|
434
|
+
"description": "Date in YYYY-MM-DD format"
|
|
435
|
+
},
|
|
436
|
+
"passengers": {
|
|
437
|
+
"type": "integer",
|
|
438
|
+
"description": "Number of passengers"
|
|
439
|
+
},
|
|
440
|
+
"class": {
|
|
441
|
+
"type": "string",
|
|
442
|
+
"enum": [
|
|
443
|
+
"economy",
|
|
444
|
+
"business",
|
|
445
|
+
"first"
|
|
446
|
+
],
|
|
447
|
+
"description": "Flight class"
|
|
448
|
+
}
|
|
449
|
+
},
|
|
450
|
+
"required": [
|
|
451
|
+
"destination",
|
|
452
|
+
"departure_date",
|
|
453
|
+
"passengers",
|
|
454
|
+
"class"
|
|
455
|
+
]
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
],
|
|
460
|
+
"tool_call": {
|
|
461
|
+
"id": "call_001",
|
|
462
|
+
"type": "function",
|
|
463
|
+
"function": {
|
|
464
|
+
"name": "book_flight",
|
|
465
|
+
"arguments": "{\"destination\": \"Paris\", \"departure_date\": \"2025-08-15\", \"passengers\": 2, \"class\": \"economy\"}"
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
},
|
|
469
|
+
"output": {
|
|
470
|
+
"evidence": "Tool spec requires: destination (string), departure_date (string, YYYY-MM-DD format), passengers (integer), class (enum: economy/business/first). Values provided: 'Paris' (string), '2025-08-15' (string, YYYY-MM-DD), 2 (integer), 'economy' (valid enum).",
|
|
471
|
+
"explanation": "All parameter values perfectly conform to their specifications: destination is a proper string, departure_date follows the exact YYYY-MM-DD format, passengers is provided as an integer (not string), and class matches one of the required enum values. No format issues detected.",
|
|
472
|
+
"output": 5,
|
|
473
|
+
"confidence": 0.98,
|
|
474
|
+
"correction": {}
|
|
475
|
+
}
|
|
476
|
+
},
|
|
477
|
+
{
|
|
478
|
+
"user_kwargs": {
|
|
479
|
+
"conversation_context": [
|
|
480
|
+
{
|
|
481
|
+
"role": "user",
|
|
482
|
+
"content": "Set a timer for 5 minutes and schedule a meeting for tomorrow at 3 PM."
|
|
483
|
+
}
|
|
484
|
+
],
|
|
485
|
+
"tool_inventory": [
|
|
486
|
+
{
|
|
487
|
+
"type": "function",
|
|
488
|
+
"function": {
|
|
489
|
+
"name": "set_timer_and_meeting",
|
|
490
|
+
"description": "Sets a timer and schedules a meeting",
|
|
491
|
+
"parameters": {
|
|
492
|
+
"type": "object",
|
|
493
|
+
"properties": {
|
|
494
|
+
"timer_duration": {
|
|
495
|
+
"type": "string",
|
|
496
|
+
"pattern": "^\\d+\\s+(minutes?|seconds?|hours?)$",
|
|
497
|
+
"description": "Duration in format 'number unit' (e.g., '5 minutes')"
|
|
498
|
+
},
|
|
499
|
+
"meeting_date": {
|
|
500
|
+
"type": "string",
|
|
501
|
+
"format": "date",
|
|
502
|
+
"description": "Date in YYYY-MM-DD format"
|
|
503
|
+
},
|
|
504
|
+
"meeting_time": {
|
|
505
|
+
"type": "string",
|
|
506
|
+
"pattern": "^([01]?[0-9]|2[0-3]):[0-5][0-9]$",
|
|
507
|
+
"description": "Time in HH:MM 24-hour format"
|
|
508
|
+
}
|
|
509
|
+
},
|
|
510
|
+
"required": [
|
|
511
|
+
"timer_duration",
|
|
512
|
+
"meeting_date",
|
|
513
|
+
"meeting_time"
|
|
514
|
+
]
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
],
|
|
519
|
+
"tool_call": {
|
|
520
|
+
"id": "call_002",
|
|
521
|
+
"type": "function",
|
|
522
|
+
"function": {
|
|
523
|
+
"name": "set_timer_and_meeting",
|
|
524
|
+
"arguments": "{\"timer_duration\": \"300\", \"meeting_date\": \"tomorrow\", \"meeting_time\": \"3 PM\"}"
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
},
|
|
528
|
+
"output": {
|
|
529
|
+
"evidence": "Tool spec requires: timer_duration (string matching pattern '^\\d+\\s+(minutes?|seconds?|hours?)$'), meeting_date (string, YYYY-MM-DD format), meeting_time (string, HH:MM 24-hour format). Values provided: '300' (no unit, violates pattern), 'tomorrow' (not YYYY-MM-DD), '3 PM' (not 24-hour format).",
|
|
530
|
+
"explanation": "All three parameters have significant format violations: timer_duration '300' lacks the required unit and space (should be '5 minutes'), meeting_date 'tomorrow' doesn't follow YYYY-MM-DD format (should be '2025-08-07'), and meeting_time '3 PM' uses 12-hour format instead of required 24-hour HH:MM (should be '15:00'). These errors will likely cause API rejection.",
|
|
531
|
+
"output": 2,
|
|
532
|
+
"confidence": 0.92,
|
|
533
|
+
"correction": {
|
|
534
|
+
"parameter_issues": [
|
|
535
|
+
{
|
|
536
|
+
"parameter_name": "timer_duration",
|
|
537
|
+
"reason_types": [
|
|
538
|
+
"FORMAT_ERROR",
|
|
539
|
+
"UNIT_ERROR"
|
|
540
|
+
],
|
|
541
|
+
"reasons": "Value '300' lacks required unit and space format, should be '5 minutes'",
|
|
542
|
+
"corrected_value": {
|
|
543
|
+
"timer_duration": "5 minutes"
|
|
544
|
+
}
|
|
545
|
+
},
|
|
546
|
+
{
|
|
547
|
+
"parameter_name": "meeting_date",
|
|
548
|
+
"reason_types": [
|
|
549
|
+
"FORMAT_ERROR"
|
|
550
|
+
],
|
|
551
|
+
"reasons": "Value 'tomorrow' doesn't follow required YYYY-MM-DD format",
|
|
552
|
+
"corrected_value": {
|
|
553
|
+
"meeting_date": "2025-08-07"
|
|
554
|
+
}
|
|
555
|
+
},
|
|
556
|
+
{
|
|
557
|
+
"parameter_name": "meeting_time",
|
|
558
|
+
"reason_types": [
|
|
559
|
+
"FORMAT_ERROR"
|
|
560
|
+
],
|
|
561
|
+
"reasons": "Value '3 PM' uses 12-hour format instead of required 24-hour HH:MM",
|
|
562
|
+
"corrected_value": {
|
|
563
|
+
"meeting_time": "15:00"
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
],
|
|
567
|
+
"tool_call": {
|
|
568
|
+
"name": "set_timer_and_meeting",
|
|
569
|
+
"arguments": {
|
|
570
|
+
"timer_duration": "5 minutes",
|
|
571
|
+
"meeting_date": "2025-08-07",
|
|
572
|
+
"meeting_time": "15:00"
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
]
|
|
579
|
+
}
|
|
580
|
+
]
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Union
|
|
2
|
+
|
|
3
|
+
from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.base import (
|
|
4
|
+
FunctionMetricsPrompt,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
_function_system = (
|
|
8
|
+
"### Task Description:\n\n"
|
|
9
|
+
"{{ task_description }}\n\n"
|
|
10
|
+
"Your output must conform to the following JSON schema, in the same order as the fields appear in the schema:\n"
|
|
11
|
+
"{{ metric_jsonschema }}"
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
_function_user = (
|
|
15
|
+
"Conversation context:\n"
|
|
16
|
+
"{{ conversation_context }}\n\n"
|
|
17
|
+
"Tools Inventory:\n"
|
|
18
|
+
"{{ tools_inventory }}\n\n"
|
|
19
|
+
"Proposed function call:\n"
|
|
20
|
+
"{{ proposed_tool_call }}\n\n"
|
|
21
|
+
"Function name:\n"
|
|
22
|
+
"{{ selected_function }}\n\n"
|
|
23
|
+
"Return a JSON object as specified in the system prompt. You MUST keep the same order of fields in the JSON object as provided in the JSON schema and examples."
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class FunctionSelectionPrompt(FunctionMetricsPrompt):
    """Prompt definition used for function-selection metrics.

    The class adds no behaviour of its own; it only binds the two
    module-level template strings as class attributes. How those templates
    are rendered is decided by ``FunctionMetricsPrompt`` (not visible here;
    presumably it fills the ``{{ ... }}`` placeholders -- confirm against
    the base class).
    """

    # Placeholders: task_description, metric_jsonschema.
    system_template = _function_system

    # Placeholders: conversation_context, tools_inventory,
    # proposed_tool_call, selected_function.
    user_template = _function_user
|