ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"name": "function_selection_appropriateness",
|
|
4
|
+
"task_description": "You are an evaluator assessing whether a specific function is an appropriate next step in a user-assistant conversation.\n\nTASK: Determine if the selected function aligns with the user's current request and is justified by the conversation history and tool inventory.\nIMPORTANT: Evaluate only the function selection appropriateness. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\nEVALUATION CRITERIA:\n\n1. EVIDENCE SOURCES:\n- Use only the conversation history (system prompts, user and assistant turns, prior tool calls and outputs).\n- Use only the tool inventory, which includes function names and descriptions.\n- Do not infer capabilities, tool logic, or user goals beyond what is explicitly provided.\n\n2. DIALOGUE CONTEXT:\n- Conversations alternate between user and assistant.\n- Assistant turns may include multiple tool calls.\n- Tool calls may be:\n - INDEPENDENT: order does not matter\n - CHAINED: later calls depend on earlier ones\n- Consider the evolving context of the user's goal.\n\n3. APPROPRIATENESS DEFINITION:\nA function is appropriate if:\n- It addresses the user's stated or implied request.\n- It contributes meaningfully to the immediate goal.\n- It aligns with the documented function description.\n- It is not redundant with earlier tool calls (e.g. a duplication of the same tool call with the same parameters that is unnecessary).\n\nNote: More than one function may be appropriate. If this call satisfies a relevant part of the user's request, it is valid.\n\n4. WHEN INAPPROPRIATE:\nFlag the function as inappropriate only if:\n- A better function exists in the inventory\n- No valid function matches the request\n- The call is redundant or unnecessary\n\nProvide a corrected tool call if applicable, or use a placeholder such as:\n- {\"name\": \"no_function\"}\n- {\"name\": \"no_function_redundant\"}\n\n5. CONSERVATIVE JUDGMENT:\n- Do not assume tool behavior not stated in descriptions.\n- Do not reference tools or logic outside the inventory.\n- Do not invent user intent.\n- Do not penalize a valid call just because alternatives exist.\n\nOnly mark the function as inappropriate if clear, explicit evidence from the conversation or tool descriptions supports that judgment.",
|
|
5
|
+
"jsonschema": {
|
|
6
|
+
"title": "function_selection_appropriateness",
|
|
7
|
+
"description": "Assessment of whether the selected function is an appropriate next step based on the user's request and the provided tool descriptions.",
|
|
8
|
+
"type": "object",
|
|
9
|
+
"additionalProperties": false,
|
|
10
|
+
"properties": {
|
|
11
|
+
"evidence": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"description": "Provide direct quotes from the conversation and tool descriptions that justify your assessment. Only include grounded evidence; do not paraphrase or infer."
|
|
14
|
+
},
|
|
15
|
+
"explanation": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"description": "Briefly explain why the selected function is or isn't appropriate for the user's request. Reference the user's message and the relevant tool description. If the function is inappropriate, state whether: (1) a better function exists, (2) no function is suitable, or (3) the call is redundant or incorrectly ordered."
|
|
18
|
+
},
|
|
19
|
+
"output": {
|
|
20
|
+
"type": "integer",
|
|
21
|
+
"minimum": 1,
|
|
22
|
+
"maximum": 5,
|
|
23
|
+
"threshold_low": 3,
|
|
24
|
+
"threshold_high": 5,
|
|
25
|
+
"description": "Function Selection Appropriateness Score (1-5):\n\n- 5: Clearly appropriate and well-justified\n The selected function directly aligns with the user's request, is grounded in the conversation history, and matches the documented purpose in the tool inventory.\n Example:\n User: \"Can you check the weather in Rome tomorrow?\"\n Selected function: get_weather(city=\"Rome\", date=\"2025-08-06\")\n\n- 4: Reasonable choice with minor gaps\n The function is a plausible and helpful next step based on the conversation, though some details (e.g., goal inference or chaining rationale) may not be fully explicit.\n Example:\n User: \"What's my email address?\"\n Selected function: get_user_id()\n (Calling get_user_id() is a precondition for get_user_email(user_id=...))\n\n- 3: Ambiguous or weakly justified\n The function may relate to the user's goal, but there is not enough evidence in the conversation or function description to clearly support it.\n Example:\n User: \"I'm planning a trip to Tokyo.\"\n Selected function: get_weather(city=\"Tokyo\")\n (User didn't request weather, but it's a useful proactive step)\n\n- 2: Likely inappropriate\n The function does not clearly support the user's request or appears redundant given earlier tool calls.\n Example:\n User: \"Book me a restaurant in Berlin.\"\n Selected function: get_weather(city=\"Berlin\")\n\n- 1: Clearly inappropriate or unjustified\n The function is unrelated to the conversation, violates tool constraints, or invents a user goal.\n Example:\n User: \"Remind me to call Alex.\"\n Selected function: get_weather(city=\"New York\")"
|
|
26
|
+
},
|
|
27
|
+
"confidence": {
|
|
28
|
+
"type": "number",
|
|
29
|
+
"minimum": 0,
|
|
30
|
+
"maximum": 1,
|
|
31
|
+
"threshold_low": 0,
|
|
32
|
+
"threshold_high": 1,
|
|
33
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
34
|
+
},
|
|
35
|
+
"correction": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"description": "Required when there are issues. Leave empty ({}) when there are no issues. For invalid function calls, specify the issue(s), explanation, and corrected tool call or placeholder.",
|
|
38
|
+
"properties": {
|
|
39
|
+
"reason_types": {
|
|
40
|
+
"type": "array",
|
|
41
|
+
"description": "Categories of issues with this function selection. Use one or more of: IRRELEVANT_FUNCTION, BETTER_FUNCTION_EXISTS, MISSING_FUNCTION, REDUNDANT_CALL, OTHER.",
|
|
42
|
+
"items": {
|
|
43
|
+
"type": "string",
|
|
44
|
+
"enum": [
|
|
45
|
+
"IRRELEVANT_FUNCTION",
|
|
46
|
+
"BETTER_FUNCTION_EXISTS",
|
|
47
|
+
"MISSING_FUNCTION",
|
|
48
|
+
"REDUNDANT_CALL",
|
|
49
|
+
"OTHER"
|
|
50
|
+
]
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"reasons": {
|
|
54
|
+
"type": "string",
|
|
55
|
+
"description": "Short explanation of the problem with the function selection."
|
|
56
|
+
},
|
|
57
|
+
"corrected_function_name": {
|
|
58
|
+
"type": "string",
|
|
59
|
+
"description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary."
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"required": []
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": [
|
|
66
|
+
"explanation",
|
|
67
|
+
"evidence",
|
|
68
|
+
"output",
|
|
69
|
+
"confidence",
|
|
70
|
+
"correction"
|
|
71
|
+
]
|
|
72
|
+
},
|
|
73
|
+
"examples": [
|
|
74
|
+
{
|
|
75
|
+
"user_kwargs": {
|
|
76
|
+
"conversation_context": [
|
|
77
|
+
{
|
|
78
|
+
"role": "user",
|
|
79
|
+
"content": "What time is it in Tokyo?"
|
|
80
|
+
}
|
|
81
|
+
],
|
|
82
|
+
"tools_inventory": [
|
|
83
|
+
{
|
|
84
|
+
"name": "translate_text",
|
|
85
|
+
"description": "Translate text to a target language",
|
|
86
|
+
"parameters": {
|
|
87
|
+
"text": "string",
|
|
88
|
+
"target": "string"
|
|
89
|
+
}
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"name": "get_time",
|
|
93
|
+
"description": "Retrieve current local time",
|
|
94
|
+
"parameters": {
|
|
95
|
+
"timezone": "string"
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
],
|
|
99
|
+
"selected_function": "translate_text",
|
|
100
|
+
"proposed_tool_call": {
|
|
101
|
+
"id": "call_001",
|
|
102
|
+
"type": "function",
|
|
103
|
+
"function": {
|
|
104
|
+
"name": "translate_text",
|
|
105
|
+
"arguments": "{ \"text\": \"What time is it in Tokyo?\" }"
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
},
|
|
109
|
+
"output": {
|
|
110
|
+
"evidence": "User request: \"What time is it in Tokyo?\"\nFunction called: translate_text with description: \"Translate text to a target language\"\nMore appropriate function available: get_time with description: \"Retrieve current local time\"",
|
|
111
|
+
"explanation": "The translate_text function is misused, as the user's request for the current time in Tokyo should have been handled by the get_time function instead.",
|
|
112
|
+
"output": 1,
|
|
113
|
+
"confidence": 0.95,
|
|
114
|
+
"correction": {
|
|
115
|
+
"reason_types": [
|
|
116
|
+
"IRRELEVANT_FUNCTION",
|
|
117
|
+
"BETTER_FUNCTION_EXISTS"
|
|
118
|
+
],
|
|
119
|
+
"reasons": "translate_text performs text translation, not time retrieval; get_time function directly addresses user's request for current time in Tokyo",
|
|
120
|
+
"corrected_function_name": "get_time"
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"user_kwargs": {
|
|
126
|
+
"conversation_context": [
|
|
127
|
+
{
|
|
128
|
+
"role": "user",
|
|
129
|
+
"content": "Show me my calendar events from July 17 2024 to July 21 2024."
|
|
130
|
+
},
|
|
131
|
+
{
|
|
132
|
+
"role": "assistant",
|
|
133
|
+
"content": "{\"id\":\"call_001\",\"type\":\"function\",\"function\":{\"name\":\"get_calendar_events\",\"arguments\":{\"start_date\":\"2024-07-17\",\"end_date\":\"2024-07-21\"}}}"
|
|
134
|
+
}
|
|
135
|
+
],
|
|
136
|
+
"tools_inventory": [
|
|
137
|
+
{
|
|
138
|
+
"name": "get_calendar_events",
|
|
139
|
+
"description": "Retrieve calendar events for a given time range (e.g., today, upcoming, past).",
|
|
140
|
+
"parameters": {
|
|
141
|
+
"start_date": "string",
|
|
142
|
+
"end_date": "string"
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
],
|
|
146
|
+
"selected_function": "get_calendar_events",
|
|
147
|
+
"proposed_tool_call": {
|
|
148
|
+
"id": "call_002",
|
|
149
|
+
"type": "function",
|
|
150
|
+
"function": {
|
|
151
|
+
"name": "get_calendar_events",
|
|
152
|
+
"arguments": "{ \"start_date\": \"2024-07-17\", \"end_date\": \"2024-07-21\" }"
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
},
|
|
156
|
+
"output": {
|
|
157
|
+
"evidence": "The same function call (get_calendar_events with start_date=2024-07-17 and end_date=2024-07-21) was already executed earlier and produced a valid response. There was no user message requesting updated information.",
|
|
158
|
+
"explanation": "This tool call is redundant. The assistant re-issued the same call with identical parameters and without any new conversational signal. Such repetition adds no value.",
|
|
159
|
+
"output": 1,
|
|
160
|
+
"confidence": 0.97,
|
|
161
|
+
"correction": {
|
|
162
|
+
"reason_types": [
|
|
163
|
+
"REDUNDANT_CALL"
|
|
164
|
+
],
|
|
165
|
+
"reasons": "Identical tool call was already made and completed. No user input suggests the need for an update.",
|
|
166
|
+
"corrected_function_name": "no_function_redundant"
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
},
|
|
170
|
+
{
|
|
171
|
+
"user_kwargs": {
|
|
172
|
+
"conversation_context": [
|
|
173
|
+
{
|
|
174
|
+
"role": "user",
|
|
175
|
+
"content": "Remind me to call Mom tomorrow at 6 PM."
|
|
176
|
+
}
|
|
177
|
+
],
|
|
178
|
+
"tools_inventory": [
|
|
179
|
+
{
|
|
180
|
+
"name": "get_reminders",
|
|
181
|
+
"description": "Retrieve your scheduled reminders.",
|
|
182
|
+
"parameters": {}
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
"name": "delete_reminder",
|
|
186
|
+
"description": "Remove a reminder by its ID.",
|
|
187
|
+
"parameters": {
|
|
188
|
+
"reminder_id": "string"
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
],
|
|
192
|
+
"selected_function": "get_reminders",
|
|
193
|
+
"proposed_tool_call": {
|
|
194
|
+
"id": "call_001",
|
|
195
|
+
"type": "function",
|
|
196
|
+
"function": {
|
|
197
|
+
"name": "get_reminders",
|
|
198
|
+
"arguments": "{}"
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
},
|
|
202
|
+
"output": {
|
|
203
|
+
"evidence": "User request: \"Remind me to call Mom tomorrow at 6 PM.\" Available tools are get_reminders and delete_reminder, neither of which supports creating new reminders.",
|
|
204
|
+
"explanation": "The user's intent is to schedule a reminder, but the available tools do not support this. get_reminders retrieves existing reminders, and delete_reminder removes them. There is no tool for creating a reminder.",
|
|
205
|
+
"output": 1,
|
|
206
|
+
"confidence": 0.93,
|
|
207
|
+
"correction": {
|
|
208
|
+
"reason_types": [
|
|
209
|
+
"MISSING_FUNCTION"
|
|
210
|
+
],
|
|
211
|
+
"reasons": "User request requires a reminder creation tool, which is not present in the tool inventory.",
|
|
212
|
+
"corrected_function_name": "no_function"
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
},
|
|
216
|
+
{
|
|
217
|
+
"user_kwargs": {
|
|
218
|
+
"conversation_context": [
|
|
219
|
+
{
|
|
220
|
+
"role": "user",
|
|
221
|
+
"content": "What is the weather like in Paris?"
|
|
222
|
+
}
|
|
223
|
+
],
|
|
224
|
+
"tools_inventory": [
|
|
225
|
+
{
|
|
226
|
+
"name": "get_weather",
|
|
227
|
+
"description": "Retrieve current weather conditions for a specified city.",
|
|
228
|
+
"parameters": {
|
|
229
|
+
"city": "string"
|
|
230
|
+
}
|
|
231
|
+
},
|
|
232
|
+
{
|
|
233
|
+
"name": "get_time",
|
|
234
|
+
"description": "Retrieve current local time for a specified timezone.",
|
|
235
|
+
"parameters": {
|
|
236
|
+
"timezone": "string"
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
],
|
|
240
|
+
"selected_function": "get_weather",
|
|
241
|
+
"proposed_tool_call": {
|
|
242
|
+
"id": "call_001",
|
|
243
|
+
"type": "function",
|
|
244
|
+
"function": {
|
|
245
|
+
"name": "get_weather",
|
|
246
|
+
"arguments": "{ \"city\": \"Paris\" }"
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
},
|
|
250
|
+
"output": {
|
|
251
|
+
"evidence": "User request for weather in Paris is clear and unambiguous.",
|
|
252
|
+
"explanation": "The proposed tool call is appropriate as it directly addresses the user's request for weather information.",
|
|
253
|
+
"output": 5,
|
|
254
|
+
"confidence": 0.99,
|
|
255
|
+
"correction": {}
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
]
|
|
259
|
+
},
|
|
260
|
+
{
|
|
261
|
+
"name": "agentic_constraints_satisfaction",
|
|
262
|
+
"task_description": "Evaluate whether the proposed tool call satisfies all agentic constraints required for safe execution.\n\nYour judgment must be based strictly on the provided conversation history and tools inventory. Do not infer or assume any constraints or workflows not explicitly documented.\nIMPORTANT: Evaluate only the agentic constraints satisfaction. Do not assess the function's execution, parameter values correctness, or any other aspect.\n\n---\n\n### 1. Explicit Prerequisites\n- Confirm that required prior actions (e.g., authentication, file upload) have been completed.\n- Accept only direct evidence such as: \"Auth token acquired\" or \"File uploaded successfully.\"\n\n---\n\n### 2. Tool Sequencing\n- Enforce sequencing only when the tool description explicitly states a required order.\n- If no such requirement is documented, assume calls can be executed independently.\n\n---\n\n### 3. Call Independence\n- Calls that are not explicitly marked as dependent may be treated as parallel or order-independent.\n- Do not impose workflow constraints unless specified.\n\n---\n\n### 4. Redundancy\n- Flag the tool call as redundant only if the same function with the same parameters was already executed and no contextual change justifies repetition.\n\n---\n\n### Final Rule\nYour evaluation must be conservative: flag constraint violations only when there is clear, explicit evidence. Do not assume or invent requirements not documented in the conversation or tool descriptions.",
|
|
263
|
+
"jsonschema": {
|
|
264
|
+
"title": "agentic_constraints_satisfaction",
|
|
265
|
+
"description": "Assessment of whether the tool call satisfies all agentic constraints as defined in the provided conversation and tool inventory.",
|
|
266
|
+
"type": "object",
|
|
267
|
+
"additionalProperties": false,
|
|
268
|
+
"properties": {
|
|
269
|
+
"evidence": {
|
|
270
|
+
"type": "string",
|
|
271
|
+
"description": "Quote specific lines from the conversation or tool specifications showing whether constraints (e.g., prerequisites, sequence, parameter relationships) are satisfied or violated. Include exact wording, not paraphrased summaries."
|
|
272
|
+
},
|
|
273
|
+
"explanation": {
|
|
274
|
+
"type": "string",
|
|
275
|
+
"description": "Explain clearly whether all agentic constraints are satisfied. Address prerequisite completions, sequencing logic, redundancy, parameter presence, and parameter consistency, based solely on the provided context."
|
|
276
|
+
},
|
|
277
|
+
"output": {
|
|
278
|
+
"type": "integer",
|
|
279
|
+
"minimum": 1,
|
|
280
|
+
"maximum": 5,
|
|
281
|
+
"threshold_low": 3,
|
|
282
|
+
"threshold_high": 5,
|
|
283
|
+
"description": "Agentic Constraints Satisfaction Score (1-5):\n\n- 5: Fully compliant with all known constraints\n The tool call satisfies all documented prerequisites, sequencing rules, and redundancy checks based on explicit evidence in the conversation and tool inventory.\n Example:\n User: \"Here is my file.\"\n Assistant: (File upload confirmed)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 4: Likely compliant, with minor uncertainty\n All major constraints are satisfied, but some minor dependency or precondition may not be fully confirmed, though it is likely met based on adjacent context.\n Example:\n User: \"I've uploaded the file already.\"\n (No explicit upload confirmation, but no contradiction)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 3: Ambiguous or undetermined\n It is unclear whether all required conditions or dependencies are met. The call might be valid, but confirmation is lacking.\n Example:\n User: \"Can you summarize the document?\"\n (No evidence of file upload or confirmation)\n -> Tool call: summarize_uploaded_file(file_id=\"abc123\")\n\n- 2: Likely violates one or more constraints\n At least one key constraint (such as missing authentication, a required prior step, or improper sequencing) appears unsatisfied.\n Example:\n User: \"Please delete my account.\"\n (No authentication step documented)\n -> Tool call: delete_account(user_id=\"456\")\n\n- 1: Clearly violates agentic constraints\n The tool call ignores an explicit precondition, repeats a redundant action, or breaks an ordering rule described in the tool documentation.\n Example:\n Assistant already called: upload_file(name=\"report.pdf\")\n -> Immediately repeats: upload_file(name=\"report.pdf\") without any contextual change"
|
|
284
|
+
},
|
|
285
|
+
"confidence": {
|
|
286
|
+
"type": "number",
|
|
287
|
+
"minimum": 0,
|
|
288
|
+
"maximum": 1,
|
|
289
|
+
"threshold_low": 0,
|
|
290
|
+
"threshold_high": 1,
|
|
291
|
+
"description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Assign higher confidence when evidence is clear, complete, and consistent, and lower confidence when it is ambiguous, incomplete, or conflicting."
|
|
292
|
+
},
|
|
293
|
+
"correction": {
|
|
294
|
+
"type": "object",
|
|
295
|
+
"description": "For satisfied constraints: Provide an empty object {}. For constraint violations: Provide an object with reason_types, reasons, and either prerequisite_tool_calls (for missing prerequisites) or corrected_function_name (for parameter issues).",
|
|
296
|
+
"properties": {
|
|
297
|
+
"reason_types": {
|
|
298
|
+
"type": "array",
|
|
299
|
+
"description": "Categories of constraint violations, if any.",
|
|
300
|
+
"items": {
|
|
301
|
+
"type": "string",
|
|
302
|
+
"enum": [
|
|
303
|
+
"MISSING_PREREQUISITE",
|
|
304
|
+
"REDUNDANT_CALL",
|
|
305
|
+
"OTHER"
|
|
306
|
+
]
|
|
307
|
+
}
|
|
308
|
+
},
|
|
309
|
+
"reasons": {
|
|
310
|
+
"type": "string",
|
|
311
|
+
"description": "Concise justification for the detected constraint violations, tied to specific evidence."
|
|
312
|
+
},
|
|
313
|
+
"prerequisite_tool_calls": {
|
|
314
|
+
"type": "array",
|
|
315
|
+
"description": "If the issue is missing prerequisites, list the tool calls that should precede this one.",
|
|
316
|
+
"items": {
|
|
317
|
+
"type": "object",
|
|
318
|
+
"properties": {
|
|
319
|
+
"name": {
|
|
320
|
+
"type": "string",
|
|
321
|
+
"description": "Name of the required function"
|
|
322
|
+
}
|
|
323
|
+
},
|
|
324
|
+
"required": []
|
|
325
|
+
}
|
|
326
|
+
},
|
|
327
|
+
"corrected_function_name": {
|
|
328
|
+
"type": "string",
|
|
329
|
+
"description": "Name of the corrected function to call. Use \"no_function\" if no function should be called, or \"no_function_redundant\" if the function call is unnecessary."
|
|
330
|
+
}
|
|
331
|
+
},
|
|
332
|
+
"required": []
|
|
333
|
+
}
|
|
334
|
+
},
|
|
335
|
+
"required": [
|
|
336
|
+
"evidence",
|
|
337
|
+
"explanation",
|
|
338
|
+
"output",
|
|
339
|
+
"confidence",
|
|
340
|
+
"correction"
|
|
341
|
+
]
|
|
342
|
+
},
|
|
343
|
+
"examples": [
|
|
344
|
+
{
|
|
345
|
+
"user_kwargs": {
|
|
346
|
+
"conversation_context": [
|
|
347
|
+
{
|
|
348
|
+
"role": "assistant",
|
|
349
|
+
"content": "Auth token acquired for user 42."
|
|
350
|
+
}
|
|
351
|
+
],
|
|
352
|
+
"tools_inventory": [
|
|
353
|
+
{
|
|
354
|
+
"name": "get_order_history",
|
|
355
|
+
"description": "Retrieve past orders",
|
|
356
|
+
"parameters": {
|
|
357
|
+
"user_id": "integer"
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
],
|
|
361
|
+
"selected_function": "get_order_history",
|
|
362
|
+
"proposed_tool_call": {
|
|
363
|
+
"id": "call_001",
|
|
364
|
+
"type": "function",
|
|
365
|
+
"function": {
|
|
366
|
+
"name": "get_order_history",
|
|
367
|
+
"arguments": "{ \"user_id\": 42 }"
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
},
|
|
371
|
+
"output": {
|
|
372
|
+
"evidence": "Assistant message: \"Auth token acquired for user 42.\"",
|
|
373
|
+
"explanation": "Authentication is confirmed by assistant's message. No other prerequisites are required by the tool description. All agentic constraints are satisfied.",
|
|
374
|
+
"output": 5,
|
|
375
|
+
"confidence": 0.95,
|
|
376
|
+
"correction": {}
|
|
377
|
+
}
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
"user_kwargs": {
|
|
381
|
+
"conversation_context": [
|
|
382
|
+
{
|
|
383
|
+
"role": "user",
|
|
384
|
+
"content": "Translate 'Hola' to English."
|
|
385
|
+
}
|
|
386
|
+
],
|
|
387
|
+
"tools_inventory": [
|
|
388
|
+
{
|
|
389
|
+
"name": "translate_text",
|
|
390
|
+
"description": "Translate text to target language. REQUIRES: Prior successful call to detect_language for the input text.",
|
|
391
|
+
"parameters": {
|
|
392
|
+
"text": "string",
|
|
393
|
+
"target": "string"
|
|
394
|
+
}
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
"name": "detect_language",
|
|
398
|
+
"description": "Detect the language of the input text.",
|
|
399
|
+
"parameters": {
|
|
400
|
+
"text": "string"
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
],
|
|
404
|
+
"selected_function": "translate_text",
|
|
405
|
+
"proposed_tool_call": {
|
|
406
|
+
"id": "call_001",
|
|
407
|
+
"type": "function",
|
|
408
|
+
"function": {
|
|
409
|
+
"name": "translate_text",
|
|
410
|
+
"arguments": "{ \"text\": \"Hola\", \"target\": \"en\" }"
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
},
|
|
414
|
+
"output": {
|
|
415
|
+
"evidence": "Tool description requires prior detect_language call. No such call appears in conversation.",
|
|
416
|
+
"explanation": "translate_text was used without satisfying its documented prerequisite, violating agentic constraints.",
|
|
417
|
+
"output": 1,
|
|
418
|
+
"confidence": 0.95,
|
|
419
|
+
"correction": {
|
|
420
|
+
"reason_types": [
|
|
421
|
+
"MISSING_PREREQUISITE"
|
|
422
|
+
],
|
|
423
|
+
"reasons": "translate_text was called before detect_language despite a required sequence.",
|
|
424
|
+
"prerequisite_tool_calls": [
|
|
425
|
+
{
|
|
426
|
+
"name": "detect_language"
|
|
427
|
+
}
|
|
428
|
+
]
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
},
|
|
432
|
+
{
|
|
433
|
+
"user_kwargs": {
|
|
434
|
+
"conversation_context": [
|
|
435
|
+
{
|
|
436
|
+
"role": "user",
|
|
437
|
+
"content": "Please translate 'Bonjour' to English."
|
|
438
|
+
}
|
|
439
|
+
],
|
|
440
|
+
"tools_inventory": [
|
|
441
|
+
{
|
|
442
|
+
"name": "translate_text",
|
|
443
|
+
"description": "Translate input text to a specified target language.",
|
|
444
|
+
"parameters": {
|
|
445
|
+
"text": "string",
|
|
446
|
+
"target": "string"
|
|
447
|
+
}
|
|
448
|
+
},
|
|
449
|
+
{
|
|
450
|
+
"name": "detect_language",
|
|
451
|
+
"description": "Detect the language of the input text.",
|
|
452
|
+
"parameters": {
|
|
453
|
+
"text": "string"
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
],
|
|
457
|
+
"selected_function": "translate_text",
|
|
458
|
+
"proposed_tool_call": {
|
|
459
|
+
"id": "call_001",
|
|
460
|
+
"type": "function",
|
|
461
|
+
"function": {
|
|
462
|
+
"name": "translate_text",
|
|
463
|
+
"arguments": "{ \"text\": \"Bonjour\", \"target\": \"en\" }"
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
},
|
|
467
|
+
"output": {
|
|
468
|
+
"evidence": "The translate_text tool description does not specify any prerequisite or dependency on detect_language.",
|
|
469
|
+
"explanation": "There is no constraint violation. The tool can be used independently since its description does not document any required sequencing or prerequisite.",
|
|
470
|
+
"output": 5,
|
|
471
|
+
"confidence": 0.94,
|
|
472
|
+
"correction": {}
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
]
|
|
476
|
+
}
|
|
477
|
+
]
|