ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +58 -17
- wxo_agentic_evaluation/inference_backend.py +32 -17
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,455 @@ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py (new file)

import json
import math
import re
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Tuple,
    Union,
)

from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
    GeneralMetricsPrompt,
)
from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
    FunctionSelectionPrompt,
)
from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.loader import (
    PromptKind,
    load_prompts_from_list,
)
from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.adapters import (
    BaseAdapter,
    OpenAIAdapter,
)
from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.transformation_prompts import (
    GENERATE_CODE_SCHEMA,
    GENERATE_CODE_SYSTEM,
    GENERATE_CODE_USER,
    MULTI_EXTRACT_UNITS_SYSTEM,
    MULTI_EXTRACT_UNITS_USER,
    build_multi_extract_units_schema,
)
from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
    SemanticCategoryResult,
    SemanticResult,
    ToolCall,
    ToolSpec,
    TransformResult,
)
from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
    MetricRunner,
    MetricRunResult,
)


class SemanticChecker:
    """
    Orchestrates semantic metrics (and optional unit-transforms)
    for a single function call.

    Args:
        general_metrics: JSON-schema dicts for general metrics.
        function_metrics: JSON-schema dicts for function-selection metrics.
        parameter_metrics: JSON-schema dicts for parameter-level metrics.
        metrics_client: a WatsonXProvider instance for metric evaluation.
        codegen_client: a WatsonXProvider instance for transformation codegen.
        transform_enabled: whether to run unit-conversion checks.
    """

    def __init__(
        self,
        metrics_client: WatsonXProvider,
        *,
        general_metrics: Optional[List[Dict[str, Any]]] = None,
        function_metrics: Optional[List[Dict[str, Any]]] = None,
        parameter_metrics: Optional[List[Dict[str, Any]]] = None,
        codegen_client: Optional[WatsonXProvider] = None,
        transform_enabled: Optional[bool] = False,
    ) -> None:
        self.metrics_client = metrics_client

        self.transform_enabled = transform_enabled
        self.codegen_client = codegen_client

        self.general_prompts = []
        if general_metrics is not None:
            self.general_prompts = load_prompts_from_list(
                general_metrics, PromptKind.GENERAL
            )

        self.function_prompts = []
        if function_metrics is not None:
            self.function_prompts = load_prompts_from_list(
                function_metrics, PromptKind.FUNCTION_SELECTION
            )

        self.parameter_prompts = []
        if parameter_metrics is not None:
            self.parameter_prompts = load_prompts_from_list(
                parameter_metrics, PromptKind.PARAMETER
            )
    def _make_adapter(self, apis_specs, tool_call):
        first = apis_specs[0]
        if isinstance(first, ToolSpec):
            return OpenAIAdapter(apis_specs, tool_call)
        raise TypeError("Unsupported spec type")

    def _collect_params(self, adapter: BaseAdapter) -> Dict[str, Any]:
        """
        Return a mapping of every parameter name in the spec inventory
        to its value from the call (or defaulted if missing).
        """
        call_args = adapter.get_parameters()
        merged: Dict[str, Any] = {}
        # Find the function in the inventory
        function_parameters = (
            adapter.get_tool_spec(adapter.get_function_name())
            .get("parameters", {})
            .get("properties", {})
        )

        for pname, pschema in function_parameters.items():
            if pname in call_args:
                merged[pname] = call_args[pname]
            elif "default" in pschema:
                merged[pname] = pschema["default"]
            else:
                merged[pname] = (
                    f"Default value from parameter description (if defined): '{pschema.get('description', 'No description provided')}'"
                    f" Otherwise, by the default value of type: {pschema.get('type', 'object')}"
                )
        return merged
128
|
+
def extract_all_units_sync(
|
|
129
|
+
self,
|
|
130
|
+
context: Union[str, List[Dict[str, str]]],
|
|
131
|
+
adapter: BaseAdapter,
|
|
132
|
+
params: List[str],
|
|
133
|
+
retries: int = 1,
|
|
134
|
+
) -> Dict[str, Dict[str, Any]]:
|
|
135
|
+
"""
|
|
136
|
+
Synchronously extract user_value/user_units_or_format/spec_units_or_format for every parameter in `params`
|
|
137
|
+
by issuing a single LLM call.
|
|
138
|
+
Returns a dict mapping each parameter name to its classification object.
|
|
139
|
+
"""
|
|
140
|
+
# Build the combined JSON Schema requiring one object per parameter
|
|
141
|
+
multi_schema = build_multi_extract_units_schema(params)
|
|
142
|
+
schema_str = json.dumps(multi_schema, indent=2)
|
|
143
|
+
|
|
144
|
+
# Build the "full_spec" JSON Schema snippet for all parameters
|
|
145
|
+
full_spec_json = json.dumps(
|
|
146
|
+
adapter.get_tool_spec(adapter.get_function_name()).model_dump(),
|
|
147
|
+
indent=2,
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# Format system and user prompts
|
|
151
|
+
system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
|
|
152
|
+
user_prompt = MULTI_EXTRACT_UNITS_USER.format(
|
|
153
|
+
context=context,
|
|
154
|
+
full_spec=full_spec_json,
|
|
155
|
+
parameter_names=", ".join(params),
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Single synchronous LLM call
|
|
159
|
+
try:
|
|
160
|
+
response: Dict[str, Any] = self.metrics_client.generate(
|
|
161
|
+
prompt=[
|
|
162
|
+
{"role": "system", "content": system_prompt},
|
|
163
|
+
{"role": "user", "content": user_prompt},
|
|
164
|
+
],
|
|
165
|
+
schema=multi_schema,
|
|
166
|
+
retries=retries,
|
|
167
|
+
)
|
|
168
|
+
except Exception:
|
|
169
|
+
response = {
|
|
170
|
+
pname: {
|
|
171
|
+
"user_value": None,
|
|
172
|
+
"user_units_or_format": None,
|
|
173
|
+
"spec_units_or_format": None,
|
|
174
|
+
}
|
|
175
|
+
for pname in params
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
return response
|
|
179
|
+
|
|
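
    # A successful extraction maps each parameter name to a classification
    # object. A hypothetical sketch of the shape, inferred from the fallback
    # dict above and the keys run_sync reads (transformation_summary is
    # optional and only consumed via .get()):
    #
    #   {"distance": {"user_value": "5",
    #                 "user_units_or_format": "miles",
    #                 "spec_units_or_format": "kilometers",
    #                 "transformation_summary": "multiply miles by 1.60934"}}
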
180
|
+
def run_sync(
|
|
181
|
+
self,
|
|
182
|
+
apis_specs: List[ToolSpec],
|
|
183
|
+
tool_call: ToolCall,
|
|
184
|
+
context: Union[str, List[Dict[str, str]]],
|
|
185
|
+
retries: int = 1,
|
|
186
|
+
transform_enabled: Optional[bool] = None,
|
|
187
|
+
) -> SemanticResult:
|
|
188
|
+
"""
|
|
189
|
+
Synchronous semantic-only evaluation.
|
|
190
|
+
|
|
191
|
+
Returns a SemanticResult:
|
|
192
|
+
{
|
|
193
|
+
"general": {metric_name: result, …} or None
|
|
194
|
+
"function_selection": {…} or None
|
|
195
|
+
"parameter": {param_name: {metric_name: result}, …} or None
|
|
196
|
+
"transform": {param_name: TransformResult, …} or None
|
|
197
|
+
}
|
|
198
|
+
"""
|
|
199
|
+
# 1) Normalize via adapter
|
|
200
|
+
adapter = self._make_adapter(apis_specs, tool_call)
|
|
201
|
+
tools_inventory_summary = adapter.get_tools_inventory_summary()
|
|
202
|
+
call_dict = adapter.get_call_dict()
|
|
203
|
+
fn_name = adapter.get_function_name()
|
|
204
|
+
cur_tool_spec = adapter.get_tool_spec(fn_name)
|
|
205
|
+
params = self._collect_params(adapter)
|
|
206
|
+
|
|
207
|
+
if transform_enabled is not None:
|
|
208
|
+
old_transform_enabled = self.transform_enabled
|
|
209
|
+
self.transform_enabled = transform_enabled
|
|
210
|
+
|
|
211
|
+
# 2) GENERAL METRICS
|
|
212
|
+
general_results: Optional[SemanticCategoryResult]
|
|
213
|
+
entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
|
|
214
|
+
for prompt in self.general_prompts:
|
|
215
|
+
entries.append(
|
|
216
|
+
(
|
|
217
|
+
prompt,
|
|
218
|
+
{
|
|
219
|
+
"conversation_context": context,
|
|
220
|
+
"tool_inventory": cur_tool_spec,
|
|
221
|
+
"tool_call": call_dict,
|
|
222
|
+
},
|
|
223
|
+
)
|
|
224
|
+
)
|
|
225
|
+
if entries:
|
|
226
|
+
try:
|
|
227
|
+
runner = MetricRunner(entries)
|
|
228
|
+
sync_results = runner.run_all(
|
|
229
|
+
self.metrics_client.generate,
|
|
230
|
+
prompt_param_name="prompt",
|
|
231
|
+
schema_param_name="schema",
|
|
232
|
+
retries=retries,
|
|
233
|
+
)
|
|
234
|
+
general_results = SemanticCategoryResult.from_results(sync_results)
|
|
235
|
+
except Exception as e:
|
|
236
|
+
general_results = {"error": str(e)}
|
|
237
|
+
else:
|
|
238
|
+
general_results = None
|
|
239
|
+
|
|
240
|
+
# 3) FUNCTION-SELECTION METRICS
|
|
241
|
+
function_results: Optional[SemanticCategoryResult]
|
|
242
|
+
func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
|
|
243
|
+
for prompt in self.function_prompts:
|
|
244
|
+
func_entries.append(
|
|
245
|
+
(
|
|
246
|
+
prompt,
|
|
247
|
+
{
|
|
248
|
+
"conversation_context": context,
|
|
249
|
+
"tools_inventory": tools_inventory_summary,
|
|
250
|
+
"proposed_tool_call": call_dict,
|
|
251
|
+
"selected_function": fn_name,
|
|
252
|
+
},
|
|
253
|
+
)
|
|
254
|
+
)
|
|
255
|
+
if func_entries:
|
|
256
|
+
try:
|
|
257
|
+
runner = MetricRunner(func_entries)
|
|
258
|
+
sync_results = runner.run_all(
|
|
259
|
+
self.metrics_client.generate,
|
|
260
|
+
prompt_param_name="prompt",
|
|
261
|
+
schema_param_name="schema",
|
|
262
|
+
retries=retries,
|
|
263
|
+
)
|
|
264
|
+
function_results = SemanticCategoryResult.from_results(sync_results)
|
|
265
|
+
except Exception as e:
|
|
266
|
+
function_results = {"error": str(e)}
|
|
267
|
+
else:
|
|
268
|
+
function_results = None
|
|
269
|
+
|
|
270
|
+
# 4) PARAMETER-LEVEL METRICS
|
|
271
|
+
parameter_results: Optional[Dict[str, SemanticCategoryResult]] = {}
|
|
272
|
+
for pname, pval in params.items():
|
|
273
|
+
# Each parameter has its own prompts
|
|
274
|
+
try:
|
|
275
|
+
param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
|
|
276
|
+
for prompt in self.parameter_prompts:
|
|
277
|
+
param_entries.append(
|
|
278
|
+
(
|
|
279
|
+
prompt,
|
|
280
|
+
{
|
|
281
|
+
"conversation_context": context,
|
|
282
|
+
"tool_inventory": cur_tool_spec,
|
|
283
|
+
"tool_call": call_dict,
|
|
284
|
+
"parameter_name": pname,
|
|
285
|
+
"parameter_value": pval,
|
|
286
|
+
},
|
|
287
|
+
)
|
|
288
|
+
)
|
|
289
|
+
runner = MetricRunner(param_entries)
|
|
290
|
+
sync_results = runner.run_all(
|
|
291
|
+
self.metrics_client.generate,
|
|
292
|
+
prompt_param_name="prompt",
|
|
293
|
+
schema_param_name="schema",
|
|
294
|
+
retries=retries,
|
|
295
|
+
)
|
|
296
|
+
parameter_results[pname] = SemanticCategoryResult.from_results(
|
|
297
|
+
sync_results
|
|
298
|
+
)
|
|
299
|
+
except Exception as e:
|
|
300
|
+
parameter_results[pname] = {"error": str(e)}
|
|
301
|
+
|
|
302
|
+
if not parameter_results:
|
|
303
|
+
parameter_results = None
|
|
304
|
+
|
|
305
|
+
# Base SemanticResult without transforms
|
|
306
|
+
result = SemanticResult(
|
|
307
|
+
general=general_results,
|
|
308
|
+
function_selection=function_results,
|
|
309
|
+
parameter=parameter_results,
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
# 5) OPTIONAL TRANSFORMS
|
|
313
|
+
params = adapter.get_parameters()
|
|
314
|
+
if self.transform_enabled and params:
|
|
315
|
+
if transform_enabled is not None:
|
|
316
|
+
self.transform_enabled = old_transform_enabled
|
|
317
|
+
|
|
318
|
+
transform_out: Dict[str, TransformResult] = {}
|
|
319
|
+
|
|
320
|
+
# 5a) Extract units for all parameters in one synchronous call
|
|
321
|
+
units_map = self.extract_all_units_sync(
|
|
322
|
+
context=context,
|
|
323
|
+
adapter=adapter,
|
|
324
|
+
params=list(params.keys()),
|
|
325
|
+
retries=retries,
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
# 5b) Generate code & execute for each parameter needing conversion
|
|
329
|
+
for pname, units in units_map.items():
|
|
330
|
+
user_units = units.get("user_units_or_format") or ""
|
|
331
|
+
spec_units = units.get("spec_units_or_format") or ""
|
|
332
|
+
user_value = units.get("user_value")
|
|
333
|
+
transformation_summary = units.get("transformation_summary", "")
|
|
334
|
+
gen_code = ""
|
|
335
|
+
|
|
336
|
+
# Only generate code if user_units differs from spec_units and user_value is present
|
|
337
|
+
if (
|
|
338
|
+
user_units
|
|
339
|
+
and user_value is not None
|
|
340
|
+
and spec_units
|
|
341
|
+
and (user_units != spec_units)
|
|
342
|
+
):
|
|
343
|
+
try:
|
|
344
|
+
prompt = GENERATE_CODE_USER.format(
|
|
345
|
+
old_value=user_value,
|
|
346
|
+
old_units=user_units,
|
|
347
|
+
transformed_value=str(params[pname]),
|
|
348
|
+
transformed_units=spec_units,
|
|
349
|
+
transformed_type=type(params[pname]).__name__,
|
|
350
|
+
transformation_summary=transformation_summary,
|
|
351
|
+
)
|
|
352
|
+
gen_code = self.codegen_client.generate(
|
|
353
|
+
prompt=[
|
|
354
|
+
{"role": "system", "content": GENERATE_CODE_SYSTEM},
|
|
355
|
+
{"role": "user", "content": prompt},
|
|
356
|
+
],
|
|
357
|
+
schema=GENERATE_CODE_SCHEMA,
|
|
358
|
+
retries=retries,
|
|
359
|
+
).get("generated_code", "")
|
|
360
|
+
except Exception:
|
|
361
|
+
gen_code = ""
|
|
362
|
+
|
|
363
|
+
# 5c) Execute & validate
|
|
364
|
+
tr = self._execute_code_and_validate(
|
|
365
|
+
code=gen_code,
|
|
366
|
+
user_val=str(user_value or ""),
|
|
367
|
+
api_val=str(params[pname]),
|
|
368
|
+
units=units,
|
|
369
|
+
)
|
|
370
|
+
transform_out[pname] = tr
|
|
371
|
+
|
|
372
|
+
if transform_out:
|
|
373
|
+
result.transform = transform_out
|
|
374
|
+
else:
|
|
375
|
+
result.transform = None
|
|
376
|
+
|
|
377
|
+
return result
|
|
378
|
+
|
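
    # For reference, a hypothetical sketch of the helper pair the codegen step
    # is expected to emit (inferred from the names and comparison logic in
    # _execute_code_and_validate below, not from the prompt templates):
    #
    #   def transformation_code(value):           # user units -> spec units
    #       return float(value) * 1.60934          # e.g. miles -> km
    #
    #   def convert_example_str_transformed_to_transformed_type(value):
    #       return float(value)                    # parse the API-call value
    #
    # Validation then compares transformation_code(user_val) against
    # convert_example_str_transformed_to_transformed_type(api_val), using
    # math.isclose when both outputs are numeric.
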
    def _execute_code_and_validate(
        self,
        code: str,
        user_val: str,
        api_val: str,
        units: Dict[str, Any],
    ) -> TransformResult:
        """
        Strip code fences, import required modules, exec the code,
        compare the two conversions, and return a TransformResult.
        """
        clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()

        # import any modules the generated code references
        for mod in set(
            re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
        ):
            try:
                __import__(mod)
            except ImportError as e:
                return TransformResult(
                    units=units,
                    generated_code=clean,
                    execution_success=False,
                    correct=True,
                    execution_output=None,
                    correction=None,
                    error=f"Error: {e}. Could not import module '{mod}'. Please install the package and try again,"
                    " or run the generated code manually:\n"
                    f"transformation_code({user_val}) == convert_example_str_transformed_to_transformed_type({api_val})",
                )

        ns: Dict[str, Any] = {}
        try:
            exec(clean, ns)
            fn_t = ns.get("transformation_code")
            fn_c = ns.get("convert_example_str_transformed_to_transformed_type")
            if not callable(fn_t) or not callable(fn_c):
                raise ValueError("Generated code missing required functions")

            out_t = fn_t(user_val)
            out_c = fn_c(api_val)
            if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
                success = math.isclose(out_t, out_c, abs_tol=1e-3)
            else:
                success = str(out_t) == str(out_c)

            correction = None
            if not success:
                correction = (
                    f"The transformation code validation found an issue with the units transformation "
                    f"of the parameter.\n"
                    f"The user request value is '{user_val}' with units '{units.get('user_units_or_format')}' and "
                    f"the API call value is '{api_val}' with units '{units.get('spec_units_or_format')}'.\n"
                    f"Expected transformation is '{out_t}' based on the code.\n"
                )

            correct = correction is None

            return TransformResult(
                units=units,
                generated_code=clean,
                execution_success=True,
                correct=correct,
                execution_output={"transformed": out_t, "converted": out_c},
                correction=correction,
                error=None,
            )
        except Exception as e:
            return TransformResult(
                units=units,
                generated_code=clean,
                execution_success=False,
                correct=True,
                execution_output=None,
                correction=None,
                error=str(e),
            )
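
For orientation, a minimal sketch of driving the new checker end to end. The WatsonXProvider construction and the placeholder inputs (general_metric_schemas, function_metric_schemas, parameter_metric_schemas, tool_specs, proposed_call) are assumptions for illustration; only the SemanticChecker and run_sync signatures come from the diff above.

from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
    SemanticChecker,
)
from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider

client = WatsonXProvider()  # hypothetical: constructor arguments are not shown in this diff
checker = SemanticChecker(
    metrics_client=client,
    general_metrics=general_metric_schemas,      # JSON-schema dicts, loaded via metrics/loader.py
    function_metrics=function_metric_schemas,
    parameter_metrics=parameter_metric_schemas,
    codegen_client=client,
    transform_enabled=True,
)
result = checker.run_sync(
    apis_specs=tool_specs,       # List[ToolSpec], OpenAI-style tool specifications
    tool_call=proposed_call,     # ToolCall under evaluation
    context="user: the venue is 5 miles away, book a car",
)
# Inspect result.general, result.function_selection, result.parameter, result.transform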
@@ -0,0 +1,156 @@ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py (new file)

from typing import Dict, List

from jsonschema import (
    Draft7Validator,
)

from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
    StaticMetricResult,
    StaticResult,
    ToolCall,
    ToolSpec,
)

# ----------------------------------------
# Human-readable descriptions for checks
# ----------------------------------------
_STATIC_CHECKS: Dict[str, str] = {
    "non_existent_function": "Function name not found in the provided API specification.",
    "non_existent_parameter": "One or more parameters are not defined for the specified function.",
    "incorrect_parameter_type": "One or more parameters have values whose types don't match the expected types.",
    "missing_required_parameter": "One or more required parameters are missing from the call.",
    "allowed_values_violation": "One or more parameters have values outside the allowed enumeration.",
    "json_schema_validation": "The API call does not conform to the provided JSON Schema.",
    "empty_api_spec": "There are no API specifications provided or they are invalid.",
    "invalid_api_spec": "The API specifications provided are not valid Tool or ToolSpec instances.",
    "invalid_tool_call": "The provided ToolCall is not a valid instance of ToolCall.",
}


def evaluate_static(apis_specs: List[ToolSpec], api_call: ToolCall) -> StaticResult:
    """
    Perform static validation on a single tool call.

    Args:
        apis_specs: Non-empty list of ToolSpec instances (OpenAI spec for ToolCall)
        api_call: Single call to validate: ToolCall instance (OpenAI tool call)

    Returns:
        StaticResult(metrics=..., final_decision=bool)
    """
    if not isinstance(apis_specs, list) or not apis_specs:
        return StaticResult(
            metrics={
                "empty_api_spec": StaticMetricResult(
                    description=_STATIC_CHECKS["empty_api_spec"],
                    valid=False,
                    explanation="No API specifications provided.",
                )
            },
            final_decision=False,
        )

    if not all(isinstance(spec, ToolSpec) for spec in apis_specs):
        return StaticResult(
            metrics={
                "invalid_api_spec": StaticMetricResult(
                    description=_STATIC_CHECKS["invalid_api_spec"],
                    valid=False,
                    explanation="Invalid API specifications provided; expected ToolSpec instances (List of ToolSpec).",
                )
            },
            final_decision=False,
        )

    if not isinstance(api_call, ToolCall):
        return StaticResult(
            metrics={
                "invalid_tool_call": StaticMetricResult(
                    description=_STATIC_CHECKS["invalid_tool_call"],
                    valid=False,
                    explanation="Invalid ToolCall provided; expected ToolCall instance.",
                )
            },
            final_decision=False,
        )

    errors = _check_tool_call(specs=apis_specs, call=api_call)

    # Build metrics results: missing key => valid
    metrics: Dict[str, StaticMetricResult] = {}
    for check_name, desc in _STATIC_CHECKS.items():
        valid = check_name not in errors
        metrics[check_name] = StaticMetricResult(
            description=desc,
            valid=valid,
            explanation=None if valid else errors.get(check_name),
        )
    final_decision = all(m.valid for m in metrics.values())
    return StaticResult(metrics=metrics, final_decision=final_decision)


def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
    """
    Static checks for OpenAI ToolCall + ToolSpec list.
    Returns mapping of failed check keys -> explanation.
    """
    errors: Dict[str, str] = {}

    # 1) Function existence
    spec = next((s for s in specs if s.function.name == call.function.name), None)
    if not spec:
        errors["non_existent_function"] = (
            f"Function '{call.function.name}' does not exist in the provided API specifications:"
            f" {', '.join(s.function.name for s in specs)}."
        )
        return errors

    params_schema = spec.function.parameters
    properties = params_schema.get("properties", params_schema)
    parsed_arguments = call.function.parsed_arguments

    # 2) Parameter existence check
    if non_existent_params := set(parsed_arguments.keys()) - set(properties.keys()):
        errors["non_existent_parameter"] = (
            f"Parameters not defined in function '{call.function.name}': "
            f"{', '.join(sorted(non_existent_params))}. "
            f"Possible parameters are: {', '.join(sorted(properties.keys()))}."
        )

    # 3) JSON Schema validation
    validator = Draft7Validator(params_schema)

    missing_required = []
    incorrect_types = []
    invalid_enum = []
    other_errors = []

    for error in validator.iter_errors(parsed_arguments):
        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
        if error.validator == "required":
            missing_required.append(error.message)
        elif error.validator == "type":
            incorrect_types.append(f"{field}: {error.message}")
        elif error.validator == "enum":
            invalid_enum.append(f"{field}: {error.message}")
        else:
            other_errors.append(f"{field}: {error.message}")

    if missing_required:
        errors["missing_required_parameter"] = (
            "Missing required parameter(s): " + "; ".join(missing_required)
        )
    if incorrect_types:
        errors["incorrect_parameter_type"] = (
            "Incorrect parameter type(s): " + "; ".join(incorrect_types)
        )
    if invalid_enum:
        errors["allowed_values_violation"] = "Invalid parameter value(s): " + "; ".join(
            invalid_enum
        )
    if other_errors:
        errors["json_schema_validation"] = "Other validation error(s): " + "; ".join(
            other_errors
        )

    return errors
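
The static checker is the cheap, deterministic counterpart to the semantic one: it gates a tool call on schema-level validity before any LLM is involved. A minimal usage sketch, assuming tool_specs and proposed_call are built as in the previous example:

from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.static_checker import (
    evaluate_static,
)

static = evaluate_static(apis_specs=tool_specs, api_call=proposed_call)
if not static.final_decision:
    for name, metric in static.metrics.items():
        if not metric.valid:
            print(f"{name}: {metric.explanation}")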