ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (63)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +65 -20
  8. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  9. wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
  10. wxo_agentic_evaluation/inference_backend.py +117 -14
  11. wxo_agentic_evaluation/llm_user.py +2 -1
  12. wxo_agentic_evaluation/main.py +5 -0
  13. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  14. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  15. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  16. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  17. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  19. wxo_agentic_evaluation/quick_eval.py +342 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  21. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  22. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  23. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  24. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  46. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  47. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  48. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  49. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  50. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  51. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  52. wxo_agentic_evaluation/service_instance.py +2 -2
  53. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  54. wxo_agentic_evaluation/tool_planner.py +3 -1
  55. wxo_agentic_evaluation/type.py +33 -2
  56. wxo_agentic_evaluation/utils/__init__.py +0 -1
  57. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  58. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  59. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  60. wxo_agentic_evaluation/utils/utils.py +167 -5
  61. ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD +0 -56
  62. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  63. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py
@@ -0,0 +1,455 @@
+ import json
+ import math
+ import re
+ from typing import (
+     Any,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
+     GeneralMetricsPrompt,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
+     FunctionSelectionPrompt,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.loader import (
+     PromptKind,
+     load_prompts_from_list,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.adapters import (
+     BaseAdapter,
+     OpenAIAdapter,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.transformation_prompts import (
+     GENERATE_CODE_SCHEMA,
+     GENERATE_CODE_SYSTEM,
+     GENERATE_CODE_USER,
+     MULTI_EXTRACT_UNITS_SYSTEM,
+     MULTI_EXTRACT_UNITS_USER,
+     build_multi_extract_units_schema,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+     SemanticCategoryResult,
+     SemanticResult,
+     ToolCall,
+     ToolSpec,
+     TransformResult,
+ )
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
+ from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
+     MetricRunner,
+     MetricRunResult,
+ )
+
+
+ class SemanticChecker:
+     """
+     Orchestrates semantic metrics (and optional unit-transforms)
+     for a single function call.
+
+     Args:
+         general_metrics: JSON-schema dicts for general metrics.
+         function_metrics: JSON-schema dicts for function-selection metrics.
+         parameter_metrics: JSON-schema dicts for parameter-level metrics.
+         metrics_client: a WatsonXProvider instance for metric evaluation.
+         codegen_client: a WatsonXProvider instance for transformation codegen.
+         transform_enabled: whether to run unit-conversion checks.
+     """
+
+     def __init__(
+         self,
+         metrics_client: WatsonXProvider,
+         *,
+         general_metrics: Optional[List[Dict[str, Any]]] = None,
+         function_metrics: Optional[List[Dict[str, Any]]] = None,
+         parameter_metrics: Optional[List[Dict[str, Any]]] = None,
+         codegen_client: Optional[WatsonXProvider] = None,
+         transform_enabled: Optional[bool] = False,
+     ) -> None:
+         self.metrics_client = metrics_client
+
+         self.transform_enabled = transform_enabled
+         self.codegen_client = codegen_client
+
+         self.general_prompts = []
+         if general_metrics is not None:
+             self.general_prompts = load_prompts_from_list(
+                 general_metrics, PromptKind.GENERAL
+             )
+
+         self.function_prompts = []
+         if function_metrics is not None:
+             self.function_prompts = load_prompts_from_list(
+                 function_metrics, PromptKind.FUNCTION_SELECTION
+             )
+
+         self.parameter_prompts = []
+         if parameter_metrics is not None:
+             self.parameter_prompts = load_prompts_from_list(
+                 parameter_metrics, PromptKind.PARAMETER
+             )
+
+     def _make_adapter(self, apis_specs, tool_call):
+         first = apis_specs[0]
+         if isinstance(first, ToolSpec):
+             return OpenAIAdapter(apis_specs, tool_call)
+         raise TypeError("Unsupported spec type")
+
+     def _collect_params(self, adapter: BaseAdapter) -> Dict[str, Any]:
+         """
+         Return a mapping of every parameter name in the spec inventory
+         to its value from the call (or defaulted if missing).
+         """
+         call_args = adapter.get_parameters()
+         merged: Dict[str, Any] = {}
+         # Find the function in the inventory
+         function_parameters = (
+             adapter.get_tool_spec(adapter.get_function_name())
+             .get("parameters", {})
+             .get("properties", {})
+         )
+
+         for pname, pschema in function_parameters.items():
+             if pname in call_args:
+                 merged[pname] = call_args[pname]
+             elif "default" in pschema:
+                 merged[pname] = pschema["default"]
+             else:
+                 merged[pname] = (
+                     f"Default value from parameter description (if defined): '{pschema.get('description', 'No description provided')}'"
+                     f" Otherwise, by the default value of type: {pschema.get('type', 'object')}"
+                 )
+         return merged
+
+     def extract_all_units_sync(
+         self,
+         context: Union[str, List[Dict[str, str]]],
+         adapter: BaseAdapter,
+         params: List[str],
+         retries: int = 1,
+     ) -> Dict[str, Dict[str, Any]]:
+         """
+         Synchronously extract user_value/user_units_or_format/spec_units_or_format for every parameter in `params`
+         by issuing a single LLM call.
+         Returns a dict mapping each parameter name to its classification object.
+         """
+         # Build the combined JSON Schema requiring one object per parameter
+         multi_schema = build_multi_extract_units_schema(params)
+         schema_str = json.dumps(multi_schema, indent=2)
+
+         # Build the "full_spec" JSON Schema snippet for all parameters
+         full_spec_json = json.dumps(
+             adapter.get_tool_spec(adapter.get_function_name()).model_dump(),
+             indent=2,
+         )
+
+         # Format system and user prompts
+         system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
+         user_prompt = MULTI_EXTRACT_UNITS_USER.format(
+             context=context,
+             full_spec=full_spec_json,
+             parameter_names=", ".join(params),
+         )
+
+         # Single synchronous LLM call
+         try:
+             response: Dict[str, Any] = self.metrics_client.generate(
+                 prompt=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 schema=multi_schema,
+                 retries=retries,
+             )
+         except Exception:
+             response = {
+                 pname: {
+                     "user_value": None,
+                     "user_units_or_format": None,
+                     "spec_units_or_format": None,
+                 }
+                 for pname in params
+             }
+
+         return response
+
+     def run_sync(
+         self,
+         apis_specs: List[ToolSpec],
+         tool_call: ToolCall,
+         context: Union[str, List[Dict[str, str]]],
+         retries: int = 1,
+         transform_enabled: Optional[bool] = None,
+     ) -> SemanticResult:
+         """
+         Synchronous semantic-only evaluation.
+
+         Returns a SemanticResult:
+             {
+                 "general": {metric_name: result, …} or None
+                 "function_selection": {…} or None
+                 "parameter": {param_name: {metric_name: result}, …} or None
+                 "transform": {param_name: TransformResult, …} or None
+             }
+         """
+         # 1) Normalize via adapter
+         adapter = self._make_adapter(apis_specs, tool_call)
+         tools_inventory_summary = adapter.get_tools_inventory_summary()
+         call_dict = adapter.get_call_dict()
+         fn_name = adapter.get_function_name()
+         cur_tool_spec = adapter.get_tool_spec(fn_name)
+         params = self._collect_params(adapter)
+
+         if transform_enabled is not None:
+             old_transform_enabled = self.transform_enabled
+             self.transform_enabled = transform_enabled
+
+         # 2) GENERAL METRICS
+         general_results: Optional[SemanticCategoryResult]
+         entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
+         for prompt in self.general_prompts:
+             entries.append(
+                 (
+                     prompt,
+                     {
+                         "conversation_context": context,
+                         "tool_inventory": cur_tool_spec,
+                         "tool_call": call_dict,
+                     },
+                 )
+             )
+         if entries:
+             try:
+                 runner = MetricRunner(entries)
+                 sync_results = runner.run_all(
+                     self.metrics_client.generate,
+                     prompt_param_name="prompt",
+                     schema_param_name="schema",
+                     retries=retries,
+                 )
+                 general_results = SemanticCategoryResult.from_results(sync_results)
+             except Exception as e:
+                 general_results = {"error": str(e)}
+         else:
+             general_results = None
+
+         # 3) FUNCTION-SELECTION METRICS
+         function_results: Optional[SemanticCategoryResult]
+         func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
+         for prompt in self.function_prompts:
+             func_entries.append(
+                 (
+                     prompt,
+                     {
+                         "conversation_context": context,
+                         "tools_inventory": tools_inventory_summary,
+                         "proposed_tool_call": call_dict,
+                         "selected_function": fn_name,
+                     },
+                 )
+             )
+         if func_entries:
+             try:
+                 runner = MetricRunner(func_entries)
+                 sync_results = runner.run_all(
+                     self.metrics_client.generate,
+                     prompt_param_name="prompt",
+                     schema_param_name="schema",
+                     retries=retries,
+                 )
+                 function_results = SemanticCategoryResult.from_results(sync_results)
+             except Exception as e:
+                 function_results = {"error": str(e)}
+         else:
+             function_results = None
+
+         # 4) PARAMETER-LEVEL METRICS
+         parameter_results: Optional[Dict[str, SemanticCategoryResult]] = {}
+         for pname, pval in params.items():
+             # Each parameter has its own prompts
+             try:
+                 param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+                 for prompt in self.parameter_prompts:
+                     param_entries.append(
+                         (
+                             prompt,
+                             {
+                                 "conversation_context": context,
+                                 "tool_inventory": cur_tool_spec,
+                                 "tool_call": call_dict,
+                                 "parameter_name": pname,
+                                 "parameter_value": pval,
+                             },
+                         )
+                     )
+                 runner = MetricRunner(param_entries)
+                 sync_results = runner.run_all(
+                     self.metrics_client.generate,
+                     prompt_param_name="prompt",
+                     schema_param_name="schema",
+                     retries=retries,
+                 )
+                 parameter_results[pname] = SemanticCategoryResult.from_results(
+                     sync_results
+                 )
+             except Exception as e:
+                 parameter_results[pname] = {"error": str(e)}
+
+         if not parameter_results:
+             parameter_results = None
+
+         # Base SemanticResult without transforms
+         result = SemanticResult(
+             general=general_results,
+             function_selection=function_results,
+             parameter=parameter_results,
+         )
+
+         # 5) OPTIONAL TRANSFORMS
+         params = adapter.get_parameters()
+         if self.transform_enabled and params:
+             if transform_enabled is not None:
+                 self.transform_enabled = old_transform_enabled
+
+             transform_out: Dict[str, TransformResult] = {}
+
+             # 5a) Extract units for all parameters in one synchronous call
+             units_map = self.extract_all_units_sync(
+                 context=context,
+                 adapter=adapter,
+                 params=list(params.keys()),
+                 retries=retries,
+             )
+
+             # 5b) Generate code & execute for each parameter needing conversion
+             for pname, units in units_map.items():
+                 user_units = units.get("user_units_or_format") or ""
+                 spec_units = units.get("spec_units_or_format") or ""
+                 user_value = units.get("user_value")
+                 transformation_summary = units.get("transformation_summary", "")
+                 gen_code = ""
+
+                 # Only generate code if user_units differs from spec_units and user_value is present
+                 if (
+                     user_units
+                     and user_value is not None
+                     and spec_units
+                     and (user_units != spec_units)
+                 ):
+                     try:
+                         prompt = GENERATE_CODE_USER.format(
+                             old_value=user_value,
+                             old_units=user_units,
+                             transformed_value=str(params[pname]),
+                             transformed_units=spec_units,
+                             transformed_type=type(params[pname]).__name__,
+                             transformation_summary=transformation_summary,
+                         )
+                         gen_code = self.codegen_client.generate(
+                             prompt=[
+                                 {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                                 {"role": "user", "content": prompt},
+                             ],
+                             schema=GENERATE_CODE_SCHEMA,
+                             retries=retries,
+                         ).get("generated_code", "")
+                     except Exception:
+                         gen_code = ""
+
+                 # 5c) Execute & validate
+                 tr = self._execute_code_and_validate(
+                     code=gen_code,
+                     user_val=str(user_value or ""),
+                     api_val=str(params[pname]),
+                     units=units,
+                 )
+                 transform_out[pname] = tr
+
+             if transform_out:
+                 result.transform = transform_out
+             else:
+                 result.transform = None
+
+         return result
+
+     def _execute_code_and_validate(
+         self,
+         code: str,
+         user_val: str,
+         api_val: str,
+         units: Dict[str, Any],
+     ) -> TransformResult:
+         """
+         Strip code fences, install imports, exec code, compare, return TransformResult.
+         """
+         clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()
+
+         # install imports
+         for mod in set(
+             re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
+         ):
+             try:
+                 __import__(mod)
+             except ImportError as e:
+                 return TransformResult(
+                     units=units,
+                     generated_code=clean,
+                     execution_success=False,
+                     correct=True,
+                     execution_output=None,
+                     correction=None,
+                     error=f"Error: {e}. Could not import module '{mod}'. Please install the package and try again,"
+                     " or run the generated code manually:\n"
+                     f"transformation_code({user_val}) == convert_example_str_transformed_to_transformed_type({api_val})",
+                 )
+
+         ns: Dict[str, Any] = {}
+         try:
+             exec(clean, ns)
+             fn_t = ns.get("transformation_code")
+             fn_c = ns.get("convert_example_str_transformed_to_transformed_type")
+             if not callable(fn_t) or not callable(fn_c):
+                 raise ValueError("Generated code missing required functions")
+
+             out_t = fn_t(user_val)
+             out_c = fn_c(api_val)
+             if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
+                 success = math.isclose(out_t, out_c, abs_tol=1e-3)
+             else:
+                 success = str(out_t) == str(out_c)
+
+             correction = None
+             if not success:
+                 correction = (
+                     f"The transformation code validation found an issue with the units transformation "
+                     f"of the parameter.\n"
+                     f"The user request value is '{user_val}' with units '{units.get('user_units_or_format')}' and "
+                     f"the API call value is '{api_val}' with units '{units.get('spec_units_or_format')}'.\n"
+                     f"Expected transformation is '{out_t}' based on the code.\n"
+                 )
+
+             correct = correction is None
+
+             return TransformResult(
+                 units=units,
+                 generated_code=clean,
+                 execution_success=True,
+                 correct=correct,
+                 execution_output={"transformed": out_t, "converted": out_c},
+                 correction=correction,
+                 error=None,
+             )
+         except Exception as e:
+             return TransformResult(
+                 units=units,
+                 generated_code=clean,
+                 execution_success=False,
+                 correct=True,
+                 execution_output=None,
+                 correction=None,
+                 error=str(e),
+             )
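
The SemanticChecker above drives every metric through one duck-typed call, metrics_client.generate(prompt=[...], schema=..., retries=...), and expects a plain dict back that matches the supplied JSON schema. The minimal sketch below is not part of the package; the stub class and its canned payload are invented for illustration of that contract, which can be useful for exercising the pipeline offline without watsonx.ai credentials.

    from typing import Any, Dict, List

    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
        SemanticChecker,
    )


    class StubMetricsClient:
        """Stands in for WatsonXProvider: returns a canned dict instead of calling an LLM."""

        def __init__(self, canned: Dict[str, Any]):
            self.canned = canned

        def generate(
            self,
            prompt: List[Dict[str, str]],
            schema: Dict[str, Any],
            retries: int = 1,
        ) -> Dict[str, Any]:
            # SemanticChecker only consumes the returned dict, so a fixed payload is
            # enough for a dry run; a real provider would honour `schema` and `retries`.
            return self.canned


    checker = SemanticChecker(
        metrics_client=StubMetricsClient({"score": 1, "explanation": "stub"}),
        general_metrics=None,     # or a list of JSON-schema metric dicts, e.g. the entries
                                  # shipped in general_metrics_runtime.json
        transform_enabled=False,  # leave the codegen/exec unit-conversion step off
    )

Because run_sync only needs apis_specs (a list of ToolSpec from pipeline/types.py), a ToolCall, and the conversation context at call time, a stub like this is enough to smoke-test the general, function-selection, and parameter metric plumbing; the transform step additionally requires a codegen_client and transform_enabled=True.
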
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py
@@ -0,0 +1,156 @@
+ from typing import Dict, List
+
+ from jsonschema import (
+     Draft7Validator,
+ )
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+     StaticMetricResult,
+     StaticResult,
+     ToolCall,
+     ToolSpec,
+ )
+
+ # ----------------------------------------
+ # Human-readable descriptions for checks
+ # ----------------------------------------
+ _STATIC_CHECKS: Dict[str, str] = {
+     "non_existent_function": "Function name not found in the provided API specification.",
+     "non_existent_parameter": "One or more parameters are not defined for the specified function.",
+     "incorrect_parameter_type": "One or more parameters have values whose types don't match the expected types.",
+     "missing_required_parameter": "One or more required parameters are missing from the call.",
+     "allowed_values_violation": "One or more parameters have values outside the allowed enumeration.",
+     "json_schema_validation": "The API call does not conform to the provided JSON Schema.",
+     "empty_api_spec": "There are no API specifications provided or they are invalid.",
+     "invalid_api_spec": "The API specifications provided are not valid Tool or ToolSpec instances.",
+     "invalid_tool_call": "The provided ToolCall is not a valid instance of ToolCall.",
+ }
+
+
+ def evaluate_static(apis_specs: List[ToolSpec], api_call: ToolCall) -> StaticResult:
+     """
+     Perform static validation on a single tool call.
+
+     Args:
+         apis_specs: Non-empty list of ToolSpec instances (OpenAI spec for ToolCall)
+         api_call: Single call to validate: ToolCall instance (OpenAI tool call)
+
+     Returns:
+         StaticResult(metrics=..., final_decision=bool)
+     """
+     if not isinstance(apis_specs, list) or not apis_specs:
+         return StaticResult(
+             metrics={
+                 "empty_api_spec": StaticMetricResult(
+                     description=_STATIC_CHECKS["empty_api_spec"],
+                     valid=False,
+                     explanation="No API specifications provided.",
+                 )
+             },
+             final_decision=False,
+         )
+
+     if not all(isinstance(spec, ToolSpec) for spec in apis_specs):
+         return StaticResult(
+             metrics={
+                 "invalid_api_spec": StaticMetricResult(
+                     description=_STATIC_CHECKS["invalid_api_spec"],
+                     valid=False,
+                     explanation="Invalid API specifications provided; expected ToolSpec instances (List of ToolSpec).",
+                 )
+             },
+             final_decision=False,
+         )
+
+     if not isinstance(api_call, ToolCall):
+         return StaticResult(
+             metrics={
+                 "invalid_tool_call": StaticMetricResult(
+                     description=_STATIC_CHECKS["invalid_tool_call"],
+                     valid=False,
+                     explanation="Invalid ToolCall provided; expected ToolCall instance.",
+                 )
+             },
+             final_decision=False,
+         )
+
+     errors = _check_tool_call(specs=apis_specs, call=api_call)
+
+     # Build metrics results: missing key => valid
+     metrics: Dict[str, StaticMetricResult] = {}
+     for check_name, desc in _STATIC_CHECKS.items():
+         valid = check_name not in errors
+         metrics[check_name] = StaticMetricResult(
+             description=desc,
+             valid=valid,
+             explanation=None if valid else errors.get(check_name),
+         )
+     final_decision = all(m.valid for m in metrics.values())
+     return StaticResult(metrics=metrics, final_decision=final_decision)
+
+
+ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
+     """
+     Static checks for OpenAI ToolCall + ToolSpec list.
+     Returns mapping of failed check keys -> explanation.
+     """
+     errors: Dict[str, str] = {}
+
+     # 1) Function existence
+     spec = next((s for s in specs if s.function.name == call.function.name), None)
+     if not spec:
+         errors["non_existent_function"] = (
+             f"Function '{call.function.name}' does not exist in the provided API specifications:"
+             f" {', '.join(s.function.name for s in specs)}."
+         )
+         return errors
+
+     params_schema = spec.function.parameters
+     properties = params_schema.get("properties", params_schema)
+     parsed_arguments = call.function.parsed_arguments
+
+     # 2) Parameter existence check
+     if non_existent_params := set(parsed_arguments.keys()) - set(properties.keys()):
+         errors["non_existent_parameter"] = (
+             f"Parameters not defined in function '{call.function.name}': "
+             f"{', '.join(sorted(non_existent_params))}. "
+             f"Possible parameters are: {', '.join(sorted(properties.keys()))}."
+         )
+
+     # 3) JSON Schema validation
+     validator = Draft7Validator(params_schema)
+
+     missing_required = []
+     incorrect_types = []
+     invalid_enum = []
+     other_errors = []
+
+     for error in validator.iter_errors(parsed_arguments):
+         field = ".".join(str(x) for x in error.path) if error.path else "unknown"
+         if error.validator == "required":
+             missing_required.append(error.message)
+         elif error.validator == "type":
+             incorrect_types.append(f"{field}: {error.message}")
+         elif error.validator == "enum":
+             invalid_enum.append(f"{field}: {error.message}")
+         else:
+             other_errors.append(f"{field}: {error.message}")
+
+     if missing_required:
+         errors["missing_required_parameter"] = (
+             "Missing required parameter(s): " + "; ".join(missing_required)
+         )
+     if incorrect_types:
+         errors["incorrect_parameter_type"] = (
+             "Incorrect parameter type(s): " + "; ".join(incorrect_types)
+         )
+     if invalid_enum:
+         errors["allowed_values_violation"] = "Invalid parameter value(s): " + "; ".join(
+             invalid_enum
+         )
+     if other_errors:
+         errors["json_schema_validation"] = "Other validation error(s): " + "; ".join(
+             other_errors
+         )
+
+     return errors
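
evaluate_static funnels every mismatch between a ToolCall and its ToolSpec through jsonschema's Draft7Validator and then buckets the validation errors under the names in _STATIC_CHECKS. The self-contained sketch below reproduces that bucketing with plain dicts; the weather-style schema and arguments are invented for illustration and are not taken from the package.

    from jsonschema import Draft7Validator

    # A toy parameters schema and a deliberately bad set of call arguments.
    params_schema = {
        "type": "object",
        "properties": {
            "city": {"type": "string"},
            "units": {"type": "string", "enum": ["metric", "imperial"]},
        },
        "required": ["city"],
    }
    parsed_arguments = {"units": "kelvin"}  # missing "city", value outside the enum

    for error in Draft7Validator(params_schema).iter_errors(parsed_arguments):
        # Same field / validator routing as _check_tool_call above.
        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
        if error.validator == "required":
            bucket = "missing_required_parameter"
        elif error.validator == "type":
            bucket = "incorrect_parameter_type"
        elif error.validator == "enum":
            bucket = "allowed_values_violation"
        else:
            bucket = "json_schema_validation"
        print(f"{bucket}: {field}: {error.message}")

    # Expected output (order may vary):
    #   missing_required_parameter: unknown: 'city' is a required property
    #   allowed_values_violation: units: 'kelvin' is not one of ['metric', 'imperial']

Non-existent function names and unknown parameters are caught before this step by direct lookups against the ToolSpec list, so only schema-level violations reach the Draft7Validator pass.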