ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py
@@ -0,0 +1,465 @@
+import json
+import math
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
+    GeneralMetricsPrompt,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
+    FunctionSelectionPrompt,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.loader import (
+    PromptKind,
+    load_prompts_from_list,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.adapters import (
+    BaseAdapter,
+    OpenAIAdapter,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.transformation_prompts import (
+    GENERATE_CODE_SCHEMA,
+    GENERATE_CODE_SYSTEM,
+    GENERATE_CODE_USER,
+    MULTI_EXTRACT_UNITS_SYSTEM,
+    MULTI_EXTRACT_UNITS_USER,
+    build_multi_extract_units_schema,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+    SemanticCategoryResult,
+    SemanticResult,
+    ToolCall,
+    ToolSpec,
+    TransformResult,
+)
+from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
+    MetricRunner,
+    MetricRunResult,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)
+
+
+class SemanticChecker:
+    """
+    Orchestrates semantic metrics (and optional unit transforms)
+    for a single function call.
+
+    Args:
+        general_metrics: JSON-schema dicts for general metrics.
+        function_metrics: JSON-schema dicts for function-selection metrics.
+        parameter_metrics: JSON-schema dicts for parameter-level metrics.
+        metrics_client: a WatsonXProvider instance for metric evaluation.
+        codegen_client: a WatsonXProvider instance for transformation codegen.
+        transform_enabled: whether to run unit-conversion checks.
+    """
+
+    def __init__(
+        self,
+        metrics_client: WatsonXProvider,
+        *,
+        general_metrics: Optional[List[Dict[str, Any]]] = None,
+        function_metrics: Optional[List[Dict[str, Any]]] = None,
+        parameter_metrics: Optional[List[Dict[str, Any]]] = None,
+        codegen_client: Optional[WatsonXProvider] = None,
+        transform_enabled: Optional[bool] = False,
+    ) -> None:
+        self.metrics_client = metrics_client
+
+        self.transform_enabled = transform_enabled
+        self.codegen_client = codegen_client
+
+        self.general_prompts = []
+        if general_metrics is not None:
+            self.general_prompts = load_prompts_from_list(
+                general_metrics, PromptKind.GENERAL
+            )
+
+        self.function_prompts = []
+        if function_metrics is not None:
+            self.function_prompts = load_prompts_from_list(
+                function_metrics, PromptKind.FUNCTION_SELECTION
+            )
+
+        self.parameter_prompts = []
+        if parameter_metrics is not None:
+            self.parameter_prompts = load_prompts_from_list(
+                parameter_metrics, PromptKind.PARAMETER
+            )
+
+    def _make_adapter(self, apis_specs, tool_call):
+        first = apis_specs[0]
+        if isinstance(first, ToolSpec):
+            return OpenAIAdapter(apis_specs, tool_call)
+        raise TypeError("Unsupported spec type")
+
+    def _collect_params(self, adapter: BaseAdapter) -> Dict[str, Any]:
+        """
+        Return a mapping of every parameter name in the spec inventory
+        to its value from the call (or defaulted if missing).
+        """
+        call_args = adapter.get_parameters()
+        merged: Dict[str, Any] = {}
+        # Find the function in the inventory
+        function_parameters = (
+            adapter.get_tool_spec(adapter.get_function_name())
+            .get("parameters", {})
+            .get("properties", {})
+        )
+
+        for pname, pschema in function_parameters.items():
+            if pname in call_args:
+                merged[pname] = call_args[pname]
+            elif "default" in pschema:
+                merged[pname] = pschema["default"]
+            else:
+                merged[pname] = (
+                    f"Default value from parameter description (if defined): '{pschema.get('description', 'No description provided')}'"
+                    f" Otherwise, by the default value of type: {pschema.get('type', 'object')}"
+                )
+        return merged
+
+    def extract_all_units_sync(
+        self,
+        context: Union[str, List[Dict[str, str]]],
+        adapter: BaseAdapter,
+        params: List[str],
+        retries: int = 1,
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Synchronously extract user_value/user_units_or_format/spec_units_or_format
+        for every parameter in `params` by issuing a single LLM call.
+        Returns a dict mapping each parameter name to its classification object.
+        """
+        # Build the combined JSON Schema requiring one object per parameter
+        multi_schema = build_multi_extract_units_schema(params)
+        schema_str = json.dumps(multi_schema, indent=2)
+
+        # Build the "full_spec" JSON Schema snippet for all parameters
+        full_spec_json = json.dumps(
+            adapter.get_tool_spec(adapter.get_function_name()).model_dump(),
+            indent=2,
+        )
+
+        # Format system and user prompts
+        system_prompt = MULTI_EXTRACT_UNITS_SYSTEM.format(schema=schema_str)
+        user_prompt = MULTI_EXTRACT_UNITS_USER.format(
+            context=context,
+            full_spec=full_spec_json,
+            parameter_names=", ".join(params),
+        )
+
+        # Single synchronous LLM call; on failure, fall back to empty
+        # classifications so the downstream transform checks are skipped
+        try:
+            response: Dict[str, Any] = self.metrics_client.generate(
+                prompt=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                schema=multi_schema,
+                retries=retries,
+            )
+        except Exception:
+            response = {
+                pname: {
+                    "user_value": None,
+                    "user_units_or_format": None,
+                    "spec_units_or_format": None,
+                }
+                for pname in params
+            }
+
+        return response
+
+    def run_sync(
+        self,
+        apis_specs: List[ToolSpec],
+        tool_call: ToolCall,
+        context: Union[str, List[Dict[str, str]]],
+        retries: int = 1,
+        transform_enabled: Optional[bool] = None,
+    ) -> SemanticResult:
+        """
+        Synchronous semantic-only evaluation.
+
+        Returns a SemanticResult:
+            {
+                "general": {metric_name: result, …} or None,
+                "function_selection": {…} or None,
+                "parameter": {param_name: {metric_name: result}, …} or None,
+                "transform": {param_name: TransformResult, …} or None,
+            }
+        """
+        # 1) Normalize via adapter
+        adapter = self._make_adapter(apis_specs, tool_call)
+        tools_inventory_summary = adapter.get_tools_inventory_summary()
+        call_dict = adapter.get_call_dict()
+        fn_name = adapter.get_function_name()
+        cur_tool_spec = adapter.get_tool_spec(fn_name)
+        params = self._collect_params(adapter)
+
+        # Apply the per-call override, remembering the instance-level flag
+        if transform_enabled is not None:
+            old_transform_enabled = self.transform_enabled
+            self.transform_enabled = transform_enabled
+
+        # 2) GENERAL METRICS
+        general_results: Optional[SemanticCategoryResult]
+        entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = []
+        for prompt in self.general_prompts:
+            entries.append(
+                (
+                    prompt,
+                    {
+                        "conversation_context": context,
+                        "tool_inventory": cur_tool_spec,
+                        "tool_call": call_dict,
+                    },
+                )
+            )
+        if entries:
+            try:
+                runner = MetricRunner(entries)
+                sync_results = runner.run_all(
+                    self.metrics_client.generate,
+                    prompt_param_name="prompt",
+                    schema_param_name="schema",
+                    retries=retries,
+                )
+                general_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
+            except Exception as e:
+                general_results = {"error": str(e)}
+        else:
+            general_results = None
+
+        # 3) FUNCTION-SELECTION METRICS
+        function_results: Optional[SemanticCategoryResult]
+        func_entries: List[Tuple[FunctionSelectionPrompt, Dict[str, Any]]] = []
+        for prompt in self.function_prompts:
+            func_entries.append(
+                (
+                    prompt,
+                    {
+                        "conversation_context": context,
+                        "tools_inventory": tools_inventory_summary,
+                        "proposed_tool_call": call_dict,
+                        "selected_function": fn_name,
+                    },
+                )
+            )
+        if func_entries:
+            try:
+                runner = MetricRunner(func_entries)
+                sync_results = runner.run_all(
+                    self.metrics_client.generate,
+                    prompt_param_name="prompt",
+                    schema_param_name="schema",
+                    retries=retries,
+                )
+                function_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
+            except Exception as e:
+                function_results = {"error": str(e)}
+        else:
+            function_results = None
+
+        # 4) PARAMETER-LEVEL METRICS
+        parameter_results: Optional[Dict[str, SemanticCategoryResult]] = {}
+        for pname, pval in params.items():
+            # Each parameter has its own prompts
+            try:
+                # "ParameterMetricsPrompt" is a forward reference: the concrete
+                # prompt class is produced by load_prompts_from_list above.
+                param_entries: List[
+                    Tuple["ParameterMetricsPrompt", Dict[str, Any]]
+                ] = []
+                for prompt in self.parameter_prompts:
+                    param_entries.append(
+                        (
+                            prompt,
+                            {
+                                "conversation_context": context,
+                                "tool_inventory": cur_tool_spec,
+                                "tool_call": call_dict,
+                                "parameter_name": pname,
+                                "parameter_value": pval,
+                            },
+                        )
+                    )
+                runner = MetricRunner(param_entries)
+                sync_results = runner.run_all(
+                    self.metrics_client.generate,
+                    prompt_param_name="prompt",
+                    schema_param_name="schema",
+                    retries=retries,
+                )
+                parameter_results[pname] = SemanticCategoryResult.from_results(
+                    sync_results
+                )
+            except Exception as e:
+                parameter_results[pname] = {"error": str(e)}
+
+        if not parameter_results:
+            parameter_results = None
+
+        # Base SemanticResult without transforms
+        result = SemanticResult(
+            general=general_results,
+            function_selection=function_results,
+            parameter=parameter_results,
+        )
+
+        # 5) OPTIONAL TRANSFORMS
+        # Transforms consider only the arguments explicitly present in the call
+        params = adapter.get_parameters()
+        if self.transform_enabled and params:
+            transform_out: Dict[str, TransformResult] = {}
+
+            # 5a) Extract units for all parameters in one synchronous call
+            units_map = self.extract_all_units_sync(
+                context=context,
+                adapter=adapter,
+                params=list(params.keys()),
+                retries=retries,
+            )
+
+            # 5b) Generate code & execute for each parameter needing conversion
+            for pname, units in units_map.items():
+                user_units = units.get("user_units_or_format") or ""
+                spec_units = units.get("spec_units_or_format") or ""
+                user_value = units.get("user_value")
+                transformation_summary = units.get("transformation_summary", "")
+                gen_code = ""
+
+                # Only generate code if user_units differs from spec_units and user_value is present
+                if (
+                    user_units
+                    and user_value is not None
+                    and spec_units
+                    and (user_units != spec_units)
+                ):
+                    try:
+                        prompt = GENERATE_CODE_USER.format(
+                            old_value=user_value,
+                            old_units=user_units,
+                            transformed_value=str(params[pname]),
+                            transformed_units=spec_units,
+                            transformed_type=type(params[pname]).__name__,
+                            transformation_summary=transformation_summary,
+                        )
+                        gen_code = self.codegen_client.generate(
+                            prompt=[
+                                {
+                                    "role": "system",
+                                    "content": GENERATE_CODE_SYSTEM,
+                                },
+                                {"role": "user", "content": prompt},
+                            ],
+                            schema=GENERATE_CODE_SCHEMA,
+                            retries=retries,
+                        ).get("generated_code", "")
+                    except Exception:
+                        gen_code = ""
+
+                # 5c) Execute & validate
+                tr = self._execute_code_and_validate(
+                    code=gen_code,
+                    user_val=str(user_value or ""),
+                    api_val=str(params[pname]),
+                    units=units,
+                )
+                transform_out[pname] = tr
+
+            result.transform = transform_out if transform_out else None
+
+        # Restore the instance-level flag if it was overridden for this call
+        if transform_enabled is not None:
+            self.transform_enabled = old_transform_enabled
+
+        return result
+
+    def _execute_code_and_validate(
+        self,
+        code: str,
+        user_val: str,
+        api_val: str,
+        units: Dict[str, Any],
+    ) -> TransformResult:
+        """
+        Strip code fences, import required modules, exec the generated code,
+        compare the two conversion outputs, and return a TransformResult.
+        """
+        clean = re.sub(
+            r"^```(?:python)?|```$", "", code, flags=re.MULTILINE
+        ).strip()
+
+        # Import every module the generated code references; a missing module
+        # is reported rather than treated as an incorrect transformation
+        for mod in set(
+            re.findall(
+                r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE
+            )
+        ):
+            try:
+                __import__(mod)
+            except ImportError as e:
+                return TransformResult(
+                    units=units,
+                    generated_code=clean,
+                    execution_success=False,
+                    correct=True,
+                    execution_output=None,
+                    correction=None,
+                    error=f"Error: {e}. Could not import module '{mod}'. Please install the package and try again,"
+                    " or run the generated code manually:\n"
+                    f"transformation_code({user_val}) == convert_example_str_transformed_to_transformed_type({api_val})",
+                )
+
+        ns: Dict[str, Any] = {}
+        try:
+            exec(clean, ns)
+            fn_t = ns.get("transformation_code")
+            fn_c = ns.get("convert_example_str_transformed_to_transformed_type")
+            if not callable(fn_t) or not callable(fn_c):
+                raise ValueError("Generated code missing required functions")
+
+            out_t = fn_t(user_val)
+            out_c = fn_c(api_val)
+            if isinstance(out_t, (int, float)) and isinstance(
+                out_c, (int, float)
+            ):
+                success = math.isclose(out_t, out_c, abs_tol=1e-3)
+            else:
+                success = str(out_t) == str(out_c)
+
+            correction = None
+            if not success:
+                correction = (
+                    f"The transformation code validation found an issue with the units transformation "
+                    f"of the parameter.\n"
+                    f"The user request value is '{user_val}' with units '{units.get('user_units_or_format')}' and "
+                    f"the API call value is '{api_val}' with units '{units.get('spec_units_or_format')}'.\n"
+                    f"Expected transformation is '{out_t}' based on the code.\n"
+                )
+
+            correct = correction is None
+
+            return TransformResult(
+                units=units,
+                generated_code=clean,
+                execution_success=True,
+                correct=correct,
+                execution_output={"transformed": out_t, "converted": out_c},
+                correction=correction,
+                error=None,
+            )
+        except Exception as e:
+            # Execution failures are reported but treated as inconclusive
+            # (correct=True) rather than flagged as a wrong transformation
+            return TransformResult(
+                units=units,
+                generated_code=clean,
+                execution_success=False,
+                correct=True,
+                execution_output=None,
+                correction=None,
+                error=str(e),
+            )
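
Taken together, SemanticChecker.run_sync normalizes the call through an adapter, fans the general, function-selection, and parameter prompts out through MetricRunner, and optionally checks unit conversions by generating and executing throwaway Python. A minimal usage sketch follows; the WatsonXProvider constructor arguments and the spec/call objects are illustrative assumptions, not values shipped in this package:

    # Hedged sketch: provider setup and payloads below are assumptions.
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
        SemanticChecker,
    )
    from wxo_agentic_evaluation.service_provider.watsonx_provider import (
        WatsonXProvider,
    )

    metrics_client = WatsonXProvider(...)  # constructor args depend on your watsonx setup
    codegen_client = WatsonXProvider(...)  # possibly a different, code-oriented model

    general_metrics = [...]    # metric JSON-schema dicts, e.g. from general_metrics.json
    function_metrics = [...]   # e.g. from function_selection_metrics.json
    parameter_metrics = [...]

    checker = SemanticChecker(
        metrics_client=metrics_client,
        general_metrics=general_metrics,
        function_metrics=function_metrics,
        parameter_metrics=parameter_metrics,
        codegen_client=codegen_client,
        transform_enabled=True,
    )

    result = checker.run_sync(
        apis_specs=specs,   # List[ToolSpec] describing the available tools
        tool_call=call,     # ToolCall proposed by the agent
        context="Convert 5 miles to kilometers before calling the tool.",
        retries=2,
    )
    # result.general / result.function_selection / result.parameter hold
    # SemanticCategoryResult objects; result.transform maps parameter names
    # to TransformResult entries when the transform pass runs.

The transform pass relies on a fixed contract with the codegen model: the generated snippet must define transformation_code(user_val) and convert_example_str_transformed_to_transformed_type(api_val), whose outputs are compared with math.isclose for numbers and string equality otherwise.
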
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py
@@ -0,0 +1,162 @@
+from typing import Dict, List
+
+from jsonschema import Draft7Validator
+
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+    StaticMetricResult,
+    StaticResult,
+    ToolCall,
+    ToolSpec,
+)
+
+# ----------------------------------------
+# Human-readable descriptions for checks
+# ----------------------------------------
+_STATIC_CHECKS: Dict[str, str] = {
+    "non_existent_function": "Function name not found in the provided API specification.",
+    "non_existent_parameter": "One or more parameters are not defined for the specified function.",
+    "incorrect_parameter_type": "One or more parameters have values whose types don't match the expected types.",
+    "missing_required_parameter": "One or more required parameters are missing from the call.",
+    "allowed_values_violation": "One or more parameters have values outside the allowed enumeration.",
+    "json_schema_validation": "The API call does not conform to the provided JSON Schema.",
+    "empty_api_spec": "There are no API specifications provided or they are invalid.",
+    "invalid_api_spec": "The API specifications provided are not valid ToolSpec instances.",
+    "invalid_tool_call": "The provided ToolCall is not a valid instance of ToolCall.",
+}
+
+
+def evaluate_static(
+    apis_specs: List[ToolSpec], api_call: ToolCall
+) -> StaticResult:
+    """
+    Perform static validation on a single tool call.
+
+    Args:
+        apis_specs: Non-empty list of ToolSpec instances (OpenAI specs for the call).
+        api_call: Single call to validate: a ToolCall instance (OpenAI tool call).
+
+    Returns:
+        StaticResult(metrics=..., final_decision=bool)
+    """
+    if not isinstance(apis_specs, list) or not apis_specs:
+        return StaticResult(
+            metrics={
+                "empty_api_spec": StaticMetricResult(
+                    description=_STATIC_CHECKS["empty_api_spec"],
+                    valid=False,
+                    explanation="No API specifications provided.",
+                )
+            },
+            final_decision=False,
+        )
+
+    if not all(isinstance(spec, ToolSpec) for spec in apis_specs):
+        return StaticResult(
+            metrics={
+                "invalid_api_spec": StaticMetricResult(
+                    description=_STATIC_CHECKS["invalid_api_spec"],
+                    valid=False,
+                    explanation="Invalid API specifications provided; expected ToolSpec instances (List[ToolSpec]).",
+                )
+            },
+            final_decision=False,
+        )
+
+    if not isinstance(api_call, ToolCall):
+        return StaticResult(
+            metrics={
+                "invalid_tool_call": StaticMetricResult(
+                    description=_STATIC_CHECKS["invalid_tool_call"],
+                    valid=False,
+                    explanation="Invalid ToolCall provided; expected a ToolCall instance.",
+                )
+            },
+            final_decision=False,
+        )
+
+    errors = _check_tool_call(specs=apis_specs, call=api_call)
+
+    # Build metric results: a check with no recorded error is valid
+    metrics: Dict[str, StaticMetricResult] = {}
+    for check_name, desc in _STATIC_CHECKS.items():
+        valid = check_name not in errors
+        metrics[check_name] = StaticMetricResult(
+            description=desc,
+            valid=valid,
+            explanation=None if valid else errors.get(check_name),
+        )
+    final_decision = all(m.valid for m in metrics.values())
+    return StaticResult(metrics=metrics, final_decision=final_decision)
+
+
+def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
+    """
+    Static checks for an OpenAI ToolCall against a ToolSpec list.
+    Returns a mapping of failed check keys -> explanation.
+    """
+    errors: Dict[str, str] = {}
+
+    # 1) Function existence
+    spec = next(
+        (s for s in specs if s.function.name == call.function.name), None
+    )
+    if not spec:
+        errors["non_existent_function"] = (
+            f"Function '{call.function.name}' does not exist in the provided API specifications:"
+            f" {', '.join(s.function.name for s in specs)}."
+        )
+        return errors
+
+    params_schema = spec.function.parameters
+    # Fall back to the schema itself if there is no "properties" wrapper
+    properties = params_schema.get("properties", params_schema)
+    parsed_arguments = call.function.parsed_arguments
+
+    # 2) Parameter existence check
+    if non_existent_params := set(parsed_arguments.keys()) - set(
+        properties.keys()
+    ):
+        errors["non_existent_parameter"] = (
+            f"Parameters not defined in function '{call.function.name}': "
+            f"{', '.join(sorted(non_existent_params))}. "
+            f"Possible parameters are: {', '.join(sorted(properties.keys()))}."
+        )
+
+    # 3) JSON Schema validation, bucketed by validator type
+    validator = Draft7Validator(params_schema)
+
+    missing_required = []
+    incorrect_types = []
+    invalid_enum = []
+    other_errors = []
+
+    for error in validator.iter_errors(parsed_arguments):
+        field = (
+            ".".join(str(x) for x in error.path) if error.path else "unknown"
+        )
+        if error.validator == "required":
+            missing_required.append(error.message)
+        elif error.validator == "type":
+            incorrect_types.append(f"{field}: {error.message}")
+        elif error.validator == "enum":
+            invalid_enum.append(f"{field}: {error.message}")
+        else:
+            other_errors.append(f"{field}: {error.message}")
+
+    if missing_required:
+        errors["missing_required_parameter"] = (
+            "Missing required parameter(s): " + "; ".join(missing_required)
+        )
+    if incorrect_types:
+        errors["incorrect_parameter_type"] = (
+            "Incorrect parameter type(s): " + "; ".join(incorrect_types)
+        )
+    if invalid_enum:
+        errors["allowed_values_violation"] = (
+            "Invalid parameter value(s): " + "; ".join(invalid_enum)
+        )
+    if other_errors:
+        errors["json_schema_validation"] = (
+            "Other validation error(s): " + "; ".join(other_errors)
+        )
+
+    return errors
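
evaluate_static needs no model at all: it validates the call shape against the ToolSpec list, buckets each jsonschema validation error into a named check, and ANDs the per-check valid flags into final_decision. A hedged sketch, assuming ToolSpec and ToolCall are pydantic models that accept the OpenAI-style payloads shown and that ToolCall exposes function.parsed_arguments as a dict:

    # Hedged sketch: the exact ToolSpec/ToolCall construction is an assumption.
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.static_checker import (
        evaluate_static,
    )
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
        ToolCall,
        ToolSpec,
    )

    spec = ToolSpec.model_validate({
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    "units": {"type": "string", "enum": ["metric", "imperial"]},
                },
                "required": ["city"],
            },
        },
    })
    call = ToolCall.model_validate({
        "function": {
            "name": "get_weather",
            "arguments": '{"city": "Austin", "units": "kelvin"}',
        }
    })

    result = evaluate_static([spec], call)
    # "kelvin" is outside the enum, so metrics["allowed_values_violation"].valid
    # is False and final_decision is False; the other checks remain valid.
    for name, metric in result.metrics.items():
        if not metric.valid:
            print(name, "->", metric.explanation)
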