ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,31 @@
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-    get_wxo_client,
     WXOInferenceBackend,
+    get_wxo_client,
+)
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
 )
-from wxo_agentic_evaluation.type import AttackData
-
-from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import AttackData
 from wxo_agentic_evaluation.utils import json_dump

-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-from rich.progress import Progress
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI
-

 def process_attack(task_n, attack_path, config, inference_backend, llm_user):
     tc_name = os.path.basename(attack_path).replace(".json", "")
@@ -31,7 +33,9 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         attack: AttackData = AttackData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running attack: {tc_name}[/bold magenta]")
     history, _, _ = evaluation_controller.run(
@@ -46,7 +50,8 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     return result
@@ -55,19 +60,26 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
 def run_attacks(config: AttackConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     wxo_client = get_wxo_client(
-        config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
         wai_client=get_provider(
-            config=config.provider_config, model_id=config.llm_user_config.model_id
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
         ),
-        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
         user_response_style=config.llm_user_config.user_response_style,
     )

-    print(f"Running red teaming attacks with tenant {config.auth_config.tenant_name}")
+    print(
+        f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
+    )
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

     results_list = []
@@ -81,7 +93,9 @@ def run_attacks(config: AttackConfig):
     task_n = 0

     for attack_path in attack_paths:
-        if not attack_path.endswith(".json") or attack_path.endswith("agent.json"):
+        if not attack_path.endswith(".json") or attack_path.endswith(
+            "agent.json"
+        ):
             continue

         future = executor.submit(
@@ -118,7 +132,9 @@ def run_attacks(config: AttackConfig):
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-    with open(os.path.join(config.output_dir, "attacks_results.json"), "w") as f:
+    with open(
+        os.path.join(config.output_dir, "attacks_results.json"), "w"
+    ) as f:
         json.dump(attack_results, f, indent=2)

     print(f"Attack results saved to {config.output_dir}")
@@ -1,7 +1,9 @@
 from abc import ABC

 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)


 class FunctionMetricsPrompt(MetricPrompt, ABC):
@@ -1,14 +1,7 @@
 import json
 from enum import Enum
 from pathlib import Path
-from typing import (
-    Any,
-    Dict,
-    Iterable,
-    List,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, Iterable, List, Tuple, Union

 from pydantic import ValidationError

@@ -18,7 +11,10 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function
 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
     FunctionSelectionPrompt,
 )
-from wxo_agentic_evaluation.referenceless_eval.metrics import Metric, MetricPrompt
+from wxo_agentic_evaluation.referenceless_eval.metrics import (
+    Metric,
+    MetricPrompt,
+)

 PromptType = Union[
     GeneralMetricsPrompt,
@@ -70,7 +66,9 @@ def load_prompts_from_jsonl(
         raise LoaderError(f"File not found: {path}")

     prompts: List[PromptType] = []
-    for lineno, raw in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
+    for lineno, raw in enumerate(
+        p.read_text(encoding="utf-8").splitlines(), start=1
+    ):
         if not raw.strip():
             continue
         try:
@@ -96,7 +94,9 @@ def load_prompts_from_jsonl(
         # Instantiate prompt
         prompt: MetricPrompt
         try:
-            prompt = PromptCls(metric=metric, task_description=metric.description)
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -158,7 +158,9 @@ def load_prompts_from_list(
             raise LoaderError(f"Record {idx} invalid schema: {e}") from e

         try:
-            prompt = PromptCls(metric=metric, task_description=rec["task_description"])
+            prompt = PromptCls(
+                metric=metric, task_description=rec["task_description"]
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -167,11 +169,15 @@ def load_prompts_from_list(
                 user_kwargs = ex["user_kwargs"]
                 output = ex["output"]
             except KeyError as e:
-                raise LoaderError(f"Record {idx}, example {ex_idx} missing {e}") from e
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} missing {e}"
+                ) from e
             try:
                 prompt.add_example(user_kwargs, output)
             except (ValidationError, ValueError) as e:
-                raise LoaderError(f"Record {idx}, example {ex_idx} invalid: {e}") from e
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} invalid: {e}"
+                ) from e

         prompts.append(prompt)

@@ -211,7 +217,9 @@ def load_prompts_from_metrics(

         # Instantiate prompt with the metric's description as task_description
         try:
-            prompt = PromptCls(metric=metric, task_description=metric.description)
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             # Fallback if constructor signature differs
             prompt = PromptCls(metric=metric)
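The loader.py hunks are also formatting-only, but they all wrap the same idiom: instantiate the prompt class with a task_description when its constructor accepts one, and fall back to the bare metric-only constructor on TypeError. A minimal sketch of that fallback, using hypothetical Metric, BasicPrompt, and DescribedPrompt stand-ins rather than the package's actual classes:

from dataclasses import dataclass


@dataclass
class Metric:  # hypothetical stand-in for the package's Metric model
    name: str
    description: str


class BasicPrompt:  # constructor accepts only the metric
    def __init__(self, metric: Metric):
        self.metric = metric


class DescribedPrompt(BasicPrompt):  # also accepts a task description
    def __init__(self, metric: Metric, task_description: str):
        super().__init__(metric)
        self.task_description = task_description


def build_prompt(PromptCls, metric: Metric):
    # Same fallback as in loader.py: prefer the richer signature,
    # degrade gracefully when the class does not take task_description.
    try:
        return PromptCls(metric=metric, task_description=metric.description)
    except TypeError:
        return PromptCls(metric=metric)


m = Metric(name="function_selection", description="Judge whether the right tool was chosen.")
assert isinstance(build_prompt(DescribedPrompt, m), DescribedPrompt)
assert isinstance(build_prompt(BasicPrompt, m), BasicPrompt)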
@@ -1,8 +1,4 @@
-from typing import (
-    Any,
-    Dict,
-    List,
-)
+from typing import Any, Dict, List

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     ToolCall,
@@ -65,7 +61,9 @@ class OpenAIAdapter(BaseAdapter):
         ]

     def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
-        tool = next((t for t in self.specs if t.function.name == tool_name), None)
+        tool = next(
+            (t for t in self.specs if t.function.name == tool_name), None
+        )
         return tool.function.model_dump() if tool else {}

     def get_call_dict(self) -> Dict[str, Any]:
@@ -87,11 +85,18 @@ class OpenAIAdapter(BaseAdapter):

     def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
         spec = next(
-            (s for s in self.specs if s.function.name == self.get_function_name()), None
+            (
+                s
+                for s in self.specs
+                if s.function.name == self.get_function_name()
+            ),
+            None,
         )
         if not spec:
             return {"type": "object", "properties": {}, "required": []}
-        props = spec.function.parameters.get("properties", spec.function.parameters)
+        props = spec.function.parameters.get(
+            "properties", spec.function.parameters
+        )
         if param_name not in props:
             return {"type": "object", "properties": {}, "required": []}
         return {
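adapters.py keeps its next(..., None) lookups, just reflowed. The same lookup and parameter-snippet extraction can be sketched with plain dicts standing in for the package's tool-spec objects; get_tool_spec and get_param_spec_snippet below are illustrative re-implementations over that assumed dict shape, not the adapter's methods:

from typing import Any, Dict, List

# Hypothetical, minimal shape of an OpenAI-style tool spec (plain dicts here,
# where the package works with spec objects exposing a .function attribute).
SPECS: List[Dict[str, Any]] = [
    {
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string", "description": "City name"}},
            "required": ["city"],
        },
    }
]


def get_tool_spec(specs: List[Dict[str, Any]], tool_name: str) -> Dict[str, Any]:
    # Same next(..., None) lookup pattern as in the adapter above.
    tool = next((t for t in specs if t["name"] == tool_name), None)
    return tool if tool else {}


def get_param_spec_snippet(specs, tool_name: str, param_name: str) -> Dict[str, Any]:
    # Return a one-parameter schema snippet, or an empty object schema
    # when the tool or parameter is unknown.
    spec = get_tool_spec(specs, tool_name)
    props = spec.get("parameters", {}).get("properties", {})
    if param_name not in props:
        return {"type": "object", "properties": {}, "required": []}
    return {"type": "object", "properties": {param_name: props[param_name]}, "required": []}


print(get_param_spec_snippet(SPECS, "get_weather", "city"))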
@@ -1,8 +1,9 @@
-import json
 import importlib.resources
+import json
 from pathlib import Path
 from typing import Dict, List, Optional, Union

+from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
     SemanticChecker,
 )
@@ -19,13 +20,16 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
-from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import LLMKitWrapper
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    LLMKitWrapper,
+)
+

 def metrics_dir():
     path = importlib.resources.files(metrics)
     return path

+
 # Default metric JSON paths
 _METRICS_DIR = metrics_dir()
 _DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"
@@ -36,10 +40,14 @@ _DEFAULT_FUNCSEL = (
     _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
 )
 _DEFAULT_FUNCSEL_RUNTIME = (
-    _METRICS_DIR / "function_selection" / "function_selection_metrics_runtime.json"
+    _METRICS_DIR
+    / "function_selection"
+    / "function_selection_metrics_runtime.json"
 )
 _DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
-_DEFAULT_PARAM_RUNTIME = _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+_DEFAULT_PARAM_RUNTIME = (
+    _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+)


 class ReflectionPipeline:
@@ -88,11 +96,19 @@ class ReflectionPipeline:
         for metrics, default_path in [
             (
                 self.general_metrics,
-                _DEFAULT_GENERAL_RUNTIME if runtime_pipeline else _DEFAULT_GENERAL,
+                (
+                    _DEFAULT_GENERAL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_GENERAL
+                ),
             ),
             (
                 self.function_metrics,
-                _DEFAULT_FUNCSEL_RUNTIME if runtime_pipeline else _DEFAULT_FUNCSEL,
+                (
+                    _DEFAULT_FUNCSEL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_FUNCSEL
+                ),
             ),
             (
                 self.parameter_metrics,
@@ -104,7 +120,9 @@ class ReflectionPipeline:
                 continue

             # Handle metric names list
-            if isinstance(metrics, list) and all(isinstance(x, str) for x in metrics):
+            if isinstance(metrics, list) and all(
+                isinstance(x, str) for x in metrics
+            ):
                 # Load the default JSON file
                 if not default_path.is_file():
                     raise FileNotFoundError(
@@ -116,7 +134,9 @@ class ReflectionPipeline:

                 # Filter metrics by name
                 filtered_metrics = [
-                    metric for metric in all_metrics if metric.get("name") in metrics
+                    metric
+                    for metric in all_metrics
+                    if metric.get("name") in metrics
                 ]

                 # Remove examples from prompts if requested
@@ -125,7 +145,9 @@ class ReflectionPipeline:
                         metric.pop("examples", None)

                 if len(filtered_metrics) != len(metrics):
-                    found_names = {metric.get("name") for metric in filtered_metrics}
+                    found_names = {
+                        metric.get("name") for metric in filtered_metrics
+                    }
                     missing = set(metrics) - found_names
                     raise ValueError(f"Metrics not found: {missing}")

@@ -140,14 +162,20 @@ class ReflectionPipeline:
             if isinstance(metrics, list) and all(
                 isinstance(x, FunctionCallMetric) for x in metrics
             ):
-                metrics_definitions.append([metric.model_dump() for metric in metrics])
+                metrics_definitions.append(
+                    [metric.model_dump() for metric in metrics]
+                )
             else:
                 if not metrics.is_file():
-                    raise FileNotFoundError(f"Metrics file not found: {metrics}")
+                    raise FileNotFoundError(
+                        f"Metrics file not found: {metrics}"
+                    )
                 metrics_definitions.append(
                     [
                         json.loads(json_obj)
-                        for json_obj in metrics.read_text(encoding="utf8").splitlines()
+                        for json_obj in metrics.read_text(
+                            encoding="utf8"
+                        ).splitlines()
                         if json_obj.strip()
                     ]
                 )
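Within pipeline.py, the reflowed block is the metric-selection step of ReflectionPipeline: a metrics file is loaded, the definitions are filtered by the requested names, and any name that cannot be found raises a ValueError instead of being silently dropped. A self-contained sketch of that step; filter_metrics_by_name is a hypothetical helper, and the one-JSON-object-per-line layout is assumed from the read_text().splitlines() loop above:

import json
from pathlib import Path
from typing import Any, Dict, List


def filter_metrics_by_name(metrics_path: Path, wanted: List[str]) -> List[Dict[str, Any]]:
    """Load a JSONL metrics file and keep only the requested metric names.

    Mirrors the selection logic visible in the hunk above: a missing file is
    a FileNotFoundError, and unknown metric names raise a ValueError listing
    exactly which names were not found.
    """
    if not metrics_path.is_file():
        raise FileNotFoundError(f"Metrics file not found: {metrics_path}")
    all_metrics = [
        json.loads(line)
        for line in metrics_path.read_text(encoding="utf8").splitlines()
        if line.strip()
    ]
    filtered = [m for m in all_metrics if m.get("name") in wanted]
    if len(filtered) != len(wanted):
        missing = set(wanted) - {m.get("name") for m in filtered}
        raise ValueError(f"Metrics not found: {missing}")
    return filtered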
@@ -1,14 +1,7 @@
 import json
 import math
 import re
-from typing import (
-    Any,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, List, Optional, Tuple, Union

 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
     GeneralMetricsPrompt,
@@ -39,11 +32,13 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolSpec,
     TransformResult,
 )
-from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
     MetricRunner,
     MetricRunResult,
 )
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)


 class SemanticChecker:
@@ -231,7 +226,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                general_results = SemanticCategoryResult.from_results(sync_results)
+                general_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 general_results = {"error": str(e)}
         else:
@@ -261,7 +258,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                function_results = SemanticCategoryResult.from_results(sync_results)
+                function_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 function_results = {"error": str(e)}
         else:
@@ -272,7 +271,9 @@ class SemanticChecker:
         for pname, pval in params.items():
             # Each parameter has its own prompts
             try:
-                param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+                param_entries: List[
+                    Tuple[ParameterMetricsPrompt, Dict[str, Any]]
+                ] = []
                 for prompt in self.parameter_prompts:
                     param_entries.append(
                         (
@@ -351,7 +352,10 @@ class SemanticChecker:
         )
         gen_code = self.codegen_client.generate(
             prompt=[
-                {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                {
+                    "role": "system",
+                    "content": GENERATE_CODE_SYSTEM,
+                },
                 {"role": "user", "content": prompt},
             ],
             schema=GENERATE_CODE_SCHEMA,
@@ -386,11 +390,15 @@ class SemanticChecker:
         """
         Strip code fences, install imports, exec code, compare, return TransformResult.
         """
-        clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()
+        clean = re.sub(
+            r"^```(?:python)?|```$", "", code, flags=re.MULTILINE
+        ).strip()

         # install imports
         for mod in set(
-            re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
+            re.findall(
+                r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE
+            )
         ):
             try:
                 __import__(mod)
@@ -417,7 +425,9 @@ class SemanticChecker:

             out_t = fn_t(user_val)
             out_c = fn_c(api_val)
-            if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
+            if isinstance(out_t, (int, float)) and isinstance(
+                out_c, (int, float)
+            ):
                 success = math.isclose(out_t, out_c, abs_tol=1e-3)
             else:
                 success = str(out_t) == str(out_c)
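The semantic_checker.py hunks wrap, among other things, the transform check: generated code is stripped of Markdown fences, executed, and its outputs are compared with math.isclose for numbers or string equality otherwise. A small sketch of those two pieces, following the logic visible above; strip_code_fences and outputs_match are illustrative names, not the class's methods:

import math
import re


def strip_code_fences(code: str) -> str:
    # Same regex as the transform check: drop leading/trailing ``` fences.
    return re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()


def outputs_match(out_t, out_c) -> bool:
    # Numeric outputs are compared with an absolute tolerance; everything
    # else falls back to string equality, as in the hunk above.
    if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
        return math.isclose(out_t, out_c, abs_tol=1e-3)
    return str(out_t) == str(out_c)


fenced = "```python\ndef f(x):\n    return x * 2\n```"
print(strip_code_fences(fenced))       # the bare function body, fences removed
print(outputs_match(0.30000049, 0.3))  # True: within the 1e-3 tolerance
print(outputs_match("NYC", "nyc"))     # False: strings are compared exactly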
@@ -1,8 +1,6 @@
 from typing import Dict, List

-from jsonschema import (
-    Draft7Validator,
-)
+from jsonschema import Draft7Validator

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     StaticMetricResult,
@@ -27,7 +25,9 @@ _STATIC_CHECKS: Dict[str, str] = {
 }


-def evaluate_static(apis_specs: List[ToolSpec], api_call: ToolCall) -> StaticResult:
+def evaluate_static(
+    apis_specs: List[ToolSpec], api_call: ToolCall
+) -> StaticResult:
     """
     Perform static validation on a single tool call.

@@ -97,7 +97,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     errors: Dict[str, str] = {}

     # 1) Function existence
-    spec = next((s for s in specs if s.function.name == call.function.name), None)
+    spec = next(
+        (s for s in specs if s.function.name == call.function.name), None
+    )
     if not spec:
         errors["non_existent_function"] = (
             f"Function '{call.function.name}' does not exist in the provided API specifications:"
@@ -110,7 +112,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     parsed_arguments = call.function.parsed_arguments

     # 2) Parameter existence check
-    if non_existent_params := set(parsed_arguments.keys()) - set(properties.keys()):
+    if non_existent_params := set(parsed_arguments.keys()) - set(
+        properties.keys()
+    ):
         errors["non_existent_parameter"] = (
             f"Parameters not defined in function '{call.function.name}': "
             f"{', '.join(sorted(non_existent_params))}. "
@@ -126,7 +130,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     other_errors = []

     for error in validator.iter_errors(parsed_arguments):
-        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
+        field = (
+            ".".join(str(x) for x in error.path) if error.path else "unknown"
+        )
         if error.validator == "required":
             missing_required.append(error.message)
         elif error.validator == "type":
@@ -145,12 +151,12 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
             "Incorrect parameter type(s): " + "; ".join(incorrect_types)
         )
     if invalid_enum:
-        errors["allowed_values_violation"] = "Invalid parameter value(s): " + "; ".join(
-            invalid_enum
+        errors["allowed_values_violation"] = (
+            "Invalid parameter value(s): " + "; ".join(invalid_enum)
         )
     if other_errors:
-        errors["json_schema_validation"] = "Other validation error(s): " + "; ".join(
-            other_errors
+        errors["json_schema_validation"] = (
+            "Other validation error(s): " + "; ".join(other_errors)
         )

     return errors
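static_checker.py still classifies jsonschema Draft7Validator errors into missing-required, wrong-type, enum, and other buckets; only the assignment wrapping changed. A runnable sketch of that classification follows; classify_schema_errors and the weather schema are hypothetical, while the bucketing mirrors the loop in _check_tool_call above:

from jsonschema import Draft7Validator

# Hypothetical parameter schema for a weather tool.
WEATHER_SCHEMA = {
    "type": "object",
    "properties": {
        "city": {"type": "string"},
        "units": {"type": "string", "enum": ["metric", "imperial"]},
    },
    "required": ["city"],
}


def classify_schema_errors(schema: dict, arguments: dict) -> dict:
    """Bucket Draft7Validator errors the way _check_tool_call does:
    missing required params, wrong types, enum violations, and the rest."""
    buckets = {"missing_required": [], "incorrect_types": [], "invalid_enum": [], "other": []}
    validator = Draft7Validator(schema)
    for error in validator.iter_errors(arguments):
        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
        if error.validator == "required":
            buckets["missing_required"].append(error.message)
        elif error.validator == "type":
            buckets["incorrect_types"].append(f"{field}: {error.message}")
        elif error.validator == "enum":
            buckets["invalid_enum"].append(f"{field}: {error.message}")
        else:
            buckets["other"].append(f"{field}: {error.message}")
    return buckets


# A wrong type for "city" and an out-of-enum value for "units".
print(classify_schema_errors(WEATHER_SCHEMA, {"city": 42, "units": "kelvin"}))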