ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +9 -3
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation.py +42 -0
  14. wxo_agentic_evaluation/evaluation_package.py +117 -70
  15. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  16. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  17. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  18. wxo_agentic_evaluation/external_agent/types.py +12 -5
  19. wxo_agentic_evaluation/inference_backend.py +183 -79
  20. wxo_agentic_evaluation/llm_matching.py +4 -3
  21. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  22. wxo_agentic_evaluation/llm_user.py +7 -3
  23. wxo_agentic_evaluation/main.py +175 -67
  24. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  25. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  26. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  27. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
  28. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  29. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  30. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  31. wxo_agentic_evaluation/quick_eval.py +49 -23
  32. wxo_agentic_evaluation/record_chat.py +70 -33
  33. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  34. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  35. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  43. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  44. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  46. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  47. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  48. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  49. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  50. wxo_agentic_evaluation/resource_map.py +2 -1
  51. wxo_agentic_evaluation/service_instance.py +103 -21
  52. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  53. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
  54. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  55. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  56. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  57. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  58. wxo_agentic_evaluation/tool_planner.py +128 -44
  59. wxo_agentic_evaluation/type.py +12 -9
  60. wxo_agentic_evaluation/utils/__init__.py +1 -0
  61. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  62. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  63. wxo_agentic_evaluation/utils/utils.py +83 -52
  64. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  65. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
  66. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/red_teaming/attack_generator.py
@@ -1,19 +1,23 @@
+ import ast
  import json
- import random
  import os
- import ast
+ import random
+
  import rich
+ from jsonargparse import CLI

- from wxo_agentic_evaluation.utils.utils import load_agents
- from wxo_agentic_evaluation.red_teaming.attack_list import RED_TEAMING_ATTACKS, print_attacks
- from wxo_agentic_evaluation.type import AttackCategory
+ from wxo_agentic_evaluation.arg_configs import AttackGeneratorConfig
  from wxo_agentic_evaluation.prompt.template_render import (
- OnPolicyAttackGeneratorTemplateRenderer,
  OffPolicyAttackGeneratorTemplateRenderer,
+ OnPolicyAttackGeneratorTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.red_teaming.attack_list import (
+ RED_TEAMING_ATTACKS,
+ print_attacks,
  )
  from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.arg_configs import AttackGeneratorConfig
- from jsonargparse import CLI
+ from wxo_agentic_evaluation.type import AttackCategory
+ from wxo_agentic_evaluation.utils.utils import load_agents

  root_dir = os.path.dirname(os.path.dirname(__file__))
  ON_POLICY_ATTACK_GENERATION_PROMPT = os.path.join(
@@ -60,13 +64,17 @@ class AttackGenerator:
  if f.lower().endswith(".json")
  ]
  if not json_files:
- rich.print(f"[yellow]WARNING:[/yellow] No .json files found in directory {path}")
+ rich.print(
+ f"[yellow]WARNING:[/yellow] No .json files found in directory {path}"
+ )
  continue
  paths_to_read = json_files
  elif os.path.isfile(path):
  paths_to_read = [path]
  else:
- rich.print(f"[yellow]WARNING:[/yellow] Path not found, skipping: {path}")
+ rich.print(
+ f"[yellow]WARNING:[/yellow] Path not found, skipping: {path}"
+ )
  continue

  for file_path in paths_to_read:
@@ -74,7 +82,9 @@ class AttackGenerator:
  with open(file_path) as f:
  data = json.load(f)
  except Exception as e:
- rich.print(f"[red]ERROR:[/red] Failed to load {file_path}: {e}")
+ rich.print(
+ f"[red]ERROR:[/red] Failed to load {file_path}: {e}"
+ )
  continue

  info = {
@@ -107,7 +117,7 @@ class AttackGenerator:
  if agent["name"].endswith("_manager"):
  manager_agent_name = agent["name"]
  break
-
+
  if manager_agent_name is None:
  manager_agent_name = target_agent_name
  rich.print(
@@ -122,7 +132,9 @@ class AttackGenerator:
  if attack.get("attack_name") == clean_name:
  return attack
  rich.print(f"[red]ERROR:[/red] No attack found with name: {name}")
- rich.print("[green]INFO:[/green] See the list of available attacks below under the \"Name\" column:")
+ rich.print(
+ '[green]INFO:[/green] See the list of available attacks below under the "Name" column:'
+ )
  print_attacks()

  return None
@@ -171,7 +183,9 @@ class AttackGenerator:
  tools_list=tools,
  agent_instructions=policy_instructions,
  original_story=info.get("story", ""),
- original_starting_sentence=info.get("starting_sentence", ""),
+ original_starting_sentence=info.get(
+ "starting_sentence", ""
+ ),
  )
  res = self.llm_client.query(on_policy_prompt)
  try:
@@ -221,11 +235,15 @@ class AttackGenerator:
  if attack_category == AttackCategory.off_policy:
  off_policy_prompt = self.off_policy_renderer.render(
  original_story=info.get("story", ""),
- original_starting_sentence=info.get("starting_sentence", ""),
+ original_starting_sentence=info.get(
+ "starting_sentence", ""
+ ),
  )
  res = self.llm_client.query(off_policy_prompt)
  try:
- off_policy_attack_data = ast.literal_eval(res.strip())[0]
+ off_policy_attack_data = ast.literal_eval(res.strip())[
+ 0
+ ]
  except:
  off_policy_attack_data = {}

@@ -249,11 +267,13 @@ class AttackGenerator:
  "modified_starting_sentence", ""
  )

- results.append({"dataset": info.get("dataset"), "attack": out})
+ results.append(
+ {"dataset": info.get("dataset"), "attack": out}
+ )

  if output_dir is None:
  output_dir = os.path.join(os.getcwd(), "red_team_attacks")
-
+
  os.makedirs(output_dir, exist_ok=True)
  for idx, res in enumerate(results):
  attack = res.get("attack", {})
wxo_agentic_evaluation/red_teaming/attack_runner.py
@@ -1,29 +1,31 @@
- from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.llm_user import LLMUser
- from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+ import dataclasses
+ import glob
+ import json
+ import os
+ import traceback
+ from concurrent.futures import ThreadPoolExecutor
+
+ import rich
+ import yaml
+ from jsonargparse import CLI
+ from rich.progress import Progress
+
+ from wxo_agentic_evaluation.arg_configs import AttackConfig
  from wxo_agentic_evaluation.inference_backend import (
  EvaluationController,
- get_wxo_client,
  WXOInferenceBackend,
+ get_wxo_client,
+ )
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.prompt.template_render import (
+ LlamaUserTemplateRenderer,
  )
- from wxo_agentic_evaluation.type import AttackData
-
- from wxo_agentic_evaluation.arg_configs import AttackConfig
  from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.type import AttackData
  from wxo_agentic_evaluation.utils import json_dump

- import os
- import json
- import traceback
- import yaml
- import dataclasses
- import glob
- import rich
- from rich.progress import Progress
- from concurrent.futures import ThreadPoolExecutor
- from jsonargparse import CLI
-

  def process_attack(task_n, attack_path, config, inference_backend, llm_user):
  tc_name = os.path.basename(attack_path).replace(".json", "")
@@ -31,7 +33,9 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
  attack: AttackData = AttackData.model_validate(json.load(f))

  evaluation_controller = EvaluationController(
- wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
+ wxo_inference_backend=inference_backend,
+ llm_user=llm_user,
+ config=config,
  )
  rich.print(f"[bold magenta]Running attack: {tc_name}[/bold magenta]")
  history, _, _ = evaluation_controller.run(
@@ -46,7 +50,8 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
  result.append(message.model_dump())

  json_dump(
- os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
+ os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+ result,
  )

  return result
@@ -55,19 +60,26 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
  def run_attacks(config: AttackConfig):
  executor = ThreadPoolExecutor(max_workers=config.num_workers)
  wxo_client = get_wxo_client(
- config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+ config.auth_config.url,
+ config.auth_config.tenant_name,
+ config.auth_config.token,
  )
  resource_map = ResourceMap(wxo_client)
  inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
  llm_user = LLMUser(
  wai_client=get_provider(
- config=config.provider_config, model_id=config.llm_user_config.model_id
+ config=config.provider_config,
+ model_id=config.llm_user_config.model_id,
+ ),
+ template=LlamaUserTemplateRenderer(
+ config.llm_user_config.prompt_config
  ),
- template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
  user_response_style=config.llm_user_config.user_response_style,
  )

- print(f"Running red teaming attacks with tenant {config.auth_config.tenant_name}")
+ print(
+ f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
+ )
  os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

  results_list = []
@@ -81,7 +93,9 @@ def run_attacks(config: AttackConfig):
  task_n = 0

  for attack_path in attack_paths:
- if not attack_path.endswith(".json") or attack_path.endswith("agent.json"):
+ if not attack_path.endswith(".json") or attack_path.endswith(
+ "agent.json"
+ ):
  continue

  future = executor.submit(
@@ -118,7 +132,9 @@ def run_attacks(config: AttackConfig):
  ) as f:
  yaml.safe_dump(dataclasses.asdict(config), f)

- with open(os.path.join(config.output_dir, "attacks_results.json"), "w") as f:
+ with open(
+ os.path.join(config.output_dir, "attacks_results.json"), "w"
+ ) as f:
  json.dump(attack_results, f, indent=2)

  print(f"Attack results saved to {config.output_dir}")
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py
@@ -1,7 +1,9 @@
  from abc import ABC

  from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
- from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+ from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+ MetricPrompt,
+ )


  class FunctionMetricsPrompt(MetricPrompt, ABC):
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py
@@ -1,14 +1,7 @@
  import json
  from enum import Enum
  from pathlib import Path
- from typing import (
- Any,
- Dict,
- Iterable,
- List,
- Tuple,
- Union,
- )
+ from typing import Any, Dict, Iterable, List, Tuple, Union

  from pydantic import ValidationError

@@ -18,7 +11,10 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function
  from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
  FunctionSelectionPrompt,
  )
- from wxo_agentic_evaluation.referenceless_eval.metrics import Metric, MetricPrompt
+ from wxo_agentic_evaluation.referenceless_eval.metrics import (
+ Metric,
+ MetricPrompt,
+ )

  PromptType = Union[
  GeneralMetricsPrompt,
@@ -70,7 +66,9 @@ def load_prompts_from_jsonl(
  raise LoaderError(f"File not found: {path}")

  prompts: List[PromptType] = []
- for lineno, raw in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
+ for lineno, raw in enumerate(
+ p.read_text(encoding="utf-8").splitlines(), start=1
+ ):
  if not raw.strip():
  continue
  try:
@@ -96,7 +94,9 @@ def load_prompts_from_jsonl(
  # Instantiate prompt
  prompt: MetricPrompt
  try:
- prompt = PromptCls(metric=metric, task_description=metric.description)
+ prompt = PromptCls(
+ metric=metric, task_description=metric.description
+ )
  except TypeError:
  prompt = PromptCls(metric=metric)

@@ -158,7 +158,9 @@ def load_prompts_from_list(
  raise LoaderError(f"Record {idx} invalid schema: {e}") from e

  try:
- prompt = PromptCls(metric=metric, task_description=rec["task_description"])
+ prompt = PromptCls(
+ metric=metric, task_description=rec["task_description"]
+ )
  except TypeError:
  prompt = PromptCls(metric=metric)

@@ -167,11 +169,15 @@ def load_prompts_from_list(
  user_kwargs = ex["user_kwargs"]
  output = ex["output"]
  except KeyError as e:
- raise LoaderError(f"Record {idx}, example {ex_idx} missing {e}") from e
+ raise LoaderError(
+ f"Record {idx}, example {ex_idx} missing {e}"
+ ) from e
  try:
  prompt.add_example(user_kwargs, output)
  except (ValidationError, ValueError) as e:
- raise LoaderError(f"Record {idx}, example {ex_idx} invalid: {e}") from e
+ raise LoaderError(
+ f"Record {idx}, example {ex_idx} invalid: {e}"
+ ) from e

  prompts.append(prompt)

@@ -211,7 +217,9 @@ def load_prompts_from_metrics(

  # Instantiate prompt with the metric's description as task_description
  try:
- prompt = PromptCls(metric=metric, task_description=metric.description)
+ prompt = PromptCls(
+ metric=metric, task_description=metric.description
+ )
  except TypeError:
  # Fallback if constructor signature differs
  prompt = PromptCls(metric=metric)
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py
@@ -1,8 +1,4 @@
- from typing import (
- Any,
- Dict,
- List,
- )
+ from typing import Any, Dict, List

  from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
  ToolCall,
@@ -65,7 +61,9 @@ class OpenAIAdapter(BaseAdapter):
  ]

  def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
- tool = next((t for t in self.specs if t.function.name == tool_name), None)
+ tool = next(
+ (t for t in self.specs if t.function.name == tool_name), None
+ )
  return tool.function.model_dump() if tool else {}

  def get_call_dict(self) -> Dict[str, Any]:
@@ -87,11 +85,18 @@ class OpenAIAdapter(BaseAdapter):

  def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
  spec = next(
- (s for s in self.specs if s.function.name == self.get_function_name()), None
+ (
+ s
+ for s in self.specs
+ if s.function.name == self.get_function_name()
+ ),
+ None,
  )
  if not spec:
  return {"type": "object", "properties": {}, "required": []}
- props = spec.function.parameters.get("properties", spec.function.parameters)
+ props = spec.function.parameters.get(
+ "properties", spec.function.parameters
+ )
  if param_name not in props:
  return {"type": "object", "properties": {}, "required": []}
  return {
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py
@@ -1,8 +1,9 @@
- import json
  import importlib.resources
+ import json
  from pathlib import Path
  from typing import Dict, List, Optional, Union

+ from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
  from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
  SemanticChecker,
  )
@@ -19,13 +20,16 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
  ToolCall,
  ToolSpec,
  )
- from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
- from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import LLMKitWrapper
+ from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+ LLMKitWrapper,
+ )
+

  def metrics_dir():
  path = importlib.resources.files(metrics)
  return path

+
  # Default metric JSON paths
  _METRICS_DIR = metrics_dir()
  _DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"
@@ -36,10 +40,14 @@ _DEFAULT_FUNCSEL = (
  _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
  )
  _DEFAULT_FUNCSEL_RUNTIME = (
- _METRICS_DIR / "function_selection" / "function_selection_metrics_runtime.json"
+ _METRICS_DIR
+ / "function_selection"
+ / "function_selection_metrics_runtime.json"
  )
  _DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
- _DEFAULT_PARAM_RUNTIME = _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+ _DEFAULT_PARAM_RUNTIME = (
+ _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+ )


  class ReflectionPipeline:
@@ -88,11 +96,19 @@ class ReflectionPipeline:
  for metrics, default_path in [
  (
  self.general_metrics,
- _DEFAULT_GENERAL_RUNTIME if runtime_pipeline else _DEFAULT_GENERAL,
+ (
+ _DEFAULT_GENERAL_RUNTIME
+ if runtime_pipeline
+ else _DEFAULT_GENERAL
+ ),
  ),
  (
  self.function_metrics,
- _DEFAULT_FUNCSEL_RUNTIME if runtime_pipeline else _DEFAULT_FUNCSEL,
+ (
+ _DEFAULT_FUNCSEL_RUNTIME
+ if runtime_pipeline
+ else _DEFAULT_FUNCSEL
+ ),
  ),
  (
  self.parameter_metrics,
@@ -104,7 +120,9 @@ class ReflectionPipeline:
  continue

  # Handle metric names list
- if isinstance(metrics, list) and all(isinstance(x, str) for x in metrics):
+ if isinstance(metrics, list) and all(
+ isinstance(x, str) for x in metrics
+ ):
  # Load the default JSON file
  if not default_path.is_file():
  raise FileNotFoundError(
@@ -116,7 +134,9 @@ class ReflectionPipeline:

  # Filter metrics by name
  filtered_metrics = [
- metric for metric in all_metrics if metric.get("name") in metrics
+ metric
+ for metric in all_metrics
+ if metric.get("name") in metrics
  ]

  # Remove examples from prompts if requested
@@ -125,7 +145,9 @@ class ReflectionPipeline:
  metric.pop("examples", None)

  if len(filtered_metrics) != len(metrics):
- found_names = {metric.get("name") for metric in filtered_metrics}
+ found_names = {
+ metric.get("name") for metric in filtered_metrics
+ }
  missing = set(metrics) - found_names
  raise ValueError(f"Metrics not found: {missing}")

@@ -140,14 +162,20 @@ class ReflectionPipeline:
  if isinstance(metrics, list) and all(
  isinstance(x, FunctionCallMetric) for x in metrics
  ):
- metrics_definitions.append([metric.model_dump() for metric in metrics])
+ metrics_definitions.append(
+ [metric.model_dump() for metric in metrics]
+ )
  else:
  if not metrics.is_file():
- raise FileNotFoundError(f"Metrics file not found: {metrics}")
+ raise FileNotFoundError(
+ f"Metrics file not found: {metrics}"
+ )
  metrics_definitions.append(
  [
  json.loads(json_obj)
- for json_obj in metrics.read_text(encoding="utf8").splitlines()
+ for json_obj in metrics.read_text(
+ encoding="utf8"
+ ).splitlines()
  if json_obj.strip()
  ]
  )
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py
@@ -1,14 +1,7 @@
  import json
  import math
  import re
- from typing import (
- Any,
- Dict,
- List,
- Optional,
- Tuple,
- Union,
- )
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
  GeneralMetricsPrompt,
@@ -39,11 +32,13 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
  ToolSpec,
  TransformResult,
  )
- from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
  from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
  MetricRunner,
  MetricRunResult,
  )
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+ WatsonXProvider,
+ )


  class SemanticChecker:
@@ -231,7 +226,9 @@ class SemanticChecker:
  schema_param_name="schema",
  retries=retries,
  )
- general_results = SemanticCategoryResult.from_results(sync_results)
+ general_results = SemanticCategoryResult.from_results(
+ sync_results
+ )
  except Exception as e:
  general_results = {"error": str(e)}
  else:
@@ -261,7 +258,9 @@ class SemanticChecker:
  schema_param_name="schema",
  retries=retries,
  )
- function_results = SemanticCategoryResult.from_results(sync_results)
+ function_results = SemanticCategoryResult.from_results(
+ sync_results
+ )
  except Exception as e:
  function_results = {"error": str(e)}
  else:
@@ -272,7 +271,9 @@ class SemanticChecker:
  for pname, pval in params.items():
  # Each parameter has its own prompts
  try:
- param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+ param_entries: List[
+ Tuple[ParameterMetricsPrompt, Dict[str, Any]]
+ ] = []
  for prompt in self.parameter_prompts:
  param_entries.append(
  (
@@ -351,7 +352,10 @@ class SemanticChecker:
  )
  gen_code = self.codegen_client.generate(
  prompt=[
- {"role": "system", "content": GENERATE_CODE_SYSTEM},
+ {
+ "role": "system",
+ "content": GENERATE_CODE_SYSTEM,
+ },
  {"role": "user", "content": prompt},
  ],
  schema=GENERATE_CODE_SCHEMA,
@@ -386,11 +390,15 @@ class SemanticChecker:
  """
  Strip code fences, install imports, exec code, compare, return TransformResult.
  """
- clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()
+ clean = re.sub(
+ r"^```(?:python)?|```$", "", code, flags=re.MULTILINE
+ ).strip()

  # install imports
  for mod in set(
- re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
+ re.findall(
+ r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE
+ )
  ):
  try:
  __import__(mod)
@@ -417,7 +425,9 @@ class SemanticChecker:

  out_t = fn_t(user_val)
  out_c = fn_c(api_val)
- if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
+ if isinstance(out_t, (int, float)) and isinstance(
+ out_c, (int, float)
+ ):
  success = math.isclose(out_t, out_c, abs_tol=1e-3)
  else:
  success = str(out_t) == str(out_c)