ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +9 -3
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +117 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +183 -79
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +103 -21
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
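
The hunks below walk through the source-level changes file by file. As a rough, illustrative sketch (not the registry's actual tooling), a comparable file-by-file diff can be rebuilt locally with only the Python standard library once both wheels are downloaded (e.g. with `pip download ibm-watsonx-orchestrate-evaluation-framework==1.1.1 --no-deps`); the wheel filenames here are assumptions, adjust them to whatever was fetched:

import difflib
import zipfile

# Assumed local filenames produced by `pip download ... --no-deps`.
OLD = "ibm_watsonx_orchestrate_evaluation_framework-1.1.1-py3-none-any.whl"
NEW = "ibm_watsonx_orchestrate_evaluation_framework-1.1.3-py3-none-any.whl"

def wheel_sources(path):
    # A wheel is a zip archive; collect the text of every .py member.
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace")
            for name in zf.namelist()
            if name.endswith(".py")
        }

old_files, new_files = wheel_sources(OLD), wheel_sources(NEW)
for name in sorted(old_files.keys() | new_files.keys()):
    a = old_files.get(name, "").splitlines(keepends=True)
    b = new_files.get(name, "").splitlines(keepends=True)
    hunks = difflib.unified_diff(a, b, fromfile=name, tofile=name)
    print("".join(hunks), end="")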
wxo_agentic_evaluation/red_teaming/attack_generator.py

@@ -1,19 +1,23 @@
+import ast
 import json
-import random
 import os
-import
+import random
+
 import rich
+from jsonargparse import CLI

-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation.red_teaming.attack_list import RED_TEAMING_ATTACKS, print_attacks
-from wxo_agentic_evaluation.type import AttackCategory
+from wxo_agentic_evaluation.arg_configs import AttackGeneratorConfig
 from wxo_agentic_evaluation.prompt.template_render import (
-    OnPolicyAttackGeneratorTemplateRenderer,
     OffPolicyAttackGeneratorTemplateRenderer,
+    OnPolicyAttackGeneratorTemplateRenderer,
+)
+from wxo_agentic_evaluation.red_teaming.attack_list import (
+    RED_TEAMING_ATTACKS,
+    print_attacks,
 )
 from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.
-from
+from wxo_agentic_evaluation.type import AttackCategory
+from wxo_agentic_evaluation.utils.utils import load_agents

 root_dir = os.path.dirname(os.path.dirname(__file__))
 ON_POLICY_ATTACK_GENERATION_PROMPT = os.path.join(

@@ -60,13 +64,17 @@ class AttackGenerator:
                     if f.lower().endswith(".json")
                 ]
                 if not json_files:
-                    rich.print(
+                    rich.print(
+                        f"[yellow]WARNING:[/yellow] No .json files found in directory {path}"
+                    )
                     continue
                 paths_to_read = json_files
             elif os.path.isfile(path):
                 paths_to_read = [path]
             else:
-                rich.print(
+                rich.print(
+                    f"[yellow]WARNING:[/yellow] Path not found, skipping: {path}"
+                )
                 continue

             for file_path in paths_to_read:

@@ -74,7 +82,9 @@ class AttackGenerator:
                     with open(file_path) as f:
                         data = json.load(f)
                 except Exception as e:
-                    rich.print(
+                    rich.print(
+                        f"[red]ERROR:[/red] Failed to load {file_path}: {e}"
+                    )
                     continue

                 info = {

@@ -107,7 +117,7 @@ class AttackGenerator:
             if agent["name"].endswith("_manager"):
                 manager_agent_name = agent["name"]
                 break
-
+
         if manager_agent_name is None:
             manager_agent_name = target_agent_name
             rich.print(

@@ -122,7 +132,9 @@ class AttackGenerator:
             if attack.get("attack_name") == clean_name:
                 return attack
         rich.print(f"[red]ERROR:[/red] No attack found with name: {name}")
-        rich.print(
+        rich.print(
+            '[green]INFO:[/green] See the list of available attacks below under the "Name" column:'
+        )
         print_attacks()

         return None

@@ -171,7 +183,9 @@ class AttackGenerator:
                 tools_list=tools,
                 agent_instructions=policy_instructions,
                 original_story=info.get("story", ""),
-                original_starting_sentence=info.get(
+                original_starting_sentence=info.get(
+                    "starting_sentence", ""
+                ),
             )
             res = self.llm_client.query(on_policy_prompt)
             try:

@@ -221,11 +235,15 @@ class AttackGenerator:
         if attack_category == AttackCategory.off_policy:
             off_policy_prompt = self.off_policy_renderer.render(
                 original_story=info.get("story", ""),
-                original_starting_sentence=info.get(
+                original_starting_sentence=info.get(
+                    "starting_sentence", ""
+                ),
             )
             res = self.llm_client.query(off_policy_prompt)
             try:
-                off_policy_attack_data = ast.literal_eval(res.strip())[
+                off_policy_attack_data = ast.literal_eval(res.strip())[
+                    0
+                ]
             except:
                 off_policy_attack_data = {}

@@ -249,11 +267,13 @@ class AttackGenerator:
                 "modified_starting_sentence", ""
             )

-            results.append(
+            results.append(
+                {"dataset": info.get("dataset"), "attack": out}
+            )

         if output_dir is None:
             output_dir = os.path.join(os.getcwd(), "red_team_attacks")
-
+
         os.makedirs(output_dir, exist_ok=True)
         for idx, res in enumerate(results):
             attack = res.get("attack", {})
wxo_agentic_evaluation/red_teaming/attack_runner.py

@@ -1,29 +1,31 @@
-
-
-
-
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-    get_wxo_client,
     WXOInferenceBackend,
+    get_wxo_client,
+)
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
 )
-from wxo_agentic_evaluation.type import AttackData
-
-from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import AttackData
 from wxo_agentic_evaluation.utils import json_dump

-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-from rich.progress import Progress
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI
-

 def process_attack(task_n, attack_path, config, inference_backend, llm_user):
     tc_name = os.path.basename(attack_path).replace(".json", "")

@@ -31,7 +33,9 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         attack: AttackData = AttackData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend,
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running attack: {tc_name}[/bold magenta]")
     history, _, _ = evaluation_controller.run(

@@ -46,7 +50,8 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     return result

@@ -55,19 +60,26 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
 def run_attacks(config: AttackConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     wxo_client = get_wxo_client(
-        config.auth_config.url,
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
         wai_client=get_provider(
-            config=config.provider_config,
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
         ),
-        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
         user_response_style=config.llm_user_config.user_response_style,
     )

-    print(
+    print(
+        f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
+    )
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

     results_list = []

@@ -81,7 +93,9 @@ def run_attacks(config: AttackConfig):
     task_n = 0

     for attack_path in attack_paths:
-        if not attack_path.endswith(".json") or attack_path.endswith(
+        if not attack_path.endswith(".json") or attack_path.endswith(
+            "agent.json"
+        ):
             continue

         future = executor.submit(

@@ -118,7 +132,9 @@ def run_attacks(config: AttackConfig):
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-    with open(
+    with open(
+        os.path.join(config.output_dir, "attacks_results.json"), "w"
+    ) as f:
         json.dump(attack_results, f, indent=2)

     print(f"Attack results saved to {config.output_dir}")
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py

@@ -1,7 +1,9 @@
 from abc import ABC

 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)


 class FunctionMetricsPrompt(MetricPrompt, ABC):
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py

@@ -1,14 +1,7 @@
 import json
 from enum import Enum
 from pathlib import Path
-from typing import
-    Any,
-    Dict,
-    Iterable,
-    List,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, Iterable, List, Tuple, Union

 from pydantic import ValidationError

@@ -18,7 +11,10 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function
 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
     FunctionSelectionPrompt,
 )
-from wxo_agentic_evaluation.referenceless_eval.metrics import
+from wxo_agentic_evaluation.referenceless_eval.metrics import (
+    Metric,
+    MetricPrompt,
+)

 PromptType = Union[
     GeneralMetricsPrompt,

@@ -70,7 +66,9 @@ def load_prompts_from_jsonl(
         raise LoaderError(f"File not found: {path}")

     prompts: List[PromptType] = []
-    for lineno, raw in enumerate(
+    for lineno, raw in enumerate(
+        p.read_text(encoding="utf-8").splitlines(), start=1
+    ):
         if not raw.strip():
             continue
         try:

@@ -96,7 +94,9 @@ def load_prompts_from_jsonl(
         # Instantiate prompt
         prompt: MetricPrompt
         try:
-            prompt = PromptCls(
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -158,7 +158,9 @@ def load_prompts_from_list(
             raise LoaderError(f"Record {idx} invalid schema: {e}") from e

         try:
-            prompt = PromptCls(
+            prompt = PromptCls(
+                metric=metric, task_description=rec["task_description"]
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -167,11 +169,15 @@ def load_prompts_from_list(
                 user_kwargs = ex["user_kwargs"]
                 output = ex["output"]
             except KeyError as e:
-                raise LoaderError(
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} missing {e}"
+                ) from e
             try:
                 prompt.add_example(user_kwargs, output)
             except (ValidationError, ValueError) as e:
-                raise LoaderError(
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} invalid: {e}"
+                ) from e

         prompts.append(prompt)

@@ -211,7 +217,9 @@ def load_prompts_from_metrics(

         # Instantiate prompt with the metric's description as task_description
         try:
-            prompt = PromptCls(
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             # Fallback if constructor signature differs
             prompt = PromptCls(metric=metric)
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py

@@ -1,8 +1,4 @@
-from typing import
-    Any,
-    Dict,
-    List,
-)
+from typing import Any, Dict, List

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     ToolCall,

@@ -65,7 +61,9 @@ class OpenAIAdapter(BaseAdapter):
         ]

     def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
-        tool = next(
+        tool = next(
+            (t for t in self.specs if t.function.name == tool_name), None
+        )
         return tool.function.model_dump() if tool else {}

     def get_call_dict(self) -> Dict[str, Any]:

@@ -87,11 +85,18 @@ class OpenAIAdapter(BaseAdapter):

     def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
         spec = next(
-            (
+            (
+                s
+                for s in self.specs
+                if s.function.name == self.get_function_name()
+            ),
+            None,
         )
         if not spec:
             return {"type": "object", "properties": {}, "required": []}
-        props = spec.function.parameters.get(
+        props = spec.function.parameters.get(
+            "properties", spec.function.parameters
+        )
         if param_name not in props:
             return {"type": "object", "properties": {}, "required": []}
         return {
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py

@@ -1,8 +1,9 @@
-import json
 import importlib.resources
+import json
 from pathlib import Path
 from typing import Dict, List, Optional, Union

+from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
     SemanticChecker,
 )

@@ -19,13 +20,16 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.
-
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    LLMKitWrapper,
+)
+

 def metrics_dir():
     path = importlib.resources.files(metrics)
     return path

+
 # Default metric JSON paths
 _METRICS_DIR = metrics_dir()
 _DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"

@@ -36,10 +40,14 @@ _DEFAULT_FUNCSEL = (
     _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
 )
 _DEFAULT_FUNCSEL_RUNTIME = (
-    _METRICS_DIR
+    _METRICS_DIR
+    / "function_selection"
+    / "function_selection_metrics_runtime.json"
 )
 _DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
-_DEFAULT_PARAM_RUNTIME =
+_DEFAULT_PARAM_RUNTIME = (
+    _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+)


 class ReflectionPipeline:

@@ -88,11 +96,19 @@ class ReflectionPipeline:
         for metrics, default_path in [
             (
                 self.general_metrics,
-
+                (
+                    _DEFAULT_GENERAL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_GENERAL
+                ),
             ),
             (
                 self.function_metrics,
-
+                (
+                    _DEFAULT_FUNCSEL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_FUNCSEL
+                ),
             ),
             (
                 self.parameter_metrics,

@@ -104,7 +120,9 @@ class ReflectionPipeline:
                 continue

             # Handle metric names list
-            if isinstance(metrics, list) and all(
+            if isinstance(metrics, list) and all(
+                isinstance(x, str) for x in metrics
+            ):
                 # Load the default JSON file
                 if not default_path.is_file():
                     raise FileNotFoundError(

@@ -116,7 +134,9 @@ class ReflectionPipeline:

                 # Filter metrics by name
                 filtered_metrics = [
-                    metric
+                    metric
+                    for metric in all_metrics
+                    if metric.get("name") in metrics
                 ]

                 # Remove examples from prompts if requested

@@ -125,7 +145,9 @@ class ReflectionPipeline:
                         metric.pop("examples", None)

                 if len(filtered_metrics) != len(metrics):
-                    found_names = {
+                    found_names = {
+                        metric.get("name") for metric in filtered_metrics
+                    }
                     missing = set(metrics) - found_names
                     raise ValueError(f"Metrics not found: {missing}")

@@ -140,14 +162,20 @@ class ReflectionPipeline:
            if isinstance(metrics, list) and all(
                isinstance(x, FunctionCallMetric) for x in metrics
            ):
-                metrics_definitions.append(
+                metrics_definitions.append(
+                    [metric.model_dump() for metric in metrics]
+                )
            else:
                if not metrics.is_file():
-                    raise FileNotFoundError(
+                    raise FileNotFoundError(
+                        f"Metrics file not found: {metrics}"
+                    )
                metrics_definitions.append(
                    [
                        json.loads(json_obj)
-                        for json_obj in metrics.read_text(
+                        for json_obj in metrics.read_text(
+                            encoding="utf8"
+                        ).splitlines()
                        if json_obj.strip()
                    ]
                )
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py

@@ -1,14 +1,7 @@
 import json
 import math
 import re
-from typing import
-    Any,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, List, Optional, Tuple, Union

 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
     GeneralMetricsPrompt,

@@ -39,11 +32,13 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolSpec,
     TransformResult,
 )
-from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
     MetricRunner,
     MetricRunResult,
 )
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)


 class SemanticChecker:

@@ -231,7 +226,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                general_results = SemanticCategoryResult.from_results(
+                general_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 general_results = {"error": str(e)}
         else:

@@ -261,7 +258,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                function_results = SemanticCategoryResult.from_results(
+                function_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 function_results = {"error": str(e)}
         else:

@@ -272,7 +271,9 @@ class SemanticChecker:
         for pname, pval in params.items():
             # Each parameter has its own prompts
             try:
-                param_entries: List[
+                param_entries: List[
+                    Tuple[ParameterMetricsPrompt, Dict[str, Any]]
+                ] = []
                 for prompt in self.parameter_prompts:
                     param_entries.append(
                         (

@@ -351,7 +352,10 @@ class SemanticChecker:
         )
         gen_code = self.codegen_client.generate(
             prompt=[
-                {
+                {
+                    "role": "system",
+                    "content": GENERATE_CODE_SYSTEM,
+                },
                 {"role": "user", "content": prompt},
             ],
             schema=GENERATE_CODE_SCHEMA,

@@ -386,11 +390,15 @@ class SemanticChecker:
         """
         Strip code fences, install imports, exec code, compare, return TransformResult.
         """
-        clean = re.sub(
+        clean = re.sub(
+            r"^```(?:python)?|```$", "", code, flags=re.MULTILINE
+        ).strip()

         # install imports
         for mod in set(
-            re.findall(
+            re.findall(
+                r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE
+            )
         ):
             try:
                 __import__(mod)

@@ -417,7 +425,9 @@ class SemanticChecker:

         out_t = fn_t(user_val)
         out_c = fn_c(api_val)
-        if isinstance(out_t, (int, float)) and isinstance(
+        if isinstance(out_t, (int, float)) and isinstance(
+            out_c, (int, float)
+        ):
             success = math.isclose(out_t, out_c, abs_tol=1e-3)
         else:
             success = str(out_t) == str(out_c)