ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py
@@ -0,0 +1,245 @@
+import json
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple, Union
+
+from pydantic import ValidationError
+
+from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
+    GeneralMetricsPrompt,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
+    FunctionSelectionPrompt,
+)
+from wxo_agentic_evaluation.referenceless_eval.metrics import (
+    Metric,
+    MetricPrompt,
+)
+
+PromptType = Union[
+    GeneralMetricsPrompt,
+    FunctionSelectionPrompt,
+]
+
+
+# Enum for prompt kinds
+class PromptKind(str, Enum):
+    GENERAL = "general"
+    FUNCTION_SELECTION = "function_selection"
+    PARAMETER = "parameter"
+
+
+# Map enum → Prompt class
+_PROMPT_CLASS_MAP: Dict[PromptKind, Any] = {
+    PromptKind.GENERAL: GeneralMetricsPrompt,
+    PromptKind.FUNCTION_SELECTION: FunctionSelectionPrompt,
+}
+
+
+class LoaderError(Exception):
+    """Raised when prompt loading fails."""
+
+
+def load_prompts_from_jsonl(
+    path: Union[str, Path],
+    kind: PromptKind,
+) -> List[PromptType]:
+    """
+    Load prompts from a JSONL file.
+
+    Args:
+        path: .jsonl file path.
+        kind: PromptKind value.
+
+    Returns:
+        List of PromptType, each with its examples loaded.
+
+    Raises:
+        LoaderError on I/O, parse, or validation errors.
+    """
+    PromptCls = _PROMPT_CLASS_MAP.get(kind)
+    if PromptCls is None:
+        raise LoaderError(f"Unknown PromptKind: {kind}")
+
+    p = Path(path)
+    if not p.is_file():
+        raise LoaderError(f"File not found: {path}")
+
+    prompts: List[PromptType] = []
+    for lineno, raw in enumerate(
+        p.read_text(encoding="utf-8").splitlines(), start=1
+    ):
+        if not raw.strip():
+            continue
+        try:
+            rec = json.loads(raw)
+        except json.JSONDecodeError as e:
+            raise LoaderError(f"{path}:{lineno} invalid JSON: {e}") from e
+
+        # Extract
+        try:
+            schema = rec["jsonschema"]
+            examples = rec.get("examples", [])
+            description = rec.get("description", schema.get("description", ""))
+        except KeyError as e:
+            raise LoaderError(f"{path}:{lineno} missing key {e}") from e
+
+        # Build metric
+        try:
+            metric = Metric.from_jsonschema(schema)
+            metric.description = description
+        except Exception as e:
+            raise LoaderError(f"{path}:{lineno} invalid schema: {e}") from e
+
+        # Instantiate prompt
+        prompt: MetricPrompt
+        try:
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
+        except TypeError:
+            prompt = PromptCls(metric=metric)
+
+        # Load examples
+        for ex_idx, ex in enumerate(examples, start=1):
+            try:
+                user_kwargs = ex["user_kwargs"]
+                output = ex["output"]
+            except KeyError as e:
+                raise LoaderError(
+                    f"{path}:{lineno}, example {ex_idx} missing {e}"
+                ) from e
+            try:
+                prompt.add_example(user_kwargs, output)
+            except (ValidationError, ValueError) as e:
+                raise LoaderError(
+                    f"{path}:{lineno}, example {ex_idx} invalid: {e}"
+                ) from e
+
+        prompts.append(prompt)
+
+    return prompts
+
+
+def load_prompts_from_list(
+    records: Iterable[Dict[str, Any]], kind: PromptKind
+) -> List[PromptType]:
+    """
+    Load prompts from an in-memory list of dicts, same structure as JSONL.
+
+    Args:
+        records: Iterable of dicts with keys {schema, thresholds, examples, description}.
+        kind: PromptKind value.
+
+    Returns:
+        List of PromptType.
+
+    Raises:
+        LoaderError on missing data or validation failures.
+    """
+    PromptCls = _PROMPT_CLASS_MAP.get(kind)
+    if PromptCls is None:
+        raise LoaderError(f"Unknown PromptKind: {kind}")
+
+    prompts: List[PromptType] = []
+    for idx, rec in enumerate(records, start=1):
+        # same logic as JSONL loader
+        try:
+            schema = rec["jsonschema"]
+            examples = rec.get("examples", [])
+            description = schema.get("description", rec.get("name", ""))
+        except KeyError as e:
+            raise LoaderError(f"Record {idx} missing key {e}") from e
+
+        try:
+            metric = Metric.from_jsonschema(schema)
+            metric.description = description
+        except Exception as e:
+            raise LoaderError(f"Record {idx} invalid schema: {e}") from e
+
+        try:
+            prompt = PromptCls(
+                metric=metric, task_description=rec["task_description"]
+            )
+        except TypeError:
+            prompt = PromptCls(metric=metric)
+
+        for ex_idx, ex in enumerate(examples, start=1):
+            try:
+                user_kwargs = ex["user_kwargs"]
+                output = ex["output"]
+            except KeyError as e:
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} missing {e}"
+                ) from e
+            try:
+                prompt.add_example(user_kwargs, output)
+            except (ValidationError, ValueError) as e:
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} invalid: {e}"
+                ) from e
+
+        prompts.append(prompt)
+
+    return prompts
+
+
+def load_prompts_from_metrics(
+    metrics_with_examples: Iterable[Tuple[Metric, List[Dict[str, Any]]]],
+    kind: PromptKind,
+) -> List[PromptType]:
+    """
+    Instantiate prompts directly from Metric objects and example data.
+
+    Args:
+        metrics_with_examples: An iterable of (Metric instance, examples) tuples.
+            Each examples list item must be a dict with:
+                - "user_kwargs": Dict[str, Any]
+                - "output": Dict[str, Any]
+        kind: Which PromptKind to use (GENERAL, FUNCTION_SELECTION, PARAMETER).
+
+    Returns:
+        A list of PromptType, each with its few-shot examples loaded.
+
+    Raises:
+        LoaderError: on missing data or validation errors.
+    """
+    PromptCls = _PROMPT_CLASS_MAP.get(kind)
+    if PromptCls is None:
+        raise LoaderError(f"Unknown PromptKind: {kind}")
+
+    prompts: List[PromptType] = []
+    for idx, (metric, examples) in enumerate(metrics_with_examples, start=1):
+        if not isinstance(metric, Metric):
+            raise LoaderError(
+                f"Item {idx}: expected a Metric instance, got {type(metric)}"
+            )
+
+        # Instantiate prompt with the metric's description as task_description
+        try:
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
+        except TypeError:
+            # Fallback if constructor signature differs
+            prompt = PromptCls(metric=metric)
+
+        # Add each provided example
+        for ex_idx, ex in enumerate(examples or [], start=1):
+            if "user_kwargs" not in ex or "output" not in ex:
+                raise LoaderError(
+                    f"Metric {metric.name}, example {ex_idx}: "
+                    "each example must include 'user_kwargs' and 'output'."
+                )
+            user_kwargs = ex["user_kwargs"]
+            output = ex["output"]
+            try:
+                prompt.add_example(user_kwargs, output)
+            except (ValidationError, ValueError) as e:
+                raise LoaderError(
+                    f"Metric {metric.name}, example {ex_idx} invalid: {e}"
+                ) from e
+
+        prompts.append(prompt)
+
+    return prompts
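For orientation, a minimal sketch of how this loader might be driven. The record shape (one JSON object per line with "jsonschema", "examples", and "description" keys) is taken from the loader code above; the concrete metric schema, file name, and metric name below are illustrative only, and whether Metric.from_jsonschema accepts this particular schema depends on the Metric model, which is not shown in this hunk.

    # Illustrative sketch: write a one-record JSONL file and load it.
    import json
    from pathlib import Path

    from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.loader import (
        PromptKind,
        load_prompts_from_jsonl,
    )

    record = {
        "jsonschema": {
            "title": "argument_completeness",  # hypothetical metric name
            "description": "Are all required arguments present in the call?",
            "type": "object",
            "properties": {"score": {"type": "number"}},
        },
        "examples": [],  # few-shot examples would carry "user_kwargs" and "output"
        "description": "Checks that the call provides every required argument.",
    }

    sample = Path("general_metrics_sample.jsonl")
    sample.write_text(json.dumps(record) + "\n", encoding="utf-8")

    prompts = load_prompts_from_jsonl(sample, PromptKind.GENERAL)
    print(len(prompts))  # 1, assuming Metric.from_jsonschema accepts the schema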
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py
File without changes
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py
@@ -0,0 +1,106 @@
+from typing import Any, Dict, List
+
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+    ToolCall,
+    ToolSpec,
+)
+
+# ────────────────────────────────────────────────────────────────────────────────
+# Adapter definitions
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+class BaseAdapter:
+    """Abstract adapter to unify different API spec and call representations."""
+
+    def get_tools_inventory(self) -> List[Dict[str, Any]]:
+        raise NotImplementedError
+
+    def get_tools_inventory_summary(self) -> List[Dict[str, Any]]:
+        raise NotImplementedError
+
+    def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    def get_call_dict(self) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    def get_function_name(self) -> str:
+        raise NotImplementedError
+
+    def get_parameters(self) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
+        raise NotImplementedError
+
+
+class OpenAIAdapter(BaseAdapter):
+    """Adapter for ToolSpec + ToolCall inputs."""
+
+    def __init__(self, specs: List[ToolSpec], call: ToolCall):
+        self.specs = specs
+        self.call = call
+
+    def get_tools_inventory(self) -> List[Dict[str, Any]]:
+        return [spec.model_dump() for spec in self.specs]
+
+    def get_tools_inventory_summary(self) -> List[Dict[str, Any]]:
+        return [
+            {
+                "tool_name": spec.function.name,
+                "tool_description": spec.function.description,
+                "tool_parameters": {
+                    prop_name: prop_d["type"]
+                    for prop_name, prop_d in spec.function.parameters.get(
+                        "properties", {}
+                    ).items()
+                },
+            }
+            for spec in self.specs
+        ]
+
+    def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
+        tool = next(
+            (t for t in self.specs if t.function.name == tool_name), None
+        )
+        return tool.function.model_dump() if tool else {}
+
+    def get_call_dict(self) -> Dict[str, Any]:
+        call_dict = {
+            "id": self.call.id,
+            "type": "function",
+            "function": {
+                "name": self.call.function.name,
+                "arguments": self.call.function.arguments,
+            },
+        }
+        return call_dict
+
+    def get_function_name(self) -> str:
+        return self.call.function.name
+
+    def get_parameters(self) -> Dict[str, Any]:
+        return self.call.function.parsed_arguments
+
+    def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
+        spec = next(
+            (
+                s
+                for s in self.specs
+                if s.function.name == self.get_function_name()
+            ),
+            None,
+        )
+        if not spec:
+            return {"type": "object", "properties": {}, "required": []}
+        props = spec.function.parameters.get(
+            "properties", spec.function.parameters
+        )
+        if param_name not in props:
+            return {"type": "object", "properties": {}, "required": []}
+        return {
+            "type": "object",
+            "properties": {param_name: props[param_name]},
+            "required": [param_name],
+        }
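A rough sketch of how OpenAIAdapter might be exercised. ToolSpec and ToolCall are pydantic models defined in pipeline/types.py (not shown in this hunk); the OpenAI-style dict shapes below are inferred from the adapter's accessors (spec.function.name, call.function.arguments, get_call_dict), so treat the exact field layout, the get_weather tool, and the use of model_validate as assumptions.

    # Sketch only: ToolSpec / ToolCall field shapes are assumed to mirror the
    # OpenAI tool-call format implied by get_call_dict() above.
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.adapters import (
        OpenAIAdapter,
    )
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
        ToolCall,
        ToolSpec,
    )

    spec = ToolSpec.model_validate(
        {
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical tool
                "description": "Look up the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    )
    call = ToolCall.model_validate(
        {
            "id": "call_1",
            "type": "function",
            "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
        }
    )

    adapter = OpenAIAdapter(specs=[spec], call=call)
    print(adapter.get_function_name())             # "get_weather"
    print(adapter.get_tools_inventory_summary())   # name, description, parameter types
    print(adapter.get_param_spec_snippet("city"))  # single-parameter schema snippet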
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py
@@ -0,0 +1,291 @@
+import importlib.resources
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
+    SemanticChecker,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.static_checker import (
+    evaluate_static,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+    FunctionCallInput,
+    FunctionCallMetric,
+    PipelineResult,
+    SemanticResult,
+    StaticMetricResult,
+    StaticResult,
+    ToolCall,
+    ToolSpec,
+)
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    LLMKitWrapper,
+)
+
+
+def metrics_dir():
+    path = importlib.resources.files(metrics)
+    return path
+
+
+# Default metric JSON paths
+_METRICS_DIR = metrics_dir()
+_DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"
+_DEFAULT_GENERAL_RUNTIME = (
+    _METRICS_DIR / "function_call" / "general_metrics_runtime.json"
+)
+_DEFAULT_FUNCSEL = (
+    _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
+)
+_DEFAULT_FUNCSEL_RUNTIME = (
+    _METRICS_DIR
+    / "function_selection"
+    / "function_selection_metrics_runtime.json"
+)
+_DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
+_DEFAULT_PARAM_RUNTIME = (
+    _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+)
+
+
+class ReflectionPipeline:
+    """
+    High-level orchestration for function-call reflection.
+
+    Modes:
+      • static_only: schema checks
+      • semantic_only: LLM metrics + transforms
+      • run: full static -> semantic -> assemble -> PipelineResult
+
+    Supports sync, custom JSON overrides, and any registered LLM.
+    runtime_pipeline: if set to true, use faster prompts (no actionable recommendations, shorter explanations)
+    """
+
+    def __init__(
+        self,
+        metrics_client: LLMKitWrapper,
+        codegen_client: Optional[LLMKitWrapper] = None,
+        general_metrics: Optional[
+            Union[Path, List[FunctionCallMetric], List[str]]
+        ] = _DEFAULT_GENERAL_RUNTIME,
+        function_metrics: Optional[
+            Union[Path, List[FunctionCallMetric], List[str]]
+        ] = _DEFAULT_FUNCSEL_RUNTIME,
+        parameter_metrics: Optional[
+            Union[Path, List[FunctionCallMetric], List[str]]
+        ] = _DEFAULT_PARAM_RUNTIME,
+        transform_enabled: Optional[bool] = False,
+        runtime_pipeline: Optional[bool] = True,
+        use_examples: Optional[bool] = True,
+    ):
+
+        self.metrics_client = metrics_client
+        if codegen_client is None:
+            self.codegen_client = metrics_client
+        else:
+            self.codegen_client = codegen_client
+
+        self.general_metrics = general_metrics
+        self.function_metrics = function_metrics
+        self.parameter_metrics = parameter_metrics
+
+        metrics_definitions = []
+
+        for metrics, default_path in [
+            (
+                self.general_metrics,
+                (
+                    _DEFAULT_GENERAL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_GENERAL
+                ),
+            ),
+            (
+                self.function_metrics,
+                (
+                    _DEFAULT_FUNCSEL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_FUNCSEL
+                ),
+            ),
+            (
+                self.parameter_metrics,
+                _DEFAULT_PARAM_RUNTIME if runtime_pipeline else _DEFAULT_PARAM,
+            ),
+        ]:
+            if not metrics:
+                metrics_definitions.append(None)
+                continue
+
+            # Handle metric names list
+            if isinstance(metrics, list) and all(
+                isinstance(x, str) for x in metrics
+            ):
+                # Load the default JSON file
+                if not default_path.is_file():
+                    raise FileNotFoundError(
+                        f"Default metrics file not found: {default_path}"
+                    )
+
+                with default_path.open("r") as f_in:
+                    all_metrics = json.load(f_in)
+
+                # Filter metrics by name
+                filtered_metrics = [
+                    metric
+                    for metric in all_metrics
+                    if metric.get("name") in metrics
+                ]
+
+                # Remove examples from prompts if requested
+                if not use_examples:
+                    for metric in filtered_metrics:
+                        metric.pop("examples", None)
+
+                if len(filtered_metrics) != len(metrics):
+                    found_names = {
+                        metric.get("name") for metric in filtered_metrics
+                    }
+                    missing = set(metrics) - found_names
+                    raise ValueError(f"Metrics not found: {missing}")
+
+                metrics_definitions.append(filtered_metrics)
+                continue
+
+            # Handle Path or List[FunctionCallMetric] (existing logic)
+            if not isinstance(metrics, (Path, list)):
+                raise TypeError(
+                    "metrics must be Path, List[FunctionCallMetric], List[str], or None"
+                )
+            if isinstance(metrics, list) and all(
+                isinstance(x, FunctionCallMetric) for x in metrics
+            ):
+                metrics_definitions.append(
+                    [metric.model_dump() for metric in metrics]
+                )
+            else:
+                if not metrics.is_file():
+                    raise FileNotFoundError(
+                        f"Metrics file not found: {metrics}"
+                    )
+                metrics_definitions.append(
+                    [
+                        json.loads(json_obj)
+                        for json_obj in metrics.read_text(
+                            encoding="utf8"
+                        ).splitlines()
+                        if json_obj.strip()
+                    ]
+                )
+
+        gen_defs, fun_defs, par_defs = None, None, None
+
+        if metrics_definitions:
+            gen_defs = metrics_definitions[0]
+            if len(metrics_definitions) >= 2:
+                fun_defs = metrics_definitions[1]
+            if len(metrics_definitions) >= 3:
+                par_defs = metrics_definitions[2]
+
+        # 3) Initialize semantic checker
+        self.semantic_checker = SemanticChecker(
+            general_metrics=gen_defs,
+            function_metrics=fun_defs,
+            parameter_metrics=par_defs,
+            metrics_client=self.metrics_client,
+            codegen_client=self.codegen_client,
+            transform_enabled=transform_enabled,
+        )
+
+    @staticmethod
+    def static_only(
+        inventory: List[ToolSpec],
+        call: ToolCall,
+    ) -> StaticResult:
+        """
+        Run schema-based static checks.
+
+        Returns:
+            StaticResult with per-check results and final_decision.
+        """
+        try:
+            return evaluate_static(inventory, call)
+        except Exception as e:
+            return StaticResult(
+                metrics={
+                    "json_schema_validation": StaticMetricResult(
+                        description="Invalid JSON schema",
+                        valid=False,
+                        explanation=f"error parsing JSON schema: {str(e)}",
+                    )
+                },
+                final_decision=False,
+            )
+
+    def semantic_sync(
+        self,
+        conversation: Union[str, List[Dict[str, str]]],
+        inventory: List[ToolSpec],
+        call: ToolCall,
+        retries: Optional[int] = 2,
+        transform_enabled: Optional[bool] = None,
+    ) -> SemanticResult:
+        """
+        Synchronous LLM-based semantic metrics (+ optional transforms).
+        """
+        # delegate to SemanticChecker
+        return self.semantic_checker.run_sync(
+            inventory,
+            call,
+            conversation,
+            retries=retries,
+            transform_enabled=transform_enabled,
+        )
+
+    def run_sync(
+        self,
+        conversation: Union[str, List[Dict[str, str]]],
+        inventory: List[ToolSpec],
+        call: ToolCall,
+        continue_on_static: Optional[bool] = False,
+        retries: Optional[int] = 1,
+        transform_enabled: Optional[bool] = None,
+    ) -> PipelineResult:
+        """
+        Full sync pipeline: static -> semantic -> assemble PipelineResult.
+        """
+        static_res = self.static_only(inventory, call)
+
+        if not static_res.final_decision and not continue_on_static:
+            inputs = FunctionCallInput(
+                conversation_context=conversation,
+                tools_inventory=inventory,
+                tool_call=call,
+            )
+            return PipelineResult(
+                inputs=inputs,
+                static=static_res,
+                semantic=SemanticResult(
+                    general=None,
+                    function_selection=None,
+                    parameter=None,
+                    transform=None,
+                ),
+                overall_valid=False,
+            )
+
+        semantic_res = self.semantic_sync(
+            conversation, inventory, call, retries, transform_enabled
+        )
+        return PipelineResult(
+            inputs=FunctionCallInput(
+                conversation_context=conversation,
+                tools_inventory=inventory,
+                tool_call=call,
+            ),
+            static=static_res,
+            semantic=semantic_res,
+            overall_valid=True,
+        )
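And a sketch of how the pipeline's static-only path might be invoked. static_only is a staticmethod that needs no LLM client, while the full run_sync additionally requires a configured LLMKitWrapper, whose construction is outside this hunk; the spec and call objects are the hypothetical ones from the adapter sketch above.

    # Sketch only: reuses the hypothetical `spec` and `call` from the adapter example.
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.pipeline import (
        ReflectionPipeline,
    )

    # Schema-level checks alone need no model client.
    static_result = ReflectionPipeline.static_only(inventory=[spec], call=call)
    print(static_result.final_decision)

    # The full pipeline (static -> semantic) needs an LLM client; how LLMKitWrapper
    # is constructed is not shown in this hunk, so this part is indicative only.
    # pipeline = ReflectionPipeline(metrics_client=my_llmkit_wrapper)
    # result = pipeline.run_sync(
    #     conversation=[{"role": "user", "content": "What's the weather in Paris?"}],
    #     inventory=[spec],
    #     call=call,
    # )
    # print(result.overall_valid)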