ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py
CHANGED
@@ -1,49 +1,64 @@
-
-
-
-
+import csv
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import List
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
+    WXOInferenceBackend,
     get_wxo_client,
-    WXOInferenceBackend
 )
-from
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    KnowledgeBaseMetricSummary,
+    TextMatchType,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import EvaluationData
-
-from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.utils import json_dump
 from wxo_agentic_evaluation.utils.utils import (
-    create_table,
     SummaryPanel,
-
+    create_table,
+    safe_divide,
 )
-from wxo_agentic_evaluation.utils import json_dump
-from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-import csv
-from rich.progress import Progress
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI


-def process_test_case(
+def process_test_case(
+    task_n, test_case, config, inference_backend, resource_map, llm_user
+):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
     with open(test_case, "r") as f:
         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend,
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-
+    (
+        history,
+        call_tracker,
+        conversational_search_data,
+    ) = evaluation_controller.run(
         task_n,
         test_case.story,
         agent_name=test_case.agent,
@@ -54,7 +69,8 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     if len(conversational_search_data) > 0:
@@ -73,7 +89,7 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
         messages=history,
         ground_truth=test_case,
         conversational_search_data=conversational_search_data,
-        resource_map=resource_map
+        resource_map=resource_map,
     )
     (
         keyword_semantic_matches,
@@ -85,7 +101,9 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
     for message in messages_with_reason:
         temp.append(message.model_dump())
     json_dump(
-        os.path.join(
+        os.path.join(
+            config.output_dir, "messages", tc_name + ".messages.analyze.json"
+        ),
         temp,
     )

@@ -108,18 +126,29 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     if config.num_workers > 1 and config.enable_manual_user_input:
-        rich.print(
-
+        rich.print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
         # reason: threads continue to stream messages while waiting for user input, which is not desired
         # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
-        config.auth_config.url,
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
-        wai_client=get_provider(
-
+        wai_client=get_provider(
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
         user_response_style=config.llm_user_config.user_response_style,
     )

@@ -127,7 +156,9 @@ def main(config: TestConfig):

     results_list = []

-    knowledge_base_output_folder =
+    knowledge_base_output_folder = (
+        Path(config.output_dir) / "knowledge_base_metrics"
+    )
     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
     detailed_rag_output_file = (
         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
@@ -143,7 +174,9 @@ def main(config: TestConfig):
         [
             os.path.basename(f).replace(".messages", "")
             for f in glob.glob(
-                os.path.join(
+                os.path.join(
+                    config.output_dir, "messages", "*.messages.json"
+                )
             )
         ]
     )
@@ -153,7 +186,7 @@ def main(config: TestConfig):
     if os.path.isdir(test_path):
         test_path = os.path.join(test_path, "*.json")
     test_cases.extend(sorted(glob.glob(test_path)))
-
+
     futures = []
     task_n = 0
     for test_case in test_cases:
@@ -161,7 +194,9 @@ def main(config: TestConfig):
             continue
         if config.skip_available_results:
             if test_case in available_res:
-                print(
+                print(
+                    f"Skipping test case {test_case} as results already exist."
+                )
                 continue

         future = executor.submit(
@@ -180,7 +215,8 @@ def main(config: TestConfig):
     if futures:
         with Progress() as progress:
             task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...",
+                f"[purple]Evaluating {len(futures)} tasks...",
+                total=len(futures),
             )
             for test_case, future in futures:
                 try:
@@ -200,27 +236,55 @@ def main(config: TestConfig):
         SummaryPanel(rag_metric_summary).print()

         with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-            json.dump(
+            json.dump(
+                rag_metric_summary.model_dump(by_alias=True)["detailed"],
+                f,
+                indent=4,
+            )

         with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
-            json.dump(
+            json.dump(
+                rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4
+            )

     if len(tool_call_metrics) > 0:
         # remove the average row if exist
         tool_call_metrics = [
-            row
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
         ]

-        def filter_display_only_values(
-
-
-
+        def filter_display_only_values(
+            tool_call_metric: ToolCallAndRoutingMetrics,
+        ):
+            row = {
+                "Dataset": tool_call_metric.dataset_name,
+                "Total Steps": tool_call_metric.total_steps,
+                "LLM Steps": tool_call_metric.llm_step,
+                "Total Tool Calls": tool_call_metric.total_tool_calls,
+                "Tool Call Precision": tool_call_metric.tool_call_precision,
+                "Tool Call Recall": tool_call_metric.tool_call_recall,
+                "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
+                "Text Match": tool_call_metric.text_match,
+                "Journey Success": tool_call_metric.is_success,
+                "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
+            }
             return row

         def create_avg_row(metrics: List[dict]):
-            avg_row = {
-
-
+            avg_row = {
+                "Dataset": "Summary (Average)",
+                "Total Steps": 0,
+                "LLM Steps": 0,
+                "Total Tool Calls": 0,
+                "Tool Call Precision": 0,
+                "Tool Call Recall": 0,
+                "Agent Routing Accuracy": 0,
+                "Text Match": 0,
+                "Journey Success": 0,
+                "Avg Resp Time (sec)": 0,
+            }
             if metrics:
                 for row in metrics:
                     avg_row["Total Steps"] += row["Total Steps"]
@@ -228,33 +292,77 @@ def main(config: TestConfig):
                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
                     avg_row["Tool Call Precision"] += row["Tool Call Precision"]
                     avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                    avg_row["Agent Routing Accuracy"] += row[
-
+                    avg_row["Agent Routing Accuracy"] += row[
+                        "Agent Routing Accuracy"
+                    ]
+                    avg_row["Text Match"] += (
+                        row["Text Match"] == TextMatchType.text_match.value
+                    )
                     avg_row["Journey Success"] += row["Journey Success"]
                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]

-                avg_row["Total Steps"] = round(
-
-
-                avg_row["
-
-
-                avg_row["
-
-
+                avg_row["Total Steps"] = round(
+                    safe_divide(avg_row["Total Steps"], len(metrics)), 2
+                )
+                avg_row["LLM Steps"] = round(
+                    safe_divide(avg_row["LLM Steps"], len(metrics)), 2
+                )
+                avg_row["Total Tool Calls"] = round(
+                    safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
+                )
+                avg_row["Tool Call Precision"] = round(
+                    safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
+                )
+                avg_row["Tool Call Recall"] = round(
+                    safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
+                )
+                avg_row["Agent Routing Accuracy"] = round(
+                    safe_divide(
+                        avg_row["Agent Routing Accuracy"], len(metrics)
+                    ),
+                    2,
+                )
+                avg_row["Text Match"] = round(
+                    safe_divide(
+                        avg_row["Text Match"],
+                        len(
+                            [
+                                row
+                                for row in metrics
+                                if row["Text Match"]
+                                != TextMatchType.text_match.na
+                            ]
+                        ),
+                    ),
+                    2,
+                )
+                avg_row["Journey Success"] = round(
+                    safe_divide(avg_row["Journey Success"], len(metrics)), 2
+                )
+                avg_row["Avg Resp Time (sec)"] = round(
+                    safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
+                )
             return avg_row

         tool_call_metrics_for_display = []
         for row in tool_call_metrics:
-            tool_call_metrics_for_display.append(
-
-
-
+            tool_call_metrics_for_display.append(
+                filter_display_only_values(row)
+            )
+        tool_call_metrics_for_display.append(
+            create_avg_row(tool_call_metrics_for_display)
+        )
+        tool_call_table_for_display = create_table(
+            tool_call_metrics_for_display
+        )
+
     if tool_call_table_for_display:
         tool_call_table_for_display.print()

     if len(tool_call_metrics) > 0:
-        tool_call_metrics = [
+        tool_call_metrics = [
+            metric.model_dump() for metric in tool_call_metrics
+        ]
         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
         header = list(tool_call_metrics[0].keys())

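The new create_avg_row helper above folds every metric column into a single "Summary (Average)" row by summing the per-dataset values and then dividing with the newly imported safe_divide instead of a bare division. The following is a minimal, self-contained sketch of that pattern; safe_divide here is assumed to fall back to a default when the denominator is zero (the real helper lives in wxo_agentic_evaluation/utils/utils.py and may differ), and the rows are invented for illustration.

# Illustrative sketch only: the real safe_divide ships in
# wxo_agentic_evaluation/utils/utils.py and may behave differently.
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    # Divide, falling back to `default` when the denominator is zero.
    return numerator / denominator if denominator else default


def average_column(rows, key):
    # Mirror of the create_avg_row pattern: sum a column, then
    # safe_divide by the number of rows and round to two decimals.
    total = sum(row[key] for row in rows)
    return round(safe_divide(total, len(rows)), 2)


rows = [{"Tool Call Precision": 1.0}, {"Tool Call Precision": 0.5}]
print(average_column(rows, "Tool Call Precision"))  # 0.75
print(average_column([], "Tool Call Precision"))    # 0.0, no ZeroDivisionError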
wxo_agentic_evaluation/metrics/metrics.py
CHANGED
@@ -1,10 +1,13 @@
 import math
-from typing import List, Mapping, Any, Tuple, Optional
 from enum import Enum
+from typing import Any, List, Mapping, Optional, Tuple

 from pydantic import BaseModel, computed_field

-from wxo_agentic_evaluation.metrics.llm_as_judge import
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore


@@ -13,7 +16,7 @@ def average(array):
         return math.nan

     else:
-        return sum(array)/len(array)
+        return sum(array) / len(array)


 class KnowledgeBaseMetrics(BaseModel):
@@ -54,7 +57,9 @@ class KnowledgeBaseMetricSummary(BaseModel):
             }
         else:
             values = groupby[name]
-            values.get("knowledge_base_name").append(
+            values.get("knowledge_base_name").append(
+                knowledge_base_name
+            )
             values.get("faithfulness").append(faithfulness)
             values.get("answer_relevancy").append(answer_relevancy)
             values.get("confidence_scores").append(confidence_scores)
@@ -109,6 +114,7 @@ class KeywordSemanticSearchMetric(BaseModel):
     message: str
     goal_detail: str

+
 class TextMatchType(Enum):
     text_match = "Summary Matched"
     text_mismatch = "Summary MisMatched"
@@ -117,12 +123,14 @@ class TextMatchType(Enum):

 class ToolCallAndRoutingMetrics(BaseModel):
     dataset_name: str = ""
-    total_steps: int=0
-    llm_step: int =0
+    total_steps: int = 0
+    llm_step: int = 0
     total_tool_calls: int = 0
     expected_tool_calls: int = 0
     correct_tool_calls: int = 0
-    relevant_tool_calls: int =
+    relevant_tool_calls: int = (
+        0  # calls with the same function but different args
+    )
     total_routing_calls: int = 0
     relevant_routing_calls: int = 0
     tool_calls_with_incorrect_parameter: int = 0
@@ -135,7 +143,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_recall(self) -> float:
         return round(
             (
-                self.correct_tool_calls/self.expected_tool_calls
+                self.correct_tool_calls / self.expected_tool_calls
                 if self.expected_tool_calls > 0
                 else 0.0
             ),
@@ -147,8 +155,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_precision(self) -> float:
         return round(
             (
-                (self.correct_tool_calls)
-                / self.total_tool_calls
+                (self.correct_tool_calls) / self.total_tool_calls
                 if self.total_tool_calls > 0
                 else 0.0
             ),
@@ -167,11 +174,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
             2,
         )

+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
     explanation: str

+
 class FailedSemanticTestCases(BaseModel):
     metric_name: str
     evidence: str
@@ -179,11 +188,16 @@ class FailedSemanticTestCases(BaseModel):
     output: int
     confidence: float

+
 class ReferenceLessEvalMetrics(BaseModel):
     dataset_name: str
     number_of_tool_calls: int
     number_of_successful_tool_calls: int
     number_of_static_failed_tool_calls: int
     number_of_semantic_failed_tool_calls: int
-    failed_static_tool_calls: Optional[
-
+    failed_static_tool_calls: Optional[
+        List[Tuple[int, List[FailedStaticTestCases]]]
+    ]
+    failed_semantic_tool_calls: Optional[
+        List[Tuple[int, List[FailedSemanticTestCases]]]
+    ]
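The ToolCallAndRoutingMetrics changes above only reformat the precision and recall computed fields, but they make the underlying arithmetic easy to restate. The standalone sketch below reproduces that arithmetic with invented counts; the real fields are pydantic computed_field properties on the model.

# Standalone restatement of the two computed fields; the counts are
# invented purely for illustration.
correct_tool_calls = 3    # calls that match the expected tool calls
expected_tool_calls = 4   # calls the ground truth expects
total_tool_calls = 5      # calls the agent actually made

tool_call_recall = round(
    correct_tool_calls / expected_tool_calls if expected_tool_calls > 0 else 0.0, 2
)
tool_call_precision = round(
    correct_tool_calls / total_tool_calls if total_tool_calls > 0 else 0.0, 2
)

print(tool_call_recall)     # 0.75 -> 3 of 4 expected calls were made correctly
print(tool_call_precision)  # 0.6  -> 3 of 5 actual calls were correct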
wxo_agentic_evaluation/prompt/template_render.py
CHANGED
@@ -1,7 +1,10 @@
-import jinja2
 from typing import List
+
+import jinja2
+
 from wxo_agentic_evaluation.type import ToolDefinition

+
 class JinjaTemplateRenderer:
     def __init__(self, template_path: str):
         self._template_env = jinja2.Environment(
@@ -20,7 +23,11 @@ class JinjaTemplateRenderer:

 class LlamaUserTemplateRenderer(JinjaTemplateRenderer):
     def render(
-        self,
+        self,
+        user_story: str,
+        user_response_style: List,
+        conversation_history: List,
+        attack_instructions: str = None,
     ) -> str:
         return super().render(
             user_story=user_story,
@@ -32,12 +39,17 @@ class LlamaUserTemplateRenderer(JinjaTemplateRenderer):

 class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
     def render(self, keywords_text: str, response_text: str) -> str:
-        return super().render(
+        return super().render(
+            keywords_text=keywords_text, response_text=response_text
+        )


 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
     def render(self, expected_text: str, actual_text: str) -> str:
-        return super().render(
+        return super().render(
+            expected_text=expected_text, actual_text=actual_text
+        )
+

 class BadToolDescriptionRenderer(JinjaTemplateRenderer):
     def render(self, tool_definition: ToolDefinition) -> str:
@@ -51,7 +63,9 @@ class LlamaKeywordsGenerationTemplateRenderer(JinjaTemplateRenderer):

 class FaithfulnessTemplateRenderer(JinjaTemplateRenderer):
     def render(self, claim, retrieval_context):
-        return super().render(
+        return super().render(
+            claim=claim, supporting_evidence=retrieval_context
+        )


 class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):
@@ -60,13 +74,16 @@ class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):


 class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
-    def render(
+    def render(
+        self, user_story: str, agent_name: str, available_tools: str
+    ) -> str:
         return super().render(
             user_story=user_story,
             agent_name=agent_name,
             available_tools=available_tools,
         )
-
+
+
 class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
     def render(self, tool_signature: str, step: dict, inputs: dict) -> str:
         return super().render(
@@ -75,8 +92,9 @@ class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
             inputs=inputs,
         )

+
 class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, tool_call_history: List, available_tools:str) -> str:
+    def render(self, tool_call_history: List, available_tools: str) -> str:
         return super().render(
             tool_call_history=tool_call_history,
             available_tools=available_tools,
@@ -102,6 +120,7 @@ class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
             example_str=example_str,
         )

+
 class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -110,7 +129,8 @@ class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
         return super().render(
             input_data=input_data,
         )
-
+
+
 class OnPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -125,7 +145,8 @@ class OnPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
         )
-
+
+
 class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -135,4 +156,4 @@ class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
         return super().render(
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
-        )
+        )
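All of the renderers touched above follow the same pattern: JinjaTemplateRenderer owns a jinja2 environment, and each subclass exposes a typed render() that forwards keyword arguments to super().render(). Below is a minimal, self-contained sketch of that pattern using a hypothetical inline greeting template; the real base class loads template files from the package's prompt directory, so the constructor and the template names here are assumptions.

# Minimal sketch of the renderer pattern; names below are illustrative only.
import jinja2


class SketchTemplateRenderer:
    # Stand-in for JinjaTemplateRenderer: the real class builds a
    # jinja2.Environment around template files; an in-memory Template
    # keeps this example self-contained.
    def __init__(self, template_source: str):
        self._template = jinja2.Template(template_source)

    def render(self, **kwargs) -> str:
        return self._template.render(**kwargs)


class GreetingTemplateRenderer(SketchTemplateRenderer):
    # Subclasses expose a typed render() and forward keyword arguments,
    # mirroring e.g. KeywordMatchingTemplateRenderer above.
    def render(self, user_name: str, agent_name: str) -> str:
        return super().render(user_name=user_name, agent_name=agent_name)


renderer = GreetingTemplateRenderer("Hello {{ user_name }}, I am {{ agent_name }}.")
print(renderer.render(user_name="Ada", agent_name="wxo-agent"))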