ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic; see the registry's advisory for details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py
CHANGED
@@ -3,8 +3,11 @@ import dataclasses
 import glob
 import json
 import os
+import re
 import traceback
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from pathlib import Path
 from typing import List
 
@@ -41,10 +44,17 @@ from wxo_agentic_evaluation.utils.utils import (
 
 
 def process_test_case(
-    task_n,
+    task_n,
+    test_case,
+    config,
+    inference_backend,
+    resource_map,
+    llm_user,
+    run_idx: int = 0,
 ):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if getattr(config, "n_runs", 1) > 1 else ""
     with open(test_case, "r") as f:
         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
 
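For orientation, here is a minimal standalone sketch (not part of the diff) of how the new run_tag is meant to shape output filenames once n_runs is greater than 1. The SimpleNamespace config below is a stand-in for the real TestConfig.

# Sketch only: SimpleNamespace stands in for TestConfig; n_runs mirrors the attribute read above.
from types import SimpleNamespace

def run_tag_for(config, run_idx: int) -> str:
    # Same rule as the added line in process_test_case: suffix only when multiple runs are requested.
    return f".run{run_idx + 1}" if getattr(config, "n_runs", 1) > 1 else ""

single = SimpleNamespace(n_runs=1)
multi = SimpleNamespace(n_runs=3)
print("tc" + run_tag_for(single, 0) + ".messages.json")  # tc.messages.json
print("tc" + run_tag_for(multi, 2) + ".messages.json")   # tc.run3.messages.json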
@@ -69,12 +79,14 @@ def process_test_case(
             result.append(message.model_dump())
 
         json_dump(
-            os.path.join(
+            os.path.join(
+                config.output_dir, "messages", tc_name + run_tag + ".messages.json"
+            ),
             result,
         )
 
         if len(conversational_search_data) > 0:
-            fn = tc_name + ".retrieval_context.json"
+            fn = tc_name + run_tag + ".retrieval_context.json"
             out_folder = Path(config.output_dir) / "knowledge_base_metrics"
             out_folder.mkdir(exist_ok=True)
             rc = [context.model_dump() for context in conversational_search_data]
@@ -100,15 +112,60 @@ def process_test_case(
         temp = []
         for message in messages_with_reason:
             temp.append(message.model_dump())
+        expected_tools = [
+            gd.tool_name
+            for gd in test_case.goal_details
+            if getattr(gd, "type", None) == "tool_call"
+        ]
+
+        raw_actual = []
+        for m in history:
+            try:
+                if getattr(m, "type", None) == "tool_call":
+                    payload = (
+                        json.loads(m.content)
+                        if isinstance(m.content, str)
+                        else m.content
+                    )
+                    name = (payload or {}).get("name")
+                    if name:
+                        raw_actual.append(str(name).strip())
+            except Exception:
+                pass
+
+        expected_set = set(expected_tools)
+        agent_names = (
+            set(getattr(resource_map, "agent2tools", {}).keys())
+            if resource_map
+            else set()
+        )
+
+        filtered_actual_tool_calls = [n for n in raw_actual if n not in agent_names]
+
+        missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+        temp.append(
+            {
+                "meta": {
+                    "expected_tool_calls": expected_tools,
+                    "actual_tool_calls": filtered_actual_tool_calls,
+                    "missed_tool_calls": missed_tool_calls,
+                }
+            }
+        )
         json_dump(
             os.path.join(
-                config.output_dir,
+                config.output_dir,
+                "messages",
+                tc_name + run_tag + ".messages.analyze.json",
             ),
             temp,
         )
 
         json_dump(
-            os.path.join(
+            os.path.join(
+                config.output_dir, "messages", tc_name + run_tag + ".metrics.json"
+            ),
             metrics.model_dump(),
         )
 
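The added block above derives a per-conversation tool-call summary before writing the .messages.analyze.json file. Below is a self-contained sketch of just the set arithmetic, with hypothetical tool and agent names standing in for test_case.goal_details, the message history, and resource_map.agent2tools.

# Hypothetical inputs; in main.py these come from goal_details, the message history, and resource_map.
expected_tools = ["get_weather", "book_flight"]   # goal_details entries of type "tool_call"
raw_actual = ["travel_agent", "get_weather"]      # names seen on tool_call messages
agent_names = {"travel_agent"}                    # agent names are not counted as tools

filtered_actual = [n for n in raw_actual if n not in agent_names]
missed = sorted(set(expected_tools) - set(filtered_actual))

meta = {
    "expected_tool_calls": expected_tools,   # ['get_weather', 'book_flight']
    "actual_tool_calls": filtered_actual,    # ['get_weather']
    "missed_tool_calls": missed,             # ['book_flight']
}
print(meta)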
@@ -125,6 +182,9 @@ def process_test_case(
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if not getattr(config, "skip_available_results", False):
+        ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config.output_dir = os.path.join(config.output_dir, ts)
     if config.num_workers > 1 and config.enable_manual_user_input:
         rich.print(
             "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
@@ -168,18 +228,24 @@ def main(config: TestConfig):
     )
 
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
-
+
+    def _removesuffix(s, suf):  # py<3.9 safety
+        return s[: -len(suf)] if s.endswith(suf) else s
+
+    available_runs = defaultdict(set)
     if config.skip_available_results:
-
-
-
-
-
-
-
-
-
-
+        for f in glob.glob(
+            os.path.join(config.output_dir, "messages", "*.messages.json")
+        ):
+            # strip the fixed tail
+            name = _removesuffix(os.path.basename(f), ".messages.json")
+            # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+            m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+            if not m:
+                continue
+            stem = m.group("stem")
+            run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
+            available_runs[stem].add(run_num)
 
     test_cases = []
     for test_path in config.test_paths:
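The skip logic is the inverse of the run_tag naming: strip the fixed .messages.json tail, then split an optional .runN suffix. A small sketch with made-up filenames showing the regex round-trip:

import re

def parse_messages_filename(basename: str):
    # Strip the fixed tail, then match "<stem>" (single run) or "<stem>.runN" (multi-run).
    name = basename[: -len(".messages.json")] if basename.endswith(".messages.json") else basename
    m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
    if not m:
        return None
    return m.group("stem"), int(m.group("run") or 1)  # no suffix means run 1

print(parse_messages_filename("banking_tc1.messages.json"))       # ('banking_tc1', 1)
print(parse_messages_filename("banking_tc1.run3.messages.json"))  # ('banking_tc1', 3)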
@@ -189,28 +255,35 @@ def main(config: TestConfig):
 
     futures = []
     task_n = 0
+    n_runs = getattr(config, "n_runs", 1)
    for test_case in test_cases:
         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
             continue
-
-
+        stem = Path(test_case).stem
+
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+
+            # Skip precisely this (test, run) if results exist
+            if config.skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
                 print(
-                    f"Skipping
+                    f"Skipping {stem} run {run_number} as results already exist."
                 )
                 continue
-
-
-
-
-
-
-
-
-
-
-
-
-            task_n += 1
+            future = executor.submit(
+                process_test_case,
+                task_n,
+                test_case,
+                config,
+                inference_backend,
+                resource_map,
+                llm_user,
+                run_idx,  # 👈 pass run index
+            )
+            futures.append(((test_case, run_idx), future))
+            task_n += 1
 
     if futures:
         with Progress() as progress:
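Taken together, the rewritten loop submits one task per (test case, run) pair and skips only the pairs that already have results. A sketch of that decision with fabricated available_runs contents:

from collections import defaultdict
from pathlib import Path

# Fabricated state: banking_tc1 already has results for runs 1 and 2.
available_runs = defaultdict(set, {"banking_tc1": {1, 2}})
test_cases = ["tests/banking_tc1.json", "tests/hr_tc2.json"]
n_runs = 3
skip_available_results = True

to_submit = []
for test_case in test_cases:
    stem = Path(test_case).stem
    for run_idx in range(n_runs):
        run_number = run_idx + 1
        if skip_available_results and run_number in available_runs.get(stem, set()):
            continue  # this (test, run) already has results on disk
        to_submit.append((stem, run_number))

print(to_submit)  # [('banking_tc1', 3), ('hr_tc2', 1), ('hr_tc2', 2), ('hr_tc2', 3)]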
@@ -218,7 +291,7 @@ def main(config: TestConfig):
                 f"[purple]Evaluating {len(futures)} tasks...",
                 total=len(futures),
             )
-            for test_case, future in futures:
+            for (test_case, run_idx), future in futures:
                 try:
                     results_list.extend(future.result())
                 except Exception as e:
@@ -275,6 +348,7 @@ def main(config: TestConfig):
     def create_avg_row(metrics: List[dict]):
         avg_row = {
             "Dataset": "Summary (Average)",
+            "Runs": 0,
             "Total Steps": 0,
             "LLM Steps": 0,
             "Total Tool Calls": 0,
@@ -287,6 +361,7 @@ def main(config: TestConfig):
         }
         if metrics:
             for row in metrics:
+                avg_row["Runs"] += row.get("Runs", 0)
                 avg_row["Total Steps"] += row["Total Steps"]
                 avg_row["LLM Steps"] += row["LLM Steps"]
                 avg_row["Total Tool Calls"] += row["Total Tool Calls"]
@@ -295,63 +370,124 @@ def main(config: TestConfig):
                 avg_row["Agent Routing Accuracy"] += row[
                     "Agent Routing Accuracy"
                 ]
-                avg_row["Text Match"] +=
-                    row["Text Match"] == TextMatchType.text_match.value
-                )
+                avg_row["Text Match"] += row["Text Match"]
                 avg_row["Journey Success"] += row["Journey Success"]
                 avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
 
+            n = len(metrics)
+            # Average over datasets
+            avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
             avg_row["Total Steps"] = round(
-                safe_divide(avg_row["Total Steps"],
+                safe_divide(avg_row["Total Steps"], n), 2
             )
             avg_row["LLM Steps"] = round(
-                safe_divide(avg_row["LLM Steps"],
+                safe_divide(avg_row["LLM Steps"], n), 2
             )
             avg_row["Total Tool Calls"] = round(
-                safe_divide(avg_row["Total Tool Calls"],
+                safe_divide(avg_row["Total Tool Calls"], n), 2
             )
             avg_row["Tool Call Precision"] = round(
-                safe_divide(avg_row["Tool Call Precision"],
+                safe_divide(avg_row["Tool Call Precision"], n), 2
             )
             avg_row["Tool Call Recall"] = round(
-                safe_divide(avg_row["Tool Call Recall"],
+                safe_divide(avg_row["Tool Call Recall"], n), 2
             )
             avg_row["Agent Routing Accuracy"] = round(
-                safe_divide(
-                    avg_row["Agent Routing Accuracy"], len(metrics)
-                ),
-                2,
+                safe_divide(avg_row["Agent Routing Accuracy"], n), 2
             )
             avg_row["Text Match"] = round(
-                safe_divide(
-                    avg_row["Text Match"],
-                    len(
-                        [
-                            row
-                            for row in metrics
-                            if row["Text Match"]
-                            != TextMatchType.text_match.na
-                        ]
-                    ),
-                ),
-                2,
+                safe_divide(avg_row["Text Match"], n), 2
             )
             avg_row["Journey Success"] = round(
-                safe_divide(avg_row["Journey Success"],
+                safe_divide(avg_row["Journey Success"], n), 2
             )
             avg_row["Avg Resp Time (sec)"] = round(
-                safe_divide(avg_row["Avg Resp Time (sec)"],
+                safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
             )
+
         return avg_row
 
-
-    for
-
-
+    grouped = defaultdict(list)
+    for m in tool_call_metrics:
+        grouped[m.dataset_name].append(filter_display_only_values(m))
+
+    numeric_keys = [
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Avg Resp Time (sec)",
+    ]
+
+    def mean(vals):
+        return round(sum(vals) / len(vals), 2) if vals else None
+
+    def _to_pct(value, decimals=0):
+        if value is None:
+            return "NA"
+        try:
+            return f"{round(float(value) * 100, decimals)}%"
+        except Exception:
+            return "NA"
+
+    per_test_rows = []
+    for ds, rows in grouped.items():
+        out = {"Dataset": ds}
+        # Average numeric columns over runs
+        for k in numeric_keys:
+            out[k] = mean(
+                [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+            )
+
+        # Add total runs per dataset
+        out["Runs"] = round(float(len(rows)), 2)
+
+        # Journey Success -> numeric fraction in [0,1]
+        js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+        out["Journey Success"] = round(
+            safe_divide(sum(js_vals), len(js_vals)), 2
         )
-
-
-
+
+        # Text Match -> numeric fraction in [0,1]
+        tm_hits = 0
+        tm_den = len(rows)
+        for r in rows:
+            val = r.get("Text Match")
+            if str(val).strip() == TextMatchType.text_match.value:
+                tm_hits += 1
+        out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+        per_test_rows.append(out)
+
+    # Keep the old overall-avg logic: apply it over the per-test rows (each test counted once)
+    overall_row = create_avg_row(per_test_rows)
+    tool_call_metrics_for_display = per_test_rows + [overall_row]
+
+    column_order = [
+        "Dataset",
+        "Runs",
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Text Match",
+        "Journey Success",
+        "Avg Resp Time (sec)",
+    ]
+    for row in tool_call_metrics_for_display:
+        row["Text Match"] = _to_pct(row.get("Text Match"), decimals=0)
+        row["Journey Success"] = _to_pct(
+            row.get("Journey Success"), decimals=0
+        )
+
+    tool_call_metrics_for_display = [
+        {col: row.get(col, "") for col in column_order}
+        for row in tool_call_metrics_for_display
+    ]
     tool_call_table_for_display = create_table(
         tool_call_metrics_for_display
     )
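To summarize the new display path: rows from repeated runs of the same dataset are grouped, numeric columns are averaged, Journey Success and Text Match are collapsed to fractions and then rendered as percentages, and create_avg_row runs once over the per-test rows. A condensed, standalone sketch with two fabricated runs; TEXT_MATCH is a placeholder for TextMatchType.text_match.value, whose actual string is not shown in this diff.

from collections import defaultdict

TEXT_MATCH = "text_match"  # assumption: stands in for TextMatchType.text_match.value

# Two fabricated runs of the same dataset (subset of the real columns).
rows = [
    {"Dataset": "banking_tc1", "Tool Call Recall": 1.0, "Journey Success": True,  "Text Match": TEXT_MATCH},
    {"Dataset": "banking_tc1", "Tool Call Recall": 0.5, "Journey Success": False, "Text Match": "no_match"},
]

grouped = defaultdict(list)
for r in rows:
    grouped[r["Dataset"]].append(r)

def mean(vals):
    return round(sum(vals) / len(vals), 2) if vals else None

def to_pct(value, decimals=0):
    return "NA" if value is None else f"{round(float(value) * 100, decimals)}%"

for ds, run_rows in grouped.items():
    out = {
        "Dataset": ds,
        "Runs": len(run_rows),
        "Tool Call Recall": mean([r["Tool Call Recall"] for r in run_rows]),
        # Boolean / label columns become fractions over runs, then display percentages.
        "Journey Success": to_pct(sum(bool(r["Journey Success"]) for r in run_rows) / len(run_rows)),
        "Text Match": to_pct(sum(r["Text Match"] == TEXT_MATCH for r in run_rows) / len(run_rows)),
    }
    print(out)
# {'Dataset': 'banking_tc1', 'Runs': 2, 'Tool Call Recall': 0.75,
#  'Journey Success': '50.0%', 'Text Match': '50.0%'}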