ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (25)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/METADATA +7 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/RECORD +24 -21
  3. wxo_agentic_evaluation/analyze_run.py +357 -28
  4. wxo_agentic_evaluation/arg_configs.py +1 -0
  5. wxo_agentic_evaluation/evaluation_package.py +129 -13
  6. wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
  7. wxo_agentic_evaluation/external_agent/types.py +3 -9
  8. wxo_agentic_evaluation/inference_backend.py +27 -8
  9. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  10. wxo_agentic_evaluation/main.py +202 -66
  11. wxo_agentic_evaluation/main_v2.py +426 -0
  12. wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
  13. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  14. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  15. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  16. wxo_agentic_evaluation/prompt/template_render.py +14 -0
  17. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  18. wxo_agentic_evaluation/record_chat.py +20 -24
  19. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
  20. wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
  21. wxo_agentic_evaluation/service_instance.py +14 -14
  22. wxo_agentic_evaluation/utils/utils.py +32 -0
  23. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  24. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/WHEEL +0 -0
  25. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,11 @@ import dataclasses
 import glob
 import json
 import os
+import re
 import traceback
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from pathlib import Path
 from typing import List
 
@@ -41,10 +44,17 @@ from wxo_agentic_evaluation.utils.utils import (
 
 
 def process_test_case(
-    task_n, test_case, config, inference_backend, resource_map, llm_user
+    task_n,
+    test_case,
+    config,
+    inference_backend,
+    resource_map,
+    llm_user,
+    run_idx: int = 0,
 ):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if getattr(config, "n_runs", 1) > 1 else ""
     with open(test_case, "r") as f:
         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
 
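The new run_tag suffix drives per-run output file naming. A minimal standalone sketch of the naming scheme, assuming a hypothetical test-case name and run counts (not taken from this diff):

# Sketch only: mirrors the run_tag naming added in process_test_case.
def message_file_name(tc_name: str, run_idx: int, n_runs: int) -> str:
    # Single-run configs keep the legacy name; multi-run configs append ".runN".
    run_tag = f".run{run_idx + 1}" if n_runs > 1 else ""
    return tc_name + run_tag + ".messages.json"

print(message_file_name("banking_flow", 0, 1))  # banking_flow.messages.json
print(message_file_name("banking_flow", 0, 3))  # banking_flow.run1.messages.json
print(message_file_name("banking_flow", 2, 3))  # banking_flow.run3.messages.json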
@@ -69,12 +79,14 @@ def process_test_case(
         result.append(message.model_dump())
 
     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        os.path.join(
+            config.output_dir, "messages", tc_name + run_tag + ".messages.json"
+        ),
         result,
     )
 
     if len(conversational_search_data) > 0:
-        fn = tc_name + ".retrieval_context.json"
+        fn = tc_name + run_tag + ".retrieval_context.json"
         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
         out_folder.mkdir(exist_ok=True)
         rc = [context.model_dump() for context in conversational_search_data]
@@ -100,15 +112,60 @@ def process_test_case(
     temp = []
     for message in messages_with_reason:
         temp.append(message.model_dump())
+    expected_tools = [
+        gd.tool_name
+        for gd in test_case.goal_details
+        if getattr(gd, "type", None) == "tool_call"
+    ]
+
+    raw_actual = []
+    for m in history:
+        try:
+            if getattr(m, "type", None) == "tool_call":
+                payload = (
+                    json.loads(m.content)
+                    if isinstance(m.content, str)
+                    else m.content
+                )
+                name = (payload or {}).get("name")
+                if name:
+                    raw_actual.append(str(name).strip())
+        except Exception:
+            pass
+
+    expected_set = set(expected_tools)
+    agent_names = (
+        set(getattr(resource_map, "agent2tools", {}).keys())
+        if resource_map
+        else set()
+    )
+
+    filtered_actual_tool_calls = [n for n in raw_actual if n not in agent_names]
+
+    missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+    temp.append(
+        {
+            "meta": {
+                "expected_tool_calls": expected_tools,
+                "actual_tool_calls": filtered_actual_tool_calls,
+                "missed_tool_calls": missed_tool_calls,
+            }
+        }
+    )
     json_dump(
         os.path.join(
-            config.output_dir, "messages", tc_name + ".messages.analyze.json"
+            config.output_dir,
+            "messages",
+            tc_name + run_tag + ".messages.analyze.json",
         ),
         temp,
     )
 
     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
+        os.path.join(
+            config.output_dir, "messages", tc_name + run_tag + ".metrics.json"
+        ),
        metrics.model_dump(),
     )
 
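The block above records expected vs. observed tool calls alongside the analyzed messages. A small sketch of the set arithmetic and the resulting "meta" record, with made-up tool names:

# Sketch only: the "meta" record appended to <tc_name><run_tag>.messages.analyze.json.
expected_tools = ["get_account_balance", "transfer_funds"]  # from tool_call goal_details (names made up)
filtered_actual_tool_calls = ["get_account_balance"]        # tool_call messages, agent names filtered out
missed_tool_calls = sorted(set(expected_tools) - set(filtered_actual_tool_calls))

meta_record = {
    "meta": {
        "expected_tool_calls": expected_tools,
        "actual_tool_calls": filtered_actual_tool_calls,
        "missed_tool_calls": missed_tool_calls,  # -> ["transfer_funds"]
    }
}
print(meta_record)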
@@ -125,6 +182,9 @@ def process_test_case(
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if not getattr(config, "skip_available_results", False):
+        ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config.output_dir = os.path.join(config.output_dir, ts)
     if config.num_workers > 1 and config.enable_manual_user_input:
         rich.print(
             "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
@@ -168,18 +228,24 @@ def main(config: TestConfig):
     )
 
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
-    available_res = set()
+
+    def _removesuffix(s, suf):  # py<3.9 safety
+        return s[: -len(suf)] if s.endswith(suf) else s
+
+    available_runs = defaultdict(set)
     if config.skip_available_results:
-        available_res = set(
-            [
-                os.path.basename(f).replace(".messages", "")
-                for f in glob.glob(
-                    os.path.join(
-                        config.output_dir, "messages", "*.messages.json"
-                    )
-                )
-            ]
-        )
+        for f in glob.glob(
+            os.path.join(config.output_dir, "messages", "*.messages.json")
+        ):
+            # strip the fixed tail
+            name = _removesuffix(os.path.basename(f), ".messages.json")
+            # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+            m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+            if not m:
+                continue
+            stem = m.group("stem")
+            run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
+            available_runs[stem].add(run_num)
 
     test_cases = []
     for test_path in config.test_paths:
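The regex above accepts both the legacy single-run filenames and the new .runN variants. A self-contained sketch of how it buckets existing result files into available_runs, using made-up file stems:

# Sketch only: parsing "<stem>" and "<stem>.runN" message files into available_runs.
import re
from collections import defaultdict

available_runs = defaultdict(set)
for name in ["banking_flow", "banking_flow.run2", "hr_flow.run1"]:  # made-up stems
    m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
    if not m:
        continue
    available_runs[m.group("stem")].add(int(m.group("run") or 1))  # no suffix ⇒ run 1

print(dict(available_runs))  # {'banking_flow': {1, 2}, 'hr_flow': {1}}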
@@ -189,28 +255,35 @@ def main(config: TestConfig):
 
     futures = []
     task_n = 0
+    n_runs = getattr(config, "n_runs", 1)
     for test_case in test_cases:
         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
             continue
-        if config.skip_available_results:
-            if test_case in available_res:
+        stem = Path(test_case).stem
+
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+
+            # Skip precisely this (test, run) if results exist
+            if config.skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
                 print(
-                    f"Skipping test case {test_case} as results already exist."
+                    f"Skipping {stem} run {run_number} as results already exist."
                 )
                 continue
-
-        future = executor.submit(
-            process_test_case,
-            task_n,
-            test_case,
-            config,
-            inference_backend,
-            resource_map,
-            llm_user,
-        )
-
-        futures.append((test_case, future))
-        task_n += 1
+            future = executor.submit(
+                process_test_case,
+                task_n,
+                test_case,
+                config,
+                inference_backend,
+                resource_map,
+                llm_user,
+                run_idx,  # 👈 pass run index
+            )
+            futures.append(((test_case, run_idx), future))
+            task_n += 1
 
     if futures:
         with Progress() as progress:
@@ -218,7 +291,7 @@ def main(config: TestConfig):
                 f"[purple]Evaluating {len(futures)} tasks...",
                 total=len(futures),
             )
-            for test_case, future in futures:
+            for (test_case, run_idx), future in futures:
                 try:
                     results_list.extend(future.result())
                 except Exception as e:
@@ -275,6 +348,7 @@ def main(config: TestConfig):
         def create_avg_row(metrics: List[dict]):
             avg_row = {
                 "Dataset": "Summary (Average)",
+                "Runs": 0,
                 "Total Steps": 0,
                 "LLM Steps": 0,
                 "Total Tool Calls": 0,
@@ -287,6 +361,7 @@ def main(config: TestConfig):
             }
             if metrics:
                 for row in metrics:
+                    avg_row["Runs"] += row.get("Runs", 0)
                     avg_row["Total Steps"] += row["Total Steps"]
                     avg_row["LLM Steps"] += row["LLM Steps"]
                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
@@ -295,63 +370,124 @@ def main(config: TestConfig):
                     avg_row["Agent Routing Accuracy"] += row[
                         "Agent Routing Accuracy"
                     ]
-                    avg_row["Text Match"] += (
-                        row["Text Match"] == TextMatchType.text_match.value
-                    )
+                    avg_row["Text Match"] += row["Text Match"]
                     avg_row["Journey Success"] += row["Journey Success"]
                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
 
+                n = len(metrics)
+                # Average over datasets
+                avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
                 avg_row["Total Steps"] = round(
-                    safe_divide(avg_row["Total Steps"], len(metrics)), 2
+                    safe_divide(avg_row["Total Steps"], n), 2
                 )
                 avg_row["LLM Steps"] = round(
-                    safe_divide(avg_row["LLM Steps"], len(metrics)), 2
+                    safe_divide(avg_row["LLM Steps"], n), 2
                 )
                 avg_row["Total Tool Calls"] = round(
-                    safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
+                    safe_divide(avg_row["Total Tool Calls"], n), 2
                 )
                 avg_row["Tool Call Precision"] = round(
-                    safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
+                    safe_divide(avg_row["Tool Call Precision"], n), 2
                 )
                 avg_row["Tool Call Recall"] = round(
-                    safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
+                    safe_divide(avg_row["Tool Call Recall"], n), 2
                 )
                 avg_row["Agent Routing Accuracy"] = round(
-                    safe_divide(
-                        avg_row["Agent Routing Accuracy"], len(metrics)
-                    ),
-                    2,
+                    safe_divide(avg_row["Agent Routing Accuracy"], n), 2
                 )
                 avg_row["Text Match"] = round(
-                    safe_divide(
-                        avg_row["Text Match"],
-                        len(
-                            [
-                                row
-                                for row in metrics
-                                if row["Text Match"]
-                                != TextMatchType.text_match.na
-                            ]
-                        ),
-                    ),
-                    2,
+                    safe_divide(avg_row["Text Match"], n), 2
                 )
                 avg_row["Journey Success"] = round(
-                    safe_divide(avg_row["Journey Success"], len(metrics)), 2
+                    safe_divide(avg_row["Journey Success"], n), 2
                 )
                 avg_row["Avg Resp Time (sec)"] = round(
-                    safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
+                    safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
                 )
+
             return avg_row
 
-        tool_call_metrics_for_display = []
-        for row in tool_call_metrics:
-            tool_call_metrics_for_display.append(
-                filter_display_only_values(row)
+        grouped = defaultdict(list)
+        for m in tool_call_metrics:
+            grouped[m.dataset_name].append(filter_display_only_values(m))
+
+        numeric_keys = [
+            "Total Steps",
+            "LLM Steps",
+            "Total Tool Calls",
+            "Tool Call Precision",
+            "Tool Call Recall",
+            "Agent Routing Accuracy",
+            "Avg Resp Time (sec)",
+        ]
+
+        def mean(vals):
+            return round(sum(vals) / len(vals), 2) if vals else None
+
+        def _to_pct(value, decimals=0):
+            if value is None:
+                return "NA"
+            try:
+                return f"{round(float(value) * 100, decimals)}%"
+            except Exception:
+                return "NA"
+
+        per_test_rows = []
+        for ds, rows in grouped.items():
+            out = {"Dataset": ds}
+            # Average numeric columns over runs
+            for k in numeric_keys:
+                out[k] = mean(
+                    [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+                )
+
+            # Add total runs per dataset
+            out["Runs"] = round(float(len(rows)), 2)
+
+            # Journey Success -> numeric fraction in [0,1]
+            js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+            out["Journey Success"] = round(
+                safe_divide(sum(js_vals), len(js_vals)), 2
             )
-        tool_call_metrics_for_display.append(
-            create_avg_row(tool_call_metrics_for_display)
-        )
+
+            # Text Match -> numeric fraction in [0,1]
+            tm_hits = 0
+            tm_den = len(rows)
+            for r in rows:
+                val = r.get("Text Match")
+                if str(val).strip() == TextMatchType.text_match.value:
+                    tm_hits += 1
+            out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+            per_test_rows.append(out)
+
+        # Keep the old overall-avg logic: apply it over the per-test rows (each test counted once)
+        overall_row = create_avg_row(per_test_rows)
+        tool_call_metrics_for_display = per_test_rows + [overall_row]
+
+        column_order = [
+            "Dataset",
+            "Runs",
+            "Total Steps",
+            "LLM Steps",
+            "Total Tool Calls",
+            "Tool Call Precision",
+            "Tool Call Recall",
+            "Agent Routing Accuracy",
+            "Text Match",
+            "Journey Success",
+            "Avg Resp Time (sec)",
+        ]
+        for row in tool_call_metrics_for_display:
+            row["Text Match"] = _to_pct(row.get("Text Match"), decimals=0)
+            row["Journey Success"] = _to_pct(
+                row.get("Journey Success"), decimals=0
+            )
+
+        tool_call_metrics_for_display = [
+            {col: row.get(col, "") for col in column_order}
+            for row in tool_call_metrics_for_display
+        ]
         tool_call_table_for_display = create_table(
             tool_call_metrics_for_display
        )
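The display pass converts the per-dataset Text Match and Journey Success fractions into percentage strings via _to_pct. A small sketch with made-up values, showing the exact formatting this function produces (decimals=0 still yields a trailing .0 because round() returns a float):

# Sketch only: same _to_pct formatting as above, applied to a hypothetical per-dataset row.
def _to_pct(value, decimals=0):
    if value is None:
        return "NA"
    try:
        return f"{round(float(value) * 100, decimals)}%"
    except Exception:
        return "NA"

row = {"Dataset": "banking_flow", "Runs": 3, "Text Match": 0.67, "Journey Success": 1.0}
row["Text Match"] = _to_pct(row["Text Match"])            # '67.0%'
row["Journey Success"] = _to_pct(row["Journey Success"])  # '100.0%'
print(row)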