ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (27)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
  3. wxo_agentic_evaluation/analyze_run.py +357 -28
  4. wxo_agentic_evaluation/arg_configs.py +2 -1
  5. wxo_agentic_evaluation/evaluation.py +42 -0
  6. wxo_agentic_evaluation/evaluation_package.py +132 -13
  7. wxo_agentic_evaluation/inference_backend.py +52 -14
  8. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  9. wxo_agentic_evaluation/main.py +202 -66
  10. wxo_agentic_evaluation/main_v2.py +426 -0
  11. wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
  12. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  13. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  14. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  15. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  16. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  17. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  18. wxo_agentic_evaluation/prompt/template_render.py +14 -0
  19. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
  21. wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
  22. wxo_agentic_evaluation/service_instance.py +79 -10
  23. wxo_agentic_evaluation/service_provider/__init__.py +1 -1
  24. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
  25. wxo_agentic_evaluation/utils/utils.py +32 -0
  26. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
  27. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main_v2.py (new file)
@@ -0,0 +1,426 @@
+ import csv
+ import dataclasses
+ import glob
+ import json
+ import os
+ import traceback
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+ from typing import List
+
+ import rich
+ import yaml
+ from jsonargparse import CLI
+ from rich.progress import Progress
+
+ from wxo_agentic_evaluation.arg_configs import TestConfig
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+ from wxo_agentic_evaluation.inference_backend import (
+     EvaluationController,
+     WXOInferenceBackend,
+     get_wxo_client,
+ )
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.metrics.metrics import (
+     KnowledgeBaseMetricSummary,
+     TextMatchType,
+     ToolCallAndRoutingMetrics,
+ )
+ from wxo_agentic_evaluation.prompt.template_render import (
+     LlamaUserTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.type import EvaluationData
+ from wxo_agentic_evaluation.utils import json_dump
+ from wxo_agentic_evaluation.utils.utils import (
+     SummaryPanel,
+     create_table,
+     safe_divide,
+ )
+
+
+ def trajectory_generation(task_n, test_case, config, inference_backend, resource_map, llm_user):
+     tc_name = os.path.basename(test_case).replace(".json", "")
+     with open(test_case, "r") as f:
+         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+
+     evaluation_controller = EvaluationController(
+         wxo_inference_backend=inference_backend,
+         llm_user=llm_user,
+         config=config,
+     )
+     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+     (
+         history,
+         call_tracker,
+         conversational_search_data,
+     ) = evaluation_controller.run(
+         task_n,
+         test_case.story,
+         agent_name=test_case.agent,
+         starting_user_input=test_case.starting_sentence,
+     )
+     result = list()
+     for message in history:
+         result.append(message.model_dump())
+
+     json_dump(
+         os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+         result,
+     )
+
+     if len(conversational_search_data) > 0:
+         fn = tc_name + ".retrieval_context.json"
+         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+         out_folder.mkdir(exist_ok=True)
+         rc = [context.model_dump() for context in conversational_search_data]
+         json_dump(out_folder / fn, rc)
+
+
+ def evaluation():
+
+
+
+ def process_test_case(
+     task_n, test_case, config, inference_backend, resource_map, llm_user
+ ):
+     summary_results_for_path = []
+     tc_name = os.path.basename(test_case).replace(".json", "")
+     with open(test_case, "r") as f:
+         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+
+     evaluation_controller = EvaluationController(
+         wxo_inference_backend=inference_backend,
+         llm_user=llm_user,
+         config=config,
+     )
+     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+     (
+         history,
+         call_tracker,
+         conversational_search_data,
+     ) = evaluation_controller.run(
+         task_n,
+         test_case.story,
+         agent_name=test_case.agent,
+         starting_user_input=test_case.starting_sentence,
+     )
+     result = list()
+     for message in history:
+         result.append(message.model_dump())
+
+     json_dump(
+         os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+         result,
+     )
+
+     if len(conversational_search_data) > 0:
+         fn = tc_name + ".retrieval_context.json"
+         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+         out_folder.mkdir(exist_ok=True)
+         rc = [context.model_dump() for context in conversational_search_data]
+         json_dump(out_folder / fn, rc)
+
+     # If data annotation run, skip summary generation
+     if config.data_annotation_run:
+         return summary_results_for_path  # empty result set, skip summary
+
+     evaluation_package = EvaluationPackage(
+         test_case_name=tc_name,
+         messages=history,
+         ground_truth=test_case,
+         conversational_search_data=conversational_search_data,
+         resource_map=resource_map,
+     )
+     (
+         keyword_semantic_matches,
+         knowledge_base_metrics,
+         messages_with_reason,
+         metrics,
+     ) = evaluation_package.generate_summary()
+     temp = []
+     for message in messages_with_reason:
+         temp.append(message.model_dump())
+     json_dump(
+         os.path.join(
+             config.output_dir, "messages", tc_name + ".messages.analyze.json"
+         ),
+         temp,
+     )
+
+     json_dump(
+         os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
+         metrics.model_dump(),
+     )
+
+     metrics.dataset_name = tc_name
+     metrics.avg_resp_time = (
+         sum(call_tracker.generic) + sum(call_tracker.tool_call)
+     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
+     metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
+
+     summary_results_for_path.append((metrics, knowledge_base_metrics))
+
+     return summary_results_for_path
+
+
+ def main(config: TestConfig):
+     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+     if config.num_workers > 1 and config.enable_manual_user_input:
+         rich.print(
+             "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+         )
+         config.enable_manual_user_input = (
+             False  # disable manual user input for parallel execution
+         )
+         # reason: threads continue to stream messages while waiting for user input, which is not desired
+         # and the manual input prompt is not labelled properly in the UI
+     wxo_client = get_wxo_client(
+         config.auth_config.url,
+         config.auth_config.tenant_name,
+         config.auth_config.token,
+     )
+     resource_map = ResourceMap(wxo_client)
+     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+     llm_user = LLMUser(
+         wai_client=get_provider(
+             config=config.provider_config,
+             model_id=config.llm_user_config.model_id,
+         ),
+         template=LlamaUserTemplateRenderer(
+             config.llm_user_config.prompt_config
+         ),
+         user_response_style=config.llm_user_config.user_response_style,
+     )
+
+     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
+
+     results_list = []
+
+     knowledge_base_output_folder = (
+         Path(config.output_dir) / "knowledge_base_metrics"
+     )
+     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+     detailed_rag_output_file = (
+         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+     )
+     summary_rag_output_file = (
+         Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+     )
+
+     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
+     available_res = set()
+     if config.skip_available_results:
+         available_res = set(
+             [
+                 os.path.basename(f).replace(".messages", "")
+                 for f in glob.glob(
+                     os.path.join(
+                         config.output_dir, "messages", "*.messages.json"
+                     )
+                 )
+             ]
+         )
+
+     test_cases = []
+     for test_path in config.test_paths:
+         if os.path.isdir(test_path):
+             test_path = os.path.join(test_path, "*.json")
+         test_cases.extend(sorted(glob.glob(test_path)))
+
+     futures = []
+     task_n = 0
+     for test_case in test_cases:
+         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
+             continue
+         if config.skip_available_results:
+             if test_case in available_res:
+                 print(
+                     f"Skipping test case {test_case} as results already exist."
+                 )
+                 continue
+
+         future = executor.submit(
+             process_test_case,
+             task_n,
+             test_case,
+             config,
+             inference_backend,
+             resource_map,
+             llm_user,
+         )
+
+         futures.append((test_case, future))
+         task_n += 1
+
+     if futures:
+         with Progress() as progress:
+             task1 = progress.add_task(
+                 f"[purple]Evaluating {len(futures)} tasks...",
+                 total=len(futures),
+             )
+             for test_case, future in futures:
+                 try:
+                     results_list.extend(future.result())
+                 except Exception as e:
+                     rich.print(f"test case {test_case} fails with {e}")
+                     traceback.print_exc()
+                 finally:
+                     progress.update(task1, advance=1)
+
+     tool_call_metrics = [metric[0] for metric in results_list]
+     knowledge_base_metrics = [metric[1] for metric in results_list]
+
+     rag_metric_summary = KnowledgeBaseMetricSummary(
+         knowledge_base_metrics=knowledge_base_metrics
+     )
+     SummaryPanel(rag_metric_summary).print()
+
+     with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
+         json.dump(
+             rag_metric_summary.model_dump(by_alias=True)["detailed"],
+             f,
+             indent=4,
+         )
+
+     with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
+         json.dump(
+             rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4
+         )
+
+     if len(tool_call_metrics) > 0:
+         # remove the average row if exist
+         tool_call_metrics = [
+             row
+             for row in tool_call_metrics
+             if row.dataset_name != "Summary (Average)"
+         ]
+
+         def filter_display_only_values(
+             tool_call_metric: ToolCallAndRoutingMetrics,
+         ):
+             row = {
+                 "Dataset": tool_call_metric.dataset_name,
+                 "Total Steps": tool_call_metric.total_steps,
+                 "LLM Steps": tool_call_metric.llm_step,
+                 "Total Tool Calls": tool_call_metric.total_tool_calls,
+                 "Tool Call Precision": tool_call_metric.tool_call_precision,
+                 "Tool Call Recall": tool_call_metric.tool_call_recall,
+                 "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
+                 "Text Match": tool_call_metric.text_match,
+                 "Journey Success": tool_call_metric.is_success,
+                 "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
+             }
+             return row
+
+         def create_avg_row(metrics: List[dict]):
+             avg_row = {
+                 "Dataset": "Summary (Average)",
+                 "Total Steps": 0,
+                 "LLM Steps": 0,
+                 "Total Tool Calls": 0,
+                 "Tool Call Precision": 0,
+                 "Tool Call Recall": 0,
+                 "Agent Routing Accuracy": 0,
+                 "Text Match": 0,
+                 "Journey Success": 0,
+                 "Avg Resp Time (sec)": 0,
+             }
+             if metrics:
+                 for row in metrics:
+                     avg_row["Total Steps"] += row["Total Steps"]
+                     avg_row["LLM Steps"] += row["LLM Steps"]
+                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+                     avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+                     avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+                     avg_row["Agent Routing Accuracy"] += row[
+                         "Agent Routing Accuracy"
+                     ]
+                     avg_row["Text Match"] += (
+                         row["Text Match"] == TextMatchType.text_match.value
+                     )
+                     avg_row["Journey Success"] += row["Journey Success"]
+                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+                 avg_row["Total Steps"] = round(
+                     safe_divide(avg_row["Total Steps"], len(metrics)), 2
+                 )
+                 avg_row["LLM Steps"] = round(
+                     safe_divide(avg_row["LLM Steps"], len(metrics)), 2
+                 )
+                 avg_row["Total Tool Calls"] = round(
+                     safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
+                 )
+                 avg_row["Tool Call Precision"] = round(
+                     safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
+                 )
+                 avg_row["Tool Call Recall"] = round(
+                     safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
+                 )
+                 avg_row["Agent Routing Accuracy"] = round(
+                     safe_divide(
+                         avg_row["Agent Routing Accuracy"], len(metrics)
+                     ),
+                     2,
+                 )
+                 avg_row["Text Match"] = round(
+                     safe_divide(
+                         avg_row["Text Match"],
+                         len(
+                             [
+                                 row
+                                 for row in metrics
+                                 if row["Text Match"]
+                                 != TextMatchType.text_match.na
+                             ]
+                         ),
+                     ),
+                     2,
+                 )
+                 avg_row["Journey Success"] = round(
+                     safe_divide(avg_row["Journey Success"], len(metrics)), 2
+                 )
+                 avg_row["Avg Resp Time (sec)"] = round(
+                     safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
+                 )
+             return avg_row
+
+         tool_call_metrics_for_display = []
+         for row in tool_call_metrics:
+             tool_call_metrics_for_display.append(
+                 filter_display_only_values(row)
+             )
+         tool_call_metrics_for_display.append(
+             create_avg_row(tool_call_metrics_for_display)
+         )
+         tool_call_table_for_display = create_table(
+             tool_call_metrics_for_display
+         )
+
+         if tool_call_table_for_display:
+             tool_call_table_for_display.print()
+
+     if len(tool_call_metrics) > 0:
+         tool_call_metrics = [
+             metric.model_dump() for metric in tool_call_metrics
+         ]
+         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
+         header = list(tool_call_metrics[0].keys())
+
+         with open(output_file, "w") as file:
+             csv_writer = csv.writer(file)
+             csv_writer.writerow(header)
+             for entry in tool_call_metrics:
+                 csv_writer.writerow([entry[name] for name in header])
+
+     with open(
+         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+     ) as f:
+         yaml.safe_dump(dataclasses.asdict(config), f)
+
+     print(f"Results saved to {config.output_dir}")
+
+
+ if __name__ == "__main__":
+     main(CLI(TestConfig, as_positional=False))
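
Note on the new entry point: main_v2.main() writes everything under config.output_dir (per-test-case files in messages/, knowledge-base metrics JSON, summary_metrics.csv, and config.yml), mirroring the json_dump and open calls in the hunk above. Purely as a hedged illustration, not code from the package, a small sketch for collecting the per-test-case metrics it emits (the output_dir value is a placeholder):

    # Sketch: gather the <tc_name>.metrics.json files written by main_v2.main() above.
    # Assumes only what the diff shows: metrics land in <output_dir>/messages/.
    import glob
    import json
    import os

    output_dir = "results"  # placeholder for whatever config.output_dir pointed at

    per_case_metrics = []
    for path in sorted(glob.glob(os.path.join(output_dir, "messages", "*.metrics.json"))):
        with open(path, "r", encoding="utf-8") as f:
            per_case_metrics.append(json.load(f))

    # summary_metrics.csv and knowledge_base_summary_metrics.json sit alongside messages/.
    print(f"{len(per_case_metrics)} test cases with metrics under {output_dir}")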
wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -44,3 +44,28 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
            "answer_relevancy": self.answer_relevancy,
            "answer_relevancy_score": self.answer_relevancy_score,
        }
+
+
+ class AnswerDerailment(BaseLLMJudgeMetric):
+     in_scope: str | float
+     statement: str
+     reason: str
+
+     def table(self):
+         return {
+             "statement": ",".join(self.statement),
+             "reason": self.reason,
+         }
+
+
+ class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+     is_safe: str | float
+     statement: str
+     reason: str
+
+     def table(self):
+         return {
+             "statement": ",".join(self.statement),
+             "reason": self.reason,
+             "unsafe_topic_score": str(self.is_safe),
+         }
wxo_agentic_evaluation/otel_support/evaluate_tau.py (new file)
@@ -0,0 +1,67 @@
+ import json
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+ from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
+
+ with open("/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json", "r") as f:
+     test_data = json.load(f)
+
+
+ goal_temp = []
+
+ goals = {}
+ goal_details = []
+
+ i = 0
+ for action in test_data[0]["info"]["task"]["actions"]:
+     goal_temp.append(action["name"] + f"_{i}")
+     goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": {k: str(v) for k,v in action["kwargs"].items()}}
+     goal_details.append(goal_detail)
+
+ if len(goal_temp) == 1:
+     goals[goal_temp[0]] = []
+ else:
+     for i in range(len(goal_temp)-1):
+         goals.update({goal_temp[i]: goal_temp[i+1]})
+
+ gt_data = {
+     "agent": "airline_agent",
+     "goals": goals,
+     "goal_details": goal_details,
+     "story": test_data[0]["info"]["task"]["instruction"],
+     "starting_sentence": "",
+ }
+ print("2")
+ gt_data = EvaluationData.model_validate(gt_data)
+
+ tc_name = "airline_1"
+
+ print(test_data[0]["traj"][0])
+
+ history = []
+ for msg in test_data[0]["traj"]:
+     if msg["role"] == "tool":
+         print(msg["content"])
+         history.append(Message(role=msg["role"], content=json.dumps({"type": "tool_call", "args": json.loads(msg["content"]), "name": msg["name"], "tool_call_id": msg["tool_call_id"]}), type=ContentType.tool_call,
+                                event=EventTypes.message_created))
+     else:
+         history.append(Message(role=msg["role"], content=str(msg["content"]), type=ContentType.text, event=EventTypes.message_created))
+
+ print(f"length of history {history}")
+
+ evaluation_package = EvaluationPackage(
+     test_case_name=tc_name,
+     messages=history,
+     ground_truth=gt_data,
+     conversational_search_data=None,
+     resource_map=None
+ )
+ print("1")
+ (
+     keyword_semantic_matches,
+     knowledge_base_metrics,
+     messages_with_reason,
+     metrics,
+ ) = evaluation_package.generate_summary()
+
+
+ print(metrics)
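
The evaluate_tau.py script above hard-codes a local tau-bench trajectory file and builds its goal chain with module-level loops. As a hedged sketch only (not code from the package), the same goal-chain construction can be expressed as a reusable helper; it assumes just the tau-bench layout the script already reads (entry["info"]["task"]["actions"] with "name" and "kwargs") and, unlike the script, derives the goal-name suffix from the action index:

    # Sketch: parameterized version of the goal-chain construction in evaluate_tau.py.
    # Assumes tau-bench actions shaped like {"name": ..., "kwargs": {...}}.
    def build_goal_chain(actions):
        goal_names = [f"{action['name']}_{idx}" for idx, action in enumerate(actions)]
        goal_details = [
            {
                "type": "tool_call",
                "name": goal_names[idx],
                "tool_name": action["name"],
                "args": {k: str(v) for k, v in action["kwargs"].items()},
            }
            for idx, action in enumerate(actions)
        ]
        if len(goal_names) == 1:
            goals = {goal_names[0]: []}
        else:
            # Each goal points at the goal expected to follow it.
            goals = {goal_names[i]: goal_names[i + 1] for i in range(len(goal_names) - 1)}
        return goals, goal_details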
wxo_agentic_evaluation/otel_support/otel_message_conversion.py (new file)
@@ -0,0 +1,21 @@
+ from typing import Any, Dict, List, Union, Optional
+ from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
+ import json
+
+ # with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
+ #     data = json.load(f)
+ #
+ # otel_traces = data["calls"][-1]["messages"]
+
+
+ def convert_otel_to_message(otel_traces):
+     history = []
+     for row in otel_traces:
+         print(row)
+         content = row["content"]
+         print(row.keys())
+         role = row.get("role", "assistant")
+
+         history.append(Message(role = role, content= content, type=ContentType.text, event=EventTypes.message_created))
+
+     return history
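
convert_otel_to_message() above reads only a content field and an optional role from each trace entry, so exercising it needs nothing more than a list of dicts. A minimal usage sketch with a made-up payload:

    # Sketch: made-up OTel-style messages; only "content" and "role" are consumed above.
    sample_traces = [
        {"role": "user", "content": "What is my baggage allowance?"},
        {"content": "Economy fares include one carry-on bag."},  # role defaults to "assistant"
    ]

    history = convert_otel_to_message(sample_traces)
    print([(message.role, message.content) for message in history])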