ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/wxo_agentic_evaluation/main_v2.py
@@ -0,0 +1,426 @@
+import csv
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import List
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.inference_backend import (
+    EvaluationController,
+    WXOInferenceBackend,
+    get_wxo_client,
+)
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    KnowledgeBaseMetricSummary,
+    TextMatchType,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import EvaluationData
+from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.utils import (
+    SummaryPanel,
+    create_table,
+    safe_divide,
+)
+
+
+def trajectory_generation(task_n, test_case, config, inference_backend, resource_map, llm_user):
+    tc_name = os.path.basename(test_case).replace(".json", "")
+    with open(test_case, "r") as f:
+        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+
+    evaluation_controller = EvaluationController(
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
+    )
+    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+    (
+        history,
+        call_tracker,
+        conversational_search_data,
+    ) = evaluation_controller.run(
+        task_n,
+        test_case.story,
+        agent_name=test_case.agent,
+        starting_user_input=test_case.starting_sentence,
+    )
+    result = list()
+    for message in history:
+        result.append(message.model_dump())
+
+    json_dump(
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
+    )
+
+    if len(conversational_search_data) > 0:
+        fn = tc_name + ".retrieval_context.json"
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        rc = [context.model_dump() for context in conversational_search_data]
+        json_dump(out_folder / fn, rc)
+
+
+def evaluation():
+
+
+
+def process_test_case(
+    task_n, test_case, config, inference_backend, resource_map, llm_user
+):
+    summary_results_for_path = []
+    tc_name = os.path.basename(test_case).replace(".json", "")
+    with open(test_case, "r") as f:
+        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+
+    evaluation_controller = EvaluationController(
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
+    )
+    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+    (
+        history,
+        call_tracker,
+        conversational_search_data,
+    ) = evaluation_controller.run(
+        task_n,
+        test_case.story,
+        agent_name=test_case.agent,
+        starting_user_input=test_case.starting_sentence,
+    )
+    result = list()
+    for message in history:
+        result.append(message.model_dump())
+
+    json_dump(
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
+    )
+
+    if len(conversational_search_data) > 0:
+        fn = tc_name + ".retrieval_context.json"
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        rc = [context.model_dump() for context in conversational_search_data]
+        json_dump(out_folder / fn, rc)
+
+    # If data annotation run, skip summary generation
+    if config.data_annotation_run:
+        return summary_results_for_path  # empty result set, skip summary
+
+    evaluation_package = EvaluationPackage(
+        test_case_name=tc_name,
+        messages=history,
+        ground_truth=test_case,
+        conversational_search_data=conversational_search_data,
+        resource_map=resource_map,
+    )
+    (
+        keyword_semantic_matches,
+        knowledge_base_metrics,
+        messages_with_reason,
+        metrics,
+    ) = evaluation_package.generate_summary()
+    temp = []
+    for message in messages_with_reason:
+        temp.append(message.model_dump())
+    json_dump(
+        os.path.join(
+            config.output_dir, "messages", tc_name + ".messages.analyze.json"
+        ),
+        temp,
+    )
+
+    json_dump(
+        os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
+        metrics.model_dump(),
+    )
+
+    metrics.dataset_name = tc_name
+    metrics.avg_resp_time = (
+        sum(call_tracker.generic) + sum(call_tracker.tool_call)
+    ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
+    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
+
+    summary_results_for_path.append((metrics, knowledge_base_metrics))
+
+    return summary_results_for_path
+
+
+def main(config: TestConfig):
+    executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
+    wxo_client = get_wxo_client(
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
+    )
+    resource_map = ResourceMap(wxo_client)
+    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+    llm_user = LLMUser(
+        wai_client=get_provider(
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
+        user_response_style=config.llm_user_config.user_response_style,
+    )
+
+    print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
+
+    results_list = []
+
+    knowledge_base_output_folder = (
+        Path(config.output_dir) / "knowledge_base_metrics"
+    )
+    knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+    detailed_rag_output_file = (
+        knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+    )
+    summary_rag_output_file = (
+        Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+    )
+
+    os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
+    available_res = set()
+    if config.skip_available_results:
+        available_res = set(
+            [
+                os.path.basename(f).replace(".messages", "")
+                for f in glob.glob(
+                    os.path.join(
+                        config.output_dir, "messages", "*.messages.json"
+                    )
+                )
+            ]
+        )
+
+    test_cases = []
+    for test_path in config.test_paths:
+        if os.path.isdir(test_path):
+            test_path = os.path.join(test_path, "*.json")
+        test_cases.extend(sorted(glob.glob(test_path)))
+
+    futures = []
+    task_n = 0
+    for test_case in test_cases:
+        if not test_case.endswith(".json") or test_case.endswith("agent.json"):
+            continue
+        if config.skip_available_results:
+            if test_case in available_res:
+                print(
+                    f"Skipping test case {test_case} as results already exist."
+                )
+                continue
+
+        future = executor.submit(
+            process_test_case,
+            task_n,
+            test_case,
+            config,
+            inference_backend,
+            resource_map,
+            llm_user,
+        )
+
+        futures.append((test_case, future))
+        task_n += 1
+
+    if futures:
+        with Progress() as progress:
+            task1 = progress.add_task(
+                f"[purple]Evaluating {len(futures)} tasks...",
+                total=len(futures),
+            )
+            for test_case, future in futures:
+                try:
+                    results_list.extend(future.result())
+                except Exception as e:
+                    rich.print(f"test case {test_case} fails with {e}")
+                    traceback.print_exc()
+                finally:
+                    progress.update(task1, advance=1)
+
+    tool_call_metrics = [metric[0] for metric in results_list]
+    knowledge_base_metrics = [metric[1] for metric in results_list]
+
+    rag_metric_summary = KnowledgeBaseMetricSummary(
+        knowledge_base_metrics=knowledge_base_metrics
+    )
+    SummaryPanel(rag_metric_summary).print()
+
+    with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
+        json.dump(
+            rag_metric_summary.model_dump(by_alias=True)["detailed"],
+            f,
+            indent=4,
+        )
+
+    with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
+        json.dump(
+            rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4
+        )
+
+    if len(tool_call_metrics) > 0:
+        # remove the average row if exist
+        tool_call_metrics = [
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
+        ]
+
+        def filter_display_only_values(
+            tool_call_metric: ToolCallAndRoutingMetrics,
+        ):
+            row = {
+                "Dataset": tool_call_metric.dataset_name,
+                "Total Steps": tool_call_metric.total_steps,
+                "LLM Steps": tool_call_metric.llm_step,
+                "Total Tool Calls": tool_call_metric.total_tool_calls,
+                "Tool Call Precision": tool_call_metric.tool_call_precision,
+                "Tool Call Recall": tool_call_metric.tool_call_recall,
+                "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
+                "Text Match": tool_call_metric.text_match,
+                "Journey Success": tool_call_metric.is_success,
+                "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
+            }
+            return row
+
+        def create_avg_row(metrics: List[dict]):
+            avg_row = {
+                "Dataset": "Summary (Average)",
+                "Total Steps": 0,
+                "LLM Steps": 0,
+                "Total Tool Calls": 0,
+                "Tool Call Precision": 0,
+                "Tool Call Recall": 0,
+                "Agent Routing Accuracy": 0,
+                "Text Match": 0,
+                "Journey Success": 0,
+                "Avg Resp Time (sec)": 0,
+            }
+            if metrics:
+                for row in metrics:
+                    avg_row["Total Steps"] += row["Total Steps"]
+                    avg_row["LLM Steps"] += row["LLM Steps"]
+                    avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+                    avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+                    avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+                    avg_row["Agent Routing Accuracy"] += row[
+                        "Agent Routing Accuracy"
+                    ]
+                    avg_row["Text Match"] += (
+                        row["Text Match"] == TextMatchType.text_match.value
+                    )
+                    avg_row["Journey Success"] += row["Journey Success"]
+                    avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+                avg_row["Total Steps"] = round(
+                    safe_divide(avg_row["Total Steps"], len(metrics)), 2
+                )
+                avg_row["LLM Steps"] = round(
+                    safe_divide(avg_row["LLM Steps"], len(metrics)), 2
+                )
+                avg_row["Total Tool Calls"] = round(
+                    safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
+                )
+                avg_row["Tool Call Precision"] = round(
+                    safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
+                )
+                avg_row["Tool Call Recall"] = round(
+                    safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
+                )
+                avg_row["Agent Routing Accuracy"] = round(
+                    safe_divide(
+                        avg_row["Agent Routing Accuracy"], len(metrics)
+                    ),
+                    2,
+                )
+                avg_row["Text Match"] = round(
+                    safe_divide(
+                        avg_row["Text Match"],
+                        len(
+                            [
+                                row
+                                for row in metrics
+                                if row["Text Match"]
+                                != TextMatchType.text_match.na
+                            ]
+                        ),
+                    ),
+                    2,
+                )
+                avg_row["Journey Success"] = round(
+                    safe_divide(avg_row["Journey Success"], len(metrics)), 2
+                )
+                avg_row["Avg Resp Time (sec)"] = round(
+                    safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
+                )
+            return avg_row
+
+        tool_call_metrics_for_display = []
+        for row in tool_call_metrics:
+            tool_call_metrics_for_display.append(
+                filter_display_only_values(row)
+            )
+        tool_call_metrics_for_display.append(
+            create_avg_row(tool_call_metrics_for_display)
+        )
+        tool_call_table_for_display = create_table(
+            tool_call_metrics_for_display
+        )
+
+        if tool_call_table_for_display:
+            tool_call_table_for_display.print()
+
+    if len(tool_call_metrics) > 0:
+        tool_call_metrics = [
+            metric.model_dump() for metric in tool_call_metrics
+        ]
+        output_file = os.path.join(config.output_dir, "summary_metrics.csv")
+        header = list(tool_call_metrics[0].keys())
+
+        with open(output_file, "w") as file:
+            csv_writer = csv.writer(file)
+            csv_writer.writerow(header)
+            for entry in tool_call_metrics:
+                csv_writer.writerow([entry[name] for name in header])
+
+    with open(
+        os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+    ) as f:
+        yaml.safe_dump(dataclasses.asdict(config), f)
+
+    print(f"Results saved to {config.output_dir}")
+
+
+if __name__ == "__main__":
+    main(CLI(TestConfig, as_positional=False))
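The new main_v2.py entry point builds a WXO client and an LLM user from a TestConfig, fans test cases out across a ThreadPoolExecutor, and writes per-test-case messages, metrics JSON, knowledge-base summaries, and a summary_metrics.csv. A minimal sketch of driving it programmatically follows; TestConfig's full schema lives in arg_configs.py (not shown in this hunk), so the constructor fields below are assumptions inferred from the attributes main() reads, and the auth/provider/LLM-user sub-configs would still need to be populated.

# Hypothetical sketch (not part of the package): drive main_v2 programmatically
# instead of via the jsonargparse CLI. TestConfig is defined in
# wxo_agentic_evaluation/arg_configs.py; the field names below are assumed from
# the attributes main() reads (test_paths, output_dir, num_workers, ...).
from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.main_v2 import main

config = TestConfig(
    test_paths=["tests/cases"],    # directory (or glob) of *.json test cases -- assumed field
    output_dir="results/run_001",  # messages/, metrics JSON, summary_metrics.csv land here -- assumed field
    num_workers=4,                 # parallel test-case execution -- assumed field
)
# auth_config (url/tenant_name/token), provider_config, and llm_user_config must
# also be populated before main() can reach the orchestrate instance.
main(config)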
--- a/wxo_agentic_evaluation/metrics/llm_as_judge.py
+++ b/wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -44,3 +44,28 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
             "answer_relevancy": self.answer_relevancy,
             "answer_relevancy_score": self.answer_relevancy_score,
         }
+
+
+class AnswerDerailment(BaseLLMJudgeMetric):
+    in_scope: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": ",".join(self.statement),
+            "reason": self.reason,
+        }
+
+
+class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+    is_safe: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": ",".join(self.statement),
+            "reason": self.reason,
+            "unsafe_topic_score": str(self.is_safe),
+        }
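AnswerDerailment and AnswerUnsafeTopic extend the judge metrics with a score plus statement/reason fields rendered through table(). A hedged sketch of how such a result might be materialized from a judge model's JSON output, assuming BaseLLMJudgeMetric is a pydantic model (the model_validate/model_dump usage elsewhere in this package suggests it is):

# Hedged sketch, not from the package: parse a hypothetical judge response into
# the new metric model, assuming BaseLLMJudgeMetric is a pydantic BaseModel.
import json
from wxo_agentic_evaluation.metrics.llm_as_judge import AnswerUnsafeTopic

judge_response = '{"is_safe": 1.0, "statement": "Stays on approved topics.", "reason": "No unsafe content detected."}'
metric = AnswerUnsafeTopic.model_validate(json.loads(judge_response))
print(metric.table())
# Note: table() applies ",".join(...) to `statement`; for a plain string that
# interleaves commas between characters, so a list of statements may be the
# intended input shape.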
--- /dev/null
+++ b/wxo_agentic_evaluation/otel_support/evaluate_tau.py
@@ -0,0 +1,67 @@
+import json
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
+
+with open("/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json", "r") as f:
+    test_data = json.load(f)
+
+
+goal_temp = []
+
+goals = {}
+goal_details = []
+
+i = 0
+for action in test_data[0]["info"]["task"]["actions"]:
+    goal_temp.append(action["name"] + f"_{i}")
+    goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": {k: str(v) for k,v in action["kwargs"].items()}}
+    goal_details.append(goal_detail)
+
+if len(goal_temp) == 1:
+    goals[goal_temp[0]] = []
+else:
+    for i in range(len(goal_temp)-1):
+        goals.update({goal_temp[i]: goal_temp[i+1]})
+
+gt_data = {
+    "agent": "airline_agent",
+    "goals": goals,
+    "goal_details": goal_details,
+    "story": test_data[0]["info"]["task"]["instruction"],
+    "starting_sentence": "",
+}
+print("2")
+gt_data = EvaluationData.model_validate(gt_data)
+
+tc_name = "airline_1"
+
+print(test_data[0]["traj"][0])
+
+history = []
+for msg in test_data[0]["traj"]:
+    if msg["role"] == "tool":
+        print(msg["content"])
+        history.append(Message(role=msg["role"], content=json.dumps({"type": "tool_call", "args": json.loads(msg["content"]), "name": msg["name"], "tool_call_id": msg["tool_call_id"]}), type=ContentType.tool_call,
+                               event=EventTypes.message_created))
+    else:
+        history.append(Message(role=msg["role"], content=str(msg["content"]), type=ContentType.text, event=EventTypes.message_created))
+
+print(f"length of history {history}")
+
+evaluation_package = EvaluationPackage(
+    test_case_name=tc_name,
+    messages=history,
+    ground_truth=gt_data,
+    conversational_search_data=None,
+    resource_map=None
+)
+print("1")
+(
+    keyword_semantic_matches,
+    knowledge_base_metrics,
+    messages_with_reason,
+    metrics,
+) = evaluation_package.generate_summary()
+
+
+print(metrics)
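evaluate_tau.py converts a recorded tau-bench trajectory into the framework's EvaluationData/Message format and scores it with EvaluationPackage.generate_summary(). The goals dict it builds is a simple chain in which each expected tool call points at the call that should follow it. A toy illustration of that chaining with made-up action names (using an enumerated index for the suffix, where the script itself reuses a counter variable):

# Toy illustration, not part of the package: the goal-chaining pattern built in
# evaluate_tau.py, with hypothetical action names.
actions = [{"name": "search_flights"}, {"name": "book_flight"}, {"name": "send_confirmation"}]

goal_temp = [f'{a["name"]}_{idx}' for idx, a in enumerate(actions)]
if len(goal_temp) == 1:
    goals = {goal_temp[0]: []}  # a single goal has no successor
else:
    goals = {goal_temp[i]: goal_temp[i + 1] for i in range(len(goal_temp) - 1)}

print(goals)
# {'search_flights_0': 'book_flight_1', 'book_flight_1': 'send_confirmation_2'}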
--- /dev/null
+++ b/wxo_agentic_evaluation/otel_support/otel_message_conversion.py
@@ -0,0 +1,21 @@
+from typing import Any, Dict, List, Union, Optional
+from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
+import json
+
+# with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
+#     data = json.load(f)
+#
+# otel_traces = data["calls"][-1]["messages"]
+
+
+def convert_otel_to_message(otel_traces):
+    history = []
+    for row in otel_traces:
+        print(row)
+        content = row["content"]
+        print(row.keys())
+        role = row.get("role", "assistant")
+
+        history.append(Message(role=role, content=content, type=ContentType.text, event=EventTypes.message_created))
+
+    return history