ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
- wxo_agentic_evaluation/analyze_run.py +49 -32
- wxo_agentic_evaluation/arg_configs.py +30 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +20 -4
- wxo_agentic_evaluation/evaluation_package.py +189 -15
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +64 -34
- wxo_agentic_evaluation/llm_matching.py +92 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -1
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/metrics.py +24 -3
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/template_render.py +16 -0
- wxo_agentic_evaluation/quick_eval.py +17 -3
- wxo_agentic_evaluation/record_chat.py +17 -6
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
- wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
- wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
- wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
- wxo_agentic_evaluation/service_instance.py +5 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +14 -4
- wxo_agentic_evaluation/utils/__init__.py +43 -5
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/utils.py +14 -9
- wxo_agentic_evaluation/wxo_client.py +2 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py
CHANGED
@@ -1,3 +1,4 @@
+import copy
 import csv
 import dataclasses
 import glob
@@ -7,6 +8,7 @@ import re
 import traceback
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
 from typing import List
@@ -16,15 +18,16 @@ import yaml
 from jsonargparse import CLI
 from rich.progress import Progress
 
-from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
 from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
 )
-from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.evaluations import Extractor
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KnowledgeBaseMetricSummary,
     TextMatchType,
     ToolCallAndRoutingMetrics,
@@ -33,46 +36,61 @@ from wxo_agentic_evaluation.prompt.template_render import (
     LlamaUserTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import
+from wxo_agentic_evaluation.service_provider import (
+    LOGGING_ENABLED,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import EvaluationData
 from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.evaluation_discovery import (
+    find_evaluation_subclasses,
+)
 from wxo_agentic_evaluation.utils.utils import (
     SummaryPanel,
     create_table,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 
 
 def process_test_case(
-    task_n,
-    test_case,
-    config,
-    inference_backend,
-    resource_map,
-    llm_user,
+    task_n: int,
+    test_case: str,
+    config: TestConfig,
+    inference_backend: WXOInferenceBackend,
+    resource_map: ResourceMap,
+    llm_user: LLMUser,
+    llmaaj_provider: Provider,
     run_idx: int = 0,
 ):
     summary_results_for_path = []
-
-    run_tag = f".run{run_idx+1}" if
+    test_case_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
     with open(test_case, "r") as f:
-
+        evaluation_data = EvaluationData.model_validate(json.load(f))
 
     evaluation_controller = EvaluationController(
         wxo_inference_backend=inference_backend,
         llm_user=llm_user,
         config=config,
     )
-
+
+    rich.print(
+        f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+    )
+
     (
         history,
         call_tracker,
         conversational_search_data,
     ) = evaluation_controller.run(
         task_n,
-
-        agent_name=
-        starting_user_input=
+        evaluation_data.story,
+        agent_name=evaluation_data.agent,
+        starting_user_input=evaluation_data.starting_sentence,
+        max_user_turns=evaluation_data.max_user_turns,
     )
     result = list()
     for message in history:
@@ -80,13 +98,15 @@ def process_test_case(
 
     json_dump(
         os.path.join(
-            config.output_dir,
+            config.output_dir,
+            "messages",
+            f"{test_case_name}{run_tag}.messages.json",
        ),
         result,
     )
 
     if len(conversational_search_data) > 0:
-        fn =
+        fn = f"{test_case_name}{run_tag}.retrieval_context.json"
         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
         out_folder.mkdir(exist_ok=True)
         rc = [context.model_dump() for context in conversational_search_data]
@@ -96,25 +116,51 @@ def process_test_case(
     if config.data_annotation_run:
         return summary_results_for_path  # empty result set, skip summary
 
+    # Handle custom extractions
+    all_extractors = []
+    if config.extrators_config.paths is not None:
+        for path in config.extrators_config.paths:
+            extractors = find_evaluation_subclasses(
+                directory=path, base_class_name="Extractor"
+            )
+            for extractor_class in extractors:
+                extractor: Extractor = extractor_class()
+                all_extractors.append(extractor)
+
+    # Handle custom evaluations
+    all_custom_evals = []
+    if config.custom_metrics_config.paths is not None:
+        for path in config.custom_metrics_config.paths:
+            custom_eval_classes = find_evaluation_subclasses(path)
+            for _class in custom_eval_classes:
+                custom_eval = _class(llm_client=llmaaj_provider)
+                all_custom_evals.append(custom_eval)
+
     evaluation_package = EvaluationPackage(
-        test_case_name=
+        test_case_name=test_case_name,
         messages=history,
-        ground_truth=
+        ground_truth=evaluation_data,
         conversational_search_data=conversational_search_data,
         resource_map=resource_map,
+        config=config,
+        custom_evals=all_custom_evals,
+        extractors=all_extractors,
+        similarity_threshold=config.similarity_threshold,
+        enable_fuzzy_matching=config.enable_fuzzy_matching,
     )
     (
         keyword_semantic_matches,
         knowledge_base_metrics,
         messages_with_reason,
         metrics,
+        custom_metrics,
     ) = evaluation_package.generate_summary()
     temp = []
     for message in messages_with_reason:
         temp.append(message.model_dump())
     expected_tools = [
         gd.tool_name
-        for gd in
+        for gd in evaluation_data.goal_details
         if getattr(gd, "type", None) == "tool_call"
     ]
 
@@ -157,25 +203,29 @@ def process_test_case(
         os.path.join(
             config.output_dir,
             "messages",
-
+            f"{test_case_name}{run_tag}.messages.analyze.json",
        ),
         temp,
     )
 
     json_dump(
         os.path.join(
-            config.output_dir,
+            config.output_dir,
+            "messages",
+            f"{test_case_name}{run_tag}.metrics.json",
        ),
         metrics.model_dump(),
     )
 
-    metrics.dataset_name =
+    metrics.dataset_name = test_case_name
     metrics.avg_resp_time = (
         sum(call_tracker.generic) + sum(call_tracker.tool_call)
     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
     metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
 
-    summary_results_for_path.append(
+    summary_results_for_path.append(
+        (metrics, knowledge_base_metrics, custom_metrics)
+    )
 
     return summary_results_for_path
 
@@ -199,19 +249,49 @@ def main(config: TestConfig):
         config.auth_config.tenant_name,
         config.auth_config.token,
     )
+
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
     llm_user = LLMUser(
-        wai_client=get_provider(
-            config=config.provider_config,
-            model_id=config.llm_user_config.model_id,
-        ),
+        wai_client=get_provider(**provider_kwargs),
         template=LlamaUserTemplateRenderer(
             config.llm_user_config.prompt_config
        ),
        user_response_style=config.llm_user_config.user_response_style,
    )
 
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
 
     results_list = []
@@ -247,7 +327,7 @@ def main(config: TestConfig):
             run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
             available_runs[stem].add(run_num)
 
-    test_cases = []
+    test_cases: list[str] = []
     for test_path in config.test_paths:
         if os.path.isdir(test_path):
             test_path = os.path.join(test_path, "*.json")
@@ -256,9 +336,11 @@ def main(config: TestConfig):
         futures = []
         task_n = 0
         n_runs = getattr(config, "n_runs", 1)
+
        for test_case in test_cases:
             if not test_case.endswith(".json") or test_case.endswith("agent.json"):
                 continue
+
            stem = Path(test_case).stem
 
            for run_idx in range(n_runs):
@@ -272,6 +354,7 @@ def main(config: TestConfig):
                         f"Skipping {stem} run {run_number} as results already exist."
                     )
                     continue
+
                future = executor.submit(
                     process_test_case,
                     task_n,
@@ -280,28 +363,42 @@ def main(config: TestConfig):
                     inference_backend,
                     resource_map,
                     llm_user,
+                    llmaaj_provider,
                     run_idx,  # 👈 pass run index
                 )
                 futures.append(((test_case, run_idx), future))
                 task_n += 1
 
         if futures:
-
-
-
-                total=len(futures),
-            )
+
+            if LOGGING_ENABLED:
+                # No progress bar when logging - just process tasks
                for (test_case, run_idx), future in futures:
                     try:
                         results_list.extend(future.result())
                     except Exception as e:
                         rich.print(f"test case {test_case} fails with {e}")
                         traceback.print_exc()
-
-
+            else:
+                with Progress() as progress:
+                    task1 = progress.add_task(
+                        f"[purple]Evaluating {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    for (test_case, run_idx), future in futures:
+                        try:
+                            results_list.extend(future.result())
+                        except Exception as e:
+                            rich.print(f"test case {test_case} fails with {e}")
+                            traceback.print_exc()
+                        finally:
+                            progress.update(task1, advance=1)
 
     tool_call_metrics = [metric[0] for metric in results_list]
     knowledge_base_metrics = [metric[1] for metric in results_list]
+    custom_metrics: List[CustomEvalMetrics] = [
+        metric[2] for metric in results_list
+    ]
 
     rag_metric_summary = KnowledgeBaseMetricSummary(
         knowledge_base_metrics=knowledge_base_metrics
@@ -502,11 +599,23 @@ def main(config: TestConfig):
     output_file = os.path.join(config.output_dir, "summary_metrics.csv")
     header = list(tool_call_metrics[0].keys())
 
-    with open(output_file, "w") as file:
+    with open(output_file, "w", newline="") as file:
         csv_writer = csv.writer(file)
         csv_writer.writerow(header)
         for entry in tool_call_metrics:
             csv_writer.writerow([entry[name] for name in header])
+    # Check if any custom metrics have been calculated
+    if any([m.custom_metrics for m in custom_metrics]):
+        custom_metrics_display_data = []
+        for metric in custom_metrics:
+            row = {}
+            row["dataset_name"] = metric.dataset_name
+            for metric in metric.custom_metrics:
+                row[metric.eval_name] = metric.value
+            custom_metrics_display_data.append(row)
+        create_table(
+            custom_metrics_display_data, title="Custom Metrics"
+        ).print()
 
     with open(
         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
wxo_agentic_evaluation/metrics/evaluations.py
ADDED
@@ -0,0 +1,124 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+from wxo_agentic_evaluation.metrics.metrics import Metric
+from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+root_dir: str = os.path.dirname(os.path.dirname(__file__))
+LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError
+
+
+class Evaluation(ABC):
+    """Abstract base class for all evaluations."""
+
+    def __init__(self, llm_client: Optional[Provider] = None) -> None:
+        self._llm_client = llm_client
+
+    @property
+    def llm_client(self) -> Any:
+        """Access client, require it if used."""
+        if self._llm_client is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} requires a client, but none was provided"
+            )
+        return self._llm_client
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: EvaluationData,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        """
+        Evaluation method.
+
+        Args:
+            messages: agent and user conversational messages (includes tool calls)
+            ground_truth: ground truth data
+            extracted_context: dictionary containing data derived from the messages
+
+        Returns:
+            Metic
+        """
+        raise NotImplementedError
+
+
+class LLMaaJEvaluation(Evaluation, ABC):
+    """Evaluation metric for LLMaaJ."""
+
+    @property
+    @abstractmethod
+    def llmaaj_instructions(self) -> str:
+        """LLMaaJ instructions for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_llm_output(self, string: str) -> int | float | bool | str:
+        """Format the output of the LLMaaJ query."""
+        raise NotImplementedError
+
+    @property
+    def selected_context_keys(self) -> set[str]:
+        """Override to implement context keys to pass to the prompt."""
+        return set()
+
+    def select_context(
+        self, extracted_context: Dict[str, Any]
+    ) -> dict[str, Any]:
+        """Additional context to be added to the prompt."""
+        selected_context = {
+            key: value
+            for key, value in extracted_context.items()
+            if key in self.selected_context_keys
+        }
+
+        return selected_context
+
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: EvaluationData,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+        parsed = ParsedMessages(messages=messages)
+        if parsed.user_input is None or parsed.agent_response is None:
+            return None
+        context = str(self.select_context(extracted_context))
+        prompt = renderer.render(
+            user_input=parsed.user_input,
+            agent_answer=parsed.agent_response,
+            llmaaj_instructions=self.llmaaj_instructions,
+            context=context,
+        )
+        score_str = self.llm_client.query(prompt)
+        value = self.format_llm_output(score_str)
+        return Metric(eval_name=self.name, value=value)
wxo_agentic_evaluation/metrics/metrics.py
CHANGED
@@ -1,8 +1,9 @@
 import math
-from typing import Any, List, Mapping, Optional, Tuple
 from enum import Enum, StrEnum
+from typing import Any, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
 from wxo_agentic_evaluation.metrics.llm_as_judge import (
     AnswerRelevancy,
@@ -18,11 +19,13 @@ def average(array):
     else:
         return sum(array) / len(array)
 
+
 class DescriptionQuality(StrEnum):
     GOOD = "GOOD"
     BAD = "BAD"
     MISSING = "MISSING"
 
+
 class DescriptionQualityMetric(BaseModel):
     tool_name: str = None
     description_score: float | None = None
@@ -33,9 +36,9 @@ class DescriptionQualityMetric(BaseModel):
     def is_bad_description(self) -> Optional[bool]:
         if self.description_score and self.threshold:
             return self.description_score >= self.threshold
-
+
         return None
-
+
     @computed_field
     @property
     def description_quality(self) -> str:
@@ -46,6 +49,7 @@ class DescriptionQualityMetric(BaseModel):
         else:
             return DescriptionQuality.GOOD
 
+
 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
     knowledge_base_name: str = (
@@ -208,6 +212,7 @@ class Annotation(BaseModel):
     quote: str
     parameter_name: Optional[str]
 
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
@@ -243,3 +248,19 @@ class ReferenceLessEvalMetrics(BaseModel):
     failed_semantic_tool_calls: Optional[
         List[Tuple[int, List[FailedSemanticTestCases]]]
     ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2
ADDED
@@ -0,0 +1,15 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{{llmaaj_instructions}}
+
+<|start_header_id|>user<|end_header_id|>
+
+User question: {{user_input}}
+
+Answer: {{agent_answer}}
+
+Additional Conversationl Context: {{context}}
+
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
wxo_agentic_evaluation/prompt/template_render.py
CHANGED
@@ -173,3 +173,19 @@ class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
         )
+
+
+class LLMaaJTemplateRenderer(JinjaTemplateRenderer):
+    def render(
+        self,
+        user_input: str,
+        agent_answer: str,
+        llmaaj_instructions: str,
+        context: str,
+    ) -> str:
+        return super().render(
+            user_input=user_input,
+            agent_answer=agent_answer,
+            llmaaj_instructions=llmaaj_instructions,
+            context=context,
+        )
wxo_agentic_evaluation/quick_eval.py
CHANGED
@@ -15,7 +15,6 @@ from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
 )
-from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
     FailedSemanticTestCases,
@@ -38,6 +37,7 @@ from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
     ToolExtractionOpenAIFormat,
 )
 from wxo_agentic_evaluation.utils.utils import ReferencelessEvalPanel
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 
 ROOT_DIR = os.path.dirname(__file__)
 MODEL_ID = "meta-llama/llama-3-405b-instruct"
@@ -62,7 +62,7 @@ def process_test_case(
     )
 
     summary, referenceless_metrics = evaluation_controller.generate_summary(
-        task_n, all_tools, messages
+        task_n, all_tools, messages, inference_backend
     )
 
     outfolder = Path(f"{config.output_dir}/quick-eval")
@@ -111,7 +111,11 @@ class QuickEvalController(EvaluationController):
         return messages
 
     def generate_summary(
-        self,
+        self,
+        task_n,
+        tools: List[Mapping[str, Any]],
+        messages: List[Message],
+        inference_backend=None,
     ) -> Tuple[ReferenceLessEvalMetrics, List[ExtendedMessage]]:
         # run reference-less evaluation
         rich.print(f"[b][Task-{task_n}] Starting Quick Evaluation")
@@ -123,6 +127,7 @@ class QuickEvalController(EvaluationController):
             MODEL_ID,
             task_n,
             self.test_case_name,
+            inference_backend=inference_backend,
         )
         referenceless_results = te.run(examples=processed_data)
         rich.print(f"[b][Task-{task_n}] Finished Quick Evaluation")
@@ -307,11 +312,20 @@ def main(config: QuickEvalConfig):
         config.auth_config.tenant_name,
         config.auth_config.token,
     )
+    auth = getattr(config, "auth_config", None)
+    extra_kwargs = {}
+    instance_url = getattr(auth, "url", None) if auth else None
+    token = getattr(auth, "token", None) if auth else None
+    if instance_url:
+        extra_kwargs["instance_url"] = instance_url
+    if token:
+        extra_kwargs["token"] = token
     inference_backend = WXOInferenceBackend(wxo_client)
     llm_user = LLMUser(
         wai_client=get_provider(
             config=config.provider_config,
             model_id=config.llm_user_config.model_id,
+            **extra_kwargs,
        ),
        template=LlamaUserTemplateRenderer(
            config.llm_user_config.prompt_config