ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py
CHANGED
@@ -1,270 +1,152 @@
-
-
-
-
-from
-
-
-
-
-from typing import List
-from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
-from wxo_agentic_evaluation.type import EvaluationData
+import dataclasses
+import json
+import os
+import pathlib
+from datetime import datetime
+from pathlib import Path
+
+import yaml
+from jsonargparse import CLI

 from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.clients import bootstrap_clients
+from wxo_agentic_evaluation.metrics.metrics import (
+    extract_metrics,
+    format_metrics_for_display,
+)
+from wxo_agentic_evaluation.runner import process_test_case
+from wxo_agentic_evaluation.scheduler import (
+    discover_tests,
+    enumerate_jobs,
+    run_jobs,
+)
 from wxo_agentic_evaluation.utils.utils import (
-    create_table,
     SummaryPanel,
-
+    create_table,
+    csv_dump,
 )
-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation.metrics.
-import
-import
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-import csv
-from rich.progress import Progress
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI
-
+from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+from wxo_agentic_evaluation.langfuse_evaluation_package import EvaluationRunner, sample_aggregator

-def process_test_case(task_n, test_case, config, inference_backend, resource_map, llm_user):
-    summary_results_for_path = []
-    tc_name = os.path.basename(test_case).replace(".json", "")
-    with open(test_case, "r") as f:
-        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))

-
-
-    )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    fn = tc_name + ".retrieval_context.json"
-    out_folder = Path(config.output_dir) / "knowledge_base_metrics"
-    out_folder.mkdir(exist_ok=True)
-    rc = [context.model_dump() for context in conversational_search_data]
-    json_dump(out_folder / fn, rc)
-
-    # If data annotation run, skip summary generation
-    if config.data_annotation_run:
-        return summary_results_for_path  # empty result set, skip summary
-
-    evaluation_package = EvaluationPackage(
-        test_case_name=tc_name,
-        messages=history,
-        ground_truth=test_case,
-        conversational_search_data=conversational_search_data,
-        resource_map=resource_map
-    )
-    (
-        keyword_semantic_matches,
-        knowledge_base_metrics,
-        messages_with_reason,
-        metrics,
-    ) = evaluation_package.generate_summary()
-    temp = []
-    for message in messages_with_reason:
-        temp.append(message.model_dump())
-    json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.analyze.json"),
-        temp,
-    )
+def main(config: TestConfig):
+    # setup
+    clients = bootstrap_clients(config)
+    if not getattr(config, "skip_available_results", False):
+        ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config.output_dir = os.path.join(config.output_dir, ts)
+
+    if not config.skip_legacy_evaluation:
+        knowledge_base_output_folder = (
+            Path(config.output_dir) / "knowledge_base_metrics"
+        )
+        knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+        detailed_rag_output_file = (
+            knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+        )
+        summary_rag_output_file = (
+            Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+        )
+        os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

-
-
-
+    # discover & schedule tests
+    test_cases = discover_tests(
+        config.test_paths, config.enable_recursive_search
     )
-
-
-
-
-
-    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
-
-    summary_results_for_path.append((metrics, knowledge_base_metrics))
-
-    return summary_results_for_path
-
-
-def main(config: TestConfig):
-    executor = ThreadPoolExecutor(max_workers=config.num_workers)
-    wxo_client = get_wxo_client(
-        config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+    jobs = enumerate_jobs(
+        test_cases,
+        config.n_runs,
+        config.skip_available_results,
+        config.output_dir,
     )
-
-
-    llm_user = LLMUser(
-        wai_client=get_provider(config=config.provider_config, model_id=config.llm_user_config.model_id),
-        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
-        user_response_style=config.llm_user_config.user_response_style,
+    results = run_jobs(
+        jobs, config, clients, process_test_case, config.num_workers
     )

-
+    # extract
+    tool_metrics, kb_summary, custom_metrics = extract_metrics(results)

-
-
-
-
-
-            knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
-    )
-    summary_rag_output_file = (
-        Path(config.output_dir) / "knowledge_base_summary_metrics.json"
-    )
-
-    os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
-    available_res = set()
-    if config.skip_available_results:
-        available_res = set(
-            [
-                os.path.basename(f).replace(".messages", "")
-                for f in glob.glob(
-                    os.path.join(config.output_dir, "messages", "*.messages.json")
-                )
-            ]
+    if not config.skip_legacy_evaluation:
+        # write results
+        csv_dump(
+            pathlib.Path(config.output_dir) / "summary_metrics.csv",
+            rows=[metric.model_dump() for metric in tool_metrics],
         )
-
-
-
-
-
-
-
-
-
-
-
-
-    if
-
-
-
-
-
-
-
-
-
-
-
-
+        for file_path, key in [
+            (detailed_rag_output_file, "detailed"),
+            (summary_rag_output_file, "summary"),
+        ]:
+            with open(file_path, "w+", encoding="utf-8") as f:
+                json.dump(kb_summary.model_dump(by_alias=True)[key], f, indent=4)
+
+        # print results
+        SummaryPanel(kb_summary).print()
+        tool_table = create_table(
+            format_metrics_for_display(tool_metrics), title="Agent Metrics"
+        )
+        if tool_table:
+            tool_table.print()
+        if any(cm.custom_metrics for cm in custom_metrics):
+            rows = []
+            for cm in custom_metrics:
+                row = {"dataset_name": cm.dataset_name}
+                for m in cm.custom_metrics:
+                    row[m.eval_name] = str(
+                        m.value
+                    ) # Convert to string to avoid type issues
+                rows.append(row)
+            custom_metrics_table = create_table(rows, title="Custom Metrics")
+            if custom_metrics_table:
+                custom_metrics_table.print()
+    else:
+        collection_name = os.path.basename(config.output_dir) + "_collection"
+        collection = LangfuseCollection(
+            name=collection_name,
+            description="",
+        )
+        dataset_paths = []
+        session_ids = []
+        for test_case in test_cases:
+            name = os.path.basename(test_case).replace(".json", "")
+            with open(os.path.join(config.output_dir, f"{name}.metadata.json"), "r") as f:
+                metadata = json.load(f)
+            session_id = metadata["thread_id"]
+            dataset_paths.append(test_case)
+            session_ids.append(session_id)
+
+        collection.upload(paths=dataset_paths)
+
+        langfuse_collection = LangfuseCollection(name=collection_name)
+
+        journey_sucess_metric = JourneySuccessMetric()
+        tool_calling = ToolCalling()
+
+        run = EvaluationRunner(
+            evaluation_name=os.path.basename(config.output_dir) + "_evaluation",
+            run_name=os.path.basename(config.output_dir) + "_run",
+            session_ids=session_ids,
+            collection=langfuse_collection,
+            metrics=[journey_sucess_metric, tool_calling],
+            aggregator=sample_aggregator
         )

-
-            task_n += 1
-
-    if futures:
-        with Progress() as progress:
-            task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...", total=len(futures)
-            )
-            for test_case, future in futures:
-                try:
-                    results_list.extend(future.result())
-                except Exception as e:
-                    rich.print(f"test case {test_case} fails with {e}")
-                    traceback.print_exc()
-                finally:
-                    progress.update(task1, advance=1)
-
-    tool_call_metrics = [metric[0] for metric in results_list]
-    knowledge_base_metrics = [metric[1] for metric in results_list]
-
-    rag_metric_summary = KnowledgeBaseMetricSummary(
-        knowledge_base_metrics=knowledge_base_metrics
-    )
-    SummaryPanel(rag_metric_summary).print()
-
-    with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-        json.dump(rag_metric_summary.model_dump(by_alias=True)["detailed"], f, indent=4)
-
-    with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
-        json.dump(rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4)
-
-    if len(tool_call_metrics) > 0:
-        # remove the average row if exist
-        tool_call_metrics = [
-            row for row in tool_call_metrics if row.dataset_name != "Summary (Average)"
-        ]
-
-    def filter_display_only_values(tool_call_metric: ToolCallAndRoutingMetrics):
-        row = {"Dataset": tool_call_metric.dataset_name, "Total Steps": tool_call_metric.total_steps,
-               "LLM Steps": tool_call_metric.llm_step, "Total Tool Calls":tool_call_metric.total_tool_calls, "Tool Call Precision": tool_call_metric.tool_call_precision, "Tool Call Recall": tool_call_metric.tool_call_recall,
-               "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy, "Text Match": tool_call_metric.text_match, "Journey Success": tool_call_metric.is_success, "Avg Resp Time (sec)": tool_call_metric.avg_resp_time}
-        return row
-
-    def create_avg_row(metrics: List[dict]):
-        avg_row = {"Dataset": "Summary (Average)", "Total Steps": 0,
-                   "LLM Steps": 0, "Total Tool Calls":0, "Tool Call Precision": 0, "Tool Call Recall": 0, "Agent Routing Accuracy": 0,
-                   "Text Match": 0, "Journey Success": 0, "Avg Resp Time (sec)": 0}
-        if metrics:
-            for row in metrics:
-                avg_row["Total Steps"] += row["Total Steps"]
-                avg_row["LLM Steps"] += row["LLM Steps"]
-                avg_row["Total Tool Calls"] += row["Total Tool Calls"]
-                avg_row["Tool Call Precision"] += row["Tool Call Precision"]
-                avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
-                avg_row["Text Match"] += row["Text Match"] == TextMatchType.text_match.value
-                avg_row["Journey Success"] += row["Journey Success"]
-                avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
-
-            avg_row["Total Steps"] = round(safe_divide(avg_row["Total Steps"], len(metrics)), 2)
-            avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], len(metrics)), 2)
-            avg_row["Total Tool Calls"] = round(safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2)
-            avg_row["Tool Call Precision"] = round(safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2)
-            avg_row["Tool Call Recall"] = round(safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2)
-            avg_row["Agent Routing Accuracy"] = round(safe_divide(avg_row["Agent Routing Accuracy"], len(metrics)), 2)
-            avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], len([row for row in metrics if row["Text Match"] != TextMatchType.text_match.na])), 2)
-            avg_row["Journey Success"] = round(safe_divide(avg_row["Journey Success"], len(metrics)), 2)
-            avg_row["Avg Resp Time (sec)"] = round(safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2)
-        return avg_row
-
-    tool_call_metrics_for_display = []
-    for row in tool_call_metrics:
-        tool_call_metrics_for_display.append(filter_display_only_values(row))
-    tool_call_metrics_for_display.append(create_avg_row(tool_call_metrics_for_display))
-    tool_call_table_for_display = create_table(tool_call_metrics_for_display)
-
-    if tool_call_table_for_display:
-        tool_call_table_for_display.print()
-
-    if len(tool_call_metrics) > 0:
-        tool_call_metrics = [metric.model_dump() for metric in tool_call_metrics]
-        output_file = os.path.join(config.output_dir, "summary_metrics.csv")
-        header = list(tool_call_metrics[0].keys())
-
-        with open(output_file, "w") as file:
-            csv_writer = csv.writer(file)
-            csv_writer.writerow(header)
-            for entry in tool_call_metrics:
-                csv_writer.writerow([entry[name] for name in header])
+        run.evaluate()

+    # persist config
     with open(
-
+        pathlib.Path(config.output_dir) / "config.yml", "w", encoding="utf-8"
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-
+    if not config.skip_legacy_evaluation:
+        print(f"Results saved to {config.output_dir}")
+    else:
+        print(f"Config and metadata saved to {config.output_dir}")
+        print(f"Langfuse Evaluation run completed for collection {collection_name}:")
+        for session_id in session_ids:
+            print(f" - http://localhost:3010/project/orchestrate-lite/sessions/{session_id}")


 if __name__ == "__main__":
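The rewritten `main` hands discovery and scheduling to the new `scheduler` module and only branches on `config.skip_legacy_evaluation` for reporting. The body under the `if __name__ == "__main__":` guard falls outside this hunk; since the `from jsonargparse import CLI` import is kept, the entry point is presumably still driven by jsonargparse. A minimal, hypothetical sketch of that wiring (the `DemoConfig` dataclass, `demo_main`, and the `CLI(demo_main)` call are assumptions, not taken from the diff):

```python
# Hedged sketch only: jsonargparse wiring in the style the kept CLI import suggests.
# DemoConfig is a stand-in for TestConfig (real fields live in arg_configs.py).
from dataclasses import dataclass

from jsonargparse import CLI


@dataclass
class DemoConfig:
    output_dir: str = "results"
    num_workers: int = 4
    skip_legacy_evaluation: bool = False


def demo_main(config: DemoConfig) -> None:
    # a real run would call discover_tests / enumerate_jobs / run_jobs here
    print(f"would evaluate with {config.num_workers} workers into {config.output_dir}")


if __name__ == "__main__":
    # e.g. python demo.py --config.output_dir out --config.skip_legacy_evaluation true
    CLI(demo_main)
```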
wxo_agentic_evaluation/metrics/__init__.py
ADDED
@@ -0,0 +1,15 @@
+from wxo_agentic_evaluation.metrics.evaluations import Evaluation
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+def argument_matching(expected, actual):
+    if actual is None:
+        return False
+    for field in actual:
+        if field not in expected:
+            return False
+
+    return True
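`argument_matching` is a keys-only containment check: it returns `False` when no arguments were captured or when the actual call contains a field the ground truth does not expect, and `True` otherwise; argument values are never compared. A quick illustration (assumes the wheel is installed; the argument dictionaries are made up):

```python
from wxo_agentic_evaluation.metrics import argument_matching

expected = {"employee_id": "E123", "date": "2024-01-01"}

print(argument_matching(expected, {"employee_id": "E999"}))             # True: known key, values not compared
print(argument_matching(expected, {"employee_id": "E123", "note": 1}))  # False: "note" is not an expected field
print(argument_matching(expected, None))                                # False: no arguments captured
```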
wxo_agentic_evaluation/metrics/dummy_metric.py
ADDED
@@ -0,0 +1,16 @@
+from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+from wxo_agentic_evaluation.metrics import Evaluation
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+class DummyMetric(Evaluation):
+    def __init__(self, llm_client = None):
+        super().__init__(llm_client)
+
+    def evaluate(self, messages, ground_truth, extracted_context, metadata = ..., **kwargs):
+        return LangfuseMetric(
+            eval_name="dummy_metric",
+            value=True,
+            metadata=metadata,
+            data_type="BOOLEAN",
+        )
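`DummyMetric` shows the minimal surface a Langfuse-scored evaluation needs: accept an optional LLM client and return a `LangfuseMetric` from `evaluate`. A hypothetical variation in the same shape (not part of the package; only the constructor fields visible above are used):

```python
from wxo_agentic_evaluation.metrics import Evaluation
from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric


class AlwaysFailMetric(Evaluation):
    """Illustrative only: flags every conversation as failed."""

    @property
    def name(self) -> str:
        return "always_fail"

    def evaluate(self, messages, ground_truth, extracted_context, metadata=None, **kwargs):
        return LangfuseMetric(
            eval_name=self.name,
            value=False,
            metadata=metadata,
            data_type="BOOLEAN",
        )
```

Unlike `DummyMetric`, this sketch also implements the abstract `name` property declared on `Evaluation`, so it can be instantiated directly.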
wxo_agentic_evaluation/metrics/evaluations.py
ADDED
@@ -0,0 +1,107 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+from wxo_agentic_evaluation.metrics.metrics import Metric
+from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+root_dir: str = os.path.dirname(os.path.dirname(__file__))
+LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+class Evaluation(ABC):
+    """Abstract base class for all evaluations."""
+
+    def __init__(self, llm_client: Optional[Provider] = None) -> None:
+        self._llm_client = llm_client
+
+    @property
+    def llm_client(self) -> Any:
+        """Access client, require it if used."""
+        if self._llm_client is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} requires a client, but none was provided"
+            )
+        return self._llm_client
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: OrchestrateDataset,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        """
+        Evaluation method.
+
+        Args:
+            messages: agent and user conversational messages (includes tool calls)
+            ground_truth: ground truth data
+            extracted_context: dictionary containing data derived from the messages
+
+        Returns:
+            Metic
+        """
+        raise NotImplementedError
+
+
+class LLMaaJEvaluation(Evaluation, ABC):
+    """Evaluation metric for LLMaaJ."""
+
+    @property
+    @abstractmethod
+    def llmaaj_instructions(self) -> str:
+        """LLMaaJ instructions for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_llm_output(self, string: str) -> int | float | bool | str:
+        """Format the output of the LLMaaJ query."""
+        raise NotImplementedError
+
+    @property
+    def selected_context_keys(self) -> set[str]:
+        """Override to implement context keys to pass to the prompt."""
+        return set()
+
+    def select_context(
+        self, extracted_context: Dict[str, Any]
+    ) -> dict[str, Any]:
+        """Additional context to be added to the prompt."""
+        selected_context = {
+            key: value
+            for key, value in extracted_context.items()
+            if key in self.selected_context_keys
+        }
+
+        return selected_context
+
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: OrchestrateDataset,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+        parsed = ParsedMessages(messages=messages)
+        if parsed.user_input is None or parsed.agent_response is None:
+            return None
+        context = str(self.select_context(extracted_context))
+        prompt = renderer.render(
+            user_input=parsed.user_input,
+            agent_answer=parsed.agent_response,
+            llmaaj_instructions=self.llmaaj_instructions,
+            context=context,
+        )
+        score_str = self.llm_client.query(prompt)
+        value = self.format_llm_output(score_str)
+        return Metric(eval_name=self.name, value=value)
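`LLMaaJEvaluation` leaves three hooks to subclasses: `name`, the judge instructions, and `format_llm_output`; `selected_context_keys` optionally narrows which parts of `extracted_context` reach the prompt. A hedged sketch of a judge-style metric built on it (the prompt wording, the `retrieved_passages` key, and the 0/1 parsing rule are illustrative assumptions, not part of the package):

```python
from wxo_agentic_evaluation.metrics.evaluations import LLMaaJEvaluation


class AnswerGroundedness(LLMaaJEvaluation):
    """Illustrative only: asks the judge whether the answer is supported by context."""

    @property
    def name(self) -> str:
        return "answer_groundedness"

    @property
    def llmaaj_instructions(self) -> str:
        return (
            "Reply with 1 if the agent answer is fully supported by the provided "
            "context, otherwise reply with 0. Output the number only."
        )

    @property
    def selected_context_keys(self) -> set[str]:
        return {"retrieved_passages"}  # only this extracted_context entry is rendered

    def format_llm_output(self, string: str) -> bool:
        return string.strip().startswith("1")
```

An instance needs a `Provider` (`AnswerGroundedness(llm_client=provider)`); the inherited `evaluate` then renders `llmaaj_prompt.jinja2` with the selected context, queries the client, and wraps the parsed score in a `Metric`.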
wxo_agentic_evaluation/metrics/journey_success.py
ADDED
@@ -0,0 +1,137 @@
+import json
+from collections import defaultdict
+
+from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+## fix later
+from wxo_agentic_evaluation.otel_parser.parser_types import (
+    Message as OtelMessage,
+)
+
+"""
+- hyphens are not allowed in python function names, so it is safe to use as a dummy function name
+- purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
+a dummy node to the ground truth and the labelled messages to take into account
+single, summary step goals.
+"""
+DUMMY_GRAPH_NODE_NAME = "dummy-goal"
+
+
+class JourneySuccessMetric(Evaluation):
+    def __init__(self, llm_client=None):
+        super().__init__(llm_client)
+        self.is_strict = True
+
+    @property
+    def name(self):
+        return "Journey Success"
+
+    def find_terminal_nodes(self, graph: dict[str, list[str]]) -> set[str]:
+        """Finds terminal nodes (nodes with no outgoing edges).
+
+        Args:
+            graph: the input graph
+
+        Returns:
+            a set of the terminal nodes
+        """
+
+        seen_nodes = set()  # track seen nodes
+        non_terminal_nodes = set()  # track nodes with children
+
+        for node in graph:
+            seen_nodes.add(node)
+            if graph[node]:
+                non_terminal_nodes.add(node)
+            for n in graph[node]:
+                seen_nodes.add(n)
+        return seen_nodes - non_terminal_nodes
+
+    def is_topological_sort(
+        self,
+        graph: dict[str, list[str]],
+        ordering: list[str],
+        is_strict: bool = True,
+    ) -> bool:
+        """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.
+
+        Args:
+            graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+            ordering: the nodes visited, in order
+
+        Returns:
+            Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+        """
+        # No keyword match or goal details were achieved
+        if not ordering:
+            return False
+
+        if is_strict:
+            # strict matching: only consider most recent tool call
+            position = {node: [i] for i, node in enumerate(ordering)}
+        else:
+            # lenient matching: consider all tool calls (account for all indexes of the node)
+            position = defaultdict(list)
+            for i, node in enumerate(ordering):
+                position[node].append(i)
+
+        terminal_nodes = self.find_terminal_nodes(graph)
+        # adds a dummy node for each terminal node
+        next_idx = (
+            max(val for values in position.values() for val in values) + 1
+        )
+
+        for n in terminal_nodes:
+            graph[n] = [DUMMY_GRAPH_NODE_NAME]
+            graph[DUMMY_GRAPH_NODE_NAME] = []
+            position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+            next_idx += 1
+
+        for node in graph:
+            for child_nodes in graph[node]:
+                # Current node/children doesn't show up in made calls
+                if node not in position or child_nodes not in position:
+                    return False
+                # Current node doesn't show up before any of its child
+                # all index in current nodes are larger than every child nodes' index
+                if all(
+                    curr >= max(position[child_nodes])
+                    for curr in position[node]
+                ):
+                    return False
+        return True
+
+    def evaluate(
+        self, messages, ground_truth, extracted_context, metadata, **kwargs
+    ):
+        labeled_messages = extracted_context.get("labeled_messages")
+        correct_tool_calls = []
+
+        for message_idx, matching_goal_details in labeled_messages.items():
+            msg_tool_call = messages[message_idx]
+            msg_tool_call = msg_tool_call.tool_calls[0].function
+            for goal_detail in matching_goal_details:
+                args_match = argument_matching(
+                    expected=goal_detail.args,
+                    actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                )
+
+                if args_match:
+                    correct_tool_calls.append(goal_detail.name)
+
+        is_topological_sort = self.is_topological_sort(
+            graph=ground_truth.goals,
+            ordering=correct_tool_calls,
+            is_strict=self.is_strict,
+        )
+
+        return LangfuseMetric(
+            eval_name=self.name,
+            comment="",
+            value=is_topological_sort,
+            data_type="NUMERIC",
+            metadata=metadata,
+        )
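The journey check reduces to `is_topological_sort`: the ground-truth `goals` graph maps each tool to the tools that must come after it, and the metric passes only when every edge is respected by the order of correctly-argued tool calls. A small illustration (assumes the wheel is installed; the tool names are made up, and a copy of the graph is passed on each call because the method mutates it when appending the dummy terminal node):

```python
from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric

metric = JourneySuccessMetric()
goals = {"get_employee": ["submit_timeoff"], "submit_timeoff": []}

print(metric.is_topological_sort(dict(goals), ["get_employee", "submit_timeoff"]))  # True: dependency order respected
print(metric.is_topological_sort(dict(goals), ["submit_timeoff", "get_employee"]))  # False: called out of order
print(metric.is_topological_sort(dict(goals), ["get_employee"]))                    # False: submit_timeoff never reached
```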