ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/llm_user_v2.py
ADDED
@@ -0,0 +1,114 @@
+from typing import List
+
+from wxo_agentic_evaluation.base_user import BaseUserSimulator
+from wxo_agentic_evaluation.prompt.template_render import UserTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import ContentType, Message
+
+
+class LLMUserV2(BaseUserSimulator):
+    def __init__(
+        self,
+        llm_client: Provider,
+        user_prompt_path: str,
+    ):
+        self.llm_client = llm_client
+        self.user_prompt_path = user_prompt_path
+        self.prompt_template = UserTemplateRenderer(
+            template_path=user_prompt_path
+        )
+
+    def _get_system_prompt(
+        self, user_story: str, user_response_style: List[str] = None
+    ) -> Message:
+        # Get the user system prompt
+        prompt_messages = self.prompt_template.render(
+            user_story=user_story,
+            user_response_style=user_response_style,
+        )
+        return Message(**prompt_messages[0], type=ContentType.text)
+
+    def _get_message_dicts(self, messages: List[Message]) -> List[dict]:
+        # Convert messages to dictionary format for the llm client
+        return [message.model_dump() for message in messages]
+
+    def _filter_conversation_history(
+        self, conversation_history: List[Message]
+    ) -> List[Message]:
+        # Filter out the agent system prompt
+        return [
+            message
+            for message in conversation_history
+            if message.role != "system"
+        ]
+
+    def flip_message_roles(self, messages: List[Message]) -> List[Message]:
+        # We flip the roles of messages in conversation history to basically prompt the
+        # user simulator with the assistant message as the user input message
+        # This helps to get the llm to respond as a natural user with the given story.
+        new_messages = []
+        for message in messages:
+            if message.role == "user":
+                new_messages.append(
+                    Message(
+                        role="assistant",
+                        content=message.content,
+                        type=ContentType.text,
+                    )
+                )
+            else:
+                new_messages.append(
+                    Message(
+                        role="user",
+                        content=message.content,
+                        type=ContentType.text,
+                    )
+                )
+        return new_messages
+
+    def generate_user_input(
+        self,
+        user_story: str,
+        conversation_history: List[Message],
+        user_response_style: List[str] = None,
+        starting_user_input: Message = None,
+        **kwargs,
+    ) -> Message:
+        # Get the user system prompt
+        system_prompt = self._get_system_prompt(user_story, user_response_style)
+
+        conversation_history = self._filter_conversation_history(
+            conversation_history
+        )
+
+        ## Adding dummy message if not provided from the simulation side.
+        if len(conversation_history) == 0:
+            conversation_history.append(
+                Message(
+                    role="assistant",
+                    content="Hi! How can I help you today?",
+                    type=ContentType.text,
+                )
+            )
+
+        conversation_history = self.flip_message_roles(conversation_history)
+
+        # build the conversation history with the system prompt
+        messages = [system_prompt] + conversation_history
+
+        if starting_user_input is not None:
+            # If starting user input is provided, return it as is for the initial turn
+            return starting_user_input
+        else:
+
+            # Get response from LLM for simulation
+            response = self.llm_client.chat(
+                messages=self._get_message_dicts(messages)
+            )
+            response_message = Message(
+                role="user",
+                content=response.choices[0].message.content,
+                type=ContentType.text,
+            )
+
+            return response_message
wxo_agentic_evaluation/main.py
CHANGED
@@ -1,383 +1,152 @@
-import csv
 import dataclasses
-import glob
 import json
 import os
-import
-from
+import pathlib
+from datetime import datetime
 from pathlib import Path
-from typing import List

-import rich
 import yaml
 from jsonargparse import CLI
-from rich.progress import Progress

 from wxo_agentic_evaluation.arg_configs import TestConfig
-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation.inference_backend import (
-    EvaluationController,
-    WXOInferenceBackend,
-    get_wxo_client,
-)
-from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.clients import bootstrap_clients
 from wxo_agentic_evaluation.metrics.metrics import (
-
-
-    ToolCallAndRoutingMetrics,
+    extract_metrics,
+    format_metrics_for_display,
 )
-from wxo_agentic_evaluation.
-
+from wxo_agentic_evaluation.runner import process_test_case
+from wxo_agentic_evaluation.scheduler import (
+    discover_tests,
+    enumerate_jobs,
+    run_jobs,
 )
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.type import EvaluationData
-from wxo_agentic_evaluation.utils import json_dump
 from wxo_agentic_evaluation.utils.utils import (
     SummaryPanel,
     create_table,
-
+    csv_dump,
 )
-
-
-
-
-):
-    summary_results_for_path = []
-    tc_name = os.path.basename(test_case).replace(".json", "")
-    with open(test_case, "r") as f:
-        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
-
-    evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend,
-        llm_user=llm_user,
-        config=config,
-    )
-    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-    (
-        history,
-        call_tracker,
-        conversational_search_data,
-    ) = evaluation_controller.run(
-        task_n,
-        test_case.story,
-        agent_name=test_case.agent,
-        starting_user_input=test_case.starting_sentence,
-    )
-    result = list()
-    for message in history:
-        result.append(message.model_dump())
-
-    json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
-        result,
-    )
-
-    if len(conversational_search_data) > 0:
-        fn = tc_name + ".retrieval_context.json"
-        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
-        out_folder.mkdir(exist_ok=True)
-        rc = [context.model_dump() for context in conversational_search_data]
-        json_dump(out_folder / fn, rc)
-
-    # If data annotation run, skip summary generation
-    if config.data_annotation_run:
-        return summary_results_for_path  # empty result set, skip summary
-
-    evaluation_package = EvaluationPackage(
-        test_case_name=tc_name,
-        messages=history,
-        ground_truth=test_case,
-        conversational_search_data=conversational_search_data,
-        resource_map=resource_map,
-    )
-    (
-        keyword_semantic_matches,
-        knowledge_base_metrics,
-        messages_with_reason,
-        metrics,
-    ) = evaluation_package.generate_summary()
-    temp = []
-    for message in messages_with_reason:
-        temp.append(message.model_dump())
-    json_dump(
-        os.path.join(
-            config.output_dir, "messages", tc_name + ".messages.analyze.json"
-        ),
-        temp,
-    )
-
-    json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
-        metrics.model_dump(),
-    )
-
-    metrics.dataset_name = tc_name
-    metrics.avg_resp_time = (
-        sum(call_tracker.generic) + sum(call_tracker.tool_call)
-    ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
-
-    summary_results_for_path.append((metrics, knowledge_base_metrics))
-
-    return summary_results_for_path
+from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+from wxo_agentic_evaluation.langfuse_evaluation_package import EvaluationRunner, sample_aggregator


 def main(config: TestConfig):
-
-
-
-
+    # setup
+    clients = bootstrap_clients(config)
+    if not getattr(config, "skip_available_results", False):
+        ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config.output_dir = os.path.join(config.output_dir, ts)
+
+    if not config.skip_legacy_evaluation:
+        knowledge_base_output_folder = (
+            Path(config.output_dir) / "knowledge_base_metrics"
         )
-
-
+        knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+        detailed_rag_output_file = (
+            knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
         )
-
-
-
-        config.
-        config.auth_config.tenant_name,
-        config.auth_config.token,
-    )
-    resource_map = ResourceMap(wxo_client)
-    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    llm_user = LLMUser(
-        wai_client=get_provider(
-            config=config.provider_config,
-            model_id=config.llm_user_config.model_id,
-        ),
-        template=LlamaUserTemplateRenderer(
-            config.llm_user_config.prompt_config
-        ),
-        user_response_style=config.llm_user_config.user_response_style,
-    )
-
-    print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
-
-    results_list = []
+        summary_rag_output_file = (
+            Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+        )
+    os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

-
-
+    # discover & schedule tests
+    test_cases = discover_tests(
+        config.test_paths, config.enable_recursive_search
     )
-
-
-
+    jobs = enumerate_jobs(
+        test_cases,
+        config.n_runs,
+        config.skip_available_results,
+        config.output_dir,
     )
-
-
+    results = run_jobs(
+        jobs, config, clients, process_test_case, config.num_workers
     )

-
-
-    if config.skip_available_results:
-        available_res = set(
-            [
-                os.path.basename(f).replace(".messages", "")
-                for f in glob.glob(
-                    os.path.join(
-                        config.output_dir, "messages", "*.messages.json"
-                    )
-                )
-            ]
-        )
-
-    test_cases = []
-    for test_path in config.test_paths:
-        if os.path.isdir(test_path):
-            test_path = os.path.join(test_path, "*.json")
-        test_cases.extend(sorted(glob.glob(test_path)))
-
-    futures = []
-    task_n = 0
-    for test_case in test_cases:
-        if not test_case.endswith(".json") or test_case.endswith("agent.json"):
-            continue
-        if config.skip_available_results:
-            if test_case in available_res:
-                print(
-                    f"Skipping test case {test_case} as results already exist."
-                )
-                continue
-
-        future = executor.submit(
-            process_test_case,
-            task_n,
-            test_case,
-            config,
-            inference_backend,
-            resource_map,
-            llm_user,
-        )
-
-        futures.append((test_case, future))
-        task_n += 1
-
-    if futures:
-        with Progress() as progress:
-            task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...",
-                total=len(futures),
-            )
-            for test_case, future in futures:
-                try:
-                    results_list.extend(future.result())
-                except Exception as e:
-                    rich.print(f"test case {test_case} fails with {e}")
-                    traceback.print_exc()
-                finally:
-                    progress.update(task1, advance=1)
-
-    tool_call_metrics = [metric[0] for metric in results_list]
-    knowledge_base_metrics = [metric[1] for metric in results_list]
+    # extract
+    tool_metrics, kb_summary, custom_metrics = extract_metrics(results)

-
-
-
-
-
-    with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-        json.dump(
-            rag_metric_summary.model_dump(by_alias=True)["detailed"],
-            f,
-            indent=4,
+    if not config.skip_legacy_evaluation:
+        # write results
+        csv_dump(
+            pathlib.Path(config.output_dir) / "summary_metrics.csv",
+            rows=[metric.model_dump() for metric in tool_metrics],
         )
-
-
-
-
+        for file_path, key in [
+            (detailed_rag_output_file, "detailed"),
+            (summary_rag_output_file, "summary"),
+        ]:
+            with open(file_path, "w+", encoding="utf-8") as f:
+                json.dump(kb_summary.model_dump(by_alias=True)[key], f, indent=4)
+
+        # print results
+        SummaryPanel(kb_summary).print()
+        tool_table = create_table(
+            format_metrics_for_display(tool_metrics), title="Agent Metrics"
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
-            "Text Match": tool_call_metric.text_match,
-            "Journey Success": tool_call_metric.is_success,
-            "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
-        }
-        return row
-
-    def create_avg_row(metrics: List[dict]):
-        avg_row = {
-            "Dataset": "Summary (Average)",
-            "Total Steps": 0,
-            "LLM Steps": 0,
-            "Total Tool Calls": 0,
-            "Tool Call Precision": 0,
-            "Tool Call Recall": 0,
-            "Agent Routing Accuracy": 0,
-            "Text Match": 0,
-            "Journey Success": 0,
-            "Avg Resp Time (sec)": 0,
-        }
-        if metrics:
-            for row in metrics:
-                avg_row["Total Steps"] += row["Total Steps"]
-                avg_row["LLM Steps"] += row["LLM Steps"]
-                avg_row["Total Tool Calls"] += row["Total Tool Calls"]
-                avg_row["Tool Call Precision"] += row["Tool Call Precision"]
-                avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                avg_row["Agent Routing Accuracy"] += row[
-                    "Agent Routing Accuracy"
-                ]
-                avg_row["Text Match"] += (
-                    row["Text Match"] == TextMatchType.text_match.value
-                )
-                avg_row["Journey Success"] += row["Journey Success"]
-                avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
-
-            avg_row["Total Steps"] = round(
-                safe_divide(avg_row["Total Steps"], len(metrics)), 2
-            )
-            avg_row["LLM Steps"] = round(
-                safe_divide(avg_row["LLM Steps"], len(metrics)), 2
-            )
-            avg_row["Total Tool Calls"] = round(
-                safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
-            )
-            avg_row["Tool Call Precision"] = round(
-                safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
-            )
-            avg_row["Tool Call Recall"] = round(
-                safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
-            )
-            avg_row["Agent Routing Accuracy"] = round(
-                safe_divide(
-                    avg_row["Agent Routing Accuracy"], len(metrics)
-                ),
-                2,
-            )
-            avg_row["Text Match"] = round(
-                safe_divide(
-                    avg_row["Text Match"],
-                    len(
-                        [
-                            row
-                            for row in metrics
-                            if row["Text Match"]
-                            != TextMatchType.text_match.na
-                        ]
-                    ),
-                ),
-                2,
-            )
-            avg_row["Journey Success"] = round(
-                safe_divide(avg_row["Journey Success"], len(metrics)), 2
-            )
-            avg_row["Avg Resp Time (sec)"] = round(
-                safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
-            )
-        return avg_row
-
-        tool_call_metrics_for_display = []
-        for row in tool_call_metrics:
-            tool_call_metrics_for_display.append(
-                filter_display_only_values(row)
-            )
-        tool_call_metrics_for_display.append(
-            create_avg_row(tool_call_metrics_for_display)
+        if tool_table:
+            tool_table.print()
+        if any(cm.custom_metrics for cm in custom_metrics):
+            rows = []
+            for cm in custom_metrics:
+                row = {"dataset_name": cm.dataset_name}
+                for m in cm.custom_metrics:
+                    row[m.eval_name] = str(
+                        m.value
+                    ) # Convert to string to avoid type issues
+                rows.append(row)
+            custom_metrics_table = create_table(rows, title="Custom Metrics")
+            if custom_metrics_table:
+                custom_metrics_table.print()
+    else:
+        collection_name = os.path.basename(config.output_dir) + "_collection"
+        collection = LangfuseCollection(
+            name=collection_name,
+            description="",
         )
-
-
+        dataset_paths = []
+        session_ids = []
+        for test_case in test_cases:
+            name = os.path.basename(test_case).replace(".json", "")
+            with open(os.path.join(config.output_dir, f"{name}.metadata.json"), "r") as f:
+                metadata = json.load(f)
+            session_id = metadata["thread_id"]
+            dataset_paths.append(test_case)
+            session_ids.append(session_id)
+
+        collection.upload(paths=dataset_paths)
+
+        langfuse_collection = LangfuseCollection(name=collection_name)
+
+        journey_sucess_metric = JourneySuccessMetric()
+        tool_calling = ToolCalling()
+
+        run = EvaluationRunner(
+            evaluation_name=os.path.basename(config.output_dir) + "_evaluation",
+            run_name=os.path.basename(config.output_dir) + "_run",
+            session_ids=session_ids,
+            collection=langfuse_collection,
+            metrics=[journey_sucess_metric, tool_calling],
+            aggregator=sample_aggregator
         )

-
-    tool_call_table_for_display.print()
-
-    if len(tool_call_metrics) > 0:
-        tool_call_metrics = [
-            metric.model_dump() for metric in tool_call_metrics
-        ]
-        output_file = os.path.join(config.output_dir, "summary_metrics.csv")
-        header = list(tool_call_metrics[0].keys())
-
-        with open(output_file, "w") as file:
-            csv_writer = csv.writer(file)
-            csv_writer.writerow(header)
-            for entry in tool_call_metrics:
-                csv_writer.writerow([entry[name] for name in header])
+        run.evaluate()

+    # persist config
     with open(
-
+        pathlib.Path(config.output_dir) / "config.yml", "w", encoding="utf-8"
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-
+    if not config.skip_legacy_evaluation:
+        print(f"Results saved to {config.output_dir}")
+    else:
+        print(f"Config and metadata saved to {config.output_dir}")
+        print(f"Langfuse Evaluation run completed for collection {collection_name}:")
+        for session_id in session_ids:
+            print(f" - http://localhost:3010/project/orchestrate-lite/sessions/{session_id}")


 if __name__ == "__main__":
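With this change, main() delegates its orchestration to the new modules: clients.bootstrap_clients wires the runtime and LLM clients, scheduler discovers test cases and enumerates and executes jobs around runner.process_test_case, and metrics.extract_metrics splits the results into tool, knowledge-base, and custom metrics before either the legacy tables or the Langfuse run is produced. A rough sketch of that pipeline driven programmatically follows; the TestConfig construction is a placeholder (its full field list is not part of this diff), and only functions named in the hunks above are used.

from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.clients import bootstrap_clients
from wxo_agentic_evaluation.metrics.metrics import extract_metrics, format_metrics_for_display
from wxo_agentic_evaluation.runner import process_test_case
from wxo_agentic_evaluation.scheduler import discover_tests, enumerate_jobs, run_jobs
from wxo_agentic_evaluation.utils.utils import create_table

# Placeholder config: the field names come from the diff above, but TestConfig
# likely requires more fields (auth, provider, user simulator) than shown here.
config = TestConfig(test_paths=["tests/hr_agent"], output_dir="results")

clients = bootstrap_clients(config)  # replaces the old get_wxo_client / LLMUser wiring
test_cases = discover_tests(config.test_paths, config.enable_recursive_search)
jobs = enumerate_jobs(test_cases, config.n_runs, config.skip_available_results, config.output_dir)
results = run_jobs(jobs, config, clients, process_test_case, config.num_workers)

# Split results into tool-call metrics, knowledge-base summary, and custom metrics.
tool_metrics, kb_summary, custom_metrics = extract_metrics(results)
table = create_table(format_metrics_for_display(tool_metrics), title="Agent Metrics")
if table:
    table.print()
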
wxo_agentic_evaluation/metrics/__init__.py
CHANGED
@@ -0,0 +1,15 @@
+from wxo_agentic_evaluation.metrics.evaluations import Evaluation
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+def argument_matching(expected, actual):
+    if actual is None:
+        return False
+    for field in actual:
+        if field not in expected:
+            return False
+
+    return True
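As written, argument_matching only checks that every field name present in the actual tool-call arguments also appears in the expected arguments, and it rejects a missing argument dict; it does not compare argument values. A small illustration, assuming the helper is importable from the metrics package as shown above:

from wxo_agentic_evaluation.metrics import argument_matching

expected = {"employee_id": "E123", "start_date": "2024-07-01"}

print(argument_matching(expected, {"employee_id": "E123"}))             # True: field names are a subset
print(argument_matching(expected, {"employee_id": "E999"}))             # True: values are not compared
print(argument_matching(expected, {"employee_id": "E123", "note": 1}))  # False: "note" is not expected
print(argument_matching(expected, None))                                # False: no arguments captured
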
wxo_agentic_evaluation/metrics/dummy_metric.py
ADDED
@@ -0,0 +1,16 @@
+from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+from wxo_agentic_evaluation.metrics import Evaluation
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+class DummyMetric(Evaluation):
+    def __init__(self, llm_client = None):
+        super().__init__(llm_client)
+
+    def evaluate(self, messages, ground_truth, extracted_context, metadata = ..., **kwargs):
+        return LangfuseMetric(
+            eval_name="dummy_metric",
+            value=True,
+            metadata=metadata,
+            data_type="BOOLEAN",
+        )