ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py:

```diff
@@ -390,18 +390,18 @@ class PipelineResult(BaseModel):
         )
 
     @model_validator(mode="after")
-    def compute_overall(
+    def compute_overall(self) -> Self:
         """
         After validation, compute overall_valid as AND of:
           • all semantic is_correct flags
           • if transform exists: all execution_success flags
         """
-        static: StaticResult =
+        static: StaticResult = self.static
         if static:
            # static checks
            ok = static.final_decision
 
-        sem: SemanticResult =
+        sem: SemanticResult = self.semantic
        if sem:
            # semantic checks
            if sem.general and sem.general.metrics:
```
```diff
@@ -441,11 +441,11 @@ class PipelineResult(BaseModel):
            if param_avgs:
                cat_avgs.append(sum(param_avgs) / len(param_avgs))
 
-
+        self.overall_avg_score = (
            sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
        )
-
-        return
+        self.overall_valid = ok
+        return self
 
 
    # ----------------------------------------------------------------------
```
```diff
@@ -531,17 +531,17 @@ class ToolFunctionCall(BaseModel):
        )
 
    @model_validator(mode="after")
-    def _parse_arguments(
+    def _parse_arguments(self) -> Self:
        """
        After model construction, parse the `arguments` JSON string
        into `parsed_arguments`, or raise a ValidationError.
        """
        try:
-            raw =
-
+            raw = self.arguments
+            self.parsed_arguments = json.loads(raw)
        except json.JSONDecodeError as e:
            raise ValidationError(f"Invalid JSON in arguments: {e}") from e
-        return
+        return self
 
 
class ToolCall(BaseModel):
```
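The validator changes above follow the Pydantic v2 contract for `@model_validator(mode="after")`: the validator runs on the already-constructed instance and must return it. A minimal standalone sketch of that pattern (a toy model, not the package's `PipelineResult` or `ToolFunctionCall`):

```python
from typing import Optional

from pydantic import BaseModel, model_validator
from typing_extensions import Self  # `from typing import Self` on Python 3.11+


class Rectangle(BaseModel):
    width: float
    height: float
    area: Optional[float] = None

    @model_validator(mode="after")
    def compute_area(self) -> Self:
        # "after" validators receive the validated instance; mutate it and return self.
        self.area = self.width * self.height
        return self


print(Rectangle(width=2, height=3).area)  # 6.0
```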
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py:

```diff
@@ -1,5 +1,5 @@
 import json
-from typing import Any, List, Mapping
+from typing import Any, List, Mapping, Optional
 
 import rich
 
```
```diff
@@ -14,8 +14,20 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
    ToolCall,
    ToolSpec,
)
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import Message
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
+
+DEFAULT_GENERATION_PARAMS = {
+    "min_new_tokens": 0,
+    "decoding_method": "greedy",
+    "max_new_tokens": 4096,
+}
 
 
 class ReferencelessEvaluation:
```
```diff
@@ -31,20 +43,29 @@ class ReferencelessEvaluation:
    def __init__(
        self,
        api_spec: List[Mapping[str, Any]],
-        messages: List[Message],
        model_id: str,
        task_n: str,
        dataset_name: str,
+        runtime_pipeline: bool = True,
+        generation_params=DEFAULT_GENERATION_PARAMS,
+        inference_backend: Optional[WXORuntimeAdapter] = None,
    ):
 
-
+        extra_kwargs = {}
+        if inference_backend is not None:
+            wxo_client = getattr(inference_backend, "wxo_client")
+            instance_url = getattr(wxo_client, "service_url", None)
+            token = getattr(wxo_client, "api_key", None)
+            if instance_url:
+                extra_kwargs["instance_url"] = instance_url
+            if token:
+                extra_kwargs["token"] = token
+
+        self.metrics_client = ReferencelessEvaluation.get_metrics_client(
            model_id=model_id,
-            params=
-                "min_new_tokens": 0,
-                "decoding_method": "greedy",
-                "max_new_tokens": 4096,
-            },
+            params=generation_params,
            referenceless_eval=True,
+            **extra_kwargs,
        )
 
        self.pipeline = ReflectionPipeline(
```
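The reworked constructor pulls the Orchestrate instance URL and API key off the optional `inference_backend` with `getattr`, so callers that omit it are unaffected. A minimal sketch of that extraction on toy objects (the `SimpleNamespace` stand-ins and placeholder values are illustrative, not the framework's `WXORuntimeAdapter`):

```python
from types import SimpleNamespace


def build_extra_kwargs(inference_backend=None):
    # Mirrors the getattr-based extraction in the __init__ hunk above.
    extra_kwargs = {}
    if inference_backend is not None:
        wxo_client = getattr(inference_backend, "wxo_client")
        instance_url = getattr(wxo_client, "service_url", None)
        token = getattr(wxo_client, "api_key", None)
        if instance_url:
            extra_kwargs["instance_url"] = instance_url
        if token:
            extra_kwargs["token"] = token
    return extra_kwargs


backend = SimpleNamespace(
    wxo_client=SimpleNamespace(
        service_url="https://example.test/orchestrate",  # placeholder values
        api_key="dummy-token",
    )
)
print(build_extra_kwargs(backend))  # {'instance_url': 'https://example.test/orchestrate', 'token': 'dummy-token'}
print(build_extra_kwargs())         # {}
```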
```diff
@@ -52,39 +73,54 @@ class ReferencelessEvaluation:
            general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
            function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
            parameter_metrics=None,
+            runtime_pipeline=runtime_pipeline,
        )
 
        self.task_n = task_n
        self.dataset_name = dataset_name
 
        self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
-        self.messages = messages
 
-
-
-        for example in examples:
-            result = self.pipeline.run_sync(
-                conversation=example["context"],
-                inventory=self.apis_specs,
-                call=example["call"],
-                continue_on_static=False,
-                retries=2,
-            )
-            result_dict = result.model_dump()
-            results.append(result_dict)
+    @staticmethod
+    def get_metrics_client(**kwargs):
 
-
+        provider_kwargs = get_provider_kwargs(**kwargs)
 
-
-
+        return get_provider(
+            **provider_kwargs,
+        )
+
+    @staticmethod
+    def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+        call = {
+            "call": {
+                "id": tool_id,
+                "type": "function",
+                "function": {
+                    "name": tool_call_name,
+                    "arguments": arguments,
+                },
+            },
+            "context": context,
+        }
+
+        return call
 
+    @staticmethod
+    def fmt_msgs_referenceless(
+        messages: List[Message],
+    ) -> List[Mapping[str, Any]]:
+        """Assume that the last item in the `messages` array is the tool call, and preceding items
+        in the messages array is the context.
+        """
+        examples = []
        processed_data = [
            {
                k: msg.model_dump().get(k)
                for k in ["role", "content", "type"]
                if k in msg.model_dump()
            }
-            for msg in
+            for msg in messages
        ]
 
        for idx, message in enumerate(processed_data):
```
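The new `fmt_tool_call` / `fmt_msgs_referenceless` helpers centralize the example format the pipeline consumes. A small illustration of the payload shape `fmt_tool_call` returns, re-implemented inline so it runs standalone (the tool name and arguments are made up):

```python
import json


def fmt_tool_call(tool_id, tool_call_name, arguments, context):
    # Same structure as the static method added above.
    return {
        "call": {
            "id": tool_id,
            "type": "function",
            "function": {"name": tool_call_name, "arguments": arguments},
        },
        "context": context,
    }


example = fmt_tool_call(
    tool_id="1",
    tool_call_name="get_weather",  # hypothetical tool
    arguments=json.dumps({"city": "Austin"}),
    context=[{"role": "user", "content": "What's the weather in Austin?"}],
)
print(example["call"]["function"]["name"])  # get_weather
```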
```diff
@@ -97,22 +133,47 @@ class ReferencelessEvaluation:
            if tool_call_msg["name"].startswith("transfer_to"):
                continue
 
-            call =
-            "
-
-
-
-
-                    "arguments": json.dumps(tool_call_msg["args"]),
-                },
-            },
-            "context": context,
-            }
+            call = ReferencelessEvaluation.fmt_tool_call(
+                tool_id=tool_call_msg.get("id", "1"),
+                tool_call_name=tool_call_msg["name"],
+                arguments=json.dumps(tool_call_msg["args"]),
+                context=context,
+            )
            examples.append(call)
 
-
-
-
+        return examples
+
+    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+        results = []
+        for example in examples:
+            result = self.pipeline.run_sync(
+                conversation=example["context"],
+                inventory=self.apis_specs,
+                call=example["call"],
+                continue_on_static=False,
+                retries=2,
+            )
+            result_dict = result.model_dump()
+            results.append(result_dict)
+
+        return results
+
+    def run(self, examples: List[Mapping[str, str]], verbose=False):
+        """`examples` should be an array where each element is formatted:
+
+        call = {
+            "call": {
+                "id": tool_call_msg.get("id", "1"),
+                "type": "function",
+                "function": {
+                    "name": tool_call_msg["name"],
+                    "arguments": json.dumps(tool_call_msg["args"]),
+                },
+            },
+            "context": context,
+        }
+        """
+
        examples = [
            {
                "call": ToolCall.model_validate(ex["call"]),
```
```diff
@@ -120,6 +181,11 @@ class ReferencelessEvaluation:
            }
            for ex in examples
        ]
+
+        if verbose:
+            rich.print(
+                f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+            )
        results = self._run_pipeline(examples)
 
        return results
```
wxo_agentic_evaluation/resource_map.py:

```diff
@@ -1,6 +1,7 @@
 from collections import defaultdict
 
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 
 class ResourceMap:
```
```diff
@@ -34,6 +35,7 @@ class ResourceMap:
 
        if resp.status_code == 200:
            agents = resp.json()
+            self.all_agent_objs = agents
            for agent in agents:
                agent_name = agent["name"]
                tools = [tool_map[id] for id in agent["tools"]]
```
wxo_agentic_evaluation/runner.py (new file):

```diff
@@ -0,0 +1,329 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Tuple
+
+import rich
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import (
+    EvaluationController,
+)
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
+    KnowledgeBaseMetricSummary,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import OrchestrateDataset
+from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.evaluation_discovery import (
+    find_evaluation_subclasses,
+)
+
+
+def _save_data(
+    config: TestConfig,
+    test_case_name: str,
+    run_tag: str,
+    data,
+    file_path: str | None = None,
+    file_suffix: str | None = None,
+) -> None:
+    """
+    Save data to a JSON file.
+
+    Args:
+        config: Test configuration
+        test_case_name: Test case name
+        run_tag: Run tag
+        data: Data to save
+        file_path: Complete file path (optional)
+        file_suffix: File suffix for messages directory (optional)
+    """
+    if file_path:
+        json_dump(str(file_path), data)
+    elif file_suffix:
+        json_dump(
+            os.path.join(
+                config.output_dir,
+                "messages",
+                f"{test_case_name}{run_tag}{file_suffix}",
+            ),
+            data,
+        )
+
+    # Handle conversational search data
+    if (
+        isinstance(data, list)
+        and data
+        and hasattr(data[0], "model_dump")
+        and file_suffix == ".retrieval_context.json"
+    ):
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        retrieval_context = [context.model_dump() for context in data]
+        json_dump(
+            str(out_folder / f"{test_case_name}{run_tag}{file_suffix}"),
+            retrieval_context,
+        )
+
+
+def _process_tool_calls(
+    history: List,
+    evaluation_data: OrchestrateDataset,
+    resource_map: ResourceMap,
+) -> Tuple[List[str], List[str], List[str]]:
+    """
+    Process tool calls from history and evaluation data.
+
+    Args:
+        history: Message history
+        evaluation_data: evaluation data
+        resource_map: Resource map
+
+    Returns:
+        Tuple of (expected tool calls, actual tool calls, missed tool calls)
+    """
+    expected_tools = [
+        goal_detail.tool_name
+        for goal_detail in evaluation_data.goal_details
+        if getattr(goal_detail, "type", None) == "tool_call"
+    ]
+
+    raw_actual = []
+    for message in history:
+        try:
+            if getattr(message, "type", None) == "tool_call":
+                payload = (
+                    json.loads(message.content)
+                    if isinstance(message.content, str)
+                    else message.content
+                )
+                name = (payload or {}).get("name")
+                if name:
+                    raw_actual.append(str(name).strip())
+        except Exception:
+            pass
+
+    expected_set = set(expected_tools)
+    agent_names = (
+        set(getattr(resource_map, "agent2tools", {}).keys())
+        if resource_map
+        else set()
+    )
+
+    filtered_actual_tool_calls = [
+        name for name in raw_actual if name not in agent_names
+    ]
+    missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+    return expected_tools, filtered_actual_tool_calls, missed_tool_calls
+
+
+def process_test_case(
+    task_n: int,
+    test_case: str,
+    config: TestConfig,
+    runtime_adapter: WXORuntimeAdapter,
+    resource_map: ResourceMap,
+    llm_user: LLMUser,
+    llmaaj_provider: Provider,
+    run_idx: int = 0,
+) -> List[
+    Tuple[
+        ToolCallAndRoutingMetrics, KnowledgeBaseMetricSummary, CustomEvalMetrics
+    ]
+]:
+    """
+    Process a single test case.
+
+    Args:
+        task_n: Task number
+        test_case: Path to the test case file
+        config: Test configuration
+        inference_backend: Inference backend
+        resource_map: Resource map
+        llm_user: LLM user
+        llmaaj_provider: Provider for custom metrics
+        run_idx: Run index
+
+    Returns:
+        List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+    """
+    summary_results_for_path = []
+    test_case_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
+    with open(test_case, "r") as f:
+        evaluation_data = OrchestrateDataset.model_validate(json.load(f))
+
+    # Set up evaluation controller and run test
+    evaluation_controller = EvaluationController(
+        runtime=runtime_adapter,
+        llm_user=llm_user,
+        config=config,
+    )
+
+    rich.print(
+        f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+    )
+
+    # Run the evaluation
+    history, call_tracker, conversational_search_data, _ = (
+        evaluation_controller.run(
+            task_n,
+            story=evaluation_data.story,
+            agent_name=evaluation_data.agent,
+            starting_user_input=evaluation_data.starting_sentence,
+            max_user_turns=evaluation_data.max_user_turns,
+        )
+    )
+
+    # Save metadata (that contains thread_id)
+    json_dump(
+        os.path.join(
+            config.output_dir,
+            f"{test_case_name}{run_tag}.metadata.json",
+        ),
+        call_tracker.metadata,
+    )
+
+    if config.skip_legacy_evaluation:
+        return summary_results_for_path  # empty result set, skip evaluation
+
+    # Save message history
+    result = [message.model_dump() for message in history]
+    _save_data(
+        config, test_case_name, run_tag, result, file_suffix=".messages.json"
+    )
+
+    # Save conversational search data if available
+    if conversational_search_data:
+        retrieval_context = [
+            context.model_dump() for context in conversational_search_data
+        ]
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        file_path = str(
+            out_folder / f"{test_case_name}{run_tag}.retrieval_context.json"
+        )
+        _save_data(
+            config,
+            test_case_name,
+            run_tag,
+            retrieval_context,
+            file_path=file_path,
+        )
+
+    # If data annotation run, skip summary generation
+    if config.data_annotation_run:
+        return summary_results_for_path  # empty result set, skip summary
+
+    # Load custom extractors and evaluations
+    all_extractors = []
+    all_custom_evals = []
+
+    # Load custom extractors
+    if config.extractors_config.paths is not None:
+        for path in config.extrators_config.paths:
+            extractors = find_evaluation_subclasses(
+                directory=path, base_class_name="Extractor"
+            )
+            for extractor_class in extractors:
+                all_extractors.append(extractor_class())
+
+    # Load custom evaluations
+    if config.custom_metrics_config.paths is not None:
+        for path in config.custom_metrics_config.paths:
+            custom_eval_classes = find_evaluation_subclasses(path)
+            for _class in custom_eval_classes:
+                all_custom_evals.append(_class(llm_client=llmaaj_provider))
+
+    # Create evaluation package and generate summary
+    evaluation_package = EvaluationPackage(
+        test_case_name=test_case_name,
+        messages=history,
+        ground_truth=evaluation_data,
+        conversational_search_data=conversational_search_data,
+        resource_map=resource_map,
+        config=config,
+        custom_evals=all_custom_evals,
+        extractors=all_extractors,
+        similarity_threshold=config.similarity_threshold,
+        enable_fuzzy_matching=config.enable_fuzzy_matching,
+        strict_topological_matching=config.strict_topological_matching,
+    )
+
+    # Generate summary
+    (
+        _keyword_semantic_matches,
+        knowledge_base_metrics,
+        messages_with_reason,
+        metrics,
+        custom_metrics,
+    ) = evaluation_package.generate_summary()
+
+    # Process messages with reason
+    temp = [message.model_dump() for message in messages_with_reason]
+
+    # Process tool calls
+    expected_tools, filtered_actual_tool_calls, missed_tool_calls = (
+        _process_tool_calls(history, evaluation_data, resource_map)
+    )
+
+    # Add meta information
+    temp.append(
+        {
+            "meta": {
+                "expected_tool_calls": expected_tools,
+                "actual_tool_calls": filtered_actual_tool_calls,
+                "missed_tool_calls": missed_tool_calls,
+            }
+        }
+    )
+
+    # Save analysis results
+    _save_data(
+        config,
+        test_case_name,
+        run_tag,
+        temp,
+        file_suffix=".messages.analyze.json",
+    )
+    _save_data(
+        config,
+        test_case_name,
+        run_tag,
+        metrics.model_dump(),
+        file_suffix=".metrics.json",
+    )
+
+    # Update metrics
+    metrics.dataset_name = test_case_name
+
+    # Calculate average response time
+    metrics.avg_resp_time = 0.0
+    if hasattr(call_tracker, "generic") and hasattr(call_tracker, "tool_call"):
+        generic_calls = getattr(call_tracker, "generic", [])
+        tool_calls = getattr(call_tracker, "tool_call", [])
+
+        if generic_calls or tool_calls:
+            total_time = sum(generic_calls) + sum(tool_calls)
+            total_calls = len(generic_calls) + len(tool_calls)
+            if total_calls > 0:
+                metrics.avg_resp_time = round(total_time / total_calls, 2)
+    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
+
+    # Add results to summary
+    summary_results_for_path.append(
+        (metrics, knowledge_base_metrics, custom_metrics)
+    )
+
+    return summary_results_for_path
```
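A minimal sketch of the bookkeeping in `_process_tool_calls`, replayed on toy stand-ins (plain `SimpleNamespace` objects instead of the framework's message, dataset, and resource-map types; the agent and tool names are invented): agent routing entries are filtered out of the observed calls, and anything expected but never observed is reported as missed.

```python
import json
from types import SimpleNamespace

# Toy stand-ins for Message / OrchestrateDataset / ResourceMap (shapes assumed for illustration).
history = [
    SimpleNamespace(type="tool_call", content=json.dumps({"name": "hr_agent"})),  # agent routing
    SimpleNamespace(type="tool_call", content=json.dumps({"name": "get_timeoff_schedule"})),
    SimpleNamespace(type="text", content="Here is your schedule."),
]
dataset = SimpleNamespace(
    goal_details=[
        SimpleNamespace(type="tool_call", tool_name="get_timeoff_schedule"),
        SimpleNamespace(type="tool_call", tool_name="request_timeoff"),
    ]
)
resource_map = SimpleNamespace(agent2tools={"hr_agent": ["get_timeoff_schedule", "request_timeoff"]})

expected = [g.tool_name for g in dataset.goal_details if g.type == "tool_call"]
raw_actual = [json.loads(m.content)["name"] for m in history if m.type == "tool_call"]
actual = [name for name in raw_actual if name not in set(resource_map.agent2tools)]
missed = sorted(set(expected) - set(actual))

print(expected)  # ['get_timeoff_schedule', 'request_timeoff']
print(actual)    # ['get_timeoff_schedule']
print(missed)    # ['request_timeoff']
```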
File without changes
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py (new file):

```diff
@@ -0,0 +1,14 @@
+from wxo_agentic_evaluation.type import CallTracker, Message, RuntimeResponse
+from abc import abstractmethod
+
+
+class RuntimeAdapter:
+
+    @abstractmethod
+    def run(
+        self,
+        user_message: Message,
+        context: dict,
+        thread_id=None,
+    ) -> RuntimeResponse:
+        pass
```
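One detail worth noting: `RuntimeAdapter` is a plain class, so the `@abstractmethod` marker documents intent but is not enforced at instantiation time; enforcement requires inheriting from `abc.ABC`. A toy sketch of the difference, unrelated to the package's own types:

```python
from abc import ABC, abstractmethod


class LooseBase:  # like RuntimeAdapter above: plain class, marker only
    @abstractmethod
    def run(self):
        ...


class StrictBase(ABC):  # ABCMeta blocks instantiation until run() is overridden
    @abstractmethod
    def run(self):
        ...


LooseBase()  # allowed: no metaclass enforcement
try:
    StrictBase()
except TypeError as exc:
    print(exc)  # can't instantiate abstract class StrictBase (exact wording varies by Python version)
```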