ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,158 @@
+import asyncio
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)
+
+from pydantic import BaseModel
+
+Prompt = Union[str, List[Dict[str, Any]]]
+PromptAndSchema = Tuple[
+    Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+]
+SyncGen = Callable[[Prompt], Union[str, Any]]
+BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
+AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
+AsyncBatchGen = Callable[[List[Prompt]], Awaitable[List[Union[str, Any]]]]
+
+T = TypeVar("T")
+
+
+class PromptResult(BaseModel):
+    """
+    Holds the prompt sent and the response (or error).
+    """
+
+    prompt: Prompt
+    response: Optional[Any] = None
+    error: Optional[str] = None
+
+
+class PromptRunner:
+    """
+    Runs a collection of prompts through various generation strategies.
+
+    Attributes:
+        prompts: the list of prompts to run.
+    """
+
+    def __init__(
+        self, prompts: Optional[List[Union[Prompt, PromptAndSchema]]] = None
+    ) -> None:
+        """
+        Args:
+            prompts: initial list of prompts (strings or chat messages).
+        """
+        self.prompts: List[Union[Prompt, PromptAndSchema]] = prompts or []
+
+    def add_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+        """Append a prompt to the runner."""
+        self.prompts.append(prompt)
+
+    def remove_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+        """Remove a prompt (first occurrence)."""
+        self.prompts.remove(prompt)
+
+    def clear_prompts(self) -> None:
+        """Remove all prompts."""
+        self.prompts.clear()
+
+    def get_prompt_and_schema(
+        self, prompt: Union[Prompt, PromptAndSchema]
+    ) -> Tuple[Prompt, Optional[Dict[str, Any]]]:
+        """
+        Extract the prompt and schema from a Prompt object.
+
+        Args:
+            prompt: The prompt to extract from.
+
+        Returns:
+            Tuple of (prompt, schema).
+        """
+        if isinstance(prompt, tuple):
+            return prompt[0], prompt[1]
+        return prompt, None
+
+    def run_all(
+        self,
+        gen_fn: SyncGen,
+        prompt_param_name: str = "prompt",
+        schema_param_name: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[PromptResult]:
+        """
+        Run each prompt through a synchronous single-prompt generator.
+
+        Args:
+            gen_fn: Callable taking one Prompt, returning str or Any.
+            prompt_param_name: Name of the parameter for the prompt.
+            schema_param_name: Name of the parameter for the schema.
+            kwargs: Additional arguments to pass to the function.
+
+        Returns:
+            List of PromptResult.
+        """
+        results: List[PromptResult] = []
+        for p in self.prompts:
+            try:
+                prompt, schema = self.get_prompt_and_schema(p)
+                args = {prompt_param_name: prompt, **kwargs}
+                if schema_param_name and schema:
+                    args[schema_param_name] = schema
+                resp = gen_fn(**args)
+                results.append(PromptResult(prompt=prompt, response=resp))
+            except Exception as e:
+                results.append(PromptResult(prompt=prompt, error=str(e)))
+        return results
+
+    async def run_async(
+        self,
+        async_fn: AsyncGen,
+        max_parallel: int = 10,
+        prompt_param_name: str = "prompt",
+        schema_param_name: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[PromptResult]:
+        """
+        Run each prompt through an async single-prompt generator with concurrency limit.
+        Results are returned in the same order as self.prompts.
+
+        Args:
+            async_fn: Async callable taking one Prompt, returning str or Any.
+            max_parallel: Max concurrent tasks.
+            prompt_param_name: Name of the parameter for the prompt.
+            schema_param_name: Name of the parameter for the schema.
+            kwargs: Additional arguments to pass to the async function.
+
+        Returns:
+            List of PromptResult.
+        """
+        semaphore = asyncio.Semaphore(max_parallel)
+
+        async def _run_one(index: int, p: Prompt) -> Tuple[int, PromptResult]:
+            async with semaphore:
+                try:
+                    prompt, schema = self.get_prompt_and_schema(p)
+                    args = {prompt_param_name: prompt, **kwargs}
+                    if schema_param_name and schema:
+                        args[schema_param_name] = schema
+                    resp = await async_fn(**args)
+                    return index, PromptResult(prompt=prompt, response=resp)
+                except Exception as e:
+                    return index, PromptResult(prompt=prompt, error=str(e))
+
+        tasks = [
+            asyncio.create_task(_run_one(i, p))
+            for i, p in enumerate(self.prompts)
+        ]
+        indexed_results = await asyncio.gather(*tasks)
+        # Sort results to match original order
+        indexed_results.sort(key=lambda x: x[0])
+        return [res for _, res in indexed_results]
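The new PromptRunner above fans a list of prompts (optionally paired with a JSON schema) through a caller-supplied generation function, collecting one PromptResult per prompt and capturing per-item errors instead of raising. A minimal usage sketch, assuming this hunk is the new wxo_agentic_evaluation/referenceless_eval/prompt/runner.py from the file list above; the generator functions are stand-ins, not part of the package:

import asyncio

# Assumed import path, matching the +158-line file in the list above.
from wxo_agentic_evaluation.referenceless_eval.prompt.runner import PromptRunner

# Stand-in generators; a real gen_fn/async_fn would call an LLM provider.
def generate(prompt):
    return f"echo: {prompt}"

async def generate_async(prompt):
    await asyncio.sleep(0.01)  # simulate request latency
    return f"echo: {prompt}"

runner = PromptRunner(prompts=["Hello", [{"role": "user", "content": "Hi"}]])
runner.add_prompt(("Summarize the report", {"type": "object"}))  # (prompt, JSON schema) pair

# Synchronous pass: one PromptResult per prompt; per-item errors are captured, not raised.
for res in runner.run_all(generate):
    print(res.response or res.error)

# Async pass with bounded concurrency; results preserve the original prompt order.
results = asyncio.run(runner.run_async(generate_async, max_parallel=4))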
@@ -0,0 +1,191 @@
+import json
+from typing import Any, List, Mapping, Optional
+
+import rich
+
+from wxo_agentic_evaluation.referenceless_eval.function_calling.consts import (
+    METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
+    METRIC_GENERAL_HALLUCINATION_CHECK,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.pipeline import (
+    ReflectionPipeline,
+)
+from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+    ToolCall,
+    ToolSpec,
+)
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import Message
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
+
+DEFAULT_GENERATION_PARAMS = {
+    "min_new_tokens": 0,
+    "decoding_method": "greedy",
+    "max_new_tokens": 4096,
+}
+
+
+class ReferencelessEvaluation:
+    """
+    Note: static.final_decision: if `True`, all static metrics were valid; if `False`, at least one static metric failed. See the explanation for the reasoning.
+    Note: if static.final_decision == True, check the semantic metrics. Semantic metrics are **not** run if static.final_decision is False.
+    ---
+    Note: for semantic metrics, check the agentic constraints first. If agent-constraints == False, there is no point in checking the others; if True, check the others.
+    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False implies that the LLM should have called some other function/tool first *or* the call is redundant.
+    Note: when parsing the semantic metrics, check the `is_correct` field; if `false`, the LLMaaJ found a mistake in that tool call.
+    """
+
+    def __init__(
+        self,
+        api_spec: List[Mapping[str, Any]],
+        model_id: str,
+        task_n: str,
+        dataset_name: str,
+        runtime_pipeline: bool = True,
+        generation_params=DEFAULT_GENERATION_PARAMS,
+        inference_backend: Optional[WXORuntimeAdapter] = None,
+    ):
+
+        extra_kwargs = {}
+        if inference_backend is not None:
+            wxo_client = getattr(inference_backend, "wxo_client")
+            instance_url = getattr(wxo_client, "service_url", None)
+            token = getattr(wxo_client, "api_key", None)
+            if instance_url:
+                extra_kwargs["instance_url"] = instance_url
+            if token:
+                extra_kwargs["token"] = token
+
+        self.metrics_client = ReferencelessEvaluation.get_metrics_client(
+            model_id=model_id,
+            params=generation_params,
+            referenceless_eval=True,
+            **extra_kwargs,
+        )
+
+        self.pipeline = ReflectionPipeline(
+            metrics_client=self.metrics_client,
+            general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
+            function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
+            parameter_metrics=None,
+            runtime_pipeline=runtime_pipeline,
+        )
+
+        self.task_n = task_n
+        self.dataset_name = dataset_name
+
+        self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
+
+    @staticmethod
+    def get_metrics_client(**kwargs):
+
+        provider_kwargs = get_provider_kwargs(**kwargs)
+
+        return get_provider(
+            **provider_kwargs,
+        )
+
+    @staticmethod
+    def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+        call = {
+            "call": {
+                "id": tool_id,
+                "type": "function",
+                "function": {
+                    "name": tool_call_name,
+                    "arguments": arguments,
+                },
+            },
+            "context": context,
+        }
+
+        return call
+
+    @staticmethod
+    def fmt_msgs_referenceless(
+        messages: List[Message],
+    ) -> List[Mapping[str, Any]]:
+        """Assume that the last item in the `messages` array is the tool call, and the preceding
+        items in the messages array are the context.
+        """
+        examples = []
+        processed_data = [
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
+            for msg in messages
+        ]
+
+        for idx, message in enumerate(processed_data):
+            role = message["role"]
+            content = message["content"]
+            context = processed_data[:idx]
+
+            if role == "assistant" and message["type"] == "tool_call":
+                tool_call_msg = json.loads(content)
+                if tool_call_msg["name"].startswith("transfer_to"):
+                    continue
+
+                call = ReferencelessEvaluation.fmt_tool_call(
+                    tool_id=tool_call_msg.get("id", "1"),
+                    tool_call_name=tool_call_msg["name"],
+                    arguments=json.dumps(tool_call_msg["args"]),
+                    context=context,
+                )
+                examples.append(call)
+
+        return examples
+
+    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+        results = []
+        for example in examples:
+            result = self.pipeline.run_sync(
+                conversation=example["context"],
+                inventory=self.apis_specs,
+                call=example["call"],
+                continue_on_static=False,
+                retries=2,
+            )
+            result_dict = result.model_dump()
+            results.append(result_dict)
+
+        return results
+
+    def run(self, examples: List[Mapping[str, str]], verbose=False):
+        """`examples` should be an array where each element is formatted:
+
+        call = {
+            "call": {
+                "id": tool_call_msg.get("id", "1"),
+                "type": "function",
+                "function": {
+                    "name": tool_call_msg["name"],
+                    "arguments": json.dumps(tool_call_msg["args"]),
+                },
+            },
+            "context": context,
+        }
+        """
+
+        examples = [
+            {
+                "call": ToolCall.model_validate(ex["call"]),
+                "context": ex["context"],
+            }
+            for ex in examples
+        ]
+
+        if verbose:
+            rich.print(
+                f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+            )
+        results = self._run_pipeline(examples)
+
+        return results
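The ReferencelessEvaluation class above judges tool calls without reference answers: fmt_msgs_referenceless turns each assistant tool_call message (skipping transfer_to_* hand-offs) into a {"call", "context"} example, and run() validates the calls and pushes them through the ReflectionPipeline (static checks first, then semantic metrics). A hedged wiring sketch, assuming this hunk is the new wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py; the file names, tool-spec JSON, and judge model id are placeholders, and provider credentials are assumed to be configured in the environment:

import json

# Assumed module path for the +191-line file above.
from wxo_agentic_evaluation.referenceless_eval.referenceless_eval import ReferencelessEvaluation
from wxo_agentic_evaluation.type import Message

# Messages recorded by a prior evaluation run (assumed file name and location).
with open("results/messages/example.messages.json") as f:
    messages = [Message.model_validate(m) for m in json.load(f)]

# OpenAI-style tool specs for the agent under test (assumed file).
with open("tool_specs.json") as f:
    api_spec = json.load(f)

evaluator = ReferencelessEvaluation(
    api_spec=api_spec,
    model_id="meta-llama/llama-3-405b-instruct",  # assumed judge model id
    task_n="1",
    dataset_name="example",
)

# Each assistant tool_call message (except transfer_to_* routing calls) becomes one
# {"call": ..., "context": ...} example, judged without reference answers.
examples = ReferencelessEvaluation.fmt_msgs_referenceless(messages)
results = evaluator.run(examples, verbose=True)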
@@ -1,5 +1,7 @@
 from collections import defaultdict
-
+
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 
 class ResourceMap:
@@ -14,7 +16,7 @@ class ResourceMap:
         if is_saas_url(self.wxo_client.service_url):
             # TO-DO: this is not validated after the v1 prefix change
             # need additional validation
-            tools_path = "v1/orchestrate/tools
+            tools_path = "v1/orchestrate/tools"
             agents_path = "v1/orchestrate/agents"
         else:
             tools_path = "v1/tools/"
@@ -33,6 +35,7 @@ class ResourceMap:
 
         if resp.status_code == 200:
             agents = resp.json()
+            self.all_agent_objs = agents
             for agent in agents:
                 agent_name = agent["name"]
                 tools = [tool_map[id] for id in agent["tools"]]
@@ -44,4 +47,4 @@ class ResourceMap:
 
         agent2tools = dict(agent2tools)
         tools2agents = dict(tools2agents)
-        return agent2tools, tools2agents
+        return agent2tools, tools2agents
@@ -0,0 +1,329 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Tuple
+
+import rich
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import (
+    EvaluationController,
+)
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
+    KnowledgeBaseMetricSummary,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import OrchestrateDataset
+from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.evaluation_discovery import (
+    find_evaluation_subclasses,
+)
+
+
+def _save_data(
+    config: TestConfig,
+    test_case_name: str,
+    run_tag: str,
+    data,
+    file_path: str | None = None,
+    file_suffix: str | None = None,
+) -> None:
+    """
+    Save data to a JSON file.
+
+    Args:
+        config: Test configuration
+        test_case_name: Test case name
+        run_tag: Run tag
+        data: Data to save
+        file_path: Complete file path (optional)
+        file_suffix: File suffix for messages directory (optional)
+    """
+    if file_path:
+        json_dump(str(file_path), data)
+    elif file_suffix:
+        json_dump(
+            os.path.join(
+                config.output_dir,
+                "messages",
+                f"{test_case_name}{run_tag}{file_suffix}",
+            ),
+            data,
+        )
+
+    # Handle conversational search data
+    if (
+        isinstance(data, list)
+        and data
+        and hasattr(data[0], "model_dump")
+        and file_suffix == ".retrieval_context.json"
+    ):
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        retrieval_context = [context.model_dump() for context in data]
+        json_dump(
+            str(out_folder / f"{test_case_name}{run_tag}{file_suffix}"),
+            retrieval_context,
+        )
+
+
+def _process_tool_calls(
+    history: List,
+    evaluation_data: OrchestrateDataset,
+    resource_map: ResourceMap,
+) -> Tuple[List[str], List[str], List[str]]:
+    """
+    Process tool calls from history and evaluation data.
+
+    Args:
+        history: Message history
+        evaluation_data: Evaluation data
+        resource_map: Resource map
+
+    Returns:
+        Tuple of (expected tool calls, actual tool calls, missed tool calls)
+    """
+    expected_tools = [
+        goal_detail.tool_name
+        for goal_detail in evaluation_data.goal_details
+        if getattr(goal_detail, "type", None) == "tool_call"
+    ]
+
+    raw_actual = []
+    for message in history:
+        try:
+            if getattr(message, "type", None) == "tool_call":
+                payload = (
+                    json.loads(message.content)
+                    if isinstance(message.content, str)
+                    else message.content
+                )
+                name = (payload or {}).get("name")
+                if name:
+                    raw_actual.append(str(name).strip())
+        except Exception:
+            pass
+
+    expected_set = set(expected_tools)
+    agent_names = (
+        set(getattr(resource_map, "agent2tools", {}).keys())
+        if resource_map
+        else set()
+    )
+
+    filtered_actual_tool_calls = [
+        name for name in raw_actual if name not in agent_names
+    ]
+    missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+    return expected_tools, filtered_actual_tool_calls, missed_tool_calls
+
+
+def process_test_case(
+    task_n: int,
+    test_case: str,
+    config: TestConfig,
+    runtime_adapter: WXORuntimeAdapter,
+    resource_map: ResourceMap,
+    llm_user: LLMUser,
+    llmaaj_provider: Provider,
+    run_idx: int = 0,
+) -> List[
+    Tuple[
+        ToolCallAndRoutingMetrics, KnowledgeBaseMetricSummary, CustomEvalMetrics
+    ]
+]:
+    """
+    Process a single test case.
+
+    Args:
+        task_n: Task number
+        test_case: Path to the test case file
+        config: Test configuration
+        runtime_adapter: Runtime adapter for the agent under test
+        resource_map: Resource map
+        llm_user: LLM user
+        llmaaj_provider: Provider for custom metrics
+        run_idx: Run index
+
+    Returns:
+        List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+    """
+    summary_results_for_path = []
+    test_case_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
+    with open(test_case, "r") as f:
+        evaluation_data = OrchestrateDataset.model_validate(json.load(f))
+
+    # Set up evaluation controller and run test
+    evaluation_controller = EvaluationController(
+        runtime=runtime_adapter,
+        llm_user=llm_user,
+        config=config,
+    )
+
+    rich.print(
+        f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+    )
+
+    # Run the evaluation
+    history, call_tracker, conversational_search_data, _ = (
+        evaluation_controller.run(
+            task_n,
+            story=evaluation_data.story,
+            agent_name=evaluation_data.agent,
+            starting_user_input=evaluation_data.starting_sentence,
+            max_user_turns=evaluation_data.max_user_turns,
+        )
+    )
+
+    # Save metadata (that contains thread_id)
+    json_dump(
+        os.path.join(
+            config.output_dir,
+            f"{test_case_name}{run_tag}.metadata.json",
+        ),
+        call_tracker.metadata,
+    )
+
+    if config.skip_legacy_evaluation:
+        return summary_results_for_path  # empty result set, skip evaluation
+
+    # Save message history
+    result = [message.model_dump() for message in history]
+    _save_data(
+        config, test_case_name, run_tag, result, file_suffix=".messages.json"
+    )
+
+    # Save conversational search data if available
+    if conversational_search_data:
+        retrieval_context = [
+            context.model_dump() for context in conversational_search_data
+        ]
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        file_path = str(
+            out_folder / f"{test_case_name}{run_tag}.retrieval_context.json"
+        )
+        _save_data(
+            config,
+            test_case_name,
+            run_tag,
+            retrieval_context,
+            file_path=file_path,
+        )
+
+    # If data annotation run, skip summary generation
+    if config.data_annotation_run:
+        return summary_results_for_path  # empty result set, skip summary
+
+    # Load custom extractors and evaluations
+    all_extractors = []
+    all_custom_evals = []
+
+    # Load custom extractors
+    if config.extractors_config.paths is not None:
+        for path in config.extractors_config.paths:
+            extractors = find_evaluation_subclasses(
+                directory=path, base_class_name="Extractor"
+            )
+            for extractor_class in extractors:
+                all_extractors.append(extractor_class())
+
+    # Load custom evaluations
+    if config.custom_metrics_config.paths is not None:
+        for path in config.custom_metrics_config.paths:
+            custom_eval_classes = find_evaluation_subclasses(path)
+            for _class in custom_eval_classes:
+                all_custom_evals.append(_class(llm_client=llmaaj_provider))
+
+    # Create evaluation package and generate summary
+    evaluation_package = EvaluationPackage(
+        test_case_name=test_case_name,
+        messages=history,
+        ground_truth=evaluation_data,
+        conversational_search_data=conversational_search_data,
+        resource_map=resource_map,
+        config=config,
+        custom_evals=all_custom_evals,
+        extractors=all_extractors,
+        similarity_threshold=config.similarity_threshold,
+        enable_fuzzy_matching=config.enable_fuzzy_matching,
+        strict_topological_matching=config.strict_topological_matching,
+    )
+
+    # Generate summary
+    (
+        _keyword_semantic_matches,
+        knowledge_base_metrics,
+        messages_with_reason,
+        metrics,
+        custom_metrics,
+    ) = evaluation_package.generate_summary()
+
+    # Process messages with reason
+    temp = [message.model_dump() for message in messages_with_reason]
+
+    # Process tool calls
+    expected_tools, filtered_actual_tool_calls, missed_tool_calls = (
+        _process_tool_calls(history, evaluation_data, resource_map)
+    )
+
+    # Add meta information
+    temp.append(
+        {
+            "meta": {
+                "expected_tool_calls": expected_tools,
+                "actual_tool_calls": filtered_actual_tool_calls,
+                "missed_tool_calls": missed_tool_calls,
+            }
+        }
+    )
+
+    # Save analysis results
+    _save_data(
+        config,
+        test_case_name,
+        run_tag,
+        temp,
+        file_suffix=".messages.analyze.json",
+    )
+    _save_data(
+        config,
+        test_case_name,
+        run_tag,
+        metrics.model_dump(),
+        file_suffix=".metrics.json",
+    )
+
+    # Update metrics
+    metrics.dataset_name = test_case_name
+
+    # Calculate average response time
+    metrics.avg_resp_time = 0.0
+    if hasattr(call_tracker, "generic") and hasattr(call_tracker, "tool_call"):
+        generic_calls = getattr(call_tracker, "generic", [])
+        tool_calls = getattr(call_tracker, "tool_call", [])
+
+        if generic_calls or tool_calls:
+            total_time = sum(generic_calls) + sum(tool_calls)
+            total_calls = len(generic_calls) + len(tool_calls)
+            if total_calls > 0:
+                metrics.avg_resp_time = round(total_time / total_calls, 2)
+        metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
+
+    # Add results to summary
+    summary_results_for_path.append(
+        (metrics, knowledge_base_metrics, custom_metrics)
+    )
+
+    return summary_results_for_path
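In process_test_case above, _process_tool_calls compares the tool calls observed in the message history against the tool_call goals in the test case, dropping agent hand-offs (names that appear as agents in the ResourceMap) before computing the missed set. A small sketch of that filtering, assuming this hunk is the new wxo_agentic_evaluation/runner.py and using SimpleNamespace stand-ins shaped like the attributes the helper reads:

import json
from types import SimpleNamespace

# Assumed module path for the +329-line file above.
from wxo_agentic_evaluation.runner import _process_tool_calls

history = [
    SimpleNamespace(type="tool_call", content=json.dumps({"name": "hr_agent"})),      # routed to an agent, filtered out
    SimpleNamespace(type="tool_call", content=json.dumps({"name": "get_time_off"})),  # real tool call
]
evaluation_data = SimpleNamespace(
    goal_details=[
        SimpleNamespace(type="tool_call", tool_name="get_time_off"),
        SimpleNamespace(type="tool_call", tool_name="submit_time_off"),
    ]
)
resource_map = SimpleNamespace(agent2tools={"hr_agent": ["get_time_off", "submit_time_off"]})

expected, actual, missed = _process_tool_calls(history, evaluation_data, resource_map)
# expected -> ["get_time_off", "submit_time_off"]
# actual   -> ["get_time_off"]        (agent hand-offs are excluded)
# missed   -> ["submit_time_off"]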