ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/metrics/evaluations.py (new file)

@@ -0,0 +1,107 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+from wxo_agentic_evaluation.metrics.metrics import Metric
+from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+root_dir: str = os.path.dirname(os.path.dirname(__file__))
+LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+class Evaluation(ABC):
+    """Abstract base class for all evaluations."""
+
+    def __init__(self, llm_client: Optional[Provider] = None) -> None:
+        self._llm_client = llm_client
+
+    @property
+    def llm_client(self) -> Any:
+        """Access client, require it if used."""
+        if self._llm_client is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} requires a client, but none was provided"
+            )
+        return self._llm_client
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: OrchestrateDataset,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        """
+        Evaluation method.
+
+        Args:
+            messages: agent and user conversational messages (includes tool calls)
+            ground_truth: ground truth data
+            extracted_context: dictionary containing data derived from the messages
+
+        Returns:
+            Metic
+        """
+        raise NotImplementedError
+
+
+class LLMaaJEvaluation(Evaluation, ABC):
+    """Evaluation metric for LLMaaJ."""
+
+    @property
+    @abstractmethod
+    def llmaaj_instructions(self) -> str:
+        """LLMaaJ instructions for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_llm_output(self, string: str) -> int | float | bool | str:
+        """Format the output of the LLMaaJ query."""
+        raise NotImplementedError
+
+    @property
+    def selected_context_keys(self) -> set[str]:
+        """Override to implement context keys to pass to the prompt."""
+        return set()
+
+    def select_context(
+        self, extracted_context: Dict[str, Any]
+    ) -> dict[str, Any]:
+        """Additional context to be added to the prompt."""
+        selected_context = {
+            key: value
+            for key, value in extracted_context.items()
+            if key in self.selected_context_keys
+        }
+
+        return selected_context
+
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: OrchestrateDataset,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+        parsed = ParsedMessages(messages=messages)
+        if parsed.user_input is None or parsed.agent_response is None:
+            return None
+        context = str(self.select_context(extracted_context))
+        prompt = renderer.render(
+            user_input=parsed.user_input,
+            agent_answer=parsed.agent_response,
+            llmaaj_instructions=self.llmaaj_instructions,
+            context=context,
+        )
+        score_str = self.llm_client.query(prompt)
+        value = self.format_llm_output(score_str)
+        return Metric(eval_name=self.name, value=value)
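The hunk above defines the contract custom metrics plug into: implement `name` and `evaluate`, and optionally accept an LLM client. Below is a minimal sketch of a custom subclass, assuming the 1.1.8b0 wheel is installed and that the hunk is `wxo_agentic_evaluation/metrics/evaluations.py` as the file list indicates; the `TurnCount` class itself is illustrative and not part of the package.

```python
from typing import Any, Dict, Optional

from wxo_agentic_evaluation.metrics import Evaluation
from wxo_agentic_evaluation.metrics.metrics import Metric
from wxo_agentic_evaluation.type import Message, OrchestrateDataset


class TurnCount(Evaluation):
    """Illustrative metric: count the messages exchanged in a run."""

    @property
    def name(self) -> str:
        return "Turn Count"

    def evaluate(
        self,
        messages: list[Message],
        ground_truth: OrchestrateDataset,
        extracted_context: Dict[str, Any],
    ) -> Optional[Metric]:
        # No LLM client is needed, so the default __init__ (llm_client=None) suffices.
        return Metric(eval_name=self.name, value=len(messages))
```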
wxo_agentic_evaluation/metrics/journey_success.py (new file)

@@ -0,0 +1,137 @@
+import json
+from collections import defaultdict
+
+from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+## fix later
+from wxo_agentic_evaluation.otel_parser.parser_types import (
+    Message as OtelMessage,
+)
+
+"""
+- hyphens are not allowed in python function names, so it is safe to use as a dummy function name
+- purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
+  a dummy node to the ground truth and the labelled messages to take into account
+  single, summary step goals.
+"""
+DUMMY_GRAPH_NODE_NAME = "dummy-goal"
+
+
+class JourneySuccessMetric(Evaluation):
+    def __init__(self, llm_client=None):
+        super().__init__(llm_client)
+        self.is_strict = True
+
+    @property
+    def name(self):
+        return "Journey Success"
+
+    def find_terminal_nodes(self, graph: dict[str, list[str]]) -> set[str]:
+        """Finds terminal nodes (nodes with no outgoing edges).
+
+        Args:
+            graph: the input graph
+
+        Returns:
+            a set of the terminal nodes
+        """
+
+        seen_nodes = set()  # track seen nodes
+        non_terminal_nodes = set()  # track nodes with children
+
+        for node in graph:
+            seen_nodes.add(node)
+            if graph[node]:
+                non_terminal_nodes.add(node)
+                for n in graph[node]:
+                    seen_nodes.add(n)
+        return seen_nodes - non_terminal_nodes
+
+    def is_topological_sort(
+        self,
+        graph: dict[str, list[str]],
+        ordering: list[str],
+        is_strict: bool = True,
+    ) -> bool:
+        """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.
+
+        Args:
+            graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+            ordering: the nodes visited, in order
+
+        Returns:
+            Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+        """
+        # No keyword match or goal details were achieved
+        if not ordering:
+            return False
+
+        if is_strict:
+            # strict matching: only consider most recent tool call
+            position = {node: [i] for i, node in enumerate(ordering)}
+        else:
+            # lenient matching: consider all tool calls (account for all indexes of the node)
+            position = defaultdict(list)
+            for i, node in enumerate(ordering):
+                position[node].append(i)
+
+        terminal_nodes = self.find_terminal_nodes(graph)
+        # adds a dummy node for each terminal node
+        next_idx = (
+            max(val for values in position.values() for val in values) + 1
+        )
+
+        for n in terminal_nodes:
+            graph[n] = [DUMMY_GRAPH_NODE_NAME]
+            graph[DUMMY_GRAPH_NODE_NAME] = []
+            position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+            next_idx += 1
+
+        for node in graph:
+            for child_nodes in graph[node]:
+                # Current node/children doesn't show up in made calls
+                if node not in position or child_nodes not in position:
+                    return False
+                # Current node doesn't show up before any of its child
+                # all index in current nodes are larger than every child nodes' index
+                if all(
+                    curr >= max(position[child_nodes])
+                    for curr in position[node]
+                ):
+                    return False
+        return True
+
+    def evaluate(
+        self, messages, ground_truth, extracted_context, metadata, **kwargs
+    ):
+        labeled_messages = extracted_context.get("labeled_messages")
+        correct_tool_calls = []
+
+        for message_idx, matching_goal_details in labeled_messages.items():
+            msg_tool_call = messages[message_idx]
+            msg_tool_call = msg_tool_call.tool_calls[0].function
+            for goal_detail in matching_goal_details:
+                args_match = argument_matching(
+                    expected=goal_detail.args,
+                    actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                )
+
+                if args_match:
+                    correct_tool_calls.append(goal_detail.name)
+
+        is_topological_sort = self.is_topological_sort(
+            graph=ground_truth.goals,
+            ordering=correct_tool_calls,
+            is_strict=self.is_strict,
+        )
+
+        return LangfuseMetric(
+            eval_name=self.name,
+            comment="",
+            value=is_topological_sort,
+            data_type="NUMERIC",
+            metadata=metadata,
+        )
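`JourneySuccessMetric` reduces journey success to an ordering check: the ground-truth goals form a dependency graph, and the sequence of correctly-argued tool calls must visit every node after its prerequisites (a dummy terminal goal absorbs single-step journeys). A toy check of `is_topological_sort`, assuming the wheel and its `langfuse` dependency are installed and the hunk above is `wxo_agentic_evaluation/metrics/journey_success.py`; the graph and tool names are made up.

```python
from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric

metric = JourneySuccessMetric()  # no LLM client is needed for the ordering check

# Hypothetical ground truth: "search_employee" must run before "update_salary".
ok = metric.is_topological_sort(
    graph={"search_employee": ["update_salary"], "update_salary": []},
    ordering=["search_employee", "update_salary"],
)
print(ok)  # True: the dependency is respected

bad = metric.is_topological_sort(
    graph={"search_employee": ["update_salary"], "update_salary": []},
    ordering=["update_salary", "search_employee"],
)
print(bad)  # False: the child tool was observed before its prerequisite
```

Note that the method mutates `graph` in place (it appends the dummy goal to every terminal node), so a fresh dict literal is passed on each call above.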
wxo_agentic_evaluation/metrics/llm_as_judge.py

@@ -44,3 +44,29 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
             "answer_relevancy": self.answer_relevancy,
             "answer_relevancy_score": self.answer_relevancy_score,
         }
+
+
+class AnswerDerailment(BaseLLMJudgeMetric):
+    in_scope: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
+        }
+
+
+class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+    is_safe: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "safe_topic_score": str(self.is_safe),
+        }
wxo_agentic_evaluation/metrics/metrics.py

@@ -1,8 +1,9 @@
-import
-from enum import Enum
-from typing import Any, List, Mapping, Optional, Tuple
+from collections import defaultdict
+from enum import Enum, StrEnum
+from typing import Any, Dict, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
 from wxo_agentic_evaluation.metrics.llm_as_judge import (
     AnswerRelevancy,
@@ -11,12 +12,34 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import (
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
 
 
-
-
-
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
 
-
-
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
 
 
 class KnowledgeBaseMetrics(BaseModel):
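The new `DescriptionQualityMetric` model turns a raw description score into a `GOOD`/`BAD`/`MISSING` label through its computed fields. A small illustration, assuming the wheel is installed and the hunks above belong to `wxo_agentic_evaluation/metrics/metrics.py` as the added/removed line counts suggest; the tool name, score, and threshold are made up.

```python
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric

# A score at or above the threshold flags the description as bad.
flagged = DescriptionQualityMetric(
    tool_name="get_weather", description_score=0.9, threshold=0.8
)
print(flagged.description_quality)  # BAD

# No score at all is reported as MISSING.
print(DescriptionQualityMetric(tool_name="get_weather").description_quality)  # MISSING
```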
wxo_agentic_evaluation/metrics/metrics.py (continued)

@@ -72,6 +95,8 @@ class KnowledgeBaseMetricSummary(BaseModel):
     @computed_field(alias="summary")
     @property
     def average(self) -> Mapping[str, Any]:
+        from wxo_agentic_evaluation.utils.utils import average
+
         summary = {}
         for dataset, metric in self.groupby_dataset.items():
             average_metric = {}
@@ -175,6 +200,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
     )
 
 
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
@@ -187,6 +219,15 @@ class FailedSemanticTestCases(BaseModel):
     explanation: str
     output: int
     confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
 
 
 class ReferenceLessEvalMetrics(BaseModel):
@@ -201,3 +242,230 @@ class ReferenceLessEvalMetrics(BaseModel):
     failed_semantic_tool_calls: Optional[
         List[Tuple[int, List[FailedSemanticTestCases]]]
     ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class LangfuseMetric(Metric):
+    comment: Optional[str] = ""
+    data_type: Optional[str] = ""
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
+
+
+def create_avg_row(metrics: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """
+    Create an average row from a list of metric dictionaries.
+
+    Args:
+        metrics: List of metric dictionaries
+
+    Returns:
+        Dictionary with averaged metrics
+    """
+    from wxo_agentic_evaluation.utils.utils import safe_divide
+
+    avg_row = {
+        "Dataset": "Summary (Average)",
+        "Runs": 0,
+        "Total Steps": 0,
+        "LLM Steps": 0,
+        "Total Tool Calls": 0,
+        "Tool Call Precision": 0,
+        "Tool Call Recall": 0,
+        "Agent Routing Accuracy": 0,
+        "Text Match": 0,
+        "Journey Success": 0,
+        "Avg Resp Time (sec)": 0,
+    }
+
+    if metrics:
+        for row in metrics:
+            avg_row["Runs"] += row.get("Runs", 0)
+            avg_row["Total Steps"] += row["Total Steps"]
+            avg_row["LLM Steps"] += row["LLM Steps"]
+            avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+            avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+            avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+            avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+            avg_row["Text Match"] += row["Text Match"]
+            avg_row["Journey Success"] += row["Journey Success"]
+            avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+        n = len(metrics)
+        # Average over datasets
+        avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
+        avg_row["Total Steps"] = round(
+            safe_divide(avg_row["Total Steps"], n), 2
+        )
+        avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], n), 2)
+        avg_row["Total Tool Calls"] = round(
+            safe_divide(avg_row["Total Tool Calls"], n), 2
+        )
+        avg_row["Tool Call Precision"] = round(
+            safe_divide(avg_row["Tool Call Precision"], n), 2
+        )
+        avg_row["Tool Call Recall"] = round(
+            safe_divide(avg_row["Tool Call Recall"], n), 2
+        )
+        avg_row["Agent Routing Accuracy"] = round(
+            safe_divide(avg_row["Agent Routing Accuracy"], n), 2
+        )
+        avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], n), 2)
+        avg_row["Journey Success"] = round(
+            safe_divide(avg_row["Journey Success"], n), 2
+        )
+        avg_row["Avg Resp Time (sec)"] = round(
+            safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
+        )
+
+    return avg_row
+
+
+def format_metrics_for_display(
+    tool_call_metrics: list[ToolCallAndRoutingMetrics],
+) -> list[dict[str, Any]]:
+    from wxo_agentic_evaluation.utils.utils import mean, safe_divide, to_pct
+
+    # Group metrics by dataset name
+    grouped = defaultdict(list)
+    for m in tool_call_metrics:
+        grouped[m.dataset_name].append(
+            {
+                "Dataset": m.dataset_name,
+                "Total Steps": m.total_steps,
+                "LLM Steps": m.llm_step,
+                "Total Tool Calls": m.total_tool_calls,
+                "Tool Call Precision": m.tool_call_precision,
+                "Tool Call Recall": m.tool_call_recall,
+                "Agent Routing Accuracy": m.agent_routing_accuracy,
+                "Text Match": m.text_match,
+                "Journey Success": m.is_success,
+                "Avg Resp Time (sec)": m.avg_resp_time,
+            }
+        )
+
+    # Create per-test rows with averages over runs
+    per_test_rows = []
+    for ds, rows in grouped.items():
+        out = {"Dataset": ds}
+
+        # Average numeric columns over runs
+        numeric_keys = [
+            "Total Steps",
+            "LLM Steps",
+            "Total Tool Calls",
+            "Tool Call Precision",
+            "Tool Call Recall",
+            "Agent Routing Accuracy",
+            "Avg Resp Time (sec)",
+        ]
+
+        for k in numeric_keys:
+            out[k] = mean(
+                [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+            )
+
+        # Add total runs per dataset
+        out["Runs"] = round(float(len(rows)), 2)
+
+        # Journey Success -> numeric fraction in [0,1]
+        js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+        out["Journey Success"] = round(
+            safe_divide(sum(js_vals), len(js_vals)), 2
+        )
+
+        # Text Match -> numeric fraction in [0,1]
+        tm_hits = 0
+        tm_den = len(rows)
+        for r in rows:
+            val = r.get("Text Match")
+            if str(val).strip() == TextMatchType.text_match.value:
+                tm_hits += 1
+        out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+        per_test_rows.append(out)
+
+    # Create overall average row
+    overall_row = create_avg_row(per_test_rows)
+
+    # Format percentages
+    tool_call_metrics_for_display = per_test_rows + [overall_row]
+    for row in tool_call_metrics_for_display:
+        row["Text Match"] = to_pct(row.get("Text Match"), decimals=0)
+        row["Journey Success"] = to_pct(row.get("Journey Success"), decimals=0)
+
+    column_order = [
+        "Dataset",
+        "Runs",
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Text Match",
+        "Journey Success",
+        "Avg Resp Time (sec)",
+    ]
+
+    tool_call_metrics_for_display = [
+        {col: row.get(col, "") for col in column_order}
+        for row in tool_call_metrics_for_display
+    ]
+
+    return tool_call_metrics_for_display
+
+
+def extract_metrics(
+    results: List[
+        Tuple[
+            ToolCallAndRoutingMetrics,
+            KnowledgeBaseMetricSummary,
+            CustomEvalMetrics,
+        ]
+    ],
+) -> tuple[
+    list[ToolCallAndRoutingMetrics],
+    KnowledgeBaseMetricSummary,
+    List[CustomEvalMetrics],
+]:
+    """
+    Aggregate metrics from test results.
+
+    Args:
+        results: List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+
+    Returns:
+        Tuple of (knowledge_base_summary, tool_rows, custom_metrics)
+    """
+
+    tool_call_metrics = [metric[0] for metric in results]
+    knowledge_base_metrics = [metric[1] for metric in results]
+    custom_metrics: List[CustomEvalMetrics] = [metric[2] for metric in results]
+
+    kb_summary = KnowledgeBaseMetricSummary(
+        knowledge_base_metrics=knowledge_base_metrics
+    )
+
+    if len(tool_call_metrics) > 0:
+        # Remove the average row if it exists
+        tool_call_metrics = [
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
+        ]
+
+    return tool_call_metrics, kb_summary, custom_metrics
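`create_avg_row` builds the "Summary (Average)" row from the per-dataset display rows produced by `format_metrics_for_display`. A sketch of its input and output shape, assuming the wheel is installed; the dataset names and numbers are fabricated, and the expected value assumes `safe_divide` is plain division with a zero-denominator guard.

```python
from wxo_agentic_evaluation.metrics.metrics import create_avg_row

rows = [
    {
        "Dataset": "banking_faq", "Runs": 2, "Total Steps": 10, "LLM Steps": 4,
        "Total Tool Calls": 3, "Tool Call Precision": 1.0, "Tool Call Recall": 0.5,
        "Agent Routing Accuracy": 1.0, "Text Match": 1.0, "Journey Success": 1.0,
        "Avg Resp Time (sec)": 2.0,
    },
    {
        "Dataset": "hr_requests", "Runs": 2, "Total Steps": 6, "LLM Steps": 2,
        "Total Tool Calls": 1, "Tool Call Precision": 0.5, "Tool Call Recall": 1.0,
        "Agent Routing Accuracy": 0.5, "Text Match": 0.0, "Journey Success": 0.0,
        "Avg Resp Time (sec)": 4.0,
    },
]

summary = create_avg_row(rows)
print(summary["Dataset"])              # Summary (Average)
print(summary["Tool Call Precision"])  # 0.75 -- mean over the two datasets
```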
wxo_agentic_evaluation/metrics/tool_calling.py (new file)

@@ -0,0 +1,93 @@
+import json
+from typing import List, Union
+
+from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+from wxo_agentic_evaluation.metrics.metrics import (
+    LangfuseMetric,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.type import ContentType
+
+
+class ToolCalling(Evaluation):
+    @property
+    def name(self):
+        return "Tool Calling Metrics"
+
+    def evaluate(
+        self, messages, ground_truth, extracted_context, metadata, **kwargs
+    ) -> Union[LangfuseMetric, List[LangfuseMetric]]:
+        dataset_name = kwargs.get("dataset", "")
+
+        total_tool_calls = 0
+        relevant_tool_calls = 0
+        tool_calls_with_incorrect_parameter = 0
+        correct_tool_calls = set()
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        labeled_messages = extracted_context.get("labeled_messages")
+        total_tool_calls = len(
+            [
+                message
+                for message in messages
+                if message.type == ContentType.tool_call
+            ]
+        )
+        relevant_tool_calls = len(labeled_messages)
+
+        for message_idx, matching_goal_details in labeled_messages.items():
+            msg_tool_call = messages[message_idx]
+            msg_tool_call = msg_tool_call.tool_calls[0].function
+            for goal_detail in matching_goal_details:
+                # TODO flesh out to match ADK EVAL
+                args_match = argument_matching(
+                    expected=goal_detail.args,
+                    actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                )
+
+                if args_match:
+                    correct_tool_calls.add(goal_detail.name)
+                else:
+                    tool_calls_with_incorrect_parameter += 1
+
+        # TODO: think about the dataset name
+        # TODO: total_steps
+        tool_call_metrics = ToolCallAndRoutingMetrics(
+            dataset_name=dataset_name,
+            total_tool_calls=total_tool_calls,
+            expected_tool_calls=len(tool_dictionary),
+            correct_tool_calls=len(correct_tool_calls),
+            relevant_tool_calls=relevant_tool_calls,
+            tool_calls_with_incorrect_parameter=tool_calls_with_incorrect_parameter,
+        )
+
+        tool_call_metrics = tool_call_metrics.model_dump()
+
+        metrics = []
+
+        for tool in [
+            "total_tool_calls",
+            "correct_tool_calls",
+            "expected_tool_calls",
+            "tool_calls_with_incorrect_parameter",
+            "tool_call_recall",
+            "tool_call_precision",
+        ]:
+            metric = LangfuseMetric(
+                eval_name=tool,
+                value=tool_call_metrics.get(tool),
+                metadata=metadata,
+                data_type="NUMERIC",
+            )
+            metrics.append(metric)
+
+        return metrics
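`ToolCalling.evaluate` fans the counters of the dumped `ToolCallAndRoutingMetrics` out into one Langfuse score per field. Each returned item is shaped like the sketch below, assuming the wheel is installed; the value and metadata payload are made up.

```python
from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric

score = LangfuseMetric(
    eval_name="tool_call_precision",
    value=0.75,                               # fabricated value
    data_type="NUMERIC",
    metadata={"test_case": "hr_agent_demo"},  # hypothetical metadata payload
)
print(score.model_dump())  # eval_name, value, metadata, comment, data_type
```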
wxo_agentic_evaluation/otel_parser/__init__.py (new file)

@@ -0,0 +1 @@
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser