ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/metrics/evaluations.py
@@ -0,0 +1,107 @@
+ import os
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Optional
+
+ from wxo_agentic_evaluation.metrics.metrics import Metric
+ from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+ from wxo_agentic_evaluation.service_provider.provider import Provider
+ from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+ from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+ root_dir: str = os.path.dirname(os.path.dirname(__file__))
+ LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+ class Evaluation(ABC):
+     """Abstract base class for all evaluations."""
+
+     def __init__(self, llm_client: Optional[Provider] = None) -> None:
+         self._llm_client = llm_client
+
+     @property
+     def llm_client(self) -> Any:
+         """Access client, require it if used."""
+         if self._llm_client is None:
+             raise RuntimeError(
+                 f"{self.__class__.__name__} requires a client, but none was provided"
+             )
+         return self._llm_client
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Unique name for the evaluator."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def evaluate(
+         self,
+         messages: list[Message],
+         ground_truth: OrchestrateDataset,
+         extracted_context: Dict[str, Any],
+     ) -> Optional[Metric]:
+         """
+         Evaluation method.
+
+         Args:
+             messages: agent and user conversational messages (includes tool calls)
+             ground_truth: ground truth data
+             extracted_context: dictionary containing data derived from the messages
+
+         Returns:
+             Metric
+         """
+         raise NotImplementedError
+
+
+ class LLMaaJEvaluation(Evaluation, ABC):
+     """Evaluation metric for LLMaaJ."""
+
+     @property
+     @abstractmethod
+     def llmaaj_instructions(self) -> str:
+         """LLMaaJ instructions for the evaluator."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def format_llm_output(self, string: str) -> int | float | bool | str:
+         """Format the output of the LLMaaJ query."""
+         raise NotImplementedError
+
+     @property
+     def selected_context_keys(self) -> set[str]:
+         """Override to choose which context keys are passed to the prompt."""
+         return set()
+
+     def select_context(
+         self, extracted_context: Dict[str, Any]
+     ) -> dict[str, Any]:
+         """Additional context to be added to the prompt."""
+         selected_context = {
+             key: value
+             for key, value in extracted_context.items()
+             if key in self.selected_context_keys
+         }
+
+         return selected_context
+
+     def evaluate(
+         self,
+         messages: list[Message],
+         ground_truth: OrchestrateDataset,
+         extracted_context: Dict[str, Any],
+     ) -> Optional[Metric]:
+         renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+         parsed = ParsedMessages(messages=messages)
+         if parsed.user_input is None or parsed.agent_response is None:
+             return None
+         context = str(self.select_context(extracted_context))
+         prompt = renderer.render(
+             user_input=parsed.user_input,
+             agent_answer=parsed.agent_response,
+             llmaaj_instructions=self.llmaaj_instructions,
+             context=context,
+         )
+         score_str = self.llm_client.query(prompt)
+         value = self.format_llm_output(score_str)
+         return Metric(eval_name=self.name, value=value)
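
Evaluation and LLMaaJEvaluation above define the plug-in surface for custom metrics: a subclass declares a name, judge instructions, and output parsing, while the base class renders the prompt and calls llm_client.query. A minimal sketch of a concrete subclass, assuming the module path shown in this hunk; the PolitenessEvaluation class and its instructions are illustrative, not part of the package:

from wxo_agentic_evaluation.metrics.evaluations import LLMaaJEvaluation


class PolitenessEvaluation(LLMaaJEvaluation):
    """Hypothetical LLM-as-a-judge check: rates how polite the agent's final answer is."""

    @property
    def name(self) -> str:
        return "politeness"

    @property
    def llmaaj_instructions(self) -> str:
        return "Rate the politeness of the agent's answer from 0 to 10. Reply with the number only."

    @property
    def selected_context_keys(self) -> set[str]:
        # only these keys from extracted_context get rendered into the prompt
        return {"labeled_messages"}

    def format_llm_output(self, string: str) -> int | float | bool | str:
        # the judge is asked for a bare number; fall back to the raw string if parsing fails
        try:
            return float(string.strip())
        except ValueError:
            return string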
wxo_agentic_evaluation/metrics/journey_success.py
@@ -0,0 +1,137 @@
+ import json
+ from collections import defaultdict
+
+ from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+ from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+ from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+ ## fix later
+ from wxo_agentic_evaluation.otel_parser.parser_types import (
+     Message as OtelMessage,
+ )
+
+ """
+ - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
+ - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
+ a dummy node to the ground truth and the labelled messages to take into account
+ single, summary step goals.
+ """
+ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
+
+
+ class JourneySuccessMetric(Evaluation):
+     def __init__(self, llm_client=None):
+         super().__init__(llm_client)
+         self.is_strict = True
+
+     @property
+     def name(self):
+         return "Journey Success"
+
+     def find_terminal_nodes(self, graph: dict[str, list[str]]) -> set[str]:
+         """Finds terminal nodes (nodes with no outgoing edges).
+
+         Args:
+             graph: the input graph
+
+         Returns:
+             a set of the terminal nodes
+         """
+
+         seen_nodes = set()  # track seen nodes
+         non_terminal_nodes = set()  # track nodes with children
+
+         for node in graph:
+             seen_nodes.add(node)
+             if graph[node]:
+                 non_terminal_nodes.add(node)
+                 for n in graph[node]:
+                     seen_nodes.add(n)
+         return seen_nodes - non_terminal_nodes
+
+     def is_topological_sort(
+         self,
+         graph: dict[str, list[str]],
+         ordering: list[str],
+         is_strict: bool = True,
+     ) -> bool:
+         """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.
+
+         Args:
+             graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+             ordering: the nodes visited, in order
+
+         Returns:
+             Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+         """
+         # No keyword match or goal details were achieved
+         if not ordering:
+             return False
+
+         if is_strict:
+             # strict matching: only consider most recent tool call
+             position = {node: [i] for i, node in enumerate(ordering)}
+         else:
+             # lenient matching: consider all tool calls (account for all indexes of the node)
+             position = defaultdict(list)
+             for i, node in enumerate(ordering):
+                 position[node].append(i)
+
+         terminal_nodes = self.find_terminal_nodes(graph)
+         # adds a dummy node for each terminal node
+         next_idx = (
+             max(val for values in position.values() for val in values) + 1
+         )
+
+         for n in terminal_nodes:
+             graph[n] = [DUMMY_GRAPH_NODE_NAME]
+             graph[DUMMY_GRAPH_NODE_NAME] = []
+             position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+             next_idx += 1
+
+         for node in graph:
+             for child_nodes in graph[node]:
+                 # Current node/children doesn't show up in made calls
+                 if node not in position or child_nodes not in position:
+                     return False
+                 # Current node doesn't show up before any of its child
+                 # all index in current nodes are larger than every child nodes' index
+                 if all(
+                     curr >= max(position[child_nodes])
+                     for curr in position[node]
+                 ):
+                     return False
+         return True
+
+     def evaluate(
+         self, messages, ground_truth, extracted_context, metadata, **kwargs
+     ):
+         labeled_messages = extracted_context.get("labeled_messages")
+         correct_tool_calls = []
+
+         for message_idx, matching_goal_details in labeled_messages.items():
+             msg_tool_call = messages[message_idx]
+             msg_tool_call = msg_tool_call.tool_calls[0].function
+             for goal_detail in matching_goal_details:
+                 args_match = argument_matching(
+                     expected=goal_detail.args,
+                     actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                 )
+
+                 if args_match:
+                     correct_tool_calls.append(goal_detail.name)
+
+         is_topological_sort = self.is_topological_sort(
+             graph=ground_truth.goals,
+             ordering=correct_tool_calls,
+             is_strict=self.is_strict,
+         )
+
+         return LangfuseMetric(
+             eval_name=self.name,
+             comment="",
+             value=is_topological_sort,
+             data_type="NUMERIC",
+             metadata=metadata,
+         )
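
JourneySuccessMetric treats ground_truth.goals as a dependency graph and the correctly matched tool calls as an ordering; the journey succeeds when every goal is reached only after its prerequisites. A small sketch of that check in isolation (the goal names are made up; the metric is constructed without an LLM client, which this class does not use):

from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric

metric = JourneySuccessMetric()

# "create_ticket" must happen before "notify_user"
goals = {"create_ticket": ["notify_user"], "notify_user": []}

# valid order -> expected True
print(metric.is_topological_sort(graph=dict(goals), ordering=["create_ticket", "notify_user"]))
# prerequisite violated -> expected False
print(metric.is_topological_sort(graph=dict(goals), ordering=["notify_user", "create_ticket"]))

Note that is_topological_sort mutates the graph it receives (it appends the dummy terminal node), hence the dict(goals) copies above.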
wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -44,3 +44,29 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
              "answer_relevancy": self.answer_relevancy,
              "answer_relevancy_score": self.answer_relevancy_score,
          }
+
+
+ class AnswerDerailment(BaseLLMJudgeMetric):
+     in_scope: str | float
+     statement: str
+     reason: str
+
+     def table(self):
+         return {
+             "statement": self.statement,
+             "reason": self.reason,
+             "on_topic_score": str(self.in_scope),
+         }
+
+
+ class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+     is_safe: str | float
+     statement: str
+     reason: str
+
+     def table(self):
+         return {
+             "statement": self.statement,
+             "reason": self.reason,
+             "safe_topic_score": str(self.is_safe),
+         }
wxo_agentic_evaluation/metrics/metrics.py
@@ -1,8 +1,9 @@
- import math
- from enum import Enum
- from typing import Any, List, Mapping, Optional, Tuple
+ from collections import defaultdict
+ from enum import Enum, StrEnum
+ from typing import Any, Dict, List, Mapping, Optional, Tuple

  from pydantic import BaseModel, computed_field
+ from pydantic.fields import Field

  from wxo_agentic_evaluation.metrics.llm_as_judge import (
      AnswerRelevancy,
@@ -11,12 +12,34 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import (
  from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore


- def average(array):
-     if len(array) == 0:
-         return math.nan
+ class DescriptionQuality(StrEnum):
+     GOOD = "GOOD"
+     BAD = "BAD"
+     MISSING = "MISSING"

-     else:
-         return sum(array) / len(array)
+
+
+ class DescriptionQualityMetric(BaseModel):
+     tool_name: str = None
+     description_score: float | None = None
+     threshold: float | None = None
+
+     @computed_field
+     @property
+     def is_bad_description(self) -> Optional[bool]:
+         if self.description_score and self.threshold:
+             return self.description_score >= self.threshold
+
+         return None
+
+     @computed_field
+     @property
+     def description_quality(self) -> str:
+         if self.description_score is None:
+             return DescriptionQuality.MISSING
+         elif self.is_bad_description:
+             return DescriptionQuality.BAD
+         else:
+             return DescriptionQuality.GOOD


  class KnowledgeBaseMetrics(BaseModel):
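
DescriptionQuality and DescriptionQualityMetric replace the old module-level average helper, which moves to utils.utils. A quick illustration of the computed fields with made-up scores (a score at or above the threshold is treated as a bad description; no score at all reports MISSING):

from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric

flagged = DescriptionQualityMetric(tool_name="get_weather", description_score=0.91, threshold=0.8)
print(flagged.is_bad_description, flagged.description_quality)  # expected: True BAD

missing = DescriptionQualityMetric(tool_name="get_weather")
print(missing.description_quality)  # expected: MISSING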
@@ -72,6 +95,8 @@ class KnowledgeBaseMetricSummary(BaseModel):
      @computed_field(alias="summary")
      @property
      def average(self) -> Mapping[str, Any]:
+         from wxo_agentic_evaluation.utils.utils import average
+
          summary = {}
          for dataset, metric in self.groupby_dataset.items():
              average_metric = {}
@@ -175,6 +200,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
      )


+ class Annotation(BaseModel):
+     recommendation: str
+     details: str
+     quote: str
+     parameter_name: Optional[str]
+
+
  class FailedStaticTestCases(BaseModel):
      metric_name: str
      description: str
@@ -187,6 +219,15 @@ class FailedSemanticTestCases(BaseModel):
      explanation: str
      output: int
      confidence: float
+     annotations: Optional[List[Annotation]] = None
+
+
+ class EnhancedAnalyzeMetrics(BaseModel):
+     test_case_name: str
+     tool_names: List[str]
+     parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+     tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+     static_metrics: List[List[FailedStaticTestCases]] = [[]]


  class ReferenceLessEvalMetrics(BaseModel):
@@ -201,3 +242,230 @@ class ReferenceLessEvalMetrics(BaseModel):
      failed_semantic_tool_calls: Optional[
          List[Tuple[int, List[FailedSemanticTestCases]]]
      ]
+
+
+ class Metric(BaseModel):
+     """Generic metric result."""
+
+     eval_name: str = Field(description="name of the eval that produced the metric")
+     value: int | float | bool | str = Field(description="metric value")
+     metadata: Optional[dict] = Field(
+         default=None,
+         description="metadata that was generated alongside the metric. example: llmaaj reason, retrieval score",
+     )
+
+
+ class LangfuseMetric(Metric):
+     comment: Optional[str] = ""
+     data_type: Optional[str] = ""
+
+
+ class CustomEvalMetrics(BaseModel):
+     dataset_name: str
+     custom_metrics: list[Metric]
+
+
+ def create_avg_row(metrics: List[Dict[str, Any]]) -> Dict[str, Any]:
+     """
+     Create an average row from a list of metric dictionaries.
+
+     Args:
+         metrics: List of metric dictionaries
+
+     Returns:
+         Dictionary with averaged metrics
+     """
+     from wxo_agentic_evaluation.utils.utils import safe_divide
+
+     avg_row = {
+         "Dataset": "Summary (Average)",
+         "Runs": 0,
+         "Total Steps": 0,
+         "LLM Steps": 0,
+         "Total Tool Calls": 0,
+         "Tool Call Precision": 0,
+         "Tool Call Recall": 0,
+         "Agent Routing Accuracy": 0,
+         "Text Match": 0,
+         "Journey Success": 0,
+         "Avg Resp Time (sec)": 0,
+     }
+
+     if metrics:
+         for row in metrics:
+             avg_row["Runs"] += row.get("Runs", 0)
+             avg_row["Total Steps"] += row["Total Steps"]
+             avg_row["LLM Steps"] += row["LLM Steps"]
+             avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+             avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+             avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+             avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+             avg_row["Text Match"] += row["Text Match"]
+             avg_row["Journey Success"] += row["Journey Success"]
+             avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+         n = len(metrics)
+         # Average over datasets
+         avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
+         avg_row["Total Steps"] = round(
+             safe_divide(avg_row["Total Steps"], n), 2
+         )
+         avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], n), 2)
+         avg_row["Total Tool Calls"] = round(
+             safe_divide(avg_row["Total Tool Calls"], n), 2
+         )
+         avg_row["Tool Call Precision"] = round(
+             safe_divide(avg_row["Tool Call Precision"], n), 2
+         )
+         avg_row["Tool Call Recall"] = round(
+             safe_divide(avg_row["Tool Call Recall"], n), 2
+         )
+         avg_row["Agent Routing Accuracy"] = round(
+             safe_divide(avg_row["Agent Routing Accuracy"], n), 2
+         )
+         avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], n), 2)
+         avg_row["Journey Success"] = round(
+             safe_divide(avg_row["Journey Success"], n), 2
+         )
+         avg_row["Avg Resp Time (sec)"] = round(
+             safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
+         )
+
+     return avg_row
+
+
+ def format_metrics_for_display(
+     tool_call_metrics: list[ToolCallAndRoutingMetrics],
+ ) -> list[dict[str, Any]]:
+     from wxo_agentic_evaluation.utils.utils import mean, safe_divide, to_pct
+
+     # Group metrics by dataset name
+     grouped = defaultdict(list)
+     for m in tool_call_metrics:
+         grouped[m.dataset_name].append(
+             {
+                 "Dataset": m.dataset_name,
+                 "Total Steps": m.total_steps,
+                 "LLM Steps": m.llm_step,
+                 "Total Tool Calls": m.total_tool_calls,
+                 "Tool Call Precision": m.tool_call_precision,
+                 "Tool Call Recall": m.tool_call_recall,
+                 "Agent Routing Accuracy": m.agent_routing_accuracy,
+                 "Text Match": m.text_match,
+                 "Journey Success": m.is_success,
+                 "Avg Resp Time (sec)": m.avg_resp_time,
+             }
+         )
+
+     # Create per-test rows with averages over runs
+     per_test_rows = []
+     for ds, rows in grouped.items():
+         out = {"Dataset": ds}
+
+         # Average numeric columns over runs
+         numeric_keys = [
+             "Total Steps",
+             "LLM Steps",
+             "Total Tool Calls",
+             "Tool Call Precision",
+             "Tool Call Recall",
+             "Agent Routing Accuracy",
+             "Avg Resp Time (sec)",
+         ]
+
+         for k in numeric_keys:
+             out[k] = mean(
+                 [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+             )
+
+         # Add total runs per dataset
+         out["Runs"] = round(float(len(rows)), 2)
+
+         # Journey Success -> numeric fraction in [0,1]
+         js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+         out["Journey Success"] = round(
+             safe_divide(sum(js_vals), len(js_vals)), 2
+         )
+
+         # Text Match -> numeric fraction in [0,1]
+         tm_hits = 0
+         tm_den = len(rows)
+         for r in rows:
+             val = r.get("Text Match")
+             if str(val).strip() == TextMatchType.text_match.value:
+                 tm_hits += 1
+         out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+         per_test_rows.append(out)
+
+     # Create overall average row
+     overall_row = create_avg_row(per_test_rows)
+
+     # Format percentages
+     tool_call_metrics_for_display = per_test_rows + [overall_row]
+     for row in tool_call_metrics_for_display:
+         row["Text Match"] = to_pct(row.get("Text Match"), decimals=0)
+         row["Journey Success"] = to_pct(row.get("Journey Success"), decimals=0)
+
+     column_order = [
+         "Dataset",
+         "Runs",
+         "Total Steps",
+         "LLM Steps",
+         "Total Tool Calls",
+         "Tool Call Precision",
+         "Tool Call Recall",
+         "Agent Routing Accuracy",
+         "Text Match",
+         "Journey Success",
+         "Avg Resp Time (sec)",
+     ]
+
+     tool_call_metrics_for_display = [
+         {col: row.get(col, "") for col in column_order}
+         for row in tool_call_metrics_for_display
+     ]
+
+     return tool_call_metrics_for_display
+
+
+ def extract_metrics(
+     results: List[
+         Tuple[
+             ToolCallAndRoutingMetrics,
+             KnowledgeBaseMetricSummary,
+             CustomEvalMetrics,
+         ]
+     ],
+ ) -> tuple[
+     list[ToolCallAndRoutingMetrics],
+     KnowledgeBaseMetricSummary,
+     List[CustomEvalMetrics],
+ ]:
+     """
+     Aggregate metrics from test results.
+
+     Args:
+         results: List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+
+     Returns:
+         Tuple of (tool_call_metrics, knowledge_base_summary, custom_metrics)
+     """
+
+     tool_call_metrics = [metric[0] for metric in results]
+     knowledge_base_metrics = [metric[1] for metric in results]
+     custom_metrics: List[CustomEvalMetrics] = [metric[2] for metric in results]
+
+     kb_summary = KnowledgeBaseMetricSummary(
+         knowledge_base_metrics=knowledge_base_metrics
+     )
+
+     if len(tool_call_metrics) > 0:
+         # Remove the average row if it exists
+         tool_call_metrics = [
+             row
+             for row in tool_call_metrics
+             if row.dataset_name != "Summary (Average)"
+         ]
+
+     return tool_call_metrics, kb_summary, custom_metrics
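
Metric is the generic result type that custom evaluations return, LangfuseMetric adds the comment and data_type fields that Langfuse scoring expects, and CustomEvalMetrics groups results per dataset. A short construction sketch (the names and values are illustrative only):

from wxo_agentic_evaluation.metrics.metrics import (
    CustomEvalMetrics,
    LangfuseMetric,
    Metric,
)

basic = Metric(eval_name="politeness", value=8.5, metadata={"reason": "friendly greeting"})
scored = LangfuseMetric(eval_name="Journey Success", value=True, comment="", data_type="BOOLEAN")

bundle = CustomEvalMetrics(dataset_name="hr_sample_dataset", custom_metrics=[basic, scored])
print(bundle.model_dump())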
wxo_agentic_evaluation/metrics/tool_calling.py
@@ -0,0 +1,93 @@
+ import json
+ from typing import List, Union
+
+ from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+ from wxo_agentic_evaluation.metrics.metrics import (
+     LangfuseMetric,
+     ToolCallAndRoutingMetrics,
+ )
+ from wxo_agentic_evaluation.type import ContentType
+
+
+ class ToolCalling(Evaluation):
+     @property
+     def name(self):
+         return "Tool Calling Metrics"
+
+     def evaluate(
+         self, messages, ground_truth, extracted_context, metadata, **kwargs
+     ) -> Union[LangfuseMetric, List[LangfuseMetric]]:
+         dataset_name = kwargs.get("dataset", "")
+
+         total_tool_calls = 0
+         relevant_tool_calls = 0
+         tool_calls_with_incorrect_parameter = 0
+         correct_tool_calls = set()
+
+         tool_dictionary = (
+             {
+                 goal_detail.name: goal_detail
+                 for goal_detail in ground_truth.goal_details
+                 if goal_detail.type == ContentType.tool_call
+             }
+             if ground_truth.goal_details
+             else {}
+         )
+
+         labeled_messages = extracted_context.get("labeled_messages")
+         total_tool_calls = len(
+             [
+                 message
+                 for message in messages
+                 if message.type == ContentType.tool_call
+             ]
+         )
+         relevant_tool_calls = len(labeled_messages)
+
+         for message_idx, matching_goal_details in labeled_messages.items():
+             msg_tool_call = messages[message_idx]
+             msg_tool_call = msg_tool_call.tool_calls[0].function
+             for goal_detail in matching_goal_details:
+                 # TODO flesh out to match ADK EVAL
+                 args_match = argument_matching(
+                     expected=goal_detail.args,
+                     actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                 )
+
+                 if args_match:
+                     correct_tool_calls.add(goal_detail.name)
+                 else:
+                     tool_calls_with_incorrect_parameter += 1
+
+         # TODO: think about the dataset name
+         # TODO: total_steps
+         tool_call_metrics = ToolCallAndRoutingMetrics(
+             dataset_name=dataset_name,
+             total_tool_calls=total_tool_calls,
+             expected_tool_calls=len(tool_dictionary),
+             correct_tool_calls=len(correct_tool_calls),
+             relevant_tool_calls=relevant_tool_calls,
+             tool_calls_with_incorrect_parameter=tool_calls_with_incorrect_parameter,
+         )
+
+         tool_call_metrics = tool_call_metrics.model_dump()
+
+         metrics = []
+
+         for tool in [
+             "total_tool_calls",
+             "correct_tool_calls",
+             "expected_tool_calls",
+             "tool_calls_with_incorrect_parameter",
+             "tool_call_recall",
+             "tool_call_precision",
+         ]:
+             metric = LangfuseMetric(
+                 eval_name=tool,
+                 value=tool_call_metrics.get(tool),
+                 metadata=metadata,
+                 data_type="NUMERIC",
+             )
+             metrics.append(metric)
+
+         return metrics
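
ToolCalling only accumulates raw counters (total, expected, correct, and incorrect-parameter calls); ToolCallAndRoutingMetrics turns them into precision and recall, which are then emitted as individual LangfuseMetric scores. As a rough sketch of the arithmetic those counters imply (assumed definitions; the actual computed fields live on ToolCallAndRoutingMetrics in metrics.py and may handle edge cases differently):

def tool_call_precision(correct_tool_calls: int, total_tool_calls: int) -> float:
    # fraction of calls the agent made that matched a ground-truth goal
    return correct_tool_calls / total_tool_calls if total_tool_calls else 0.0


def tool_call_recall(correct_tool_calls: int, expected_tool_calls: int) -> float:
    # fraction of expected ground-truth tool calls the agent made correctly
    return correct_tool_calls / expected_tool_calls if expected_tool_calls else 0.0


print(tool_call_precision(3, 4))  # 0.75
print(tool_call_recall(3, 5))     # 0.6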
wxo_agentic_evaluation/otel_parser/__init__.py
@@ -0,0 +1 @@
+ from wxo_agentic_evaluation.otel_parser import parser as otel_parser