ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
--- a/wxo_agentic_evaluation/metrics/llm_as_judge.py
+++ b/wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -1,8 +1,8 @@
-from pydantic import BaseModel, computed_field
-
 from abc import abstractmethod
 from functools import cached_property
 
+from pydantic import BaseModel, computed_field
+
 
 class BaseLLMJudgeMetric(BaseModel):
     @abstractmethod
@@ -44,3 +44,29 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
             "answer_relevancy": self.answer_relevancy,
             "answer_relevancy_score": self.answer_relevancy_score,
         }
+
+
+class AnswerDerailment(BaseLLMJudgeMetric):
+    in_scope: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
+        }
+
+
+class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+    is_safe: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "safe_topic_score": str(self.is_safe),
+        }
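The hunk above adds two new LLM-as-judge result models. The following is a minimal usage sketch, not part of the diff: it assumes the 1.1.8b0 wheel is installed and that the base class imposes no required fields beyond those shown in this hunk; the sample values are invented.

from wxo_agentic_evaluation.metrics.llm_as_judge import (
    AnswerDerailment,
    AnswerUnsafeTopic,
)

# Invented judge outputs, purely for illustration.
derailment = AnswerDerailment(
    in_scope=1.0,
    statement="The agent answered a question about PTO balances.",
    reason="The reply stays within the agent's scope.",
)
unsafe = AnswerUnsafeTopic(
    is_safe="yes",
    statement="The agent declined to discuss a prohibited topic.",
    reason="No unsafe content was produced.",
)

# table() flattens each judgement for display; the score field is stringified.
print(derailment.table())  # {'statement': ..., 'reason': ..., 'on_topic_score': '1.0'}
print(unsafe.table())      # {'statement': ..., 'reason': ..., 'safe_topic_score': 'yes'}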
--- a/wxo_agentic_evaluation/metrics/metrics.py
+++ b/wxo_agentic_evaluation/metrics/metrics.py
@@ -1,19 +1,45 @@
-import math
-from typing import List, Mapping, Any
-from enum import Enum
+from collections import defaultdict
+from enum import Enum, StrEnum
+from typing import Any, Dict, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
-from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
 
 
-def average(array):
-    if len(array) == 0:
-        return math.nan
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
 
-    else:
-        return sum(array)/len(array)
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
 
 
 class KnowledgeBaseMetrics(BaseModel):
@@ -54,7 +80,9 @@ class KnowledgeBaseMetricSummary(BaseModel):
                 }
             else:
                 values = groupby[name]
-                values.get("knowledge_base_name").append(knowledge_base_name)
+                values.get("knowledge_base_name").append(
+                    knowledge_base_name
+                )
                 values.get("faithfulness").append(faithfulness)
                 values.get("answer_relevancy").append(answer_relevancy)
                 values.get("confidence_scores").append(confidence_scores)
@@ -67,6 +95,8 @@ class KnowledgeBaseMetricSummary(BaseModel):
     @computed_field(alias="summary")
     @property
     def average(self) -> Mapping[str, Any]:
+        from wxo_agentic_evaluation.utils.utils import average
+
         summary = {}
         for dataset, metric in self.groupby_dataset.items():
             average_metric = {}
@@ -109,6 +139,7 @@ class KeywordSemanticSearchMetric(BaseModel):
     message: str
     goal_detail: str
 
+
 class TextMatchType(Enum):
     text_match = "Summary Matched"
     text_mismatch = "Summary MisMatched"
@@ -117,12 +148,14 @@ class TextMatchType(Enum):
 
 class ToolCallAndRoutingMetrics(BaseModel):
     dataset_name: str = ""
-    total_steps: int=0
-    llm_step: int =0
+    total_steps: int = 0
+    llm_step: int = 0
     total_tool_calls: int = 0
     expected_tool_calls: int = 0
     correct_tool_calls: int = 0
-    relevant_tool_calls: int = 0 # calls with the same function but different args
+    relevant_tool_calls: int = (
+        0  # calls with the same function but different args
+    )
     total_routing_calls: int = 0
     relevant_routing_calls: int = 0
     tool_calls_with_incorrect_parameter: int = 0
@@ -135,7 +168,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_recall(self) -> float:
         return round(
             (
-                self.correct_tool_calls/self.expected_tool_calls
+                self.correct_tool_calls / self.expected_tool_calls
                 if self.expected_tool_calls > 0
                 else 0.0
             ),
@@ -147,8 +180,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_precision(self) -> float:
         return round(
             (
-                (self.correct_tool_calls)
-                / self.total_tool_calls
+                (self.correct_tool_calls) / self.total_tool_calls
                 if self.total_tool_calls > 0
                 else 0.0
             ),
@@ -166,3 +198,274 @@
             ),
             2,
         )
+
+
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
+
+class FailedStaticTestCases(BaseModel):
+    metric_name: str
+    description: str
+    explanation: str
+
+
+class FailedSemanticTestCases(BaseModel):
+    metric_name: str
+    evidence: str
+    explanation: str
+    output: int
+    confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
+
+
+class ReferenceLessEvalMetrics(BaseModel):
+    dataset_name: str
+    number_of_tool_calls: int
+    number_of_successful_tool_calls: int
+    number_of_static_failed_tool_calls: int
+    number_of_semantic_failed_tool_calls: int
+    failed_static_tool_calls: Optional[
+        List[Tuple[int, List[FailedStaticTestCases]]]
+    ]
+    failed_semantic_tool_calls: Optional[
+        List[Tuple[int, List[FailedSemanticTestCases]]]
+    ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class LangfuseMetric(Metric):
+    comment: Optional[str] = ""
+    data_type: Optional[str] = ""
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
+
+
+def create_avg_row(metrics: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """
+    Create an average row from a list of metric dictionaries.
+
+    Args:
+        metrics: List of metric dictionaries
+
+    Returns:
+        Dictionary with averaged metrics
+    """
+    from wxo_agentic_evaluation.utils.utils import safe_divide
+
+    avg_row = {
+        "Dataset": "Summary (Average)",
+        "Runs": 0,
+        "Total Steps": 0,
+        "LLM Steps": 0,
+        "Total Tool Calls": 0,
+        "Tool Call Precision": 0,
+        "Tool Call Recall": 0,
+        "Agent Routing Accuracy": 0,
+        "Text Match": 0,
+        "Journey Success": 0,
+        "Avg Resp Time (sec)": 0,
+    }
+
+    if metrics:
+        for row in metrics:
+            avg_row["Runs"] += row.get("Runs", 0)
+            avg_row["Total Steps"] += row["Total Steps"]
+            avg_row["LLM Steps"] += row["LLM Steps"]
+            avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+            avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+            avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+            avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+            avg_row["Text Match"] += row["Text Match"]
+            avg_row["Journey Success"] += row["Journey Success"]
+            avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+        n = len(metrics)
+        # Average over datasets
+        avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
+        avg_row["Total Steps"] = round(
+            safe_divide(avg_row["Total Steps"], n), 2
+        )
+        avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], n), 2)
+        avg_row["Total Tool Calls"] = round(
+            safe_divide(avg_row["Total Tool Calls"], n), 2
+        )
+        avg_row["Tool Call Precision"] = round(
+            safe_divide(avg_row["Tool Call Precision"], n), 2
+        )
+        avg_row["Tool Call Recall"] = round(
+            safe_divide(avg_row["Tool Call Recall"], n), 2
+        )
+        avg_row["Agent Routing Accuracy"] = round(
+            safe_divide(avg_row["Agent Routing Accuracy"], n), 2
+        )
+        avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], n), 2)
+        avg_row["Journey Success"] = round(
+            safe_divide(avg_row["Journey Success"], n), 2
+        )
+        avg_row["Avg Resp Time (sec)"] = round(
+            safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
+        )
+
+    return avg_row
+
+
+def format_metrics_for_display(
+    tool_call_metrics: list[ToolCallAndRoutingMetrics],
+) -> list[dict[str, Any]]:
+    from wxo_agentic_evaluation.utils.utils import mean, safe_divide, to_pct
+
+    # Group metrics by dataset name
+    grouped = defaultdict(list)
+    for m in tool_call_metrics:
+        grouped[m.dataset_name].append(
+            {
+                "Dataset": m.dataset_name,
+                "Total Steps": m.total_steps,
+                "LLM Steps": m.llm_step,
+                "Total Tool Calls": m.total_tool_calls,
+                "Tool Call Precision": m.tool_call_precision,
+                "Tool Call Recall": m.tool_call_recall,
+                "Agent Routing Accuracy": m.agent_routing_accuracy,
+                "Text Match": m.text_match,
+                "Journey Success": m.is_success,
+                "Avg Resp Time (sec)": m.avg_resp_time,
+            }
+        )
+
+    # Create per-test rows with averages over runs
+    per_test_rows = []
+    for ds, rows in grouped.items():
+        out = {"Dataset": ds}
+
+        # Average numeric columns over runs
+        numeric_keys = [
+            "Total Steps",
+            "LLM Steps",
+            "Total Tool Calls",
+            "Tool Call Precision",
+            "Tool Call Recall",
+            "Agent Routing Accuracy",
+            "Avg Resp Time (sec)",
+        ]
+
+        for k in numeric_keys:
+            out[k] = mean(
+                [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+            )
+
+        # Add total runs per dataset
+        out["Runs"] = round(float(len(rows)), 2)
+
+        # Journey Success -> numeric fraction in [0,1]
+        js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+        out["Journey Success"] = round(
+            safe_divide(sum(js_vals), len(js_vals)), 2
+        )
+
+        # Text Match -> numeric fraction in [0,1]
+        tm_hits = 0
+        tm_den = len(rows)
+        for r in rows:
+            val = r.get("Text Match")
+            if str(val).strip() == TextMatchType.text_match.value:
+                tm_hits += 1
+        out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+        per_test_rows.append(out)
+
+    # Create overall average row
+    overall_row = create_avg_row(per_test_rows)
+
+    # Format percentages
+    tool_call_metrics_for_display = per_test_rows + [overall_row]
+    for row in tool_call_metrics_for_display:
+        row["Text Match"] = to_pct(row.get("Text Match"), decimals=0)
+        row["Journey Success"] = to_pct(row.get("Journey Success"), decimals=0)
+
+    column_order = [
+        "Dataset",
+        "Runs",
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Text Match",
+        "Journey Success",
+        "Avg Resp Time (sec)",
+    ]
+
+    tool_call_metrics_for_display = [
+        {col: row.get(col, "") for col in column_order}
+        for row in tool_call_metrics_for_display
+    ]
+
+    return tool_call_metrics_for_display
+
+
+def extract_metrics(
+    results: List[
+        Tuple[
+            ToolCallAndRoutingMetrics,
+            KnowledgeBaseMetricSummary,
+            CustomEvalMetrics,
+        ]
+    ],
+) -> tuple[
+    list[ToolCallAndRoutingMetrics],
+    KnowledgeBaseMetricSummary,
+    List[CustomEvalMetrics],
+]:
+    """
+    Aggregate metrics from test results.
+
+    Args:
+        results: List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+
+    Returns:
+        Tuple of (knowledge_base_summary, tool_rows, custom_metrics)
+    """
+
+    tool_call_metrics = [metric[0] for metric in results]
+    knowledge_base_metrics = [metric[1] for metric in results]
+    custom_metrics: List[CustomEvalMetrics] = [metric[2] for metric in results]
+
+    kb_summary = KnowledgeBaseMetricSummary(
+        knowledge_base_metrics=knowledge_base_metrics
+    )
+
+    if len(tool_call_metrics) > 0:
+        # Remove the average row if it exists
+        tool_call_metrics = [
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
+        ]
+
+    return tool_call_metrics, kb_summary, custom_metrics
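Most of these metrics.py additions are pydantic models plus display helpers. Below is a small sketch of the new DescriptionQualityMetric, for illustration only and not part of the diff; it assumes the 1.1.8b0 wheel is installed and uses an invented tool name, score, and threshold.

from wxo_agentic_evaluation.metrics.metrics import (
    DescriptionQuality,
    DescriptionQualityMetric,
)

# Per the code above, a description_score at or above the threshold flags the
# tool description as BAD; a missing score maps to MISSING.
scored = DescriptionQualityMetric(
    tool_name="get_employee_profile",  # invented tool name
    description_score=0.91,
    threshold=0.8,
)
unscored = DescriptionQualityMetric(tool_name="get_employee_profile")

assert scored.description_quality == DescriptionQuality.BAD
assert unscored.description_quality == DescriptionQuality.MISSING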
--- /dev/null
+++ b/wxo_agentic_evaluation/metrics/tool_calling.py
@@ -0,0 +1,93 @@
+import json
+from typing import List, Union
+
+from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+from wxo_agentic_evaluation.metrics.metrics import (
+    LangfuseMetric,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.type import ContentType
+
+
+class ToolCalling(Evaluation):
+    @property
+    def name(self):
+        return "Tool Calling Metrics"
+
+    def evaluate(
+        self, messages, ground_truth, extracted_context, metadata, **kwargs
+    ) -> Union[LangfuseMetric, List[LangfuseMetric]]:
+        dataset_name = kwargs.get("dataset", "")
+
+        total_tool_calls = 0
+        relevant_tool_calls = 0
+        tool_calls_with_incorrect_parameter = 0
+        correct_tool_calls = set()
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        labeled_messages = extracted_context.get("labeled_messages")
+        total_tool_calls = len(
+            [
+                message
+                for message in messages
+                if message.type == ContentType.tool_call
+            ]
+        )
+        relevant_tool_calls = len(labeled_messages)
+
+        for message_idx, matching_goal_details in labeled_messages.items():
+            msg_tool_call = messages[message_idx]
+            msg_tool_call = msg_tool_call.tool_calls[0].function
+            for goal_detail in matching_goal_details:
+                # TODO flesh out to match ADK EVAL
+                args_match = argument_matching(
+                    expected=goal_detail.args,
+                    actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                )
+
+                if args_match:
+                    correct_tool_calls.add(goal_detail.name)
+                else:
+                    tool_calls_with_incorrect_parameter += 1
+
+        # TODO: think about the dataset name
+        # TODO: total_steps
+        tool_call_metrics = ToolCallAndRoutingMetrics(
+            dataset_name=dataset_name,
+            total_tool_calls=total_tool_calls,
+            expected_tool_calls=len(tool_dictionary),
+            correct_tool_calls=len(correct_tool_calls),
+            relevant_tool_calls=relevant_tool_calls,
+            tool_calls_with_incorrect_parameter=tool_calls_with_incorrect_parameter,
+        )
+
+        tool_call_metrics = tool_call_metrics.model_dump()
+
+        metrics = []
+
+        for tool in [
+            "total_tool_calls",
+            "correct_tool_calls",
+            "expected_tool_calls",
+            "tool_calls_with_incorrect_parameter",
+            "tool_call_recall",
+            "tool_call_precision",
+        ]:
+            metric = LangfuseMetric(
+                eval_name=tool,
+                value=tool_call_metrics.get(tool),
+                metadata=metadata,
+                data_type="NUMERIC",
+            )
+            metrics.append(metric)
+
+        return metrics
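The new ToolCalling evaluation emits one LangfuseMetric per statistic (the six names in the loop above). Here is a hypothetical sketch of one such record, not part of the diff; it assumes the 1.1.8b0 wheel is installed, and the value and metadata are invented.

from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric

# One of the six records ToolCalling.evaluate() would append to its result list.
recall = LangfuseMetric(
    eval_name="tool_call_recall",
    value=0.75,                                   # invented score
    metadata={"dataset": "hr_agent_smoke_test"},  # invented metadata
    data_type="NUMERIC",
)
print(recall.model_dump())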
--- /dev/null
+++ b/wxo_agentic_evaluation/otel_parser/__init__.py
@@ -0,0 +1 @@
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser
--- /dev/null
+++ b/wxo_agentic_evaluation/otel_parser/langflow_parser.py
@@ -0,0 +1,86 @@
+import json
+from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
+
+def parse_observations(observation_tree, dfs_observations, dfs_callable: callable):
+    messages = []
+    for node in dfs_observations:
+        # assume there will only be one AgentExecutor in the trace!
+        if node.obs.name == 'AgentExecutor': return _parse_agent_executor(node.children, dfs_callable(node.children))
+    return messages
+
+
+def _parse_agent_executor(observation_tree, dfs_observations):
+    messages = []
+    for node in dfs_observations:
+        if node.obs.type == 'GENERATION':
+            print(node.obs.id)
+            messages.extend(_get_messages(node.obs.input))
+            # get intemediate steps from parent
+            messages.extend(_get_intermediate_steps(node.parent))
+            messages.extend(_get_messages([node.obs.output]))
+    return messages
+
+
+def _get_messages(data):
+    messages = []
+    for msg in data:
+        if msg['role'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
+        elif msg['role'] == 'user':
+            content = ''
+            if isinstance(msg['content'], list):
+                content = []
+                for item in msg['content']:
+                    if item['type'] == ['text']: content.append(item['text'])
+                content = ' '.join(content)
+            elif isinstance(msg['content'], str):
+                content = msg['content']
+
+            messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
+        elif msg['role'] == 'assistant':
+            content = msg['content'] or ''
+            additional_kwargs = msg.get('additional_kwargs', {})
+            tool_calls = None
+            if 'tool_calls' in additional_kwargs:
+                tool_calls = []
+                for tc in additional_kwargs['tool_calls']:
+                    id_ = tc['id']
+                    function = OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])
+                    tool_calls.append(OTelParserToolCall(id=id_, function=function))
+            messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=ContentType.tool_call))
+    return messages
+
+def _get_intermediate_steps(node):
+    messages = []
+    tool_calls_n_responses = node.obs.input['intermediate_steps']
+    for tc, tr in tool_calls_n_responses:
+        if 'tool' in tc and 'tool_input' in tc and 'tool_call_id' in tc:
+            tool_call_id = tc['tool_call_id']
+            if isinstance(tr, str):
+                messages.append(OTelParserMessage(role='tool', content=tr, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                continue
+            elif (isinstance(tr, dict) and 'content' not in tr):
+                messages.append(OTelParserMessage(role='tool', content=json.dumps(tr), tool_call_id=tool_call_id, type=ContentType.tool_response))
+                continue
+            elif isinstance(tr, dict) and 'content' in tr:
+                content = tr['content']
+                if isinstance(content, str):
+                    messages.append(OTelParserMessage(role='tool', content=content, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                    continue
+                elif isinstance(content, list):
+                    for part in content:
+                        if isinstance(part, dict) and part['type'] == 'text':
+                            text = part['text']
+                            if isinstance(text, dict): text = json.dumps(text)
+                            messages.append(OTelParserMessage(role='tool', content=text, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                            continue
+                        else:
+                            raise ValueError(f"Unexpected part type: {type(part)} or part[type] '{part['type']}' != 'text'")
+                else:
+                    raise ValueError(f"Unexpected content type: {type(content)}")
+            else:
+                raise ValueError(f"Unexpected tool response: Type: {type(tr)}, Value: {tr}")
+
+        else:
+            print('Tool Call:', tc)
+            print('Tool Response:', tr)
+    return messages
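In the new Langflow parser, _get_intermediate_steps expects node.obs.input["intermediate_steps"] to hold (tool call, tool response) pairs. A rough sketch of that shape follows, for illustration only: the SimpleNamespace object stands in for the real Langfuse observation node, the tool name and payload are invented, and the 1.1.8b0 wheel is assumed to be installed.

from types import SimpleNamespace

from wxo_agentic_evaluation.otel_parser.langflow_parser import _get_intermediate_steps

# Stand-in for a Langfuse observation node; only .obs.input is accessed here.
fake_node = SimpleNamespace(
    obs=SimpleNamespace(
        input={
            "intermediate_steps": [
                (
                    {
                        "tool": "get_timeoff_schedule",      # invented tool call
                        "tool_input": {"employee_id": "u1"},
                        "tool_call_id": "call_1",
                    },
                    "5 days of PTO remaining",               # plain-string tool response
                ),
            ]
        }
    )
)

for message in _get_intermediate_steps(fake_node):
    print(message.role, message.tool_call_id, message.content)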
--- /dev/null
+++ b/wxo_agentic_evaluation/otel_parser/langgraph_parser.py
@@ -0,0 +1,61 @@
+import json
+from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
+
+def parse_observations(observation_tree, dfs_observations):
+    messages = []
+    is_first_generation = True
+    for obs in dfs_observations:
+        if obs.obs.type == 'GENERATION':
+            if is_first_generation:
+                messages.extend(_get_input_message(obs))
+                is_first_generation = False
+            parent = obs.parent
+            if parent.obs.type == 'CHAIN':
+                # TODO: messages is a list. confirm, we will only see one message in the list.
+                msg = parent.obs.output['messages'][0]
+                content = msg['content'] or ''
+                msg_type = ContentType.text
+                tool_calls = msg['tool_calls'] or None
+                if tool_calls is not None:
+                    msg_type = ContentType.tool_call
+                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
+                messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
+        elif obs.obs.type == 'TOOL':
+            parent_node = obs.parent
+            if parent_node.obs.type == 'CHAIN':
+                for tool_response in parent_node.obs.output['messages']:
+                    messages.append(OTelParserMessage(role='tool', content=tool_response['content'], tool_call_id=tool_response['tool_call_id'], type=ContentType.tool_response))
+    return messages
+
+
+def _get_input_message(obs_node):
+    ret = []
+    parent = obs_node.parent
+    if parent.obs.type == 'CHAIN':
+        for msg in parent.obs.input['messages']:
+            if msg['type'] == 'system': ret.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
+            elif msg['type'] == 'human': ret.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
+            elif msg['type'] == 'tool': ret.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id'], type=ContentType.tool_response))
+            elif msg['type'] == 'ai':
+                content = msg['content'] or ''
+                tool_calls = msg['tool_calls'] or None
+                msg_type = ContentType.text
+                if tool_calls is not None:
+                    msg_type = ContentType.tool_call
+                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
+                ret.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
+    return ret
+
+
+def _to_tool_call(tool_call):
+    return OTelParserToolCall(
+        id=tool_call['id'],
+        type='function', # OTelParserToolCall expects literal 'function'
+        function=_to_function(tool_call)
+    )
+
+def _to_function(func):
+    return OTelParserFunction(
+        name=func['name'],
+        arguments=json.dumps(func['args'])
+    )
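The LangGraph parser's _to_tool_call converts LangGraph-style tool-call dicts (with "id", "name", and "args") into the parser's models. A small illustrative sketch, not part of the diff, assuming the 1.1.8b0 wheel is installed; the id, name, and arguments are invented.

from wxo_agentic_evaluation.otel_parser.langgraph_parser import _to_tool_call

# Invented LangGraph-style tool call; args are serialized to a JSON string.
converted = _to_tool_call(
    {"id": "call_42", "name": "get_direct_reports", "args": {"manager_id": "m7"}}
)
print(converted.function.name)       # get_direct_reports
print(converted.function.arguments)  # '{"manager_id": "m7"}'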