ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -1,8 +1,8 @@
-from pydantic import BaseModel, computed_field
-
 from abc import abstractmethod
 from functools import cached_property
 
+from pydantic import BaseModel, computed_field
+
 
 class BaseLLMJudgeMetric(BaseModel):
     @abstractmethod
@@ -44,3 +44,29 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
             "answer_relevancy": self.answer_relevancy,
             "answer_relevancy_score": self.answer_relevancy_score,
         }
+
+
+class AnswerDerailment(BaseLLMJudgeMetric):
+    in_scope: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
+        }
+
+
+class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+    is_safe: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "safe_topic_score": str(self.is_safe),
+        }
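
Illustrative sketch, not part of the diff: the new judge metrics report through table(). The snippet assumes table() is the abstract hook declared on BaseLLMJudgeMetric and that the base class adds no other required members; the statement and scores are made up. AnswerUnsafeTopic mirrors this with a "safe_topic_score" key.

# Illustrative only; assumes BaseLLMJudgeMetric adds no extra required fields.
from wxo_agentic_evaluation.metrics.llm_as_judge import AnswerDerailment

verdict = AnswerDerailment(
    in_scope=0.9,
    statement="What is my remaining vacation balance?",
    reason="The question stays within the HR agent's scope.",
)
print(verdict.table())
# {'statement': '...', 'reason': '...', 'on_topic_score': '0.9'}
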

wxo_agentic_evaluation/metrics/metrics.py
@@ -1,19 +1,45 @@
-import
-from
-from
+from collections import defaultdict
+from enum import Enum, StrEnum
+from typing import Any, Dict, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
-from wxo_agentic_evaluation.metrics.llm_as_judge import
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
 
 
-
-
-
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
 
-
-
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
 
 
 class KnowledgeBaseMetrics(BaseModel):
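
Illustrative sketch, not part of the diff: how the DescriptionQualityMetric computed fields resolve, assuming the class ships in wxo_agentic_evaluation.metrics.metrics as the file list indicates; the tool name and scores are made up.

# Illustrative only; not part of the released code.
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric

scored = DescriptionQualityMetric(tool_name="get_weather", description_score=0.82, threshold=0.7)
unscored = DescriptionQualityMetric(tool_name="get_weather")

print(scored.is_bad_description, scored.description_quality)      # True BAD
print(unscored.is_bad_description, unscored.description_quality)  # None MISSING
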
@@ -54,7 +80,9 @@ class KnowledgeBaseMetricSummary(BaseModel):
                 }
             else:
                 values = groupby[name]
-                values.get("knowledge_base_name").append(knowledge_base_name)
+                values.get("knowledge_base_name").append(
+                    knowledge_base_name
+                )
                 values.get("faithfulness").append(faithfulness)
                 values.get("answer_relevancy").append(answer_relevancy)
                 values.get("confidence_scores").append(confidence_scores)
@@ -67,6 +95,8 @@ class KnowledgeBaseMetricSummary(BaseModel):
     @computed_field(alias="summary")
     @property
     def average(self) -> Mapping[str, Any]:
+        from wxo_agentic_evaluation.utils.utils import average
+
         summary = {}
         for dataset, metric in self.groupby_dataset.items():
             average_metric = {}
@@ -109,6 +139,7 @@ class KeywordSemanticSearchMetric(BaseModel):
     message: str
     goal_detail: str
 
+
 class TextMatchType(Enum):
     text_match = "Summary Matched"
     text_mismatch = "Summary MisMatched"
@@ -117,12 +148,14 @@ class TextMatchType(Enum):
 
 class ToolCallAndRoutingMetrics(BaseModel):
     dataset_name: str = ""
-    total_steps: int=0
-    llm_step: int =0
+    total_steps: int = 0
+    llm_step: int = 0
     total_tool_calls: int = 0
     expected_tool_calls: int = 0
     correct_tool_calls: int = 0
-    relevant_tool_calls: int = 0  # calls with the same function but different args
+    relevant_tool_calls: int = (
+        0  # calls with the same function but different args
+    )
     total_routing_calls: int = 0
     relevant_routing_calls: int = 0
     tool_calls_with_incorrect_parameter: int = 0
@@ -135,7 +168,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_recall(self) -> float:
         return round(
             (
-                self.correct_tool_calls/self.expected_tool_calls
+                self.correct_tool_calls / self.expected_tool_calls
                 if self.expected_tool_calls > 0
                 else 0.0
             ),
@@ -147,8 +180,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_precision(self) -> float:
         return round(
             (
-                (self.correct_tool_calls)
-                / self.total_tool_calls
+                (self.correct_tool_calls) / self.total_tool_calls
                 if self.total_tool_calls > 0
                 else 0.0
             ),
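
Illustrative arithmetic, not part of the diff: the two computed fields above reduce to the expressions below, shown with made-up counts rather than a full model instance (not all fields of ToolCallAndRoutingMetrics appear in these hunks).

correct_tool_calls, expected_tool_calls, total_tool_calls = 3, 4, 5

tool_call_recall = round(correct_tool_calls / expected_tool_calls if expected_tool_calls > 0 else 0.0, 2)
tool_call_precision = round(correct_tool_calls / total_tool_calls if total_tool_calls > 0 else 0.0, 2)
print(tool_call_recall, tool_call_precision)  # 0.75 0.6
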
@@ -166,3 +198,274 @@ class ToolCallAndRoutingMetrics(BaseModel):
             ),
             2,
         )
+
+
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
+
+class FailedStaticTestCases(BaseModel):
+    metric_name: str
+    description: str
+    explanation: str
+
+
+class FailedSemanticTestCases(BaseModel):
+    metric_name: str
+    evidence: str
+    explanation: str
+    output: int
+    confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
+
+
+class ReferenceLessEvalMetrics(BaseModel):
+    dataset_name: str
+    number_of_tool_calls: int
+    number_of_successful_tool_calls: int
+    number_of_static_failed_tool_calls: int
+    number_of_semantic_failed_tool_calls: int
+    failed_static_tool_calls: Optional[
+        List[Tuple[int, List[FailedStaticTestCases]]]
+    ]
+    failed_semantic_tool_calls: Optional[
+        List[Tuple[int, List[FailedSemanticTestCases]]]
+    ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class LangfuseMetric(Metric):
+    comment: Optional[str] = ""
+    data_type: Optional[str] = ""
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
+
+
+def create_avg_row(metrics: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """
+    Create an average row from a list of metric dictionaries.
+
+    Args:
+        metrics: List of metric dictionaries
+
+    Returns:
+        Dictionary with averaged metrics
+    """
+    from wxo_agentic_evaluation.utils.utils import safe_divide
+
+    avg_row = {
+        "Dataset": "Summary (Average)",
+        "Runs": 0,
+        "Total Steps": 0,
+        "LLM Steps": 0,
+        "Total Tool Calls": 0,
+        "Tool Call Precision": 0,
+        "Tool Call Recall": 0,
+        "Agent Routing Accuracy": 0,
+        "Text Match": 0,
+        "Journey Success": 0,
+        "Avg Resp Time (sec)": 0,
+    }
+
+    if metrics:
+        for row in metrics:
+            avg_row["Runs"] += row.get("Runs", 0)
+            avg_row["Total Steps"] += row["Total Steps"]
+            avg_row["LLM Steps"] += row["LLM Steps"]
+            avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+            avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+            avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+            avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+            avg_row["Text Match"] += row["Text Match"]
+            avg_row["Journey Success"] += row["Journey Success"]
+            avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+        n = len(metrics)
+        # Average over datasets
+        avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
+        avg_row["Total Steps"] = round(
+            safe_divide(avg_row["Total Steps"], n), 2
+        )
+        avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], n), 2)
+        avg_row["Total Tool Calls"] = round(
+            safe_divide(avg_row["Total Tool Calls"], n), 2
+        )
+        avg_row["Tool Call Precision"] = round(
+            safe_divide(avg_row["Tool Call Precision"], n), 2
+        )
+        avg_row["Tool Call Recall"] = round(
+            safe_divide(avg_row["Tool Call Recall"], n), 2
+        )
+        avg_row["Agent Routing Accuracy"] = round(
+            safe_divide(avg_row["Agent Routing Accuracy"], n), 2
+        )
+        avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], n), 2)
+        avg_row["Journey Success"] = round(
+            safe_divide(avg_row["Journey Success"], n), 2
+        )
+        avg_row["Avg Resp Time (sec)"] = round(
+            safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
+        )
+
+    return avg_row
+
+
+def format_metrics_for_display(
+    tool_call_metrics: list[ToolCallAndRoutingMetrics],
+) -> list[dict[str, Any]]:
+    from wxo_agentic_evaluation.utils.utils import mean, safe_divide, to_pct
+
+    # Group metrics by dataset name
+    grouped = defaultdict(list)
+    for m in tool_call_metrics:
+        grouped[m.dataset_name].append(
+            {
+                "Dataset": m.dataset_name,
+                "Total Steps": m.total_steps,
+                "LLM Steps": m.llm_step,
+                "Total Tool Calls": m.total_tool_calls,
+                "Tool Call Precision": m.tool_call_precision,
+                "Tool Call Recall": m.tool_call_recall,
+                "Agent Routing Accuracy": m.agent_routing_accuracy,
+                "Text Match": m.text_match,
+                "Journey Success": m.is_success,
+                "Avg Resp Time (sec)": m.avg_resp_time,
+            }
+        )
+
+    # Create per-test rows with averages over runs
+    per_test_rows = []
+    for ds, rows in grouped.items():
+        out = {"Dataset": ds}
+
+        # Average numeric columns over runs
+        numeric_keys = [
+            "Total Steps",
+            "LLM Steps",
+            "Total Tool Calls",
+            "Tool Call Precision",
+            "Tool Call Recall",
+            "Agent Routing Accuracy",
+            "Avg Resp Time (sec)",
+        ]
+
+        for k in numeric_keys:
+            out[k] = mean(
+                [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+            )
+
+        # Add total runs per dataset
+        out["Runs"] = round(float(len(rows)), 2)
+
+        # Journey Success -> numeric fraction in [0,1]
+        js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+        out["Journey Success"] = round(
+            safe_divide(sum(js_vals), len(js_vals)), 2
+        )
+
+        # Text Match -> numeric fraction in [0,1]
+        tm_hits = 0
+        tm_den = len(rows)
+        for r in rows:
+            val = r.get("Text Match")
+            if str(val).strip() == TextMatchType.text_match.value:
+                tm_hits += 1
+        out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+        per_test_rows.append(out)
+
+    # Create overall average row
+    overall_row = create_avg_row(per_test_rows)
+
+    # Format percentages
+    tool_call_metrics_for_display = per_test_rows + [overall_row]
+    for row in tool_call_metrics_for_display:
+        row["Text Match"] = to_pct(row.get("Text Match"), decimals=0)
+        row["Journey Success"] = to_pct(row.get("Journey Success"), decimals=0)
+
+    column_order = [
+        "Dataset",
+        "Runs",
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Text Match",
+        "Journey Success",
+        "Avg Resp Time (sec)",
+    ]
+
+    tool_call_metrics_for_display = [
+        {col: row.get(col, "") for col in column_order}
+        for row in tool_call_metrics_for_display
+    ]
+
+    return tool_call_metrics_for_display
+
+
+def extract_metrics(
+    results: List[
+        Tuple[
+            ToolCallAndRoutingMetrics,
+            KnowledgeBaseMetricSummary,
+            CustomEvalMetrics,
+        ]
+    ],
+) -> tuple[
+    list[ToolCallAndRoutingMetrics],
+    KnowledgeBaseMetricSummary,
+    List[CustomEvalMetrics],
+]:
+    """
+    Aggregate metrics from test results.
+
+    Args:
+        results: List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+
+    Returns:
+        Tuple of (knowledge_base_summary, tool_rows, custom_metrics)
+    """
+
+    tool_call_metrics = [metric[0] for metric in results]
+    knowledge_base_metrics = [metric[1] for metric in results]
+    custom_metrics: List[CustomEvalMetrics] = [metric[2] for metric in results]
+
+    kb_summary = KnowledgeBaseMetricSummary(
+        knowledge_base_metrics=knowledge_base_metrics
+    )
+
+    if len(tool_call_metrics) > 0:
+        # Remove the average row if it exists
+        tool_call_metrics = [
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
+        ]
+
+    return tool_call_metrics, kb_summary, custom_metrics
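
Illustrative sketch, not part of the diff: create_avg_row averages per-dataset display rows into the "Summary (Average)" row. The two rows below are made up, and the snippet assumes the in-function import of safe_divide resolves in the installed package.

from wxo_agentic_evaluation.metrics.metrics import create_avg_row

rows = [
    {"Dataset": "hr_flow", "Runs": 2, "Total Steps": 12, "LLM Steps": 6,
     "Total Tool Calls": 4, "Tool Call Precision": 1.0, "Tool Call Recall": 0.75,
     "Agent Routing Accuracy": 1.0, "Text Match": 1.0, "Journey Success": 1.0,
     "Avg Resp Time (sec)": 2.4},
    {"Dataset": "it_flow", "Runs": 1, "Total Steps": 8, "LLM Steps": 4,
     "Total Tool Calls": 2, "Tool Call Precision": 0.5, "Tool Call Recall": 0.5,
     "Agent Routing Accuracy": 1.0, "Text Match": 0.0, "Journey Success": 0.0,
     "Avg Resp Time (sec)": 1.8},
]

summary = create_avg_row(rows)
print(summary["Dataset"], summary["Tool Call Precision"], summary["Journey Success"])
# Summary (Average) 0.75 0.5
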

wxo_agentic_evaluation/metrics/tool_calling.py (new file)
@@ -0,0 +1,93 @@
+import json
+from typing import List, Union
+
+from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+from wxo_agentic_evaluation.metrics.metrics import (
+    LangfuseMetric,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.type import ContentType
+
+
+class ToolCalling(Evaluation):
+    @property
+    def name(self):
+        return "Tool Calling Metrics"
+
+    def evaluate(
+        self, messages, ground_truth, extracted_context, metadata, **kwargs
+    ) -> Union[LangfuseMetric, List[LangfuseMetric]]:
+        dataset_name = kwargs.get("dataset", "")
+
+        total_tool_calls = 0
+        relevant_tool_calls = 0
+        tool_calls_with_incorrect_parameter = 0
+        correct_tool_calls = set()
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        labeled_messages = extracted_context.get("labeled_messages")
+        total_tool_calls = len(
+            [
+                message
+                for message in messages
+                if message.type == ContentType.tool_call
+            ]
+        )
+        relevant_tool_calls = len(labeled_messages)
+
+        for message_idx, matching_goal_details in labeled_messages.items():
+            msg_tool_call = messages[message_idx]
+            msg_tool_call = msg_tool_call.tool_calls[0].function
+            for goal_detail in matching_goal_details:
+                # TODO flesh out to match ADK EVAL
+                args_match = argument_matching(
+                    expected=goal_detail.args,
+                    actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                )
+
+                if args_match:
+                    correct_tool_calls.add(goal_detail.name)
+                else:
+                    tool_calls_with_incorrect_parameter += 1
+
+        # TODO: think about the dataset name
+        # TODO: total_steps
+        tool_call_metrics = ToolCallAndRoutingMetrics(
+            dataset_name=dataset_name,
+            total_tool_calls=total_tool_calls,
+            expected_tool_calls=len(tool_dictionary),
+            correct_tool_calls=len(correct_tool_calls),
+            relevant_tool_calls=relevant_tool_calls,
+            tool_calls_with_incorrect_parameter=tool_calls_with_incorrect_parameter,
+        )
+
+        tool_call_metrics = tool_call_metrics.model_dump()
+
+        metrics = []
+
+        for tool in [
+            "total_tool_calls",
+            "correct_tool_calls",
+            "expected_tool_calls",
+            "tool_calls_with_incorrect_parameter",
+            "tool_call_recall",
+            "tool_call_precision",
+        ]:
+            metric = LangfuseMetric(
+                eval_name=tool,
+                value=tool_call_metrics.get(tool),
+                metadata=metadata,
+                data_type="NUMERIC",
+            )
+            metrics.append(metric)
+
+        return metrics
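
Illustrative sketch, not part of the diff: ToolCalling.evaluate returns one LangfuseMetric per counter or score; a single entry looks like this (the metadata payload is a made-up example).

from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric

m = LangfuseMetric(
    eval_name="tool_call_recall",
    value=0.75,
    metadata={"trace_id": "example-trace"},  # hypothetical metadata payload
    data_type="NUMERIC",
)
print(m.model_dump())
# {'eval_name': 'tool_call_recall', 'value': 0.75, 'metadata': {...}, 'comment': '', 'data_type': 'NUMERIC'}
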

wxo_agentic_evaluation/otel_parser/__init__.py (new file)
@@ -0,0 +1 @@
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser

wxo_agentic_evaluation/otel_parser/langflow_parser.py (new file)
@@ -0,0 +1,86 @@
+import json
+from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
+
+def parse_observations(observation_tree, dfs_observations, dfs_callable: callable):
+    messages = []
+    for node in dfs_observations:
+        # assume there will only be one AgentExecutor in the trace!
+        if node.obs.name == 'AgentExecutor': return _parse_agent_executor(node.children, dfs_callable(node.children))
+    return messages
+
+
+def _parse_agent_executor(observation_tree, dfs_observations):
+    messages = []
+    for node in dfs_observations:
+        if node.obs.type == 'GENERATION':
+            print(node.obs.id)
+            messages.extend(_get_messages(node.obs.input))
+            # get intemediate steps from parent
+            messages.extend(_get_intermediate_steps(node.parent))
+            messages.extend(_get_messages([node.obs.output]))
+    return messages
+
+
+def _get_messages(data):
+    messages = []
+    for msg in data:
+        if msg['role'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
+        elif msg['role'] == 'user':
+            content = ''
+            if isinstance(msg['content'], list):
+                content = []
+                for item in msg['content']:
+                    if item['type'] == ['text']: content.append(item['text'])
+                content = ' '.join(content)
+            elif isinstance(msg['content'], str):
+                content = msg['content']
+
+            messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
+        elif msg['role'] == 'assistant':
+            content = msg['content'] or ''
+            additional_kwargs = msg.get('additional_kwargs', {})
+            tool_calls = None
+            if 'tool_calls' in additional_kwargs:
+                tool_calls = []
+                for tc in additional_kwargs['tool_calls']:
+                    id_ = tc['id']
+                    function = OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])
+                    tool_calls.append(OTelParserToolCall(id=id_, function=function))
+            messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=ContentType.tool_call))
+    return messages
+
+def _get_intermediate_steps(node):
+    messages = []
+    tool_calls_n_responses = node.obs.input['intermediate_steps']
+    for tc, tr in tool_calls_n_responses:
+        if 'tool' in tc and 'tool_input' in tc and 'tool_call_id' in tc:
+            tool_call_id = tc['tool_call_id']
+            if isinstance(tr, str):
+                messages.append(OTelParserMessage(role='tool', content=tr, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                continue
+            elif (isinstance(tr, dict) and 'content' not in tr):
+                messages.append(OTelParserMessage(role='tool', content=json.dumps(tr), tool_call_id=tool_call_id, type=ContentType.tool_response))
+                continue
+            elif isinstance(tr, dict) and 'content' in tr:
+                content = tr['content']
+                if isinstance(content, str):
+                    messages.append(OTelParserMessage(role='tool', content=content, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                    continue
+                elif isinstance(content, list):
+                    for part in content:
+                        if isinstance(part, dict) and part['type'] == 'text':
+                            text = part['text']
+                            if isinstance(text, dict): text = json.dumps(text)
+                            messages.append(OTelParserMessage(role='tool', content=text, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                            continue
+                        else:
+                            raise ValueError(f"Unexpected part type: {type(part)} or part[type] '{part['type']}' != 'text'")
+                else:
+                    raise ValueError(f"Unexpected content type: {type(content)}")
+            else:
+                raise ValueError(f"Unexpected tool response: Type: {type(tr)}, Value: {tr}")
+
+        else:
+            print('Tool Call:', tc)
+            print('Tool Response:', tr)
+    return messages

wxo_agentic_evaluation/otel_parser/langgraph_parser.py (new file)
@@ -0,0 +1,61 @@
+import json
+from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
+
+def parse_observations(observation_tree, dfs_observations):
+    messages = []
+    is_first_generation = True
+    for obs in dfs_observations:
+        if obs.obs.type == 'GENERATION':
+            if is_first_generation:
+                messages.extend(_get_input_message(obs))
+                is_first_generation = False
+            parent = obs.parent
+            if parent.obs.type == 'CHAIN':
+                # TODO: messages is a list. confirm, we will only see one message in the list.
+                msg = parent.obs.output['messages'][0]
+                content = msg['content'] or ''
+                msg_type = ContentType.text
+                tool_calls = msg['tool_calls'] or None
+                if tool_calls is not None:
+                    msg_type = ContentType.tool_call
+                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
+                messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
+        elif obs.obs.type == 'TOOL':
+            parent_node = obs.parent
+            if parent_node.obs.type == 'CHAIN':
+                for tool_response in parent_node.obs.output['messages']:
+                    messages.append(OTelParserMessage(role='tool', content=tool_response['content'], tool_call_id=tool_response['tool_call_id'], type=ContentType.tool_response))
+    return messages
+
+
+def _get_input_message(obs_node):
+    ret = []
+    parent = obs_node.parent
+    if parent.obs.type == 'CHAIN':
+        for msg in parent.obs.input['messages']:
+            if msg['type'] == 'system': ret.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
+            elif msg['type'] == 'human': ret.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
+            elif msg['type'] == 'tool': ret.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id'], type=ContentType.tool_response))
+            elif msg['type'] == 'ai':
+                content = msg['content'] or ''
+                tool_calls = msg['tool_calls'] or None
+                msg_type = ContentType.text
+                if tool_calls is not None:
+                    msg_type = ContentType.tool_call
+                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
+                ret.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
+    return ret
+
+
+def _to_tool_call(tool_call):
+    return OTelParserToolCall(
+        id=tool_call['id'],
+        type='function',  # OTelParserToolCall expects literal 'function'
+        function=_to_function(tool_call)
+    )
+
+def _to_function(func):
+    return OTelParserFunction(
+        name=func['name'],
+        arguments=json.dumps(func['args'])
+    )
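
Illustrative sketch, not part of the diff: a minimal stand-in for the observation-tree nodes the LangGraph parser walks. Each node exposes .obs (with type, input, output) and a .parent link, which is all parse_observations reads; real trees come from the Langfuse collection code added elsewhere in this release, and the message payloads below are made up.

from types import SimpleNamespace

from wxo_agentic_evaluation.otel_parser import langgraph_parser

# Hypothetical CHAIN node wrapping one GENERATION, mimicking only the attributes the parser reads.
chain = SimpleNamespace(
    obs=SimpleNamespace(
        type="CHAIN",
        input={"messages": [{"type": "human", "content": "What is my vacation balance?"}]},
        output={"messages": [{"content": "You have 12 days left.", "tool_calls": []}]},
    ),
    parent=None,
)
generation = SimpleNamespace(obs=SimpleNamespace(type="GENERATION"), parent=chain)

messages = langgraph_parser.parse_observations(observation_tree=None, dfs_observations=[generation])
for m in messages:
    print(m.role, m.content)
# user What is my vacation balance?
# assistant You have 12 days left.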