ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/compare_runs/model.py (new file; path inferred from the "+193 -0" entry in the file list above)

@@ -0,0 +1,193 @@
+import statistics
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+@dataclass
+class TestCaseEvaluationResult:
+    """Class representing a single test case evaluation result."""
+
+    name: str
+    text_match: Optional[str] = None
+    is_success: bool = False
+    total_steps: float = 0
+    llm_step: float = 0
+    total_tool_calls: float = 0
+    tool_call_precision: float = 0
+    tool_call_recall: float = 0
+    agent_routing_accuracy: float = 0
+    avg_resp_time: float = 0
+    failed_tool_calls: int = 0
+
+    # Store any additional metrics not explicitly defined
+    additional_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def matches_count(self, match_value: str = "Summary Matched") -> int:
+        """Check if this test case matches the specified value."""
+        return 1 if self.text_match == match_value else 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the test case result to a dictionary."""
+        result = {
+            "dataset_name": self.name,
+            "text_match": self.text_match,
+            "is_success": self.is_success,
+            "total_steps": self.total_steps,
+            "llm_step": self.llm_step,
+            "total_tool_calls": self.total_tool_calls,
+            "tool_call_precision": self.tool_call_precision,
+            "tool_call_recall": self.tool_call_recall,
+            "agent_routing_accuracy": self.agent_routing_accuracy,
+            "avg_resp_time": self.avg_resp_time,
+            "failed_tool_calls": self.failed_tool_calls,
+        }
+        # Add any additional metrics
+        result.update(self.additional_metrics)
+        return result
+
+
+@dataclass
+class EvaluationResult:
+    """Class representing a collection of test case evaluation results."""
+
+    test_case_results: Dict[str, TestCaseEvaluationResult]
+
+    @classmethod
+    def from_csv(cls, data: List[Dict[str, Any]]) -> "EvaluationResult":
+        """Create an EvaluationResult from CSV data."""
+        results = {}
+        for row in data:
+            name = row["dataset_name"]
+
+            # Extract standard fields
+            standard_fields = {
+                "name": name,
+                "text_match": row.get("text_match"),
+                "is_success": row.get("is_success", False),
+                "total_steps": row.get("total_steps", 0),
+                "llm_step": row.get("llm_step", 0),
+                "total_tool_calls": row.get("total_tool_calls", 0),
+                "tool_call_precision": row.get("tool_call_precision", 0),
+                "tool_call_recall": row.get("tool_call_recall", 0),
+                "agent_routing_accuracy": row.get("agent_routing_accuracy", 0),
+                "avg_resp_time": row.get("avg_resp_time", 0),
+                "failed_tool_calls": row.get("failed_tool_calls", 0),
+            }
+
+            # Extract additional fields not in the standard set
+            additional_metrics = {}
+            for key, value in row.items():
+                if key not in standard_fields and key != "dataset_name":
+                    additional_metrics[key] = value
+
+            # Create the test case result
+            result = TestCaseEvaluationResult(
+                **standard_fields, additional_metrics=additional_metrics
+            )
+            results[name] = result
+
+        return cls(results)
+
+    def calculate_boolean_percent_true(
+        self, values: List[bool]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for boolean values."""
+        return {
+            "mean": sum(1 for v in values if v) / len(values) if values else 0,
+            "count": len(values),
+            "true_count": sum(1 for v in values if v),
+            "false_count": sum(1 for v in values if not v),
+        }
+
+    def calculate_numeric_statistics(
+        self, values: List[float]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for numeric values."""
+        try:
+            stats = {
+                "mean": statistics.mean(values),
+                "median": statistics.median(values),
+                "min": min(values),
+                "max": max(values),
+                "count": len(values),
+            }
+            if len(values) > 1:
+                stats["std_dev"] = statistics.stdev(values)
+            return stats
+        except statistics.StatisticsError:
+            # Handle empty lists or other statistical errors
+            return {"error": "Could not compute statistics"}
+
+    def compute_summary_statistics(self) -> Dict[str, Any]:
+        """Compute summary statistics for all test cases."""
+        stats = {}
+
+        if not self.test_case_results:
+            return stats
+
+        # Get all fields from the first test case
+        first_result = next(iter(self.test_case_results.values()))
+        first_dict = first_result.to_dict()
+
+        # Identify numeric and boolean columns
+        for key, value in first_dict.items():
+            if key == "dataset_name" or key == "text_match":
+                continue
+
+            # Collect values for this field from all test cases
+            values = []
+            for result in self.test_case_results.values():
+                result_dict = result.to_dict()
+                if key in result_dict:
+                    values.append(result_dict[key])
+
+            # Calculate statistics based on value type
+            if values:
+                if all(isinstance(v, bool) for v in values):
+                    stats[key] = self.calculate_boolean_percent_true(values)
+                elif all(isinstance(v, (int, float)) for v in values):
+                    stats[key] = self.calculate_numeric_statistics(values)
+
+        # Count summary matches
+        match_count = sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+        stats["summary_matched_count"] = {
+            "count": match_count,
+            "percentage": (
+                round(match_count / len(self.test_case_results) * 100, 2)
+                if self.test_case_results
+                else 0
+            ),
+        }
+
+        return stats
+
+    @property
+    def test_count(self) -> int:
+        """Get the total number of test cases."""
+        return len(self.test_case_results)
+
+    @property
+    def summary_matched_count(self) -> int:
+        """Get the count of summary matched test cases."""
+        return sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+
+    @property
+    def is_success_count(self) -> int:
+        """Get the count of successful test cases."""
+        return sum(
+            1 for result in self.test_case_results.values() if result.is_success
+        )
+
+    def summary_match_ratio(self) -> float:
+        """Calculate the ratio of summary matches to total tests."""
+        return safe_divide(self.summary_matched_count, self.test_count)
+
+    def is_success_ratio(self) -> float:
+        """Calculate the ratio of successful tests to total tests."""
+        return safe_divide(self.is_success_count, self.test_count)
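The two dataclasses above give compare_runs a typed container for per-test-case rows plus aggregate statistics. A minimal usage sketch follows; the import path is an assumption based on the compare_runs/model.py entry in the file list, and the rows are hand-typed stand-ins for a parsed results CSV:

```python
# Minimal sketch, assuming the module is importable as
# wxo_agentic_evaluation.compare_runs.model (per the file list above).
from wxo_agentic_evaluation.compare_runs.model import EvaluationResult

# Hand-typed stand-ins for rows parsed from a results CSV; from_csv only
# needs dicts, so any loader (csv.DictReader, pandas, ...) works as long
# as it yields properly typed values.
rows = [
    {"dataset_name": "tc_1", "text_match": "Summary Matched",
     "is_success": True, "total_steps": 6, "tool_call_recall": 1.0},
    {"dataset_name": "tc_2", "text_match": "Summary Mismatched",
     "is_success": False, "total_steps": 9, "tool_call_recall": 0.5},
]

result = EvaluationResult.from_csv(rows)
print(result.test_count)             # 2
print(result.summary_match_ratio())  # 0.5 (one "Summary Matched" row)
print(result.is_success_ratio())     # 0.5
print(result.compute_summary_statistics()["total_steps"])  # mean/median/min/max/count
```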
wxo_agentic_evaluation/data_annotator.py (path inferred from the "+25 -7" entry in the file list above; some removed lines were truncated by the diff viewer and are kept as extracted)

@@ -3,13 +3,16 @@ import collections
 import json
 from typing import Dict, List, Optional
 
-from wxo_agentic_evaluation.arg_configs import
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaKeywordsGenerationTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.type import
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
 
 ERROR_KEYWORDS = [
     "error",
@@ -113,11 +116,11 @@ class DataAnnotator:
         self,
         messages: List[Message],
         keywords_generation_config: KeywordsGenerationConfig,
-        initial_data: Optional[
+        initial_data: Optional[OrchestrateDataset] = None,
     ):
         self.messages = messages
         self.keywords_generation_config = keywords_generation_config
-        self.initial_data = initial_data or
+        self.initial_data = initial_data or OrchestrateDataset(
             agent="",
             story="",
             starting_sentence=messages[0].content if messages else "",
@@ -223,11 +226,23 @@ class DataAnnotator:
         return goals, goal_details, previous
 
     def _process_summarization(
-        self,
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
+
        for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
@@ -237,6 +252,7 @@ class DataAnnotator:
                         "decoding_method": "greedy",
                         "max_new_tokens": 256,
                     },
+                    **extra_kwargs,
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -261,10 +277,12 @@
             else:
                 goals[previous] = ["summarize"]
 
-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )
 
         return {
             "agent": self.initial_data.agent,
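The net effect of the _process_summarization changes is that connection details ride along from an optional ChatRecordingConfig into get_provider, with absent attributes silently skipped. A standalone sketch of that kwargs-threading pattern (SimpleNamespace stands in for the real config class; the service_url/token field names are taken from the getattr calls in the diff):

```python
# Standalone sketch of the defensive kwargs threading added above.
# SimpleNamespace stands in for ChatRecordingConfig; only attributes
# that exist and are truthy are forwarded.
from types import SimpleNamespace


def provider_extra_kwargs(config) -> dict:
    extra_kwargs = {}
    instance_url = getattr(config, "service_url", None)
    token = getattr(config, "token", None)
    if instance_url:
        extra_kwargs["instance_url"] = instance_url
    if token:
        extra_kwargs["token"] = token
    return extra_kwargs


print(provider_extra_kwargs(None))  # {} -- config stays fully optional
print(provider_extra_kwargs(
    SimpleNamespace(service_url="https://example.test", token="t0k3n")
))  # {'instance_url': 'https://example.test', 'token': 't0k3n'}
```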
wxo_agentic_evaluation/description_quality_checker.py (path inferred from the "+29 -6" entry in the file list above; some removed lines were truncated by the diff viewer and are kept as extracted)

@@ -3,8 +3,7 @@ from enum import Enum
 from pathlib import Path
 from typing import List
 
-import
-
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 from wxo_agentic_evaluation.prompt.template_render import (
     BadToolDescriptionRenderer,
 )
@@ -15,6 +14,9 @@ from wxo_agentic_evaluation.tool_planner import (
     parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
 from wxo_agentic_evaluation.utils.utils import safe_divide
 
 
@@ -60,12 +62,23 @@ class DescriptionQualityInspector:
         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
     )
 
+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
+
     def __init__(self, llm_client=None):
+
         if llm_client is None:
+
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
+
             llm_client = get_provider(
-
-                params=self.LLM_PARAMS,
+                **provider_kwargs,
             )
+
         self.llm_client = llm_client
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
@@ -106,7 +119,9 @@ class DescriptionQualityInspector:
         )
         return tool_definitions
 
-    def detect_bad_description(
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:
@@ -119,6 +134,10 @@ class DescriptionQualityInspector:
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)
 
@@ -137,7 +156,11 @@ class DescriptionQualityInspector:
             response_data=response_data
         )
 
-        return
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )
 
     def _calculate_score(self, response_data: dict) -> float:
         """
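detect_bad_description now always returns a DescriptionQualityMetric, and a tool with no description short-circuits before any LLM call. A sketch of that path follows; it assumes ToolDefinition can be constructed from just tool_name and tool_description (both names appear in the diff), and uses a stub client since the judge is never queried on this path:

```python
# Sketch of the new None-description short circuit. The stub satisfies
# the llm_client interface used above (.query(prompt)) but is never
# invoked here; building ToolDefinition from only these two fields is
# an assumption.
from wxo_agentic_evaluation.description_quality_checker import (
    DescriptionQualityInspector,
)
from wxo_agentic_evaluation.type import ToolDefinition


class StubLLMClient:
    def query(self, prompt: str) -> str:
        raise AssertionError("judge should not run for a missing description")


inspector = DescriptionQualityInspector(llm_client=StubLLMClient())
metric = inspector.detect_bad_description(
    ToolDefinition(tool_name="get_weather", tool_description=None)
)
print(metric.tool_name)  # "get_weather", with default score fields
```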
@@ -1,10 +1,15 @@
-from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
-from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
-from wxo_agentic_evaluation.type import Message, EvaluationData
-
 import json
 
-
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
+    convert_otel_to_message,
+)
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
+    "r",
+) as f:
     data = json.load(f)
 
 tc_name = "collie_trial"
@@ -15,7 +20,10 @@ for message in history:
     print(f"{message.role}: {message.content}")
 
 
-with open(
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
+    "r",
+) as f:
     gt = json.load(f)
 
 tc_name = "collie_trial"
@@ -28,7 +36,7 @@ evaluation_package = EvaluationPackage(
     messages=history,
     ground_truth=gt,
     conversational_search_data=None,
-    resource_map=None
+    resource_map=None,
 )
 
 (
@@ -39,4 +47,4 @@ evaluation_package = EvaluationPackage(
 ) = evaluation_package.generate_summary()
 
 
-print(metrics)
+print(metrics)
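The fixture paths in this script are absolute paths into one developer's checkout. A portable variant would resolve them relative to the module; a sketch, assuming the JSON fixtures sit next to the script:

```python
# Sketch only: resolve the fixtures relative to this file rather than a
# hard-coded /Users/... checkout, so the script runs from any clone.
import json
from pathlib import Path

HERE = Path(__file__).resolve().parent

with open(HERE / "collie_example.json", "r") as f:
    data = json.load(f)

with open(HERE / "data_simple.json", "r") as f:
    gt = json.load(f)
```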