ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,193 @@
+import statistics
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+@dataclass
+class TestCaseEvaluationResult:
+    """Class representing a single test case evaluation result."""
+
+    name: str
+    text_match: Optional[str] = None
+    is_success: bool = False
+    total_steps: float = 0
+    llm_step: float = 0
+    total_tool_calls: float = 0
+    tool_call_precision: float = 0
+    tool_call_recall: float = 0
+    agent_routing_accuracy: float = 0
+    avg_resp_time: float = 0
+    failed_tool_calls: int = 0
+
+    # Store any additional metrics not explicitly defined
+    additional_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def matches_count(self, match_value: str = "Summary Matched") -> int:
+        """Check if this test case matches the specified value."""
+        return 1 if self.text_match == match_value else 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the test case result to a dictionary."""
+        result = {
+            "dataset_name": self.name,
+            "text_match": self.text_match,
+            "is_success": self.is_success,
+            "total_steps": self.total_steps,
+            "llm_step": self.llm_step,
+            "total_tool_calls": self.total_tool_calls,
+            "tool_call_precision": self.tool_call_precision,
+            "tool_call_recall": self.tool_call_recall,
+            "agent_routing_accuracy": self.agent_routing_accuracy,
+            "avg_resp_time": self.avg_resp_time,
+            "failed_tool_calls": self.failed_tool_calls,
+        }
+        # Add any additional metrics
+        result.update(self.additional_metrics)
+        return result
+
+
+@dataclass
+class EvaluationResult:
+    """Class representing a collection of test case evaluation results."""
+
+    test_case_results: Dict[str, TestCaseEvaluationResult]
+
+    @classmethod
+    def from_csv(cls, data: List[Dict[str, Any]]) -> "EvaluationResult":
+        """Create an EvaluationResult from CSV data."""
+        results = {}
+        for row in data:
+            name = row["dataset_name"]
+
+            # Extract standard fields
+            standard_fields = {
+                "name": name,
+                "text_match": row.get("text_match"),
+                "is_success": row.get("is_success", False),
+                "total_steps": row.get("total_steps", 0),
+                "llm_step": row.get("llm_step", 0),
+                "total_tool_calls": row.get("total_tool_calls", 0),
+                "tool_call_precision": row.get("tool_call_precision", 0),
+                "tool_call_recall": row.get("tool_call_recall", 0),
+                "agent_routing_accuracy": row.get("agent_routing_accuracy", 0),
+                "avg_resp_time": row.get("avg_resp_time", 0),
+                "failed_tool_calls": row.get("failed_tool_calls", 0),
+            }
+
+            # Extract additional fields not in the standard set
+            additional_metrics = {}
+            for key, value in row.items():
+                if key not in standard_fields and key != "dataset_name":
+                    additional_metrics[key] = value
+
+            # Create the test case result
+            result = TestCaseEvaluationResult(
+                **standard_fields, additional_metrics=additional_metrics
+            )
+            results[name] = result
+
+        return cls(results)
+
+    def calculate_boolean_percent_true(
+        self, values: List[bool]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for boolean values."""
+        return {
+            "mean": sum(1 for v in values if v) / len(values) if values else 0,
+            "count": len(values),
+            "true_count": sum(1 for v in values if v),
+            "false_count": sum(1 for v in values if not v),
+        }
+
+    def calculate_numeric_statistics(
+        self, values: List[float]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for numeric values."""
+        try:
+            stats = {
+                "mean": statistics.mean(values),
+                "median": statistics.median(values),
+                "min": min(values),
+                "max": max(values),
+                "count": len(values),
+            }
+            if len(values) > 1:
+                stats["std_dev"] = statistics.stdev(values)
+            return stats
+        except statistics.StatisticsError:
+            # Handle empty lists or other statistical errors
+            return {"error": "Could not compute statistics"}
+
+    def compute_summary_statistics(self) -> Dict[str, Any]:
+        """Compute summary statistics for all test cases."""
+        stats = {}
+
+        if not self.test_case_results:
+            return stats
+
+        # Get all fields from the first test case
+        first_result = next(iter(self.test_case_results.values()))
+        first_dict = first_result.to_dict()
+
+        # Identify numeric and boolean columns
+        for key, value in first_dict.items():
+            if key == "dataset_name" or key == "text_match":
+                continue
+
+            # Collect values for this field from all test cases
+            values = []
+            for result in self.test_case_results.values():
+                result_dict = result.to_dict()
+                if key in result_dict:
+                    values.append(result_dict[key])
+
+            # Calculate statistics based on value type
+            if values:
+                if all(isinstance(v, bool) for v in values):
+                    stats[key] = self.calculate_boolean_percent_true(values)
+                elif all(isinstance(v, (int, float)) for v in values):
+                    stats[key] = self.calculate_numeric_statistics(values)
+
+        # Count summary matches
+        match_count = sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+        stats["summary_matched_count"] = {
+            "count": match_count,
+            "percentage": (
+                round(match_count / len(self.test_case_results) * 100, 2)
+                if self.test_case_results
+                else 0
+            ),
+        }
+
+        return stats
+
+    @property
+    def test_count(self) -> int:
+        """Get the total number of test cases."""
+        return len(self.test_case_results)
+
+    @property
+    def summary_matched_count(self) -> int:
+        """Get the count of summary matched test cases."""
+        return sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+
+    @property
+    def is_success_count(self) -> int:
+        """Get the count of successful test cases."""
+        return sum(
+            1 for result in self.test_case_results.values() if result.is_success
+        )
+
+    def summary_match_ratio(self) -> float:
+        """Calculate the ratio of summary matches to total tests."""
+        return safe_divide(self.summary_matched_count, self.test_count)
+
+    def is_success_ratio(self) -> float:
+        """Calculate the ratio of successful tests to total tests."""
+        return safe_divide(self.is_success_count, self.test_count)
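
The 193-line module added above appears, from the file list, to correspond to wxo_agentic_evaluation/compare_runs/model.py: a per-test-case result model plus aggregate statistics for comparing runs. A minimal usage sketch, not part of the diff; the import path is inferred from the file list and the sample rows are illustrative:

from wxo_agentic_evaluation.compare_runs.model import EvaluationResult  # path inferred from the file list above

# Rows shaped like the per-test-case summary data this model reads.
rows = [
    {"dataset_name": "case_a", "text_match": "Summary Matched", "is_success": True,
     "total_steps": 6, "avg_resp_time": 1.4},
    {"dataset_name": "case_b", "text_match": "No Match", "is_success": False,
     "total_steps": 9, "avg_resp_time": 2.1},
]

result = EvaluationResult.from_csv(rows)
print(result.test_count)                    # 2 test cases loaded
print(result.summary_match_ratio())         # 1 "Summary Matched" row out of 2
print(result.compute_summary_statistics())  # per-field means/medians plus summary_matched_count
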
@@ -1,16 +1,19 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import (
-    LlamaKeywordsGenerationTemplateRenderer,
-)
-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-
 import ast
-import json
 import collections
+import json
 from typing import Dict, List, Optional
 
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+
 ERROR_KEYWORDS = [
     "error",
     "erroneous",
@@ -113,11 +116,11 @@ class DataAnnotator:
         self,
         messages: List[Message],
         keywords_generation_config: KeywordsGenerationConfig,
-        initial_data: Optional[
+        initial_data: Optional[OrchestrateDataset] = None,
     ):
         self.messages = messages
         self.keywords_generation_config = keywords_generation_config
-        self.initial_data = initial_data or
+        self.initial_data = initial_data or OrchestrateDataset(
             agent="",
             story="",
             starting_sentence=messages[0].content if messages else "",
@@ -143,7 +146,9 @@ class DataAnnotator:
         )
         return wrong_tool_response_id
 
-    def _process_tool_call_order(
+    def _process_tool_call_order(
+        self, wrong_tool_response_id: list[str]
+    ) -> list[str]:
         """Process and order tool calls, skipping failed ones"""
         # gather all call ids that actually got a response
         valid_call_ids = {
@@ -221,16 +226,33 @@ class DataAnnotator:
         return goals, goal_details, previous
 
     def _process_summarization(
-        self,
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
+
         for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
                     model_id=self.keywords_generation_config.model_id,
-                    params={
+                    params={
+                        "min_new_tokens": 0,
+                        "decoding_method": "greedy",
+                        "max_new_tokens": 256,
+                    },
+                    **extra_kwargs,
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -248,15 +270,19 @@ class DataAnnotator:
                 goal_details.append(summarize_step)
                 break
 
-        if
-            goals[
-
+        if previous is None:
+            goals["summarize"] = []
+        elif summarize_step is None:
             goals[previous] = []
+        else:
+            goals[previous] = ["summarize"]
 
-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )
 
         return {
             "agent": self.initial_data.agent,
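
The DataAnnotator hunks above (data_annotator.py, judging by the class context lines) thread an optional ChatRecordingConfig through generate() so that a service URL and token reach get_provider(). A sketch of the new call shape, not part of the diff; the messages and keywords config are placeholders assumed to come from an existing chat recording, and the config stand-in carries only the two attributes the code reads via getattr:

from types import SimpleNamespace

from wxo_agentic_evaluation.data_annotator import DataAnnotator  # module inferred from the file list

# _process_summarization only reads `service_url` and `token` via getattr,
# so any object exposing them works as a ChatRecordingConfig stand-in here.
config = SimpleNamespace(service_url="https://example.invalid/orchestrate", token="dummy-token")

annotator = DataAnnotator(
    messages=recorded_messages,                  # List[Message] from a recorded chat (placeholder)
    keywords_generation_config=keywords_config,  # KeywordsGenerationConfig (placeholder)
)
dataset = annotator.generate(config=config)      # service_url/token now flow into get_provider()
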
@@ -0,0 +1,178 @@
+import os
+from enum import Enum
+from pathlib import Path
+from typing import List
+
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
+from wxo_agentic_evaluation.prompt.template_render import (
+    BadToolDescriptionRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.tool_planner import (
+    MISSING_DOCSTRING_PROMPT,
+    extract_tool_signatures,
+    parse_json_string,
+)
+from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+class ToolDescriptionIssue(Enum):
+    """
+    Represents the binary outcomes the LLM judge will classify in its assessment \
+    of the tool's description.
+    The presence of these issues in the tool's description indicates poor quality.
+    For more detail on what each issue indicates, please take a look at the template here: `wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2`.
+    """
+
+    # TODO: Priority-based weighting of issues.
+    CONTAINS_REDUNDANT_INFORMATION = "contains_redundant_information"
+    USES_VAGUE_LANGUAGE = "uses_vague_language"
+    DOES_NOT_HELP_IN_IDENTIFYING_TOOL_UNIQUELY = (
+        "does_not_help_in_identifying_tool_uniquely"
+    )
+    PROVIDES_NO_NEW_INFORMATION = "provides_no_new_information"
+    DOES_NOT_CONVEY_TOOL_PURPOSE = "does_not_convey_tool_purpose"
+
+
+class DescriptionQualityInspector:
+    DEFAULT_CLASSIFICATION_THRESHOLD = 40.0  # 2/5 issues detected. A higher score indicates a worse description.
+    CLASSIFICATION_SCORE_THRESHOLD = float(
+        os.getenv(
+            "CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD
+        )
+    )
+
+    LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
+    LLM_PARAMS = {
+        "min_new_tokens": 128,
+        "decoding_method": "greedy",
+        "max_new_tokens": 512,
+    }
+
+    WORST_POSSIBLE_EVAL_OUTCOME = len(
+        ToolDescriptionIssue
+    )  # the final score used for classification is normalized against this value.
+
+    root_dir = os.path.dirname(__file__)
+    BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH = os.path.join(
+        root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
+    )
+
+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
+
+    def __init__(self, llm_client=None):
+
+        if llm_client is None:
+
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
+
+            llm_client = get_provider(
+                **provider_kwargs,
+            )
+
+        self.llm_client = llm_client
+        self.template = BadToolDescriptionRenderer(
+            self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
+        )
+        self.cached_response = None  # this is used in the unit-tests for nuanced analysis of the response.
+
+    @staticmethod
+    def extract_tool_desc_from_tool_source(
+        tool_source: Path, failing_tools: List[str]
+    ) -> List[ToolDefinition]:
+        """
+        Parses the tool source file to extract the tool description.
+        Wraps the description along with the tool name, and args into a `ToolDefinition` for all `failing_tools`.
+        This `ToolDefinition` is later rendered into the judge's prompt template for evaluation.
+        Args:
+            tool_source (Path): The path to the tool source file/dir containing `.py` tools.
+            failing_tools (List[str]): List of tool names that failed during inference.
+        Returns:
+            List[ToolDefinition]: The extracted tool definition(s) or [] if the file contains no @tool decorators.
+        """
+        all_tool_data = extract_tool_signatures(tool_source)
+
+        tool_definitions = []
+        for tool_data in all_tool_data:
+            tool_name = tool_data["Function Name"]
+            if tool_name in failing_tools:
+                tool_definitions.append(
+                    ToolDefinition(
+                        tool_name=tool_name,
+                        tool_description=(
+                            tool_data["Docstring"]
+                            if tool_data["Docstring"]
+                            != MISSING_DOCSTRING_PROMPT
+                            else None
+                        ),
+                        tool_params=tool_data["Arguments"],
+                    )
+                )
+        return tool_definitions
+
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
+        """
+        Detects if a tool description is 'bad' using an LLM judge.
+        A 'bad' description is one that:
+        - does not describe the tool's functionality/use-case clearly
+        - does not provide sufficient detail for an agent to understand how to use the tool
+        - does not distinguish the tool from other tools
+        For the exact definition of a 'bad' description, refer to `ToolDescriptionIssue` Enum.
+        Args:
+            tool_definition (ToolDefinition): The definition of the tool to evaluate.
+        Returns:
+            bool: True if the description is 'bad', False otherwise.
+        """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
+        prompt = self.template.render(tool_definition=tool_definition)
+        response = self.llm_client.query(prompt)
+
+        # parse JSON objects from cleaned text
+        json_objects = parse_json_string(response)
+
+        # pick the first JSON object
+        if json_objects:
+            response_data = json_objects[0]
+            self.cached_response = response_data
+        else:
+            return False  # likely some unexpected parsing issue, in this case - flags description as good.
+
+        # calculate weighted score
+        final_description_score = self._calculate_score(
+            response_data=response_data
+        )
+
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )
+
+    def _calculate_score(self, response_data: dict) -> float:
+        """
+        Calculates a final score for the tool description.
+        This score is used to finally classify a 'good' or 'bad' description.
+        :param response_data: Parsed JSON response returned by the LLM judge.
+        """
+        detected_issues = sum(
+            1
+            for issue in ToolDescriptionIssue
+            if response_data.get(issue.value, "FALSE").upper() == "TRUE"
+        )
+        return (
+            safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+        )
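
The 178-line addition above matches description_quality_checker.py in the file list: an LLM-judge inspector that scores tool docstrings against the ToolDescriptionIssue checklist. A short usage sketch, not part of the diff, assuming provider credentials are already configured for get_provider; the tools directory and tool name below are illustrative:

from pathlib import Path

from wxo_agentic_evaluation.description_quality_checker import (  # module inferred from the file list
    DescriptionQualityInspector,
)

inspector = DescriptionQualityInspector()  # builds a default LLM judge via get_provider()

# Collect ToolDefinitions for tools that failed during inference, then score each description.
tool_defs = DescriptionQualityInspector.extract_tool_desc_from_tool_source(
    Path("./tools"), failing_tools=["get_employee_profile"]  # illustrative path and tool name
)
for tool_def in tool_defs:
    metric = inspector.detect_bad_description(tool_def)
    print(tool_def.tool_name, metric)  # DescriptionQualityMetric carrying description_score and threshold
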
@@ -0,0 +1,50 @@
+import json
+
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
+    convert_otel_to_message,
+)
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
+    "r",
+) as f:
+    data = json.load(f)
+
+tc_name = "collie_trial"
+
+
+history = convert_otel_to_message(data["calls"][-1]["messages"])
+for message in history:
+    print(f"{message.role}: {message.content}")
+
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
+    "r",
+) as f:
+    gt = json.load(f)
+
+tc_name = "collie_trial"
+
+gt = EvaluationData.model_validate(gt)
+
+
+evaluation_package = EvaluationPackage(
+    test_case_name=tc_name,
+    messages=history,
+    ground_truth=gt,
+    conversational_search_data=None,
+    resource_map=None,
+)
+
+(
+    keyword_semantic_matches,
+    knowledge_base_metrics,
+    messages_with_reason,
+    metrics,
+) = evaluation_package.generate_summary()
+
+
+print(metrics)