ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +35 -31
- wxo_agentic_evaluation/analyze_run.py +805 -344
- wxo_agentic_evaluation/arg_configs.py +10 -1
- wxo_agentic_evaluation/description_quality_checker.py +11 -2
- wxo_agentic_evaluation/evaluation_package.py +8 -3
- wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/inference_backend.py +46 -79
- wxo_agentic_evaluation/llm_matching.py +14 -2
- wxo_agentic_evaluation/main.py +1 -1
- wxo_agentic_evaluation/metrics/__init__.py +1 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +43 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +4 -2
- wxo_agentic_evaluation/quick_eval.py +7 -9
- wxo_agentic_evaluation/record_chat.py +22 -29
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
- wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +7 -0
- wxo_agentic_evaluation/type.py +1 -1
- wxo_agentic_evaluation/utils/__init__.py +3 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +131 -16
- wxo_agentic_evaluation/wxo_client.py +80 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/analyze_run.py
@@ -1,153 +1,481 @@
-import csv
 import json
 import os
 import re
+import time
+import traceback
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Tuple
 
+import rich
 from jsonargparse import CLI
-from rich import print as pretty_print
+from rich import box
+from rich.rule import Rule
+from rich.console import Group, Console
 from rich.panel import Panel
+from rich.progress import Progress
 from rich.style import Style
 from rich.table import Table
 from rich.text import Text
 
-from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
+from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode
 from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
 from wxo_agentic_evaluation.metrics.metrics import (
+    EnhancedAnalyzeMetrics,
     TextMatchType,
     ToolCallAndRoutingMetrics,
+    DescriptionQualityMetric,
+    DescriptionQuality,
 )
+from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
 from wxo_agentic_evaluation.type import (
     ContentType,
     ExtendedMessage,
     ToolDefinition,
 )
-from wxo_agentic_evaluation.utils import (
-
-
-
-    print_done,
-    warn,
-)
-from wxo_agentic_evaluation.utils.utils import (
+from wxo_agentic_evaluation.utils import (
+    ReferencelessEvalParser,
+    TestCaseResources,
+    ToolExtractionOpenAIFormat,
     add_line_seperator,
     list_run_files,
     load_run_metrics,
+    N_A,
+)
+
+MODEL_ID = "meta-llama/llama-3-405b-instruct"
+GATE_TOOL_ENRICHMENTS = (
+    os.getenv("GATE_TOOL_ENRICHMENTS", "true").lower().strip() == "true"
 )
+LOCK = Lock()
+
+
+class AnalyzerBase(ABC):
+    @abstractmethod
+    def analyze(self, config: AnalyzeConfig):
+        pass
+
+    @abstractmethod
+    def render(self):
+        pass
+
+    def _is_failed_tool_call(self, message: ExtendedMessage):
+        if message.reason and message.message.type == ContentType.tool_call:
+            if (
+                reason := message.reason.get("reason")
+            ) and reason != "irrelevant tool call":
+                return True
+
+    def _single_run(
+        self, test_case_name, run_map, test_cases_resource: TestCaseResources
+    ):
+        if not run_map:
+            # Legacy single-run files
+            test_messages, meta = test_cases_resource.get_analyze_messages(
+                test_case_name=test_case_name
+            )
+            metrics: ToolCallAndRoutingMetrics = (
+                test_cases_resource.get_test_metrics(
+                    test_case_name=test_case_name
+                )
+            )
+        else:
+            run_id = next(iter(run_map))
+            paths = run_map[run_id]
+            metrics = test_cases_resource.get_test_metrics(
+                path=paths["metrics"]
+            )
+            test_messages, meta = test_cases_resource.get_analyze_messages(
+                path=paths["analyze"]
+            )
+
+        # --- compute status uniformly (legacy & run1) ---
+        runs_problematic = self._is_failed_test_case(metrics)
+
+        return test_messages, meta, metrics, runs_problematic
+
+    def _is_failed_test_case(self, data) -> bool:
+        """
+        True -> test case failed
+        False -> test case succeeded
+        """
+
+        # not an ideal if statement
+        # in the future, refactor so this if statement is not needed
+        # this if statement is needed because this function is called in two cases:
+        # 1. if data is an instance of ToolCallAndRoutingMetrics
+        # 2. if data is a row in the summary table (dictionary)
+
+        # ideally the SummaryMetrics should be parsed into a pydantic class as well
+
+        if isinstance(data, ToolCallAndRoutingMetrics):
+            is_success = data.is_success
+            had_incorrect_param = data.tool_calls_with_incorrect_parameter > 0
+            low_precision = float(data.tool_call_precision) < 1.0
+            low_recall = float(data.tool_call_recall) < 1.0
+        else:
+            is_success = str(data["is_success"]).strip().lower() == "true"
+            had_incorrect_param = (
+                float(data.get("tool_calls_with_incorrect_parameter", 0) or 0)
+                > 0
+            )
+            low_precision = float(data.get("tool_call_precision", 1) or 1) < 1.0
+            low_recall = float(data.get("tool_call_recall", 1) or 1) < 1.0
+
+        return (
+            not is_success or had_incorrect_param or low_precision or low_recall
+        )
+
+    def _get_test_case_with_failed_tools(self, summary) -> List[str]:
+        test_case_with_failed_tools = []
+
+        for entry in summary:
+            test_case_name = entry["dataset_name"]
+
+            if test_case_name.lower().strip() == "summary (average)":
+                continue
+
+            if self._is_failed_test_case(entry):
+                test_case_with_failed_tools.append(entry)
+
+        return test_case_with_failed_tools
 
 
-class Analyzer:
+class DescriptionQualityAnalyzer(AnalyzerBase):
     def __init__(self):
-        self.analysis_cache: Dict[str,
-            {}
-        )  # the failing tools cached here won't be re-analyzed.
+        self.analysis_cache: Dict[str, DescriptionQualityMetric] = {}
         # tool_name -> description analysis
+        self.missing_tools = set()
+        self.tools_not_found = set()
 
-
-
-
-
-
-
-        )
+    def _get_tools_not_found_in_source(
+        self,
+        tools_to_analyze: List[str],
+        failing_tool_definitions: List[ToolDefinition],
+    ) -> Set[str]:
+
+        return set(tools_to_analyze) - {
+            tool_def.tool_name for tool_def in failing_tool_definitions
+        }
 
-    def
-
-
+    def _failing_tool_from_messages(self, messages: List[ExtendedMessage]):
+        failed_tool_calls = set()
+        for message in messages:
+            if self._is_failed_tool_call(message):
+                content = json.loads(message.message.content)
+                tool_call_name = content["name"]
+                failed_tool_calls.add(tool_call_name)
+
+        return failed_tool_calls
+
+    def failing_tools(self, data_path):
+        messages_dir = os.path.join(data_path, "messages")
+        test_case_resources = TestCaseResources(data_path)
+        processed_test_cases = set()
+        failed_tool_calls = set()
+
+        for test_case in test_case_resources.get_summary:
+            dataset_name = test_case["dataset_name"]
+            if dataset_name in processed_test_cases:
+                continue
+            processed_test_cases.add(dataset_name)
 
-
-        cached_lines: List[Text] = []
-        tools_analyzed: List[str] = []
+            run_map = list_run_files(messages_dir, test_case["dataset_name"])
 
-
-
-
-
-
-
-
+            if not run_map:
+                test_messages, _ = test_case_resources.get_analyze_messages(
+                    test_case_name=dataset_name
+                )
+                failed_tool_calls.update(
+                    self._failing_tool_from_messages(test_messages)
+                )
 
-
-
-
-
-
+            else:
+                for paths in run_map.values():
+                    test_messages, _ = test_case_resources.get_analyze_messages(
+                        path=paths["analyze"]
+                    )
+                    failed_tool_calls.update(
+                        self._failing_tool_from_messages(test_messages)
+                    )
 
-        return
+        return failed_tool_calls
 
     def analyze_failing_tool_description_quality(
         self,
         inspector: DescriptionQualityInspector,
         tool_definition_path: str,
         failing_tools: Set[str],
-    ) -> List[
+    ) -> Tuple[List[DescriptionQualityMetric], List[str]]:
         """
         :param tool_definition_path: Path to the tool definition file.
         :param failing_tools: Set of tool names that failed.
-        :return:
+        :return: A tuple whose first item is the List[DescriptionQualityMetric] for failed tools that were analyzed,
+                 and whose second item is a list of missing tools.
         """
 
-
-
-
+        failing_tool_definitions: List[ToolDefinition] = (
+            inspector.extract_tool_desc_from_tool_source(
+                Path(tool_definition_path),
+                failing_tools,
+            )
+        )
+
+        if not failing_tool_definitions:
+            """
+            No tool definitions (with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'
+            """
+            with LOCK:
+                self.tools_not_found.update(failing_tools)
+
+        missing_tools = self._get_tools_not_found_in_source(
+            failing_tools, failing_tool_definitions
+        )
+        for tool_definition in failing_tool_definitions:
+            tool_analysis = inspector.detect_bad_description(tool_definition)
+            with LOCK:
+                self.analysis_cache[tool_definition.tool_name] = tool_analysis
+                self.missing_tools.update(missing_tools)
+
+        return 1
+
+    def analyze(self, config):
+        failing_tools = self.failing_tools(config.data_path)
+        inspector = DescriptionQualityInspector()
+        tool_definition_path = config.tool_definition_path
+
+        with ThreadPoolExecutor(
+            max_workers=config.num_workers, thread_name_prefix="[Worker]"
+        ) as pool:
+            futures = [
+                pool.submit(
+                    self.analyze_failing_tool_description_quality,
+                    inspector,
+                    tool_definition_path,
+                    [failing_tool],
+                )
+                for failing_tool in failing_tools
+            ]
+
+            if futures:
+                with Progress() as progress:
+                    task = progress.add_task(
+                        f"[purple]Analyzing description quality for {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    for future in as_completed(futures):
+                        try:
+                            future.result()
+                        except Exception:
+                            traceback.print_exc()
+                        finally:
+                            progress.update(task, advance=1)
+
+    def render(self):
+        raise NotImplementedError("Not implemented")
+
+
+class Analyzer(AnalyzerBase):
+    def __init__(
+        self,
+        enhanced_metrics: Optional[List[EnhancedAnalyzeMetrics]] = None,
+        description_quality_analyzer: DescriptionQualityAnalyzer = None,
+    ):
+        self.enhanced_metrics = enhanced_metrics
+        self.enhanced_metrics_idx_map = {}
+
+        if self.enhanced_metrics:
+            # do some post-processing on the enhanced metrics
+            # create a mapping between test case name and index
+            if self.enhanced_metrics:
+                for idx, metric in enumerate(self.enhanced_metrics):
+                    self.enhanced_metrics_idx_map[metric.test_case_name] = idx
+
+        self.description_quality_analyzer = description_quality_analyzer
+
+    @staticmethod
+    def _generate_style_config():
+        return Style(
+            color="magenta",
+            blink=True,
+            bold=True,
         )
 
-
+    def _parse_enhanced_metrics(self, test_case_name) -> Optional[Table]:
+        table = Table(
+            box=box.ROUNDED,
+            show_lines=True,
+        )
 
-
-
-
-
+        columns = [
+            "Tool Name",
+            "Root Cause Analysis",
+            "Docstring Recommendations",
+        ]
 
-
-
+        rows = []
+
+        if (
+            self.enhanced_metrics
+            and (index := self.enhanced_metrics_idx_map.get(test_case_name))
+            is not None
+        ):
+            enhanced_metric: EnhancedAnalyzeMetrics = self.enhanced_metrics[
+                index
+            ]
+
+            for idx, tool_call in enumerate(enhanced_metric.tool_names):
+                static_root_causes = []
+                parsed_tool_annotations = []
+                param_annotations = defaultdict(list)
+
+                row = [tool_call]
+
+                # if this is true, then there are no semantic metrics
+                static_root_causes = [
+                    Text(item.explanation)
+                    for item in enhanced_metric.static_metrics[idx]
+                ]
+
+                static_root_causes = Text().join(static_root_causes)
+
+                # Parameter Root Cause
+                parameter_annotations = enhanced_metric.parameter_annotations[
+                    idx
+                ]
+                formatted_param_root_cause = [
+                    Text(metric.explanation) for metric in parameter_annotations
+                ]
+                formatted_param_root_cause = Text().join(
+                    formatted_param_root_cause
+                )
 
-
-
-
-
+                # Tool Root Cause
+                tool_annotations = enhanced_metric.tool_annotations[idx]
+                formatted_tool_root_cause = [
+                    Text(metric.explanation) for metric in tool_annotations
+                ]
+                formatted_tool_root_cause = Text().join(
+                    formatted_tool_root_cause
                 )
-            )
 
-
-
-
-
+                if formatted_param_root_cause or formatted_tool_root_cause:
+                    root_cause = (
+                        formatted_tool_root_cause
+                        if len(formatted_tool_root_cause)
+                        > len(formatted_param_root_cause)
+                        else formatted_param_root_cause
                     )
+                elif static_root_causes:
+                    root_cause = static_root_causes
+                else:
+                    root_cause = N_A
+
+                row.append(root_cause)
+
+                # Parameter Level Docstring
+                for metric in parameter_annotations:
+                    if annotations := metric.annotations:
+                        for annotation in annotations:
+                            param_annotations[annotation.parameter_name].append(
+                                f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                            )
+
+                newline = "\n"
+                param_annotations = [
+                    f"- [b]{param_name}:[/b] {newline.join(doc_string)}"
+                    for param_name, doc_string in param_annotations.items()
+                ]
+                param_annotations = "\n".join(param_annotations)
+
+                # Tool Level Docstring
+                for metric in tool_annotations:
+                    if annotations := metric.annotations:
+                        for annotation in annotations:
+                            parsed_tool_annotations.append(
+                                f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                            )
+                parsed_tool_annotations = "\n".join(parsed_tool_annotations)
+                docstring_cell = Table(
+                    show_lines=False, show_header=False, box=None
                 )
-
+                add_divider = False
+
+                # - Gate the Doc String Enrichments.
+                # - Ensure the environment variable is enabled.
+                if GATE_TOOL_ENRICHMENTS and self.description_quality_analyzer:
+                    # check if tool in cache
+                    tool_description_analysis = (
+                        self.description_quality_analyzer.analysis_cache.get(
+                            tool_call
+                        )
+                    )
+                    is_missing_tool = (
+                        tool_call
+                        in self.description_quality_analyzer.missing_tools
+                    )  # tool call not in tool_definition_path
+                    # failed tool call that failed to get extracted from the tool_definition_path because of a missing `@tool` decorator
+                    # TODO: figure out if this edge case is needed; taken from the original Analyze implementation
+                    tool_not_found = (
+                        tool_call
+                        in self.description_quality_analyzer.tools_not_found
+                    )
 
-
-
-
-
-
-
-
+                    # If the tool_call is in `missing_tools`, don't show the annotations
+                    if is_missing_tool or tool_not_found:
+                        parsed_tool_annotations = []
+                        param_annotations = []
+
+                    if tool_description_analysis is not None:
+                        if (
+                            tool_description_analysis.description_quality
+                            == DescriptionQuality.GOOD
+                        ):
+                            parsed_tool_annotations = []
+                            param_annotations = []
+                    else:
+                        print("cache miss: ", tool_call)
+
+                if not parsed_tool_annotations and not param_annotations:
+                    docstring_cell.add_row(N_A)
+                if parsed_tool_annotations:
+                    docstring_cell.add_row(
+                        "[b]Tool Docstrings", parsed_tool_annotations
+                    )
+                    add_divider = True
+                if param_annotations:
+                    if add_divider:
+                        docstring_cell.add_row(Rule(characters="--"))
+                    docstring_cell.add_row(
+                        "[b]Parameter Docstrings", param_annotations
                     )
-                )
 
-
+                row.append(docstring_cell)
+                rows.append(row)
 
-
-
-
-            tool_definition_path=tool_definition_path,
-        )
+        is_empty = not any(rows)
+        if is_empty:
+            return None
 
-
-
+        for idx, column in enumerate(columns):
+            table.add_column(column)
+
+        for row in rows:
+            table.add_row(*row)
 
-        return
+        return table
 
     def render(
         self,
         data: List[ExtendedMessage],
         tool_definition_path: Optional[str],
         meta: Optional[dict] = None,
+        test_case_name=None,
    ) -> Group:
        """
        Render the conversation history and analysis results.
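
The `_is_failed_test_case` gate added above accepts either a `ToolCallAndRoutingMetrics` instance or a raw summary-table row. A minimal standalone sketch of the dict branch (field names follow the diff; `is_failed_row` and the sample row are illustrative, not part of the package):

```python
# Sketch of the summary-row branch of _is_failed_test_case. Field names match
# the diff; is_failed_row itself and the sample row are illustrative only.
def is_failed_row(row: dict) -> bool:
    is_success = str(row["is_success"]).strip().lower() == "true"
    had_incorrect_param = (
        float(row.get("tool_calls_with_incorrect_parameter", 0) or 0) > 0
    )
    low_precision = float(row.get("tool_call_precision", 1) or 1) < 1.0
    low_recall = float(row.get("tool_call_recall", 1) or 1) < 1.0
    # Problematic if the journey failed or any tool-call signal is imperfect.
    return not is_success or had_incorrect_param or low_precision or low_recall


row = {"is_success": "False", "tool_call_precision": "0.5"}
assert is_failed_row(row)
```
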
@@ -158,7 +486,6 @@ class Analyzer:
         conversation_lines = []
         reason_lines = []
         failing_tools = []
-        added_errors_header = False
         added_missed_header = False
 
         for entry in data:
@@ -192,31 +519,11 @@ class Analyzer:
 
             text_line = Text(f"{label}: {content}\n")
             if reason:
-                if not added_errors_header:
-                    reason_lines.append(
-                        Text("\nTool Call Errors:\n", style="bold red")
-                    )
-                    added_errors_header = True
                 text_line.stylize("bold red")
                 reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
                 reason_lines.append(Text(reason_text, style="red"))
             conversation_lines.append(text_line)
 
-        if failing_tools and tool_definition_path:
-
-            inspector = DescriptionQualityInspector()
-
-            description_quality_inspection_lines = (
-                self.analyze_failing_tool_description_quality(
-                    inspector, tool_definition_path, set(failing_tools)
-                )
-            )
-
-            print_done()
-
-            if description_quality_inspection_lines:
-                reason_lines.extend(description_quality_inspection_lines)
-
         if meta:
             missed = meta.get("missed_tool_calls") or []
             if missed:
@@ -231,18 +538,21 @@ class Analyzer:
         conversation_panel = Panel(
             Text().join(conversation_lines),
             title="Conversation History",
-            border_style="
+            border_style="bold deep_sky_blue2",
         )
         reason_panel = Panel(
             Text().join(reason_lines),
-
-
+            box=box.ROUNDED,
+            title=f"[bold red]Tool Call Errors[/bold red]",
+            border_style="bold red",
         )
+        table = self._parse_enhanced_metrics(test_case_name=test_case_name)
+        if table:
+            group = Group(conversation_panel, reason_panel, table)
+        else:
+            group = Group(conversation_panel, reason_panel)
 
-        return
-            conversation_panel,
-            reason_panel,
-        )
+        return group
 
     def analyze(self, config: AnalyzeConfig):
         """
@@ -250,59 +560,15 @@ class Analyzer:
         :param config: AnalyzeConfig object containing user provided paths for analysis.
         """
 
-
-
-
-            path_to_summary_file = os.path.join(
-                config.data_path, summary_file_name
-            )
-
-            with open(path_to_summary_file, "r") as f:
-                reader = csv.reader(f)
-                header = next(reader)
-                for row in reader:
-                    summary.append(dict(zip(header, row)))
-
-            return summary
-
-        def get_test_messages(test_case_name):
-            test_messages = []
-            meta = {}
-
-            test_case_path = os.path.join(
-                config.data_path,
-                "messages",
-                f"{test_case_name}.messages.analyze.json",
-            )
-
-            with open(test_case_path, "r", encoding="utf-8") as f:
-                temp = json.load(f)
-                if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
-                    meta = temp[-1]["meta"]
-                    temp = temp[:-1]
-
-                for entry in temp:
-                    msg = ExtendedMessage(**entry)
-                    test_messages.append(msg)
-
-            return test_messages, meta
-
-        def get_metrics(test_case_name):
-            test_metrics_path = os.path.join(
-                config.data_path, "messages", f"{test_case_name}.metrics.json"
-            )
-
-            with open(test_metrics_path, "r", encoding="utf-8") as f:
-                metrics = ToolCallAndRoutingMetrics(**json.load(f))
-
-            return metrics
-
-        summary = get_summary()
+        test_case_resources = TestCaseResources(config.data_path)
+        summary = test_case_resources.get_summary
 
         test_case_with_failed_tools = self._get_test_case_with_failed_tools(
             summary=summary
         )
 
+        output_panels = []
+
         if len(test_case_with_failed_tools) == 0:
             header_table = Table(show_header=False, box=None)
 
@@ -313,7 +579,7 @@ class Analyzer:
                 title="[bold green]📋 Analysis Summary[/bold green]",
             )
 
-
+            output_panels.append(panel)
 
         messages_dir = os.path.join(config.data_path, "messages")
 
@@ -340,100 +606,17 @@ class Analyzer:
             if dataset_base in processed_parents:
                 continue
 
-            run_map = list_run_files(messages_dir, dataset_base)
+            run_map = list_run_files(messages_dir, dataset_base, config.run)
 
             # ---- SINGLE RUN (legacy or run1 only) ----
             if not run_map or len(run_map) == 1:
-
-
-
-                    test_case_name=dataset_base
+                runs_performed = 1
+                test_messages, meta, metrics, runs_problematic = (
+                    self._single_run(
+                        test_case_name=dataset_base,
+                        run_map=run_map,
+                        test_cases_resource=test_case_resources,
                     )
-                metrics: ToolCallAndRoutingMetrics = get_metrics(
-                    test_case_name=dataset_base
-                )
-                runs_performed = 1
-            else:
-                run_id = next(iter(run_map))
-                paths = run_map[run_id]
-                runs_performed = 1
-                if not paths["metrics"]:
-                    pretty_print(
-                        f"❌ {dataset_base}.run{run_id} — metrics file missing.",
-                        style="bold red",
-                    )
-                    # Count it as analyzed & problematic
-                    processed_parents.add(dataset_base)
-                    ds_table = Table(show_header=False, box=None)
-                    ds_table.add_row("Type: Single-run")
-                    ds_table.add_row("Status: ❌ Problematic")
-                    pretty_print(
-                        Panel(
-                            ds_table,
-                            title=f"📋 Analysis Summary — {dataset_base}",
-                            border_style="green",
-                        )
-                    )
-                    overall_runs_performed += 1
-                    overall_runs_problematic += 1
-                    add_line_seperator(self._generate_style_config())
-                    continue
-
-                metrics = load_run_metrics(paths["metrics"])
-                meta = {}
-
-                if paths["analyze"]:
-                    with open(paths["analyze"], "r", encoding="utf-8") as f:
-                        raw = json.load(f)
-                    if (
-                        raw
-                        and isinstance(raw[-1], dict)
-                        and "meta" in raw[-1]
-                    ):
-                        meta = raw[-1]["meta"]
-                        raw = raw[:-1]
-                    test_messages = [
-                        ExtendedMessage(**entry) for entry in raw
-                    ]
-                else:
-                    test_messages, meta = [], {}
-
-                # --- compute status uniformly (legacy & run1) ---
-                had_incorrect_param = (
-                    hasattr(metrics, "tool_calls_with_incorrect_parameter")
-                    and float(metrics.tool_calls_with_incorrect_parameter or 0)
-                    > 0
-                )
-                low_precision = (
-                    hasattr(metrics, "tool_call_precision")
-                    and float(
-                        metrics.tool_call_precision
-                        if metrics.tool_call_precision is not None
-                        else 1.0
-                    )
-                    < 1.0
-                )
-                low_recall = (
-                    hasattr(metrics, "tool_call_recall")
-                    and float(
-                        metrics.tool_call_recall
-                        if metrics.tool_call_recall is not None
-                        else 1.0
-                    )
-                    < 1.0
-                )
-                runs_problematic = (
-                    1
-                    if (
-                        (
-                            hasattr(metrics, "is_success")
-                            and not metrics.is_success
-                        )
-                        or had_incorrect_param
-                        or low_precision
-                        or low_recall
-                    )
-                    else 0
                 )
 
             processed_parents.add(dataset_base)
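
Both branches above read from the `run_map` returned by `list_run_files`: a mapping from run id to that run's `analyze`/`messages`/`metrics` file paths. The keys below are taken from the diff; the run-id format and paths themselves are hypothetical:

```python
# Hypothetical run_map shape consumed by _single_run and the multi-run loop.
# The "analyze"/"messages"/"metrics" keys mirror the diff; paths are invented.
run_map = {
    "1": {
        "analyze": "messages/case1.run1.messages.analyze.json",
        "messages": "messages/case1.run1.messages.json",
        "metrics": "messages/case1.run1.metrics.json",
    },
}

run_id = next(iter(run_map))  # single-run path: take the first (only) run
paths = run_map[run_id]
assert set(paths) == {"analyze", "messages", "metrics"}
```
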
@@ -445,14 +628,6 @@ class Analyzer:
                     "❌ Problematic" if runs_problematic else "✅ No problems"
                 )
                 ds_table.add_row(f"Status: {status}")
-                pretty_print(
-                    Panel(
-                        ds_table,
-                        title=f"📋 Analysis Summary — {dataset_base}",
-                        border_style="green",
-                    )
-                )
-
                 # Update overall counters/averages
                 overall_runs_performed += runs_performed
                 overall_runs_problematic += runs_problematic
@@ -469,19 +644,43 @@ class Analyzer:
                     1 if bool(metrics.is_success) else 0
                 )
 
-
-
-
+                header_group = Group(
+                    *[
+                        ds_table,
                         self._create_header_analysis_panel(
                             dataset_base, metrics
+                        ),
+                    ],
+                )
+                border_style = "bold red" if runs_problematic else "bold green"
+                header_panel = Panel(
+                    header_group,
+                    title=f"[b]📋 Analysis Summary — {dataset_base}[/b]",
+                    border_style=border_style,
+                )
+                output_panels.append(header_panel)
+
+                if runs_problematic:
+                    output_panels.append(
+                        self.render(
+                            test_messages,
+                            config.tool_definition_path,
+                            meta,
+                            test_case_name=dataset_base,
                         )
                     )
-
-
-
+                    output_panels.append(
+                        add_line_seperator(
+                            self._generate_style_config(), print=False
+                        )
+                    )
+
+                else:
+                    output_panels.append(
+                        add_line_seperator(
+                            self._generate_style_config(), print=False
                         )
                     )
-                add_line_seperator(self._generate_style_config())
 
                 continue
 
@@ -555,31 +754,32 @@ class Analyzer:
                 }
             )
 
-            # Print the dataset panel FIRST with both lines inside
-            ds_table = Table(show_header=False, box=None)
-            ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
-            ds_table.add_row(
-                f"Runs with problems: {runs_problematic} / {runs_performed}"
-            )
-            status = (
-                "❌ Problematic" if runs_problematic > 0 else "✅ No problems"
-            )
-            ds_table.add_row(f"Status: {status}")
-            pretty_print(
-                Panel(
-                    ds_table,
-                    title=f"📋 Analysis Summary — {dataset_base}",
-                    border_style="green",
-                )
-            )
-
         # Second pass: now replay only the problematic runs (so summary stays at the top)
         for item in deferred_runs:
-
-
-
+            ds_table = Table(show_header=False, box=None)
+            ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
+            ds_table.add_row(
+                f"Runs with problems: {runs_problematic} / {runs_performed}"
+            )
+            status = (
+                "❌ Problematic"
+                if runs_problematic > 0
+                else "✅ No problems"
+            )
+            ds_table.add_row(f"Status: {status}")
+            header_table = self._create_header_analysis_panel(
+                item["title"], item["metrics"]
+            )
+
+            group = Group(*[ds_table, header_table])
+            output_panels.append(
+                Panel(
+                    group,
+                    title=f"📋 Analysis Summary — {dataset_base}",
+                    border_style="green",
                 )
             )
+
             if item["analyze_path"]:
                 with open(item["analyze_path"], "r", encoding="utf-8") as f:
                     raw = json.load(f)
@@ -589,12 +789,16 @@ class Analyzer:
                     raw = raw[:-1]
                 test_messages = [ExtendedMessage(**entry) for entry in raw]
 
-
+                output_panels.append(
                     self.render(
                         test_messages, config.tool_definition_path, meta
                     )
                 )
-
+                output_panels.append(
+                    add_line_seperator(
+                        self._generate_style_config(), print=False
+                    )
+                )
 
         # Update overall aggregates
         overall_runs_performed += runs_performed
@@ -624,9 +828,9 @@ class Analyzer:
             )
             overall_lines.append(f"Avg journey success: {js_pct}%")
         else:
-            overall_lines.append("Avg journey success: N/A")
+            overall_lines.append(f"Avg journey success: N/A")
 
-
+        output_panels.append(
             Panel(
                 Text("\n".join(overall_lines)),
                 title="📋 Overall Summary",
@@ -634,6 +838,11 @@ class Analyzer:
             )
         )
 
+        console = Console()
+        with console.pager(styles=True):
+            for panel in output_panels:
+                console.print(panel, overflow="crop")
+
     def _create_header_analysis_panel(
         self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
    ) -> Panel:
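
Instead of printing panels as they are produced, the new code buffers them in `output_panels` and flushes them through Rich's pager, as the hunk above shows. A self-contained sketch of that pattern (the panel contents are invented):

```python
# Buffered-then-paged output: collect renderables, then page them in one pass.
# Console.pager(styles=True) keeps ANSI styling inside the pager.
from rich.console import Console
from rich.panel import Panel

output_panels = [Panel(f"result {i}", title=f"Test case {i}") for i in range(3)]

console = Console()
with console.pager(styles=True):
    for panel in output_panels:
        console.print(panel, overflow="crop")
```
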
@@ -649,86 +858,338 @@ class Analyzer:
         header_table.add_row(f"Text Match: {metrics.text_match.value}")
         header_table.add_row(f"Journey Success: {metrics.is_success}")
 
-
-            header_table, title="[bold green]Test Case Summary[/bold green]"
-        )
+        return header_table
 
-        return header_panel
 
-
+class AnalyzerEnhanced(AnalyzerBase):
+    PARAMETER_DOCUMENTATION = "PARAMETER_DOCUMENTATION"
+    TOOL_USAGE_EXAMPLES = "TOOL_USAGE_EXAMPLES"
+    TOOL_DOCUMENTATION = "TOOL_DOCUMENTATION"
 
-
+    DEFAULT_GENERATION_PARAMS = {
+        "min_new_tokens": 0,
+        "decoding_method": "greedy",
+        "max_new_tokens": 10_000,
+        "random_seed": 42,
+    }
 
-
-
+    def __init__(self):
+        super().__init__()
 
-
+    def _deduplicate_tool_call_failures(self, messages: List[ExtendedMessage]):
+        """If there are multiple failures from the same tool, then choose the failure that occurs later in the trajectory
+
+        ex.
+        1. Tool A fails
+        2. Tool A error response
+        3. Tool A is called again and fails
+        4. Tool A error response
+
+        For the analysis, we analyze the second time the tool call fails, with the previous messages serving as context.
+        """
+        tool_indices = []
+        seen_tools = set()
+
+        for idx, message in enumerate(reversed(messages)):
+            if self._is_failed_tool_call(message):
+                content = json.loads(message.message.content)
+                tool_call_name = content["name"]
+                if tool_call_name not in seen_tools:
+                    seen_tools.add(tool_call_name)
+                    tool_indices.append(len(messages) - 1 - idx)
+
+        return sorted(tool_indices)
+
+    def process_messages(self, task_name, test_case, tools, messages):
+        eval = ReferencelessEvaluation(
+            api_spec=tools,
+            model_id=MODEL_ID,
+            task_n=task_name,
+            dataset_name=test_case,
+            runtime_pipeline=False,
+            generation_params=AnalyzerEnhanced.DEFAULT_GENERATION_PARAMS,
+        )
+
+        processed_data = [
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
+            for msg in messages
+        ]
+
+        context = processed_data[:-1]
+        tool_call = processed_data[
+            -1
+        ]  # assume that the message is the last tool call
+        tool_call_msg = json.loads(tool_call["content"])
+        call = ReferencelessEvaluation.fmt_tool_call(
+            tool_id=tool_call_msg.get("id", "1"),
+            tool_call_name=tool_call_msg["name"],
+            arguments=json.dumps(tool_call_msg["args"]),
+            context=context,
+        )
+        return test_case, eval.run([call])
+
+    def _extract_semantic_metrics(
+        self, metrics_dictionary, annotation_filters: Optional[List[str]]
+    ):
+        semantic_analysis = []
+        for metric_data in metrics_dictionary.values():
+            raw_response = metric_data.get("raw_response")
+            if raw_response is None:
                 continue
 
-
+            is_correct = metric_data.get("is_correct", False)
+            if is_correct:
+                continue
 
-
-
+            failed_semantic_test_case = ReferencelessEvalParser.semantic_parser(
+                metric_name=metric_data.get("metric_name"),
+                data=raw_response,
+                annotation_filters=annotation_filters,
             )
-            tcp = float(entry.get("tool_call_precision", 1) or 1)
-            tcr = float(entry.get("tool_call_recall", 1) or 1)
 
-
-            if (not is_success) or (tip > 0) or (tcp < 1.0) or (tcr < 1.0):
-                test_case_with_failed_tools.append(entry)
+            semantic_analysis.append(failed_semantic_test_case)
 
-        return
+        return semantic_analysis
 
-    def
-
-
-
-
+    def tool_enrichment_view(self, results):
+        enhanced_metrics = []
+        tool_enrichment_metrics = defaultdict(list)
+        for result in results:
+            for test_case, eval_results in result.items():
+                for result in eval_results:
+                    # for metric in result:
+                    failed_static_metrics = []
+                    parameter_annotations = []
+                    tool_annotations = []
 
-
-
-
+                    static_metrics_passed = result.get("static", {}).get(
+                        "final_decision", False
+                    )
+                    tool_call_obj = result.get("inputs", {}).get(
+                        "tool_call", {}
+                    )
 
-
-
-
-
-
-
+                    if static_metrics_passed:
+                        semantic_metrics = result.get("semantic")
+                        function_selection_metrics = semantic_metrics.get(
+                            "function_selection", {}
+                        ).get("metrics", {})
+                        tool_annotations = self._extract_semantic_metrics(
+                            function_selection_metrics,
+                            [
+                                AnalyzerEnhanced.TOOL_DOCUMENTATION,
+                                AnalyzerEnhanced.TOOL_USAGE_EXAMPLES,
+                            ],
+                        )
 
-
-
+                        general_metrics = semantic_metrics.get(
+                            "general", {}
+                        ).get("metrics", {})
+                        parameter_annotations = self._extract_semantic_metrics(
+                            general_metrics,
+                            [AnalyzerEnhanced.PARAMETER_DOCUMENTATION],
+                        )
+                    else:
+                        static_metrics = result.get("static").get("metrics")
+                        failed_static_metrics = (
+                            ReferencelessEvalParser.static_parser(
+                                static_metrics=static_metrics
+                            )
+                        )
 
-
+                    parsed_metrics = {
+                        "tool_name": tool_call_obj.get("function", {}).get(
+                            "name"
+                        ),
+                        "parameter_annotations": parameter_annotations,
+                        "tool_annotations": tool_annotations,
+                        "static_metrics": failed_static_metrics,
+                    }
+                    tool_enrichment_metrics[test_case].append(parsed_metrics)
+
+        for test_case, metrics in tool_enrichment_metrics.items():
+            failed_tools = [metric["tool_name"] for metric in metrics]
+            parameter_annotations = [
+                metric["parameter_annotations"] for metric in metrics
+            ]
+            tool_annotation = [metric["tool_annotations"] for metric in metrics]
+            static_metrics = [metric["static_metrics"] for metric in metrics]
+
+            # don't add to final metrics array if there were no annotations
+            if (
+                not any(parameter_annotations)
+                and not any(tool_annotation)
+                and not any(static_metrics)
+            ):
+                continue
 
-
-
-
-
-
-
+            enhanced_metrics.append(
+                EnhancedAnalyzeMetrics(
+                    test_case_name=test_case,
+                    parameter_annotations=parameter_annotations,
+                    tool_annotations=tool_annotation,
+                    tool_names=failed_tools,
+                    static_metrics=static_metrics,
                 )
             )
-        return tool_analysis
 
-
-
-
-
-
+        return enhanced_metrics
+
+    def analyze(
+        self, config: AnalyzeConfig
+    ) -> Optional[List[EnhancedAnalyzeMetrics]]:
+        start = time.time()
+        all_tools = ToolExtractionOpenAIFormat.from_path(
+            config.tool_definition_path
+        )
+        messages_dir = os.path.join(config.data_path, "messages")
+        test_case_resources = TestCaseResources(config.data_path)
+
+        failed_test_cases = {}
+        for test_case in test_case_resources.get_summary:
+            if test_case["dataset_name"] in failed_test_cases:
+                continue
+            run_map = list_run_files(
+                messages_dir, test_case["dataset_name"], config.run
             )
-
+            if run_map and config.run == -1:
+                rich.print(
+                    "[red]Enhanced Mode only operates on a single run for a dataset. Since there are multiple runs, set the `--run` flag to the specific run for enhanced analysis."
+                )
+                # run the first run in the config map
+                rich.print(
+                    f"[b]Defaulting to run {next(iter(run_map))} to analyze for {test_case['dataset_name']}"
+                )
+                config.run = next(iter(run_map))
+                run_map = {config.run: run_map.get(config.run)}
 
-
-
-            is_ok(
-                message=f"The description for the `{tool_name}` looks sufficient."
+            _, _, _, run_problematic = self._single_run(
+                test_case["dataset_name"], run_map, test_case_resources
             )
+            if run_problematic:
+                if run_files := run_map.get(config.run):
+                    failed_test_cases[test_case["dataset_name"]] = run_files
+
+                else:
+                    # legacy runs without n runs
+                    # transform the legacy runs into the same data structure from `list_files`
+
+                    messages_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.messages.json",
+                    )
+
+                    analyze_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.messages.analyze.json",
+                    )
+
+                    metrics_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.metrics.json",
+                    )
+
+                    failed_test_cases[test_case["dataset_name"]] = {
+                        "analyze": analyze_path,
+                        "messages": messages_path,
+                        "metrics": metrics_path,
+                    }
+
+        max_workers = config.num_workers
+        rich.print(
+            f"[bold green]INFO:[/bold green] Number of workers set to: {max_workers}"
         )
-
+
+        jobs = []
+
+        with ThreadPoolExecutor(
+            max_workers=max_workers, thread_name_prefix="[Worker]"
+        ) as pool:
+            aggregate_results = []
+            for test_case, file_mapping in failed_test_cases.items():
+                analyze_messages, _ = test_case_resources.get_analyze_messages(
+                    path=file_mapping["analyze"]
+                )
+                idx_failed_tool_calls = self._deduplicate_tool_call_failures(
+                    analyze_messages
+                )
+                messages = test_case_resources.get_messages(
+                    path=file_mapping["messages"]
+                )
+
+                for idx in idx_failed_tool_calls:
+                    jobs.append(
+                        {
+                            "task_name": f"{test_case}-0-{idx + 1}",
+                            "test_case": test_case,
+                            "tools": all_tools,
+                            "messages": messages[0 : idx + 1],
+                        }
+                    )
+            jobs = sorted(jobs, key=lambda x: len(x["messages"]))
+            futures = [
+                pool.submit(
+                    self.process_messages,
+                    job["task_name"],
+                    job["test_case"],
+                    job["tools"],
+                    job["messages"],
+                )
+                for job in jobs
+            ]
+
+            if futures:
+                with Progress() as progress:
+                    task = progress.add_task(
+                        f"[purple]Evaluating {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    for future in as_completed(futures):
+                        try:
+                            test_case, results = future.result()
+                            aggregate_results.append({test_case: results})
+                        except Exception as e:
+                            rich.print(
+                                f"test case, {test_case} ,fails with {e}"
+                            )
+                            traceback.print_exc()
+                        finally:
+                            progress.update(task, advance=1)
+
+        enhanced_metrics = self.tool_enrichment_view(aggregate_results)
+        end = time.time()
+        rich.print(f"Enhanced Analysis took {end - start} s")
+
+        return enhanced_metrics
+
+    def render(self):
+        raise NotImplementedError("Not implemented")
+
+
+def run(args):
+    d = DescriptionQualityAnalyzer()
+    if args.mode == AnalyzeMode.enhanced:
+        if GATE_TOOL_ENRICHMENTS:
+            d.analyze(args)
+
+        enhanced = AnalyzerEnhanced()
+        enhanced_metrics = enhanced.analyze(config=args)
+        dummy_analyzer = Analyzer(enhanced_metrics, d)
+        dummy_analyzer.analyze(args)
+
+    else:
+        dummy_analyzer = Analyzer()
+        dummy_analyzer.analyze(args)
 
 
 if __name__ == "__main__":
-
-
+    args = CLI(AnalyzeConfig, as_positional=False)
+    run(args)