ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (33)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +33 -29
  3. wxo_agentic_evaluation/analyze_run.py +805 -344
  4. wxo_agentic_evaluation/arg_configs.py +10 -1
  5. wxo_agentic_evaluation/description_quality_checker.py +11 -2
  6. wxo_agentic_evaluation/evaluation_package.py +8 -3
  7. wxo_agentic_evaluation/inference_backend.py +46 -79
  8. wxo_agentic_evaluation/llm_matching.py +14 -2
  9. wxo_agentic_evaluation/main.py +1 -1
  10. wxo_agentic_evaluation/metrics/__init__.py +1 -0
  11. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  12. wxo_agentic_evaluation/metrics/metrics.py +43 -1
  13. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  14. wxo_agentic_evaluation/prompt/template_render.py +4 -2
  15. wxo_agentic_evaluation/quick_eval.py +7 -9
  16. wxo_agentic_evaluation/record_chat.py +2 -5
  17. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
  18. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
  19. wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
  20. wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
  21. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  22. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  24. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
  25. wxo_agentic_evaluation/resource_map.py +3 -1
  26. wxo_agentic_evaluation/service_instance.py +7 -0
  27. wxo_agentic_evaluation/type.py +1 -1
  28. wxo_agentic_evaluation/utils/__init__.py +3 -0
  29. wxo_agentic_evaluation/utils/parsers.py +71 -0
  30. wxo_agentic_evaluation/utils/utils.py +131 -16
  31. wxo_agentic_evaluation/wxo_client.py +80 -0
  32. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
  33. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/analyze_run.py
@@ -1,153 +1,481 @@
-import csv
 import json
 import os
 import re
+import time
+import traceback
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Tuple
 
+import rich
 from jsonargparse import CLI
-from rich.console import Group
+from rich import box
+from rich.rule import Rule
+from rich.console import Group, Console
 from rich.panel import Panel
+from rich.progress import Progress
 from rich.style import Style
 from rich.table import Table
 from rich.text import Text
 
-from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
+from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode
 from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
 from wxo_agentic_evaluation.metrics.metrics import (
+    EnhancedAnalyzeMetrics,
     TextMatchType,
     ToolCallAndRoutingMetrics,
+    DescriptionQualityMetric,
+    DescriptionQuality,
 )
+from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
 from wxo_agentic_evaluation.type import (
     ContentType,
     ExtendedMessage,
     ToolDefinition,
 )
-from wxo_agentic_evaluation.utils.rich_utils import (
-    IncorrectParameterUtils,
-    is_ok,
-    pretty_print,
-    print_done,
-    warn,
-)
-from wxo_agentic_evaluation.utils.utils import (
+from wxo_agentic_evaluation.utils import (
+    ReferencelessEvalParser,
+    TestCaseResources,
+    ToolExtractionOpenAIFormat,
     add_line_seperator,
     list_run_files,
     load_run_metrics,
+    N_A,
+)
+
+MODEL_ID = "meta-llama/llama-3-405b-instruct"
+GATE_TOOL_ENRICHMENTS = (
+    os.getenv("GATE_TOOL_ENRICHMENTS", "true").lower().strip() == "true"
 )
+LOCK = Lock()
+
+
+class AnalyzerBase(ABC):
+    @abstractmethod
+    def analyze(self, config: AnalyzeConfig):
+        pass
+
+    @abstractmethod
+    def render(self):
+        pass
+
+    def _is_failed_tool_call(self, message: ExtendedMessage):
+        if message.reason and message.message.type == ContentType.tool_call:
+            if (
+                reason := message.reason.get("reason")
+            ) and reason != "irrelevant tool call":
+                return True
+
+    def _single_run(
+        self, test_case_name, run_map, test_cases_resource: TestCaseResources
+    ):
+        if not run_map:
+            # Legacy single-run files
+            test_messages, meta = test_cases_resource.get_analyze_messages(
+                test_case_name=test_case_name
+            )
+            metrics: ToolCallAndRoutingMetrics = (
+                test_cases_resource.get_test_metrics(
+                    test_case_name=test_case_name
+                )
+            )
+        else:
+            run_id = next(iter(run_map))
+            paths = run_map[run_id]
+            metrics = test_cases_resource.get_test_metrics(
+                path=paths["metrics"]
+            )
+            test_messages, meta = test_cases_resource.get_analyze_messages(
+                path=paths["analyze"]
+            )
+
+        # --- compute status uniformly (legacy & run1) ---
+        runs_problematic = self._is_failed_test_case(metrics)
+
+        return test_messages, meta, metrics, runs_problematic
+
+    def _is_failed_test_case(self, data) -> bool:
+        """
+        True -> test case failed
+        False -> test success
+        """
+
+        # not ideal if statement
+        # in the future, refactor so this if statement is not needed
+        # this if statement is needed because this function is called in two cases:
+        # 1. if data is an instance ToolCallAndRoutingMetrics
+        # 2. if data is a row in the summary table (dictionary)
+
+        # ideal the SummaryMetrics should be parsed into pydantic class as well
+
+        if isinstance(data, ToolCallAndRoutingMetrics):
+            is_success = data.is_success
+            had_incorrect_param = data.tool_calls_with_incorrect_parameter > 0
+            low_precision = float(data.tool_call_precision) < 1.0
+            low_recall = float(data.tool_call_recall) < 1.0
+        else:
+            is_success = str(data["is_success"]).strip().lower() == "true"
+            had_incorrect_param = (
+                float(data.get("tool_calls_with_incorrect_parameter", 0) or 0)
+                > 0
+            )
+            low_precision = float(data.get("tool_call_precision", 1) or 1) < 1.0
+            low_recall = float(data.get("tool_call_recall", 1) or 1) < 1.0
+
+        return (
+            not is_success or had_incorrect_param or low_precision or low_recall
+        )
+
+    def _get_test_case_with_failed_tools(self, summary) -> List[str]:
+        test_case_with_failed_tools = []
+
+        for entry in summary:
+            test_case_name = entry["dataset_name"]
+
+            if test_case_name.lower().strip() == "summary (average)":
+                continue
+
+            if self._is_failed_test_case(entry):
+                test_case_with_failed_tools.append(entry)
+
+        return test_case_with_failed_tools
 
 
-class Analyzer:
+class DescriptionQualityAnalyzer(AnalyzerBase):
     def __init__(self):
-        self.analysis_cache: Dict[str, List[Text]] = (
-            {}
-        )  # the failing tools cached here won't be re-analyzed.
+        self.analysis_cache: Dict[str, DescriptionQualityMetric] = {}
         # tool_name -> description analysis
+        self.missing_tools = set()
+        self.tools_not_found = set()
 
-    @staticmethod
-    def _generate_style_config():
-        return Style(
-            color="magenta",
-            blink=True,
-            bold=True,
-        )
+    def _get_tools_not_found_in_source(
+        self,
+        tools_to_analyze: List[str],
+        failing_tool_definitions: List[ToolDefinition],
+    ) -> Set[str]:
+
+        return set(tools_to_analyze) - {
+            tool_def.tool_name for tool_def in failing_tool_definitions
+        }
 
-    def _split_cache(
-        self, failing_tools: Set[str]
-    ) -> tuple[List[str], List[Text]]:
+    def _failing_tool_from_messages(self, messages: List[ExtendedMessage]):
+        failed_tool_calls = set()
+        for message in messages:
+            if self._is_failed_tool_call(message):
+                content = json.loads(message.message.content)
+                tool_call_name = content["name"]
+                failed_tool_calls.add(tool_call_name)
+
+        return failed_tool_calls
+
+    def failing_tools(self, data_path):
+        messages_dir = os.path.join(data_path, "messages")
+        test_case_resources = TestCaseResources(data_path)
+        processed_test_cases = set()
+        failed_tool_calls = set()
+
+        for test_case in test_case_resources.get_summary:
+            dataset_name = test_case["dataset_name"]
+            if dataset_name in processed_test_cases:
+                continue
+            processed_test_cases.add(dataset_name)
 
-        tools_to_analyze: List[str] = []
-        cached_lines: List[Text] = []
-        tools_analyzed: List[str] = []
+            run_map = list_run_files(messages_dir, test_case["dataset_name"])
 
-        for tool_name in sorted(failing_tools):
-            cached_analysis = self.analysis_cache.get(tool_name)
-            if cached_analysis:
-                cached_lines.extend(cached_analysis)
-                tools_analyzed.append(tool_name)
-            else:
-                tools_to_analyze.append(tool_name)
+            if not run_map:
+                test_messages, _ = test_case_resources.get_analyze_messages(
+                    test_case_name=dataset_name
+                )
+                failed_tool_calls.update(
+                    self._failing_tool_from_messages(test_messages)
+                )
 
-        if tools_analyzed:
-            pretty_print(
-                content=f"ℹ️ Loading cached analysis since these failing tools: {tools_analyzed} have been analyzed previously.",
-                style="bold cyan",
-            )
+            else:
+                for paths in run_map.values():
+                    test_messages, _ = test_case_resources.get_analyze_messages(
+                        path=paths["analyze"]
+                    )
+                    failed_tool_calls.update(
+                        self._failing_tool_from_messages(test_messages)
+                    )
 
-        return (tools_to_analyze, cached_lines)
+        return failed_tool_calls
 
     def analyze_failing_tool_description_quality(
         self,
         inspector: DescriptionQualityInspector,
         tool_definition_path: str,
         failing_tools: Set[str],
-    ) -> List[Text]:
+    ) -> Tuple[List[DescriptionQualityMetric], List[str]]:
         """
         :param tool_definition_path: Path to the tool definition file.
         :param failing_tools: Set of tool names that failed.
-        :return: List of rich `Text` objects containing feedback for the customer.
+        :return: A tuple where the first item in the tuple is List[DescriptionQualityMetric] for failed tools that were analyzed,
+            the second item in the list is a list of missing tools
         """
 
-        pretty_print(
-            content=f"⚙️ Checking tool description quality for failing tools: {sorted(failing_tools)}",
-            style="bold cyan",
+        failing_tool_definitions: List[ToolDefinition] = (
+            inspector.extract_tool_desc_from_tool_source(
+                Path(tool_definition_path),
+                failing_tools,
+            )
+        )
+
+        if not failing_tool_definitions:
+            """
+            No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
+            """
+            with Lock:
+                self.tools_not_found.add(failing_tools)
+
+        missing_tools = self._get_tools_not_found_in_source(
+            failing_tools, failing_tool_definitions
+        )
+        for tool_definition in failing_tool_definitions:
+            tool_analysis = inspector.detect_bad_description(tool_definition)
+            with LOCK:
+                self.analysis_cache[tool_definition.tool_name] = tool_analysis
+                self.missing_tools.update(missing_tools)
+
+        return 1
+
+    def analyze(self, config):
+        failing_tools = self.failing_tools(config.data_path)
+        inspector = DescriptionQualityInspector()
+        tool_definition_path = config.tool_definition_path
+
+        with ThreadPoolExecutor(
+            max_workers=config.num_workers, thread_name_prefix="[Worker]"
+        ) as pool:
+            futures = [
+                pool.submit(
+                    self.analyze_failing_tool_description_quality,
+                    inspector,
+                    tool_definition_path,
+                    [failing_tool],
+                )
+                for failing_tool in failing_tools
+            ]
+
+            if futures:
+                with Progress() as progress:
+                    task = progress.add_task(
+                        f"[purple]Analyzing description quality for {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    for future in as_completed(futures):
+                        try:
+                            future.result()
+                        except Exception:
+                            traceback.print_exc()
+                        finally:
+                            progress.update(task, advance=1)
+
+    def render(self):
+        raise NotImplementedError("Not implemented")
+
+
+class Analyzer(AnalyzerBase):
+    def __init__(
+        self,
+        enhanced_metrics: Optional[List[EnhancedAnalyzeMetrics]] = None,
+        description_quality_analyzer: DescriptionQualityAnalyzer = None,
+    ):
+        self.enhanced_metrics = enhanced_metrics
+        self.enhanced_metrics_idx_map = {}
+
+        if self.enhanced_metrics:
+            # do some post-processing on the enhanced metrics
+            # create a mapping between test case name and index
+            if self.enhanced_metrics:
+                for idx, metric in enumerate(self.enhanced_metrics):
+                    self.enhanced_metrics_idx_map[metric.test_case_name] = idx
+
+        self.description_quality_analyzer = description_quality_analyzer
+
+    @staticmethod
+    def _generate_style_config():
+        return Style(
+            color="magenta",
+            blink=True,
+            bold=True,
         )
 
-        analysis_for_display: List[Text] = []
+    def _parse_enhanced_metrics(self, test_case_name) -> Optional[Table]:
+        table = Table(
+            box=box.ROUNDED,
+            show_lines=True,
+        )
 
-        # Step 1: get tools not yet analyzed and cached analysis for tools analyzed previously
-        tools_to_analyze, cached_analysis = self._split_cache(failing_tools)
-        if cached_analysis:
-            analysis_for_display.extend(cached_analysis)
+        columns = [
+            "Tool Name",
+            "Root Cause Analysis",
+            "Docstring Recommendations",
+        ]
 
-        # Step 2: analyze cache misses
-        if tools_to_analyze:
+        rows = []
+
+        if (
+            self.enhanced_metrics
+            and (index := self.enhanced_metrics_idx_map.get(test_case_name))
+            is not None
+        ):
+            enhanced_metric: EnhancedAnalyzeMetrics = self.enhanced_metrics[
+                index
+            ]
+
+            for idx, tool_call in enumerate(enhanced_metric.tool_names):
+                static_root_causes = []
+                parsed_tool_annotations = []
+                param_annotations = defaultdict(list)
+
+                row = [tool_call]
+
+                # if this is true, then there are no semantic metrics
+                static_root_causes = [
+                    Text(item.explanation)
+                    for item in enhanced_metric.static_metrics[idx]
+                ]
+
+                static_root_causes = Text().join(static_root_causes)
+
+                # Parameter Root Cause
+                parameter_annotations = enhanced_metric.parameter_annotations[
+                    idx
+                ]
+                formatted_param_root_cause = [
+                    Text(metric.explanation) for metric in parameter_annotations
+                ]
+                formatted_param_root_cause = Text().join(
+                    formatted_param_root_cause
+                )
 
-            failing_tool_definitions: List[ToolDefinition] = (
-                inspector.extract_tool_desc_from_tool_source(
-                    Path(tool_definition_path),
-                    tools_to_analyze,
+                # Tool Root Cause
+                tool_annotations = enhanced_metric.tool_annotations[idx]
+                formatted_tool_root_cause = [
+                    Text(metric.explanation) for metric in tool_annotations
+                ]
+                formatted_tool_root_cause = Text().join(
+                    formatted_tool_root_cause
                 )
-            )
 
-            if not failing_tool_definitions:
-                analysis_for_display.append(
-                    warn(
-                        message=f"No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
+                if formatted_param_root_cause or formatted_tool_root_cause:
+                    root_cause = (
+                        formatted_tool_root_cause
+                        if len(formatted_tool_root_cause)
+                        > len(formatted_param_root_cause)
+                        else formatted_param_root_cause
                     )
+                elif static_root_causes:
+                    root_cause = static_root_causes
+                else:
+                    root_cause = N_A
+
+                row.append(root_cause)
+
+                # Parameter Level Docstring
+                for metric in parameter_annotations:
+                    if annotations := metric.annotations:
+                        for annotation in annotations:
+                            param_annotations[annotation.parameter_name].append(
+                                f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                            )
+
+                newline = "\n"
+                param_annotations = [
+                    f"- [b]{param_name}:[/b] {newline.join(doc_string)}"
+                    for param_name, doc_string in param_annotations.items()
+                ]
+                param_annotations = "\n".join(param_annotations)
+
+                # Tool Level Docstring
+                for metric in tool_annotations:
+                    if annotations := metric.annotations:
+                        for annotation in annotations:
+                            parsed_tool_annotations.append(
+                                f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                            )
+                parsed_tool_annotations = "\n".join(parsed_tool_annotations)
+                docstring_cell = Table(
+                    show_lines=False, show_header=False, box=None
                 )
-                return analysis_for_display
+                add_divider = False
+
+                # - Gate the Doc String Enrichments.
+                # - Ensure the environment variable is enabled.
+                if GATE_TOOL_ENRICHMENTS and self.description_quality_analyzer:
+                    # check if tool in cache
+                    tool_description_analysis = (
+                        self.description_quality_analyzer.analysis_cache.get(
+                            tool_call
+                        )
+                    )
+                    is_missing_tool = (
+                        tool_call
+                        in self.description_quality_analyzer.missing_tools
+                    )  # tool call not in tool_definition_path
+                    # failed tool call that failed to get extracted from the tool_definition_path because of missing `@tool` decorator
+                    # TODO: figure out if this edge is needed? taken from original Analyze implementation
+                    tool_not_found = (
+                        tool_call
+                        in self.description_quality_analyzer.tools_not_found
+                    )
 
-            missing_tools = self._get_tools_not_found_in_source(
-                tools_to_analyze, failing_tool_definitions
-            )
-            if missing_tools:
-                analysis_for_display.append(
-                    warn(
-                        message=f"Missing tool definitions for failed tools: '{missing_tools}' in the file: '{tool_definition_path}'"
+                    # If the tool_call is in `missing_tools`, don't show the annotations
+                    if is_missing_tool or tool_not_found:
+                        parsed_tool_annotations = []
+                        param_annotations = []
+
+                    if tool_description_analysis is not None:
+                        if (
+                            tool_description_analysis.description_quality
+                            == DescriptionQuality.GOOD
+                        ):
+                            parsed_tool_annotations = []
+                            param_annotations = []
+                    else:
+                        print("cache miss: ", tool_call)
+
+                if not parsed_tool_annotations and not param_annotations:
+                    docstring_cell.add_row(N_A)
+                if parsed_tool_annotations:
+                    docstring_cell.add_row(
+                        "[b]Tool Docstrings", parsed_tool_annotations
+                    )
+                    add_divider = True
+                if param_annotations:
+                    if add_divider:
+                        docstring_cell.add_row(Rule(characters="--"))
+                    docstring_cell.add_row(
+                        "[b]Parameter Docstrings", param_annotations
                     )
-                )
 
-            for tool_definition in failing_tool_definitions:
+                row.append(docstring_cell)
+                rows.append(row)
 
-                tool_analysis = self._analyze_tool_definition(
-                    inspector=inspector,
-                    tool_definition=tool_definition,
-                    tool_definition_path=tool_definition_path,
-                )
+        is_empty = not any(rows)
+        if is_empty:
+            return None
 
-                self.analysis_cache[tool_definition.tool_name] = tool_analysis
-                analysis_for_display.extend(tool_analysis)
+        for idx, column in enumerate(columns):
+            table.add_column(column)
+
+        for row in rows:
+            table.add_row(*row)
 
-        return analysis_for_display
+        return table
 
     def render(
         self,
         data: List[ExtendedMessage],
         tool_definition_path: Optional[str],
         meta: Optional[dict] = None,
+        test_case_name=None,
     ) -> Group:
         """
         Render the conversation history and analysis results.
@@ -158,7 +486,6 @@ class Analyzer:
         conversation_lines = []
         reason_lines = []
         failing_tools = []
-        added_errors_header = False
         added_missed_header = False
 
         for entry in data:
@@ -192,31 +519,11 @@
 
             text_line = Text(f"{label}: {content}\n")
             if reason:
-                if not added_errors_header:
-                    reason_lines.append(
-                        Text("\nTool Call Errors:\n", style="bold red")
-                    )
-                    added_errors_header = True
                 text_line.stylize("bold red")
                 reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
                 reason_lines.append(Text(reason_text, style="red"))
             conversation_lines.append(text_line)
 
-        if failing_tools and tool_definition_path:
-
-            inspector = DescriptionQualityInspector()
-
-            description_quality_inspection_lines = (
-                self.analyze_failing_tool_description_quality(
-                    inspector, tool_definition_path, set(failing_tools)
-                )
-            )
-
-            print_done()
-
-            if description_quality_inspection_lines:
-                reason_lines.extend(description_quality_inspection_lines)
-
         if meta:
             missed = meta.get("missed_tool_calls") or []
             if missed:
@@ -231,18 +538,21 @@
         conversation_panel = Panel(
             Text().join(conversation_lines),
             title="Conversation History",
-            border_style="blue",
+            border_style="bold deep_sky_blue2",
         )
         reason_panel = Panel(
             Text().join(reason_lines),
-            title="Analysis Results",
-            border_style="red",
+            box=box.ROUNDED,
+            title=f"[bold red]Tool Call Errors[/bold red]",
+            border_style="bold red",
         )
+        table = self._parse_enhanced_metrics(test_case_name=test_case_name)
+        if table:
+            group = Group(conversation_panel, reason_panel, table)
+        else:
+            group = Group(conversation_panel, reason_panel)
 
-        return Group(
-            conversation_panel,
-            reason_panel,
-        )
+        return group
 
     def analyze(self, config: AnalyzeConfig):
         """
@@ -250,59 +560,15 @@ class Analyzer:
        :param config: AnalyzeConfig object containing user provided paths for analysis.
        """
 
-        def get_summary(summary_file_name: str = "summary_metrics.csv"):
-            summary = []
-
-            path_to_summary_file = os.path.join(
-                config.data_path, summary_file_name
-            )
-
-            with open(path_to_summary_file, "r") as f:
-                reader = csv.reader(f)
-                header = next(reader)
-                for row in reader:
-                    summary.append(dict(zip(header, row)))
-
-            return summary
-
-        def get_test_messages(test_case_name):
-            test_messages = []
-            meta = {}
-
-            test_case_path = os.path.join(
-                config.data_path,
-                "messages",
-                f"{test_case_name}.messages.analyze.json",
-            )
-
-            with open(test_case_path, "r", encoding="utf-8") as f:
-                temp = json.load(f)
-                if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
-                    meta = temp[-1]["meta"]
-                    temp = temp[:-1]
-
-                for entry in temp:
-                    msg = ExtendedMessage(**entry)
-                    test_messages.append(msg)
-
-            return test_messages, meta
-
-        def get_metrics(test_case_name):
-            test_metrics_path = os.path.join(
-                config.data_path, "messages", f"{test_case_name}.metrics.json"
-            )
-
-            with open(test_metrics_path, "r", encoding="utf-8") as f:
-                metrics = ToolCallAndRoutingMetrics(**json.load(f))
-
-            return metrics
-
-        summary = get_summary()
+        test_case_resources = TestCaseResources(config.data_path)
+        summary = test_case_resources.get_summary
 
         test_case_with_failed_tools = self._get_test_case_with_failed_tools(
            summary=summary
        )
 
+        output_panels = []
+
        if len(test_case_with_failed_tools) == 0:
            header_table = Table(show_header=False, box=None)
 
@@ -313,7 +579,7 @@ class Analyzer:
                title="[bold green]📋 Analysis Summary[/bold green]",
            )
 
-            pretty_print(panel)
+            output_panels.append(panel)
 
        messages_dir = os.path.join(config.data_path, "messages")
 
@@ -340,100 +606,17 @@ class Analyzer:
            if dataset_base in processed_parents:
                continue
 
-            run_map = list_run_files(messages_dir, dataset_base)
+            run_map = list_run_files(messages_dir, dataset_base, config.run)
 
            # ---- SINGLE RUN (legacy or run1 only) ----
            if not run_map or len(run_map) == 1:
-                if not run_map:
-                    # Legacy single-run files
-                    test_messages, meta = get_test_messages(
-                        test_case_name=dataset_base
+                runs_performed = 1
+                test_messages, meta, metrics, runs_problematic = (
+                    self._single_run(
+                        test_case_name=dataset_base,
+                        run_map=run_map,
+                        test_cases_resource=test_case_resources,
                    )
-                    metrics: ToolCallAndRoutingMetrics = get_metrics(
-                        test_case_name=dataset_base
-                    )
-                    runs_performed = 1
-                else:
-                    run_id = next(iter(run_map))
-                    paths = run_map[run_id]
-                    runs_performed = 1
-                    if not paths["metrics"]:
-                        pretty_print(
-                            f"❌ {dataset_base}.run{run_id} — metrics file missing.",
-                            style="bold red",
-                        )
-                        # Count it as analyzed & problematic
-                        processed_parents.add(dataset_base)
-                        ds_table = Table(show_header=False, box=None)
-                        ds_table.add_row("Type: Single-run")
-                        ds_table.add_row("Status: ❌ Problematic")
-                        pretty_print(
-                            Panel(
-                                ds_table,
-                                title=f"📋 Analysis Summary — {dataset_base}",
-                                border_style="green",
-                            )
-                        )
-                        overall_runs_performed += 1
-                        overall_runs_problematic += 1
-                        add_line_seperator(self._generate_style_config())
-                        continue
-
-                    metrics = load_run_metrics(paths["metrics"])
-                    meta = {}
-
-                    if paths["analyze"]:
-                        with open(paths["analyze"], "r", encoding="utf-8") as f:
-                            raw = json.load(f)
-                            if (
-                                raw
-                                and isinstance(raw[-1], dict)
-                                and "meta" in raw[-1]
-                            ):
-                                meta = raw[-1]["meta"]
-                                raw = raw[:-1]
-                            test_messages = [
-                                ExtendedMessage(**entry) for entry in raw
-                            ]
-                    else:
-                        test_messages, meta = [], {}
-
-                # --- compute status uniformly (legacy & run1) ---
-                had_incorrect_param = (
-                    hasattr(metrics, "tool_calls_with_incorrect_parameter")
-                    and float(metrics.tool_calls_with_incorrect_parameter or 0)
-                    > 0
-                )
-                low_precision = (
-                    hasattr(metrics, "tool_call_precision")
-                    and float(
-                        metrics.tool_call_precision
-                        if metrics.tool_call_precision is not None
-                        else 1.0
-                    )
-                    < 1.0
-                )
-                low_recall = (
-                    hasattr(metrics, "tool_call_recall")
-                    and float(
-                        metrics.tool_call_recall
-                        if metrics.tool_call_recall is not None
-                        else 1.0
-                    )
-                    < 1.0
-                )
-                runs_problematic = (
-                    1
-                    if (
-                        (
-                            hasattr(metrics, "is_success")
-                            and not metrics.is_success
-                        )
-                        or had_incorrect_param
-                        or low_precision
-                        or low_recall
-                    )
-                    else 0
                )
 
                processed_parents.add(dataset_base)
@@ -445,14 +628,6 @@
                    "❌ Problematic" if runs_problematic else "✅ No problems"
                )
                ds_table.add_row(f"Status: {status}")
-                pretty_print(
-                    Panel(
-                        ds_table,
-                        title=f"📋 Analysis Summary — {dataset_base}",
-                        border_style="green",
-                    )
-                )
-
                # Update overall counters/averages
                overall_runs_performed += runs_performed
                overall_runs_problematic += runs_problematic
@@ -469,19 +644,43 @@
                    1 if bool(metrics.is_success) else 0
                )
 
-                # Replay details only if problematic
-                if runs_problematic:
-                    pretty_print(
+                header_group = Group(
+                    *[
+                        ds_table,
                        self._create_header_analysis_panel(
                            dataset_base, metrics
+                        ),
+                    ],
+                )
+                border_style = "bold red" if runs_problematic else "bold green"
+                header_panel = Panel(
+                    header_group,
+                    title=f"[b]📋 Analysis Summary — {dataset_base}[/b]",
+                    border_style=border_style,
+                )
+                output_panels.append(header_panel)
+
+                if runs_problematic:
+                    output_panels.append(
+                        self.render(
+                            test_messages,
+                            config.tool_definition_path,
+                            meta,
+                            test_case_name=dataset_base,
                        )
                    )
-                    pretty_print(
-                        self.render(
-                            test_messages, config.tool_definition_path, meta
+                    output_panels.append(
+                        add_line_seperator(
+                            self._generate_style_config(), print=False
+                        )
+                    )
+
+                else:
+                    output_panels.append(
+                        add_line_seperator(
+                            self._generate_style_config(), print=False
                        )
                    )
-                add_line_seperator(self._generate_style_config())
 
                continue
 
@@ -555,31 +754,32 @@
                    }
                )
 
-            # Print the dataset panel FIRST with both lines inside
-            ds_table = Table(show_header=False, box=None)
-            ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
-            ds_table.add_row(
-                f"Runs with problems: {runs_problematic} / {runs_performed}"
-            )
-            status = (
-                "❌ Problematic" if runs_problematic > 0 else "✅ No problems"
-            )
-            ds_table.add_row(f"Status: {status}")
-            pretty_print(
-                Panel(
-                    ds_table,
-                    title=f"📋 Analysis Summary — {dataset_base}",
-                    border_style="green",
-                )
-            )
-
            # Second pass: now replay only the problematic runs (so summary stays at the top)
            for item in deferred_runs:
-                pretty_print(
-                    self._create_header_analysis_panel(
-                        item["title"], item["metrics"]
+                ds_table = Table(show_header=False, box=None)
+                ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
+                ds_table.add_row(
+                    f"Runs with problems: {runs_problematic} / {runs_performed}"
+                )
+                status = (
+                    "❌ Problematic"
+                    if runs_problematic > 0
+                    else "✅ No problems"
+                )
+                ds_table.add_row(f"Status: {status}")
+                header_table = self._create_header_analysis_panel(
+                    item["title"], item["metrics"]
+                )
+
+                group = Group(*[ds_table, header_table])
+                output_panels.append(
+                    Panel(
+                        group,
+                        title=f"📋 Analysis Summary — {dataset_base}",
+                        border_style="green",
                    )
                )
+
                if item["analyze_path"]:
                    with open(item["analyze_path"], "r", encoding="utf-8") as f:
                        raw = json.load(f)
@@ -589,12 +789,16 @@
                            raw = raw[:-1]
                test_messages = [ExtendedMessage(**entry) for entry in raw]
 
-                pretty_print(
+                output_panels.append(
                    self.render(
                        test_messages, config.tool_definition_path, meta
                    )
                )
-                add_line_seperator(self._generate_style_config())
+                output_panels.append(
+                    add_line_seperator(
+                        self._generate_style_config(), print=False
+                    )
+                )
 
            # Update overall aggregates
            overall_runs_performed += runs_performed
@@ -624,9 +828,9 @@
            )
            overall_lines.append(f"Avg journey success: {js_pct}%")
        else:
-            overall_lines.append("Avg journey success: N/A")
+            overall_lines.append(f"Avg journey success: N/A")
 
-        pretty_print(
+        output_panels.append(
            Panel(
                Text("\n".join(overall_lines)),
                title="📋 Overall Summary",
@@ -634,6 +838,11 @@
            )
        )
 
+        console = Console()
+        with console.pager(styles=True):
+            for panel in output_panels:
+                console.print(panel, overflow="crop")
+
    def _create_header_analysis_panel(
        self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
    ) -> Panel:
@@ -649,86 +858,338 @@
        header_table.add_row(f"Text Match: {metrics.text_match.value}")
        header_table.add_row(f"Journey Success: {metrics.is_success}")
 
-        header_panel = Panel(
-            header_table, title="[bold green]Test Case Summary[/bold green]"
-        )
+        return header_table
 
-        return header_panel
 
-    def _get_test_case_with_failed_tools(self, summary) -> List:
+class AnalyzerEnhanced(AnalyzerBase):
+    PARAMETER_DOCUMENTATION = "PARAMETER_DOCUMENTATION"
+    TOOL_USAGE_EXAMPLES = "TOOL_USAGE_EXAMPLES"
+    TOOL_DOCUMENTATION = "TOOL_DOCUMENTATION"
 
-        test_case_with_failed_tools = []
+    DEFAULT_GENERATION_PARAMS = {
+        "min_new_tokens": 0,
+        "decoding_method": "greedy",
+        "max_new_tokens": 10_000,
+        "random_seed": 42,
+    }
 
-        for entry in summary:
-            test_case_name = entry["dataset_name"]
+    def __init__(self):
+        super().__init__()
 
-            if test_case_name.lower().strip() == "summary (average)":
+    def _deduplicate_tool_call_failures(self, messages: List[ExtendedMessage]):
+        """If there are multiple failures from the same tool, then choose the failure that occurs later in the trajectory
+
+        ex.
+        1. Tool A fails
+        2. Tool A Error response
+        3. Tool A call again which fails
+        4. Tool A error response
+
+        For the analysis, we analyze the second time the tool call fails, with the previous messages serving as context.
+
+        """
+        tool_indices = []
+        seen_tools = set()
+
+        for idx, message in enumerate(reversed(messages)):
+            if self._is_failed_tool_call(message):
+                content = json.loads(message.message.content)
+                tool_call_name = content["name"]
+                if tool_call_name not in seen_tools:
+                    seen_tools.add(tool_call_name)
+                    tool_indices.append(len(messages) - 1 - idx)
+
+        return sorted(tool_indices)
+
+    def process_messages(self, task_name, test_case, tools, messages):
+        eval = ReferencelessEvaluation(
+            api_spec=tools,
+            model_id=MODEL_ID,
+            task_n=task_name,
+            dataset_name=test_case,
+            runtime_pipeline=False,
+            generation_params=AnalyzerEnhanced.DEFAULT_GENERATION_PARAMS,
+        )
+
+        processed_data = [
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
+            for msg in messages
+        ]
+
+        context = processed_data[:-1]
+        tool_call = processed_data[
+            -1
+        ]  # assume that the message is the last tool call
+        tool_call_msg = json.loads(tool_call["content"])
+        call = ReferencelessEvaluation.fmt_tool_call(
+            tool_id=tool_call_msg.get("id", "1"),
+            tool_call_name=tool_call_msg["name"],
+            arguments=json.dumps(tool_call_msg["args"]),
+            context=context,
+        )
+        return test_case, eval.run([call])
+
+    def _extract_semantic_metrics(
+        self, metrics_dictionary, annotation_filters: Optional[List[str]]
+    ):
+        semantic_analysis = []
+        for metric_data in metrics_dictionary.values():
+            raw_response = metric_data.get("raw_response")
+            if raw_response is None:
                continue
 
-            is_success = str(entry["is_success"]).strip().lower() == "true"
+            is_correct = metric_data.get("is_correct", False)
+            if is_correct:
+                continue
 
-            tip = float(
-                entry.get("tool_calls_with_incorrect_parameter", 0) or 0
+            failed_semantic_test_case = ReferencelessEvalParser.semantic_parser(
+                metric_name=metric_data.get("metric_name"),
+                data=raw_response,
+                annotation_filters=annotation_filters,
            )
-            tcp = float(entry.get("tool_call_precision", 1) or 1)
-            tcr = float(entry.get("tool_call_recall", 1) or 1)
 
-            # Apply the 4 checks
-            if (not is_success) or (tip > 0) or (tcp < 1.0) or (tcr < 1.0):
-                test_case_with_failed_tools.append(entry)
+            semantic_analysis.append(failed_semantic_test_case)
 
-            return test_case_with_failed_tools
+        return semantic_analysis
 
-    def _get_tools_not_found_in_source(
-        self,
-        tools_to_analyze: List[str],
-        failing_tool_definitions: List[ToolDefinition],
-    ) -> Set[str]:
+    def tool_enrichment_view(self, results):
+        enhanced_metrics = []
+        tool_enrichment_metrics = defaultdict(list)
+        for result in results:
+            for test_case, eval_results in result.items():
+                for result in eval_results:
+                    # for metric in result:
+                    failed_static_metrics = []
+                    parameter_annotations = []
+                    tool_annotations = []
 
-        return set(tools_to_analyze) - {
-            tool_def.tool_name for tool_def in failing_tool_definitions
-        }
+                    static_metrics_passed = result.get("static", {}).get(
+                        "final_decision", False
+                    )
+                    tool_call_obj = result.get("inputs", {}).get(
+                        "tool_call", {}
+                    )
 
-    def _analyze_tool_definition(
-        self,
-        inspector: DescriptionQualityInspector,
-        tool_definition: ToolDefinition,
-        tool_definition_path: str,
-    ) -> List[Text]:
+                    if static_metrics_passed:
+                        semantic_metrics = result.get("semantic")
+                        function_selection_metrics = semantic_metrics.get(
+                            "function_selection", {}
+                        ).get("metrics", {})
+                        tool_annotations = self._extract_semantic_metrics(
+                            function_selection_metrics,
+                            [
+                                AnalyzerEnhanced.TOOL_DOCUMENTATION,
+                                AnalyzerEnhanced.TOOL_USAGE_EXAMPLES,
+                            ],
+                        )
 
-        tool_name = tool_definition.tool_name
-        tool_desc = tool_definition.tool_description
+                        general_metrics = semantic_metrics.get(
+                            "general", {}
+                        ).get("metrics", {})
+                        parameter_annotations = self._extract_semantic_metrics(
+                            general_metrics,
+                            [AnalyzerEnhanced.PARAMETER_DOCUMENTATION],
+                        )
+                    else:
+                        static_metrics = result.get("static").get("metrics")
+                        failed_static_metrics = (
+                            ReferencelessEvalParser.static_parser(
+                                static_metrics=static_metrics
+                            )
+                        )
 
-        tool_analysis = []
+                    parsed_metrics = {
+                        "tool_name": tool_call_obj.get("function", {}).get(
+                            "name"
+                        ),
+                        "parameter_annotations": parameter_annotations,
+                        "tool_annotations": tool_annotations,
+                        "static_metrics": failed_static_metrics,
+                    }
+                    tool_enrichment_metrics[test_case].append(parsed_metrics)
+
+        for test_case, metrics in tool_enrichment_metrics.items():
+            failed_tools = [metric["tool_name"] for metric in metrics]
+            parameter_annotations = [
+                metric["parameter_annotations"] for metric in metrics
+            ]
+            tool_annotation = [metric["tool_annotations"] for metric in metrics]
+            static_metrics = [metric["static_metrics"] for metric in metrics]
+
+            # don't add to final metrics array if there were no annotations
+            if (
+                not any(parameter_annotations)
+                and not any(tool_annotation)
+                and not any(static_metrics)
+            ):
+                continue
 
-        # missing description
-        if tool_desc is None:
-            tool_analysis.extend(
-                IncorrectParameterUtils.format_missing_description_message(
-                    tool_name=tool_name,
-                    tool_definition_path=tool_definition_path,
+            enhanced_metrics.append(
+                EnhancedAnalyzeMetrics(
+                    test_case_name=test_case,
+                    parameter_annotations=parameter_annotations,
+                    tool_annotations=tool_annotation,
+                    tool_names=failed_tools,
+                    static_metrics=static_metrics,
                )
            )
-            return tool_analysis
 
-        # bad description
-        if inspector.detect_bad_description(tool_definition):
-            tool_analysis.extend(
-                IncorrectParameterUtils.format_bad_description_message(
-                    tool_name=tool_name, tool_desc=tool_desc
-                )
+        return enhanced_metrics
+
+    def analyze(
+        self, config: AnalyzeConfig
+    ) -> Optional[List[EnhancedAnalyzeMetrics]]:
+        start = time.time()
+        all_tools = ToolExtractionOpenAIFormat.from_path(
+            config.tool_definition_path
+        )
+        messages_dir = os.path.join(config.data_path, "messages")
+        test_case_resources = TestCaseResources(config.data_path)
+
+        failed_test_cases = {}
+        for test_case in test_case_resources.get_summary:
+            if test_case["dataset_name"] in failed_test_cases:
+                continue
+            run_map = list_run_files(
+                messages_dir, test_case["dataset_name"], config.run
            )
-            return tool_analysis
+            if run_map and config.run == -1:
+                rich.print(
+                    "[red]Enhanced Mode only operates on a single run for a dataset. Since there are multiple runs, set the `--run` flag to the specific run for enhanced analysis."
+                )
+                # run the first run in the config map
+                rich.print(
+                    f"[b]Defaulting to run {next(iter(run_map))} to analyze for {test_case['dataset_name']}"
+                )
+                config.run = next(iter(run_map))
+                run_map = {config.run: run_map.get(config.run)}
 
-        # good description
-        tool_analysis.append(
-            is_ok(
-                message=f"The description for the `{tool_name}` looks sufficient."
+            _, _, _, run_problematic = self._single_run(
+                test_case["dataset_name"], run_map, test_case_resources
            )
+            if run_problematic:
+                if run_files := run_map.get(config.run):
+                    failed_test_cases[test_case["dataset_name"]] = run_files
+
+                else:
+                    # legacy runs without n runs
+                    # tranform the legacy runs into the same data structure from `list_files`
+
+                    messages_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.messages.json",
+                    )
+
+                    analyze_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.messages.analyze.json",
+                    )
+
+                    metrics_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.metrics.json",
+                    )
+
+                    failed_test_cases[test_case["dataset_name"]] = {
+                        "analyze": analyze_path,
+                        "messages": messages_path,
+                        "metrics": metrics_path,
+                    }
+
+        max_workers = config.num_workers
+        rich.print(
+            f"[bold green]INFO:[/bold green] Number of workers set to: {max_workers}"
        )
-        return tool_analysis
+
+        jobs = []
+
+        with ThreadPoolExecutor(
+            max_workers=max_workers, thread_name_prefix="[Worker]"
+        ) as pool:
+            aggregate_results = []
+            for test_case, file_mapping in failed_test_cases.items():
+                analyze_messages, _ = test_case_resources.get_analyze_messages(
+                    path=file_mapping["analyze"]
+                )
+                idx_failed_tool_calls = self._deduplicate_tool_call_failures(
+                    analyze_messages
+                )
+                messages = test_case_resources.get_messages(
+                    path=file_mapping["messages"]
+                )
+
+                for idx in idx_failed_tool_calls:
+                    jobs.append(
+                        {
+                            "task_name": f"{test_case}-0-{idx + 1}",
+                            "test_case": test_case,
+                            "tools": all_tools,
+                            "messages": messages[0 : idx + 1],
+                        }
+                    )
+            jobs = sorted(jobs, key=lambda x: len(x["messages"]))
+            futures = [
+                pool.submit(
+                    self.process_messages,
+                    job["task_name"],
+                    job["test_case"],
+                    job["tools"],
+                    job["messages"],
+                )
+                for job in jobs
+            ]
+
+            if futures:
+                with Progress() as progress:
+                    task = progress.add_task(
+                        f"[purple]Evaluating {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    for future in as_completed(futures):
+                        try:
+                            test_case, results = future.result()
+                            aggregate_results.append({test_case: results})
+                        except Exception as e:
+                            rich.print(
+                                f"test case, {test_case} ,fails with {e}"
+                            )
+                            traceback.print_exc()
+                        finally:
+                            progress.update(task, advance=1)
+
+        enhanced_metrics = self.tool_enrichment_view(aggregate_results)
+        end = time.time()
+        rich.print(f"Enhanced Analysis took {end - start} s")
+
+        return enhanced_metrics
+
+    def render(self):
+        raise NotImplementedError("Not implemented")
+
+
+def run(args):
+    d = DescriptionQualityAnalyzer()
+    if args.mode == AnalyzeMode.enhanced:
+        if GATE_TOOL_ENRICHMENTS:
+            d.analyze(args)
+
+        enhanced = AnalyzerEnhanced()
+        enhanced_metrics = enhanced.analyze(config=args)
+        dummy_analyzer = Analyzer(enhanced_metrics, d)
+        dummy_analyzer.analyze(args)
+
+    else:
+        dummy_analyzer = Analyzer()
+        dummy_analyzer.analyze(args)
 
 
 if __name__ == "__main__":
-    dummy_analyzer = Analyzer()
-    dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))
+    args = CLI(AnalyzeConfig, as_positional=False)
+    run(args)