ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/analyze_run.py

@@ -1,142 +1,489 @@
- import csv
  import json
  import os
+ import re
+ import time
+ import traceback
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
- from typing import Dict, List, Optional, Set
+ from threading import Lock
+ from typing import Dict, List, Optional, Set, Tuple

+ import rich
  from jsonargparse import CLI
- from rich.console import Group
+ from rich import box
+ from rich.console import Console, Group
  from rich.panel import Panel
+ from rich.progress import Progress
+ from rich.rule import Rule
  from rich.style import Style
  from rich.table import Table
  from rich.text import Text

- from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
+ from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode
  from wxo_agentic_evaluation.description_quality_checker import (
      DescriptionQualityInspector,
  )
- from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
+ from wxo_agentic_evaluation.metrics.metrics import (
+     DescriptionQuality,
+     DescriptionQualityMetric,
+     EnhancedAnalyzeMetrics,
+     TextMatchType,
+     ToolCallAndRoutingMetrics,
+ )
+ from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
+ from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
  from wxo_agentic_evaluation.type import (
      ContentType,
      ExtendedMessage,
+     Message,
      ToolDefinition,
  )
- from wxo_agentic_evaluation.utils.rich_utils import (
-     IncorrectParameterUtils,
-     is_ok,
-     pretty_print,
-     print_done,
-     warn,
+ from wxo_agentic_evaluation.utils import (
+     N_A,
+     ReferencelessEvalParser,
+     TestCaseResources,
+     ToolExtractionOpenAIFormat,
+     add_line_seperator,
+     list_run_files,
+     load_run_metrics,
+ )
+
+ MODEL_ID = "meta-llama/llama-3-405b-instruct"
+ GATE_TOOL_ENRICHMENTS = (
+     os.getenv("GATE_TOOL_ENRICHMENTS", "true").lower().strip() == "true"
  )
- from wxo_agentic_evaluation.utils.utils import add_line_seperator
+ LOCK = Lock()
+
+
+ class AnalyzerBase(ABC):
+     @abstractmethod
+     def analyze(self, config: AnalyzeConfig):
+         pass
+
+     @abstractmethod
+     def render(self):
+         pass
+
+     def _is_failed_tool_call(self, message: ExtendedMessage):
+         if message.reason and message.message.type == ContentType.tool_call:
+             if (
+                 reason := message.reason.get("reason")
+             ) and reason != "irrelevant tool call":
+                 return True
+
+     def _single_run(
+         self, test_case_name, run_map, test_cases_resource: TestCaseResources
+     ):
+         if not run_map:
+             # Legacy single-run files
+             test_messages, meta = test_cases_resource.get_analyze_messages(
+                 test_case_name=test_case_name
+             )
+             metrics: ToolCallAndRoutingMetrics = (
+                 test_cases_resource.get_test_metrics(
+                     test_case_name=test_case_name
+                 )
+             )
+         else:
+             run_id = next(iter(run_map))
+             paths = run_map[run_id]
+             metrics = test_cases_resource.get_test_metrics(
+                 path=paths["metrics"]
+             )
+             test_messages, meta = test_cases_resource.get_analyze_messages(
+                 path=paths["analyze"]
+             )
+
+         # --- compute status uniformly (legacy & run1) ---
+         runs_problematic = self._is_failed_test_case(metrics)
+
+         return test_messages, meta, metrics, runs_problematic
+
+     def _is_failed_test_case(self, data) -> bool:
+         """
+         True -> test case failed
+         False -> test success
+         """
+
+         # not ideal if statement
+         # in the future, refactor so this if statement is not needed
+         # this if statement is needed because this function is called in two cases:
+         # 1. if data is an instance ToolCallAndRoutingMetrics
+         # 2. if data is a row in the summary table (dictionary)
+
+         # ideal the SummaryMetrics should be parsed into pydantic class as well
+
+         if isinstance(data, ToolCallAndRoutingMetrics):
+             is_success = data.is_success
+             had_incorrect_param = data.tool_calls_with_incorrect_parameter > 0
+             low_precision = float(data.tool_call_precision) < 1.0
+             low_recall = float(data.tool_call_recall) < 1.0
+         else:
+             is_success = str(data["is_success"]).strip().lower() == "true"
+             had_incorrect_param = (
+                 float(data.get("tool_calls_with_incorrect_parameter", 0) or 0)
+                 > 0
+             )
+             low_precision = float(data.get("tool_call_precision", 1) or 1) < 1.0
+             low_recall = float(data.get("tool_call_recall", 1) or 1) < 1.0
+
+         return (
+             not is_success or had_incorrect_param or low_precision or low_recall
+         )
+
+     def _get_test_case_with_failed_tools(self, summary) -> List[str]:
+         test_case_with_failed_tools = []
+
+         for entry in summary:
+             test_case_name = entry["dataset_name"]

+             if test_case_name.lower().strip() == "summary (average)":
+                 continue
+
+             if self._is_failed_test_case(entry):
+                 test_case_with_failed_tools.append(entry)

- class Analyzer:
+         return test_case_with_failed_tools
+
+
+ class DescriptionQualityAnalyzer(AnalyzerBase):
      def __init__(self):
-         self.analysis_cache: Dict[str, List[Text]] = (
-             {}
-         )  # the failing tools cached here won't be re-analyzed.
+         self.analysis_cache: Dict[str, DescriptionQualityMetric] = {}
          # tool_name -> description analysis
+         self.missing_tools = set()
+         self.tools_not_found = set()

-     @staticmethod
-     def _generate_style_config():
-         return Style(
-             color="magenta",
-             blink=True,
-             bold=True,
-         )
+     def _get_tools_not_found_in_source(
+         self,
+         tools_to_analyze: List[str],
+         failing_tool_definitions: List[ToolDefinition],
+     ) -> Set[str]:

-     def _split_cache(
-         self, failing_tools: Set[str]
-     ) -> tuple[List[str], List[Text]]:
+         return set(tools_to_analyze) - {
+             tool_def.tool_name for tool_def in failing_tool_definitions
+         }

-         tools_to_analyze: List[str] = []
-         cached_lines: List[Text] = []
-         tools_analyzed: List[str] = []
+     def _failing_tool_from_messages(self, messages: List[ExtendedMessage]):
+         failed_tool_calls = set()
+         for message in messages:
+             if self._is_failed_tool_call(message):
+                 content = json.loads(message.message.content)
+                 tool_call_name = content["name"]
+                 failed_tool_calls.add(tool_call_name)
+
+         return failed_tool_calls
+
+     def failing_tools(self, data_path):
+         messages_dir = os.path.join(data_path, "messages")
+         test_case_resources = TestCaseResources(data_path)
+         processed_test_cases = set()
+         failed_tool_calls = set()
+
+         for test_case in test_case_resources.get_summary:
+             dataset_name = test_case["dataset_name"]
+             if dataset_name in processed_test_cases:
+                 continue
+             processed_test_cases.add(dataset_name)

-         for tool_name in sorted(failing_tools):
-             cached_analysis = self.analysis_cache.get(tool_name)
-             if cached_analysis:
-                 cached_lines.extend(cached_analysis)
-                 tools_analyzed.append(tool_name)
-             else:
-                 tools_to_analyze.append(tool_name)
+             run_map = list_run_files(messages_dir, test_case["dataset_name"])

-         if tools_analyzed:
-             pretty_print(
-                 content=f"ℹ️ Loading cached analysis since these failing tools: {tools_analyzed} have been analyzed previously.",
-                 style="bold cyan",
-             )
+             if not run_map:
+                 test_messages, _ = test_case_resources.get_analyze_messages(
+                     test_case_name=dataset_name
+                 )
+                 failed_tool_calls.update(
+                     self._failing_tool_from_messages(test_messages)
+                 )

-         return (tools_to_analyze, cached_lines)
+             else:
+                 for paths in run_map.values():
+                     test_messages, _ = test_case_resources.get_analyze_messages(
+                         path=paths["analyze"]
+                     )
+                     failed_tool_calls.update(
+                         self._failing_tool_from_messages(test_messages)
+                     )
+
+         return failed_tool_calls

      def analyze_failing_tool_description_quality(
          self,
          inspector: DescriptionQualityInspector,
          tool_definition_path: str,
          failing_tools: Set[str],
-     ) -> List[Text]:
+     ) -> Tuple[List[DescriptionQualityMetric], List[str]]:
          """
          :param tool_definition_path: Path to the tool definition file.
          :param failing_tools: Set of tool names that failed.
-         :return: List of rich `Text` objects containing feedback for the customer.
+         :return: A tuple where the first item in the tuple is List[DescriptionQualityMetric] for failed tools that were analyzed,
+         the second item in the list is a list of missing tools
          """

-         pretty_print(
-             content=f"⚙️ Checking tool description quality for failing tools: {sorted(failing_tools)}",
-             style="bold cyan",
+         failing_tool_definitions: List[ToolDefinition] = (
+             inspector.extract_tool_desc_from_tool_source(
+                 Path(tool_definition_path),
+                 failing_tools,
+             )
          )

-         analysis_for_display: List[Text] = []
+         if not failing_tool_definitions:
+             """
+             No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
+             """
+             with Lock:
+                 self.tools_not_found.add(failing_tools)
+
+         missing_tools = self._get_tools_not_found_in_source(
+             failing_tools, failing_tool_definitions
+         )
+         for tool_definition in failing_tool_definitions:
+             tool_analysis = inspector.detect_bad_description(tool_definition)
+             with LOCK:
+                 self.analysis_cache[tool_definition.tool_name] = tool_analysis
+                 self.missing_tools.update(missing_tools)
+
+         return 1
+
+     def analyze(self, config):
+         failing_tools = self.failing_tools(config.data_path)
+         inspector = DescriptionQualityInspector()
+         tool_definition_path = config.tool_definition_path
+
+         with ThreadPoolExecutor(
+             max_workers=config.num_workers, thread_name_prefix="[Worker]"
+         ) as pool:
+             futures = [
+                 pool.submit(
+                     self.analyze_failing_tool_description_quality,
+                     inspector,
+                     tool_definition_path,
+                     [failing_tool],
+                 )
+                 for failing_tool in failing_tools
+             ]
+
+             if futures:
+                 if not LOGGING_ENABLED:
+                     progress = Progress()
+                     task = progress.add_task(
+                         f"[purple]Analyzing description quality for {len(futures)} tasks...",
+                         total=len(futures),
+                     )
+                     progress.start()
+                 for future in as_completed(futures):
+                     try:
+                         future.result()
+                     except Exception:
+                         traceback.print_exc()
+                     finally:
+                         if not LOGGING_ENABLED:
+                             progress.update(task, advance=1)

-         # Step 1: get tools not yet analyzed and cached analysis for tools analyzed previously
-         tools_to_analyze, cached_analysis = self._split_cache(failing_tools)
-         if cached_analysis:
-             analysis_for_display.extend(cached_analysis)
+             if not LOGGING_ENABLED:
+                 progress.stop()

-         # Step 2: analyze cache misses
-         if tools_to_analyze:
+     def render(self):
+         raise NotImplementedError("Not implemented")

-             failing_tool_definitions: List[ToolDefinition] = (
-                 inspector.extract_tool_desc_from_tool_source(
-                     Path(tool_definition_path),
-                     tools_to_analyze,
+
+ class Analyzer(AnalyzerBase):
+     def __init__(
+         self,
+         enhanced_metrics: Optional[List[EnhancedAnalyzeMetrics]] = None,
+         description_quality_analyzer: DescriptionQualityAnalyzer = None,
+     ):
+         self.enhanced_metrics = enhanced_metrics
+         self.enhanced_metrics_idx_map = {}
+
+         if self.enhanced_metrics:
+             # do some post-processing on the enhanced metrics
+             # create a mapping between test case name and index
+             if self.enhanced_metrics:
+                 for idx, metric in enumerate(self.enhanced_metrics):
+                     self.enhanced_metrics_idx_map[metric.test_case_name] = idx
+
+         self.description_quality_analyzer = description_quality_analyzer
+
+     @staticmethod
+     def _generate_style_config():
+         return Style(
+             color="magenta",
+             blink=True,
+             bold=True,
+         )
+
+     def _parse_enhanced_metrics(self, test_case_name) -> Optional[Table]:
+         table = Table(
+             box=box.ROUNDED,
+             show_lines=True,
+         )
+
+         columns = [
+             "Tool Name",
+             "Root Cause Analysis",
+             "Docstring Recommendations",
+         ]
+
+         rows = []
+
+         if (
+             self.enhanced_metrics
+             and (index := self.enhanced_metrics_idx_map.get(test_case_name))
+             is not None
+         ):
+             enhanced_metric: EnhancedAnalyzeMetrics = self.enhanced_metrics[
+                 index
+             ]
+
+             for idx, tool_call in enumerate(enhanced_metric.tool_names):
+                 static_root_causes = []
+                 parsed_tool_annotations = []
+                 param_annotations = defaultdict(list)
+
+                 row = [tool_call]
+
+                 # if this is true, then there are no semantic metrics
+                 static_root_causes = [
+                     Text(item.explanation)
+                     for item in enhanced_metric.static_metrics[idx]
+                 ]
+
+                 static_root_causes = Text().join(static_root_causes)
+
+                 # Parameter Root Cause
+                 parameter_annotations = enhanced_metric.parameter_annotations[
+                     idx
+                 ]
+                 formatted_param_root_cause = [
+                     Text(metric.explanation) for metric in parameter_annotations
+                 ]
+                 formatted_param_root_cause = Text().join(
+                     formatted_param_root_cause
                  )
-             )

-             if not failing_tool_definitions:
-                 analysis_for_display.append(
-                     warn(
-                         message=f"No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
-                     )
+                 # Tool Root Cause
+                 tool_annotations = enhanced_metric.tool_annotations[idx]
+                 formatted_tool_root_cause = [
+                     Text(metric.explanation) for metric in tool_annotations
+                 ]
+                 formatted_tool_root_cause = Text().join(
+                     formatted_tool_root_cause
                  )
-                 return analysis_for_display

-             missing_tools = self._get_tools_not_found_in_source(
-                 tools_to_analyze, failing_tool_definitions
-             )
-             if missing_tools:
-                 analysis_for_display.append(
-                     warn(
-                         message=f"Missing tool definitions for failed tools: '{missing_tools}' in the file: '{tool_definition_path}'"
+                 if formatted_param_root_cause or formatted_tool_root_cause:
+                     root_cause = (
+                         formatted_tool_root_cause
+                         if len(formatted_tool_root_cause)
+                         > len(formatted_param_root_cause)
+                         else formatted_param_root_cause
                      )
+                 elif static_root_causes:
+                     root_cause = static_root_causes
+                 else:
+                     root_cause = N_A
+
+                 row.append(root_cause)
+
+                 # Parameter Level Docstring
+                 for metric in parameter_annotations:
+                     if annotations := metric.annotations:
+                         for annotation in annotations:
+                             param_annotations[annotation.parameter_name].append(
+                                 f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                             )
+
+                 newline = "\n"
+                 param_annotations = [
+                     f"- [b]{param_name}:[/b] {newline.join(doc_string)}"
+                     for param_name, doc_string in param_annotations.items()
+                 ]
+                 param_annotations = "\n".join(param_annotations)
+
+                 # Tool Level Docstring
+                 for metric in tool_annotations:
+                     if annotations := metric.annotations:
+                         for annotation in annotations:
+                             parsed_tool_annotations.append(
+                                 f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                             )
+                 parsed_tool_annotations = "\n".join(parsed_tool_annotations)
+                 docstring_cell = Table(
+                     show_lines=False, show_header=False, box=None
                  )
+                 add_divider = False
+
+                 # - Gate the Doc String Enrichments.
+                 # - Ensure the environment variable is enabled.
+                 if GATE_TOOL_ENRICHMENTS and self.description_quality_analyzer:
+                     # check if tool in cache
+                     tool_description_analysis = (
+                         self.description_quality_analyzer.analysis_cache.get(
+                             tool_call
+                         )
+                     )
+                     is_missing_tool = (
+                         tool_call
+                         in self.description_quality_analyzer.missing_tools
+                     )  # tool call not in tool_definition_path
+                     # failed tool call that failed to get extracted from the tool_definition_path because of missing `@tool` decorator
+                     # TODO: figure out if this edge is needed? taken from original Analyze implementation
+                     tool_not_found = (
+                         tool_call
+                         in self.description_quality_analyzer.tools_not_found
+                     )

-             for tool_definition in failing_tool_definitions:
+                     # If the tool_call is in `missing_tools`, don't show the annotations
+                     if is_missing_tool or tool_not_found:
+                         parsed_tool_annotations = []
+                         param_annotations = []
+
+                     if tool_description_analysis is not None:
+                         if (
+                             tool_description_analysis.description_quality
+                             == DescriptionQuality.GOOD
+                         ):
+                             parsed_tool_annotations = []
+                             param_annotations = []
+                     else:
+                         print("cache miss: ", tool_call)
+
+                 if not parsed_tool_annotations and not param_annotations:
+                     docstring_cell.add_row(N_A)
+                 if parsed_tool_annotations:
+                     docstring_cell.add_row(
+                         "[b]Tool Docstrings", parsed_tool_annotations
+                     )
+                     add_divider = True
+                 if param_annotations:
+                     if add_divider:
+                         docstring_cell.add_row(Rule(characters="--"))
+                     docstring_cell.add_row(
+                         "[b]Parameter Docstrings", param_annotations
+                     )

-                 tool_analysis = self._analyze_tool_definition(
-                     inspector=inspector,
-                     tool_definition=tool_definition,
-                     tool_definition_path=tool_definition_path,
-                 )
+                 row.append(docstring_cell)
+                 rows.append(row)

-                 self.analysis_cache[tool_definition.tool_name] = tool_analysis
-                 analysis_for_display.extend(tool_analysis)
+         is_empty = not any(rows)
+         if is_empty:
+             return None

-         return analysis_for_display
+         for idx, column in enumerate(columns):
+             table.add_column(column)
+
+         for row in rows:
+             table.add_row(*row)
+
+         return table

      def render(
-         self, data: List[ExtendedMessage], tool_definition_path: Optional[str]
+         self,
+         data: List[ExtendedMessage],
+         tool_definition_path: Optional[str],
+         meta: Optional[dict] = None,
+         test_case_name=None,
      ) -> Group:
          """
          Render the conversation history and analysis results.
@@ -147,6 +494,7 @@ class Analyzer:
          conversation_lines = []
          reason_lines = []
          failing_tools = []
+         added_missed_header = False

          for entry in data:
              msg = entry.message
@@ -184,36 +532,35 @@
                  reason_lines.append(Text(reason_text, style="red"))
              conversation_lines.append(text_line)

-         if failing_tools and tool_definition_path:
-
-             inspector = DescriptionQualityInspector()
-
-             description_quality_inspection_lines = (
-                 self.analyze_failing_tool_description_quality(
-                     inspector, tool_definition_path, set(failing_tools)
-                 )
-             )
-
-             print_done()
-
-             if description_quality_inspection_lines:
-                 reason_lines.extend(description_quality_inspection_lines)
+         if meta:
+             missed = meta.get("missed_tool_calls") or []
+             if missed:
+                 if not added_missed_header:
+                     reason_lines.append(
+                         Text("\nMissed Calls:\n", style="bold red")
+                     )
+                     added_missed_header = True
+                 for tool in missed:
+                     reason_lines.append(Text(f"❌ {tool}\n", style="red"))

          conversation_panel = Panel(
              Text().join(conversation_lines),
              title="Conversation History",
-             border_style="blue",
+             border_style="bold deep_sky_blue2",
          )
          reason_panel = Panel(
              Text().join(reason_lines),
-             title="Analysis Results",
-             border_style="red",
+             box=box.ROUNDED,
+             title=f"[bold red]Tool Call Errors[/bold red]",
+             border_style="bold red",
          )
+         table = self._parse_enhanced_metrics(test_case_name=test_case_name)
+         if table:
+             group = Group(conversation_panel, reason_panel, table)
+         else:
+             group = Group(conversation_panel, reason_panel)

-         return Group(
-             conversation_panel,
-             reason_panel,
-         )
+         return group

      def analyze(self, config: AnalyzeConfig):
          """
@@ -221,92 +568,288 @@ class Analyzer:
          :param config: AnalyzeConfig object containing user provided paths for analysis.
          """

-         def get_summary(summary_file_name: str = "summary_metrics.csv"):
-             summary = []
+         test_case_resources = TestCaseResources(config.data_path)
+         summary = test_case_resources.get_summary

-             path_to_summary_file = os.path.join(
-                 config.data_path, summary_file_name
-             )
+         test_case_with_failed_tools = self._get_test_case_with_failed_tools(
+             summary=summary
+         )

-             with open(path_to_summary_file, "r") as f:
-                 reader = csv.reader(f)
-                 header = next(reader)
-                 for row in reader:
-                     summary.append(dict(zip(header, row)))
+         output_panels = []

-             return summary
+         if len(test_case_with_failed_tools) == 0:
+             header_table = Table(show_header=False, box=None)

-         def get_test_messages(test_case_name):
-             test_messages = []
+             header_table.add_row("No Tool Call Error found!")

-             test_case_path = os.path.join(
-                 config.data_path,
-                 "messages",
-                 f"{test_case_name}.messages.analyze.json",
+             panel = Panel(
+                 header_table,
+                 title="[bold green]📋 Analysis Summary[/bold green]",
              )

-             with open(test_case_path, "r", encoding="utf-8") as f:
-                 temp = json.load(f)
-                 for entry in temp:
-                     msg = ExtendedMessage(**entry)
-                     test_messages.append(msg)
+             output_panels.append(panel)

-             return test_messages
+         messages_dir = os.path.join(config.data_path, "messages")

-         def get_metrics(test_case_name):
-             test_metrics_path = os.path.join(
-                 config.data_path, "messages", f"{test_case_name}.metrics.json"
-             )
+         RUN_NAME_ONLY_RE = re.compile(r"^(?P<parent>.+)\.run(?P<id>\d+)$")
+         processed_parents: Set[str] = set()

-             with open(test_metrics_path, "r", encoding="utf-8") as f:
-                 metrics = ToolCallAndRoutingMetrics(**json.load(f))
+         overall_runs_performed = 0
+         overall_runs_problematic = 0
+         overall_text_match_hits = 0
+         overall_text_match_den = 0
+         overall_journey_vals = []

-             return metrics
+         for test_case_entry in summary:
+             dataset_base = test_case_entry["dataset_name"]

-         summary = get_summary()
+             # If CSV row looks like "<parent>.runN" and we have runs on disk for <parent>, skip the per-run row.
+             m = RUN_NAME_ONLY_RE.match(dataset_base)
+             if m:
+                 parent = m.group("parent")
+                 if list_run_files(messages_dir, parent):
+                     continue

-         test_case_with_failed_tools = self._get_test_case_with_failed_tools(
-             summary=summary
-         )
+             # Avoid processing a parent twice if it appears multiple times in CSV.
+             if dataset_base in processed_parents:
+                 continue

-         if len(test_case_with_failed_tools) == 0:
-             header_table = Table(show_header=False, box=None)
+             run_map = list_run_files(messages_dir, dataset_base, config.run)

-             header_table.add_row("No Tool Call Error found!")
+             # ---- SINGLE RUN (legacy or run1 only) ----
+             if not run_map or len(run_map) == 1:
+                 runs_performed = 1
+                 test_messages, meta, metrics, runs_problematic = (
+                     self._single_run(
+                         test_case_name=dataset_base,
+                         run_map=run_map,
+                         test_cases_resource=test_case_resources,
+                     )
+                 )

-             panel = Panel(
-                 header_table,
-                 title="[bold green]📋 Analysis Summary[/bold green]",
-             )
+                 processed_parents.add(dataset_base)

-             pretty_print(panel)
+                 # ✅ Dataset-level panel (print BEFORE details)
+                 ds_table = Table(show_header=False, box=None)
+                 ds_table.add_row("Type: Single-run")
+                 status = (
+                     "❌ Problematic" if runs_problematic else "✅ No problems"
+                 )
+                 ds_table.add_row(f"Status: {status}")
+                 # Update overall counters/averages
+                 overall_runs_performed += runs_performed
+                 overall_runs_problematic += runs_problematic
+                 tm = getattr(metrics, "text_match", None)
+                 tm_val = getattr(tm, "value", None) if tm else None
+
+                 if tm_val is not None and tm_val != TextMatchType.na:
+                     overall_text_match_den += 1
+                     overall_text_match_hits += (
+                         tm_val == TextMatchType.text_match
+                     )
+                 if getattr(metrics, "is_success", None) is not None:
+                     overall_journey_vals.append(
+                         1 if bool(metrics.is_success) else 0
+                     )

-         for test_case_entry in test_case_with_failed_tools:
-             test_case_name = test_case_entry["dataset_name"]
+                 header_group = Group(
+                     *[
+                         ds_table,
+                         self._create_header_analysis_panel(
+                             dataset_base, metrics
+                         ),
+                     ],
+                 )
+                 border_style = "bold red" if runs_problematic else "bold green"
+                 header_panel = Panel(
+                     header_group,
+                     title=f"[b]📋 Analysis Summary — {dataset_base}[/b]",
+                     border_style=border_style,
+                 )
+                 output_panels.append(header_panel)
+
+                 if runs_problematic:
+                     output_panels.append(
+                         self.render(
+                             test_messages,
+                             config.tool_definition_path,
+                             meta,
+                             test_case_name=dataset_base,
+                         )
+                     )
+                     output_panels.append(
+                         add_line_seperator(
+                             self._generate_style_config(), print=False
+                         )
+                     )

-             test_messages = get_test_messages(test_case_name=test_case_name)
+                 else:
+                     output_panels.append(
+                         add_line_seperator(
+                             self._generate_style_config(), print=False
+                         )
+                     )

-             metrics: ToolCallAndRoutingMetrics = get_metrics(
-                 test_case_name=test_case_name
-             )
+                 continue

-             header_panel = self._create_header_analysis_panel(
-                 test_case_name, metrics
-             )
-             pretty_print(header_panel)
+             # ---- MULTI RUN (two-pass: compute first, then print summary, then details) ----
+             processed_parents.add(dataset_base)
+             runs_performed = len(run_map)
+             runs_problematic = 0
+             text_match_hits = 0
+             text_match_den = 0
+             journey_vals = []
+
+             # First pass: compute aggregates and collect problematic runs to replay later
+             deferred_runs = []
+             for run_id in sorted(run_map):
+                 paths = run_map[run_id]
+                 if not paths["metrics"]:
+                     runs_problematic += 1
+                     # no analyze file to replay; still counted as problematic
+                     continue
+
+                 metrics = load_run_metrics(paths["metrics"])
+
+                 # Aggregate for per-dataset
+                 tm = getattr(metrics, "text_match", None)
+                 tm_val = getattr(tm, "value", None) if tm is not None else None
+                 if tm_val is not None and tm_val != TextMatchType.na.value:
+                     text_match_den += 1
+                     text_match_hits += tm_val == TextMatchType.text_match.value
+
+                 if getattr(metrics, "is_success", None) is not None:
+                     journey_vals.append(1 if bool(metrics.is_success) else 0)
+
+                 # Decide if problematic
+                 had_incorrect_param = (
+                     hasattr(metrics, "tool_calls_with_incorrect_parameter")
+                     and float(metrics.tool_calls_with_incorrect_parameter or 0)
+                     > 0
+                 )
+                 low_precision = (
+                     hasattr(metrics, "tool_call_precision")
+                     and float(
+                         metrics.tool_call_precision
+                         if metrics.tool_call_precision is not None
+                         else 1.0
+                     )
+                     < 1.0
+                 )
+                 low_recall = (
+                     hasattr(metrics, "tool_call_recall")
+                     and float(
+                         metrics.tool_call_recall
+                         if metrics.tool_call_recall is not None
+                         else 1.0
+                     )
+                     < 1.0
+                 )

-             tool_definition_path = (
-                 config.tool_definition_path
-                 if config.tool_definition_path
-                 else None
-             )
+                 is_problem = (
+                     (hasattr(metrics, "is_success") and not metrics.is_success)
+                     or had_incorrect_param
+                     or low_precision
+                     or low_recall
+                 )
+                 if is_problem:
+                     runs_problematic += 1
+                     deferred_runs.append(
+                         {
+                             "title": f"{dataset_base}.run{run_id}",
+                             "metrics": metrics,
+                             "analyze_path": paths.get("analyze"),
+                         }
+                     )
+
+             # Second pass: now replay only the problematic runs (so summary stays at the top)
+             for item in deferred_runs:
+                 ds_table = Table(show_header=False, box=None)
+                 ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
+                 ds_table.add_row(
+                     f"Runs with problems: {runs_problematic} / {runs_performed}"
+                 )
+                 status = (
+                     "❌ Problematic"
+                     if runs_problematic > 0
+                     else "✅ No problems"
+                 )
+                 ds_table.add_row(f"Status: {status}")
+                 header_table = self._create_header_analysis_panel(
+                     item["title"], item["metrics"]
+                 )

-             rendered_content = self.render(
-                 data=test_messages, tool_definition_path=tool_definition_path
+                 group = Group(*[ds_table, header_table])
+                 output_panels.append(
+                     Panel(
+                         group,
+                         title=f"📋 Analysis Summary — {dataset_base}",
+                         border_style="green",
+                     )
+                 )
+
+                 if item["analyze_path"]:
+                     with open(item["analyze_path"], "r", encoding="utf-8") as f:
+                         raw = json.load(f)
+                     meta = {}
+                     if raw and isinstance(raw[-1], dict) and "meta" in raw[-1]:
+                         meta = raw[-1]["meta"]
+                         raw = raw[:-1]
+                     test_messages = [ExtendedMessage(**entry) for entry in raw]
+
+                     output_panels.append(
+                         self.render(
+                             test_messages, config.tool_definition_path, meta
+                         )
+                     )
+                 output_panels.append(
+                     add_line_seperator(
+                         self._generate_style_config(), print=False
+                     )
+                 )
+
+             # Update overall aggregates
+             overall_runs_performed += runs_performed
+             overall_runs_problematic += runs_problematic
+             overall_text_match_hits += text_match_hits
+             overall_text_match_den += text_match_den
+             overall_journey_vals.extend(journey_vals)
+
+         # --- Overall summary ---
+         overall_lines = [
+             f"Test cases analyzed: {len(processed_parents)}",
+             f"Total runs executed: {overall_runs_performed}",
+             f"Problematic runs: {overall_runs_problematic} ({round((overall_runs_problematic/overall_runs_performed)*100,1) if overall_runs_performed else 0}%)",
+         ]
+
+         if overall_text_match_den:
+             tm_pct = round(
+                 (overall_text_match_hits / overall_text_match_den) * 100, 2
              )
-             pretty_print(rendered_content)
+             overall_lines.append(f"Avg text-match success: {tm_pct}%")
+         else:
+             overall_lines.append("Avg text-match success: N/A")

-             add_line_seperator(self._generate_style_config())
+         if overall_journey_vals:
+             js_pct = round(
+                 (sum(overall_journey_vals) / len(overall_journey_vals)) * 100, 2
+             )
+             overall_lines.append(f"Avg journey success: {js_pct}%")
+         else:
+             overall_lines.append(f"Avg journey success: N/A")
+
+         output_panels.append(
+             Panel(
+                 Text("\n".join(overall_lines)),
+                 title="📋 Overall Summary",
+                 border_style="cyan",
+             )
+         )
+         os.environ["LESS"] = "-R"
+         console = Console()
+         with console.pager(styles=True):
+             for panel in output_panels:
+                 console.print(panel, overflow="crop")

      def _create_header_analysis_panel(
          self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
@@ -323,83 +866,345 @@ class Analyzer:
          header_table.add_row(f"Text Match: {metrics.text_match.value}")
          header_table.add_row(f"Journey Success: {metrics.is_success}")

-         header_panel = Panel(
-             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
-         )
+         return header_table

-         return header_panel

-     def _get_test_case_with_failed_tools(self, summary) -> List:
+ class AnalyzerEnhanced(AnalyzerBase):
+     PARAMETER_DOCUMENTATION = "PARAMETER_DOCUMENTATION"
+     TOOL_USAGE_EXAMPLES = "TOOL_USAGE_EXAMPLES"
+     TOOL_DOCUMENTATION = "TOOL_DOCUMENTATION"

-         test_case_with_failed_tools = []
+     DEFAULT_GENERATION_PARAMS = {
+         "min_new_tokens": 0,
+         "decoding_method": "greedy",
+         "max_new_tokens": 10_000,
+         "random_seed": 42,
+     }

-         for entry in summary:
-             test_case_name = entry["dataset_name"]
+     def __init__(self):
+         super().__init__()

-             if test_case_name.lower().strip() == "summary (average)":
-                 continue
+     def _deduplicate_tool_call_failures(self, messages: List[ExtendedMessage]):
+         """If there are multiple failures from the same tool, then choose the failure that occurs later in the trajectory

-             if (
-                 not entry["is_success"]
-                 or float(entry["tool_calls_with_incorrect_parameter"]) > 0
-                 or float(entry["tool_call_precision"]) < 1.0
-                 or float(entry["tool_call_recall"]) < 1.0
-             ):
+         ex.
+         1. Tool A fails
+         2. Tool A Error response
+         3. Tool A call again which fails
+         4. Tool A error response

-                 test_case_with_failed_tools.append(entry)
+         For the analysis, we analyze the second time the tool call fails, with the previous messages serving as context.

-         return test_case_with_failed_tools
+         """
+         tool_indices = []
+         seen_tools = set()
+
+         for idx, message in enumerate(reversed(messages)):
+             if self._is_failed_tool_call(message):
+                 content = json.loads(message.message.content)
+                 tool_call_name = content["name"]
+                 if tool_call_name not in seen_tools:
+                     seen_tools.add(tool_call_name)
+                     tool_indices.append(len(messages) - 1 - idx)
+
+         return sorted(tool_indices)
+
+     def process_messages(self, task_name, test_case, tools, messages):
+         eval = ReferencelessEvaluation(
+             api_spec=tools,
+             model_id=MODEL_ID,
+             task_n=task_name,
+             dataset_name=test_case,
+             runtime_pipeline=False,
+             generation_params=AnalyzerEnhanced.DEFAULT_GENERATION_PARAMS,
+         )

-     def _get_tools_not_found_in_source(
-         self,
-         tools_to_analyze: List[str],
-         failing_tool_definitions: List[ToolDefinition],
-     ) -> Set[str]:
+         processed_data = [
+             {
+                 k: msg.model_dump().get(k)
+                 for k in ["role", "content", "type"]
+                 if k in msg.model_dump()
+             }
+             for msg in messages
+         ]
+
+         context = processed_data[:-1]
+         tool_call = processed_data[
+             -1
+         ]  # assume that the message is the last tool call
+         tool_call_msg = json.loads(tool_call["content"])
+         call = ReferencelessEvaluation.fmt_tool_call(
+             tool_id=tool_call_msg.get("id", "1"),
+             tool_call_name=tool_call_msg["name"],
+             arguments=json.dumps(tool_call_msg["args"]),
+             context=context,
+         )
+         return test_case, eval.run([call])
+
+     def _extract_semantic_metrics(
+         self, metrics_dictionary, annotation_filters: Optional[List[str]]
+     ):
+         semantic_analysis = []
+         for metric_data in metrics_dictionary.values():
+             raw_response = metric_data.get("raw_response")
+             if raw_response is None:
+                 continue

-         return set(tools_to_analyze) - {
-             tool_def.tool_name for tool_def in failing_tool_definitions
-         }
+             is_correct = metric_data.get("is_correct", False)
+             if is_correct:
+                 continue

-     def _analyze_tool_definition(
-         self,
-         inspector: DescriptionQualityInspector,
-         tool_definition: ToolDefinition,
-         tool_definition_path: str,
-     ) -> List[Text]:
+             failed_semantic_test_case = ReferencelessEvalParser.semantic_parser(
+                 metric_name=metric_data.get("metric_name"),
+                 data=raw_response,
+                 annotation_filters=annotation_filters,
+             )
+
+             semantic_analysis.append(failed_semantic_test_case)

-         tool_name = tool_definition.tool_name
-         tool_desc = tool_definition.tool_description
+         return semantic_analysis

-         tool_analysis = []
+     def tool_enrichment_view(self, results):
+         enhanced_metrics = []
+         tool_enrichment_metrics = defaultdict(list)
+         for result in results:
+             for test_case, eval_results in result.items():
+                 for result in eval_results:
+                     # for metric in result:
+                     failed_static_metrics = []
+                     parameter_annotations = []
+                     tool_annotations = []
+
+                     static_metrics_passed = result.get("static", {}).get(
+                         "final_decision", False
+                     )
+                     tool_call_obj = result.get("inputs", {}).get(
+                         "tool_call", {}
+                     )
+
+                     if static_metrics_passed:
+                         semantic_metrics = result.get("semantic")
+                         function_selection_metrics = semantic_metrics.get(
+                             "function_selection", {}
+                         ).get("metrics", {})
+                         tool_annotations = self._extract_semantic_metrics(
+                             function_selection_metrics,
+                             [
+                                 AnalyzerEnhanced.TOOL_DOCUMENTATION,
+                                 AnalyzerEnhanced.TOOL_USAGE_EXAMPLES,
+                             ],
+                         )
+
+                         general_metrics = semantic_metrics.get(
+                             "general", {}
+                         ).get("metrics", {})
+                         parameter_annotations = self._extract_semantic_metrics(
+                             general_metrics,
+                             [AnalyzerEnhanced.PARAMETER_DOCUMENTATION],
+                         )
+                     else:
+                         static_metrics = result.get("static").get("metrics")
+                         failed_static_metrics = (
+                             ReferencelessEvalParser.static_parser(
+                                 static_metrics=static_metrics
+                             )
+                         )
+
+                     parsed_metrics = {
+                         "tool_name": tool_call_obj.get("function", {}).get(
+                             "name"
+                         ),
+                         "parameter_annotations": parameter_annotations,
+                         "tool_annotations": tool_annotations,
+                         "static_metrics": failed_static_metrics,
+                     }
+                     tool_enrichment_metrics[test_case].append(parsed_metrics)
+
+         for test_case, metrics in tool_enrichment_metrics.items():
+             failed_tools = [metric["tool_name"] for metric in metrics]
+             parameter_annotations = [
+                 metric["parameter_annotations"] for metric in metrics
+             ]
+             tool_annotation = [metric["tool_annotations"] for metric in metrics]
+             static_metrics = [metric["static_metrics"] for metric in metrics]
+
+             # don't add to final metrics array if there were no annotations
+             if (
+                 not any(parameter_annotations)
+                 and not any(tool_annotation)
+                 and not any(static_metrics)
+             ):
+                 continue

-         # missing description
-         if tool_desc is None:
-             tool_analysis.extend(
-                 IncorrectParameterUtils.format_missing_description_message(
-                     tool_name=tool_name,
-                     tool_definition_path=tool_definition_path,
+             enhanced_metrics.append(
+                 EnhancedAnalyzeMetrics(
+                     test_case_name=test_case,
+                     parameter_annotations=parameter_annotations,
+                     tool_annotations=tool_annotation,
+                     tool_names=failed_tools,
+                     static_metrics=static_metrics,
                  )
              )
-             return tool_analysis

-         # bad description
-         if inspector.detect_bad_description(tool_definition):
-             tool_analysis.extend(
-                 IncorrectParameterUtils.format_bad_description_message(
-                     tool_name=tool_name, tool_desc=tool_desc
-                 )
+         return enhanced_metrics
+
+     def analyze(
+         self, config: AnalyzeConfig
+     ) -> Optional[List[EnhancedAnalyzeMetrics]]:
+         start = time.time()
+         all_tools = ToolExtractionOpenAIFormat.from_path(
+             config.tool_definition_path
+         )
+         messages_dir = os.path.join(config.data_path, "messages")
+         test_case_resources = TestCaseResources(config.data_path)
+
+         failed_test_cases = {}
+         for test_case in test_case_resources.get_summary:
+             if test_case["dataset_name"] in failed_test_cases:
+                 continue
+             run_map = list_run_files(
+                 messages_dir, test_case["dataset_name"], config.run
              )
-             return tool_analysis
+             if run_map and config.run == -1:
+                 rich.print(
+                     "[red]Enhanced Mode only operates on a single run for a dataset. Since there are multiple runs, set the `--run` flag to the specific run for enhanced analysis."
+                 )
+                 # run the first run in the config map
+                 rich.print(
+                     f"[b]Defaulting to run {next(iter(run_map))} to analyze for {test_case['dataset_name']}"
+                 )
+                 config.run = next(iter(run_map))
+                 run_map = {config.run: run_map.get(config.run)}

-         # good description
-         tool_analysis.append(
-             is_ok(
-                 message=f"The description for the `{tool_name}` looks sufficient."
+             _, _, _, run_problematic = self._single_run(
+                 test_case["dataset_name"], run_map, test_case_resources
              )
+             if run_problematic:
+                 if run_files := run_map.get(config.run):
+                     failed_test_cases[test_case["dataset_name"]] = run_files
+
+                 else:
+                     # legacy runs without n runs
+                     # tranform the legacy runs into the same data structure from `list_files`
+
+                     messages_path = os.path.join(
+                         test_case_resources.output_dir,
+                         "messages",
+                         f"{test_case['dataset_name']}.messages.json",
+                     )
+
+                     analyze_path = os.path.join(
+                         test_case_resources.output_dir,
+                         "messages",
+                         f"{test_case['dataset_name']}.messages.analyze.json",
+                     )
+
+                     metrics_path = os.path.join(
+                         test_case_resources.output_dir,
+                         "messages",
+                         f"{test_case['dataset_name']}.metrics.json",
+                     )
+
+                     failed_test_cases[test_case["dataset_name"]] = {
+                         "analyze": analyze_path,
+                         "messages": messages_path,
+                         "metrics": metrics_path,
+                     }
+
+         max_workers = config.num_workers
+         rich.print(
+             f"[bold green]INFO:[/bold green] Number of workers set to: {max_workers}"
          )
-         return tool_analysis
+
+         jobs = []
+
+         with ThreadPoolExecutor(
+             max_workers=max_workers, thread_name_prefix="[Worker]"
+         ) as pool:
+             aggregate_results = []
+             for test_case, file_mapping in failed_test_cases.items():
+                 analyze_messages, _ = test_case_resources.get_analyze_messages(
+                     path=file_mapping["analyze"]
+                 )
+                 idx_failed_tool_calls = self._deduplicate_tool_call_failures(
+                     analyze_messages
+                 )
+                 messages = [
+                     Message.model_validate(message.message)
+                     for message in analyze_messages
+                 ]
+
+                 for idx in idx_failed_tool_calls:
+                     jobs.append(
+                         {
+                             "task_name": f"{test_case}-0-{idx + 1}",
+                             "test_case": test_case,
+                             "tools": all_tools,
+                             "messages": messages[0 : idx + 1],
+                         }
+                     )
+             jobs = sorted(jobs, key=lambda x: len(x["messages"]))
+             futures = [
+                 pool.submit(
+                     self.process_messages,
+                     job["task_name"],
+                     job["test_case"],
+                     job["tools"],
+                     job["messages"],
+                 )
+                 for job in jobs
+             ]
+
+             if futures:
+                 if not LOGGING_ENABLED:
+                     # logging is not enabled we want to show the progress bar
+                     progress = Progress()
+                     task = progress.add_task(
+                         f"[purple]Evaluating {len(futures)} tasks...",
+                         total=len(futures),
+                     )
+                     progress.start()
+
+                 for future in as_completed(futures):
+                     try:
+                         test_case, results = future.result()
+                         aggregate_results.append({test_case: results})
+                     except Exception as e:
+                         rich.print(f"test case, {test_case} ,fails with {e}")
+                         traceback.print_exc()
+                     finally:
+                         if not LOGGING_ENABLED:
+                             progress.update(task, advance=1)
+
+             if not LOGGING_ENABLED:
+                 progress.stop()
+
+         enhanced_metrics = self.tool_enrichment_view(aggregate_results)
+         end = time.time()
+         rich.print(f"Enhanced Analysis took {end - start} s")
+
+         return enhanced_metrics
+
+     def render(self):
+         raise NotImplementedError("Not implemented")
+
+
+ def run(args):
+     d = DescriptionQualityAnalyzer()
+     if args.mode == AnalyzeMode.enhanced:
+         if GATE_TOOL_ENRICHMENTS:
+             d.analyze(args)
+
+         enhanced = AnalyzerEnhanced()
+         enhanced_metrics = enhanced.analyze(config=args)
+         dummy_analyzer = Analyzer(enhanced_metrics, d)
+         dummy_analyzer.analyze(args)
+
+     else:
+         dummy_analyzer = Analyzer()
+         dummy_analyzer.analyze(args)


  if __name__ == "__main__":
-     dummy_analyzer = Analyzer()
-     dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))
+     args = CLI(AnalyzeConfig, as_positional=False)
+     run(args)