ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
@@ -0,0 +1,405 @@
1
+ from type import Message, ContentType, EvaluationData
2
+ from typing import List, Optional
3
+ import json
4
+ import rich
5
+ from collections import defaultdict
6
+ from analytics.tools.types import (
7
+ ErrorPatterns,
8
+ ToolFailure,
9
+ HallucinatedParameter,
10
+ RootCauses,
11
+ HallucinationCause,
12
+ ParameterUsageCause,
13
+ BadToolCallCause,
14
+ AgentRecommendation,
15
+ AnalysisResults,
16
+ ErrorType,
17
+ )
18
+ from data_annotator import ERROR_KEYWORDS
19
+ from http import HTTPStatus
20
+
21
+
22
class ToolErrorAnalyzer:
    """Find failing tool calls in a conversation and suggest agent-prompt fixes.

    The analysis runs in three stages (see ``analyze``):

    1. ``_find_error_patterns`` groups error-looking tool responses by tool name.
    2. ``_root_cause_classifier`` maps each failure to probable root causes.
    3. ``_generate_agent_definition_improvements`` turns those causes into
       recommendations.
    """

    # Minimum number of failures recorded for one tool before it is listed
    # under ``repeated_failures``. NOTE: failures are counted per tool; they
    # are not required to be consecutive.
    THRESHOLD = 2

    # Lower-case substrings that mark a string argument as a placeholder value.
    COMMON_PLACEHOLDERS = [
        "your user id",
        "your email id",
        "your account id",
        "user_id_here",
        "email_here",
        "account_id_here",
        "<user_id>",
        "<email>",
        "<account_id>",
        "placeholder",
        "example",
        "sample",
    ]

    @classmethod
    def _get_api_error_codes(cls) -> List[str]:
        """Return every 4xx/5xx HTTP status code and phrase as lower-case strings.

        Each status contributes two consecutive entries: the numeric code
        (e.g. ``"404"``) followed by its phrase (e.g. ``"not found"``).
        """
        error_terms: List[str] = []
        for status in HTTPStatus:
            if status.value >= 400:  # 4xx and 5xx errors
                error_terms.append(str(status.value))
                error_terms.append(status.phrase.lower())
        return error_terms

    @staticmethod
    def _content_as_dict(content):
        """Return ``content`` as a dict, or None when it is not a JSON object.

        Accepts either a dict (returned unchanged) or a string holding a JSON
        object. Any other type, unparsable JSON, or JSON that decodes to a
        non-dict value yields None.
        """
        if isinstance(content, dict):
            return content
        if isinstance(content, str):
            try:
                parsed = json.loads(content)
            except json.JSONDecodeError:
                return None
            return parsed if isinstance(parsed, dict) else None
        return None

    def __init__(self, messages: List[Message], ground_truth: Optional[EvaluationData]):
        """Store the conversation and optional ground truth for later analysis.

        Args:
            messages: Conversation messages to inspect.
            ground_truth: Expected behaviour used for hallucination detection,
                or None to skip that check.
        """
        self.messages = messages
        self.ground_truth = ground_truth
        self.error_patterns = ErrorPatterns()
        self.api_error_codes = self._get_api_error_codes()

    def analyze(self) -> AnalysisResults:
        """Run the full pipeline.

        Returns:
            AnalysisResults: error patterns, root causes and recommendations.
        """
        self._find_error_patterns()
        root_causes: RootCauses = self._root_cause_classifier()
        recommendations: List[AgentRecommendation] = (
            self._generate_agent_definition_improvements(root_causes)
        )

        return AnalysisResults(
            error_patterns=self.error_patterns,
            root_causes=root_causes,
            recommendations=recommendations,
        )

    def _find_error_patterns(self) -> ErrorPatterns:
        """Identify tools that fail and tools that fail repeatedly.

        A tool response counts as a failure when its lower-cased text contains
        any of ``ERROR_KEYWORDS``. The response is linked back to its tool call
        through ``tool_call_id``; responses that cannot be linked are skipped.

        Returns:
            ErrorPatterns: ``self.error_patterns`` with ``all_failures`` and
            ``repeated_failures`` populated (both keyed by tool name).
        """
        tool_failures = defaultdict(list)
        for msg in self.messages:
            if msg.type != ContentType.tool_response:
                continue
            response_text = str(msg.content).lower()
            if not any(keyword in response_text for keyword in ERROR_KEYWORDS):
                continue

            # Content of any unsupported type or shape yields no id and is
            # skipped, so a stale id from an earlier message is never reused.
            response = self._content_as_dict(msg.content)
            tool_call_id = (
                response.get("tool_call_id") if response is not None else None
            )
            if not tool_call_id:
                continue

            tool_call_index = self._find_tool_call_index_by_id(tool_call_id)
            if tool_call_index == -1:
                continue  # Tool call not found in messages

            tool_call = self._content_as_dict(self.messages[tool_call_index].content)
            if tool_call is None:
                continue

            tool_name = tool_call.get("name", "")
            if not tool_name:
                continue

            tool_failures[tool_name].append(
                ToolFailure(
                    attempt_index=tool_call_index,
                    parameters=tool_call.get("args", {}),
                    error_message=msg.content,
                )
            )

        # Store all failures
        self.error_patterns.all_failures = tool_failures

        # Keep only tools whose failure count reaches the threshold
        self.error_patterns.repeated_failures = {
            tool: failures
            for tool, failures in tool_failures.items()
            if len(failures) >= self.THRESHOLD
        }

        rich.print(
            f"[cyan]Found {len(self.error_patterns.repeated_failures)} tools with repeated failures:[/cyan]"
        )
        return self.error_patterns

    def _root_cause_classifier(self) -> RootCauses:
        """Map every recorded failure to probable root causes.

        A failure may be recorded as a hallucination and, independently, as
        exactly one of: placeholder usage, parameter-format error, or bad tool
        call (checked in that order).
        """
        causes = RootCauses()
        format_terms = ["invalid", "malformed", "expected", "format"]

        for tool, failures in self.error_patterns.all_failures.items():
            for failure in failures:
                error_content = failure.error_message  # may be a dict or a str
                if isinstance(error_content, dict):
                    error_text = error_content.get("content", "")
                    if not isinstance(error_text, str):
                        error_text = str(error_text)
                else:
                    error_text = str(error_content)

                error_msg = error_text.lower()
                params = failure.parameters or {}

                # Compare with ground truth to detect hallucinations
                hallucinated_params = self._detect_hallucinations(tool, params)
                if hallucinated_params:
                    causes.agent_hallucinations.append(
                        HallucinationCause(
                            tool=tool,
                            attempt_index=failure.attempt_index,
                            error=error_msg,
                            hallucinated_params=hallucinated_params,
                        )
                    )

                # Substring match: any string argument containing a placeholder
                has_placeholder = any(
                    isinstance(param_value, str)
                    and any(
                        placeholder in param_value.lower()
                        for placeholder in self.COMMON_PLACEHOLDERS
                    )
                    for param_value in params.values()
                )

                if has_placeholder:
                    causes.incorrect_parameter_usage.append(
                        ParameterUsageCause(
                            tool=tool,
                            placeholder_used=True,
                            attempt_index=failure.attempt_index,
                            error=error_msg,
                        )
                    )
                elif any(term in error_msg for term in format_terms):
                    causes.incorrect_parameter_usage.append(
                        ParameterUsageCause(
                            tool=tool,
                            placeholder_used=False,
                            attempt_index=failure.attempt_index,
                            error=error_msg,
                        )
                    )
                elif any(term in error_msg for term in self.api_error_codes):
                    # All HTTP errors are grouped under "bad_tool_call"; they are
                    # split by specific error type when recommendations are built.
                    causes.bad_tool_call.append(
                        BadToolCallCause(
                            tool=tool,
                            attempt_index=failure.attempt_index,
                            error=error_msg,
                        )
                    )

        return causes  # TODO: add pattern-analysis based RCA for repeated_failures

    def _generate_agent_definition_improvements(
        self, root_causes: RootCauses
    ) -> List[AgentRecommendation]:
        """Generate agent prompt-template improvements from the root causes.

        Tool names inside each recommendation are sorted so the generated text
        is stable between runs.
        """
        recommendations = []

        # Recurring failures
        for tool, failures in self.error_patterns.repeated_failures.items():
            recommendations.append(
                AgentRecommendation(
                    issue=f"Agent repeatedly fails when calling {tool}",
                    prompt_addition="The agent made multiple unsuccessful attempts to call this tool. It may help to define fallback behavior for repeated failures, such as asking the user for clarification or escalating the issue.",
                    summary=f"Agent made {len(failures)} failed attempts with {tool}",
                )
            )

        # Handle incorrect parameters
        param_issues = root_causes.incorrect_parameter_usage
        placeholder_issues = [i for i in param_issues if i.placeholder_used]
        other_param_issues = [i for i in param_issues if not i.placeholder_used]

        if placeholder_issues:
            tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
            tools_placeholder_issues_str = ",".join(
                sorted(tools_with_placeholder_issues)
            )

            recommendations.append(
                AgentRecommendation(
                    issue=f"Using placeholder values in {tools_placeholder_issues_str}",
                    prompt_addition="A placeholder-style value (like <user_id> or email_here) was used in this tool call. You may want to guide the agent to use actual values from user input or previous responses, rather than placeholders.",
                    summary="The agent used generic placeholders instead of actual data values.",
                )
            )

        if other_param_issues:
            recommendations.append(
                AgentRecommendation(
                    issue="Parameter format errors",
                    prompt_addition="A parameter provided in the tool call didn't match the expected format. Clarifying format expectations (e.g., for dates or IDs) in the agent instructions can help reduce these errors.",
                    summary="The agent provided incorrectly formatted parameters.",
                )
            )

        # Handle bad API requests
        if root_causes.bad_tool_call:
            tool_errors = {}  # maps error type -> set of erroneous tools
            for error in root_causes.bad_tool_call:
                error_msg: str = error.error

                # Extract error type
                error_type = ErrorType.GENERAL
                if "404" in error_msg or "not found" in error_msg:
                    error_type = ErrorType.NOT_FOUND
                elif "401" in error_msg or "unauthorized" in error_msg:
                    error_type = ErrorType.AUTH_ERROR
                elif "400" in error_msg or "bad request" in error_msg:
                    error_type = ErrorType.BAD_REQUEST

                tool_errors.setdefault(error_type, set()).add(error.tool)

            # Generate one targeted recommendation per error type
            for error_type, tools in tool_errors.items():
                tools_str = ", ".join(sorted(tools))

                if error_type == ErrorType.NOT_FOUND:
                    recommendations.append(
                        AgentRecommendation(
                            issue=f"Resource not found errors with tool(s): {tools_str}",
                            prompt_addition="The tool call failed with a “not found” error, possibly due to a missing or incorrect ID. You might consider prompting the agent to confirm such values before using them.",
                            summary="The agent used IDs that don't exist in the database or called endpoints that don't exist.",
                        )
                    )
                elif error_type == ErrorType.AUTH_ERROR:
                    recommendations.append(
                        AgentRecommendation(
                            issue=f"Authentication errors with {tools_str}",
                            prompt_addition="The tool call was rejected due to missing or invalid authentication. If applicable, consider including guidance that limits tool usage to authenticated contexts only.",
                            summary="The agent made API requests with invalid or missing authentication. Please verify your credentials as a first step.",
                        )
                    )
                elif error_type == ErrorType.BAD_REQUEST:
                    recommendations.append(
                        AgentRecommendation(
                            issue=f"Bad request errors with {tools_str}",
                            prompt_addition="The tool call failed due to a malformed request. It may help to reinforce parameter validation and type checking before making such calls.",
                            summary="The agent made API requests with invalid parameter formats or values.",
                        )
                    )
                else:
                    recommendations.append(
                        AgentRecommendation(
                            issue=f"API errors with {tools_str}",
                            prompt_addition="The tool call failed due to an unexpected server or API error. While this may be environmental, reviewing when and how this tool is called could reduce unintended issues.",
                            summary=f"The agent made API requests that were rejected by the server: {error_type}",
                        )
                    )

        # Agent hallucinations
        if root_causes.agent_hallucinations:
            tools_with_hallucinations = {
                i.tool for i in root_causes.agent_hallucinations
            }
            tools_hallucination_str = ", ".join(sorted(tools_with_hallucinations))

            hallucination_examples = [
                f"{param.param}: expected '{param.expected}', got '{param.actual}'"
                for cause in root_causes.agent_hallucinations
                for param in cause.hallucinated_params
            ]
            # Limit to 2 examples
            hallucination_examples_str = "; ".join(hallucination_examples[:2])

            recommendations.append(
                AgentRecommendation(
                    issue=f"Agent hallucinated parameter values for {tools_hallucination_str}",
                    prompt_addition="The agent used parameter values that did not match the expected inputs for this tool. Consider making it clear that parameter values should come from prior conversation context, not be assumed.",
                    summary=f"The agent made up parameter values instead of using correct ones. Examples: {hallucination_examples_str}",
                )
            )

        return recommendations

    def _detect_hallucinations(
        self, tool_name, actual_params
    ) -> List[HallucinatedParameter]:
        """Compare tool parameters with ground truth to detect hallucinated values.

        Only the first ground-truth goal of type ``"tool_call"`` whose
        ``tool_name`` matches is consulted. Each actual parameter yields at
        most one entry: "fabricated" when the ground truth has no such
        parameter, otherwise a mismatch entry when the values differ.

        Returns:
            A list of hallucinated parameters; empty when there is no ground
            truth, no matching goal, or no discrepancy.
        """
        hallucinated = []

        if not self.ground_truth:
            return hallucinated

        # NOTE(review): ground_truth is read through dict-style ``.get`` even
        # though the constructor annotates it as EvaluationData — confirm the
        # intended type with callers.
        for goal in self.ground_truth.get("goal_details", []):
            if goal.get("type") != "tool_call" or goal.get("tool_name") != tool_name:
                continue

            expected_params = goal.get("args") or {}

            for param_name, actual_value in (actual_params or {}).items():
                if param_name not in expected_params:
                    hallucinated.append(
                        HallucinatedParameter(
                            param=param_name,
                            expected="agent fabricated parameter, should not exist",
                            actual=f"{param_name}={actual_value}",
                        )
                    )
                elif actual_value != expected_params[param_name]:
                    # ``elif``: a fabricated parameter must not also be
                    # reported as a value mismatch against ``None``.
                    hallucinated.append(
                        HallucinatedParameter(
                            param=param_name,
                            expected=expected_params[param_name],
                            actual=actual_value,
                        )
                    )
            break

        return hallucinated

    def _find_tool_call_index_by_id(self, tool_call_id: str) -> int:
        """Find the index of the tool_call message with the given tool_call_id.

        Returns:
            int: Index of the message, or -1 if not found
        """
        for i, msg in enumerate(self.messages):
            if msg.type != ContentType.tool_call:
                continue
            content = self._content_as_dict(msg.content)
            if content is not None and content.get("tool_call_id") == tool_call_id:
                return i
        return -1  # Not found
@@ -0,0 +1,163 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+ import rich
5
+ from type import ContentType
6
+ from analytics.tools.analyzer import ToolErrorAnalyzer
7
+ from analytics.tools.ux import ToolErrorDisplayManager
8
+ from type import Message
9
+ from shutil import get_terminal_size
10
+
11
+ if __name__ == "__main__":
12
+ parser = argparse.ArgumentParser(description="tool-analytics-resources")
13
+ parser.add_argument(
14
+ "--messages_dir",
15
+ type=Path,
16
+ help="Path to `messages` folder in the output directory containing your evaluation artifacts. For reference, the output directory is specified by `output_dir` either in your evaluation configuration YAML file, or when passed in via the command line",
17
+ )
18
+ parser.add_argument(
19
+ "--ground_truth_dir",
20
+ type=Path,
21
+ help="Path to your ground truth directory containing the expected results for your evaluation. This should contain JSON files with the same base names as your messages files, e.g. `data1.messages.json`: message file and `data1.json`: ground truth file.",
22
+ )
23
+
24
+ args = parser.parse_args()
25
+
26
+ messages_dir = args.messages_dir
27
+ ground_truth_dir = args.ground_truth_dir
28
+ if not messages_dir or not ground_truth_dir:
29
+ rich.print(f"[red]Error: \n[/red]")
30
+ rich.print(
31
+ "[yellow]Please ensure you provide valid paths for --messages_dir and --ground_truth_dir[/yellow]"
32
+ )
33
+ exit(1)
34
+
35
+ # Check terminal size and prompt user to resize if needed
36
+ term_width = get_terminal_size().columns
37
+ if term_width < 180:
38
+ rich.print(
39
+ f"[yellow]⚠️ Terminal width is only {term_width} characters.[/yellow]"
40
+ )
41
+ rich.print(
42
+ "[cyan]Consider expanding your terminal window [bold]to full screen[/bold] for better layout and readability.[/cyan]\n"
43
+ )
44
+ input("Press Enter to continue once you've resized the terminal")
45
+
46
+ def count_tool_calls(messages) -> int:
47
+ """Count total tool calls in the conversation."""
48
+ return sum(1 for msg in messages if msg.type == ContentType.tool_call)
49
+
50
+ # Function to load messages from JSON file
51
+ def load_messages(file_path):
52
+ with open(file_path, "r") as f:
53
+
54
+ try:
55
+ message_data = json.load(f)
56
+ messages = []
57
+ for msg in message_data:
58
+ messages.append(Message.model_validate(msg))
59
+
60
+ return messages
61
+
62
+ except Exception as e:
63
+ print(file_path)
64
+ print(e)
65
+ return None
66
+
67
+ # Function to load ground truth from JSON file
68
+ def load_ground_truth(file_path):
69
+ with open(file_path, "r") as f:
70
+ ground_truth_data = json.load(f)
71
+ return ground_truth_data
72
+
73
+ # Find all message files
74
+ message_files = list(messages_dir.glob("*.messages.json"))
75
+ if not message_files:
76
+ message_files = list(messages_dir.glob("*.json"))
77
+ message_files = sorted(message_files)
78
+ rich.print(
79
+ f"[bold green]Found {len(message_files)} message files to analyze[/bold green]"
80
+ )
81
+
82
+ all_results = {}
83
+ all_tool_def_recs = []
84
+
85
+ for message_file in message_files:
86
+ # Extract base name to find matching ground truth
87
+ base_name = message_file.stem
88
+ if base_name.endswith(".messages"):
89
+ base_name = base_name.replace(".messages", "")
90
+
91
+ # Find matching ground truth file
92
+ ground_truth_file = next(ground_truth_dir.glob(f"{base_name}.json"), None)
93
+
94
+ if ground_truth_file:
95
+ rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
96
+
97
+ # Load data
98
+ messages = load_messages(message_file)
99
+ if not messages:
100
+ continue
101
+ ground_truth = load_ground_truth(ground_truth_file)
102
+
103
+ # Run analysis
104
+ analyzer = ToolErrorAnalyzer(messages=messages, ground_truth=ground_truth)
105
+ results = analyzer.analyze()
106
+ display_manager = ToolErrorDisplayManager(
107
+ messages=messages, error_patterns=results.error_patterns
108
+ )
109
+
110
+ # Count tool calls and store in results
111
+ results.total_tool_calls = count_tool_calls(messages)
112
+
113
+ tool_def_recs = display_manager.generate_tool_definition_recommendations()
114
+ all_tool_def_recs.extend(tool_def_recs)
115
+
116
+ # Display results
117
+ error_count = len(results.error_patterns.all_failures)
118
+ repeat_count = len(results.error_patterns.repeated_failures)
119
+ rec_count = len(results.recommendations)
120
+
121
+ rich.print(
122
+ f"[yellow]Results:[/yellow] {error_count} failing tools, {repeat_count} with repeated failures"
123
+ )
124
+ display_manager.create_individual_testcase_header_analysis(
125
+ base_name, results, tool_def_recs
126
+ )
127
+ all_results[base_name] = results
128
+
129
+ if rec_count > 0:
130
+ rich.print(
131
+ "\n[bold magenta]🤖 Agent Template Recommendations:[/bold magenta]"
132
+ )
133
+ for rec in results.recommendations:
134
+ rich.print(f"• [bold]{rec.issue}[/bold]")
135
+ rich.print(
136
+ f" [green]Guideline/Suggested Fix(es):[/green] --"
137
+ ) # rec.prompt_addition can be embedded here when ready
138
+ rich.print(
139
+ f" [gold3][bold]Explanation:[/bold][/gold3] {rec.summary}"
140
+ )
141
+
142
+ if tool_def_recs:
143
+ rich.print("\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]")
144
+ for rec in tool_def_recs:
145
+ rich.print(
146
+ f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
147
+ )
148
+ rich.print(
149
+ f" [cyan]Fix:[/cyan] --"
150
+ ) # rec.recommendation can be embedded here when ready
151
+ if rec.example is not None:
152
+ rich.print(f" [yellow]Example:[/yellow] {rec.example}")
153
+
154
+ rich.print("\n" + "[grey70]=[/grey70]" * 100 + "\n")
155
+ else:
156
+ rich.print(
157
+ f"\n[red][bold]No ground truth found for {base_name}[/bold][/red] - [yellow]🚨 SKIPPED[/yellow]"
158
+ )
159
+
160
+ # Final executive summary
161
+ if all_results:
162
+ display_manager.generate_executive_summary(all_results, all_tool_def_recs)
163
+ rich.print("\n[bold green]Analysis complete![/bold green]")