ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (46)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
wxo_agentic_evaluation/analytics/tools/types.py
@@ -0,0 +1,130 @@
+ from pydantic import BaseModel, Field
+ from typing import List, Dict, Any, Optional
+ from enum import Enum
+
+
+ class ErrorType(str, Enum):
+     """Categories of tool call errors."""
+
+     NOT_FOUND = "not_found"
+     AUTH_ERROR = "auth_error"
+     BAD_REQUEST = "bad_request"
+     GENERAL = "general"
+
+
+ class Priority(str, Enum):
+     """Priority levels for recommendations."""
+
+     HIGH = "🔴 High"
+     MEDIUM = "🟡 Medium"
+     LOW = "🆗 Low"
+
+
+ # Foundational data structures
+ class ToolFailure(BaseModel):
+     """Represents a single tool call failure."""
+
+     attempt_index: int = Field(
+         ..., description="Index of the failed tool call in messages"
+     )
+     parameters: Dict[str, Any] = Field(
+         default_factory=dict, description="Parameters passed to the tool"
+     )
+     error_message: Any = Field(..., description="Error message returned by the tool")
+
+
+ class HallucinatedParameter(BaseModel):
+     """Represents a parameter that was hallucinated by the agent."""
+
+     param: str = Field(..., description="Parameter name")
+     expected: Any = Field(..., description="Expected value or description")
+     actual: Any = Field(..., description="Actual value provided by agent")
+     type: Optional[str] = Field(
+         None, description="Type of hallucination (e.g., 'invented_parameter')"
+     )
+
+
+ # Root cause analysis structures
+ class RootCauseBase(BaseModel):
+     """Base class for all root cause classifications."""
+
+     tool: str = Field(..., description="Name of the tool that failed")
+     attempt_index: int = Field(..., description="Index of the failed attempt")
+     error: str = Field(..., description="Error message (lowercased)")
+
+
+ class HallucinationCause(RootCauseBase):
+     """Agent hallucinated parameter values."""
+
+     hallucinated_params: List[HallucinatedParameter] = Field(
+         default_factory=list, description="List of parameters that were hallucinated"
+     )
+
+
+ class ParameterUsageCause(RootCauseBase):
+     """Incorrect parameter usage (placeholders or format errors)."""
+
+     placeholder_used: bool = Field(
+         ..., description="Whether placeholder values were used"
+     )
+
+
+ class BadToolCallCause(RootCauseBase):
+     """API errors and bad requests."""
+
+     error_type: ErrorType = Field(
+         default=ErrorType.GENERAL, description="Specific type of API error"
+     )
+
+
+ class RootCauses(BaseModel):
+     """Container for all categorized root causes."""
+
+     incorrect_parameter_usage: List[ParameterUsageCause] = Field(default_factory=list)
+     bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
+     agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
+
+
+ # Recommendation structures
+ class AgentRecommendation(BaseModel):
+     """Recommendation for improving agent prompt templates."""
+
+     issue: str = Field(..., description="Description of the issue")
+     prompt_addition: str = Field(..., description="Suggested prompt improvement")
+     summary: str = Field(..., description="Brief explanation of the problem")
+
+
+ class ToolDefinitionRecommendation(BaseModel):
+     """Recommendation for improving tool definitions."""
+
+     tool: str = Field(..., description="Name of the tool")
+     issue: str = Field(..., description="Issue with the tool definition")
+     recommendation: str = Field(..., description="Suggested improvement")
+     priority: Priority = Field(..., description="Priority level")
+     count: int = Field(..., description="Number of occurrences")
+     example: Optional[str] = Field(None, description="Example of the fix")
+
+
+ # Main container structures
+ class ErrorPatterns(BaseModel):
+     """Container for error pattern analysis results."""
+
+     repeated_failures: Dict[str, List[ToolFailure]] = Field(
+         default_factory=dict, description="Tools that failed repeatedly (>= threshold)"
+     )
+     all_failures: Dict[str, List[ToolFailure]] = Field(
+         default_factory=dict, description="All tool failures grouped by tool name"
+     )
+
+
+ class AnalysisResults(BaseModel):
+     """Complete analysis results from ToolErrorAnalyzer."""
+
+     error_patterns: ErrorPatterns = Field(..., description="Error pattern analysis")
+     root_causes: RootCauses = Field(..., description="Root cause classification")
+     recommendations: List[AgentRecommendation] = Field(
+         default_factory=list, description="Agent template improvement recommendations"
+     )
+     total_tool_calls: Optional[int] = Field(
+         None, description="Total number of tool calls made"
+     )
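
These models compose into a single result tree: AnalysisResults wraps ErrorPatterns and RootCauses, which in turn carry per-tool ToolFailure and cause records. A minimal construction sketch (hypothetical usage, not shipped in the wheel; model_dump_json assumes Pydantic v2, on v1 use .json() instead):

    # Hypothetical sketch: how the types.py models nest together.
    from wxo_agentic_evaluation.analytics.tools.types import (
        AnalysisResults,
        ErrorPatterns,
        RootCauses,
        ToolFailure,
    )

    failure = ToolFailure(
        attempt_index=3,  # position of the failed tool call in the message list
        parameters={"order_id": "ORD-123"},
        error_message="404 Client Error: order not found",
    )

    results = AnalysisResults(
        error_patterns=ErrorPatterns(all_failures={"get_order_status": [failure]}),
        root_causes=RootCauses(),  # every category defaults to an empty list
        total_tool_calls=12,
    )

    print(results.model_dump_json(indent=2))  # validated, report-ready JSON
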
wxo_agentic_evaluation/analytics/tools/ux.py
@@ -0,0 +1,428 @@
+ import rich
+ import json
+ from rich.layout import Layout
+ from rich.table import Table
+ from rich.panel import Panel
+ from rich.align import Align
+ from rich.console import Group
+ from wxo_agentic_evaluation.type import Message, ContentType
+ from typing import List, Dict, Optional
+ from wxo_agentic_evaluation.analytics.tools.types import (
+     ToolDefinitionRecommendation,
+     Priority,
+     AgentRecommendation,
+     AnalysisResults,
+     ErrorPatterns,
+ )
+
+
+ class ToolErrorDisplayManager:
+     """Handles all display/UX functionality for tool error analysis."""
+
+     CHARACTER_THRESHOLD = (
+         200  # characters; if the error_msg has fewer than this, it is not helpful
+     )
+
+     def __init__(
+         self, messages: List[Message], error_patterns: Optional[ErrorPatterns] = None
+     ):
+         self.messages = messages
+         self.error_patterns = error_patterns or ErrorPatterns()
+
+     # Suggest tool definition improvements
+     def generate_tool_definition_recommendations(
+         self,
+     ) -> List[ToolDefinitionRecommendation]:
+         """Suggest improvements to the customer's tool definitions."""
+         recommendations = []
+
+         for tool, failures in self.error_patterns.all_failures.items():
+             failure_counts = {
+                 "stringified_json_outputs": 0,  # should instead return native objects
+                 "parameter_type_validation": 0,  # no validation logic, causing API errors
+                 "unhelpful_responses": 0,  # empty responses that give the agent no guidance
+             }
+
+             validation_error_codes = ["404", "not found", "client error"]
+             unhelpful_resp_threshold = ToolErrorDisplayManager.CHARACTER_THRESHOLD
+
+             for failure in failures:
+                 error_msg = str(failure.error_message).lower()
+
+                 if (
+                     error_msg.strip() in ["[]", "{}", ""]
+                     or len(error_msg) < unhelpful_resp_threshold
+                 ):
+                     failure_counts["unhelpful_responses"] += 1
+
+                 if any(err_code in error_msg for err_code in validation_error_codes):
+                     failure_counts["parameter_type_validation"] += 1
+
+                 if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
+                     failure_counts["stringified_json_outputs"] += 1
+
+             if failure_counts["unhelpful_responses"] > 0:
+                 recommendations.append(
+                     ToolDefinitionRecommendation(
+                         tool=tool,
+                         issue="Unhelpful and Contextless Response",
+                         recommendation="Return structured error messages or raise exceptions instead of empty responses",
+                         priority=Priority.MEDIUM,
+                         count=failure_counts["unhelpful_responses"],
+                     )
+                 )
+
+             if failure_counts["stringified_json_outputs"] > 0:
+                 recommendations.append(
+                     ToolDefinitionRecommendation(
+                         tool=tool,
+                         issue="Stringified JSON output",
+                         recommendation="Return native Python objects instead of JSON strings for better type safety",
+                         priority=Priority.LOW,
+                         count=failure_counts["stringified_json_outputs"],
+                     )
+                 )
+
+             if failure_counts["parameter_type_validation"] > 0:
+                 recommendations.append(
+                     ToolDefinitionRecommendation(
+                         tool=tool,
+                         issue="Parameter type validation issues",
+                         recommendation="Add validation to ensure correct parameter types are passed. Return clear errors when wrong types are received.",
+                         priority=Priority.HIGH,
+                         count=failure_counts["parameter_type_validation"],
+                         example="-",
+                     )
+                 )
+
+         # Sort by priority (high -> medium -> low)
+         priority_order = {Priority.HIGH: 0, Priority.MEDIUM: 1, Priority.LOW: 2}
+         recommendations.sort(key=lambda x: priority_order[x.priority])
+
+         return recommendations
+
+     def create_individual_testcase_header_analysis(
+         self,
+         base_name: str,
+         results: AnalysisResults,
+         tool_def_recs: List[ToolDefinitionRecommendation],
+     ) -> None:
+         """Display comprehensive analysis using analyzed results."""
+
+         all_failures = results.error_patterns.all_failures
+         repeated_failures = results.error_patterns.repeated_failures
+
+         tool_def_recs_count = len(tool_def_recs)
+
+         # Calculate accurate statistics from analyzed results
+         total_failed_tools = len(all_failures)  # unique tools that failed at least once
+         total_failure_instances = sum(
+             len(failures) for failures in all_failures.values()
+         )  # individual failures across all tools; the same tool may have multiple failure instances
+         repeated_failure_tools = len(
+             repeated_failures
+         )  # number of tools that failed at least the threshold (2) times
+
+         # Create tool status lookup from analyzed results
+         failed_tool_calls = set()
+         for tool, failures in all_failures.items():
+             for failure in failures:
+                 failed_tool_calls.add(failure.attempt_index)
+
+         header_table = Table(show_header=False, box=None)
+         header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
+         header_table.add_row(
+             "🔧 Total Tools Used (unique):", str(len(self._get_all_tools(results)))
+         )
+         header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
+         header_table.add_row(
+             "🔥 Total Failure Instances (not unique):", str(total_failure_instances)
+         )
+         header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
+         header_table.add_row(
+             "🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
+         )
+         header_table.add_row(
+             "🤖 Agent Template Recommendations:", str(len(results.recommendations))
+         )
+
+         header_panel = Panel(
+             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+         )
+
+         layout = Layout()
+         layout.split_row(
+             Layout(self._display_conversation(failed_tool_calls), name="conversation"),
+             Layout(self._create_detailed_analysis_panel(results), name="analysis"),
+         )
+
+         rich.print(header_panel)
+         rich.print(layout)
+
+     def _display_conversation(self, failed_tool_calls: set) -> Panel:
+         """Display the conversation with color coding for erroneous calls."""
+
+         conversation_content = []
+
+         for i, msg in enumerate(self.messages):
+             if msg.role == "user":
+                 conversation_content.append(
+                     f"[bold blue]👤 User:[/bold blue] {msg.content}"
+                 )
+             elif msg.role == "assistant":
+                 if msg.type == ContentType.tool_call:
+                     is_failed = i in failed_tool_calls
+                     color = "red" if is_failed else "green"
+                     icon = "❌" if is_failed else "✅"
+
+                     conversation_content.append(
+                         f"[bold {color}]{icon} Tool Call:[/bold {color}] {msg.content}"
+                     )
+                 elif msg.type == ContentType.tool_response:
+                     is_error_response = (i + 1) in failed_tool_calls
+                     color = "red" if is_error_response else "green"
+                     icon = "⚠️" if is_error_response else "🔧"
+
+                     # Truncate long responses
+                     content = str(msg.content)
+                     if len(content) > 300:
+                         content = content[:300] + "[bold](...)[/bold]"
+
+                     conversation_content.append(
+                         f"[{color}]{icon} Response:[/{color}] {content}"
+                     )
+                 else:
+                     conversation_content.append(
+                         f"[bold cyan]🤖 Assistant:[/bold cyan] {msg.content}"
+                     )
+
+         return Panel(
+             "\n".join(conversation_content),
+             title="[bold]📱 Conversation History[/bold]",
+             border_style="blue",
+         )
+
+     def _create_detailed_analysis_panel(self, results: AnalysisResults) -> Panel:
+         """Creates the analysis panel."""
+
+         content = []
+
+         if results.error_patterns.repeated_failures:
+             error_table = Table(title="🔄 Repeated Failures")
+             error_table.add_column("Tool", style="cyan")
+             error_table.add_column("Attempts", justify="center")
+             error_table.add_column("Error Snippet", style="red")
+
+             for tool, failures in results.error_patterns.repeated_failures.items():
+                 # Show a snippet of the most recent error message
+                 error_snippet = str(failures[-1].error_message)[:50] + "..."
+                 error_table.add_row(tool, str(len(failures)), error_snippet)
+
+             content.append(error_table)
+
+         causes = results.root_causes
+         root_cause_data = {
+             "incorrect_parameter_usage": causes.incorrect_parameter_usage,
+             "bad_tool_call": causes.bad_tool_call,
+             "agent_hallucinations": causes.agent_hallucinations,
+         }
+         if any(root_cause_data.values()):
+             cause_table = Table(title="🎯 Root Cause Analysis")
+             cause_table.add_column("Category", style="bold")
+             cause_table.add_column("Count", justify="center")
+             cause_table.add_column("Tools Affected", style="yellow")
+
+             for category, issues in root_cause_data.items():
+                 if issues:
+                     affected_tools = {issue.tool for issue in issues}
+                     tools_str = ", ".join(list(affected_tools)[:3])  # Limit display
+                     if len(affected_tools) > 3:
+                         tools_str += f"... (+{len(affected_tools) - 3} more)"
+
+                     cause_table.add_row(
+                         category.replace("_", " ").title(), str(len(issues)), tools_str
+                     )
+
+             content.append(cause_table)
+
+         if results.recommendations:
+             content.append(
+                 self._create_recommendations_display(results.recommendations)
+             )
+
+         # Add tool definition status table
+         tool_def_recs = self.generate_tool_definition_recommendations()
+         if tool_def_recs:
+             tool_def_table = Table(title="🔧 Tool Definition Status")
+             tool_def_table.add_column("Tool Name", style="cyan")
+             tool_def_table.add_column("Status", style="bold")
+
+             # Get unique tools with issues
+             tools_with_issues = {rec.tool for rec in tool_def_recs}
+
+             # Show all tools from failures
+             for tool in results.error_patterns.all_failures.keys():
+                 if tool in tools_with_issues:
+                     issue_count = len([r for r in tool_def_recs if r.tool == tool])
+                     tool_def_table.add_row(
+                         tool, f"[red]❌ {issue_count} issue(s)[/red]"
+                     )
+                 else:
+                     tool_def_table.add_row(tool, "[green]✅ OK[/green]")
+
+             content.append(tool_def_table)
+
+         return Panel(
+             Group(*content),
+             title="[bold red]🔍 Analysis Results[/bold red]",
+             border_style="red",
+         )
+
+     def _create_recommendations_display(
+         self, recommendations: List[AgentRecommendation]
+     ) -> Table:
+         """Create prioritized recommendations table."""
+         rec_table = Table(title="💡 Improvement Recommendations")
+         rec_table.add_column("Priority", style="bold")
+         rec_table.add_column("Issue", style="yellow")
+         rec_table.add_column("Suggested Fix", style="green")
+
+         # Sort recommendations by priority
+         prioritized_recs = self._prioritize_recommendations(recommendations)
+
+         for i, rec in enumerate(prioritized_recs, 1):
+             priority = "🔥 HIGH" if i <= 2 else "⚡ MED" if i <= 5 else "📝 LOW"  # sorted: highest first
+
+             rec_table.add_row(
+                 priority,
+                 rec.issue,
+                 "--",  # rec.prompt_addition can be added here when ready
+             )
+
+         return rec_table
+
+     def generate_executive_summary(
+         self,
+         all_results: Dict[str, AnalysisResults],
+         all_tool_def_recs: List[ToolDefinitionRecommendation],
+     ) -> None:
+         """Generate an executive summary across all test cases with real tool call metrics."""
+
+         total_tool_definition_recs = len(all_tool_def_recs)
+
+         # 1. Identify failing test cases and their tool failure counts
+         failing_test_cases = {}
+         for test_case, results in all_results.items():
+             failed_tools_count = len(results.error_patterns.all_failures)
+             if failed_tools_count > 0:
+                 failing_test_cases[test_case] = failed_tools_count
+
+         # 2. Count total failed tool calls across all test cases
+         total_failed_tool_calls = sum(
+             sum(len(failures) for failures in r.error_patterns.all_failures.values())
+             for r in all_results.values()
+         )
+
+         # 3. Get total tool calls stored on each result (total_tool_calls)
+         total_tool_calls = sum(r.total_tool_calls or 0 for r in all_results.values())
+
+         # 4. Calculate successful tool calls and success rate
+         successful_tool_calls = total_tool_calls - total_failed_tool_calls
+         success_rate = (
+             (successful_tool_calls / total_tool_calls * 100)
+             if total_tool_calls > 0
+             else 100
+         )
+
+         # 5. Other metrics
+         total_cases = len(all_results)
+         total_agent_template_recs = sum(
+             len(r.recommendations) for r in all_results.values()
+         )
+
+         # Create failing test cases display
+         failing_cases_text = ""
+         if failing_test_cases:
+             failing_cases_text = "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
+             for test_case, failed_tool_count in sorted(failing_test_cases.items()):
+                 failing_cases_text += f" • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
+         else:
+             failing_cases_text = (
+                 "\n[bold green]🎉 All test cases passed![/bold green]\n"
+             )
+
+         # Disclaimer text
+         disclaimer_text = """[bold red]⚠️ IMPORTANT DISCLAIMER:[/bold red]
+ [yellow]The guidelines above are based on observed error patterns and are intended to help you identify potential improvements to your agent setup.
+ They are not exact fixes, but rather general suggestions drawn from common failure modes.
+
+ Please use them as starting points to inform your review process — not as definitive instructions.
+ Effectiveness may vary depending on your domain, agent behavior, and tool configuration.
+
+ [bold red]We do not recommend copying these statements directly into your prompts or tool definitions.[/bold red]
+ Instead, adapt the insights to fit the context of your use case, and validate any changes before deployment.[/yellow]"""
+
+         summary_text = f"""
+ [bold green]🎯 EXECUTIVE SUMMARY[/bold green]
+
+ 📊 [bold]Test Cases Analyzed:[/bold] {total_cases}
+ 🔧 [bold]Total Tool Calls Made [italic](across all test cases)[/italic]:[/bold] {total_tool_calls}
+ ✅ [bold]Successful Tool Calls [italic](calls that completed without error across all test cases)[/italic]:[/bold] {successful_tool_calls}
+ ❌ [bold]Failed Tool Calls [italic](calls that generated errors across all test cases)[/italic]:[/bold] {total_failed_tool_calls}
+ 🤖 [bold]Agent Template Recommendations Suggested:[/bold] {total_agent_template_recs}
+ 🔨 [bold]Tool Definition Recommendations Suggested:[/bold] {total_tool_definition_recs}
+
+ [yellow]📈 Success Rate = [italic](across all test cases) successful tool calls / total tool calls[/italic]:[/yellow] [bold bright_cyan]{success_rate:.1f}%[/bold bright_cyan]
+ {failing_cases_text}
+ [bold cyan]🚀 Next Steps:[/bold cyan]
+ 1. Implement high-priority prompt improvements
+ 2. Review agent tool usage patterns
+ 3. Update ground truth data where needed
+ """  # disclaimer_text can be embedded here when recommendations are ready
+
+         rich.print(Panel(Align.center(summary_text), border_style="green", padding=1))
+
+     def _prioritize_recommendations(
+         self, recommendations: List[AgentRecommendation]
+     ) -> List[AgentRecommendation]:
+         """Sort recommendations by priority based on issue type."""
+         priority_order = {
+             "Agent hallucinated": 1,
+             "Agent repeatedly fails": 2,
+             "Resource not found": 3,
+             "Using placeholder": 4,
+             "Parameter format": 5,
+             "Authentication": 6,
+             "Bad request": 7,
+             "API errors": 8,
+         }
+
+         def get_priority(rec):
+             for key_phrase, priority in priority_order.items():
+                 if key_phrase.lower() in rec.issue.lower():
+                     return priority
+             return 1  # Unmatched issues default to the highest priority
+
+         return sorted(recommendations, key=get_priority)
+
+     def _get_all_tools(self, results: AnalysisResults) -> List[str]:
+         """Extract the unique tool names called in the conversation messages."""
+
+         all_tools_in_conversation = set()  # unique tools
+         for msg in self.messages:
+             if msg.type == ContentType.tool_call:
+                 # Extract tool name safely
+                 try:
+                     if isinstance(msg.content, str):
+                         tool_call = json.loads(msg.content)
+                     else:
+                         tool_call = msg.content
+
+                     if isinstance(tool_call, dict):
+                         tool_name = tool_call.get("name", "")
+                         if tool_name:
+                             all_tools_in_conversation.add(tool_name)
+                 except (json.JSONDecodeError, AttributeError):
+                     continue
+
+         return list(all_tools_in_conversation)
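
The recommendation heuristics in generate_tool_definition_recommendations read only error_patterns, so they can be exercised without rendering a conversation. A hedged driver sketch (hypothetical, not part of the package; the imports assume the package-qualified module paths used elsewhere in this release):

    # Hypothetical sketch: classify tool failures without any console output.
    from wxo_agentic_evaluation.analytics.tools.types import ErrorPatterns, ToolFailure
    from wxo_agentic_evaluation.analytics.tools.ux import ToolErrorDisplayManager

    patterns = ErrorPatterns(
        all_failures={
            "create_ticket": [
                # Empty body -> counted as an unhelpful response
                ToolFailure(attempt_index=1, error_message=""),
                # Contains "404"/"not found" -> counted as a validation issue
                # (and, being under CHARACTER_THRESHOLD, as unhelpful too)
                ToolFailure(attempt_index=4, error_message="404 Not Found"),
            ]
        }
    )

    # No conversation needed for this method; it only reads error_patterns
    manager = ToolErrorDisplayManager(messages=[], error_patterns=patterns)
    for rec in manager.generate_tool_definition_recommendations():
        print(rec.priority.value, rec.tool, rec.issue, f"x{rec.count}")
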
wxo_agentic_evaluation/analyze_run.py
@@ -0,0 +1,123 @@
+ import json
+ import os
+ import csv
+ import rich
+ from rich.text import Text
+ from rich.panel import Panel
+ from rich.layout import Layout
+ from rich.table import Table
+ from typing import List
+ from wxo_agentic_evaluation.type import (
+     ExtendedMessage,
+     ContentType,
+     ToolCallAndRoutingMetrics,
+ )
+ from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
+ from jsonargparse import CLI
+
+
+ def render(data: List[ExtendedMessage]):
+     conversation_lines = []
+     reason_lines = []
+
+     for entry in data:
+         msg = entry.message
+         role = msg.role
+         content = msg.content
+         reason = entry.reason
+         tool_name = None
+         if role == "user":
+             label = "👤 User"
+         elif role == "assistant" and msg.type == ContentType.tool_call:
+             if reason:
+                 label = "❌ Tool Call"
+                 tool_name = json.loads(msg.content)["name"]
+             else:
+                 label = "✅ Tool Call"
+         elif role == "assistant":
+             label = "🤖 Assistant"
+         else:
+             label = "📦 Unknown"
+
+         text_line = Text(f"{label}: {content}\n")
+         if reason:
+             text_line.stylize("bold red")
+             reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
+             reason_lines.append(Text(reason_text, style="red"))
+         conversation_lines.append(text_line)
+
+     conversation_panel = Panel(
+         Text().join(conversation_lines),
+         title="Conversation History",
+         border_style="blue",
+     )
+     reason_panel = Panel(
+         Text().join(reason_lines), title="Analysis Results", border_style="red"
+     )
+
+     layout = Layout()
+     layout.split_row(Layout(conversation_panel), Layout(reason_panel))
+
+     return layout
+
+
+ def analyze(config: AnalyzeConfig):
+     summary = []
+     with open(os.path.join(config.data_path, "summary_metrics.csv"), "r") as f:
+         reader = csv.reader(f)
+         header = next(reader)
+         for row in reader:
+             summary.append(dict(zip(header, row)))
+
+     test_case_with_failed_tools = []
+     for entry in summary:
+         test_case_name = entry["test_case"]
+         if test_case_name.lower().strip() == "summary (average)":
+             continue
+         if int(entry["Wrong Function Calls"]) > 0 or int(entry["Wrong Parameters"]) > 0:
+             test_case_with_failed_tools.append(entry)
+     if len(test_case_with_failed_tools) == 0:
+         header_table = Table(show_header=False, box=None)
+         header_table.add_row("No Tool Call Errors found!")
+         header_panel = Panel(
+             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+         )
+         rich.print(header_panel)
+
+     for test_case_entry in test_case_with_failed_tools:
+         test_case_name = test_case_entry["test_case"]
+
+         test_case_path = os.path.join(
+             config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
+         )
+         test_messages = []
+         with open(test_case_path, "r", encoding="utf-8") as f:
+             temp = json.load(f)
+             for entry in temp:
+                 test_messages.append(ExtendedMessage(**entry))
+
+         test_metrics_path = os.path.join(
+             config.data_path, "messages", f"{test_case_name}.metrics.json"
+         )
+         with open(test_metrics_path, "r", encoding="utf-8") as f:
+             metrics = ToolCallAndRoutingMetrics(**json.load(f))
+         header_table = Table(show_header=False, box=None)
+         header_table.add_row(f"Test Case Name: {test_case_name}")
+         header_table.add_row(f"Expected Tool Calls: {metrics.expected_tool_calls}")
+         header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
+         irrelevant_tool_calls = test_case_entry["Wrong Function Calls"]
+         header_table.add_row(f"Irrelevant Tool Calls: {irrelevant_tool_calls}")
+         tool_call_with_incorrect_parameters = test_case_entry["Wrong Parameters"]
+         header_table.add_row(
+             f"Tool Calls with incorrect parameters: {tool_call_with_incorrect_parameters}"
+         )
+         header_panel = Panel(
+             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+         )
+         rich.print(header_panel)
+         layout = render(test_messages)
+         rich.print(layout)
+
+
+ if __name__ == "__main__":
+     analyze(CLI(AnalyzeConfig, as_positional=False))
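
analyze() resolves everything relative to config.data_path: a summary_metrics.csv at the root, plus per-test-case .messages.analyze.json and .metrics.json files under messages/. A hedged setup sketch (hypothetical; the file names and column headers come from the code above, while the --data_path flag name is an assumption about AnalyzeConfig):

    # Hypothetical sketch: the minimal on-disk layout analyze() expects.
    import csv
    import os

    data_path = "eval_run_01"
    os.makedirs(os.path.join(data_path, "messages"), exist_ok=True)

    # The script indexes rows by these exact column names.
    with open(os.path.join(data_path, "summary_metrics.csv"), "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["test_case", "Wrong Function Calls", "Wrong Parameters"])
        writer.writerow(["ticket_flow", "1", "0"])
        writer.writerow(["Summary (Average)", "0", "0"])  # skipped by the loop above

    # For each failing test case, analyze() then reads:
    #   messages/ticket_flow.messages.analyze.json  (a list of ExtendedMessage dicts)
    #   messages/ticket_flow.metrics.json           (a ToolCallAndRoutingMetrics dict)
    # Invocation (assuming AnalyzeConfig exposes data_path):
    #   python -m wxo_agentic_evaluation.analyze_run --data_path eval_run_01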