PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

wxo_agentic_evaluation/analytics/tools/ux.py CHANGED Viewed

@@ -1,19 +1,21 @@
-import rich
 import json
-from rich.layout import Layout
-from rich.table import Table
-from rich.panel import Panel
-from rich.align import Align
-from rich.console import Group
-from wxo_agentic_evaluation.type import Message, ContentType
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+import rich
 from analytics.tools.types import (
-    ToolDefinitionRecommendation,
-    Priority,
     AgentRecommendation,
     AnalysisResults,
     ErrorPatterns,
+    Priority,
+    ToolDefinitionRecommendation,
 )
+from rich.align import Align
+from rich.console import Group
+from rich.layout import Layout
+from rich.panel import Panel
+from rich.table import Table
+from wxo_agentic_evaluation.type import ContentType, Message
 class ToolErrorDisplayManager:
@@ -24,7 +26,9 @@ class ToolErrorDisplayManager:
     )
     def __init__(
-        self, messages: List[Message], error_patterns: Optional[ErrorPatterns] = None
+        self,
+        messages: List[Message],
+        error_patterns: Optional[ErrorPatterns] = None,
     ):
         self.messages = messages
         self.error_patterns = error_patterns or ErrorPatterns()
@@ -44,7 +48,9 @@ class ToolErrorDisplayManager:
             }
             validation_error_codes = ["404", "not found", "client error"]
-            unhelpful_resp_threshold = ToolErrorDisplayManager.CHARACTER_THRESHOLD
+            unhelpful_resp_threshold = (
+                ToolErrorDisplayManager.CHARACTER_THRESHOLD
+            )
             for failure in failures:
                 error_msg = str(failure.error_message).lower()
@@ -55,7 +61,9 @@ class ToolErrorDisplayManager:
                 ):
                     failure_counts["unhelpful_responses"] += 1
-                if any(err_code in error_msg for err_code in validation_error_codes):
+                if any(
+                    err_code in error_msg for err_code in validation_error_codes
+                ):
                     failure_counts["parameter_type_validation"] += 1
                 if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
@@ -115,7 +123,9 @@ class ToolErrorDisplayManager:
         tool_def_recs_count = len(tool_def_recs)
         # Calculate accurate statistics from analyzed results
-        total_failed_tools = len(all_failures)  # unique tools that failed atleast once
+        total_failed_tools = len(
+            all_failures
+        )  # unique tools that failed atleast once
         total_failure_instances = sum(
             len(failures) for failures in all_failures.values()
         )  # individual failures across all tools, the same tool may have multiple failure instances
@@ -132,18 +142,25 @@ class ToolErrorDisplayManager:
         header_table = Table(show_header=False, box=None)
         header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
         header_table.add_row(
-            "🔧 Total Tools Used (unique):", str(len(self._get_all_tools(results)))
+            "🔧 Total Tools Used (unique):",
+            str(len(self._get_all_tools(results))),
+        )
+        header_table.add_row(
+            "❌ Failed Tools (unique):", str(total_failed_tools)
         )
-        header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
         header_table.add_row(
-            "🔥 Total Failure Instances (not unique):", str(total_failure_instances)
+            "🔥 Total Failure Instances (not unique):",
+            str(total_failure_instances),
+        )
+        header_table.add_row(
+            "🔄 Repeated Failures:", str(repeated_failure_tools)
         )
-        header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
         header_table.add_row(
             "🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
         )
         header_table.add_row(
-            "🤖 Agent Template Recommendations:", str(len(results.recommendations))
+            "🤖 Agent Template Recommendations:",
+            str(len(results.recommendations)),
         )
         header_panel = Panel(
@@ -152,8 +169,13 @@ class ToolErrorDisplayManager:
         layout = Layout()
         layout.split_row(
-            Layout(self._display_conversation(failed_tool_calls), name="conversation"),
-            Layout(self._create_detailed_analysis_panel(results), name="analysis"),
+            Layout(
+                self._display_conversation(failed_tool_calls),
+                name="conversation",
+            ),
+            Layout(
+                self._create_detailed_analysis_panel(results), name="analysis"
+            ),
         )
         rich.print(header_panel)
@@ -202,7 +224,9 @@ class ToolErrorDisplayManager:
             border_style="blue",
         )
-    def _create_detailed_analysis_panel(self, results: AnalysisResults) -> Panel:
+    def _create_detailed_analysis_panel(
+        self, results: AnalysisResults
+    ) -> Panel:
         """Creates the analysis panel."""
         content = []
@@ -213,7 +237,10 @@ class ToolErrorDisplayManager:
             error_table.add_column("Attempts", justify="center")
             error_table.add_column("Error Type", style="red")
-            for tool, failures in results.error_patterns.repeated_failures.items():
+            for (
+                tool,
+                failures,
+            ) in results.error_patterns.repeated_failures.items():
                 # Use the analyzed error classification
                 error_snippet = str(failures[-1].error_message)[:50] + "..."
                 error_table.add_row(tool, str(len(failures)), error_snippet)
@@ -235,12 +262,16 @@ class ToolErrorDisplayManager:
             for category, issues in root_cause_data.items():
                 if issues:
                     affected_tools = {issue.tool for issue in issues}
-                    tools_str = ", ".join(list(affected_tools)[:3])  # Limit display
+                    tools_str = ", ".join(
+                        list(affected_tools)[:3]
+                    )  # Limit display
                     if len(affected_tools) > 3:
                         tools_str += f"... (+{len(affected_tools)-3} more)"
                     cause_table.add_row(
-                        category.replace("_", " ").title(), str(len(issues)), tools_str
+                        category.replace("_", " ").title(),
+                        str(len(issues)),
+                        tools_str,
                     )
             content.append(cause_table)
@@ -263,7 +294,9 @@ class ToolErrorDisplayManager:
             # Show all tools from failures
             for tool in results.error_patterns.all_failures.keys():
                 if tool in tools_with_issues:
-                    issue_count = len([r for r in tool_def_recs if r.tool == tool])
+                    issue_count = len(
+                        [r for r in tool_def_recs if r.tool == tool]
+                    )
                     tool_def_table.add_row(
                         tool, f"[red]❌ {issue_count} issue(s)[/red]"
                     )
@@ -319,12 +352,17 @@ class ToolErrorDisplayManager:
         # 2. Count total failed tool calls across all test cases
         total_failed_tool_calls = sum(
-            sum(len(failures) for failures in r.error_patterns.all_failures.values())
+            sum(
+                len(failures)
+                for failures in r.error_patterns.all_failures.values()
+            )
             for r in all_results.values()
         )
         # 3. Get total tool calls from stored data (we'll add this to results)
-        total_tool_calls = sum(r.total_tool_calls or 0 for r in all_results.values())
+        total_tool_calls = sum(
+            r.total_tool_calls or 0 for r in all_results.values()
+        )
         # 4. Calculate successful tool calls and success rate
         successful_tool_calls = total_tool_calls - total_failed_tool_calls
@@ -343,8 +381,12 @@ class ToolErrorDisplayManager:
         # Create failing test cases display
         failing_cases_text = ""
         if failing_test_cases:
-            failing_cases_text = "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
-            for test_case, failed_tool_count in sorted(failing_test_cases.items()):
+            failing_cases_text = (
+                "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
+            )
+            for test_case, failed_tool_count in sorted(
+                failing_test_cases.items()
+            ):
                 failing_cases_text += f"  • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
         else:
             failing_cases_text = (
@@ -380,7 +422,9 @@ class ToolErrorDisplayManager:
     3. Update ground truth data where needed
     """  # disclaimer_text can be embedded here when recommendations are ready
-        rich.print(Panel(Align.center(summary_text), border_style="green", padding=1))
+        rich.print(
+            Panel(Align.center(summary_text), border_style="green", padding=1)
+        )
     def _prioritize_recommendations(
         self, recommendations: List[AgentRecommendation]

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl