ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/analytics/tools/ux.py

@@ -1,19 +1,21 @@
-import rich
 import json
-from rich.layout import Layout
-from rich.table import Table
-from rich.panel import Panel
-from rich.align import Align
-from rich.console import Group
-from wxo_agentic_evaluation.type import Message, ContentType
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
+import rich
 from analytics.tools.types import (
-    ToolDefinitionRecommendation,
-    Priority,
     AgentRecommendation,
     AnalysisResults,
     ErrorPatterns,
+    Priority,
+    ToolDefinitionRecommendation,
 )
+from rich.align import Align
+from rich.console import Group
+from rich.layout import Layout
+from rich.panel import Panel
+from rich.table import Table
+
+from wxo_agentic_evaluation.type import ContentType, Message
 
 
 class ToolErrorDisplayManager:
@@ -24,7 +26,9 @@ class ToolErrorDisplayManager:
     )
 
     def __init__(
-        self, messages: List[Message], error_patterns: Optional[ErrorPatterns] = None
+        self,
+        messages: List[Message],
+        error_patterns: Optional[ErrorPatterns] = None,
     ):
         self.messages = messages
         self.error_patterns = error_patterns or ErrorPatterns()
@@ -44,7 +48,9 @@ class ToolErrorDisplayManager:
         }
 
         validation_error_codes = ["404", "not found", "client error"]
-        unhelpful_resp_threshold = ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        unhelpful_resp_threshold = (
+            ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        )
 
         for failure in failures:
             error_msg = str(failure.error_message).lower()
@@ -55,7 +61,9 @@ class ToolErrorDisplayManager:
             ):
                 failure_counts["unhelpful_responses"] += 1
 
-            if any(err_code in error_msg for err_code in validation_error_codes):
+            if any(
+                err_code in error_msg for err_code in validation_error_codes
+            ):
                 failure_counts["parameter_type_validation"] += 1
 
             if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
@@ -115,7 +123,9 @@ class ToolErrorDisplayManager:
         tool_def_recs_count = len(tool_def_recs)
 
         # Calculate accurate statistics from analyzed results
-        total_failed_tools = len(all_failures)  # unique tools that failed atleast once
+        total_failed_tools = len(
+            all_failures
+        )  # unique tools that failed atleast once
         total_failure_instances = sum(
             len(failures) for failures in all_failures.values()
         )  # individual failures across all tools, the same tool may have multiple failure instances
@@ -132,18 +142,25 @@ class ToolErrorDisplayManager:
         header_table = Table(show_header=False, box=None)
         header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
         header_table.add_row(
-            "🔧 Total Tools Used (unique):", str(len(self._get_all_tools(results)))
+            "🔧 Total Tools Used (unique):",
+            str(len(self._get_all_tools(results))),
+        )
+        header_table.add_row(
+            "❌ Failed Tools (unique):", str(total_failed_tools)
         )
-        header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
         header_table.add_row(
-            "🔥 Total Failure Instances (not unique):", str(total_failure_instances)
+            "🔥 Total Failure Instances (not unique):",
+            str(total_failure_instances),
+        )
+        header_table.add_row(
+            "🔄 Repeated Failures:", str(repeated_failure_tools)
         )
-        header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
         header_table.add_row(
             "🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
         )
         header_table.add_row(
-            "🤖 Agent Template Recommendations:", str(len(results.recommendations))
+            "🤖 Agent Template Recommendations:",
+            str(len(results.recommendations)),
         )
 
         header_panel = Panel(
@@ -152,8 +169,13 @@ class ToolErrorDisplayManager:
 
         layout = Layout()
         layout.split_row(
-            Layout(self._display_conversation(failed_tool_calls), name="conversation"),
-            Layout(self._create_detailed_analysis_panel(results), name="analysis"),
+            Layout(
+                self._display_conversation(failed_tool_calls),
+                name="conversation",
+            ),
+            Layout(
+                self._create_detailed_analysis_panel(results), name="analysis"
+            ),
         )
 
         rich.print(header_panel)
@@ -202,7 +224,9 @@ class ToolErrorDisplayManager:
             border_style="blue",
         )
 
-    def _create_detailed_analysis_panel(self, results: AnalysisResults) -> Panel:
+    def _create_detailed_analysis_panel(
+        self, results: AnalysisResults
+    ) -> Panel:
         """Creates the analysis panel."""
 
         content = []
@@ -213,7 +237,10 @@ class ToolErrorDisplayManager:
         error_table.add_column("Attempts", justify="center")
         error_table.add_column("Error Type", style="red")
 
-        for tool, failures in results.error_patterns.repeated_failures.items():
+        for (
+            tool,
+            failures,
+        ) in results.error_patterns.repeated_failures.items():
             # Use the analyzed error classification
             error_snippet = str(failures[-1].error_message)[:50] + "..."
             error_table.add_row(tool, str(len(failures)), error_snippet)
@@ -235,12 +262,16 @@ class ToolErrorDisplayManager:
         for category, issues in root_cause_data.items():
             if issues:
                 affected_tools = {issue.tool for issue in issues}
-                tools_str = ", ".join(list(affected_tools)[:3])  # Limit display
+                tools_str = ", ".join(
+                    list(affected_tools)[:3]
+                )  # Limit display
                 if len(affected_tools) > 3:
                     tools_str += f"... (+{len(affected_tools)-3} more)"
 
                 cause_table.add_row(
-                    category.replace("_", " ").title(), str(len(issues)), tools_str
+                    category.replace("_", " ").title(),
+                    str(len(issues)),
+                    tools_str,
                 )
 
         content.append(cause_table)
@@ -263,7 +294,9 @@ class ToolErrorDisplayManager:
         # Show all tools from failures
         for tool in results.error_patterns.all_failures.keys():
             if tool in tools_with_issues:
-                issue_count = len([r for r in tool_def_recs if r.tool == tool])
+                issue_count = len(
+                    [r for r in tool_def_recs if r.tool == tool]
+                )
                 tool_def_table.add_row(
                     tool, f"[red]❌ {issue_count} issue(s)[/red]"
                 )
@@ -319,12 +352,17 @@ class ToolErrorDisplayManager:
 
         # 2. Count total failed tool calls across all test cases
         total_failed_tool_calls = sum(
-            sum(len(failures) for failures in r.error_patterns.all_failures.values())
+            sum(
+                len(failures)
+                for failures in r.error_patterns.all_failures.values()
+            )
             for r in all_results.values()
        )
 
         # 3. Get total tool calls from stored data (we'll add this to results)
-        total_tool_calls = sum(r.total_tool_calls or 0 for r in all_results.values())
+        total_tool_calls = sum(
+            r.total_tool_calls or 0 for r in all_results.values()
+        )
 
         # 4. Calculate successful tool calls and success rate
         successful_tool_calls = total_tool_calls - total_failed_tool_calls
@@ -343,8 +381,12 @@ class ToolErrorDisplayManager:
         # Create failing test cases display
         failing_cases_text = ""
         if failing_test_cases:
-            failing_cases_text = "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
-            for test_case, failed_tool_count in sorted(failing_test_cases.items()):
+            failing_cases_text = (
+                "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
+            )
+            for test_case, failed_tool_count in sorted(
+                failing_test_cases.items()
+            ):
                 failing_cases_text += f" • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
         else:
             failing_cases_text = (
@@ -380,7 +422,9 @@ class ToolErrorDisplayManager:
         3. Update ground truth data where needed
         """  # disclaimer_text can be embedded here when recommendations are ready
 
-        rich.print(Panel(Align.center(summary_text), border_style="green", padding=1))
+        rich.print(
+            Panel(Align.center(summary_text), border_style="green", padding=1)
+        )
 
     def _prioritize_recommendations(
         self, recommendations: List[AgentRecommendation]

wxo_agentic_evaluation/analyze_run.py

@@ -1,36 +1,37 @@
+import csv
 import json
 import os
-import csv
-from jsonargparse import CLI
 from pathlib import Path
-from typing import List, Dict, Set, Optional
+from typing import Dict, List, Optional, Set
 
-from rich.text import Text
-from rich.table import Table
-from rich.panel import Panel
+from jsonargparse import CLI
 from rich.console import Group
+from rich.panel import Panel
 from rich.style import Style
+from rich.table import Table
+from rich.text import Text
 
-from wxo_agentic_evaluation.type import ExtendedMessage, ContentType, ToolDefinition
-from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
 from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
 from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
+from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
+from wxo_agentic_evaluation.type import (
+    ContentType,
+    ExtendedMessage,
+    ToolDefinition,
+)
 from wxo_agentic_evaluation.utils.rich_utils import (
-    pretty_print,
-    warn,
+    IncorrectParameterUtils,
     is_ok,
+    pretty_print,
     print_done,
-    IncorrectParameterUtils,
-)
-from wxo_agentic_evaluation.utils.utils import (
-    add_line_seperator,
+    warn,
 )
+from wxo_agentic_evaluation.utils.utils import add_line_seperator
 
 
 class Analyzer:
-
     def __init__(self):
         self.analysis_cache: Dict[str, List[Text]] = (
             {}
@@ -44,8 +45,10 @@ class Analyzer:
             blink=True,
             bold=True,
         )
-
-    def _split_cache(self, failing_tools: Set[str]) -> tuple[List[str], List[Text]]:
+
+    def _split_cache(
+        self, failing_tools: Set[str]
+    ) -> tuple[List[str], List[Text]]:
 
         tools_to_analyze: List[str] = []
         cached_lines: List[Text] = []
@@ -65,11 +68,7 @@ class Analyzer:
                 style="bold cyan",
             )
 
-        return (
-            tools_to_analyze,
-            cached_lines
-        )
-
+        return (tools_to_analyze, cached_lines)
 
     def analyze_failing_tool_description_quality(
         self,
@@ -98,9 +97,11 @@ class Analyzer:
         # Step 2: analyze cache misses
         if tools_to_analyze:
 
-            failing_tool_definitions: List[ToolDefinition] = inspector.extract_tool_desc_from_tool_source(
-                Path(tool_definition_path),
-                tools_to_analyze,
+            failing_tool_definitions: List[ToolDefinition] = (
+                inspector.extract_tool_desc_from_tool_source(
+                    Path(tool_definition_path),
+                    tools_to_analyze,
+                )
             )
 
             if not failing_tool_definitions:
@@ -110,7 +111,7 @@ class Analyzer:
                     )
                 )
                 return analysis_for_display
-
+
             missing_tools = self._get_tools_not_found_in_source(
                 tools_to_analyze, failing_tool_definitions
             )
@@ -134,7 +135,9 @@ class Analyzer:
 
         return analysis_for_display
 
-    def render(self, data: List[ExtendedMessage], tool_definition_path: Optional[str]) -> Group:
+    def render(
+        self, data: List[ExtendedMessage], tool_definition_path: Optional[str]
+    ) -> Group:
         """
         Render the conversation history and analysis results.
         :param data: List of ExtendedMessage objects containing the conversation history.
@@ -151,7 +154,10 @@ class Analyzer:
             content = msg.content
             reason = entry.reason
             tool_name = None
-            if msg.type == ContentType.tool_call or msg.type == ContentType.tool_response:
+            if (
+                msg.type == ContentType.tool_call
+                or msg.type == ContentType.tool_response
+            ):
                 tool_name = json.loads(msg.content)["name"]
 
             if role == "user":
@@ -159,7 +165,7 @@ class Analyzer:
             elif role == "assistant" and msg.type == ContentType.tool_call:
                 if reason:
                     label = "❌ Tool Call"
-
+
                     if reason.get("reason") == "incorrect parameter":
                         failing_tools.append(
                             tool_name
@@ -199,8 +205,8 @@ class Analyzer:
             border_style="blue",
         )
         reason_panel = Panel(
-            Text().join(reason_lines),
-            title="Analysis Results",
+            Text().join(reason_lines),
+            title="Analysis Results",
             border_style="red",
         )
 
@@ -218,7 +224,9 @@ class Analyzer:
         def get_summary(summary_file_name: str = "summary_metrics.csv"):
             summary = []
 
-            path_to_summary_file = os.path.join(config.data_path, summary_file_name)
+            path_to_summary_file = os.path.join(
+                config.data_path, summary_file_name
+            )
 
             with open(path_to_summary_file, "r") as f:
                 reader = csv.reader(f)
@@ -232,7 +240,9 @@ class Analyzer:
             test_messages = []
 
             test_case_path = os.path.join(
-                config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
+                config.data_path,
+                "messages",
+                f"{test_case_name}.messages.analyze.json",
             )
 
             with open(test_case_path, "r", encoding="utf-8") as f:
@@ -265,7 +275,8 @@ class Analyzer:
             header_table.add_row("No Tool Call Error found!")
 
         panel = Panel(
-            header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+            header_table,
+            title="[bold green]📋 Analysis Summary[/bold green]",
         )
 
         pretty_print(panel)
@@ -279,21 +290,23 @@ class Analyzer:
                 test_case_name=test_case_name
            )
 
-            header_panel = self._create_header_analysis_panel(test_case_name, metrics)
+            header_panel = self._create_header_analysis_panel(
+                test_case_name, metrics
+            )
             pretty_print(header_panel)
 
-            tool_definition_path = config.tool_definition_path \
-                if config.tool_definition_path else None
-
+            tool_definition_path = (
+                config.tool_definition_path
+                if config.tool_definition_path
+                else None
+            )
+
             rendered_content = self.render(
-                data=test_messages,
-                tool_definition_path=tool_definition_path
-            )
+                data=test_messages, tool_definition_path=tool_definition_path
+            )
             pretty_print(rendered_content)
 
-            add_line_seperator(
-                self._generate_style_config()
-            )
+            add_line_seperator(self._generate_style_config())
 
     def _create_header_analysis_panel(
         self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
@@ -301,8 +314,12 @@ class Analyzer:
         header_table = Table(show_header=False, box=None)
 
         header_table.add_row(f"Test Case Name: {test_case_name}")
-        header_table.add_row(f"Expected Tool Calls: {metrics.expected_tool_calls}")
-        header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
+        header_table.add_row(
+            f"Expected Tool Calls: {metrics.expected_tool_calls}"
+        )
+        header_table.add_row(
+            f"Correct Tool Calls: {metrics.correct_tool_calls}"
+        )
         header_table.add_row(f"Text Match: {metrics.text_match.value}")
         header_table.add_row(f"Journey Success: {metrics.is_success}")
 
@@ -359,7 +376,8 @@ class Analyzer:
         if tool_desc is None:
             tool_analysis.extend(
                 IncorrectParameterUtils.format_missing_description_message(
-                    tool_name=tool_name, tool_definition_path=tool_definition_path
+                    tool_name=tool_name,
+                    tool_definition_path=tool_definition_path,
                 )
             )
             return tool_analysis
@@ -375,10 +393,13 @@ class Analyzer:
 
         # good description
         tool_analysis.append(
-            is_ok(message=f"The description for the `{tool_name}` looks sufficient.")
+            is_ok(
+                message=f"The description for the `{tool_name}` looks sufficient."
+            )
        )
         return tool_analysis
 
+
 if __name__ == "__main__":
     dummy_analyzer = Analyzer()
     dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))

wxo_agentic_evaluation/annotate.py

@@ -1,10 +1,12 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
-from wxo_agentic_evaluation.data_annotator import DataAnnotator
 import json
+import os
 from pprint import pprint
+
 from jsonargparse import CLI
-import os
+
+from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
+from wxo_agentic_evaluation.data_annotator import DataAnnotator
+from wxo_agentic_evaluation.type import EvaluationData, Message
 
 
 def main(config: TestCaseGenerationConfig):

wxo_agentic_evaluation/arg_configs.py

@@ -1,11 +1,16 @@
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
+
 from wxo_agentic_evaluation import __file__
 
 root_dir = os.path.dirname(__file__)
-LLAMA_USER_PROMPT_PATH = os.path.join(root_dir, "prompt", "llama_user_prompt.jinja2")
-KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(root_dir, "prompt", "keywords_generation_prompt.jinja2")
+LLAMA_USER_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "llama_user_prompt.jinja2"
+)
+KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "keywords_generation_prompt.jinja2"
+)
 
 
 @dataclass
@@ -104,6 +109,7 @@ class ChatRecordingConfig:
 class QuickEvalConfig(TestConfig):
     tools_path: str = None
 
+
 @dataclass
 class BatchAnnotateConfig:
     allowed_tools: List[str]