ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,193 @@
+import statistics
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+@dataclass
+class TestCaseEvaluationResult:
+    """Class representing a single test case evaluation result."""
+
+    name: str
+    text_match: Optional[str] = None
+    is_success: bool = False
+    total_steps: float = 0
+    llm_step: float = 0
+    total_tool_calls: float = 0
+    tool_call_precision: float = 0
+    tool_call_recall: float = 0
+    agent_routing_accuracy: float = 0
+    avg_resp_time: float = 0
+    failed_tool_calls: int = 0
+
+    # Store any additional metrics not explicitly defined
+    additional_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def matches_count(self, match_value: str = "Summary Matched") -> int:
+        """Check if this test case matches the specified value."""
+        return 1 if self.text_match == match_value else 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the test case result to a dictionary."""
+        result = {
+            "dataset_name": self.name,
+            "text_match": self.text_match,
+            "is_success": self.is_success,
+            "total_steps": self.total_steps,
+            "llm_step": self.llm_step,
+            "total_tool_calls": self.total_tool_calls,
+            "tool_call_precision": self.tool_call_precision,
+            "tool_call_recall": self.tool_call_recall,
+            "agent_routing_accuracy": self.agent_routing_accuracy,
+            "avg_resp_time": self.avg_resp_time,
+            "failed_tool_calls": self.failed_tool_calls,
+        }
+        # Add any additional metrics
+        result.update(self.additional_metrics)
+        return result
+
+
+@dataclass
+class EvaluationResult:
+    """Class representing a collection of test case evaluation results."""
+
+    test_case_results: Dict[str, TestCaseEvaluationResult]
+
+    @classmethod
+    def from_csv(cls, data: List[Dict[str, Any]]) -> "EvaluationResult":
+        """Create an EvaluationResult from CSV data."""
+        results = {}
+        for row in data:
+            name = row["dataset_name"]
+
+            # Extract standard fields
+            standard_fields = {
+                "name": name,
+                "text_match": row.get("text_match"),
+                "is_success": row.get("is_success", False),
+                "total_steps": row.get("total_steps", 0),
+                "llm_step": row.get("llm_step", 0),
+                "total_tool_calls": row.get("total_tool_calls", 0),
+                "tool_call_precision": row.get("tool_call_precision", 0),
+                "tool_call_recall": row.get("tool_call_recall", 0),
+                "agent_routing_accuracy": row.get("agent_routing_accuracy", 0),
+                "avg_resp_time": row.get("avg_resp_time", 0),
+                "failed_tool_calls": row.get("failed_tool_calls", 0),
+            }
+
+            # Extract additional fields not in the standard set
+            additional_metrics = {}
+            for key, value in row.items():
+                if key not in standard_fields and key != "dataset_name":
+                    additional_metrics[key] = value
+
+            # Create the test case result
+            result = TestCaseEvaluationResult(
+                **standard_fields, additional_metrics=additional_metrics
+            )
+            results[name] = result
+
+        return cls(results)
+
+    def calculate_boolean_percent_true(
+        self, values: List[bool]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for boolean values."""
+        return {
+            "mean": sum(1 for v in values if v) / len(values) if values else 0,
+            "count": len(values),
+            "true_count": sum(1 for v in values if v),
+            "false_count": sum(1 for v in values if not v),
+        }
+
+    def calculate_numeric_statistics(
+        self, values: List[float]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for numeric values."""
+        try:
+            stats = {
+                "mean": statistics.mean(values),
+                "median": statistics.median(values),
+                "min": min(values),
+                "max": max(values),
+                "count": len(values),
+            }
+            if len(values) > 1:
+                stats["std_dev"] = statistics.stdev(values)
+            return stats
+        except statistics.StatisticsError:
+            # Handle empty lists or other statistical errors
+            return {"error": "Could not compute statistics"}
+
+    def compute_summary_statistics(self) -> Dict[str, Any]:
+        """Compute summary statistics for all test cases."""
+        stats = {}
+
+        if not self.test_case_results:
+            return stats
+
+        # Get all fields from the first test case
+        first_result = next(iter(self.test_case_results.values()))
+        first_dict = first_result.to_dict()
+
+        # Identify numeric and boolean columns
+        for key, value in first_dict.items():
+            if key == "dataset_name" or key == "text_match":
+                continue
+
+            # Collect values for this field from all test cases
+            values = []
+            for result in self.test_case_results.values():
+                result_dict = result.to_dict()
+                if key in result_dict:
+                    values.append(result_dict[key])
+
+            # Calculate statistics based on value type
+            if values:
+                if all(isinstance(v, bool) for v in values):
+                    stats[key] = self.calculate_boolean_percent_true(values)
+                elif all(isinstance(v, (int, float)) for v in values):
+                    stats[key] = self.calculate_numeric_statistics(values)
+
+        # Count summary matches
+        match_count = sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+        stats["summary_matched_count"] = {
+            "count": match_count,
+            "percentage": (
+                round(match_count / len(self.test_case_results) * 100, 2)
+                if self.test_case_results
+                else 0
+            ),
+        }
+
+        return stats
+
+    @property
+    def test_count(self) -> int:
+        """Get the total number of test cases."""
+        return len(self.test_case_results)
+
+    @property
+    def summary_matched_count(self) -> int:
+        """Get the count of summary matched test cases."""
+        return sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+
+    @property
+    def is_success_count(self) -> int:
+        """Get the count of successful test cases."""
+        return sum(
+            1 for result in self.test_case_results.values() if result.is_success
+        )
+
+    def summary_match_ratio(self) -> float:
+        """Calculate the ratio of summary matches to total tests."""
+        return safe_divide(self.summary_matched_count, self.test_count)
+
+    def is_success_ratio(self) -> float:
+        """Calculate the ratio of successful tests to total tests."""
+        return safe_divide(self.is_success_count, self.test_count)
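
The 193-line module added above appears, from the file list, to correspond to wxo_agentic_evaluation/compare_runs/model.py: a per-test-case result model plus aggregate statistics for comparing runs. A minimal usage sketch, not part of the diff; the import path is inferred from the file list and the sample rows are illustrative:

from wxo_agentic_evaluation.compare_runs.model import EvaluationResult  # path inferred from the file list above

# Rows shaped like the per-test-case summary data this model reads.
rows = [
    {"dataset_name": "case_a", "text_match": "Summary Matched", "is_success": True,
     "total_steps": 6, "avg_resp_time": 1.4},
    {"dataset_name": "case_b", "text_match": "No Match", "is_success": False,
     "total_steps": 9, "avg_resp_time": 2.1},
]

result = EvaluationResult.from_csv(rows)
print(result.test_count)                    # 2 test cases loaded
print(result.summary_match_ratio())         # 1 "Summary Matched" row out of 2
print(result.compute_summary_statistics())  # per-field means/medians plus summary_matched_count
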
@@ -1,16 +1,19 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import (
-    LlamaKeywordsGenerationTemplateRenderer,
-)
-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-
 import ast
-import json
 import collections
+import json
 from typing import Dict, List, Optional
 
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+
 ERROR_KEYWORDS = [
     "error",
     "erroneous",
@@ -113,11 +116,11 @@ class DataAnnotator:
         self,
         messages: List[Message],
         keywords_generation_config: KeywordsGenerationConfig,
-        initial_data: Optional[
+        initial_data: Optional[OrchestrateDataset] = None,
     ):
         self.messages = messages
         self.keywords_generation_config = keywords_generation_config
-        self.initial_data = initial_data or
+        self.initial_data = initial_data or OrchestrateDataset(
             agent="",
             story="",
             starting_sentence=messages[0].content if messages else "",
@@ -143,7 +146,9 @@ class DataAnnotator:
         )
         return wrong_tool_response_id
 
-    def _process_tool_call_order(
+    def _process_tool_call_order(
+        self, wrong_tool_response_id: list[str]
+    ) -> list[str]:
         """Process and order tool calls, skipping failed ones"""
         # gather all call ids that actually got a response
         valid_call_ids = {
@@ -221,16 +226,33 @@ class DataAnnotator:
         return goals, goal_details, previous
 
     def _process_summarization(
-        self,
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
+
         for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
                     model_id=self.keywords_generation_config.model_id,
-                    params={
+                    params={
+                        "min_new_tokens": 0,
+                        "decoding_method": "greedy",
+                        "max_new_tokens": 256,
+                    },
+                    **extra_kwargs,
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -248,15 +270,19 @@ class DataAnnotator:
                 goal_details.append(summarize_step)
                 break
 
-        if
-            goals[
-
+        if previous is None:
+            goals["summarize"] = []
+        elif summarize_step is None:
             goals[previous] = []
+        else:
+            goals[previous] = ["summarize"]
 
-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )
 
         return {
             "agent": self.initial_data.agent,
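
The DataAnnotator hunks above (data_annotator.py, judging by the class context lines) thread an optional ChatRecordingConfig through generate() so that a service URL and token reach get_provider(). A sketch of the new call shape, not part of the diff; the messages and keywords config are placeholders assumed to come from an existing chat recording, and the config stand-in carries only the two attributes the code reads via getattr:

from types import SimpleNamespace

from wxo_agentic_evaluation.data_annotator import DataAnnotator  # module inferred from the file list

# _process_summarization only reads `service_url` and `token` via getattr,
# so any object exposing them works as a ChatRecordingConfig stand-in here.
config = SimpleNamespace(service_url="https://example.invalid/orchestrate", token="dummy-token")

annotator = DataAnnotator(
    messages=recorded_messages,                  # List[Message] from a recorded chat (placeholder)
    keywords_generation_config=keywords_config,  # KeywordsGenerationConfig (placeholder)
)
dataset = annotator.generate(config=config)      # service_url/token now flow into get_provider()
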
@@ -0,0 +1,178 @@
+import os
+from enum import Enum
+from pathlib import Path
+from typing import List
+
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
+from wxo_agentic_evaluation.prompt.template_render import (
+    BadToolDescriptionRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.tool_planner import (
+    MISSING_DOCSTRING_PROMPT,
+    extract_tool_signatures,
+    parse_json_string,
+)
+from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+class ToolDescriptionIssue(Enum):
+    """
+    Represents the binary outcomes the LLM judge will classify in its assessment \
+    of the tool's description.
+    The presence of these issues in the tool's description indicates poor quality.
+    For more detail on what each issue indicates, please take a look at the template here: `wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2`.
+    """
+
+    # TODO: Priority-based weighting of issues.
+    CONTAINS_REDUNDANT_INFORMATION = "contains_redundant_information"
+    USES_VAGUE_LANGUAGE = "uses_vague_language"
+    DOES_NOT_HELP_IN_IDENTIFYING_TOOL_UNIQUELY = (
+        "does_not_help_in_identifying_tool_uniquely"
+    )
+    PROVIDES_NO_NEW_INFORMATION = "provides_no_new_information"
+    DOES_NOT_CONVEY_TOOL_PURPOSE = "does_not_convey_tool_purpose"
+
+
+class DescriptionQualityInspector:
+    DEFAULT_CLASSIFICATION_THRESHOLD = 40.0  # 2/5 issues detected. A higher score indicates a worse description.
+    CLASSIFICATION_SCORE_THRESHOLD = float(
+        os.getenv(
+            "CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD
+        )
+    )
+
+    LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
+    LLM_PARAMS = {
+        "min_new_tokens": 128,
+        "decoding_method": "greedy",
+        "max_new_tokens": 512,
+    }
+
+    WORST_POSSIBLE_EVAL_OUTCOME = len(
+        ToolDescriptionIssue
+    )  # the final score used for classification is normalized against this value.
+
+    root_dir = os.path.dirname(__file__)
+    BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH = os.path.join(
+        root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
+    )
+
+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
+
+    def __init__(self, llm_client=None):
+
+        if llm_client is None:
+
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
+
+            llm_client = get_provider(
+                **provider_kwargs,
+            )
+
+        self.llm_client = llm_client
+        self.template = BadToolDescriptionRenderer(
+            self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
+        )
+        self.cached_response = None  # this is used in the unit-tests for nuanced analysis of the response.
+
+    @staticmethod
+    def extract_tool_desc_from_tool_source(
+        tool_source: Path, failing_tools: List[str]
+    ) -> List[ToolDefinition]:
+        """
+        Parses the tool source file to extract the tool description.
+        Wraps the description along with the tool name, and args into a `ToolDefinition` for all `failing_tools`.
+        This `ToolDefinition` is later rendered into the judge's prompt template for evaluation.
+        Args:
+            tool_source (Path): The path to the tool source file/dir containing `.py` tools.
+            failing_tools (List[str]): List of tool names that failed during inference.
+        Returns:
+            List[ToolDefinition]: The extracted tool definition(s) or [] if the file contains no @tool decorators.
+        """
+        all_tool_data = extract_tool_signatures(tool_source)
+
+        tool_definitions = []
+        for tool_data in all_tool_data:
+            tool_name = tool_data["Function Name"]
+            if tool_name in failing_tools:
+                tool_definitions.append(
+                    ToolDefinition(
+                        tool_name=tool_name,
+                        tool_description=(
+                            tool_data["Docstring"]
+                            if tool_data["Docstring"]
+                            != MISSING_DOCSTRING_PROMPT
+                            else None
+                        ),
+                        tool_params=tool_data["Arguments"],
+                    )
+                )
+        return tool_definitions
+
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
+        """
+        Detects if a tool description is 'bad' using an LLM judge.
+        A 'bad' description is one that:
+        - does not describe the tool's functionality/use-case clearly
+        - does not provide sufficient detail for an agent to understand how to use the tool
+        - does not distinguish the tool from other tools
+        For the exact definition of a 'bad' description, refer to `ToolDescriptionIssue` Enum.
+        Args:
+            tool_definition (ToolDefinition): The definition of the tool to evaluate.
+        Returns:
+            bool: True if the description is 'bad', False otherwise.
+        """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
+        prompt = self.template.render(tool_definition=tool_definition)
+        response = self.llm_client.query(prompt)
+
+        # parse JSON objects from cleaned text
+        json_objects = parse_json_string(response)
+
+        # pick the first JSON object
+        if json_objects:
+            response_data = json_objects[0]
+            self.cached_response = response_data
+        else:
+            return False  # likely some unexpected parsing issue, in this case - flags description as good.
+
+        # calculate weighted score
+        final_description_score = self._calculate_score(
+            response_data=response_data
+        )
+
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )
+
+    def _calculate_score(self, response_data: dict) -> float:
+        """
+        Calculates a final score for the tool description.
+        This score is used to finally classify a 'good' or 'bad' description.
+        :param response_data: Parsed JSON response returned by the LLM judge.
+        """
+        detected_issues = sum(
+            1
+            for issue in ToolDescriptionIssue
+            if response_data.get(issue.value, "FALSE").upper() == "TRUE"
+        )
+        return (
+            safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+        )
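
The 178-line addition above matches description_quality_checker.py in the file list: an LLM-judge inspector that scores tool docstrings against the ToolDescriptionIssue checklist. A short usage sketch, not part of the diff, assuming provider credentials are already configured for get_provider; the tools directory and tool name below are illustrative:

from pathlib import Path

from wxo_agentic_evaluation.description_quality_checker import (  # module inferred from the file list
    DescriptionQualityInspector,
)

inspector = DescriptionQualityInspector()  # builds a default LLM judge via get_provider()

# Collect ToolDefinitions for tools that failed during inference, then score each description.
tool_defs = DescriptionQualityInspector.extract_tool_desc_from_tool_source(
    Path("./tools"), failing_tools=["get_employee_profile"]  # illustrative path and tool name
)
for tool_def in tool_defs:
    metric = inspector.detect_bad_description(tool_def)
    print(tool_def.tool_name, metric)  # DescriptionQualityMetric carrying description_score and threshold
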
@@ -0,0 +1,50 @@
+import json
+
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
+    convert_otel_to_message,
+)
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
+    "r",
+) as f:
+    data = json.load(f)
+
+tc_name = "collie_trial"
+
+
+history = convert_otel_to_message(data["calls"][-1]["messages"])
+for message in history:
+    print(f"{message.role}: {message.content}")
+
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
+    "r",
+) as f:
+    gt = json.load(f)
+
+tc_name = "collie_trial"
+
+gt = EvaluationData.model_validate(gt)
+
+
+evaluation_package = EvaluationPackage(
+    test_case_name=tc_name,
+    messages=history,
+    ground_truth=gt,
+    conversational_search_data=None,
+    resource_map=None,
+)
+
+(
+    keyword_semantic_matches,
+    knowledge_base_metrics,
+    messages_with_reason,
+    metrics,
+) = evaluation_package.generate_summary()
+
+
+print(metrics)