ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,193 @@
+import statistics
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+@dataclass
+class TestCaseEvaluationResult:
+    """Class representing a single test case evaluation result."""
+
+    name: str
+    text_match: Optional[str] = None
+    is_success: bool = False
+    total_steps: float = 0
+    llm_step: float = 0
+    total_tool_calls: float = 0
+    tool_call_precision: float = 0
+    tool_call_recall: float = 0
+    agent_routing_accuracy: float = 0
+    avg_resp_time: float = 0
+    failed_tool_calls: int = 0
+
+    # Store any additional metrics not explicitly defined
+    additional_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def matches_count(self, match_value: str = "Summary Matched") -> int:
+        """Check if this test case matches the specified value."""
+        return 1 if self.text_match == match_value else 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the test case result to a dictionary."""
+        result = {
+            "dataset_name": self.name,
+            "text_match": self.text_match,
+            "is_success": self.is_success,
+            "total_steps": self.total_steps,
+            "llm_step": self.llm_step,
+            "total_tool_calls": self.total_tool_calls,
+            "tool_call_precision": self.tool_call_precision,
+            "tool_call_recall": self.tool_call_recall,
+            "agent_routing_accuracy": self.agent_routing_accuracy,
+            "avg_resp_time": self.avg_resp_time,
+            "failed_tool_calls": self.failed_tool_calls,
+        }
+        # Add any additional metrics
+        result.update(self.additional_metrics)
+        return result
+
+
+@dataclass
+class EvaluationResult:
+    """Class representing a collection of test case evaluation results."""
+
+    test_case_results: Dict[str, TestCaseEvaluationResult]
+
+    @classmethod
+    def from_csv(cls, data: List[Dict[str, Any]]) -> "EvaluationResult":
+        """Create an EvaluationResult from CSV data."""
+        results = {}
+        for row in data:
+            name = row["dataset_name"]
+
+            # Extract standard fields
+            standard_fields = {
+                "name": name,
+                "text_match": row.get("text_match"),
+                "is_success": row.get("is_success", False),
+                "total_steps": row.get("total_steps", 0),
+                "llm_step": row.get("llm_step", 0),
+                "total_tool_calls": row.get("total_tool_calls", 0),
+                "tool_call_precision": row.get("tool_call_precision", 0),
+                "tool_call_recall": row.get("tool_call_recall", 0),
+                "agent_routing_accuracy": row.get("agent_routing_accuracy", 0),
+                "avg_resp_time": row.get("avg_resp_time", 0),
+                "failed_tool_calls": row.get("failed_tool_calls", 0),
+            }
+
+            # Extract additional fields not in the standard set
+            additional_metrics = {}
+            for key, value in row.items():
+                if key not in standard_fields and key != "dataset_name":
+                    additional_metrics[key] = value
+
+            # Create the test case result
+            result = TestCaseEvaluationResult(
+                **standard_fields, additional_metrics=additional_metrics
+            )
+            results[name] = result
+
+        return cls(results)
+
+    def calculate_boolean_percent_true(
+        self, values: List[bool]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for boolean values."""
+        return {
+            "mean": sum(1 for v in values if v) / len(values) if values else 0,
+            "count": len(values),
+            "true_count": sum(1 for v in values if v),
+            "false_count": sum(1 for v in values if not v),
+        }
+
+    def calculate_numeric_statistics(
+        self, values: List[float]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for numeric values."""
+        try:
+            stats = {
+                "mean": statistics.mean(values),
+                "median": statistics.median(values),
+                "min": min(values),
+                "max": max(values),
+                "count": len(values),
+            }
+            if len(values) > 1:
+                stats["std_dev"] = statistics.stdev(values)
+            return stats
+        except statistics.StatisticsError:
+            # Handle empty lists or other statistical errors
+            return {"error": "Could not compute statistics"}
+
+    def compute_summary_statistics(self) -> Dict[str, Any]:
+        """Compute summary statistics for all test cases."""
+        stats = {}
+
+        if not self.test_case_results:
+            return stats
+
+        # Get all fields from the first test case
+        first_result = next(iter(self.test_case_results.values()))
+        first_dict = first_result.to_dict()
+
+        # Identify numeric and boolean columns
+        for key, value in first_dict.items():
+            if key == "dataset_name" or key == "text_match":
+                continue
+
+            # Collect values for this field from all test cases
+            values = []
+            for result in self.test_case_results.values():
+                result_dict = result.to_dict()
+                if key in result_dict:
+                    values.append(result_dict[key])
+
+            # Calculate statistics based on value type
+            if values:
+                if all(isinstance(v, bool) for v in values):
+                    stats[key] = self.calculate_boolean_percent_true(values)
+                elif all(isinstance(v, (int, float)) for v in values):
+                    stats[key] = self.calculate_numeric_statistics(values)
+
+        # Count summary matches
+        match_count = sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+        stats["summary_matched_count"] = {
+            "count": match_count,
+            "percentage": (
+                round(match_count / len(self.test_case_results) * 100, 2)
+                if self.test_case_results
+                else 0
+            ),
+        }
+
+        return stats
+
+    @property
+    def test_count(self) -> int:
+        """Get the total number of test cases."""
+        return len(self.test_case_results)
+
+    @property
+    def summary_matched_count(self) -> int:
+        """Get the count of summary matched test cases."""
+        return sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+
+    @property
+    def is_success_count(self) -> int:
+        """Get the count of successful test cases."""
+        return sum(
+            1 for result in self.test_case_results.values() if result.is_success
+        )
+
+    def summary_match_ratio(self) -> float:
+        """Calculate the ratio of summary matches to total tests."""
+        return safe_divide(self.summary_matched_count, self.test_count)
+
+    def is_success_ratio(self) -> float:
+        """Calculate the ratio of successful tests to total tests."""
+        return safe_divide(self.is_success_count, self.test_count)
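For orientation, a minimal usage sketch of the new result model added above. The import path is an assumption (the file list suggests this is the new compare_runs/model.py, the only file adding exactly 193 lines), and the sample values are invented.

# Hedged sketch: building an EvaluationResult by hand and reading its summary
# statistics. The import path below is inferred from the file list, not
# confirmed by the diff itself; the test-case values are illustrative.
from wxo_agentic_evaluation.compare_runs.model import (
    EvaluationResult,
    TestCaseEvaluationResult,
)

results = {
    "hr_agent_case_1": TestCaseEvaluationResult(
        name="hr_agent_case_1",
        text_match="Summary Matched",
        is_success=True,
        tool_call_precision=1.0,
        tool_call_recall=0.75,
        avg_resp_time=2.4,
    ),
    "hr_agent_case_2": TestCaseEvaluationResult(
        name="hr_agent_case_2",
        text_match="No Match",  # any value other than "Summary Matched" counts as a miss
        is_success=False,
        tool_call_precision=0.5,
        tool_call_recall=0.5,
        avg_resp_time=3.1,
    ),
}

evaluation = EvaluationResult(test_case_results=results)
print(evaluation.test_count)             # 2
print(evaluation.summary_match_ratio())  # 0.5, assuming safe_divide is a zero-guarded division
print(evaluation.compute_summary_statistics()["tool_call_recall"]["mean"])  # 0.625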
@@ -3,13 +3,16 @@ import collections
 import json
 from typing import Dict, List, Optional

-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaKeywordsGenerationTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset

 ERROR_KEYWORDS = [
     "error",
@@ -113,11 +116,11 @@ class DataAnnotator:
         self,
         messages: List[Message],
         keywords_generation_config: KeywordsGenerationConfig,
-        initial_data: Optional[EvaluationData] = None,
+        initial_data: Optional[OrchestrateDataset] = None,
     ):
         self.messages = messages
         self.keywords_generation_config = keywords_generation_config
-        self.initial_data = initial_data or EvaluationData(
+        self.initial_data = initial_data or OrchestrateDataset(
             agent="",
             story="",
             starting_sentence=messages[0].content if messages else "",
@@ -223,11 +226,23 @@ class DataAnnotator:
         return goals, goal_details, previous

     def _process_summarization(
-        self, previous: str, goals: Dict, goal_details: List
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
+
         for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
@@ -237,6 +252,7 @@
                         "decoding_method": "greedy",
                         "max_new_tokens": 256,
                     },
+                    **extra_kwargs,
                )
                kw_generator = KeywordsGenerationLLM(
                    provider=provider,
@@ -261,10 +277,12 @@
            else:
                goals[previous] = ["summarize"]

-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(previous, goals, goal_details)
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )

         return {
             "agent": self.initial_data.agent,
@@ -3,8 +3,7 @@ from enum import Enum
 from pathlib import Path
 from typing import List

-import rich
-
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 from wxo_agentic_evaluation.prompt.template_render import (
     BadToolDescriptionRenderer,
 )
@@ -15,6 +14,9 @@ from wxo_agentic_evaluation.tool_planner import (
     parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
 from wxo_agentic_evaluation.utils.utils import safe_divide


@@ -60,12 +62,23 @@ class DescriptionQualityInspector:
         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
     )

+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
+
     def __init__(self, llm_client=None):
+
         if llm_client is None:
+
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
+
             llm_client = get_provider(
-                model_id=self.LLM_MODEL_ID,
-                params=self.LLM_PARAMS,
+                **provider_kwargs,
             )
+
         self.llm_client = llm_client
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
@@ -106,7 +119,9 @@
         )
         return tool_definitions

-    def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:
@@ -119,6 +134,10 @@
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)

@@ -137,7 +156,11 @@
             response_data=response_data
         )

-        return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )

     def _calculate_score(self, response_data: dict) -> float:
         """
@@ -1,10 +1,15 @@
-from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
-from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
-from wxo_agentic_evaluation.type import Message, EvaluationData
-
 import json

-with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
+    convert_otel_to_message,
+)
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
+    "r",
+) as f:
     data = json.load(f)

 tc_name = "collie_trial"
@@ -15,7 +20,10 @@ for message in history:
     print(f"{message.role}: {message.content}")


-with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json", "r") as f:
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
+    "r",
+) as f:
     gt = json.load(f)

 tc_name = "collie_trial"
@@ -28,7 +36,7 @@ evaluation_package = EvaluationPackage(
     messages=history,
     ground_truth=gt,
     conversational_search_data=None,
-    resource_map=None
+    resource_map=None,
 )

 (
@@ -39,4 +47,4 @@
 ) = evaluation_package.generate_summary()


-print(metrics)
+print(metrics)