ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +33 -29
- wxo_agentic_evaluation/analyze_run.py +805 -344
- wxo_agentic_evaluation/arg_configs.py +10 -1
- wxo_agentic_evaluation/description_quality_checker.py +11 -2
- wxo_agentic_evaluation/evaluation_package.py +8 -3
- wxo_agentic_evaluation/inference_backend.py +46 -79
- wxo_agentic_evaluation/llm_matching.py +14 -2
- wxo_agentic_evaluation/main.py +1 -1
- wxo_agentic_evaluation/metrics/__init__.py +1 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +43 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +4 -2
- wxo_agentic_evaluation/quick_eval.py +7 -9
- wxo_agentic_evaluation/record_chat.py +2 -5
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
- wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +7 -0
- wxo_agentic_evaluation/type.py +1 -1
- wxo_agentic_evaluation/utils/__init__.py +3 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +131 -16
- wxo_agentic_evaluation/wxo_client.py +80 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
```diff
@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
+from enum import StrEnum
 
 from wxo_agentic_evaluation import __file__
 

@@ -59,22 +60,30 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True
 
 
 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None
 
+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
 
 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1
 
 
 @dataclass
```
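The new analyze options are plain dataclass fields, so they can be exercised directly. A minimal sketch, assuming the hunk above is `wxo_agentic_evaluation/arg_configs.py` (consistent with the +10/-1 summary for that file) and that the remaining fields keep their defaults:

```python
# Minimal sketch; field names and defaults are taken from the hunk above.
from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode

config = AnalyzeConfig(
    data_path="./results",         # existing required field
    mode=AnalyzeMode.enhanced,     # new in 1.1.6, defaults to AnalyzeMode.default
    num_workers=4,                 # new in 1.1.6, defaults to 10
    run=-1,                        # new in 1.1.6; -1 presumably selects the latest run
)
print(config.mode)                 # StrEnum members print as their string value: "enhanced"
```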
```diff
@@ -16,6 +16,7 @@ from wxo_agentic_evaluation.tool_planner import (
 )
 from wxo_agentic_evaluation.type import ToolDefinition
 from wxo_agentic_evaluation.utils.utils import safe_divide
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 
 
 class ToolDescriptionIssue(Enum):

@@ -106,7 +107,7 @@ class DescriptionQualityInspector:
         )
         return tool_definitions
 
-    def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+    def detect_bad_description(self, tool_definition: ToolDefinition) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:

@@ -119,6 +120,10 @@ class DescriptionQualityInspector:
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)
 

@@ -137,7 +142,11 @@ class DescriptionQualityInspector:
             response_data=response_data
         )
 
-        return
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )
 
     def _calculate_score(self, response_data: dict) -> float:
         """
```
```diff
@@ -77,7 +77,7 @@ class EvaluationPackage:
     def __init__(
         self,
         test_case_name,
-        ground_truth,
+        ground_truth: EvaluationData,
         messages,
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,

@@ -103,7 +103,7 @@ class EvaluationPackage:
             else []
         )
 
-        self.messages = messages
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth

@@ -113,6 +113,7 @@ class EvaluationPackage:
         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)
 
+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",

@@ -129,6 +130,7 @@ class EvaluationPackage:
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
         )
+        # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",

@@ -470,6 +472,7 @@ class EvaluationPackage:
             if message.event == EventTypes.message_created
             and message.role == "assistant"
         ]
+
         keyword_semantic_list = []
         for message in assistant_responses:
             for goal_detail in self.text_list:

@@ -478,7 +481,9 @@ class EvaluationPackage:
                     message.content, goal_detail.keywords
                 )
                 semantic_match: bool = self.matcher.semantic_match(
-
+                    self.messages[0].content,
+                    prediction=message.content,
+                    ground_truth=goal_detail.response,
                 )
                 keyword_semantic_match = KeywordSemanticSearchMetric(
                     keyword_match=keyword_match,
```
```diff
@@ -3,21 +3,15 @@ import os
 import time
 from collections import deque
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping,
+from typing import Any, Dict, Generator, List, Mapping, Tuple
 
 import requests
 import rich
-import urllib3
 import yaml
 from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import (
-    get_env_settings,
-    tenant_setup,
-)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )

@@ -36,6 +30,7 @@ from wxo_agentic_evaluation.utils.utils import (
     is_saas_url,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 tokenizer = Tokenizer()
 

@@ -82,63 +77,6 @@ class CallTracker(BaseModel):
     generic: List = []
 
 
-class WXOClient:
-    def __init__(
-        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
-    ):
-        self.service_url = service_url
-        self.api_key = api_key
-
-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (
-                env.get("bypass_ssl") if env else None
-            )
-            self._verify_ssl = (
-                False
-                if (
-                    (bs is True)
-                    or (isinstance(bs, str) and bs.strip().lower() == "true")
-                    or (v is None)
-                    or (
-                        isinstance(v, str)
-                        and v.strip().lower() in {"none", "null"}
-                    )
-                )
-                else (v if isinstance(v, bool) else True)
-            )
-
-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )
-
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
-
-
 class WXOInferenceBackend:
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
```
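The `WXOClient` removed here is relocated rather than deleted: the file summary shows a new `wxo_agentic_evaluation/wxo_client.py` (+80 lines), and `main.py` and `quick_eval.py` now import `get_wxo_client` from it. A rough usage sketch of the relocated client, using the constructor and `get` signature visible in the removed code; the URL, key, and request path below are placeholders:

```python
# Sketch only: service URL, API key, and request path are placeholder values.
from wxo_agentic_evaluation.wxo_client import WXOClient

client = WXOClient(service_url="https://wxo.example.com", api_key="***")
resp = client.get("v1/orchestrate/agents", params={"limit": 5})  # hypothetical path
print(resp.status_code)
```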
```diff
@@ -721,6 +659,19 @@ class EvaluationController:
                     message.content,
                 )
 
+                # hook for subclasses
+                if self._post_message_hook(
+                    task_n=task_n,
+                    step=step,
+                    message=message,
+                    conversation_history=conversation_history,
+                ):
+                    return (
+                        conversation_history,
+                        call_tracker,
+                        conversational_search_history_data,
+                    )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(
                 conversational_search_data

@@ -733,6 +684,13 @@ class EvaluationController:
             conversational_search_history_data,
         )
 
+    def _post_message_hook(self, **kwargs) -> bool:
+        """
+        Hook for subclasses to extend behavior.
+        Return True to break the loop early.
+        """
+        return False
+
     def _is_looping(self, messages: deque) -> bool:
         """Checks whether the user or assistant is stuck in a loop.
         Args:

@@ -786,21 +744,30 @@ class EvaluationController:
 
         return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
-
-def
-
-
-
-
-
-
-
-
-
-
-
-
+class AttackEvaluationController(EvaluationController):
+    def __init__(self, *args, attack_data=None, attack_evaluator=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.attack_data = attack_data
+        self.attack_evaluator = attack_evaluator
+
+    def _post_message_hook(self, task_n, step, message, conversation_history) -> bool:
+        """Override hook to add live attack evaluation."""
+        if self.attack_evaluator and self.attack_data:
+            success = self.attack_evaluator.evaluate(
+                self.attack_data, conversation_history + [message]
+            )
+            if success:
+                rich.print(
+                    f"[bold green]Attack for [Task-{task_n}] succeeded early at step {step}! Stopping simulation.[/bold green]"
+                )
+                # persist the live result so the aggregator can pick it up later
+                try:
+                    self.attack_evaluator.save_evaluation_result(self.attack_data, True)
+                except Exception:
+                    pass
+                conversation_history.append(message)
+                return True
+        return False
 
 
 if __name__ == "__main__":
```
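`AttackEvaluationController` is the first consumer of the new hook, but any controller subclass can use `_post_message_hook` to short-circuit the simulation. A hypothetical sketch of another override (the `EvaluationController` constructor arguments are not shown in this diff, so only the hook is illustrated):

```python
# Hypothetical subclass: stop the simulated conversation once the assistant
# returns an empty message. Returning True tells the base loop to stop early.
from wxo_agentic_evaluation.inference_backend import EvaluationController

class StopOnEmptyReplyController(EvaluationController):
    def _post_message_hook(self, task_n, step, message, conversation_history) -> bool:
        if not (message.content or "").strip():
            conversation_history.append(message)
            return True
        return False
```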
```diff
@@ -31,9 +31,21 @@ class LLMMatcher:
         result = output.strip().lower()
         return result.startswith("true")
 
-    def semantic_match(
+    def semantic_match(
+        self, context: str, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
```
wxo_agentic_evaluation/main.py CHANGED

```diff
@@ -21,8 +21,8 @@ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
```
```diff
@@ -0,0 +1 @@
+from wxo_agentic_evaluation.metrics.metrics import FailedSemanticTestCases, FailedStaticTestCases, Annotation
```
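With the new package `__init__`, the shared result models can be imported from `wxo_agentic_evaluation.metrics` directly instead of reaching into `metrics.metrics`:

```python
# Both import paths resolve to the same classes after this change.
from wxo_agentic_evaluation.metrics import Annotation, FailedSemanticTestCases, FailedStaticTestCases
from wxo_agentic_evaluation.metrics.metrics import Annotation as AnnotationDirect

assert Annotation is AnnotationDirect
```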
```diff
@@ -53,8 +53,9 @@ class AnswerDerailment(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement":
+            "statement": self.statement,
             "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
         }
 
 

@@ -65,7 +66,7 @@ class AnswerUnsafeTopic(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement":
+            "statement": self.statement,
             "reason": self.reason,
-            "
+            "safe_topic_score": str(self.is_safe),
         }
```
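After the fix, both judge metrics emit a complete row. The dicts below sketch the shape of the repaired `table()` output; the keys come from the diff, the values are invented for illustration:

```python
# Illustrative values only; keys mirror the fixed table() methods above.
derailment_row = {
    "statement": "Tell me a joke about my manager.",
    "reason": "The request is unrelated to the agent's configured tasks.",
    "on_topic_score": "False",    # str(self.in_scope)
}
unsafe_topic_row = {
    "statement": "How do I bypass the company firewall?",
    "reason": "The topic is disallowed by policy.",
    "safe_topic_score": "False",  # str(self.is_safe)
}
print(derailment_row, unsafe_topic_row)
```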
```diff
@@ -1,6 +1,6 @@
 import math
-from enum import Enum
 from typing import Any, List, Mapping, Optional, Tuple
+from enum import Enum, StrEnum
 
 from pydantic import BaseModel, computed_field
 

@@ -18,6 +18,33 @@ def average(array):
     else:
         return sum(array) / len(array)
 
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
 
 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
```
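The computed fields make the three quality states easy to check in isolation; a small sketch (the tool name, scores, and threshold below are placeholders):

```python
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric

# No score recorded -> MISSING (what the inspector returns when tool_description is None).
print(DescriptionQualityMetric(tool_name="get_weather").description_quality)  # MISSING

# Score at or above the threshold -> BAD, otherwise GOOD.
print(DescriptionQualityMetric(tool_name="get_weather", description_score=0.9, threshold=0.7).description_quality)  # BAD
print(DescriptionQualityMetric(tool_name="get_weather", description_score=0.3, threshold=0.7).description_quality)  # GOOD
```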
```diff
@@ -175,6 +202,12 @@ class ToolCallAndRoutingMetrics(BaseModel):
     )
 
 
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str

@@ -187,6 +220,15 @@ class FailedSemanticTestCases(BaseModel):
     explanation: str
     output: int
     confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
 
 
 class ReferenceLessEvalMetrics(BaseModel):
```
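A quick sketch of the new `Annotation` model that can now be attached to failed semantic test cases; the field values are illustrative, only the field names come from the model above:

```python
from wxo_agentic_evaluation.metrics.metrics import Annotation

note = Annotation(
    recommendation="Spell out the expected date format in the tool description.",
    details="The agent passed '01/02/2024' where the tool expects an ISO 8601 date.",
    quote="start_date='01/02/2024'",
    parameter_name="start_date",
)
print(note.model_dump())
```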
```diff
@@ -1,13 +1,13 @@
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently, given a context of the situation.
 
 Key evaluation principles:
-1. Focus on whether the core information and outcome is the same
-2. Different phrasings that convey the same result should be considered equivalent
-3.
-4.
-5.
-6.
+1. Focus on whether the core information and outcome is the same.
+2. Different phrasings that convey the same result should be considered equivalent.
+3. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs.
+4. When specific values (e.g. IDs, dates, amounts, names) appear in both texts, they must match exactly. If they appear only in one text but the other text doesn’t contradict them, consider it equivalent.
+5. Reference IDs that are system-generated (e.g. item IDs, request IDs, confirmation numbers, UUIDs) should be ignored when checking for equivalence.
+6. When checking query results like lists or tables, differences in field values, and rows are acceptable as long as the same entities or items are represented and the query intent, data type, and structure remain the same.
 
 Respond ONLY with:
 - True: if the texts convey the same essential information and outcomes

@@ -20,16 +20,30 @@ DO NOT provide explanations or commentary - only respond with "True" or "False"
 Evaluate the following examples:
 
 ### Example 1
+Context:
+Get me a list of all active machines.
+
 Expected:
-
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 43 | NNM1 | | active |
+| 01 | XYZ2 | | active |
+| 44 | RRX | | active |
 
 Actual:
-
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 1280 | ABC | | active |
 
 Answer:
 True
 
 ### Example 2
+Context:
+Give me information about Ontario.
+
 Expected:
 Ontario is a province in Canada.
 

@@ -40,6 +54,9 @@ Answer:
 False
 
 ### Example 3
+Context:
+Find payslip details for user 12345.
+
 Expected:
 No payslips found for user with ID 12345.
 

@@ -50,6 +67,9 @@ Answer:
 True
 
 ### Example 4
+Context:
+I'd like to create a new time off request.
+
 Expected:
 Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.
 

@@ -60,6 +80,9 @@ Answer:
 True
 
 ### Example 5
+Context:
+What's my compensation details?
+
 Expected:
 Your compensation details are as follows:
 * Currency: USD

@@ -72,6 +95,9 @@ Answer:
 True
 
 ### Example 6
+Context:
+Show me my visa details.
+
 Expected:
 Your visa details are as follows:
 - Country: 44

@@ -88,6 +114,9 @@ Answer:
 False
 
 ### Example 7
+Context:
+Update my preferred name and my starting date.
+
 Expected:
 I successfully updated your personal information.
 

@@ -101,6 +130,9 @@ True
 
 ### Now, evaluate the following:
 
+Context:
+{{ context }}
+
 Expected:
 {{ expected_text }}
 
```
```diff
@@ -45,9 +45,11 @@ class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
 
 
 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, expected_text: str, actual_text: str) -> str:
+    def render(self, context: str, expected_text: str, actual_text: str) -> str:
         return super().render(
-
+            context=context,
+            expected_text=expected_text,
+            actual_text=actual_text,
         )
 
 
```
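A sketch of rendering the updated semantic-matching prompt with the new `context` argument. It assumes the renderer is constructed from the prompt template path, mirroring how `evaluation_package.py` passes `SEMANTIC_MATCHING_PROMPT_PATH`; that constructor detail is not shown in this hunk, so treat the path below as a placeholder:

```python
# Assumption: JinjaTemplateRenderer subclasses take the template path at construction.
from wxo_agentic_evaluation.prompt.template_render import SemanticMatchingTemplateRenderer

renderer = SemanticMatchingTemplateRenderer("prompt/semantic_matching_prompt.jinja2")  # placeholder path
prompt = renderer.render(
    context="Get me a list of all active machines.",
    expected_text="Here are all the active machines: ...",
    actual_text="The currently active machines are listed below: ...",
)
print(prompt[:200])
```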
```diff
@@ -14,8 +14,8 @@ from wxo_agentic_evaluation.arg_configs import QuickEvalConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
     FailedSemanticTestCases,

@@ -115,14 +115,16 @@ class QuickEvalController(EvaluationController):
     ) -> Tuple[ReferenceLessEvalMetrics, List[ExtendedMessage]]:
         # run reference-less evaluation
         rich.print(f"[b][Task-{task_n}] Starting Quick Evaluation")
+        processed_data = ReferencelessEvaluation.fmt_msgs_referenceless(
+            messages
+        )
         te = ReferencelessEvaluation(
             tools,
-            messages,
             MODEL_ID,
             task_n,
             self.test_case_name,
         )
-        referenceless_results = te.run()
+        referenceless_results = te.run(examples=processed_data)
         rich.print(f"[b][Task-{task_n}] Finished Quick Evaluation")
 
         summary_metrics = self.compute_metrics(referenceless_results)

@@ -167,13 +169,13 @@
 
             extended_messages.append(extended_message)
 
-        # return summary_metrics, referenceless_results
         return summary_metrics, extended_messages
 
     def failed_static_metrics_for_tool_call(
         self, static_metrics: Mapping[str, Mapping[str, Any]]
     ) -> Optional[List[FailedStaticTestCases]]:
         """
+        # TODO: in future PR, use the ReferencelessParser library
         static.metrics
         """
 

@@ -195,6 +197,7 @@ class QuickEvalController(EvaluationController):
         self, semantic_metrics: Mapping[str, Mapping[str, Any]]
     ) -> Optional[List[FailedSemanticTestCases]]:
         """
+        # TODO: in future PR, use the ReferencelessParser library
         semantic.general
         semantic.function_selection
 

@@ -257,11 +260,6 @@ class QuickEvalController(EvaluationController):
             []
         )  # keep track of tool calls that failed for semantic reason
 
-        from pprint import pprint
-
-        # pprint("quick eval results: ")
-        # pprint(quick_eval_results)
-
         for tool_call_idx, result in enumerate(quick_eval_results):
             static_passed = result.get("static", {}).get(
                 "final_decision", False
```
```diff
@@ -15,11 +15,8 @@ from wxo_agentic_evaluation.arg_configs import (
     KeywordsGenerationConfig,
 )
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
-from wxo_agentic_evaluation.inference_backend import
-
-    WXOInferenceBackend,
-    get_wxo_client,
-)
+from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
 from wxo_agentic_evaluation.prompt.template_render import (
     StoryGenerationTemplateRenderer,
 )
```