ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (49)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
  3. wxo_agentic_evaluation/analyze_run.py +822 -344
  4. wxo_agentic_evaluation/arg_configs.py +39 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +29 -4
  7. wxo_agentic_evaluation/evaluation_package.py +197 -18
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +105 -108
  11. wxo_agentic_evaluation/llm_matching.py +104 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -0
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  17. wxo_agentic_evaluation/metrics/metrics.py +64 -1
  18. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  19. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  20. wxo_agentic_evaluation/prompt/template_render.py +20 -2
  21. wxo_agentic_evaluation/quick_eval.py +23 -11
  22. wxo_agentic_evaluation/record_chat.py +18 -10
  23. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
  24. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  25. wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
  26. wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  30. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
  31. wxo_agentic_evaluation/resource_map.py +3 -1
  32. wxo_agentic_evaluation/service_instance.py +12 -3
  33. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  34. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  35. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  36. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  37. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  38. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  39. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  40. wxo_agentic_evaluation/type.py +15 -5
  41. wxo_agentic_evaluation/utils/__init__.py +44 -3
  42. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  43. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  44. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  45. wxo_agentic_evaluation/utils/parsers.py +71 -0
  46. wxo_agentic_evaluation/utils/utils.py +140 -20
  47. wxo_agentic_evaluation/wxo_client.py +81 -0
  48. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  49. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/inference_backend.py

@@ -3,21 +3,15 @@ import os
 import time
 from collections import deque
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Generator, List, Mapping, Tuple

 import requests
 import rich
-import urllib3
 import yaml
 from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning

 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import (
-    get_env_settings,
-    tenant_setup,
-)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -36,6 +30,7 @@ from wxo_agentic_evaluation.utils.utils import (
     is_saas_url,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import WXOClient

 tokenizer = Tokenizer()

@@ -76,67 +71,40 @@ def is_transfer_response(step_detail: Dict):
     return False


-class CallTracker(BaseModel):
-    tool_call: List = []
-    tool_response: List = []
-    generic: List = []
-
+def _generate_user_input(
+    user_turn: int,
+    story: str,
+    conversation_history: list[Message],
+    llm_user: LLMUser,
+    enable_manual_user_input: bool = False,
+    starting_user_input: str | None = None,
+    attack_instructions: str | None = None,
+) -> Message:
+    """Generates the user input for the current turn."""
+
+    if user_turn == 0 and starting_user_input is not None:
+        return Message(
+            role="user",
+            content=starting_user_input,
+            type=ContentType.text,
+        )

-class WXOClient:
-    def __init__(
-        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
-    ):
-        self.service_url = service_url
-        self.api_key = api_key
+    if enable_manual_user_input:
+        content = input("[medium_orchid1]Enter your input[/medium_orchid1] ✍️: ")
+        return Message(role="user", content=content, type=ContentType.text)

-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (
-                env.get("bypass_ssl") if env else None
-            )
-            self._verify_ssl = (
-                False
-                if (
-                    (bs is True)
-                    or (isinstance(bs, str) and bs.strip().lower() == "true")
-                    or (v is None)
-                    or (
-                        isinstance(v, str)
-                        and v.strip().lower() in {"none", "null"}
-                    )
-                )
-                else (v if isinstance(v, bool) else True)
-            )
+    # llm generated user input
+    return llm_user.generate_user_input(
+        story,
+        conversation_history,
+        attack_instructions=attack_instructions,
+    )

-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )

-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
+class CallTracker(BaseModel):
+    tool_call: List = []
+    tool_response: List = []
+    generic: List = []


 class WXOInferenceBackend:
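
The HTTP client removed here corresponds to the new wxo_agentic_evaluation/wxo_client.py module (+81 lines in the file list above), and the per-turn user-input selection is now the module-level _generate_user_input helper. A minimal usage sketch of that helper, assuming a configured LLMUser instance; the story and opener strings are invented for illustration:

    # Sketch only: llm_user is assumed to be a configured LLMUser.
    opener = _generate_user_input(
        user_turn=0,
        story="Employee wants to check their remaining PTO",
        conversation_history=[],
        llm_user=llm_user,
        starting_user_input="How much PTO do I have left?",
    )
    # Turn 0 with a scripted opener short-circuits to that message;
    # later turns fall back to manual input or the simulated LLM user.
    assert opener.content == "How much PTO do I have left?"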
@@ -273,7 +241,6 @@ class WXOInferenceBackend:

         start_time = time.time()
         for chunk in self._stream_events(user_input, agent_name, thread_id):
-
             event = chunk.get("event", "")
             if _thread_id := chunk.get("data", {}).get("thread_id"):
                 thread_id = _thread_id
@@ -484,7 +451,6 @@ class WXOInferenceBackend:

         messages = []
         for entry in result:
-
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:
@@ -613,7 +579,6 @@ class WXOInferenceBackend:


 class EvaluationController:
-
     MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
     MESSAGE_SIMILARITY_THRESHOLD = float(
         os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
@@ -647,37 +612,32 @@ class EvaluationController:
         task_n,
         story,
         agent_name: str,
-        starting_user_input: str = None,
-        attack_instructions: str = None,
+        starting_user_input: str | None = None,
+        attack_instructions: str | None = None,
+        max_user_turns: int | None = None,
     ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
-        step = 0
         thread_id = None
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()

-        # make this configurable
-        while step < self.MAX_CONVERSATION_STEPS:
-            if step == 0 and starting_user_input:
-                user_input = Message(
-                    role="user",
-                    content=starting_user_input,
-                    type=ContentType.text,
-                )
-            else:
-                if self.config.enable_manual_user_input == True:
-                    content = input(
-                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
-                    )
-                    user_input = Message(
-                        role="user", content=content, type=ContentType.text
-                    )
-                else: # llm
-                    user_input = self.llm_user.generate_user_input(
-                        story,
-                        conversation_history,
-                        attack_instructions=attack_instructions,
-                    )
+        max_turns = (
+            self.MAX_CONVERSATION_STEPS
+            if max_user_turns is None
+            else max_user_turns
+        )
+
+        for user_turn in range(max_turns):
+            user_input = _generate_user_input(
+                user_turn=user_turn,
+                story=story,
+                conversation_history=conversation_history,
+                llm_user=self.llm_user,
+                enable_manual_user_input=self.config.enable_manual_user_input,
+                starting_user_input=starting_user_input,
+                attack_instructions=attack_instructions,
+            )
+
             if self.config.enable_verbose_logging:
                 rich.print(
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
@@ -721,18 +681,37 @@ class EvaluationController:
                     message.content,
                 )

+            # hook for subclasses
+            if self._post_message_hook(
+                task_n=task_n,
+                step=user_turn,
+                message=message,
+                conversation_history=conversation_history,
+            ):
+                return (
+                    conversation_history,
+                    call_tracker,
+                    conversational_search_history_data,
+                )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(
                 conversational_search_data
             )

-            step += 1
         return (
             conversation_history,
             call_tracker,
             conversational_search_history_data,
         )

+    def _post_message_hook(self, **kwargs) -> bool:
+        """
+        Hook for subclasses to extend behavior.
+        Return True to break the loop early.
+        """
+        return False
+
     def _is_looping(self, messages: deque) -> bool:
         """Checks whether the user or assistant is stuck in a loop.
         Args:
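
The removed step counter is subsumed by the for user_turn in range(max_turns) loop, and _post_message_hook gives subclasses a single early-exit point. A hypothetical subclass sketch showing the hook contract (the class name and stop condition are invented; the keyword arguments mirror the call site above):

    class StopOnRefusalController(EvaluationController):
        def _post_message_hook(
            self, task_n, step, message, conversation_history
        ) -> bool:
            # Returning True makes the simulation loop return immediately.
            return "cannot help with that" in (message.content or "").lower()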
@@ -784,23 +763,40 @@ class EvaluationController:
         ):
             return True

-        return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
-
+        # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
+        return False


-def get_wxo_client(
-    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
-) -> WXOClient:

-    token, resolved_url, env = tenant_setup(service_url, tenant_name)
-    service_url = service_url or resolved_url
-
-    if not (service_url and str(service_url).strip()):
-        raise ValueError(
-            f"service_url not provided and not found in config for tenant '{tenant_name}'"
-        )
-
-    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
-    return wxo_client
+class AttackEvaluationController(EvaluationController):
+    def __init__(
+        self, *args, attack_data=None, attack_evaluator=None, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.attack_data = attack_data
+        self.attack_evaluator = attack_evaluator
+
+    def _post_message_hook(
+        self, task_n, step, message, conversation_history
+    ) -> bool:
+        """Override hook to add live attack evaluation."""
+        if self.attack_evaluator and self.attack_data:
+            success = self.attack_evaluator.evaluate(
+                self.attack_data, conversation_history + [message]
+            )
+            if success:
+                rich.print(
+                    f"[bold green]Attack for [Task-{task_n}] succeeded early at step {step}! Stopping simulation.[/bold green]"
+                )
+                # persist the live result so the aggregator can pick it up later
+                try:
+                    self.attack_evaluator.save_evaluation_result(
+                        self.attack_data, True
+                    )
+                except Exception:
+                    pass
+                conversation_history.append(message)
+                return True
+        return False


 if __name__ == "__main__":
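
AttackEvaluationController layers live attack evaluation onto the base controller entirely through that hook. A hedged construction sketch; EvaluationController's own constructor arguments are not shown in this diff, so they are passed through opaquely, and attack_case/evaluator are invented names:

    controller = AttackEvaluationController(
        *base_controller_args,       # whatever EvaluationController takes
        attack_data=attack_case,     # invented name
        attack_evaluator=evaluator,  # invented name
    )
    # During the conversation loop, the overridden _post_message_hook
    # checks each assistant message and stops as soon as the attack lands.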
@@ -810,6 +806,7 @@ if __name__ == "__main__":
     )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
+
     tenant_name = "local"
     token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]

wxo_agentic_evaluation/llm_matching.py

@@ -1,10 +1,22 @@
+"""
+LLM Matching Module with Cosine Similarity Support
+
+This module provides functionality for matching text using:
+1. LLM-based matching (using a language model to determine semantic equivalence)
+2. Embedding-based matching (using cosine similarity between text embeddings)
+"""
+
+import math
 from typing import List

+from fuzzywuzzy import fuzz
+
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.utils.utils import safe_divide


 class LLMMatcher:
@@ -13,10 +25,18 @@ class LLMMatcher:
         llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
+        use_llm_for_semantic: bool = True,
+        embedding_model_id: str = "sentence-transformers/all-minilm-l6-v2",
+        similarity_threshold: float = 0.8,
+        enable_fuzzy_matching: bool = False,
     ):
         self.llm_client = llm_client
         self.keyword_template = keyword_template
         self.semantic_template = semantic_template
+        self.embedding_model_id = embedding_model_id
+        self.use_llm_for_semantic = use_llm_for_semantic
+        self.similarity_threshold = similarity_threshold
+        self.enable_fuzzy_matching = enable_fuzzy_matching

     def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
         if len(keywords) == 0:
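
The LLMMatcher constructor now exposes the matching strategy as keyword arguments. A hedged construction sketch; the provider and template renderer instances are assumed to come from the package's existing setup code:

    matcher = LLMMatcher(
        llm_client=provider,                 # a Provider instance
        keyword_template=keyword_renderer,
        semantic_template=semantic_renderer,
        use_llm_for_semantic=True,           # try the LLM judge first
        similarity_threshold=0.8,            # cutoff for the cosine fallback
        enable_fuzzy_matching=False,         # opt-in fuzz.WRatio fallback
    )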
@@ -31,10 +51,92 @@ class LLMMatcher:
         result = output.strip().lower()
         return result.startswith("true")

-    def semantic_match(self, prediction: str, ground_truth: str) -> bool:
+    def generate_embeddings(
+        self, prediction: str, ground_truth: str
+    ) -> List[List[float]]:
+
+        embeddings = self.llm_client.encode([prediction, ground_truth])
+
+        return embeddings
+
+    def compute_cosine_similarity(
+        self, vec1: List[float], vec2: List[float]
+    ) -> float:
+        """Calculate cosine similarity between two vectors using pure Python"""
+
+        # Manual dot product calculation
+        dot_product = sum(a * b for a, b in zip(vec1, vec2))
+
+        # Manual magnitude calculations
+        magnitude1 = math.sqrt(sum(a * a for a in vec1))
+        magnitude2 = math.sqrt(sum(b * b for b in vec2))
+
+        return safe_divide(dot_product, (magnitude1 * magnitude2))
+
+    def cosine_similarity_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+        embeddings = self.generate_embeddings(prediction, ground_truth)
+        cosine_similarity = self.compute_cosine_similarity(
+            embeddings[0], embeddings[1]
+        )
+
+        return cosine_similarity >= self.similarity_threshold
+
+    def llm_semantic_match(
+        self, context, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
+
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
+
         return result.startswith("true")
+
+    def fuzzywuzzy_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+
+        similarity_score = fuzz.WRatio(prediction, ground_truth)
+
+        return similarity_score > self.similarity_threshold
+
+    def semantic_match(
+        self,
+        context: str,
+        prediction: str,
+        ground_truth: str,
+        enable_fuzzy_matching: bool = False,
+    ) -> bool:
+        ## TODO arjun-gupta1 10/06/2025: revist retry with exponential backoff. Opted for direct fallback to cosine similarity to avoid latency for now.
+        try:
+            return self.llm_semantic_match(context, prediction, ground_truth)
+        except Exception as e:
+            print(f"LLM semantic match failed: {e}")
+
+        if enable_fuzzy_matching:
+            print("falling back to fuzzy matching")
+        # Fallback to cosine similarity if LLM matching is not used or failed
+        try:
+            return self.cosine_similarity_semantic_match(
+                prediction, ground_truth
+            )
+        except Exception as e:
+            print(
+                f"Cosine similarity failed: {e}. Falling back to fuzzywuzzy."
+            )
+
+        # Final fallback to fuzzywuzzy
+        return self.fuzzywuzzy_semantic_match(prediction, ground_truth)
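
compute_cosine_similarity needs no numpy; the same arithmetic in a self-contained sketch (the vectors are invented for illustration; real inputs come from llm_client.encode):

    import math

    def cosine(vec1, vec2):
        # Same arithmetic as LLMMatcher.compute_cosine_similarity.
        dot = sum(a * b for a, b in zip(vec1, vec2))
        mag1 = math.sqrt(sum(a * a for a in vec1))
        mag2 = math.sqrt(sum(b * b for b in vec2))
        return dot / (mag1 * mag2) if mag1 and mag2 else 0.0

    print(cosine([1.0, 2.0], [2.0, 4.0]))  # 1.0: same direction
    print(cosine([1.0, 0.0], [0.0, 1.0]))  # 0.0: orthogonal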
wxo_agentic_evaluation/llm_user.py

@@ -21,8 +21,8 @@ class LLMUser:
         self,
         user_story,
         conversation_history: List[Message],
-        attack_instructions: str = None,
-    ) -> Message | None:
+        attack_instructions: str | None = None,
+    ) -> Message:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
             conversation_history=[