PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (61) hide show

wxo_agentic_evaluation/external_agent/types.py CHANGED Viewed

@@ -1,5 +1,6 @@
+from typing import Any, List, Literal, Mapping, Union
 from pydantic import BaseModel
-from typing import List, Union, Literal, Mapping, Any
 class ThinkingStepDetails(BaseModel):
@@ -25,7 +26,9 @@ class ToolResponseStepDetails(BaseModel):
     tool_call_id: str
-StepDetails = Union[ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails]
+StepDetails = Union[
+    ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails
+]
 class DeltaMessageChoice(BaseModel):
@@ -59,8 +62,12 @@ class ThreadRunStepDeltaData(BaseEventData):
 class UniversalData(BaseEventData):
-    object: Union[Literal["thread.message.delta"], Literal["thread.run.step.delta"],
-                  Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
+    object: Union[
+        Literal["thread.message.delta"],
+        Literal["thread.run.step.delta"],
+        Literal["thread.run.step.created"],
+        Literal["thread.run.step.completed"],
+    ]
     choices: List[ThreadMessageDeltaChoice]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]
@@ -68,4 +75,4 @@ class UniversalData(BaseEventData):
 class SchemaValidationResults(BaseModel):
     success: bool
     logged_events: List[str]
-    messages: List[Mapping[Any, Any]]
+    messages: List[Mapping[Any, Any]]

wxo_agentic_evaluation/inference_backend.py CHANGED Viewed

@@ -1,61 +1,68 @@
-import requests
-import os
-import yaml
 import json
-import rich
+import os
 import time
-from pydantic import BaseModel
-from typing import List, Generator, Dict, Tuple, Mapping, Any
-from enum import Enum
 from collections import deque
+import urllib3
+from urllib3.exceptions import InsecureRequestWarning
+from enum import Enum
+from typing import Any, Dict, Generator, List, Mapping, Tuple
+import requests
+import rich
+import yaml
+from pydantic import BaseModel
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)
 from wxo_agentic_evaluation.type import (
     ContentType,
-    Message,
+    ConversationalConfidenceThresholdScore,
     ConversationalSearch,
     ConversationalSearchCitations,
     ConversationalSearchResultMetadata,
-    ConversationalConfidenceThresholdScore,
     ConversationalSearchResults,
     ConversationSearchMetadata,
+    Message,
 )
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
-from wxo_agentic_evaluation.arg_configs import TestConfig
-from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import (
+    Tokenizer,
     is_saas_url,
     safe_divide,
-    Tokenizer
 )
 tokenizer = Tokenizer()
 class Roles(Enum):
     ASSISTANT = "assistant"
     USER = "user"
-def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+def calculate_word_overlap_similarity_score(
+    first_message_text: str, second_message_text: str
+) -> float:
     """Calculate the word overlap similarity score between the .content field of two Message objects.
     Args:
         first_message_text (str): The .content field of the first message.
         second_message_text (str): The .content field of the second message.
     """
     words_in_first_message = tokenizer(first_message_text)
     words_in_second_message = tokenizer(second_message_text)
     # Calculate the number of common words
     common_words = set(words_in_first_message) & set(words_in_second_message)
     unique_words = set(words_in_first_message + words_in_second_message)
     unique_words_count = len(unique_words)
     common_words_count = len(common_words)
-    return safe_divide(
-        common_words_count,
-        unique_words_count
-    )
+    return safe_divide(common_words_count, unique_words_count)
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
@@ -77,6 +84,13 @@ class WXOClient:
         self.service_url = service_url
         self.api_key = api_key
+        env_ssl_verify = os.getenv("WO_SSL_VERIFY", "true")
+        verify = isinstance(env_ssl_verify, str) and env_ssl_verify.strip().lower() == "true"
+        self._verify_ssl = verify
+        if not self._verify_ssl:
+            urllib3.disable_warnings(InsecureRequestWarning)
     def _get_headers(self) -> dict:
         headers = {}
         if self.api_key:
@@ -86,12 +100,12 @@ class WXOClient:
     def post(self, payload: dict, path: str, stream=False):
         url = f"{self.service_url}/{path}"
         return requests.post(
-            url=url, headers=self._get_headers(), json=payload, stream=stream
+            url=url, headers=self._get_headers(), json=payload, stream=stream, verify=self._verify_ssl
         )
     def get(self, path: str, params: dict = None):
         url = f"{self.service_url}/{path}"
-        return requests.get(url, params=params, headers=self._get_headers())
+        return requests.get(url, params=params, headers=self._get_headers(), verify=self._verify_ssl)
 class WXOInferenceBackend:
@@ -135,7 +149,9 @@ class WXOInferenceBackend:
         else:
             path = "v1/orchestrate/runs?stream=true"
-        response: requests.Response = self.wxo_client.post(payload, path, stream=True)
+        response: requests.Response = self.wxo_client.post(
+            payload, path, stream=True
+        )
         import json
         for chunk in self._parse_events(response):
@@ -188,7 +204,9 @@ class WXOInferenceBackend:
         citations = parse_citations()
         retrieval_context = parsed_search_results()
         citations_title = conversational_search.get("citations_title", "")
-        response_length_option = conversational_search.get("response_length_option", "")
+        response_length_option = conversational_search.get(
+            "response_length_option", ""
+        )
         text = conversational_search.get("text", "")
         confidence_scores = ConversationalConfidenceThresholdScore(
@@ -261,7 +279,9 @@ class WXOInferenceBackend:
                                         )
                                     )
                                     end_time = time.time()
-                                    call_tracker.tool_call.append(end_time - start_time)
+                                    call_tracker.tool_call.append(
+                                        end_time - start_time
+                                    )
                                     start_time = end_time
                             elif step_detail["type"] == "tool_call":
                                 # in step details, we could have [tool_response, tool_call]
@@ -279,7 +299,9 @@ class WXOInferenceBackend:
                                     )
                                 )
                                 end_time = time.time()
-                                call_tracker.tool_call.append(end_time - start_time)
+                                call_tracker.tool_call.append(
+                                    end_time - start_time
+                                )
                                 start_time = end_time
                             elif step_detail["type"] == "tool_response":
                                 content = json.dumps(step_detail)
@@ -293,7 +315,9 @@ class WXOInferenceBackend:
                                     )
                                 )
                                 end_time = time.time()
-                                call_tracker.tool_response.append(end_time - start_time)
+                                call_tracker.tool_response.append(
+                                    end_time - start_time
+                                )
                                 start_time = end_time
                     elif content_field := delta.get("content"):
                         for val in content_field:
@@ -312,7 +336,9 @@ class WXOInferenceBackend:
                                     chunk=event,
                                 )
                                 end_time = time.time()
-                                call_tracker.generic.append(end_time - start_time)
+                                call_tracker.generic.append(
+                                    end_time - start_time
+                                )
                                 start_time = end_time
                 # NOTE: The event here that is parsed is part of the "message.created" event
@@ -336,10 +362,14 @@ class WXOInferenceBackend:
                             """
                             last_message = json.loads(messages[-1].content)
-                            tool_call_id = last_message.get("tool_call_id", None)
+                            tool_call_id = last_message.get(
+                                "tool_call_id", None
+                            )
                             assert tool_call_id is not None
-                            conversational_search_metadata = ConversationSearchMetadata(
-                                tool_call_id=tool_call_id
+                            conversational_search_metadata = (
+                                ConversationSearchMetadata(
+                                    tool_call_id=tool_call_id
+                                )
                             )
                             conversational_search = (
                                 self.parse_conversational_search_response(
@@ -347,7 +377,9 @@ class WXOInferenceBackend:
                                     metadata=conversational_search_metadata,
                                 )
                             )
-                            conversational_search_data.append(conversational_search)
+                            conversational_search_data.append(
+                                conversational_search
+                            )
                             messages.append(
                                 Message(
                                     role=role,
@@ -436,7 +468,10 @@ class WXOInferenceBackend:
                                     content = json.dumps(tool_json)
                                     # TO-DO: review do we even need the get messages for retry loop anymore?
                                     if msg_content := entry.get("content"):
-                                        if msg_content[0].get("response_type") == "conversational_search":
+                                        if (
+                                            msg_content[0].get("response_type")
+                                            == "conversational_search"
+                                        ):
                                             continue
                                     messages.append(
                                         Message(
@@ -451,7 +486,9 @@ class WXOInferenceBackend:
                                 content = json.dumps(step_detail)
                                 messages.append(
                                     Message(
-                                        role=role, content=content, type=content_type
+                                        role=role,
+                                        content=content,
+                                        type=content_type,
                                     )
                                 )
                             else:
@@ -459,7 +496,9 @@ class WXOInferenceBackend:
                                 content_type = ContentType.tool_response
                                 messages.append(
                                     Message(
-                                        role=role, content=content, type=content_type
+                                        role=role,
+                                        content=content,
+                                        type=content_type,
                                     )
                                 )
             if content_field := entry.get("content"):
@@ -468,12 +507,19 @@ class WXOInferenceBackend:
                     if val["response_type"] == ContentType.text:
                         messages.append(
                             Message(
-                                role=role, content=val["text"], type=ContentType.text
+                                role=role,
+                                content=val["text"],
+                                type=ContentType.text,
                             )
                         )
-                    if val["response_type"] == ContentType.conversational_search:
-                        conversational_search_metadata = ConversationSearchMetadata(
-                            tool_call_id=tool_call_id
+                    if (
+                        val["response_type"]
+                        == ContentType.conversational_search
+                    ):
+                        conversational_search_metadata = (
+                            ConversationSearchMetadata(
+                                tool_call_id=tool_call_id
+                            )
                         )
                         messages.append(
                             Message(
@@ -538,8 +584,12 @@ class WXOInferenceBackend:
 class EvaluationController:
     MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
-    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98))  # if any two consecutive messages are >98% similar, the inference loop will be terminated
-    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+    MESSAGE_SIMILARITY_THRESHOLD = float(
+        os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
+    )  # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(
+        os.getenv("MAX_REPEATING_MESSAGES", 3)
+    )  # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
     def __init__(
         self,
@@ -554,11 +604,20 @@ class EvaluationController:
         if self.repeating_output_detection:
             # Use deque for efficient O(1) operations
-            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
-            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_user_messages = deque(
+                maxlen=self.MAX_REPEATING_MESSAGES
+            )
+            self.recent_assistant_messages = deque(
+                maxlen=self.MAX_REPEATING_MESSAGES
+            )
     def run(
-        self, task_n, story, agent_name: str, starting_user_input: str = None, attack_instructions: str = None
+        self,
+        task_n,
+        story,
+        agent_name: str,
+        starting_user_input: str = None,
+        attack_instructions: str = None,
     ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
         step = 0
         thread_id = None
@@ -570,7 +629,9 @@ class EvaluationController:
         while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
-                    role="user", content=starting_user_input, type=ContentType.text
+                    role="user",
+                    content=starting_user_input,
+                    type=ContentType.text,
                 )
             else:
                 if self.config.enable_manual_user_input == True:
@@ -582,7 +643,9 @@ class EvaluationController:
                     )
                 else:  # llm
                     user_input = self.llm_user.generate_user_input(
-                        story, conversation_history, attack_instructions=attack_instructions
+                        story,
+                        conversation_history,
+                        attack_instructions=attack_instructions,
                     )
             if self.config.enable_verbose_logging:
                 rich.print(
@@ -592,26 +655,33 @@ class EvaluationController:
             if self._is_end(user_input):
                 break
             if self.repeating_output_detection:
                 self.recent_user_messages.append(user_input.content)
             conversation_history.append(user_input)
-            messages, thread_id, conversational_search_data = (
-                self.wxo_inference_backend.stream_messages(
-                    user_input,
-                    agent_name=agent_name,
-                    thread_id=thread_id,
-                    call_tracker=call_tracker,
-                )
+            (
+                messages,
+                thread_id,
+                conversational_search_data,
+            ) = self.wxo_inference_backend.stream_messages(
+                user_input,
+                agent_name=agent_name,
+                thread_id=thread_id,
+                call_tracker=call_tracker,
             )
             if not messages:
-                raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
+                raise RuntimeError(
+                    f"[Task-{task_n}] No messages is produced. Exiting task."
+                )
             for message in messages:
                 if self.repeating_output_detection:
-                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                    if (
+                        message.role == Roles.ASSISTANT
+                        and message.type == ContentType.text
+                    ):
                         self.recent_assistant_messages.append(message.content)
                 if self.config.enable_verbose_logging:
@@ -621,11 +691,17 @@ class EvaluationController:
                     )
             conversation_history.extend(messages)
-            conversational_search_history_data.extend(conversational_search_data)
+            conversational_search_history_data.extend(
+                conversational_search_data
+            )
             step += 1
-        return conversation_history, call_tracker, conversational_search_history_data
+        return (
+            conversation_history,
+            call_tracker,
+            conversational_search_history_data,
+        )
     def _is_looping(self, messages: deque) -> bool:
         """Checks whether the user or assistant is stuck in a loop.
         Args:
@@ -634,7 +710,7 @@ class EvaluationController:
             bool: True if stuck in a loop, False otherwise.
         """
         sim_count = 0
         if len(messages) >= self.MAX_REPEATING_MESSAGES:
             oldest_cached_message = messages[0]
             for i, old_message in enumerate(messages):
@@ -642,11 +718,16 @@ class EvaluationController:
                     continue
                 if oldest_cached_message == old_message:
                     sim_count += 1
-                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                elif (
+                    calculate_word_overlap_similarity_score(
+                        oldest_cached_message, old_message
+                    )
+                    > self.MESSAGE_SIMILARITY_THRESHOLD
+                ):
                     sim_count += 1
         return sim_count >= self.MAX_REPEATING_MESSAGES - 1
     def _is_end(self, current_user_input: Message) -> bool:
         """
         Check if the user input indicates the end of the conversation.
@@ -664,14 +745,16 @@ class EvaluationController:
         # Check if the user message contains 'END'
         if "END" in current_user_message_content:
             return True
         if self.repeating_output_detection:
             # Check for repeating user or assistant messages
-            if (self._is_looping(self.recent_user_messages) or
-                self._is_looping(self.recent_assistant_messages)):
+            if self._is_looping(self.recent_user_messages) or self._is_looping(
+                self.recent_assistant_messages
+            ):
                 return True
-        return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
+        return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
@@ -684,7 +767,9 @@ def get_wxo_client(
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
-    auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+    auth_config_path = (
+        f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+    )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
     tenant_name = "local"

wxo_agentic_evaluation/llm_matching.py CHANGED Viewed

@@ -1,9 +1,10 @@
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from typing import List
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
 )
-from typing import List
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 class LLMMatcher:
@@ -26,7 +27,7 @@ class LLMMatcher:
         prompt = self.keyword_template.render(
             keywords_text=keywords_text, response_text=response_text
         )
-        output:str = self.llm_client.query(prompt)
+        output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
         return result.startswith("true")

wxo_agentic_evaluation/llm_rag_eval.py CHANGED Viewed

@@ -1,12 +1,15 @@
-from typing import List
 import json
+from typing import List
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
-    FaithfulnessTemplateRenderer,
     AnswerRelevancyTemplateRenderer,
+    FaithfulnessTemplateRenderer,
 )
-from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 class LLMJudge:

wxo_agentic_evaluation/llm_user.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from typing import List, TypeVar
-from wxo_agentic_evaluation.type import Message, ContentType
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import ContentType, Message
 T = TypeVar("T", bound=JinjaTemplateRenderer)
@@ -17,7 +18,10 @@ class LLMUser:
         )
     def generate_user_input(
-        self, user_story, conversation_history: List[Message], attack_instructions: str = None
+        self,
+        user_story,
+        conversation_history: List[Message],
+        attack_instructions: str = None,
     ) -> Message | None:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(

ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

Potentially problematic release.

ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl