ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD +9 -9
- wxo_agentic_evaluation/evaluation_package.py +7 -3
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
- wxo_agentic_evaluation/inference_backend.py +101 -13
- wxo_agentic_evaluation/main.py +5 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/top_level.txt +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD
CHANGED
@@ -4,12 +4,12 @@ wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89
 wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
 wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
 wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
-wxo_agentic_evaluation/evaluation_package.py,sha256=
-wxo_agentic_evaluation/inference_backend.py,sha256=
+wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
+wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
 wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
 wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
-wxo_agentic_evaluation/main.py,sha256=
+wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
 wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
 wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
 wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
@@ -20,9 +20,9 @@ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohX
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
-wxo_agentic_evaluation/external_agent/__init__.py,sha256=
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
 wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
-wxo_agentic_evaluation/external_agent/performance_test.py,sha256=
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
 wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
@@ -50,7 +50,7 @@ wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId
 wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
 wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
 wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
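Each RECORD row has the form "path,sha256=<urlsafe-base64 digest, no padding>,<size in bytes>", per the wheel spec. A minimal sketch of recomputing such a row to check an installed file against this diff (the helper below is illustrative, not part of the package):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a RECORD-style row for a file: path,sha256=<digest>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"

# Expected for 1.0.8:
# wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
print(record_entry("wxo_agentic_evaluation/inference_backend.py"))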
wxo_agentic_evaluation/evaluation_package.py
CHANGED
@@ -218,7 +218,7 @@ class EvaluationPackage:
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+        correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:
 
@@ -244,6 +244,7 @@ class EvaluationPackage:
 
                 continue
 
+            # TO-DO: re-think how deduplication works in the context of precision & recall
             tool_call_and_routing_metrics.total_tool_calls += 1
 
             # evaluating more than once is fine
@@ -262,8 +263,8 @@ class EvaluationPackage:
                         if msg_tool_call["args"] == goal_detail.args:
                             labelled_messages.append(goal_detail.name)
                             labelled_messages_without_text_step.append(goal_detail.name)
-
-                            tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                            correct_tool_calls.add(goal_detail.name)
+                            #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                             found = True
                             message_outcome = ExtendedMessage(message=message)
                             message_outcomes.append(message_outcome)
@@ -308,6 +309,9 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+
         assistant_responses = [
             message
             for message in self.messages
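Net effect of these hunks: correct tool calls are now collected in a set of goal names and counted once per unique name, so a tool matched repeatedly with the same signature no longer inflates the metric. A minimal sketch of the before/after counting behavior (the matched names are hypothetical):

# Hypothetical matched goal names observed while scoring one conversation:
matched = ["get_weather", "get_weather", "book_flight"]

# 1.0.7 behavior (sketch): increment once per match
old_correct = 0
for name in matched:
    old_correct += 1             # -> 3, double-counts the repeated call

# 1.0.8 behavior (sketch): deduplicate by name via a set
new_correct = len(set(matched))  # -> 2, one credit per unique tool call
print(old_correct, new_correct)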
wxo_agentic_evaluation/external_agent/__init__.py
CHANGED
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(
+    wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
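As this hunk and the performance_test.py hunk below show, get_provider now takes the model id and decoding parameters directly instead of a separate ProviderConfig. A sketch of the calling pattern implied by the diff (only the query method is confirmed by the surrounding context; the prompt string is illustrative):

from wxo_agentic_evaluation.service_provider import get_provider

llm_decode_parameter = {
    "decoding_method": "greedy",
    "max_new_tokens": 4096,
}
# Signature as implied by the 1.0.8 diff: model id plus decoding params, no ProviderConfig.
wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
res = wai_client.query("Say hello.")  # returns the model's text completion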
wxo_agentic_evaluation/external_agent/performance_test.py
CHANGED
@@ -3,7 +3,7 @@ from rich.console import Console
 
 from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
 
 class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
            "min_new_tokens": 0,
            "decoding_method": "greedy",
            "max_new_tokens": 256,
        }
-        wai_client = get_provider(
+        wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
 
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
wxo_agentic_evaluation/inference_backend.py
CHANGED
@@ -6,6 +6,8 @@ import rich
 import time
 from pydantic import BaseModel
 from typing import List, Generator, Dict, Tuple, Mapping, Any
+from enum import Enum
+from collections import deque
 
 from wxo_agentic_evaluation.type import (
     ContentType,
@@ -23,12 +25,27 @@ from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
 
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
 
-def
-
-
-
+def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+    words_in_first_message = first_message_text.lower().split()
+    words_in_second_message = second_message_text.lower().split()
 
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+    unique_words_count = len(unique_words)
+
+    if unique_words_count == 0:
+        return 0.0
+    return len(common_words) / unique_words_count
 
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
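The new helper is a word-level Jaccard similarity: the intersection of the two messages' word sets over their union, computed case-insensitively on whitespace tokens. A quick standalone check of the values it produces (this sketch mirrors the function above, it is not the package's code):

def word_overlap(a: str, b: str) -> float:
    """Word-level Jaccard similarity, mirroring calculate_word_overlap_similarity_score."""
    wa, wb = set(a.lower().split()), set(b.lower().split())
    union = wa | wb
    return len(wa & wb) / len(union) if union else 0.0

print(word_overlap("please try again", "Please try again"))  # 1.0, case-insensitive
print(word_overlap("please try again", "please try once"))   # 2/4 = 0.5
print(word_overlap("", ""))                                   # 0.0, empty-input guard

With the default MESSAGE_SIMILARITY_THRESHOLD of 0.98, only near-identical messages exceed the threshold; exact repeats are caught by the equality shortcut in _is_looping below.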
@@ -504,6 +521,11 @@ class WXOInferenceBackend:
 
 
 class EvaluationController:
+
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+
     def __init__(
         self,
         wxo_inference_backend: WXOInferenceBackend,
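These limits are read once via os.getenv when the class body executes, i.e. at module import time, so overrides must already be in the environment before the module is imported. A sketch of tightening the loop guards for a quick smoke run (the values are illustrative):

import os

# Must run before importing wxo_agentic_evaluation.inference_backend,
# because the class attributes are evaluated at import time.
os.environ["MAX_CONVERSATION_STEPS"] = "10"         # cap each task at 10 turns
os.environ["MESSAGE_SIMILARITY_THRESHOLD"] = "0.9"  # terminate on 90%-similar repeats
os.environ["MAX_REPEATING_MESSAGES"] = "2"          # smallest value that keeps detection on

from wxo_agentic_evaluation.inference_backend import EvaluationController
print(EvaluationController.MAX_CONVERSATION_STEPS)  # -> 10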
@@ -513,6 +535,12 @@ class EvaluationController:
         self.wxo_inference_backend = wxo_inference_backend
         self.llm_user = llm_user
         self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
 
     def run(
         self, task_n, story, agent_name: str, starting_user_input: str = None
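A deque with maxlen is what gives the message caches their fixed-window behavior: appending beyond the limit silently evicts the oldest entry, so each cache always holds the last MAX_REPEATING_MESSAGES messages. A standalone illustration:

from collections import deque

recent = deque(maxlen=3)  # mirrors recent_user_messages with MAX_REPEATING_MESSAGES=3
for text in ["hi", "help", "help", "help again", "help"]:
    recent.append(text)   # O(1); the oldest entry is dropped once len == maxlen
print(list(recent))       # -> ['help', 'help again', 'help']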
@@ -522,9 +550,9 @@ class EvaluationController:
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
 
+        # make this configurable
+        while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
                     role="user", content=starting_user_input, type=ContentType.text
@@ -546,9 +574,15 @@ class EvaluationController:
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                     user_input.content,
                 )
-
+
+            if self._is_end(user_input):
                 break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
             conversation_history.append(user_input)
+
             messages, thread_id, conversational_search_data = (
                 self.wxo_inference_backend.stream_messages(
                     user_input,
@@ -559,16 +593,70 @@ class EvaluationController:
             )
             if not messages:
                 raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-
-
-
-
-
-
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
+                    rich.print(
+                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
+                        message.content,
+                    )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(conversational_search_data)
+
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): Defines the message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks if the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if (self._is_looping(self.recent_user_messages) or
+                self._is_looping(self.recent_assistant_messages)):
+                return True
+
+        return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
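Putting the pieces together: _is_looping fires when every message in a full cache matches the oldest one, either exactly or above MESSAGE_SIMILARITY_THRESHOLD. A standalone sketch of that check with the default settings (cache size 3, threshold 0.98; the messages are made up, and the jaccard helper stands in for calculate_word_overlap_similarity_score):

from collections import deque

MAX_REPEATING_MESSAGES = 3
MESSAGE_SIMILARITY_THRESHOLD = 0.98

def jaccard(a: str, b: str) -> float:
    wa, wb = set(a.lower().split()), set(b.lower().split())
    return len(wa & wb) / len(wa | wb) if (wa or wb) else 0.0

def is_looping(messages: deque) -> bool:
    """Mirrors EvaluationController._is_looping: compare every cached message to the oldest."""
    sim_count = 0
    if len(messages) >= MAX_REPEATING_MESSAGES:
        oldest = messages[0]
        for i, msg in enumerate(messages):
            if i == 0:
                continue
            if oldest == msg or jaccard(oldest, msg) > MESSAGE_SIMILARITY_THRESHOLD:
                sim_count += 1
    return sim_count >= MAX_REPEATING_MESSAGES - 1

stuck = deque(["I cannot help with that."] * 3, maxlen=3)
ok = deque(["hello", "what is my balance?", "anything else?"], maxlen=3)
print(is_looping(stuck), is_looping(ok))  # -> True False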
wxo_agentic_evaluation/main.py
CHANGED
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
+        config.enable_manual_user_input = False # disable manual user input for parallel execution
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
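The guard exists because a blocking input() call in one worker thread does not pause the others, so streamed output keeps scrolling past the prompt. A minimal sketch reproducing the interleaving (a hypothetical worker, not framework code):

import threading
import time

def worker(task_n: int):
    # Simulates a task that keeps streaming output while another thread waits on input().
    for step in range(3):
        time.sleep(0.5)
        print(f"[Task-{task_n}] streamed message {step}")

t = threading.Thread(target=worker, args=(2,))
t.start()
reply = input("[Task-1] your reply: ")  # Task-2's prints interleave with this prompt
t.join()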