ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/METADATA +12 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD +15 -15
- wxo_agentic_evaluation/arg_configs.py +1 -0
- wxo_agentic_evaluation/data_annotator.py +7 -4
- wxo_agentic_evaluation/evaluation_package.py +7 -3
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
- wxo_agentic_evaluation/inference_backend.py +101 -13
- wxo_agentic_evaluation/main.py +5 -0
- wxo_agentic_evaluation/record_chat.py +49 -33
- wxo_agentic_evaluation/resource_map.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +11 -2
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/top_level.txt +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.0.6
+Version: 1.0.8
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
 pip install -e .
 ```
 
+## contribution guide
+### secret resolution
+install detect secret utilities:
+```
+pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
+```
+run the scan & resolve detections:
+```
+detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
+```
+
 
 ## quick experiment against the default wxo-dev env
 ```bash
{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD
CHANGED
@@ -1,28 +1,28 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
 wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
-wxo_agentic_evaluation/arg_configs.py,sha256=
+wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
 wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
-wxo_agentic_evaluation/data_annotator.py,sha256=
-wxo_agentic_evaluation/evaluation_package.py,sha256=
-wxo_agentic_evaluation/inference_backend.py,sha256=
+wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
+wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
+wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
 wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
 wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
-wxo_agentic_evaluation/main.py,sha256=
-wxo_agentic_evaluation/record_chat.py,sha256=
-wxo_agentic_evaluation/resource_map.py,sha256
+wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
+wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
+wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
 wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
-wxo_agentic_evaluation/tool_planner.py,sha256=
+wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
 wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
 wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
-wxo_agentic_evaluation/external_agent/__init__.py,sha256=
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
 wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
-wxo_agentic_evaluation/external_agent/performance_test.py,sha256=
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
 wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
 wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
 wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
 wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
 wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
 wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
 wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
 wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
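The RECORD entries above follow the standard wheel format `path,sha256=<digest>,<size>`, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with padding stripped. A minimal sketch of recomputing such an entry (the file path in the usage comment is only an example):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a wheel RECORD line: path,sha256=<urlsafe-b64 digest>,<size>."""
    data = Path(path).read_bytes()
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

# Example (hypothetical path):
# print(record_entry("wxo_agentic_evaluation/arg_configs.py"))
```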
wxo_agentic_evaluation/data_annotator.py
CHANGED
@@ -247,11 +247,14 @@ class DataAnnotator:
                 }
                 goal_details.append(summarize_step)
                 break
-
-        if
-            goals[
-
+
+        if previous is None:
+            goals["summarize"] = []
+        elif summarize_step is None:
             goals[previous] = []
+        else:
+            goals[previous] = ["summarize"]
+
 
     def generate(self) -> Dict:
         """Generate the final dataset"""
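Read as a sketch, the new branching wires a "summarize" node into the goals mapping depending on whether a previous step and a summarize step exist. The standalone illustration below mirrors the three branches; the dictionary shape and example step names are assumptions, not taken from the package:

```python
from typing import Dict, List, Optional

def wire_summarize(goals: Dict[str, List[str]],
                   previous: Optional[str],
                   summarize_step: Optional[dict]) -> None:
    """Sketch of the branching added in DataAnnotator (not the package's API)."""
    if previous is None:
        # no earlier step: "summarize" becomes a terminal node
        goals["summarize"] = []
    elif summarize_step is None:
        # no summarize step was produced: the previous step is terminal
        goals[previous] = []
    else:
        # otherwise the previous step is followed by "summarize"
        goals[previous] = ["summarize"]

goals: Dict[str, List[str]] = {}
wire_summarize(goals, previous="lookup_order", summarize_step={"name": "summarize"})
print(goals)  # {'lookup_order': ['summarize']}
```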
wxo_agentic_evaluation/evaluation_package.py
CHANGED
@@ -218,7 +218,7 @@ class EvaluationPackage:
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+        correct_tool_calls = set()  # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:
 
@@ -244,6 +244,7 @@ class EvaluationPackage:
 
                 continue
 
+            # TO-DO: re-think how deduplication works in the context of precision & recall
             tool_call_and_routing_metrics.total_tool_calls += 1
 
             # evaluating more than once is fine
@@ -262,8 +263,8 @@ class EvaluationPackage:
                     if msg_tool_call["args"] == goal_detail.args:
                         labelled_messages.append(goal_detail.name)
                         labelled_messages_without_text_step.append(goal_detail.name)
-
-                        tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                        correct_tool_calls.add(goal_detail.name)
+                        #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                         found = True
                         message_outcome = ExtendedMessage(message=message)
                         message_outcomes.append(message_outcome)
@@ -308,6 +309,9 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+
         assistant_responses = [
             message
             for message in self.messages
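Taken together, these hunks replace the per-match counter with a set keyed by the expected tool name, so a tool that is correctly called several times with the same signature is counted only once against `expected_tool_calls`. A minimal sketch of that effect; the message and goal structures below are simplified placeholders, not the package's real types:

```python
# Simplified stand-ins for the evaluation loop.
expected_goals = [{"name": "get_weather", "args": {"city": "Austin"}}]
observed_tool_calls = [
    {"name": "get_weather", "args": {"city": "Austin"}},
    {"name": "get_weather", "args": {"city": "Austin"}},  # same signature, called twice
]

correct_tool_calls = set()
for call in observed_tool_calls:
    for goal in expected_goals:
        if call["name"] == goal["name"] and call["args"] == goal["args"]:
            correct_tool_calls.add(goal["name"])

# A counter would report 2 here; the set reports 1, matching the single expected call.
print(len(correct_tool_calls))  # 1
```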
wxo_agentic_evaluation/external_agent/__init__.py
CHANGED
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(
+    wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
wxo_agentic_evaluation/external_agent/performance_test.py
CHANGED
@@ -3,7 +3,7 @@ from rich.console import Console
 
 from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
 
 class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
             "min_new_tokens": 0,
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         }
-        wai_client = get_provider(
+        wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
 
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
wxo_agentic_evaluation/inference_backend.py
CHANGED
@@ -6,6 +6,8 @@ import rich
 import time
 from pydantic import BaseModel
 from typing import List, Generator, Dict, Tuple, Mapping, Any
+from enum import Enum
+from collections import deque
 
 from wxo_agentic_evaluation.type import (
     ContentType,
@@ -23,12 +25,27 @@ from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
 
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
 
-def
-
-
-
+def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+    words_in_first_message = first_message_text.lower().split()
+    words_in_second_message = second_message_text.lower().split()
 
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+    unique_words_count = len(unique_words)
+
+    if unique_words_count == 0:
+        return 0.0
+    return len(common_words) / unique_words_count
 
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
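The added helper is a Jaccard-style ratio over lowercased, whitespace-split tokens: shared words divided by the union of words. A quick standalone sketch of the same calculation with example inputs (the sentences are invented):

```python
def word_overlap(a: str, b: str) -> float:
    """Same idea as calculate_word_overlap_similarity_score: |A ∩ B| / |A ∪ B| over lowercased words."""
    wa, wb = a.lower().split(), b.lower().split()
    union = set(wa) | set(wb)
    return len(set(wa) & set(wb)) / len(union) if union else 0.0

print(word_overlap("Please confirm the order", "please confirm the order"))          # 1.0
print(word_overlap("I want to check my order status", "check my order status please"))  # 0.5
```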
@@ -504,6 +521,11 @@ class WXOInferenceBackend:
 
 
 class EvaluationController:
+
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98))  # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3))  # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+
     def __init__(
         self,
         wxo_inference_backend: WXOInferenceBackend,
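Because these limits are class attributes built from `os.getenv(...)`, they are resolved when the class body executes, i.e. when the module is imported, so the environment variables have to be set before that import (or the attribute patched afterwards). A small sketch of the same pattern; the class name below is a hypothetical stand-in:

```python
import os

# Must be set before the class body below runs (before importing the defining module).
os.environ["MAX_CONVERSATION_STEPS"] = "30"

class ControllerLimits:  # hypothetical stand-in for the EvaluationController attributes
    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98))
    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3))

print(ControllerLimits.MAX_CONVERSATION_STEPS)  # 30, not the default 20
```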
@@ -513,6 +535,12 @@ class EvaluationController:
         self.wxo_inference_backend = wxo_inference_backend
         self.llm_user = llm_user
         self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
 
     def run(
         self, task_n, story, agent_name: str, starting_user_input: str = None
@@ -522,9 +550,9 @@ class EvaluationController:
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
 
+        # make this configurable
+        while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
                     role="user", content=starting_user_input, type=ContentType.text
@@ -546,9 +574,15 @@ class EvaluationController:
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                     user_input.content,
                 )
-
+
+            if self._is_end(user_input):
                 break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
             conversation_history.append(user_input)
+
             messages, thread_id, conversational_search_data = (
                 self.wxo_inference_backend.stream_messages(
                     user_input,
@@ -559,16 +593,70 @@ class EvaluationController:
             )
             if not messages:
                 raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-
-
-
-
-
-
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
+                    rich.print(
+                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
+                        message.content,
+                    )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(conversational_search_data)
+
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): Defines the message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks if the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if (self._is_looping(self.recent_user_messages) or
+                    self._is_looping(self.recent_assistant_messages)):
+                return True
+
+        return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
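Taken together, the new `_is_looping` / `_is_end` logic keeps the last `MAX_REPEATING_MESSAGES` user and assistant texts in bounded deques and ends the run when the oldest cached message matches, or is near-identical to, all of the newer ones. A self-contained sketch of that termination rule under the same defaults (window of 3, threshold 0.98); the sample messages are invented:

```python
from collections import deque

MAX_REPEATING_MESSAGES = 3
MESSAGE_SIMILARITY_THRESHOLD = 0.98

def word_overlap(a: str, b: str) -> float:
    wa, wb = a.lower().split(), b.lower().split()
    union = set(wa) | set(wb)
    return len(set(wa) & set(wb)) / len(union) if union else 0.0

def is_looping(messages: deque) -> bool:
    """Sketch of the repetition check: compare the oldest cached message against the rest."""
    if len(messages) < MAX_REPEATING_MESSAGES:
        return False
    oldest = messages[0]
    hits = sum(
        1
        for i, msg in enumerate(messages)
        if i > 0 and (msg == oldest or word_overlap(oldest, msg) > MESSAGE_SIMILARITY_THRESHOLD)
    )
    return hits >= MAX_REPEATING_MESSAGES - 1

recent = deque(maxlen=MAX_REPEATING_MESSAGES)
for text in ["I already told you my order id."] * 3:
    recent.append(text)

print(is_looping(recent))  # True -> the conversation would be terminated
```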
wxo_agentic_evaluation/main.py
CHANGED
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
+        config.enable_manual_user_input = False  # disable manual user input for parallel execution
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
wxo_agentic_evaluation/record_chat.py
CHANGED
@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
     else:
         path = "v1/orchestrate/runs"
 
-    initial_response = wxo_client.get(
-        path, {"limit": limit, "offset": 0}
-    ).json()
+    initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
     total_runs = initial_response["total"]
     all_runs.extend(initial_response["data"])
 
     while len(all_runs) < total_runs:
         offset += limit
-        response = wxo_client.get(
-            path, {"limit": limit, "offset": offset}
-        ).json()
+        response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
         all_runs.extend(response["data"])
 
     # Sort runs by completed_at in descending order (most recent first)
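The reflowed calls above page through the runs endpoint with a `limit`/`offset` pair until `total` runs have been collected. A compact sketch of that paging loop against a stand-in client; the client class and payload shape below are illustrative only (the real client returns an HTTP response whose `.json()` is called):

```python
class FakeRunsClient:
    """Stand-in for WXOClient: returns {'total': N, 'data': [...]} pages directly."""
    def __init__(self, runs):
        self._runs = runs

    def get(self, path, params):
        start = params["offset"]
        return {"total": len(self._runs), "data": self._runs[start:start + params["limit"]]}

def fetch_all_runs(client, path="v1/orchestrate/runs", limit=2):
    first = client.get(path, {"limit": limit, "offset": 0})
    total, all_runs, offset = first["total"], list(first["data"]), 0
    while len(all_runs) < total:
        offset += limit
        all_runs.extend(client.get(path, {"limit": limit, "offset": offset})["data"])
    return all_runs

print(fetch_all_runs(FakeRunsClient([{"thread_id": i} for i in range(5)])))
```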
@@ -92,9 +88,10 @@ def annotate_messages(
     annotated_data["agent"] = agent_name
 
     annotated_data["story"] = generate_story(annotated_data)
-
+
     return annotated_data
 
+
 def has_messages_changed(
     thread_id: str,
     messages: List[Message],
@@ -111,32 +108,27 @@ def has_messages_changed(
         return False
 
 
-def
+def _record(config: ChatRecordingConfig, bad_threads: set):
     """Record chats in background mode"""
     start_time = datetime.utcnow()
     processed_threads = set()
     previous_input_hash: dict[str, str] = {}
 
-    rich.print(
-        f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
-    )
     if config.token is None:
         config.token = tenant_setup(config.service_url, config.tenant_name)
     wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-
-
+
+    retry_count = 0
+    while retry_count < config.max_retries:
+        thread_id = None
+        try:
             all_runs = get_all_runs(wxo_client)
             seen_threads = set()
             # Process only new runs that started after our recording began
             for run in all_runs:
                 thread_id = run.get("thread_id")
-
-                agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
-            except Exception as e:
-                rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
-                continue
-            if thread_id in seen_threads or agent_name is None:
+                if (thread_id in bad_threads) or (thread_id in seen_threads):
                     continue
                 seen_threads.add(thread_id)
                 started_at = run.get("started_at")
@@ -162,11 +154,17 @@ def record_chats(config: ChatRecordingConfig):
             try:
                 messages = inference_backend.get_messages(thread_id)
 
-                if not has_messages_changed(
-
-
-
-
+                if not has_messages_changed(thread_id, messages, previous_input_hash):
+                    continue
+
+                try:
+                    agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
+                except Exception as e:
+                    rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
+                    raise
+
+                if agent_name is None:
+                    rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
                     continue
 
                 annotated_data = annotate_messages(
@@ -180,19 +178,37 @@ def record_chats(config: ChatRecordingConfig):
                 with open(annotation_filename, "w") as f:
                     json.dump(annotated_data, f, indent=4)
             except Exception as e:
-                rich.print(
-
-                )
+                rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
+                raise
             except (ValueError, TypeError) as e:
-                rich.print(
-
-
+                rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
+                raise
+
+            retry_count = 0
+            time.sleep(2)
 
-
+        except KeyboardInterrupt:
+            rich.print("\n[yellow]Recording stopped by user[/yellow]")
+            break
 
-
-
+        except Exception as e:
+            if thread_id is None:
+                rich.print(f"[red]ERROR:[/red] {e}")
+                break
 
+            time.sleep(1)
+            retry_count += 1
+            if retry_count >= config.max_retries:
+                rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
+                bad_threads.add(thread_id)
+            _record(config, bad_threads)
+
+def record_chats(config: ChatRecordingConfig):
+    rich.print(
+        f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
+    )
+    bad_threads = set()
+    _record(config, bad_threads)
 
 if __name__ == "__main__":
     record_chats(CLI(ChatRecordingConfig, as_positional=False))
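The restructured `_record` loop retries transient failures up to `config.max_retries`; once a thread keeps failing it is added to `bad_threads` and recording restarts without it via the recursive `_record(config, bad_threads)` call. The skip-list pattern in isolation looks roughly like the sketch below; the failing-thread simulation is invented for illustration:

```python
def process(thread_id: str) -> None:
    # Invented failure: one thread always raises, the others succeed.
    if thread_id == "thread-2":
        raise RuntimeError("flaky thread")

def record(thread_ids, bad_threads=None, max_retries=3):
    bad_threads = set() if bad_threads is None else bad_threads
    retry_count = 0
    while retry_count < max_retries:
        try:
            for thread_id in thread_ids:
                if thread_id in bad_threads:
                    continue
                process(thread_id)
            return bad_threads  # clean pass over every remaining thread
        except Exception:
            retry_count += 1
            if retry_count >= max_retries:
                bad_threads.add(thread_id)
                # restart the whole loop, now skipping the persistently failing thread
                return record(thread_ids, bad_threads, max_retries)
    return bad_threads

print(record(["thread-1", "thread-2", "thread-3"]))  # {'thread-2'}
```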
wxo_agentic_evaluation/resource_map.py
CHANGED
@@ -14,7 +14,7 @@ class ResourceMap:
         if is_saas_url(self.wxo_client.service_url):
             # TO-DO: this is not validated after the v1 prefix change
             # need additional validation
-            tools_path = "v1/orchestrate/tools
+            tools_path = "v1/orchestrate/tools"
             agents_path = "v1/orchestrate/agents"
         else:
             tools_path = "v1/tools/"
wxo_agentic_evaluation/service_provider/model_proxy_provider.py
CHANGED
@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
 
 AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
 AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
-WO_INSTANCE = os.environ.get("WO_INSTANCE")
-WO_API_KEY = os.environ.get("WO_API_KEY")
 DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
 
 
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
     def __init__(
         self,
         model_id=None,
-        api_key=
-        instance_url=
+        api_key=None,
+        instance_url=None,
         timeout=300,
         embedding_model_id=None,
         params=None
     ):
         super().__init__()
 
+        instance_url = os.environ.get("WO_INSTANCE", instance_url)
+        api_key = os.environ.get("WO_API_KEY", api_key)
         if not instance_url or not api_key:
             raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")
 
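With this change, `WO_INSTANCE` and `WO_API_KEY` are read inside `__init__`, and the environment values take precedence over the constructor arguments, which act only as fallbacks. A tiny sketch of that precedence using a hypothetical helper:

```python
import os

def resolve_credentials(instance_url=None, api_key=None):
    """Hypothetical helper mirroring the os.environ.get(..., fallback) pattern."""
    instance_url = os.environ.get("WO_INSTANCE", instance_url)
    api_key = os.environ.get("WO_API_KEY", api_key)
    if not instance_url or not api_key:
        raise RuntimeError("instance url and WO apikey must be specified")
    return instance_url, api_key

os.environ["WO_INSTANCE"] = "https://env.example.com"
os.environ["WO_API_KEY"] = "env-key"
# Arguments are ignored when the environment variables are set.
print(resolve_credentials("https://arg.example.com", "arg-key"))  # ('https://env.example.com', 'env-key')
```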
wxo_agentic_evaluation/tool_planner.py
CHANGED
@@ -6,6 +6,7 @@ import importlib.util
 import re
 from jsonargparse import CLI
 import os
+import sys
 import textwrap
 from dataclasses import is_dataclass, asdict
 
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
     module_name = file_path.stem
     spec = importlib.util.spec_from_file_location(module_name, file_path)
     module = importlib.util.module_from_spec(spec)
-
-
+    parent_dir = str(file_path.parent)
+    sys_path_modified = False
+    if parent_dir not in sys.path:
+        sys.path.append(parent_dir)
+        sys_path_modified = True
+    try:
+        spec.loader.exec_module(module)
+    finally:
+        if sys_path_modified:
+            sys.path.pop()
     # Add all module's non-private functions to tools_dict
     for attr_name in dir(module):
         attr = getattr(module, attr_name)
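The added lines temporarily put the tool file's parent directory on `sys.path` while the module is executed, presumably so that imports local to the tools directory resolve, and the entry is removed again afterwards. A standalone sketch of the same pattern; the example file path is hypothetical, and `sys.path.remove` is used here instead of the `pop()` in the diff:

```python
import importlib.util
import sys
from pathlib import Path

def load_module_with_sibling_imports(file_path: Path):
    """Execute a module from a file, temporarily exposing its directory on sys.path."""
    spec = importlib.util.spec_from_file_location(file_path.stem, file_path)
    module = importlib.util.module_from_spec(spec)

    parent_dir = str(file_path.parent)
    added = parent_dir not in sys.path
    if added:
        sys.path.append(parent_dir)
    try:
        spec.loader.exec_module(module)
    finally:
        if added:
            sys.path.remove(parent_dir)
    return module

# Usage (hypothetical path):
# tools = load_module_with_sibling_imports(Path("tools/my_tools.py"))
```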
WHEEL and top_level.txt: files without changes.