PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (10)

{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.0.6
+Version: 1.0.7
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
 pip install -e .
 ```
+## contribution guide
+### secret resolution
+install detect secret utilities:
+```
+pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
+```
+run the scan & resolve detections:
+```
+detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
+```
 ## quick experiment against the default wxo-dev env
 ```bash

{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/RECORD RENAMED Viewed

@@ -1,20 +1,20 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
 wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
-wxo_agentic_evaluation/arg_configs.py,sha256=UCrGcakFaAM3reFquMn03qNtKe7Pg8ScbOF0K7o8VDU,2240
+wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
 wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
-wxo_agentic_evaluation/data_annotator.py,sha256=DJVG2CdhJRAJ3X1ARbrsn9bPjTuytCDGIBM4PEexfnk,8214
+wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
 wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
 wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
 wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
 wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
 wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
-wxo_agentic_evaluation/record_chat.py,sha256=IAKCZ6Bc4natHA4SyNtC4tjo-0MDglwBcY5AWvXSgR0,7317
-wxo_agentic_evaluation/resource_map.py,sha256=-dIWQdpEpPeSCbDeYfRupG9KV1Q4NlHGb5KXywjkulM,1645
+wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
+wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
 wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
-wxo_agentic_evaluation/tool_planner.py,sha256=e-lBb4w1klT1HOL9BTwae3lkGv5VBuYC397mSJgOhus,12622
+wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
 wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
 wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
 wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
 wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=X5tiE0IKCR2CqhwEGm91LOdzFZQWSXzXQgLOtzi6ng0,4002
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
 wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
 wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
 wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
 wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
 wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
-ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/METADATA,sha256=BqQELgtuSVS6tHNQ5nGkgfwPBiAFgTnvgZbWG3hjCgM,17674
-ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/RECORD,,
+ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/METADATA,sha256=wz60je0UK3ogKLH9qiDLS808j57cfWOosONyCuQR95g,18051
+ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD,,

wxo_agentic_evaluation/arg_configs.py CHANGED Viewed

@@ -74,6 +74,7 @@ class ChatRecordingConfig:
     service_url: str = "http://localhost:4321"
     tenant_name: str = "local"
     token: str = None
+    max_retries: int = 5
 @dataclass

wxo_agentic_evaluation/data_annotator.py CHANGED Viewed

@@ -247,11 +247,14 @@ class DataAnnotator:
                 }
                 goal_details.append(summarize_step)
                 break
-        if summarize_step:
-            goals[previous] = ["summarize"]
-        else:
+        if previous is None:
+            goals["summarize"] = []
+        elif summarize_step is None:
             goals[previous] = []
+        else:
+            goals[previous] = ["summarize"]
     def generate(self) -> Dict:
         """Generate the final dataset"""

wxo_agentic_evaluation/record_chat.py CHANGED Viewed

@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
     else:
         path = "v1/orchestrate/runs"
-    initial_response = wxo_client.get(
-        path, {"limit": limit, "offset": 0}
-    ).json()
+    initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
     total_runs = initial_response["total"]
     all_runs.extend(initial_response["data"])
     while len(all_runs) < total_runs:
         offset += limit
-        response = wxo_client.get(
-            path, {"limit": limit, "offset": offset}
-        ).json()
+        response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
         all_runs.extend(response["data"])
     # Sort runs by completed_at in descending order (most recent first)
@@ -92,9 +88,10 @@ def annotate_messages(
         annotated_data["agent"] = agent_name
     annotated_data["story"] = generate_story(annotated_data)
     return annotated_data
 def has_messages_changed(
     thread_id: str,
     messages: List[Message],
@@ -111,32 +108,27 @@ def has_messages_changed(
     return False
-def record_chats(config: ChatRecordingConfig):
+def _record(config: ChatRecordingConfig, bad_threads: set):
     """Record chats in background mode"""
     start_time = datetime.utcnow()
     processed_threads = set()
     previous_input_hash: dict[str, str] = {}
-    rich.print(
-        f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
-    )
     if config.token is None:
         config.token = tenant_setup(config.service_url, config.tenant_name)
     wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    try:
-        while True:
+    retry_count = 0
+    while retry_count < config.max_retries:
+        thread_id = None
+        try:
             all_runs = get_all_runs(wxo_client)
             seen_threads = set()
             # Process only new runs that started after our recording began
             for run in all_runs:
                 thread_id = run.get("thread_id")
-                try:
-                    agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
-                except Exception as e:
-                    rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
-                    continue
-                if thread_id in seen_threads or agent_name is None:
+                if (thread_id in bad_threads) or (thread_id in seen_threads):
                     continue
                 seen_threads.add(thread_id)
                 started_at = run.get("started_at")
@@ -162,11 +154,17 @@ def record_chats(config: ChatRecordingConfig):
                         try:
                             messages = inference_backend.get_messages(thread_id)
-                            if not has_messages_changed(
-                                thread_id,
-                                messages,
-                                previous_input_hash,
-                            ):
+                            if not has_messages_changed(thread_id, messages, previous_input_hash):
+                                continue
+                            try:
+                                agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
+                            except Exception as e:
+                                rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
+                                raise
+                            if agent_name is None:
+                                rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
                                 continue
                             annotated_data = annotate_messages(
@@ -180,19 +178,37 @@ def record_chats(config: ChatRecordingConfig):
                             with open(annotation_filename, "w") as f:
                                 json.dump(annotated_data, f, indent=4)
                         except Exception as e:
-                            rich.print(
-                                f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
-                            )
+                            rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
+                            raise
                 except (ValueError, TypeError) as e:
-                    rich.print(
-                        f"[yellow]WARNING:[/yellow] Invalid timestamp format for thread {thread_id}: {str(e)}"
-                    )
+                    rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
+                    raise
+            retry_count = 0
+            time.sleep(2)
-            time.sleep(2)  # Poll every 2 seconds
+        except KeyboardInterrupt:
+            rich.print("\n[yellow]Recording stopped by user[/yellow]")
+            break
-    except KeyboardInterrupt:
-        rich.print("\n[yellow]Recording stopped by user[/yellow]")
+        except Exception as e:
+            if thread_id is None:
+                rich.print(f"[red]ERROR:[/red] {e}")
+                break
+            time.sleep(1)
+            retry_count += 1
+            if retry_count >= config.max_retries:
+                rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
+                bad_threads.add(thread_id)
+                _record(config, bad_threads)
+def record_chats(config: ChatRecordingConfig):
+    rich.print(
+        f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
+    )
+    bad_threads = set()
+    _record(config, bad_threads)
 if __name__ == "__main__":
     record_chats(CLI(ChatRecordingConfig, as_positional=False))

wxo_agentic_evaluation/resource_map.py CHANGED Viewed

@@ -14,7 +14,7 @@ class ResourceMap:
         if is_saas_url(self.wxo_client.service_url):
             # TO-DO: this is not validated after the v1 prefix change
             # need additional validation
-            tools_path = "v1/orchestrate/tools/"
+            tools_path = "v1/orchestrate/tools"
             agents_path = "v1/orchestrate/agents"
         else:
             tools_path = "v1/tools/"

wxo_agentic_evaluation/service_provider/model_proxy_provider.py CHANGED Viewed

@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
 AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
 AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
-WO_INSTANCE = os.environ.get("WO_INSTANCE")
-WO_API_KEY = os.environ.get("WO_API_KEY")
 DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
     def __init__(
         self,
         model_id=None,
-        api_key=WO_API_KEY,
-        instance_url=WO_INSTANCE,
+        api_key=None,
+        instance_url=None,
         timeout=300,
         embedding_model_id=None,
         params=None
     ):
         super().__init__()
+        instance_url = os.environ.get("WO_INSTANCE", instance_url)
+        api_key = os.environ.get("WO_API_KEY", api_key)
         if not instance_url or not api_key:
             raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")

wxo_agentic_evaluation/tool_planner.py CHANGED Viewed

@@ -6,6 +6,7 @@ import importlib.util
 import re
 from jsonargparse import CLI
 import os
+import sys
 import textwrap
 from dataclasses import is_dataclass, asdict
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
             module_name = file_path.stem
             spec = importlib.util.spec_from_file_location(module_name, file_path)
             module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(module)
+            parent_dir = str(file_path.parent)
+            sys_path_modified = False
+            if parent_dir not in sys.path:
+                sys.path.append(parent_dir)
+                sys_path_modified = True
+            try:
+                spec.loader.exec_module(module)
+            finally:
+                if sys_path_modified:
+                    sys.path.pop()
             # Add all module's non-private functions to tools_dict
             for attr_name in dir(module):
                 attr = getattr(module, attr_name)

{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

Potentially problematic release.

ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl