PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py ADDED Viewed

@@ -0,0 +1,14 @@
+from wxo_agentic_evaluation.type import CallTracker, Message, RuntimeResponse
+from abc import abstractmethod
+class RuntimeAdapter:
+    @abstractmethod
+    def run(
+        self,
+        user_message: Message,
+        context: dict,
+        thread_id=None,
+    ) -> RuntimeResponse:
+        pass

wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} RENAMED Viewed

@@ -1,33 +1,31 @@
-import requests
-import os
-import yaml
 import json
-import rich
+import os
 import time
-from pydantic import BaseModel
-from typing import List, Generator, Dict, Tuple, Mapping, Any
+from typing import Any, Dict, Generator, List, Mapping
+import requests
+import rich
+import yaml
+from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
+    RuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)
 from wxo_agentic_evaluation.type import (
     ContentType,
-    Message,
+    ConversationalConfidenceThresholdScore,
     ConversationalSearch,
     ConversationalSearchCitations,
     ConversationalSearchResultMetadata,
-    ConversationalConfidenceThresholdScore,
     ConversationalSearchResults,
     ConversationSearchMetadata,
+    Message,
+    RuntimeResponse,
 )
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
-from wxo_agentic_evaluation.arg_configs import TestConfig
-from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
-def is_end(user_input: Message):
-    if "END" in user_input.content.strip():
-        return True
-    return False
+from wxo_agentic_evaluation.wxo_client import WXOClient
 def is_transfer_response(step_detail: Dict):
@@ -39,40 +37,12 @@ def is_transfer_response(step_detail: Dict):
     return False
-class CallTracker(BaseModel):
-    tool_call: List = []
-    tool_response: List = []
-    generic: List = []
-class WXOClient:
-    def __init__(self, service_url, api_key):
-        self.service_url = service_url
-        self.api_key = api_key
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url, headers=self._get_headers(), json=payload, stream=stream
-        )
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(url, params=params, headers=self._get_headers())
-class WXOInferenceBackend:
+class WXORuntimeAdapter(RuntimeAdapter):
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
         self.enable_saas_mode = is_saas_url(wxo_client.service_url)
-    def run(self, user_input: Message, agent_name, thread_id=None):
+    def _runs_endpoint(self, user_input: Message, agent_name, thread_id=None):
         agent_id = self.get_agent_id(agent_name)
         payload = {"message": user_input.model_dump(), "agent_id": agent_id}
         if thread_id:
@@ -108,7 +78,9 @@ class WXOInferenceBackend:
         else:
             path = "v1/orchestrate/runs?stream=true"
-        response: requests.Response = self.wxo_client.post(payload, path, stream=True)
+        response: requests.Response = self.wxo_client.post(
+            payload, path, stream=True
+        )
         import json
         for chunk in self._parse_events(response):
@@ -161,7 +133,9 @@ class WXOInferenceBackend:
         citations = parse_citations()
         retrieval_context = parsed_search_results()
         citations_title = conversational_search.get("citations_title", "")
-        response_length_option = conversational_search.get("response_length_option", "")
+        response_length_option = conversational_search.get(
+            "response_length_option", ""
+        )
         text = conversational_search.get("text", "")
         confidence_scores = ConversationalConfidenceThresholdScore(
@@ -184,20 +158,21 @@ class WXOInferenceBackend:
         return conversational_search
-    def stream_messages(
+    def run(
         self,
         user_input: Message,
-        agent_name: str,
-        call_tracker: CallTracker,
+        context: dict,
         thread_id=None,
-    ) -> Tuple[List[Message], str, List[ConversationalSearch]]:
+    ) -> RuntimeResponse:
+        agent_name = context["agent_name"]
+        call_tracker = context["call_tracker"]
         recover = False
         messages = list()
         conversational_search_data = []
         start_time = time.time()
         for chunk in self._stream_events(user_input, agent_name, thread_id):
             event = chunk.get("event", "")
             if _thread_id := chunk.get("data", {}).get("thread_id"):
                 thread_id = _thread_id
@@ -234,7 +209,9 @@ class WXOInferenceBackend:
                                         )
                                     )
                                     end_time = time.time()
-                                    call_tracker.tool_call.append(end_time - start_time)
+                                    call_tracker.tool_call.append(
+                                        end_time - start_time
+                                    )
                                     start_time = end_time
                             elif step_detail["type"] == "tool_call":
                                 # in step details, we could have [tool_response, tool_call]
@@ -252,7 +229,9 @@ class WXOInferenceBackend:
                                     )
                                 )
                                 end_time = time.time()
-                                call_tracker.tool_call.append(end_time - start_time)
+                                call_tracker.tool_call.append(
+                                    end_time - start_time
+                                )
                                 start_time = end_time
                             elif step_detail["type"] == "tool_response":
                                 content = json.dumps(step_detail)
@@ -266,7 +245,9 @@ class WXOInferenceBackend:
                                     )
                                 )
                                 end_time = time.time()
-                                call_tracker.tool_response.append(end_time - start_time)
+                                call_tracker.tool_response.append(
+                                    end_time - start_time
+                                )
                                 start_time = end_time
                     elif content_field := delta.get("content"):
                         for val in content_field:
@@ -285,7 +266,9 @@ class WXOInferenceBackend:
                                     chunk=event,
                                 )
                                 end_time = time.time()
-                                call_tracker.generic.append(end_time - start_time)
+                                call_tracker.generic.append(
+                                    end_time - start_time
+                                )
                                 start_time = end_time
                 # NOTE: The event here that is parsed is part of the "message.created" event
@@ -309,10 +292,14 @@ class WXOInferenceBackend:
                             """
                             last_message = json.loads(messages[-1].content)
-                            tool_call_id = last_message.get("tool_call_id", None)
+                            tool_call_id = last_message.get(
+                                "tool_call_id", None
+                            )
                             assert tool_call_id is not None
-                            conversational_search_metadata = ConversationSearchMetadata(
-                                tool_call_id=tool_call_id
+                            conversational_search_metadata = (
+                                ConversationSearchMetadata(
+                                    tool_call_id=tool_call_id
+                                )
                             )
                             conversational_search = (
                                 self.parse_conversational_search_response(
@@ -320,7 +307,9 @@ class WXOInferenceBackend:
                                     metadata=conversational_search_metadata,
                                 )
                             )
-                            conversational_search_data.append(conversational_search)
+                            conversational_search_data.append(
+                                conversational_search
+                            )
                             messages.append(
                                 Message(
                                     role=role,
@@ -361,7 +350,11 @@ class WXOInferenceBackend:
                 f"Recovered {len(messages)} messages from thread_id {thread_id}",
             )
-        return messages, thread_id, conversational_search_data
+        return RuntimeResponse(
+            messages=messages,
+            thread_id=thread_id,
+            context={"conversational_search_data": conversational_search_data},
+        )
     def _parse_events(
         self, stream: Generator[bytes, None, None]
@@ -406,6 +399,13 @@ class WXOInferenceBackend:
                                     tool_json = {"type": "tool_call"}
                                     tool_json.update(tool)
                                     content = json.dumps(tool_json)
+                                    # TO-DO: review do we even need the get messages for retry loop anymore?
+                                    if msg_content := entry.get("content"):
+                                        if (
+                                            msg_content[0].get("response_type")
+                                            == "conversational_search"
+                                        ):
+                                            continue
                                     messages.append(
                                         Message(
                                             role=role,
@@ -419,7 +419,9 @@ class WXOInferenceBackend:
                                 content = json.dumps(step_detail)
                                 messages.append(
                                     Message(
-                                        role=role, content=content, type=content_type
+                                        role=role,
+                                        content=content,
+                                        type=content_type,
                                     )
                                 )
                             else:
@@ -427,7 +429,9 @@ class WXOInferenceBackend:
                                 content_type = ContentType.tool_response
                                 messages.append(
                                     Message(
-                                        role=role, content=content, type=content_type
+                                        role=role,
+                                        content=content,
+                                        type=content_type,
                                     )
                                 )
             if content_field := entry.get("content"):
@@ -436,12 +440,19 @@ class WXOInferenceBackend:
                     if val["response_type"] == ContentType.text:
                         messages.append(
                             Message(
-                                role=role, content=val["text"], type=ContentType.text
+                                role=role,
+                                content=val["text"],
+                                type=ContentType.text,
                             )
                         )
-                    if val["response_type"] == ContentType.conversational_search:
-                        conversational_search_metadata = ConversationSearchMetadata(
-                            tool_call_id=tool_call_id
+                    if (
+                        val["response_type"]
+                        == ContentType.conversational_search
+                    ):
+                        conversational_search_metadata = (
+                            ConversationSearchMetadata(
+                                tool_call_id=tool_call_id
+                            )
                         )
                         messages.append(
                             Message(
@@ -503,94 +514,21 @@ class WXOInferenceBackend:
         return None
-class EvaluationController:
-    def __init__(
-        self,
-        wxo_inference_backend: WXOInferenceBackend,
-        llm_user: LLMUser,
-        config: TestConfig,
-    ):
-        self.wxo_inference_backend = wxo_inference_backend
-        self.llm_user = llm_user
-        self.config = config
-    def run(
-        self, task_n, story, agent_name: str, starting_user_input: str = None
-    ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
-        step = 0
-        thread_id = None
-        conversation_history: List[Message] = []
-        conversational_search_history_data = []
-        call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
-            if step == 0 and starting_user_input:
-                user_input = Message(
-                    role="user", content=starting_user_input, type=ContentType.text
-                )
-            else:
-                if self.config.enable_manual_user_input == True:
-                    content = input(
-                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
-                    )
-                    user_input = Message(
-                        role="user", content=content, type=ContentType.text
-                    )
-                else:  # llm
-                    user_input = self.llm_user.generate_user_input(
-                        story, conversation_history
-                    )
-            if self.config.enable_verbose_logging:
-                rich.print(
-                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
-                    user_input.content,
-                )
-            if is_end(user_input):
-                break
-            conversation_history.append(user_input)
-            messages, thread_id, conversational_search_data = (
-                self.wxo_inference_backend.stream_messages(
-                    user_input,
-                    agent_name=agent_name,
-                    thread_id=thread_id,
-                    call_tracker=call_tracker,
-                )
-            )
-            if not messages:
-                raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-            if self.config.enable_verbose_logging:
-                for message in messages:
-                    rich.print(
-                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
-                        message.content,
-                    )
-            conversation_history.extend(messages)
-            conversational_search_history_data.extend(conversational_search_data)
-            step += 1
-        return conversation_history, call_tracker, conversational_search_history_data
-def get_wxo_client(
-    service_url: str, tenant_name: str, token: str = None
-) -> WXOClient:
-    if not token:
-        token = tenant_setup(service_url, tenant_name)
-    wxo_client = WXOClient(service_url=service_url, api_key=token)
-    return wxo_client
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
-    auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+    auth_config_path = (
+        f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+    )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
     tenant_name = "local"
     token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
     wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
-    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    resp = wxo_client.get("orchestrate/agents")
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+    resp = wxo_client.get("v1/orchestrate/agents")
     resp = resp.json()
-    print(resp[0])
     for agent in resp:
         print(agent["name"], agent["display_name"])

wxo_agentic_evaluation/scheduler.py ADDED Viewed

@@ -0,0 +1,247 @@
+import glob
+import os
+import re
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from enum import unique
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Set, Tuple
+from rich import print as rich_print
+from rich.progress import Progress
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.clients import Clients
+from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
+def discover_tests(
+    test_paths: List[str], recursive_search: bool = False
+) -> List[str]:
+    """
+    Discover test cases from the given test paths.
+    This function searches for JSON test case files in the provided paths.
+    When recursive_search is enabled, it will search through all subdirectories
+    recursively. Otherwise, it will only search the top level of each directory.
+    Args:
+        test_paths: List of paths to search for test cases
+        recursive_search: Whether to search recursively in subdirectories
+    Returns:
+        List of unique test case names
+    """
+    test_cases = []
+    for test_path in test_paths:
+        # Check if the path exists
+        if not glob.glob(test_path):
+            rich_print(
+                f"[bold yellow]Warning: Path '{test_path}' does not exist. Skipping.[/bold yellow]"
+            )
+            continue
+        if os.path.isdir(test_path):
+            if recursive_search:
+                # Use ** pattern for recursive search
+                pattern = os.path.join(test_path, "**", "*.json")
+                found_files = sorted(glob.glob(pattern, recursive=True))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (recursive search)"
+                )
+                test_cases.extend(found_files)
+            else:
+                # Original behavior for non-recursive search
+                pattern = os.path.join(test_path, "*.json")
+                found_files = sorted(glob.glob(pattern))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (non-recursive)"
+                )
+                test_cases.extend(found_files)
+        else:
+            # If it's a file pattern, just use it directly
+            found_files = sorted(glob.glob(test_path))
+            test_cases.extend(found_files)
+    # Filter out non-JSON files and agent.json files
+    filtered_cases = [
+        tc
+        for tc in test_cases
+        if tc.endswith(".json") and not tc.endswith("agent.json")
+    ]
+    # create mapping of test case name to file path
+    unique_files_map: dict[str, str] = {}
+    for f in filtered_cases:
+        name = Path(f).stem
+        if name not in unique_files_map:
+            unique_files_map[name] = f
+        else:
+            rich_print(
+                f"[bold red]Duplicate test case name detected:[/bold red] "
+                f"'{name}' (skipping file '{f}')"
+            )
+    unique_files = list(unique_files_map.values())
+    rich_print(
+        f"[bold green]Discovered {len(unique_files)} test cases in total[/bold green]"
+    )
+    return unique_files
+def _removesuffix(s: str, suf: str) -> str:
+    """Remove suffix from string (for Python < 3.9 compatibility)"""
+    return s[: -len(suf)] if s.endswith(suf) else s
+def get_available_runs(output_dir: str) -> Dict[str, Set[int]]:
+    """
+    Get available runs from the output directory.
+    Args:
+        output_dir: Output directory path
+    Returns:
+        Dictionary mapping test case stems to sets of run numbers
+    """
+    available_runs = defaultdict(set)
+    for f in glob.glob(os.path.join(output_dir, "messages", "*.messages.json")):
+        # strip the fixed tail
+        name = _removesuffix(os.path.basename(f), ".messages.json")
+        # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+        m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+        if not m:
+            continue
+        stem = m.group("stem")
+        run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
+        available_runs[stem].add(run_num)
+    return available_runs
+def enumerate_jobs(
+    test_cases: List[str],
+    n_runs: int,
+    skip_available_results: bool,
+    output_dir: str,
+) -> List[Tuple[int, str, int]]:
+    """
+    Enumerate jobs to be run.
+    Args:
+        test_cases: List of test case file paths
+        n_runs: Number of runs per test case
+        skip_available_results: Whether to skip available results
+        output_dir: Output directory path
+    Returns:
+        List of tuples (task_n, test_case, run_idx)
+    """
+    jobs = []
+    task_n = 0
+    available_runs = (
+        get_available_runs(output_dir) if skip_available_results else {}
+    )
+    for test_case in test_cases:
+        stem = Path(test_case).stem
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+            # Skip precisely this (test, run) if results exist
+            if skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
+                print(
+                    f"Skipping {stem} run {run_number} as results already exist."
+                )
+                continue
+            jobs.append((task_n, test_case, run_idx))
+            task_n += 1
+    return jobs
+def run_jobs(
+    jobs: List[Tuple[int, str, int]],
+    config: TestConfig,
+    clients: Clients,
+    process_func: Callable,
+    num_workers: int,
+) -> List[Any]:
+    """
+    Run jobs using ThreadPoolExecutor.
+    Args:
+        jobs: List of jobs to run
+        config: Test configuration
+        clients: Tuple of clients (wxo_client, llmaaj_provider, resource_map, inference_backend, llm_user)
+        process_func: Function to process each job
+        num_workers: Number of worker threads
+    Returns:
+        List of results from all jobs
+    """
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich_print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
+    executor = ThreadPoolExecutor(max_workers=num_workers)
+    futures = []
+    for task_n, test_case, run_idx in jobs:
+        future = executor.submit(
+            process_func,
+            task_n,
+            test_case,
+            config,
+            clients.inference_backend,
+            clients.resource_map,
+            clients.llm_user,
+            clients.llmaaj_provider,
+            run_idx,
+        )
+        futures.append(((test_case, run_idx), future))
+    results = []
+    if futures:
+        if LOGGING_ENABLED:
+            # No progress bar when logging - just process tasks
+            for (test_case, run_idx), future in futures:
+                try:
+                    results.extend(future.result())
+                except Exception as e:
+                    import traceback
+                    rich_print(f"test case {test_case} fails with {e}")
+                    traceback.print_exc()
+        else:
+            with Progress() as progress:
+                task1 = progress.add_task(
+                    f"[purple]Evaluating {len(futures)} tasks...",
+                    total=len(futures),
+                )
+                for (test_case, run_idx), future in futures:
+                    try:
+                        results.extend(future.result())
+                    except Exception as e:
+                        import traceback
+                        rich_print(f"test case {test_case} fails with {e}")
+                        traceback.print_exc()
+                    finally:
+                        progress.update(task1, advance=1)
+    return results

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl