ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
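
The headline change in this release is the refactor of wxo_agentic_evaluation/inference_backend.py into a runtime_adapter package, alongside new scheduler, client, comparison, and metrics modules. The new runtime_adapter/runtime_adapter.py base module (+14 lines) is not included in this diff; a minimal sketch of the interface implied by the WXORuntimeAdapter hunks below — everything other than RuntimeAdapter, RuntimeResponse, and Message is an assumption — might look like:

# Hypothetical sketch only; the real runtime_adapter.py is not shown in this diff.
from abc import ABC, abstractmethod

from wxo_agentic_evaluation.type import Message, RuntimeResponse


class RuntimeAdapter(ABC):
    """Runtime interface implied by WXORuntimeAdapter.run() in the diff below."""

    @abstractmethod
    def run(
        self, user_input: Message, context: dict, thread_id: str = None
    ) -> RuntimeResponse:
        """Send one user turn to the runtime and return the resulting messages."""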
@@ -1,20 +1,15 @@
 import json
 import os
 import time
-from collections import deque
-from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Generator, List, Mapping
 
 import requests
 import rich
-import urllib3
 import yaml
-from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning
 
-from wxo_agentic_evaluation.arg_configs import TestConfig
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
+from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
+    RuntimeAdapter,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -27,41 +22,10 @@ from wxo_agentic_evaluation.type import (
     ConversationalSearchResults,
     ConversationSearchMetadata,
     Message,
+    RuntimeResponse,
 )
-from wxo_agentic_evaluation.utils.utils import (
-    Tokenizer,
-    is_saas_url,
-    safe_divide,
-)
-
-tokenizer = Tokenizer()
-
-
-class Roles(Enum):
-    ASSISTANT = "assistant"
-    USER = "user"
-
-
-def calculate_word_overlap_similarity_score(
-    first_message_text: str, second_message_text: str
-) -> float:
-    """Calculate the word overlap similarity score between the .content field of two Message objects.
-    Args:
-        first_message_text (str): The .content field of the first message.
-        second_message_text (str): The .content field of the second message.
-    """
-
-    words_in_first_message = tokenizer(first_message_text)
-    words_in_second_message = tokenizer(second_message_text)
-
-    # Calculate the number of common words
-    common_words = set(words_in_first_message) & set(words_in_second_message)
-    unique_words = set(words_in_first_message + words_in_second_message)
-
-    unique_words_count = len(unique_words)
-    common_words_count = len(common_words)
-
-    return safe_divide(common_words_count, unique_words_count)
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 
 def is_transfer_response(step_detail: Dict):
@@ -73,62 +37,12 @@ def is_transfer_response(step_detail: Dict):
     return False
 
 
-class CallTracker(BaseModel):
-    tool_call: List = []
-    tool_response: List = []
-    generic: List = []
-
-
-class WXOClient:
-    def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
-        self.service_url = service_url
-        self.api_key = api_key
-
-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
-            self._verify_ssl = False if (
-                (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
-                (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
-            ) else (v if isinstance(v, bool) else True)
-
-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )
-
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
-
-
-class WXOInferenceBackend:
+class WXORuntimeAdapter(RuntimeAdapter):
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
         self.enable_saas_mode = is_saas_url(wxo_client.service_url)
 
-    def run(self, user_input: Message, agent_name, thread_id=None):
+    def _runs_endpoint(self, user_input: Message, agent_name, thread_id=None):
         agent_id = self.get_agent_id(agent_name)
         payload = {"message": user_input.model_dump(), "agent_id": agent_id}
         if thread_id:
@@ -244,20 +158,21 @@ class WXOInferenceBackend:
 
         return conversational_search
 
-    def stream_messages(
+    def run(
         self,
         user_input: Message,
-        agent_name: str,
-        call_tracker: CallTracker,
+        context: dict,
         thread_id=None,
-    ) -> Tuple[List[Message], str, List[ConversationalSearch]]:
+    ) -> RuntimeResponse:
+
+        agent_name = context["agent_name"]
+        call_tracker = context["call_tracker"]
         recover = False
        messages = list()
        conversational_search_data = []
 
        start_time = time.time()
        for chunk in self._stream_events(user_input, agent_name, thread_id):
-
            event = chunk.get("event", "")
            if _thread_id := chunk.get("data", {}).get("thread_id"):
                thread_id = _thread_id
@@ -435,7 +350,11 @@ class WXOInferenceBackend:
                 f"Recovered {len(messages)} messages from thread_id {thread_id}",
             )
 
-        return messages, thread_id, conversational_search_data
+        return RuntimeResponse(
+            messages=messages,
+            thread_id=thread_id,
+            context={"conversational_search_data": conversational_search_data},
+        )
 
     def _parse_events(
         self, stream: Generator[bytes, None, None]
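
Taken together, the two hunks above replace the old stream_messages(...) -> Tuple[messages, thread_id, search_data] API with run(user_input, context, thread_id) -> RuntimeResponse. A hedged migration sketch for a caller; the agent name, tracker, client, and input variables below are placeholders rather than names taken from this diff:

adapter = WXORuntimeAdapter(wxo_client=wxo_client)

# before: messages, thread_id, search_data = backend.stream_messages(
#             user_input, agent_name="my_agent", call_tracker=tracker, thread_id=thread_id)
response = adapter.run(
    user_input,
    context={"agent_name": "my_agent", "call_tracker": tracker},
    thread_id=thread_id,
)
messages = response.messages
thread_id = response.thread_id
search_data = response.context["conversational_search_data"]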
@@ -468,7 +387,6 @@
 
         messages = []
         for entry in result:
-
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:
@@ -596,194 +514,6 @@ class WXOInferenceBackend:
         return None
 
 
-class EvaluationController:
-
-    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
-    MESSAGE_SIMILARITY_THRESHOLD = float(
-        os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
-    )  # if any two consecutive messages are >98% similar, the inference loop will be terminated
-    MAX_REPEATING_MESSAGES = int(
-        os.getenv("MAX_REPEATING_MESSAGES", 3)
-    )  # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
-
-    def __init__(
-        self,
-        wxo_inference_backend: WXOInferenceBackend,
-        llm_user: LLMUser,
-        config: TestConfig,
-    ):
-        self.wxo_inference_backend = wxo_inference_backend
-        self.llm_user = llm_user
-        self.config = config
-        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
-
-        if self.repeating_output_detection:
-            # Use deque for efficient O(1) operations
-            self.recent_user_messages = deque(
-                maxlen=self.MAX_REPEATING_MESSAGES
-            )
-            self.recent_assistant_messages = deque(
-                maxlen=self.MAX_REPEATING_MESSAGES
-            )
-
-    def run(
-        self,
-        task_n,
-        story,
-        agent_name: str,
-        starting_user_input: str = None,
-        attack_instructions: str = None,
-    ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
-        step = 0
-        thread_id = None
-        conversation_history: List[Message] = []
-        conversational_search_history_data = []
-        call_tracker = CallTracker()
-
-        # make this configurable
-        while step < self.MAX_CONVERSATION_STEPS:
-            if step == 0 and starting_user_input:
-                user_input = Message(
-                    role="user",
-                    content=starting_user_input,
-                    type=ContentType.text,
-                )
-            else:
-                if self.config.enable_manual_user_input == True:
-                    content = input(
-                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
-                    )
-                    user_input = Message(
-                        role="user", content=content, type=ContentType.text
-                    )
-                else:  # llm
-                    user_input = self.llm_user.generate_user_input(
-                        story,
-                        conversation_history,
-                        attack_instructions=attack_instructions,
-                    )
-            if self.config.enable_verbose_logging:
-                rich.print(
-                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
-                    user_input.content,
-                )
-
-            if self._is_end(user_input):
-                break
-
-            if self.repeating_output_detection:
-                self.recent_user_messages.append(user_input.content)
-
-            conversation_history.append(user_input)
-
-            (
-                messages,
-                thread_id,
-                conversational_search_data,
-            ) = self.wxo_inference_backend.stream_messages(
-                user_input,
-                agent_name=agent_name,
-                thread_id=thread_id,
-                call_tracker=call_tracker,
-            )
-            if not messages:
-                raise RuntimeError(
-                    f"[Task-{task_n}] No messages is produced. Exiting task."
-                )
-
-            for message in messages:
-                if self.repeating_output_detection:
-                    if (
-                        message.role == Roles.ASSISTANT
-                        and message.type == ContentType.text
-                    ):
-                        self.recent_assistant_messages.append(message.content)
-
-                if self.config.enable_verbose_logging:
-                    rich.print(
-                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
-                        message.content,
-                    )
-
-            conversation_history.extend(messages)
-            conversational_search_history_data.extend(
-                conversational_search_data
-            )
-
-            step += 1
-        return (
-            conversation_history,
-            call_tracker,
-            conversational_search_history_data,
-        )
-
-    def _is_looping(self, messages: deque) -> bool:
-        """Checks whether the user or assistant is stuck in a loop.
-        Args:
-            messages (deque): Defines the message cache to be assessed for similarity.
-        Returns:
-            bool: True if stuck in a loop, False otherwise.
-        """
-        sim_count = 0
-
-        if len(messages) >= self.MAX_REPEATING_MESSAGES:
-            oldest_cached_message = messages[0]
-            for i, old_message in enumerate(messages):
-                if i == 0:
-                    continue
-                if oldest_cached_message == old_message:
-                    sim_count += 1
-                elif (
-                    calculate_word_overlap_similarity_score(
-                        oldest_cached_message, old_message
-                    )
-                    > self.MESSAGE_SIMILARITY_THRESHOLD
-                ):
-                    sim_count += 1
-
-        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
-
-    def _is_end(self, current_user_input: Message) -> bool:
-        """
-        Check if the user input indicates the end of the conversation.
-
-        - This function checks if the user input contains 'END'.
-        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
-        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
-        Args:
-            current_user_input (Message): The user message.
-        Returns:
-            bool: True if the user input indicates an END, False otherwise.
-        """
-        current_user_message_content = current_user_input.content.strip()
-
-        # Check if the user message contains 'END'
-        if "END" in current_user_message_content:
-            return True
-
-        if self.repeating_output_detection:
-            # Check for repeating user or assistant messages
-            if self._is_looping(self.recent_user_messages) or self._is_looping(
-                self.recent_assistant_messages
-            ):
-                return True
-
-        return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
-
-
-def get_wxo_client(
-    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
-) -> WXOClient:
-
-    token, resolved_url, env = tenant_setup(service_url, tenant_name)
-    service_url = service_url or resolved_url
-
-    if not (service_url and str(service_url).strip()):
-        raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
-
-    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
-    return wxo_client
-
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
     auth_config_path = (
@@ -791,13 +521,14 @@ if __name__ == "__main__":
     )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
+
     tenant_name = "local"
     token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
 
     wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
-    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    resp = wxo_client.get("orchestrate/agents")
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+    resp = wxo_client.get("v1/orchestrate/agents")
     resp = resp.json()
-    print(resp[0])
+
     for agent in resp:
         print(agent["name"], agent["display_name"])
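
Note that WXOClient, CallTracker, EvaluationController, and get_wxo_client are only removed from this module; per the file list above they appear to move into wxo_client.py, evaluation_controller/evaluation_controller.py, and related new modules. Assumed import paths after the split — the diff confirms only the WXOClient import, the other exported names are not verified here:

from wxo_agentic_evaluation.wxo_client import WXOClient
from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import WXORuntimeAdapter
from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import EvaluationController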
@@ -0,0 +1,247 @@
+import glob
+import os
+import re
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from enum import unique
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Set, Tuple
+
+from rich import print as rich_print
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.clients import Clients
+from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
+
+
+def discover_tests(
+    test_paths: List[str], recursive_search: bool = False
+) -> List[str]:
+    """
+    Discover test cases from the given test paths.
+
+    This function searches for JSON test case files in the provided paths.
+    When recursive_search is enabled, it will search through all subdirectories
+    recursively. Otherwise, it will only search the top level of each directory.
+
+    Args:
+        test_paths: List of paths to search for test cases
+        recursive_search: Whether to search recursively in subdirectories
+
+    Returns:
+        List of unique test case names
+    """
+    test_cases = []
+    for test_path in test_paths:
+        # Check if the path exists
+        if not glob.glob(test_path):
+            rich_print(
+                f"[bold yellow]Warning: Path '{test_path}' does not exist. Skipping.[/bold yellow]"
+            )
+            continue
+
+        if os.path.isdir(test_path):
+            if recursive_search:
+                # Use ** pattern for recursive search
+                pattern = os.path.join(test_path, "**", "*.json")
+                found_files = sorted(glob.glob(pattern, recursive=True))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (recursive search)"
+                )
+                test_cases.extend(found_files)
+            else:
+                # Original behavior for non-recursive search
+                pattern = os.path.join(test_path, "*.json")
+                found_files = sorted(glob.glob(pattern))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (non-recursive)"
+                )
+                test_cases.extend(found_files)
+        else:
+            # If it's a file pattern, just use it directly
+            found_files = sorted(glob.glob(test_path))
+            test_cases.extend(found_files)
+
+    # Filter out non-JSON files and agent.json files
+    filtered_cases = [
+        tc
+        for tc in test_cases
+        if tc.endswith(".json") and not tc.endswith("agent.json")
+    ]
+
+    # create mapping of test case name to file path
+    unique_files_map: dict[str, str] = {}
+
+    for f in filtered_cases:
+        name = Path(f).stem
+        if name not in unique_files_map:
+            unique_files_map[name] = f
+        else:
+            rich_print(
+                f"[bold red]Duplicate test case name detected:[/bold red] "
+                f"'{name}' (skipping file '{f}')"
+            )
+
+    unique_files = list(unique_files_map.values())
+    rich_print(
+        f"[bold green]Discovered {len(unique_files)} test cases in total[/bold green]"
+    )
+    return unique_files
+
+
+def _removesuffix(s: str, suf: str) -> str:
+    """Remove suffix from string (for Python < 3.9 compatibility)"""
+    return s[: -len(suf)] if s.endswith(suf) else s
+
+
+def get_available_runs(output_dir: str) -> Dict[str, Set[int]]:
+    """
+    Get available runs from the output directory.
+
+    Args:
+        output_dir: Output directory path
+
+    Returns:
+        Dictionary mapping test case stems to sets of run numbers
+    """
+    available_runs = defaultdict(set)
+    for f in glob.glob(os.path.join(output_dir, "messages", "*.messages.json")):
+        # strip the fixed tail
+        name = _removesuffix(os.path.basename(f), ".messages.json")
+        # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+        m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+        if not m:
+            continue
+        stem = m.group("stem")
+        run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
+        available_runs[stem].add(run_num)
+
+    return available_runs
+
+
+def enumerate_jobs(
+    test_cases: List[str],
+    n_runs: int,
+    skip_available_results: bool,
+    output_dir: str,
+) -> List[Tuple[int, str, int]]:
+    """
+    Enumerate jobs to be run.
+
+    Args:
+        test_cases: List of test case file paths
+        n_runs: Number of runs per test case
+        skip_available_results: Whether to skip available results
+        output_dir: Output directory path
+
+    Returns:
+        List of tuples (task_n, test_case, run_idx)
+    """
+    jobs = []
+    task_n = 0
+
+    available_runs = (
+        get_available_runs(output_dir) if skip_available_results else {}
+    )
+
+    for test_case in test_cases:
+        stem = Path(test_case).stem
+
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+
+            # Skip precisely this (test, run) if results exist
+            if skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
+                print(
+                    f"Skipping {stem} run {run_number} as results already exist."
+                )
+                continue
+
+            jobs.append((task_n, test_case, run_idx))
+            task_n += 1
+
+    return jobs
+
+
+def run_jobs(
+    jobs: List[Tuple[int, str, int]],
+    config: TestConfig,
+    clients: Clients,
+    process_func: Callable,
+    num_workers: int,
+) -> List[Any]:
+    """
+    Run jobs using ThreadPoolExecutor.
+
+    Args:
+        jobs: List of jobs to run
+        config: Test configuration
+        clients: Tuple of clients (wxo_client, llmaaj_provider, resource_map, inference_backend, llm_user)
+        process_func: Function to process each job
+        num_workers: Number of worker threads
+
+    Returns:
+        List of results from all jobs
+    """
+
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich_print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
+
+    executor = ThreadPoolExecutor(max_workers=num_workers)
+    futures = []
+
+    for task_n, test_case, run_idx in jobs:
+        future = executor.submit(
+            process_func,
+            task_n,
+            test_case,
+            config,
+            clients.inference_backend,
+            clients.resource_map,
+            clients.llm_user,
+            clients.llmaaj_provider,
+            run_idx,
+        )
+        futures.append(((test_case, run_idx), future))
+
+    results = []
+
+    if futures:
+        if LOGGING_ENABLED:
+            # No progress bar when logging - just process tasks
+            for (test_case, run_idx), future in futures:
+                try:
+                    results.extend(future.result())
+                except Exception as e:
+                    import traceback
+
+                    rich_print(f"test case {test_case} fails with {e}")
+
+                    traceback.print_exc()
+        else:
+            with Progress() as progress:
+                task1 = progress.add_task(
+                    f"[purple]Evaluating {len(futures)} tasks...",
+                    total=len(futures),
+                )
+                for (test_case, run_idx), future in futures:
+                    try:
+                        results.extend(future.result())
+                    except Exception as e:
+                        import traceback
+
+                        rich_print(f"test case {test_case} fails with {e}")
+
+                        traceback.print_exc()
+                    finally:
+                        progress.update(task1, advance=1)
+
+    return results
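
The new scheduler.py helpers are designed to compose: discover_tests finds unique JSON test cases, enumerate_jobs expands them into (task_n, test_case, run_idx) tuples while optionally skipping runs that already have results, and run_jobs fans them out over a thread pool. A hedged usage sketch; config, clients, and process_test_case are placeholders, and the actual wiring lives in runner.py and main.py, which this diff does not show in full:

test_cases = discover_tests(["./tests"], recursive_search=True)
jobs = enumerate_jobs(
    test_cases,
    n_runs=2,
    skip_available_results=True,
    output_dir="./results",
)
results = run_jobs(
    jobs,
    config=config,                  # TestConfig instance
    clients=clients,                # Clients bundle (inference_backend, llm_user, ...)
    process_func=process_test_case,
    num_workers=config.num_workers,
)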