ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
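
The headline change in this release is the refactor of wxo_agentic_evaluation/inference_backend.py into a runtime_adapter package, alongside new scheduler, client, comparison, and metrics modules. The new runtime_adapter/runtime_adapter.py base module (+14 lines) is not included in this diff; a minimal sketch of the interface implied by the WXORuntimeAdapter hunks below — everything other than RuntimeAdapter, RuntimeResponse, and Message is an assumption — might look like:

# Hypothetical sketch only; the real runtime_adapter.py is not shown in this diff.
from abc import ABC, abstractmethod

from wxo_agentic_evaluation.type import Message, RuntimeResponse


class RuntimeAdapter(ABC):
    """Runtime interface implied by WXORuntimeAdapter.run() in the diff below."""

    @abstractmethod
    def run(
        self, user_input: Message, context: dict, thread_id: str = None
    ) -> RuntimeResponse:
        """Send one user turn to the runtime and return the resulting messages."""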
@@ -1,20 +1,15 @@
 import json
 import os
 import time
-from collections import deque
-from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Generator, List, Mapping
 
 import requests
 import rich
-import urllib3
 import yaml
-from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning
 
-from wxo_agentic_evaluation.arg_configs import TestConfig
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
+from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
+    RuntimeAdapter,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -27,41 +22,10 @@ from wxo_agentic_evaluation.type import (
     ConversationalSearchResults,
     ConversationSearchMetadata,
     Message,
+    RuntimeResponse,
 )
-from wxo_agentic_evaluation.utils.utils import (
-    Tokenizer,
-    is_saas_url,
-    safe_divide,
-)
-
-tokenizer = Tokenizer()
-
-
-class Roles(Enum):
-    ASSISTANT = "assistant"
-    USER = "user"
-
-
-def calculate_word_overlap_similarity_score(
-    first_message_text: str, second_message_text: str
-) -> float:
-    """Calculate the word overlap similarity score between the .content field of two Message objects.
-    Args:
-        first_message_text (str): The .content field of the first message.
-        second_message_text (str): The .content field of the second message.
-    """
-
-    words_in_first_message = tokenizer(first_message_text)
-    words_in_second_message = tokenizer(second_message_text)
-
-    # Calculate the number of common words
-    common_words = set(words_in_first_message) & set(words_in_second_message)
-    unique_words = set(words_in_first_message + words_in_second_message)
-
-    unique_words_count = len(unique_words)
-    common_words_count = len(common_words)
-
-    return safe_divide(common_words_count, unique_words_count)
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 
 def is_transfer_response(step_detail: Dict):
@@ -73,62 +37,12 @@ def is_transfer_response(step_detail: Dict):
     return False
 
 
-class CallTracker(BaseModel):
-    tool_call: List = []
-    tool_response: List = []
-    generic: List = []
-
-
-class WXOClient:
-    def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
-        self.service_url = service_url
-        self.api_key = api_key
-
-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
-            self._verify_ssl = False if (
-                (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
-                (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
-            ) else (v if isinstance(v, bool) else True)
-
-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )
-
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
-
-
-class WXOInferenceBackend:
+class WXORuntimeAdapter(RuntimeAdapter):
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
         self.enable_saas_mode = is_saas_url(wxo_client.service_url)
 
-    def run(self, user_input: Message, agent_name, thread_id=None):
+    def _runs_endpoint(self, user_input: Message, agent_name, thread_id=None):
         agent_id = self.get_agent_id(agent_name)
         payload = {"message": user_input.model_dump(), "agent_id": agent_id}
         if thread_id:
@@ -244,20 +158,21 @@ class WXOInferenceBackend:
 
         return conversational_search
 
-    def stream_messages(
+    def run(
         self,
         user_input: Message,
-        agent_name: str,
-        call_tracker: CallTracker,
+        context: dict,
         thread_id=None,
-    ) -> Tuple[List[Message], str, List[ConversationalSearch]]:
+    ) -> RuntimeResponse:
+
+        agent_name = context["agent_name"]
+        call_tracker = context["call_tracker"]
         recover = False
        messages = list()
        conversational_search_data = []
 
        start_time = time.time()
        for chunk in self._stream_events(user_input, agent_name, thread_id):
-
            event = chunk.get("event", "")
            if _thread_id := chunk.get("data", {}).get("thread_id"):
                thread_id = _thread_id
@@ -435,7 +350,11 @@ class WXOInferenceBackend:
                 f"Recovered {len(messages)} messages from thread_id {thread_id}",
             )
 
-        return messages, thread_id, conversational_search_data
+        return RuntimeResponse(
+            messages=messages,
+            thread_id=thread_id,
+            context={"conversational_search_data": conversational_search_data},
+        )
 
     def _parse_events(
         self, stream: Generator[bytes, None, None]
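
Taken together, the two hunks above replace the old stream_messages(...) -> Tuple[messages, thread_id, search_data] API with run(user_input, context, thread_id) -> RuntimeResponse. A hedged migration sketch for a caller; the agent name, tracker, client, and input variables below are placeholders rather than names taken from this diff:

adapter = WXORuntimeAdapter(wxo_client=wxo_client)

# before: messages, thread_id, search_data = backend.stream_messages(
#             user_input, agent_name="my_agent", call_tracker=tracker, thread_id=thread_id)
response = adapter.run(
    user_input,
    context={"agent_name": "my_agent", "call_tracker": tracker},
    thread_id=thread_id,
)
messages = response.messages
thread_id = response.thread_id
search_data = response.context["conversational_search_data"]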
@@ -468,7 +387,6 @@
 
         messages = []
         for entry in result:
-
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:
@@ -596,194 +514,6 @@ class WXOInferenceBackend:
         return None
 
 
-class EvaluationController:
-
-    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
-    MESSAGE_SIMILARITY_THRESHOLD = float(
-        os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
-    )  # if any two consecutive messages are >98% similar, the inference loop will be terminated
-    MAX_REPEATING_MESSAGES = int(
-        os.getenv("MAX_REPEATING_MESSAGES", 3)
-    )  # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
-
-    def __init__(
-        self,
-        wxo_inference_backend: WXOInferenceBackend,
-        llm_user: LLMUser,
-        config: TestConfig,
-    ):
-        self.wxo_inference_backend = wxo_inference_backend
-        self.llm_user = llm_user
-        self.config = config
-        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
-
-        if self.repeating_output_detection:
-            # Use deque for efficient O(1) operations
-            self.recent_user_messages = deque(
-                maxlen=self.MAX_REPEATING_MESSAGES
-            )
-            self.recent_assistant_messages = deque(
-                maxlen=self.MAX_REPEATING_MESSAGES
-            )
-
-    def run(
-        self,
-        task_n,
-        story,
-        agent_name: str,
-        starting_user_input: str = None,
-        attack_instructions: str = None,
-    ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
-        step = 0
-        thread_id = None
-        conversation_history: List[Message] = []
-        conversational_search_history_data = []
-        call_tracker = CallTracker()
-
-        # make this configurable
-        while step < self.MAX_CONVERSATION_STEPS:
-            if step == 0 and starting_user_input:
-                user_input = Message(
-                    role="user",
-                    content=starting_user_input,
-                    type=ContentType.text,
-                )
-            else:
-                if self.config.enable_manual_user_input == True:
-                    content = input(
-                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
-                    )
-                    user_input = Message(
-                        role="user", content=content, type=ContentType.text
-                    )
-                else:  # llm
-                    user_input = self.llm_user.generate_user_input(
-                        story,
-                        conversation_history,
-                        attack_instructions=attack_instructions,
-                    )
-            if self.config.enable_verbose_logging:
-                rich.print(
-                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
-                    user_input.content,
-                )
-
-            if self._is_end(user_input):
-                break
-
-            if self.repeating_output_detection:
-                self.recent_user_messages.append(user_input.content)
-
-            conversation_history.append(user_input)
-
-            (
-                messages,
-                thread_id,
-                conversational_search_data,
-            ) = self.wxo_inference_backend.stream_messages(
-                user_input,
-                agent_name=agent_name,
-                thread_id=thread_id,
-                call_tracker=call_tracker,
-            )
-            if not messages:
-                raise RuntimeError(
-                    f"[Task-{task_n}] No messages is produced. Exiting task."
-                )
-
-            for message in messages:
-                if self.repeating_output_detection:
-                    if (
-                        message.role == Roles.ASSISTANT
-                        and message.type == ContentType.text
-                    ):
-                        self.recent_assistant_messages.append(message.content)
-
-                if self.config.enable_verbose_logging:
-                    rich.print(
-                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
-                        message.content,
-                    )
-
-            conversation_history.extend(messages)
-            conversational_search_history_data.extend(
-                conversational_search_data
-            )
-
-            step += 1
-        return (
-            conversation_history,
-            call_tracker,
-            conversational_search_history_data,
-        )
-
-    def _is_looping(self, messages: deque) -> bool:
-        """Checks whether the user or assistant is stuck in a loop.
-        Args:
-            messages (deque): Defines the message cache to be assessed for similarity.
-        Returns:
-            bool: True if stuck in a loop, False otherwise.
-        """
-        sim_count = 0
-
-        if len(messages) >= self.MAX_REPEATING_MESSAGES:
-            oldest_cached_message = messages[0]
-            for i, old_message in enumerate(messages):
-                if i == 0:
-                    continue
-                if oldest_cached_message == old_message:
-                    sim_count += 1
-                elif (
-                    calculate_word_overlap_similarity_score(
-                        oldest_cached_message, old_message
-                    )
-                    > self.MESSAGE_SIMILARITY_THRESHOLD
-                ):
-                    sim_count += 1
-
-        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
-
-    def _is_end(self, current_user_input: Message) -> bool:
-        """
-        Check if the user input indicates the end of the conversation.
-
-        - This function checks if the user input contains 'END'.
-        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
-        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
-        Args:
-            current_user_input (Message): The user message.
-        Returns:
-            bool: True if the user input indicates an END, False otherwise.
-        """
-        current_user_message_content = current_user_input.content.strip()
-
-        # Check if the user message contains 'END'
-        if "END" in current_user_message_content:
-            return True
-
-        if self.repeating_output_detection:
-            # Check for repeating user or assistant messages
-            if self._is_looping(self.recent_user_messages) or self._is_looping(
-                self.recent_assistant_messages
-            ):
-                return True
-
-        return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
-
-
-def get_wxo_client(
-    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
-) -> WXOClient:
-
-    token, resolved_url, env = tenant_setup(service_url, tenant_name)
-    service_url = service_url or resolved_url
-
-    if not (service_url and str(service_url).strip()):
-        raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
-
-    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
-    return wxo_client
-
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
     auth_config_path = (
@@ -791,13 +521,14 @@ if __name__ == "__main__":
     )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
+
     tenant_name = "local"
     token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
 
     wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
-    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    resp = wxo_client.get("orchestrate/agents")
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+    resp = wxo_client.get("v1/orchestrate/agents")
     resp = resp.json()
-    print(resp[0])
+
     for agent in resp:
         print(agent["name"], agent["display_name"])
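
Note that WXOClient, CallTracker, EvaluationController, and get_wxo_client are only removed from this module; per the file list above they appear to move into wxo_client.py, evaluation_controller/evaluation_controller.py, and related new modules. Assumed import paths after the split — the diff confirms only the WXOClient import, the other exported names are not verified here:

from wxo_agentic_evaluation.wxo_client import WXOClient
from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import WXORuntimeAdapter
from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import EvaluationController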
@@ -0,0 +1,247 @@
+import glob
+import os
+import re
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from enum import unique
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Set, Tuple
+
+from rich import print as rich_print
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.clients import Clients
+from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
+
+
+def discover_tests(
+    test_paths: List[str], recursive_search: bool = False
+) -> List[str]:
+    """
+    Discover test cases from the given test paths.
+
+    This function searches for JSON test case files in the provided paths.
+    When recursive_search is enabled, it will search through all subdirectories
+    recursively. Otherwise, it will only search the top level of each directory.
+
+    Args:
+        test_paths: List of paths to search for test cases
+        recursive_search: Whether to search recursively in subdirectories
+
+    Returns:
+        List of unique test case names
+    """
+    test_cases = []
+    for test_path in test_paths:
+        # Check if the path exists
+        if not glob.glob(test_path):
+            rich_print(
+                f"[bold yellow]Warning: Path '{test_path}' does not exist. Skipping.[/bold yellow]"
+            )
+            continue
+
+        if os.path.isdir(test_path):
+            if recursive_search:
+                # Use ** pattern for recursive search
+                pattern = os.path.join(test_path, "**", "*.json")
+                found_files = sorted(glob.glob(pattern, recursive=True))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (recursive search)"
+                )
+                test_cases.extend(found_files)
+            else:
+                # Original behavior for non-recursive search
+                pattern = os.path.join(test_path, "*.json")
+                found_files = sorted(glob.glob(pattern))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (non-recursive)"
+                )
+                test_cases.extend(found_files)
+        else:
+            # If it's a file pattern, just use it directly
+            found_files = sorted(glob.glob(test_path))
+            test_cases.extend(found_files)
+
+    # Filter out non-JSON files and agent.json files
+    filtered_cases = [
+        tc
+        for tc in test_cases
+        if tc.endswith(".json") and not tc.endswith("agent.json")
+    ]
+
+    # create mapping of test case name to file path
+    unique_files_map: dict[str, str] = {}
+
+    for f in filtered_cases:
+        name = Path(f).stem
+        if name not in unique_files_map:
+            unique_files_map[name] = f
+        else:
+            rich_print(
+                f"[bold red]Duplicate test case name detected:[/bold red] "
+                f"'{name}' (skipping file '{f}')"
+            )
+
+    unique_files = list(unique_files_map.values())
+    rich_print(
+        f"[bold green]Discovered {len(unique_files)} test cases in total[/bold green]"
+    )
+    return unique_files
+
+
+def _removesuffix(s: str, suf: str) -> str:
+    """Remove suffix from string (for Python < 3.9 compatibility)"""
+    return s[: -len(suf)] if s.endswith(suf) else s
+
+
+def get_available_runs(output_dir: str) -> Dict[str, Set[int]]:
+    """
+    Get available runs from the output directory.
+
+    Args:
+        output_dir: Output directory path
+
+    Returns:
+        Dictionary mapping test case stems to sets of run numbers
+    """
+    available_runs = defaultdict(set)
+    for f in glob.glob(os.path.join(output_dir, "messages", "*.messages.json")):
+        # strip the fixed tail
+        name = _removesuffix(os.path.basename(f), ".messages.json")
+        # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+        m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+        if not m:
+            continue
+        stem = m.group("stem")
+        run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
+        available_runs[stem].add(run_num)
+
+    return available_runs
+
+
+def enumerate_jobs(
+    test_cases: List[str],
+    n_runs: int,
+    skip_available_results: bool,
+    output_dir: str,
+) -> List[Tuple[int, str, int]]:
+    """
+    Enumerate jobs to be run.
+
+    Args:
+        test_cases: List of test case file paths
+        n_runs: Number of runs per test case
+        skip_available_results: Whether to skip available results
+        output_dir: Output directory path
+
+    Returns:
+        List of tuples (task_n, test_case, run_idx)
+    """
+    jobs = []
+    task_n = 0
+
+    available_runs = (
+        get_available_runs(output_dir) if skip_available_results else {}
+    )
+
+    for test_case in test_cases:
+        stem = Path(test_case).stem
+
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+
+            # Skip precisely this (test, run) if results exist
+            if skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
+                print(
+                    f"Skipping {stem} run {run_number} as results already exist."
+                )
+                continue
+
+            jobs.append((task_n, test_case, run_idx))
+            task_n += 1
+
+    return jobs
+
+
+def run_jobs(
+    jobs: List[Tuple[int, str, int]],
+    config: TestConfig,
+    clients: Clients,
+    process_func: Callable,
+    num_workers: int,
+) -> List[Any]:
+    """
+    Run jobs using ThreadPoolExecutor.
+
+    Args:
+        jobs: List of jobs to run
+        config: Test configuration
+        clients: Tuple of clients (wxo_client, llmaaj_provider, resource_map, inference_backend, llm_user)
+        process_func: Function to process each job
+        num_workers: Number of worker threads
+
+    Returns:
+        List of results from all jobs
+    """
+
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich_print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
+
+    executor = ThreadPoolExecutor(max_workers=num_workers)
+    futures = []
+
+    for task_n, test_case, run_idx in jobs:
+        future = executor.submit(
+            process_func,
+            task_n,
+            test_case,
+            config,
+            clients.inference_backend,
+            clients.resource_map,
+            clients.llm_user,
+            clients.llmaaj_provider,
+            run_idx,
+        )
+        futures.append(((test_case, run_idx), future))
+
+    results = []
+
+    if futures:
+        if LOGGING_ENABLED:
+            # No progress bar when logging - just process tasks
+            for (test_case, run_idx), future in futures:
+                try:
+                    results.extend(future.result())
+                except Exception as e:
+                    import traceback
+
+                    rich_print(f"test case {test_case} fails with {e}")
+
+                    traceback.print_exc()
+        else:
+            with Progress() as progress:
+                task1 = progress.add_task(
+                    f"[purple]Evaluating {len(futures)} tasks...",
+                    total=len(futures),
+                )
+                for (test_case, run_idx), future in futures:
+                    try:
+                        results.extend(future.result())
+                    except Exception as e:
+                        import traceback
+
+                        rich_print(f"test case {test_case} fails with {e}")
+
+                        traceback.print_exc()
+                    finally:
+                        progress.update(task1, advance=1)
+
+    return results
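
The new scheduler.py helpers are designed to compose: discover_tests finds unique JSON test cases, enumerate_jobs expands them into (task_n, test_case, run_idx) tuples while optionally skipping runs that already have results, and run_jobs fans them out over a thread pool. A hedged usage sketch; config, clients, and process_test_case are placeholders, and the actual wiring lives in runner.py and main.py, which this diff does not show in full:

test_cases = discover_tests(["./tests"], recursive_search=True)
jobs = enumerate_jobs(
    test_cases,
    n_runs=2,
    skip_available_results=True,
    output_dir="./results",
)
results = run_jobs(
    jobs,
    config=config,                  # TestConfig instance
    clients=clients,                # Clients bundle (inference_backend, llm_user, ...)
    process_func=process_test_case,
    num_workers=config.num_workers,
)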