ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.0.6
3
+ Version: 1.0.8
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
53
53
  pip install -e .
54
54
  ```
55
55
 
56
+ ## contribution guide
57
+ ### secret resolution
58
+ install the detect-secrets utility:
59
+ ```
60
+ pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
61
+ ```
62
+ run the scan & resolve detections:
63
+ ```
64
+ detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
65
+ ```
66
+
56
67
 
57
68
  ## quick experiment against the default wxo-dev env
58
69
  ```bash
@@ -1,28 +1,28 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
3
3
  wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
- wxo_agentic_evaluation/arg_configs.py,sha256=UCrGcakFaAM3reFquMn03qNtKe7Pg8ScbOF0K7o8VDU,2240
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
5
5
  wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
- wxo_agentic_evaluation/data_annotator.py,sha256=DJVG2CdhJRAJ3X1ARbrsn9bPjTuytCDGIBM4PEexfnk,8214
7
- wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
8
- wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
6
+ wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
+ wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
8
+ wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
9
9
  wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
10
10
  wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
11
11
  wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
12
- wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
13
- wxo_agentic_evaluation/record_chat.py,sha256=IAKCZ6Bc4natHA4SyNtC4tjo-0MDglwBcY5AWvXSgR0,7317
14
- wxo_agentic_evaluation/resource_map.py,sha256=-dIWQdpEpPeSCbDeYfRupG9KV1Q4NlHGb5KXywjkulM,1645
12
+ wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
13
+ wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
14
+ wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
15
15
  wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
16
16
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
17
- wxo_agentic_evaluation/tool_planner.py,sha256=e-lBb4w1klT1HOL9BTwae3lkGv5VBuYC397mSJgOhus,12622
17
+ wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
18
18
  wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
19
19
  wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
20
20
  wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
21
21
  wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
22
22
  wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
23
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=LY3gMNzfIEwjpQkx5_2iZFHGQiUL4ymEkKL1dc2uKq4,1491
23
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
24
24
  wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
25
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=bCXUsW0OeUzwfSSYObgfAmEU5vARkD-PblYU-mU9aPY,2507
25
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
26
26
  wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
27
27
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
44
44
  wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
46
46
  wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
47
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=X5tiE0IKCR2CqhwEGm91LOdzFZQWSXzXQgLOtzi6ng0,4002
47
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
48
48
  wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
49
49
  wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
50
50
  wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
51
51
  wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
52
52
  wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
53
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/METADATA,sha256=BqQELgtuSVS6tHNQ5nGkgfwPBiAFgTnvgZbWG3hjCgM,17674
54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
- ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info/RECORD,,
53
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
54
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
@@ -74,6 +74,7 @@ class ChatRecordingConfig:
74
74
  service_url: str = "http://localhost:4321"
75
75
  tenant_name: str = "local"
76
76
  token: str = None
77
+ max_retries: int = 5
77
78
 
78
79
 
79
80
  @dataclass
@@ -247,11 +247,14 @@ class DataAnnotator:
247
247
  }
248
248
  goal_details.append(summarize_step)
249
249
  break
250
-
251
- if summarize_step:
252
- goals[previous] = ["summarize"]
253
- else:
250
+
251
+ if previous is None:
252
+ goals["summarize"] = []
253
+ elif summarize_step is None:
254
254
  goals[previous] = []
255
+ else:
256
+ goals[previous] = ["summarize"]
257
+
255
258
 
256
259
  def generate(self) -> Dict:
257
260
  """Generate the final dataset"""
@@ -218,7 +218,7 @@ class EvaluationPackage:
218
218
  tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
219
219
  )
220
220
  tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
221
-
221
+ correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
222
222
  for message in self.messages:
223
223
  if message.type == ContentType.tool_call:
224
224
 
@@ -244,6 +244,7 @@ class EvaluationPackage:
244
244
 
245
245
  continue
246
246
 
247
+ # TO-DO: re-think how deduplication works in the context of precision & recall
247
248
  tool_call_and_routing_metrics.total_tool_calls += 1
248
249
 
249
250
  # evaluating more than once is fine
@@ -262,8 +263,8 @@ class EvaluationPackage:
262
263
  if msg_tool_call["args"] == goal_detail.args:
263
264
  labelled_messages.append(goal_detail.name)
264
265
  labelled_messages_without_text_step.append(goal_detail.name)
265
-
266
- tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
266
+ correct_tool_calls.add(goal_detail.name)
267
+ #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
267
268
  found = True
268
269
  message_outcome = ExtendedMessage(message=message)
269
270
  message_outcomes.append(message_outcome)
@@ -308,6 +309,9 @@ class EvaluationPackage:
308
309
  else:
309
310
  message_outcome = ExtendedMessage(message=message)
310
311
  message_outcomes.append(message_outcome)
312
+
313
+ tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
314
+
311
315
  assistant_responses = [
312
316
  message
313
317
  for message in self.messages
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
23
23
  "decoding_method": "greedy",
24
24
  "max_new_tokens": 4096,
25
25
  }
26
- wai_client = get_provider(config=ProviderConfig(), params=llm_decode_parameter)
26
+ wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
27
27
  prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
28
28
  res = wai_client.query(prompt)
29
29
  res = res.strip()
@@ -3,7 +3,7 @@ from rich.console import Console
3
3
 
4
4
  from wxo_agentic_evaluation.external_agent import generate_starting_sentence
5
5
  from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
6
- from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
6
+ from wxo_agentic_evaluation.service_provider import get_provider
7
7
  from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
8
8
 
9
9
  class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
19
19
 
20
20
  kw_gen_config = KeywordsGenerationConfig()
21
21
 
22
- provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
23
22
  llm_decode_parameter = {
24
23
  "min_new_tokens": 0,
25
24
  "decoding_method": "greedy",
26
25
  "max_new_tokens": 256,
27
26
  }
28
- wai_client = get_provider(config=provider_config, params=llm_decode_parameter)
27
+ wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
29
28
 
30
29
  self.kw_gen = KeywordsGenerationLLM(
31
30
  provider=wai_client,
@@ -6,6 +6,8 @@ import rich
6
6
  import time
7
7
  from pydantic import BaseModel
8
8
  from typing import List, Generator, Dict, Tuple, Mapping, Any
9
+ from enum import Enum
10
+ from collections import deque
9
11
 
10
12
  from wxo_agentic_evaluation.type import (
11
13
  ContentType,
@@ -23,12 +25,27 @@ from wxo_agentic_evaluation.arg_configs import TestConfig
23
25
  from wxo_agentic_evaluation.service_instance import tenant_setup
24
26
  from wxo_agentic_evaluation.utils.utils import is_saas_url
25
27
 
28
+ class Roles(Enum):
29
+ ASSISTANT = "assistant"
30
+ USER = "user"
26
31
 
27
- def is_end(user_input: Message):
28
- if "END" in user_input.content.strip():
29
- return True
30
- return False
32
+ def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
33
+ """Calculate the word overlap similarity score between the .content field of two Message objects.
34
+ Args:
35
+ first_message_text (str): The .content field of the first message.
36
+ second_message_text (str): The .content field of the second message.
37
+ """
38
+ words_in_first_message = first_message_text.lower().split()
39
+ words_in_second_message = second_message_text.lower().split()
31
40
 
41
+ # Calculate the number of common words
42
+ common_words = set(words_in_first_message) & set(words_in_second_message)
43
+ unique_words = set(words_in_first_message + words_in_second_message)
44
+ unique_words_count = len(unique_words)
45
+
46
+ if unique_words_count == 0:
47
+ return 0.0
48
+ return len(common_words) / unique_words_count
32
49
 
33
50
  def is_transfer_response(step_detail: Dict):
34
51
  # this is not very reliable
@@ -504,6 +521,11 @@ class WXOInferenceBackend:
504
521
 
505
522
 
506
523
  class EvaluationController:
524
+
525
+ MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
526
+ MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
527
+ MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
528
+
507
529
  def __init__(
508
530
  self,
509
531
  wxo_inference_backend: WXOInferenceBackend,
@@ -513,6 +535,12 @@ class EvaluationController:
513
535
  self.wxo_inference_backend = wxo_inference_backend
514
536
  self.llm_user = llm_user
515
537
  self.config = config
538
+ self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
539
+
540
+ if self.repeating_output_detection:
541
+ # Use deque for efficient O(1) operations
542
+ self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
543
+ self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
516
544
 
517
545
  def run(
518
546
  self, task_n, story, agent_name: str, starting_user_input: str = None
@@ -522,9 +550,9 @@ class EvaluationController:
522
550
  conversation_history: List[Message] = []
523
551
  conversational_search_history_data = []
524
552
  call_tracker = CallTracker()
525
- # make this configurable
526
- while step < 20:
527
553
 
554
+ # step limit is configurable via the MAX_CONVERSATION_STEPS environment variable (default 20)
555
+ while step < self.MAX_CONVERSATION_STEPS:
528
556
  if step == 0 and starting_user_input:
529
557
  user_input = Message(
530
558
  role="user", content=starting_user_input, type=ContentType.text
@@ -546,9 +574,15 @@ class EvaluationController:
546
574
  f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
547
575
  user_input.content,
548
576
  )
549
- if is_end(user_input):
577
+
578
+ if self._is_end(user_input):
550
579
  break
580
+
581
+ if self.repeating_output_detection:
582
+ self.recent_user_messages.append(user_input.content)
583
+
551
584
  conversation_history.append(user_input)
585
+
552
586
  messages, thread_id, conversational_search_data = (
553
587
  self.wxo_inference_backend.stream_messages(
554
588
  user_input,
@@ -559,16 +593,70 @@ class EvaluationController:
559
593
  )
560
594
  if not messages:
561
595
  raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
562
- if self.config.enable_verbose_logging:
563
- for message in messages:
564
- rich.print(
565
- f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
566
- message.content,
567
- )
596
+
597
+ for message in messages:
598
+ if self.repeating_output_detection:
599
+ if message.role == Roles.ASSISTANT and message.type == ContentType.text:
600
+ self.recent_assistant_messages.append(message.content)
601
+
602
+ if self.config.enable_verbose_logging:
603
+ rich.print(
604
+ f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
605
+ message.content,
606
+ )
607
+
568
608
  conversation_history.extend(messages)
569
609
  conversational_search_history_data.extend(conversational_search_data)
610
+
570
611
  step += 1
571
612
  return conversation_history, call_tracker, conversational_search_history_data
613
+
614
+ def _is_looping(self, messages: deque) -> bool:
615
+ """Checks whether the user or assistant is stuck in a loop.
616
+ Args:
617
+ messages (deque): Defines the message cache to be assessed for similarity.
618
+ Returns:
619
+ bool: True if stuck in a loop, False otherwise.
620
+ """
621
+ sim_count = 0
622
+
623
+ if len(messages) >= self.MAX_REPEATING_MESSAGES:
624
+ oldest_cached_message = messages[0]
625
+ for i, old_message in enumerate(messages):
626
+ if i == 0:
627
+ continue
628
+ if oldest_cached_message == old_message:
629
+ sim_count += 1
630
+ elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
631
+ sim_count += 1
632
+
633
+ return sim_count >= self.MAX_REPEATING_MESSAGES - 1
634
+
635
+ def _is_end(self, current_user_input: Message) -> bool:
636
+ """
637
+ Check if the user input indicates the end of the conversation.
638
+
639
+ - This function checks if the user input contains 'END'.
640
+ - An END is also triggered when the message cache(s) is filled with messages that are too similar.
641
+ - The repetition check runs ONLY when self.repeating_output_detection is True (i.e. EvaluationController.MAX_REPEATING_MESSAGES >= 2)
642
+ Args:
643
+ current_user_input (Message): The user message.
644
+ Returns:
645
+ bool: True if the user input indicates an END, False otherwise.
646
+ """
647
+ current_user_message_content = current_user_input.content.strip()
648
+
649
+ # Check if the user message contains 'END'
650
+ if "END" in current_user_message_content:
651
+ return True
652
+
653
+ if self.repeating_output_detection:
654
+ # Check for repeating user or assistant messages
655
+ if (self._is_looping(self.recent_user_messages) or
656
+ self._is_looping(self.recent_assistant_messages)):
657
+ return True
658
+
659
+ return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
572
660
 
573
661
  def get_wxo_client(
574
662
  service_url: str, tenant_name: str, token: str = None
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
107
107
 
108
108
  def main(config: TestConfig):
109
109
  executor = ThreadPoolExecutor(max_workers=config.num_workers)
110
+ if config.num_workers > 1 and config.enable_manual_user_input:
111
+ rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
112
+ config.enable_manual_user_input = False # disable manual user input for parallel execution
113
+ # reason: threads continue to stream messages while waiting for user input, which is not desired
114
+ # and the manual input prompt is not labelled properly in the UI
110
115
  wxo_client = get_wxo_client(
111
116
  config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
112
117
  )
@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
43
43
  else:
44
44
  path = "v1/orchestrate/runs"
45
45
 
46
- initial_response = wxo_client.get(
47
- path, {"limit": limit, "offset": 0}
48
- ).json()
46
+ initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
49
47
  total_runs = initial_response["total"]
50
48
  all_runs.extend(initial_response["data"])
51
49
 
52
50
  while len(all_runs) < total_runs:
53
51
  offset += limit
54
- response = wxo_client.get(
55
- path, {"limit": limit, "offset": offset}
56
- ).json()
52
+ response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
57
53
  all_runs.extend(response["data"])
58
54
 
59
55
  # Sort runs by completed_at in descending order (most recent first)
@@ -92,9 +88,10 @@ def annotate_messages(
92
88
  annotated_data["agent"] = agent_name
93
89
 
94
90
  annotated_data["story"] = generate_story(annotated_data)
95
-
91
+
96
92
  return annotated_data
97
93
 
94
+
98
95
  def has_messages_changed(
99
96
  thread_id: str,
100
97
  messages: List[Message],
@@ -111,32 +108,27 @@ def has_messages_changed(
111
108
  return False
112
109
 
113
110
 
114
- def record_chats(config: ChatRecordingConfig):
111
+ def _record(config: ChatRecordingConfig, bad_threads: set):
115
112
  """Record chats in background mode"""
116
113
  start_time = datetime.utcnow()
117
114
  processed_threads = set()
118
115
  previous_input_hash: dict[str, str] = {}
119
116
 
120
- rich.print(
121
- f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
122
- )
123
117
  if config.token is None:
124
118
  config.token = tenant_setup(config.service_url, config.tenant_name)
125
119
  wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
126
120
  inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
127
- try:
128
- while True:
121
+
122
+ retry_count = 0
123
+ while retry_count < config.max_retries:
124
+ thread_id = None
125
+ try:
129
126
  all_runs = get_all_runs(wxo_client)
130
127
  seen_threads = set()
131
128
  # Process only new runs that started after our recording began
132
129
  for run in all_runs:
133
130
  thread_id = run.get("thread_id")
134
- try:
135
- agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
136
- except Exception as e:
137
- rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
138
- continue
139
- if thread_id in seen_threads or agent_name is None:
131
+ if (thread_id in bad_threads) or (thread_id in seen_threads):
140
132
  continue
141
133
  seen_threads.add(thread_id)
142
134
  started_at = run.get("started_at")
@@ -162,11 +154,17 @@ def record_chats(config: ChatRecordingConfig):
162
154
  try:
163
155
  messages = inference_backend.get_messages(thread_id)
164
156
 
165
- if not has_messages_changed(
166
- thread_id,
167
- messages,
168
- previous_input_hash,
169
- ):
157
+ if not has_messages_changed(thread_id, messages, previous_input_hash):
158
+ continue
159
+
160
+ try:
161
+ agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
162
+ except Exception as e:
163
+ rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
164
+ raise
165
+
166
+ if agent_name is None:
167
+ rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
170
168
  continue
171
169
 
172
170
  annotated_data = annotate_messages(
@@ -180,19 +178,37 @@ def record_chats(config: ChatRecordingConfig):
180
178
  with open(annotation_filename, "w") as f:
181
179
  json.dump(annotated_data, f, indent=4)
182
180
  except Exception as e:
183
- rich.print(
184
- f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
185
- )
181
+ rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
182
+ raise
186
183
  except (ValueError, TypeError) as e:
187
- rich.print(
188
- f"[yellow]WARNING:[/yellow] Invalid timestamp format for thread {thread_id}: {str(e)}"
189
- )
184
+ rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
185
+ raise
186
+
187
+ retry_count = 0
188
+ time.sleep(2)
190
189
 
191
- time.sleep(2) # Poll every 2 seconds
190
+ except KeyboardInterrupt:
191
+ rich.print("\n[yellow]Recording stopped by user[/yellow]")
192
+ break
192
193
 
193
- except KeyboardInterrupt:
194
- rich.print("\n[yellow]Recording stopped by user[/yellow]")
194
+ except Exception as e:
195
+ if thread_id is None:
196
+ rich.print(f"[red]ERROR:[/red] {e}")
197
+ break
195
198
 
199
+ time.sleep(1)
200
+ retry_count += 1
201
+ if retry_count >= config.max_retries:
202
+ rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
203
+ bad_threads.add(thread_id)
204
+ _record(config, bad_threads)
205
+
206
+ def record_chats(config: ChatRecordingConfig):
207
+ rich.print(
208
+ f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
209
+ )
210
+ bad_threads = set()
211
+ _record(config, bad_threads)
196
212
 
197
213
  if __name__ == "__main__":
198
214
  record_chats(CLI(ChatRecordingConfig, as_positional=False))
@@ -14,7 +14,7 @@ class ResourceMap:
14
14
  if is_saas_url(self.wxo_client.service_url):
15
15
  # TO-DO: this is not validated after the v1 prefix change
16
16
  # need additional validation
17
- tools_path = "v1/orchestrate/tools/"
17
+ tools_path = "v1/orchestrate/tools"
18
18
  agents_path = "v1/orchestrate/agents"
19
19
  else:
20
20
  tools_path = "v1/tools/"
@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
10
10
 
11
11
  AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
12
12
  AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
13
- WO_INSTANCE = os.environ.get("WO_INSTANCE")
14
- WO_API_KEY = os.environ.get("WO_API_KEY")
15
13
  DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
16
14
 
17
15
 
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
19
17
  def __init__(
20
18
  self,
21
19
  model_id=None,
22
- api_key=WO_API_KEY,
23
- instance_url=WO_INSTANCE,
20
+ api_key=None,
21
+ instance_url=None,
24
22
  timeout=300,
25
23
  embedding_model_id=None,
26
24
  params=None
27
25
  ):
28
26
  super().__init__()
29
27
 
28
+ instance_url = os.environ.get("WO_INSTANCE", instance_url)
29
+ api_key = os.environ.get("WO_API_KEY", api_key)
30
30
  if not instance_url or not api_key:
31
31
  raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")
32
32
 
@@ -6,6 +6,7 @@ import importlib.util
6
6
  import re
7
7
  from jsonargparse import CLI
8
8
  import os
9
+ import sys
9
10
  import textwrap
10
11
  from dataclasses import is_dataclass, asdict
11
12
 
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
83
84
  module_name = file_path.stem
84
85
  spec = importlib.util.spec_from_file_location(module_name, file_path)
85
86
  module = importlib.util.module_from_spec(spec)
86
- spec.loader.exec_module(module)
87
-
87
+ parent_dir = str(file_path.parent)
88
+ sys_path_modified = False
89
+ if parent_dir not in sys.path:
90
+ sys.path.append(parent_dir)
91
+ sys_path_modified = True
92
+ try:
93
+ spec.loader.exec_module(module)
94
+ finally:
95
+ if sys_path_modified:
96
+ sys.path.pop()
88
97
  # Add all module's non-private functions to tools_dict
89
98
  for attr_name in dir(module):
90
99
  attr = getattr(module, attr_name)