ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.0.7
3
+ Version: 1.0.8
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -4,12 +4,12 @@ wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89
4
4
  wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
5
5
  wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
6
  wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
- wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
8
- wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
7
+ wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
8
+ wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
9
9
  wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
10
10
  wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
11
11
  wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
12
- wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
12
+ wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
13
13
  wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
14
14
  wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
15
15
  wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
@@ -20,9 +20,9 @@ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohX
20
20
  wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
21
21
  wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
22
22
  wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
23
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=LY3gMNzfIEwjpQkx5_2iZFHGQiUL4ymEkKL1dc2uKq4,1491
23
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
24
24
  wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
25
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=bCXUsW0OeUzwfSSYObgfAmEU5vARkD-PblYU-mU9aPY,2507
25
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
26
26
  wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
27
27
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
@@ -50,7 +50,7 @@ wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId
50
50
  wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
51
51
  wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
52
52
  wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
53
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/METADATA,sha256=wz60je0UK3ogKLH9qiDLS808j57cfWOosONyCuQR95g,18051
54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD,,
53
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
54
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
@@ -218,7 +218,7 @@ class EvaluationPackage:
218
218
  tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
219
219
  )
220
220
  tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
221
-
221
+ correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
222
222
  for message in self.messages:
223
223
  if message.type == ContentType.tool_call:
224
224
 
@@ -244,6 +244,7 @@ class EvaluationPackage:
244
244
 
245
245
  continue
246
246
 
247
+ # TO-DO: re-think how deduplication works in the context of precision & recall
247
248
  tool_call_and_routing_metrics.total_tool_calls += 1
248
249
 
249
250
  # evaluating more than once is fine
@@ -262,8 +263,8 @@ class EvaluationPackage:
262
263
  if msg_tool_call["args"] == goal_detail.args:
263
264
  labelled_messages.append(goal_detail.name)
264
265
  labelled_messages_without_text_step.append(goal_detail.name)
265
-
266
- tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
266
+ correct_tool_calls.add(goal_detail.name)
267
+ #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
267
268
  found = True
268
269
  message_outcome = ExtendedMessage(message=message)
269
270
  message_outcomes.append(message_outcome)
@@ -308,6 +309,9 @@ class EvaluationPackage:
308
309
  else:
309
310
  message_outcome = ExtendedMessage(message=message)
310
311
  message_outcomes.append(message_outcome)
312
+
313
+ tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
314
+
311
315
  assistant_responses = [
312
316
  message
313
317
  for message in self.messages
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
23
23
  "decoding_method": "greedy",
24
24
  "max_new_tokens": 4096,
25
25
  }
26
- wai_client = get_provider(config=ProviderConfig(), params=llm_decode_parameter)
26
+ wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
27
27
  prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
28
28
  res = wai_client.query(prompt)
29
29
  res = res.strip()
@@ -3,7 +3,7 @@ from rich.console import Console
3
3
 
4
4
  from wxo_agentic_evaluation.external_agent import generate_starting_sentence
5
5
  from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
6
- from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
6
+ from wxo_agentic_evaluation.service_provider import get_provider
7
7
  from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
8
8
 
9
9
  class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
19
19
 
20
20
  kw_gen_config = KeywordsGenerationConfig()
21
21
 
22
- provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
23
22
  llm_decode_parameter = {
24
23
  "min_new_tokens": 0,
25
24
  "decoding_method": "greedy",
26
25
  "max_new_tokens": 256,
27
26
  }
28
- wai_client = get_provider(config=provider_config, params=llm_decode_parameter)
27
+ wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
29
28
 
30
29
  self.kw_gen = KeywordsGenerationLLM(
31
30
  provider=wai_client,
@@ -6,6 +6,8 @@ import rich
6
6
  import time
7
7
  from pydantic import BaseModel
8
8
  from typing import List, Generator, Dict, Tuple, Mapping, Any
9
+ from enum import Enum
10
+ from collections import deque
9
11
 
10
12
  from wxo_agentic_evaluation.type import (
11
13
  ContentType,
@@ -23,12 +25,27 @@ from wxo_agentic_evaluation.arg_configs import TestConfig
23
25
  from wxo_agentic_evaluation.service_instance import tenant_setup
24
26
  from wxo_agentic_evaluation.utils.utils import is_saas_url
25
27
 
28
+ class Roles(Enum):
29
+ ASSISTANT = "assistant"
30
+ USER = "user"
26
31
 
27
- def is_end(user_input: Message):
28
- if "END" in user_input.content.strip():
29
- return True
30
- return False
32
+ def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
33
+ """Calculate the word overlap similarity score between the .content field of two Message objects.
34
+ Args:
35
+ first_message_text (str): The .content field of the first message.
36
+ second_message_text (str): The .content field of the second message.
37
+ """
38
+ words_in_first_message = first_message_text.lower().split()
39
+ words_in_second_message = second_message_text.lower().split()
31
40
 
41
+ # Calculate the number of common words
42
+ common_words = set(words_in_first_message) & set(words_in_second_message)
43
+ unique_words = set(words_in_first_message + words_in_second_message)
44
+ unique_words_count = len(unique_words)
45
+
46
+ if unique_words_count == 0:
47
+ return 0.0
48
+ return len(common_words) / unique_words_count
32
49
 
33
50
  def is_transfer_response(step_detail: Dict):
34
51
  # this is not very reliable
@@ -504,6 +521,11 @@ class WXOInferenceBackend:
504
521
 
505
522
 
506
523
  class EvaluationController:
524
+
525
+ MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
526
+ MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # word-overlap score above which a cached message counts as a repeat of the oldest cached message; identical messages always count as repeats
527
+ MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
528
+
507
529
  def __init__(
508
530
  self,
509
531
  wxo_inference_backend: WXOInferenceBackend,
@@ -513,6 +535,12 @@ class EvaluationController:
513
535
  self.wxo_inference_backend = wxo_inference_backend
514
536
  self.llm_user = llm_user
515
537
  self.config = config
538
+ self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
539
+
540
+ if self.repeating_output_detection:
541
+ # Use deque for efficient O(1) operations
542
+ self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
543
+ self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
516
544
 
517
545
  def run(
518
546
  self, task_n, story, agent_name: str, starting_user_input: str = None
@@ -522,9 +550,9 @@ class EvaluationController:
522
550
  conversation_history: List[Message] = []
523
551
  conversational_search_history_data = []
524
552
  call_tracker = CallTracker()
525
- # make this configurable
526
- while step < 20:
527
553
 
554
+ # the step limit is configurable via the MAX_CONVERSATION_STEPS environment variable
555
+ while step < self.MAX_CONVERSATION_STEPS:
528
556
  if step == 0 and starting_user_input:
529
557
  user_input = Message(
530
558
  role="user", content=starting_user_input, type=ContentType.text
@@ -546,9 +574,15 @@ class EvaluationController:
546
574
  f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
547
575
  user_input.content,
548
576
  )
549
- if is_end(user_input):
577
+
578
+ if self._is_end(user_input):
550
579
  break
580
+
581
+ if self.repeating_output_detection:
582
+ self.recent_user_messages.append(user_input.content)
583
+
551
584
  conversation_history.append(user_input)
585
+
552
586
  messages, thread_id, conversational_search_data = (
553
587
  self.wxo_inference_backend.stream_messages(
554
588
  user_input,
@@ -559,16 +593,70 @@ class EvaluationController:
559
593
  )
560
594
  if not messages:
561
595
  raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
562
- if self.config.enable_verbose_logging:
563
- for message in messages:
564
- rich.print(
565
- f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
566
- message.content,
567
- )
596
+
597
+ for message in messages:
598
+ if self.repeating_output_detection:
599
+ if message.role == Roles.ASSISTANT and message.type == ContentType.text:
600
+ self.recent_assistant_messages.append(message.content)
601
+
602
+ if self.config.enable_verbose_logging:
603
+ rich.print(
604
+ f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
605
+ message.content,
606
+ )
607
+
568
608
  conversation_history.extend(messages)
569
609
  conversational_search_history_data.extend(conversational_search_data)
610
+
570
611
  step += 1
571
612
  return conversation_history, call_tracker, conversational_search_history_data
613
+
614
+ def _is_looping(self, messages: deque) -> bool:
615
+ """Checks whether the user or assistant is stuck in a loop.
616
+ Args:
617
+ messages (deque): Defines the message cache to be assessed for similarity.
618
+ Returns:
619
+ bool: True if stuck in a loop, False otherwise.
620
+ """
621
+ sim_count = 0
622
+
623
+ if len(messages) >= self.MAX_REPEATING_MESSAGES:
624
+ oldest_cached_message = messages[0]
625
+ for i, old_message in enumerate(messages):
626
+ if i == 0:
627
+ continue
628
+ if oldest_cached_message == old_message:
629
+ sim_count += 1
630
+ elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
631
+ sim_count += 1
632
+
633
+ return sim_count >= self.MAX_REPEATING_MESSAGES - 1
634
+
635
+ def _is_end(self, current_user_input: Message) -> bool:
636
+ """
637
+ Check if the user input indicates the end of the conversation.
638
+
639
+ - This function checks if the user input contains 'END'.
640
+ - An END is also triggered when the message cache(s) is filled with messages that are too similar.
641
+ - The similarity check runs ONLY if self.repeating_output_detection is True (i.e. MAX_REPEATING_MESSAGES >= 2)
642
+ Args:
643
+ current_user_input (Message): The user message.
644
+ Returns:
645
+ bool: True if the user input indicates an END, False otherwise.
646
+ """
647
+ current_user_message_content = current_user_input.content.strip()
648
+
649
+ # Check if the user message contains 'END'
650
+ if "END" in current_user_message_content:
651
+ return True
652
+
653
+ if self.repeating_output_detection:
654
+ # Check for repeating user or assistant messages
655
+ if (self._is_looping(self.recent_user_messages) or
656
+ self._is_looping(self.recent_assistant_messages)):
657
+ return True
658
+
659
+ return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
572
660
 
573
661
  def get_wxo_client(
574
662
  service_url: str, tenant_name: str, token: str = None
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
107
107
 
108
108
  def main(config: TestConfig):
109
109
  executor = ThreadPoolExecutor(max_workers=config.num_workers)
110
+ if config.num_workers > 1 and config.enable_manual_user_input:
111
+ rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
112
+ config.enable_manual_user_input = False # disable manual user input for parallel execution
113
+ # reason: threads continue to stream messages while waiting for user input, which is not desired
114
+ # and the manual input prompt is not labelled properly in the UI
110
115
  wxo_client = get_wxo_client(
111
116
  config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
112
117
  )