ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD +9 -9
- wxo_agentic_evaluation/evaluation_package.py +7 -3
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
- wxo_agentic_evaluation/inference_backend.py +101 -13
- wxo_agentic_evaluation/main.py +5 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/top_level.txt +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD
CHANGED
@@ -4,12 +4,12 @@ wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89
 wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
 wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
 wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
-wxo_agentic_evaluation/evaluation_package.py,sha256=
-wxo_agentic_evaluation/inference_backend.py,sha256=
+wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
+wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
 wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
 wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
-wxo_agentic_evaluation/main.py,sha256=
+wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
 wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
 wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
 wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
@@ -20,9 +20,9 @@ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohX
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
-wxo_agentic_evaluation/external_agent/__init__.py,sha256=
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
 wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
-wxo_agentic_evaluation/external_agent/performance_test.py,sha256=
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
 wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
@@ -50,7 +50,7 @@ wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId
 wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
 wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
 wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
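Each RECORD row has the form "path,sha256=<urlsafe-base64 digest, no padding>,<size in bytes>", per the wheel spec. A minimal sketch of recomputing such a row to check an installed file against this diff (the helper below is illustrative, not part of the package):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a RECORD-style row for a file: path,sha256=<digest>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"

# Expected for 1.0.8:
# wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
print(record_entry("wxo_agentic_evaluation/inference_backend.py"))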
wxo_agentic_evaluation/evaluation_package.py
CHANGED
@@ -218,7 +218,7 @@ class EvaluationPackage:
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+        correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:
 
@@ -244,6 +244,7 @@ class EvaluationPackage:
 
                 continue
 
+            # TO-DO: re-think how deduplication works in the context of precision & recall
             tool_call_and_routing_metrics.total_tool_calls += 1
 
             # evaluating more than once is fine
@@ -262,8 +263,8 @@ class EvaluationPackage:
                         if msg_tool_call["args"] == goal_detail.args:
                             labelled_messages.append(goal_detail.name)
                             labelled_messages_without_text_step.append(goal_detail.name)
-
-                            tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                            correct_tool_calls.add(goal_detail.name)
+                            #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                             found = True
                             message_outcome = ExtendedMessage(message=message)
                             message_outcomes.append(message_outcome)
@@ -308,6 +309,9 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+
         assistant_responses = [
             message
             for message in self.messages
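Net effect of these hunks: correct tool calls are now collected in a set of goal names and counted once per unique name, so a tool matched repeatedly with the same signature no longer inflates the metric. A minimal sketch of the before/after counting behavior (the matched names are hypothetical):

# Hypothetical matched goal names observed while scoring one conversation:
matched = ["get_weather", "get_weather", "book_flight"]

# 1.0.7 behavior (sketch): increment once per match
old_correct = 0
for name in matched:
    old_correct += 1             # -> 3, double-counts the repeated call

# 1.0.8 behavior (sketch): deduplicate by name via a set
new_correct = len(set(matched))  # -> 2, one credit per unique tool call
print(old_correct, new_correct)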
wxo_agentic_evaluation/external_agent/__init__.py
CHANGED
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(
+    wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
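As this hunk and the performance_test.py hunk below show, get_provider now takes the model id and decoding parameters directly instead of a separate ProviderConfig. A sketch of the calling pattern implied by the diff (only the query method is confirmed by the surrounding context; the prompt string is illustrative):

from wxo_agentic_evaluation.service_provider import get_provider

llm_decode_parameter = {
    "decoding_method": "greedy",
    "max_new_tokens": 4096,
}
# Signature as implied by the 1.0.8 diff: model id plus decoding params, no ProviderConfig.
wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
res = wai_client.query("Say hello.")  # returns the model's text completion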
wxo_agentic_evaluation/external_agent/performance_test.py
CHANGED
@@ -3,7 +3,7 @@ from rich.console import Console
 
 from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
 
 class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
            "min_new_tokens": 0,
            "decoding_method": "greedy",
            "max_new_tokens": 256,
        }
-        wai_client = get_provider(
+        wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
 
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
wxo_agentic_evaluation/inference_backend.py
CHANGED
@@ -6,6 +6,8 @@ import rich
 import time
 from pydantic import BaseModel
 from typing import List, Generator, Dict, Tuple, Mapping, Any
+from enum import Enum
+from collections import deque
 
 from wxo_agentic_evaluation.type import (
     ContentType,
@@ -23,12 +25,27 @@ from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
 
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
 
-def
-
-
-
+def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+    words_in_first_message = first_message_text.lower().split()
+    words_in_second_message = second_message_text.lower().split()
 
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+    unique_words_count = len(unique_words)
+
+    if unique_words_count == 0:
+        return 0.0
+    return len(common_words) / unique_words_count
 
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
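The new helper is a word-level Jaccard similarity: the intersection of the two messages' word sets over their union, computed case-insensitively on whitespace tokens. A quick standalone check of the values it produces (this sketch mirrors the function above, it is not the package's code):

def word_overlap(a: str, b: str) -> float:
    """Word-level Jaccard similarity, mirroring calculate_word_overlap_similarity_score."""
    wa, wb = set(a.lower().split()), set(b.lower().split())
    union = wa | wb
    return len(wa & wb) / len(union) if union else 0.0

print(word_overlap("please try again", "Please try again"))  # 1.0, case-insensitive
print(word_overlap("please try again", "please try once"))   # 2/4 = 0.5
print(word_overlap("", ""))                                   # 0.0, empty-input guard

With the default MESSAGE_SIMILARITY_THRESHOLD of 0.98, only near-identical messages exceed the threshold; exact repeats are caught by the equality shortcut in _is_looping below.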
@@ -504,6 +521,11 @@ class WXOInferenceBackend:
 
 
 class EvaluationController:
+
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+
     def __init__(
         self,
         wxo_inference_backend: WXOInferenceBackend,
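These limits are read once via os.getenv when the class body executes, i.e. at module import time, so overrides must already be in the environment before the module is imported. A sketch of tightening the loop guards for a quick smoke run (the values are illustrative):

import os

# Must run before importing wxo_agentic_evaluation.inference_backend,
# because the class attributes are evaluated at import time.
os.environ["MAX_CONVERSATION_STEPS"] = "10"         # cap each task at 10 turns
os.environ["MESSAGE_SIMILARITY_THRESHOLD"] = "0.9"  # terminate on 90%-similar repeats
os.environ["MAX_REPEATING_MESSAGES"] = "2"          # smallest value that keeps detection on

from wxo_agentic_evaluation.inference_backend import EvaluationController
print(EvaluationController.MAX_CONVERSATION_STEPS)  # -> 10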
@@ -513,6 +535,12 @@ class EvaluationController:
         self.wxo_inference_backend = wxo_inference_backend
         self.llm_user = llm_user
         self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
 
     def run(
         self, task_n, story, agent_name: str, starting_user_input: str = None
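A deque with maxlen is what gives the message caches their fixed-window behavior: appending beyond the limit silently evicts the oldest entry, so each cache always holds the last MAX_REPEATING_MESSAGES messages. A standalone illustration:

from collections import deque

recent = deque(maxlen=3)  # mirrors recent_user_messages with MAX_REPEATING_MESSAGES=3
for text in ["hi", "help", "help", "help again", "help"]:
    recent.append(text)   # O(1); the oldest entry is dropped once len == maxlen
print(list(recent))       # -> ['help', 'help again', 'help']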
@@ -522,9 +550,9 @@ class EvaluationController:
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
 
+        # make this configurable
+        while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
                     role="user", content=starting_user_input, type=ContentType.text
@@ -546,9 +574,15 @@ class EvaluationController:
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                     user_input.content,
                 )
-
+
+            if self._is_end(user_input):
                 break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
             conversation_history.append(user_input)
+
             messages, thread_id, conversational_search_data = (
                 self.wxo_inference_backend.stream_messages(
                     user_input,
@@ -559,16 +593,70 @@ class EvaluationController:
             )
             if not messages:
                 raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-
-
-
-
-
-
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
+                    rich.print(
+                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
+                        message.content,
+                    )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(conversational_search_data)
+
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): Defines the message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks if the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if (self._is_looping(self.recent_user_messages) or
+                self._is_looping(self.recent_assistant_messages)):
+                return True
+
+        return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
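Putting the pieces together: _is_looping fires when every message in a full cache matches the oldest one, either exactly or above MESSAGE_SIMILARITY_THRESHOLD. A standalone sketch of that check with the default settings (cache size 3, threshold 0.98; the messages are made up, and the jaccard helper stands in for calculate_word_overlap_similarity_score):

from collections import deque

MAX_REPEATING_MESSAGES = 3
MESSAGE_SIMILARITY_THRESHOLD = 0.98

def jaccard(a: str, b: str) -> float:
    wa, wb = set(a.lower().split()), set(b.lower().split())
    return len(wa & wb) / len(wa | wb) if (wa or wb) else 0.0

def is_looping(messages: deque) -> bool:
    """Mirrors EvaluationController._is_looping: compare every cached message to the oldest."""
    sim_count = 0
    if len(messages) >= MAX_REPEATING_MESSAGES:
        oldest = messages[0]
        for i, msg in enumerate(messages):
            if i == 0:
                continue
            if oldest == msg or jaccard(oldest, msg) > MESSAGE_SIMILARITY_THRESHOLD:
                sim_count += 1
    return sim_count >= MAX_REPEATING_MESSAGES - 1

stuck = deque(["I cannot help with that."] * 3, maxlen=3)
ok = deque(["hello", "what is my balance?", "anything else?"], maxlen=3)
print(is_looping(stuck), is_looping(ok))  # -> True False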
wxo_agentic_evaluation/main.py
CHANGED
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
+        config.enable_manual_user_input = False # disable manual user input for parallel execution
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
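The guard exists because a blocking input() call in one worker thread does not pause the others, so streamed output keeps scrolling past the prompt. A minimal sketch reproducing the interleaving (a hypothetical worker, not framework code):

import threading
import time

def worker(task_n: int):
    # Simulates a task that keeps streaming output while another thread waits on input().
    for step in range(3):
        time.sleep(0.5)
        print(f"[Task-{task_n}] streamed message {step}")

t = threading.Thread(target=worker, args=(2,))
t.start()
reply = input("[Task-1] your reply: ")  # Task-2's prints interleave with this prompt
t.join()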