ibm-watsonx-orchestrate-evaluation-framework 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/METADATA +12 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD +15 -15
- wxo_agentic_evaluation/arg_configs.py +1 -0
- wxo_agentic_evaluation/data_annotator.py +7 -4
- wxo_agentic_evaluation/evaluation_package.py +7 -3
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
- wxo_agentic_evaluation/inference_backend.py +101 -13
- wxo_agentic_evaluation/main.py +5 -0
- wxo_agentic_evaluation/record_chat.py +49 -33
- wxo_agentic_evaluation/resource_map.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +11 -2
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/top_level.txt +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.0.6
+Version: 1.0.8
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -53,6 +53,17 @@ Run the following command to install evaluation framework in the same env:
 pip install -e .
 ```
 
+## contribution guide
+### secret resolution
+install detect secret utilities:
+```
+pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
+```
+run the scan & resolve detections:
+```
+detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
+```
+
 
 ## quick experiment against the default wxo-dev env
 ```bash
{ibm_watsonx_orchestrate_evaluation_framework-1.0.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info}/RECORD
CHANGED
@@ -1,28 +1,28 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
 wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
-wxo_agentic_evaluation/arg_configs.py,sha256=
+wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
 wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
-wxo_agentic_evaluation/data_annotator.py,sha256=
-wxo_agentic_evaluation/evaluation_package.py,sha256=
-wxo_agentic_evaluation/inference_backend.py,sha256=
+wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
+wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
+wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
 wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
 wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
-wxo_agentic_evaluation/main.py,sha256=
-wxo_agentic_evaluation/record_chat.py,sha256=
-wxo_agentic_evaluation/resource_map.py,sha256
+wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
+wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
+wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
 wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
-wxo_agentic_evaluation/tool_planner.py,sha256=
+wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
 wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
 wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
-wxo_agentic_evaluation/external_agent/__init__.py,sha256=
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
 wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
-wxo_agentic_evaluation/external_agent/performance_test.py,sha256=
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
 wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
@@ -44,13 +44,13 @@ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TC
 wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
 wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
 wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
 wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
 wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
 wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
 wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
-ibm_watsonx_orchestrate_evaluation_framework-1.0.
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
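The RECORD entries above follow the standard wheel format `path,sha256=<digest>,<size>`, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with padding stripped. A minimal sketch of recomputing such an entry (the file path in the usage comment is only an example):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a wheel RECORD line: path,sha256=<urlsafe-b64 digest>,<size>."""
    data = Path(path).read_bytes()
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

# Example (hypothetical path):
# print(record_entry("wxo_agentic_evaluation/arg_configs.py"))
```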
wxo_agentic_evaluation/data_annotator.py
CHANGED
@@ -247,11 +247,14 @@ class DataAnnotator:
                 }
                 goal_details.append(summarize_step)
                 break
-
-        if
-            goals[
-
+
+        if previous is None:
+            goals["summarize"] = []
+        elif summarize_step is None:
             goals[previous] = []
+        else:
+            goals[previous] = ["summarize"]
+
 
     def generate(self) -> Dict:
         """Generate the final dataset"""
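Read as a sketch, the new branching wires a "summarize" node into the goals mapping depending on whether a previous step and a summarize step exist. The standalone illustration below mirrors the three branches; the dictionary shape and example step names are assumptions, not taken from the package:

```python
from typing import Dict, List, Optional

def wire_summarize(goals: Dict[str, List[str]],
                   previous: Optional[str],
                   summarize_step: Optional[dict]) -> None:
    """Sketch of the branching added in DataAnnotator (not the package's API)."""
    if previous is None:
        # no earlier step: "summarize" becomes a terminal node
        goals["summarize"] = []
    elif summarize_step is None:
        # no summarize step was produced: the previous step is terminal
        goals[previous] = []
    else:
        # otherwise the previous step is followed by "summarize"
        goals[previous] = ["summarize"]

goals: Dict[str, List[str]] = {}
wire_summarize(goals, previous="lookup_order", summarize_step={"name": "summarize"})
print(goals)  # {'lookup_order': ['summarize']}
```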
wxo_agentic_evaluation/evaluation_package.py
CHANGED
@@ -218,7 +218,7 @@ class EvaluationPackage:
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+        correct_tool_calls = set()  # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:
 
@@ -244,6 +244,7 @@ class EvaluationPackage:
 
                 continue
 
+            # TO-DO: re-think how deduplication works in the context of precision & recall
             tool_call_and_routing_metrics.total_tool_calls += 1
 
             # evaluating more than once is fine
@@ -262,8 +263,8 @@ class EvaluationPackage:
                     if msg_tool_call["args"] == goal_detail.args:
                         labelled_messages.append(goal_detail.name)
                         labelled_messages_without_text_step.append(goal_detail.name)
-
-                        tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                        correct_tool_calls.add(goal_detail.name)
+                        #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                         found = True
                         message_outcome = ExtendedMessage(message=message)
                         message_outcomes.append(message_outcome)
@@ -308,6 +309,9 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+
         assistant_responses = [
             message
             for message in self.messages
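Taken together, these hunks replace the per-match counter with a set keyed by the expected tool name, so a tool that is correctly called several times with the same signature is counted only once against `expected_tool_calls`. A minimal sketch of that effect; the message and goal structures below are simplified placeholders, not the package's real types:

```python
# Simplified stand-ins for the evaluation loop.
expected_goals = [{"name": "get_weather", "args": {"city": "Austin"}}]
observed_tool_calls = [
    {"name": "get_weather", "args": {"city": "Austin"}},
    {"name": "get_weather", "args": {"city": "Austin"}},  # same signature, called twice
]

correct_tool_calls = set()
for call in observed_tool_calls:
    for goal in expected_goals:
        if call["name"] == goal["name"] and call["args"] == goal["args"]:
            correct_tool_calls.add(goal["name"])

# A counter would report 2 here; the set reports 1, matching the single expected call.
print(len(correct_tool_calls))  # 1
```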
wxo_agentic_evaluation/external_agent/__init__.py
CHANGED
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(
+    wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
wxo_agentic_evaluation/external_agent/performance_test.py
CHANGED
@@ -3,7 +3,7 @@ from rich.console import Console
 
 from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
 
 class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
             "min_new_tokens": 0,
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         }
-        wai_client = get_provider(
+        wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
 
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
wxo_agentic_evaluation/inference_backend.py
CHANGED
@@ -6,6 +6,8 @@ import rich
 import time
 from pydantic import BaseModel
 from typing import List, Generator, Dict, Tuple, Mapping, Any
+from enum import Enum
+from collections import deque
 
 from wxo_agentic_evaluation.type import (
     ContentType,
@@ -23,12 +25,27 @@ from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
 
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
 
-def
-
-
-
+def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+    words_in_first_message = first_message_text.lower().split()
+    words_in_second_message = second_message_text.lower().split()
 
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+    unique_words_count = len(unique_words)
+
+    if unique_words_count == 0:
+        return 0.0
+    return len(common_words) / unique_words_count
 
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
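The added helper is a Jaccard-style ratio over lowercased, whitespace-split tokens: shared words divided by the union of words. A quick standalone sketch of the same calculation with example inputs (the sentences are invented):

```python
def word_overlap(a: str, b: str) -> float:
    """Same idea as calculate_word_overlap_similarity_score: |A ∩ B| / |A ∪ B| over lowercased words."""
    wa, wb = a.lower().split(), b.lower().split()
    union = set(wa) | set(wb)
    return len(set(wa) & set(wb)) / len(union) if union else 0.0

print(word_overlap("Please confirm the order", "please confirm the order"))          # 1.0
print(word_overlap("I want to check my order status", "check my order status please"))  # 0.5
```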
@@ -504,6 +521,11 @@ class WXOInferenceBackend:
 
 
 class EvaluationController:
+
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98))  # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3))  # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+
     def __init__(
         self,
         wxo_inference_backend: WXOInferenceBackend,
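Because these limits are class attributes built from `os.getenv(...)`, they are resolved when the class body executes, i.e. when the module is imported, so the environment variables have to be set before that import (or the attribute patched afterwards). A small sketch of the same pattern; the class name below is a hypothetical stand-in:

```python
import os

# Must be set before the class body below runs (before importing the defining module).
os.environ["MAX_CONVERSATION_STEPS"] = "30"

class ControllerLimits:  # hypothetical stand-in for the EvaluationController attributes
    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98))
    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3))

print(ControllerLimits.MAX_CONVERSATION_STEPS)  # 30, not the default 20
```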
@@ -513,6 +535,12 @@ class EvaluationController:
         self.wxo_inference_backend = wxo_inference_backend
         self.llm_user = llm_user
         self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
 
     def run(
         self, task_n, story, agent_name: str, starting_user_input: str = None
@@ -522,9 +550,9 @@ class EvaluationController:
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
 
+        # make this configurable
+        while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
                     role="user", content=starting_user_input, type=ContentType.text
@@ -546,9 +574,15 @@ class EvaluationController:
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                     user_input.content,
                 )
-
+
+            if self._is_end(user_input):
                 break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
             conversation_history.append(user_input)
+
             messages, thread_id, conversational_search_data = (
                 self.wxo_inference_backend.stream_messages(
                     user_input,
@@ -559,16 +593,70 @@ class EvaluationController:
             )
             if not messages:
                 raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-
-
-
-
-
-
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
+                    rich.print(
+                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
+                        message.content,
+                    )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(conversational_search_data)
+
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): Defines the message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks if the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if (self._is_looping(self.recent_user_messages) or
+                    self._is_looping(self.recent_assistant_messages)):
+                return True
+
+        return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
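Taken together, the new `_is_looping` / `_is_end` logic keeps the last `MAX_REPEATING_MESSAGES` user and assistant texts in bounded deques and ends the run when the oldest cached message matches, or is near-identical to, all of the newer ones. A self-contained sketch of that termination rule under the same defaults (window of 3, threshold 0.98); the sample messages are invented:

```python
from collections import deque

MAX_REPEATING_MESSAGES = 3
MESSAGE_SIMILARITY_THRESHOLD = 0.98

def word_overlap(a: str, b: str) -> float:
    wa, wb = a.lower().split(), b.lower().split()
    union = set(wa) | set(wb)
    return len(set(wa) & set(wb)) / len(union) if union else 0.0

def is_looping(messages: deque) -> bool:
    """Sketch of the repetition check: compare the oldest cached message against the rest."""
    if len(messages) < MAX_REPEATING_MESSAGES:
        return False
    oldest = messages[0]
    hits = sum(
        1
        for i, msg in enumerate(messages)
        if i > 0 and (msg == oldest or word_overlap(oldest, msg) > MESSAGE_SIMILARITY_THRESHOLD)
    )
    return hits >= MAX_REPEATING_MESSAGES - 1

recent = deque(maxlen=MAX_REPEATING_MESSAGES)
for text in ["I already told you my order id."] * 3:
    recent.append(text)

print(is_looping(recent))  # True -> the conversation would be terminated
```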
wxo_agentic_evaluation/main.py
CHANGED
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
+        config.enable_manual_user_input = False  # disable manual user input for parallel execution
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
wxo_agentic_evaluation/record_chat.py
CHANGED
@@ -43,17 +43,13 @@ def get_all_runs(wxo_client: WXOClient):
     else:
         path = "v1/orchestrate/runs"
 
-    initial_response = wxo_client.get(
-        path, {"limit": limit, "offset": 0}
-    ).json()
+    initial_response = wxo_client.get(path, {"limit": limit, "offset": 0}).json()
     total_runs = initial_response["total"]
     all_runs.extend(initial_response["data"])
 
     while len(all_runs) < total_runs:
         offset += limit
-        response = wxo_client.get(
-            path, {"limit": limit, "offset": offset}
-        ).json()
+        response = wxo_client.get(path, {"limit": limit, "offset": offset}).json()
         all_runs.extend(response["data"])
 
     # Sort runs by completed_at in descending order (most recent first)
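The reflowed calls above page through the runs endpoint with a `limit`/`offset` pair until `total` runs have been collected. A compact sketch of that paging loop against a stand-in client; the client class and payload shape below are illustrative only (the real client returns an HTTP response whose `.json()` is called):

```python
class FakeRunsClient:
    """Stand-in for WXOClient: returns {'total': N, 'data': [...]} pages directly."""
    def __init__(self, runs):
        self._runs = runs

    def get(self, path, params):
        start = params["offset"]
        return {"total": len(self._runs), "data": self._runs[start:start + params["limit"]]}

def fetch_all_runs(client, path="v1/orchestrate/runs", limit=2):
    first = client.get(path, {"limit": limit, "offset": 0})
    total, all_runs, offset = first["total"], list(first["data"]), 0
    while len(all_runs) < total:
        offset += limit
        all_runs.extend(client.get(path, {"limit": limit, "offset": offset})["data"])
    return all_runs

print(fetch_all_runs(FakeRunsClient([{"thread_id": i} for i in range(5)])))
```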
@@ -92,9 +88,10 @@ def annotate_messages(
     annotated_data["agent"] = agent_name
 
     annotated_data["story"] = generate_story(annotated_data)
-
+
     return annotated_data
 
+
 def has_messages_changed(
     thread_id: str,
     messages: List[Message],
@@ -111,32 +108,27 @@ def has_messages_changed(
         return False
 
 
-def
+def _record(config: ChatRecordingConfig, bad_threads: set):
     """Record chats in background mode"""
     start_time = datetime.utcnow()
     processed_threads = set()
     previous_input_hash: dict[str, str] = {}
 
-    rich.print(
-        f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
-    )
     if config.token is None:
         config.token = tenant_setup(config.service_url, config.tenant_name)
     wxo_client = get_wxo_client(config.service_url, config.tenant_name, config.token)
    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-
-
+
+    retry_count = 0
+    while retry_count < config.max_retries:
+        thread_id = None
+        try:
             all_runs = get_all_runs(wxo_client)
             seen_threads = set()
             # Process only new runs that started after our recording began
             for run in all_runs:
                 thread_id = run.get("thread_id")
-
-                agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
-            except Exception as e:
-                rich.print(f"[yellow]WARNING:[/yellow]Failure in getting thread id {thread_id}")
-                continue
-            if thread_id in seen_threads or agent_name is None:
+                if (thread_id in bad_threads) or (thread_id in seen_threads):
                     continue
                 seen_threads.add(thread_id)
                 started_at = run.get("started_at")
@@ -162,11 +154,17 @@ def record_chats(config: ChatRecordingConfig):
             try:
                 messages = inference_backend.get_messages(thread_id)
 
-                if not has_messages_changed(
-
-
-
-
+                if not has_messages_changed(thread_id, messages, previous_input_hash):
+                    continue
+
+                try:
+                    agent_name = inference_backend.get_agent_name_from_thread_id(thread_id)
+                except Exception as e:
+                    rich.print(f"[yellow]WARNING:[/yellow] Failure getting agent name for thread_id {thread_id}: {e}")
+                    raise
+
+                if agent_name is None:
+                    rich.print(f"[yellow]WARNING:[/yellow] No agent name found for thread_id {thread_id}. Skipping ...")
                     continue
 
                 annotated_data = annotate_messages(
@@ -180,19 +178,37 @@ def record_chats(config: ChatRecordingConfig):
                 with open(annotation_filename, "w") as f:
                     json.dump(annotated_data, f, indent=4)
             except Exception as e:
-                rich.print(
-
-                )
+                rich.print(f"[yellow]WARNING:[/yellow] Failed to process thread {thread_id}: {e}")
+                raise
             except (ValueError, TypeError) as e:
-                rich.print(
-
-
+                rich.print(f"[yellow]WARNING:[/yellow] Invalid timestamp for thread {thread_id}: {e}")
+                raise
+
+            retry_count = 0
+            time.sleep(2)
 
-
+        except KeyboardInterrupt:
+            rich.print("\n[yellow]Recording stopped by user[/yellow]")
+            break
 
-
-
+        except Exception as e:
+            if thread_id is None:
+                rich.print(f"[red]ERROR:[/red] {e}")
+                break
 
+            time.sleep(1)
+            retry_count += 1
+            if retry_count >= config.max_retries:
+                rich.print(f"[red]ERROR:[/red] Maximum retries reached. Skipping thread {thread_id}")
+                bad_threads.add(thread_id)
+            _record(config, bad_threads)
+
+def record_chats(config: ChatRecordingConfig):
+    rich.print(
+        f"[green]INFO:[/green] Chat recording started. Press Ctrl+C to stop."
+    )
+    bad_threads = set()
+    _record(config, bad_threads)
 
 if __name__ == "__main__":
     record_chats(CLI(ChatRecordingConfig, as_positional=False))
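The restructured `_record` loop retries transient failures up to `config.max_retries`; once a thread keeps failing it is added to `bad_threads` and recording restarts without it via the recursive `_record(config, bad_threads)` call. The skip-list pattern in isolation looks roughly like the sketch below; the failing-thread simulation is invented for illustration:

```python
def process(thread_id: str) -> None:
    # Invented failure: one thread always raises, the others succeed.
    if thread_id == "thread-2":
        raise RuntimeError("flaky thread")

def record(thread_ids, bad_threads=None, max_retries=3):
    bad_threads = set() if bad_threads is None else bad_threads
    retry_count = 0
    while retry_count < max_retries:
        try:
            for thread_id in thread_ids:
                if thread_id in bad_threads:
                    continue
                process(thread_id)
            return bad_threads  # clean pass over every remaining thread
        except Exception:
            retry_count += 1
            if retry_count >= max_retries:
                bad_threads.add(thread_id)
                # restart the whole loop, now skipping the persistently failing thread
                return record(thread_ids, bad_threads, max_retries)
    return bad_threads

print(record(["thread-1", "thread-2", "thread-3"]))  # {'thread-2'}
```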
wxo_agentic_evaluation/resource_map.py
CHANGED
@@ -14,7 +14,7 @@ class ResourceMap:
         if is_saas_url(self.wxo_client.service_url):
             # TO-DO: this is not validated after the v1 prefix change
             # need additional validation
-            tools_path = "v1/orchestrate/tools
+            tools_path = "v1/orchestrate/tools"
             agents_path = "v1/orchestrate/agents"
         else:
             tools_path = "v1/tools/"
wxo_agentic_evaluation/service_provider/model_proxy_provider.py
CHANGED
@@ -10,8 +10,6 @@ from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
 
 AUTH_ENDPOINT_AWS = "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
 AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
-WO_INSTANCE = os.environ.get("WO_INSTANCE")
-WO_API_KEY = os.environ.get("WO_API_KEY")
 DEFAULT_PARAM = {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
 
 
@@ -19,14 +17,16 @@ class ModelProxyProvider(Provider):
     def __init__(
         self,
         model_id=None,
-        api_key=
-        instance_url=
+        api_key=None,
+        instance_url=None,
         timeout=300,
         embedding_model_id=None,
         params=None
     ):
         super().__init__()
 
+        instance_url = os.environ.get("WO_INSTANCE", instance_url)
+        api_key = os.environ.get("WO_API_KEY", api_key)
         if not instance_url or not api_key:
             raise RuntimeError("instance url and WO apikey must be specified to use WO model proxy")
 
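With this change, `WO_INSTANCE` and `WO_API_KEY` are read inside `__init__`, and the environment values take precedence over the constructor arguments, which act only as fallbacks. A tiny sketch of that precedence using a hypothetical helper:

```python
import os

def resolve_credentials(instance_url=None, api_key=None):
    """Hypothetical helper mirroring the os.environ.get(..., fallback) pattern."""
    instance_url = os.environ.get("WO_INSTANCE", instance_url)
    api_key = os.environ.get("WO_API_KEY", api_key)
    if not instance_url or not api_key:
        raise RuntimeError("instance url and WO apikey must be specified")
    return instance_url, api_key

os.environ["WO_INSTANCE"] = "https://env.example.com"
os.environ["WO_API_KEY"] = "env-key"
# Arguments are ignored when the environment variables are set.
print(resolve_credentials("https://arg.example.com", "arg-key"))  # ('https://env.example.com', 'env-key')
```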
wxo_agentic_evaluation/tool_planner.py
CHANGED
@@ -6,6 +6,7 @@ import importlib.util
 import re
 from jsonargparse import CLI
 import os
+import sys
 import textwrap
 from dataclasses import is_dataclass, asdict
 
@@ -83,8 +84,16 @@ def load_tools_module(tools_path: Path) -> dict:
     module_name = file_path.stem
     spec = importlib.util.spec_from_file_location(module_name, file_path)
     module = importlib.util.module_from_spec(spec)
-
-
+    parent_dir = str(file_path.parent)
+    sys_path_modified = False
+    if parent_dir not in sys.path:
+        sys.path.append(parent_dir)
+        sys_path_modified = True
+    try:
+        spec.loader.exec_module(module)
+    finally:
+        if sys_path_modified:
+            sys.path.pop()
     # Add all module's non-private functions to tools_dict
     for attr_name in dir(module):
         attr = getattr(module, attr_name)
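The added lines temporarily put the tool file's parent directory on `sys.path` while the module is executed, presumably so that imports local to the tools directory resolve, and the entry is removed again afterwards. A standalone sketch of the same pattern; the example file path is hypothetical, and `sys.path.remove` is used here instead of the `pop()` in the diff:

```python
import importlib.util
import sys
from pathlib import Path

def load_module_with_sibling_imports(file_path: Path):
    """Execute a module from a file, temporarily exposing its directory on sys.path."""
    spec = importlib.util.spec_from_file_location(file_path.stem, file_path)
    module = importlib.util.module_from_spec(spec)

    parent_dir = str(file_path.parent)
    added = parent_dir not in sys.path
    if added:
        sys.path.append(parent_dir)
    try:
        spec.loader.exec_module(module)
    finally:
        if added:
            sys.path.remove(parent_dir)
    return module

# Usage (hypothetical path):
# tools = load_module_with_sibling_imports(Path("tools/my_tools.py"))
```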
WHEEL and top_level.txt: files without changes.