ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py}

@@ -1,20 +1,15 @@
 import json
 import os
 import time
-from
-from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Generator, List, Mapping
 
 import requests
 import rich
-import urllib3
 import yaml
-from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning
 
-from wxo_agentic_evaluation.
-
-
+from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
+    RuntimeAdapter,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -27,41 +22,10 @@ from wxo_agentic_evaluation.type import (
     ConversationalSearchResults,
     ConversationSearchMetadata,
     Message,
+    RuntimeResponse,
 )
-from wxo_agentic_evaluation.utils.utils import
-
-    is_saas_url,
-    safe_divide,
-)
-
-tokenizer = Tokenizer()
-
-
-class Roles(Enum):
-    ASSISTANT = "assistant"
-    USER = "user"
-
-
-def calculate_word_overlap_similarity_score(
-    first_message_text: str, second_message_text: str
-) -> float:
-    """Calculate the word overlap similarity score between the .content field of two Message objects.
-    Args:
-        first_message_text (str): The .content field of the first message.
-        second_message_text (str): The .content field of the second message.
-    """
-
-    words_in_first_message = tokenizer(first_message_text)
-    words_in_second_message = tokenizer(second_message_text)
-
-    # Calculate the number of common words
-    common_words = set(words_in_first_message) & set(words_in_second_message)
-    unique_words = set(words_in_first_message + words_in_second_message)
-
-    unique_words_count = len(unique_words)
-    common_words_count = len(common_words)
-
-    return safe_divide(common_words_count, unique_words_count)
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 
 def is_transfer_response(step_detail: Dict):
@@ -73,62 +37,12 @@ def is_transfer_response(step_detail: Dict):
     return False
 
 
-class
-    tool_call: List = []
-    tool_response: List = []
-    generic: List = []
-
-
-class WXOClient:
-    def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
-        self.service_url = service_url
-        self.api_key = api_key
-
-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
-            self._verify_ssl = False if (
-                (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
-                (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
-            ) else (v if isinstance(v, bool) else True)
-
-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )
-
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
-
-
-class WXOInferenceBackend:
+class WXORuntimeAdapter(RuntimeAdapter):
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
         self.enable_saas_mode = is_saas_url(wxo_client.service_url)
 
-    def
+    def _runs_endpoint(self, user_input: Message, agent_name, thread_id=None):
         agent_id = self.get_agent_id(agent_name)
         payload = {"message": user_input.model_dump(), "agent_id": agent_id}
         if thread_id:
@@ -244,20 +158,21 @@ class WXOInferenceBackend:
 
         return conversational_search
 
-    def
+    def run(
         self,
         user_input: Message,
-
-        call_tracker: CallTracker,
+        context: dict,
         thread_id=None,
-    ) ->
+    ) -> RuntimeResponse:
+
+        agent_name = context["agent_name"]
+        call_tracker = context["call_tracker"]
         recover = False
         messages = list()
         conversational_search_data = []
 
         start_time = time.time()
         for chunk in self._stream_events(user_input, agent_name, thread_id):
-
             event = chunk.get("event", "")
             if _thread_id := chunk.get("data", {}).get("thread_id"):
                 thread_id = _thread_id
@@ -435,7 +350,11 @@
                 f"Recovered {len(messages)} messages from thread_id {thread_id}",
             )
 
-        return
+        return RuntimeResponse(
+            messages=messages,
+            thread_id=thread_id,
+            context={"conversational_search_data": conversational_search_data},
+        )
 
     def _parse_events(
         self, stream: Generator[bytes, None, None]
@@ -468,7 +387,6 @@
 
         messages = []
         for entry in result:
-
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:
@@ -596,194 +514,6 @@
         return None
 
 
-class EvaluationController:
-
-    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
-    MESSAGE_SIMILARITY_THRESHOLD = float(
-        os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
-    ) # if any two consecutive messages are >98% similar, the inference loop will be terminated
-    MAX_REPEATING_MESSAGES = int(
-        os.getenv("MAX_REPEATING_MESSAGES", 3)
-    ) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
-
-    def __init__(
-        self,
-        wxo_inference_backend: WXOInferenceBackend,
-        llm_user: LLMUser,
-        config: TestConfig,
-    ):
-        self.wxo_inference_backend = wxo_inference_backend
-        self.llm_user = llm_user
-        self.config = config
-        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
-
-        if self.repeating_output_detection:
-            # Use deque for efficient O(1) operations
-            self.recent_user_messages = deque(
-                maxlen=self.MAX_REPEATING_MESSAGES
-            )
-            self.recent_assistant_messages = deque(
-                maxlen=self.MAX_REPEATING_MESSAGES
-            )
-
-    def run(
-        self,
-        task_n,
-        story,
-        agent_name: str,
-        starting_user_input: str = None,
-        attack_instructions: str = None,
-    ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
-        step = 0
-        thread_id = None
-        conversation_history: List[Message] = []
-        conversational_search_history_data = []
-        call_tracker = CallTracker()
-
-        # make this configurable
-        while step < self.MAX_CONVERSATION_STEPS:
-            if step == 0 and starting_user_input:
-                user_input = Message(
-                    role="user",
-                    content=starting_user_input,
-                    type=ContentType.text,
-                )
-            else:
-                if self.config.enable_manual_user_input == True:
-                    content = input(
-                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
-                    )
-                    user_input = Message(
-                        role="user", content=content, type=ContentType.text
-                    )
-                else: # llm
-                    user_input = self.llm_user.generate_user_input(
-                        story,
-                        conversation_history,
-                        attack_instructions=attack_instructions,
-                    )
-            if self.config.enable_verbose_logging:
-                rich.print(
-                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
-                    user_input.content,
-                )
-
-            if self._is_end(user_input):
-                break
-
-            if self.repeating_output_detection:
-                self.recent_user_messages.append(user_input.content)
-
-            conversation_history.append(user_input)
-
-            (
-                messages,
-                thread_id,
-                conversational_search_data,
-            ) = self.wxo_inference_backend.stream_messages(
-                user_input,
-                agent_name=agent_name,
-                thread_id=thread_id,
-                call_tracker=call_tracker,
-            )
-            if not messages:
-                raise RuntimeError(
-                    f"[Task-{task_n}] No messages is produced. Exiting task."
-                )
-
-            for message in messages:
-                if self.repeating_output_detection:
-                    if (
-                        message.role == Roles.ASSISTANT
-                        and message.type == ContentType.text
-                    ):
-                        self.recent_assistant_messages.append(message.content)
-
-                if self.config.enable_verbose_logging:
-                    rich.print(
-                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
-                        message.content,
-                    )
-
-            conversation_history.extend(messages)
-            conversational_search_history_data.extend(
-                conversational_search_data
-            )
-
-            step += 1
-        return (
-            conversation_history,
-            call_tracker,
-            conversational_search_history_data,
-        )
-
-    def _is_looping(self, messages: deque) -> bool:
-        """Checks whether the user or assistant is stuck in a loop.
-        Args:
-            messages (deque): Defines the message cache to be assessed for similarity.
-        Returns:
-            bool: True if stuck in a loop, False otherwise.
-        """
-        sim_count = 0
-
-        if len(messages) >= self.MAX_REPEATING_MESSAGES:
-            oldest_cached_message = messages[0]
-            for i, old_message in enumerate(messages):
-                if i == 0:
-                    continue
-                if oldest_cached_message == old_message:
-                    sim_count += 1
-                elif (
-                    calculate_word_overlap_similarity_score(
-                        oldest_cached_message, old_message
-                    )
-                    > self.MESSAGE_SIMILARITY_THRESHOLD
-                ):
-                    sim_count += 1
-
-        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
-
-    def _is_end(self, current_user_input: Message) -> bool:
-        """
-        Check if the user input indicates the end of the conversation.
-
-        - This function checks if the user input contains 'END'.
-        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
-        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
-        Args:
-            current_user_input (Message): The user message.
-        Returns:
-            bool: True if the user input indicates an END, False otherwise.
-        """
-        current_user_message_content = current_user_input.content.strip()
-
-        # Check if the user message contains 'END'
-        if "END" in current_user_message_content:
-            return True
-
-        if self.repeating_output_detection:
-            # Check for repeating user or assistant messages
-            if self._is_looping(self.recent_user_messages) or self._is_looping(
-                self.recent_assistant_messages
-            ):
-                return True
-
-        return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
-
-
-def get_wxo_client(
-    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
-) -> WXOClient:
-
-    token, resolved_url, env = tenant_setup(service_url, tenant_name)
-    service_url = service_url or resolved_url
-
-    if not (service_url and str(service_url).strip()):
-        raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
-
-    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
-    return wxo_client
-
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
     auth_config_path = (
@@ -791,13 +521,14 @@ if __name__ == "__main__":
     )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
+
    tenant_name = "local"
    token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
 
    wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
-    inference_backend =
-    resp = wxo_client.get("orchestrate/agents")
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+    resp = wxo_client.get("v1/orchestrate/agents")
    resp = resp.json()
-
+
    for agent in resp:
        print(agent["name"], agent["display_name"])
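The hunks above replace the old `WXOInferenceBackend.stream_messages(...)` tuple interface with a `RuntimeAdapter.run()` call that takes a `context` dict and returns a `RuntimeResponse`. The sketch below is illustrative only and not taken from the package: the import paths follow the module names visible in this diff, while the location of `ContentType`, the agent name, and the `SimpleNamespace` call tracker are assumptions standing in for the framework's own objects.

```python
from types import SimpleNamespace

from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import WXORuntimeAdapter
from wxo_agentic_evaluation.type import ContentType, Message  # ContentType path is an assumption
from wxo_agentic_evaluation.wxo_client import WXOClient

wxo_client = WXOClient(service_url="http://localhost:4321", api_key="<token>")
adapter = WXORuntimeAdapter(wxo_client=wxo_client)

# Stand-in for the framework's call tracker; the real object is whatever the
# evaluation controller passes in via context["call_tracker"].
call_tracker = SimpleNamespace(tool_call=[], tool_response=[], generic=[])

user_input = Message(role="user", content="What is my vacation balance?", type=ContentType.text)

# run() now receives agent_name and call_tracker through a context dict and
# returns a RuntimeResponse instead of a (messages, thread_id, search_data) tuple.
response = adapter.run(
    user_input,
    context={"agent_name": "hr_agent", "call_tracker": call_tracker},
    thread_id=None,
)
for message in response.messages:
    print(message.role, message.content)
```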
wxo_agentic_evaluation/scheduler.py

@@ -0,0 +1,247 @@
+import glob
+import os
+import re
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from enum import unique
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Set, Tuple
+
+from rich import print as rich_print
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.clients import Clients
+from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
+
+
+def discover_tests(
+    test_paths: List[str], recursive_search: bool = False
+) -> List[str]:
+    """
+    Discover test cases from the given test paths.
+
+    This function searches for JSON test case files in the provided paths.
+    When recursive_search is enabled, it will search through all subdirectories
+    recursively. Otherwise, it will only search the top level of each directory.
+
+    Args:
+        test_paths: List of paths to search for test cases
+        recursive_search: Whether to search recursively in subdirectories
+
+    Returns:
+        List of unique test case names
+    """
+    test_cases = []
+    for test_path in test_paths:
+        # Check if the path exists
+        if not glob.glob(test_path):
+            rich_print(
+                f"[bold yellow]Warning: Path '{test_path}' does not exist. Skipping.[/bold yellow]"
+            )
+            continue
+
+        if os.path.isdir(test_path):
+            if recursive_search:
+                # Use ** pattern for recursive search
+                pattern = os.path.join(test_path, "**", "*.json")
+                found_files = sorted(glob.glob(pattern, recursive=True))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (recursive search)"
+                )
+                test_cases.extend(found_files)
+            else:
+                # Original behavior for non-recursive search
+                pattern = os.path.join(test_path, "*.json")
+                found_files = sorted(glob.glob(pattern))
+                rich_print(
+                    f"Found {len(found_files)} files in '{test_path}' (non-recursive)"
+                )
+                test_cases.extend(found_files)
+        else:
+            # If it's a file pattern, just use it directly
+            found_files = sorted(glob.glob(test_path))
+            test_cases.extend(found_files)
+
+    # Filter out non-JSON files and agent.json files
+    filtered_cases = [
+        tc
+        for tc in test_cases
+        if tc.endswith(".json") and not tc.endswith("agent.json")
+    ]
+
+    # create mapping of test case name to file path
+    unique_files_map: dict[str, str] = {}
+
+    for f in filtered_cases:
+        name = Path(f).stem
+        if name not in unique_files_map:
+            unique_files_map[name] = f
+        else:
+            rich_print(
+                f"[bold red]Duplicate test case name detected:[/bold red] "
+                f"'{name}' (skipping file '{f}')"
+            )
+
+    unique_files = list(unique_files_map.values())
+    rich_print(
+        f"[bold green]Discovered {len(unique_files)} test cases in total[/bold green]"
+    )
+    return unique_files
+
+
+def _removesuffix(s: str, suf: str) -> str:
+    """Remove suffix from string (for Python < 3.9 compatibility)"""
+    return s[: -len(suf)] if s.endswith(suf) else s
+
+
+def get_available_runs(output_dir: str) -> Dict[str, Set[int]]:
+    """
+    Get available runs from the output directory.
+
+    Args:
+        output_dir: Output directory path
+
+    Returns:
+        Dictionary mapping test case stems to sets of run numbers
+    """
+    available_runs = defaultdict(set)
+    for f in glob.glob(os.path.join(output_dir, "messages", "*.messages.json")):
+        # strip the fixed tail
+        name = _removesuffix(os.path.basename(f), ".messages.json")
+        # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+        m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+        if not m:
+            continue
+        stem = m.group("stem")
+        run_num = int(m.group("run") or 1) # no suffix ⇒ run 1
+        available_runs[stem].add(run_num)
+
+    return available_runs
+
+
+def enumerate_jobs(
+    test_cases: List[str],
+    n_runs: int,
+    skip_available_results: bool,
+    output_dir: str,
+) -> List[Tuple[int, str, int]]:
+    """
+    Enumerate jobs to be run.
+
+    Args:
+        test_cases: List of test case file paths
+        n_runs: Number of runs per test case
+        skip_available_results: Whether to skip available results
+        output_dir: Output directory path
+
+    Returns:
+        List of tuples (task_n, test_case, run_idx)
+    """
+    jobs = []
+    task_n = 0
+
+    available_runs = (
+        get_available_runs(output_dir) if skip_available_results else {}
+    )
+
+    for test_case in test_cases:
+        stem = Path(test_case).stem
+
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+
+            # Skip precisely this (test, run) if results exist
+            if skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
+                print(
+                    f"Skipping {stem} run {run_number} as results already exist."
+                )
+                continue
+
+            jobs.append((task_n, test_case, run_idx))
+            task_n += 1
+
+    return jobs
+
+
+def run_jobs(
+    jobs: List[Tuple[int, str, int]],
+    config: TestConfig,
+    clients: Clients,
+    process_func: Callable,
+    num_workers: int,
+) -> List[Any]:
+    """
+    Run jobs using ThreadPoolExecutor.
+
+    Args:
+        jobs: List of jobs to run
+        config: Test configuration
+        clients: Tuple of clients (wxo_client, llmaaj_provider, resource_map, inference_backend, llm_user)
+        process_func: Function to process each job
+        num_workers: Number of worker threads
+
+    Returns:
+        List of results from all jobs
+    """
+
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich_print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False # disable manual user input for parallel execution
+        )
+
+    executor = ThreadPoolExecutor(max_workers=num_workers)
+    futures = []
+
+    for task_n, test_case, run_idx in jobs:
+        future = executor.submit(
+            process_func,
+            task_n,
+            test_case,
+            config,
+            clients.inference_backend,
+            clients.resource_map,
+            clients.llm_user,
+            clients.llmaaj_provider,
+            run_idx,
+        )
+        futures.append(((test_case, run_idx), future))
+
+    results = []
+
+    if futures:
+        if LOGGING_ENABLED:
+            # No progress bar when logging - just process tasks
+            for (test_case, run_idx), future in futures:
+                try:
+                    results.extend(future.result())
+                except Exception as e:
+                    import traceback
+
+                    rich_print(f"test case {test_case} fails with {e}")
+
+                    traceback.print_exc()
+        else:
+            with Progress() as progress:
+                task1 = progress.add_task(
+                    f"[purple]Evaluating {len(futures)} tasks...",
+                    total=len(futures),
+                )
+                for (test_case, run_idx), future in futures:
+                    try:
+                        results.extend(future.result())
+                    except Exception as e:
+                        import traceback
+
+                        rich_print(f"test case {test_case} fails with {e}")
+
+                        traceback.print_exc()
+                    finally:
+                        progress.update(task1, advance=1)
+
+    return results
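The run-discovery helper above keys multi-run outputs off the message-file name: after stripping the `.messages.json` tail, a bare `<stem>` counts as run 1 and `<stem>.runN` as run N. Below is a self-contained sketch of just that naming rule; the `parse_run` helper and the sample names are hypothetical, but the regular expression is the one used by `get_available_runs`.

```python
import re

# Same pattern get_available_runs applies after stripping the ".messages.json" tail:
# a bare stem means run 1, "<stem>.runN" means run N.
RUN_PATTERN = re.compile(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$")

def parse_run(name: str) -> tuple:
    # Hypothetical helper for illustration; not part of the package.
    m = RUN_PATTERN.match(name)
    return m.group("stem"), int(m.group("run") or 1)

print(parse_run("hr_vacation_flow"))       # ('hr_vacation_flow', 1)
print(parse_run("hr_vacation_flow.run3"))  # ('hr_vacation_flow', 3)
```

`enumerate_jobs` compares these parsed run numbers against the requested `n_runs`, so with `skip_available_results` enabled only the (test case, run) pairs that have no recorded messages are re-executed.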