ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from wxo_agentic_evaluation.type import CallTracker, Message, RuntimeResponse
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RuntimeAdapter:
    """Common interface for adapters that execute one user turn on a runtime.

    NOTE(review): ``run`` is marked ``@abstractmethod``, but this class does
    not use ``ABCMeta`` (it has no ``ABC`` base), so Python does not enforce
    the override. The class can be instantiated, and calling the base ``run``
    simply returns ``None``.
    """

    @abstractmethod
    def run(
        self,
        user_message: Message,
        context: dict,
        thread_id=None,
    ) -> RuntimeResponse:
        """Send ``user_message`` to the runtime and return its response.

        Args:
            user_message: The message to send.
            context: Adapter-specific values needed for the call. The keys
                are defined by each concrete adapter, not by this base class.
            thread_id: Optional identifier of an existing conversation
                thread; defaults to ``None``.

        Returns:
            Concrete adapters are expected to return a ``RuntimeResponse``.
            This base implementation has no behavior and returns ``None``.
        """
        return None
|
|
@@ -1,33 +1,31 @@
|
|
|
1
|
-
import requests
|
|
2
|
-
import os
|
|
3
|
-
import yaml
|
|
4
1
|
import json
|
|
5
|
-
import
|
|
2
|
+
import os
|
|
6
3
|
import time
|
|
7
|
-
from
|
|
8
|
-
|
|
4
|
+
from typing import Any, Dict, Generator, List, Mapping
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
import rich
|
|
8
|
+
import yaml
|
|
9
9
|
|
|
10
|
+
from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
|
|
11
|
+
RuntimeAdapter,
|
|
12
|
+
)
|
|
13
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import (
|
|
14
|
+
WatsonXProvider,
|
|
15
|
+
)
|
|
10
16
|
from wxo_agentic_evaluation.type import (
|
|
11
17
|
ContentType,
|
|
12
|
-
|
|
18
|
+
ConversationalConfidenceThresholdScore,
|
|
13
19
|
ConversationalSearch,
|
|
14
20
|
ConversationalSearchCitations,
|
|
15
21
|
ConversationalSearchResultMetadata,
|
|
16
|
-
ConversationalConfidenceThresholdScore,
|
|
17
22
|
ConversationalSearchResults,
|
|
18
23
|
ConversationSearchMetadata,
|
|
24
|
+
Message,
|
|
25
|
+
RuntimeResponse,
|
|
19
26
|
)
|
|
20
|
-
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
21
|
-
from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
|
|
22
|
-
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
23
|
-
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
24
27
|
from wxo_agentic_evaluation.utils.utils import is_saas_url
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def is_end(user_input: Message):
|
|
28
|
-
if "END" in user_input.content.strip():
|
|
29
|
-
return True
|
|
30
|
-
return False
|
|
28
|
+
from wxo_agentic_evaluation.wxo_client import WXOClient
|
|
31
29
|
|
|
32
30
|
|
|
33
31
|
def is_transfer_response(step_detail: Dict):
|
|
@@ -39,40 +37,12 @@ def is_transfer_response(step_detail: Dict):
|
|
|
39
37
|
return False
|
|
40
38
|
|
|
41
39
|
|
|
42
|
-
class
|
|
43
|
-
tool_call: List = []
|
|
44
|
-
tool_response: List = []
|
|
45
|
-
generic: List = []
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class WXOClient:
|
|
49
|
-
def __init__(self, service_url, api_key):
|
|
50
|
-
self.service_url = service_url
|
|
51
|
-
self.api_key = api_key
|
|
52
|
-
|
|
53
|
-
def _get_headers(self) -> dict:
|
|
54
|
-
headers = {}
|
|
55
|
-
if self.api_key:
|
|
56
|
-
headers["Authorization"] = f"Bearer {self.api_key}"
|
|
57
|
-
return headers
|
|
58
|
-
|
|
59
|
-
def post(self, payload: dict, path: str, stream=False):
|
|
60
|
-
url = f"{self.service_url}/{path}"
|
|
61
|
-
return requests.post(
|
|
62
|
-
url=url, headers=self._get_headers(), json=payload, stream=stream
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
def get(self, path: str, params: dict = None):
|
|
66
|
-
url = f"{self.service_url}/{path}"
|
|
67
|
-
return requests.get(url, params=params, headers=self._get_headers())
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class WXOInferenceBackend:
|
|
40
|
+
class WXORuntimeAdapter(RuntimeAdapter):
|
|
71
41
|
def __init__(self, wxo_client):
|
|
72
42
|
self.wxo_client = wxo_client
|
|
73
43
|
self.enable_saas_mode = is_saas_url(wxo_client.service_url)
|
|
74
44
|
|
|
75
|
-
def
|
|
45
|
+
def _runs_endpoint(self, user_input: Message, agent_name, thread_id=None):
|
|
76
46
|
agent_id = self.get_agent_id(agent_name)
|
|
77
47
|
payload = {"message": user_input.model_dump(), "agent_id": agent_id}
|
|
78
48
|
if thread_id:
|
|
@@ -108,7 +78,9 @@ class WXOInferenceBackend:
|
|
|
108
78
|
else:
|
|
109
79
|
path = "v1/orchestrate/runs?stream=true"
|
|
110
80
|
|
|
111
|
-
response: requests.Response = self.wxo_client.post(
|
|
81
|
+
response: requests.Response = self.wxo_client.post(
|
|
82
|
+
payload, path, stream=True
|
|
83
|
+
)
|
|
112
84
|
import json
|
|
113
85
|
|
|
114
86
|
for chunk in self._parse_events(response):
|
|
@@ -161,7 +133,9 @@ class WXOInferenceBackend:
|
|
|
161
133
|
citations = parse_citations()
|
|
162
134
|
retrieval_context = parsed_search_results()
|
|
163
135
|
citations_title = conversational_search.get("citations_title", "")
|
|
164
|
-
response_length_option = conversational_search.get(
|
|
136
|
+
response_length_option = conversational_search.get(
|
|
137
|
+
"response_length_option", ""
|
|
138
|
+
)
|
|
165
139
|
text = conversational_search.get("text", "")
|
|
166
140
|
|
|
167
141
|
confidence_scores = ConversationalConfidenceThresholdScore(
|
|
@@ -184,20 +158,21 @@ class WXOInferenceBackend:
|
|
|
184
158
|
|
|
185
159
|
return conversational_search
|
|
186
160
|
|
|
187
|
-
def
|
|
161
|
+
def run(
|
|
188
162
|
self,
|
|
189
163
|
user_input: Message,
|
|
190
|
-
|
|
191
|
-
call_tracker: CallTracker,
|
|
164
|
+
context: dict,
|
|
192
165
|
thread_id=None,
|
|
193
|
-
) ->
|
|
166
|
+
) -> RuntimeResponse:
|
|
167
|
+
|
|
168
|
+
agent_name = context["agent_name"]
|
|
169
|
+
call_tracker = context["call_tracker"]
|
|
194
170
|
recover = False
|
|
195
171
|
messages = list()
|
|
196
172
|
conversational_search_data = []
|
|
197
173
|
|
|
198
174
|
start_time = time.time()
|
|
199
175
|
for chunk in self._stream_events(user_input, agent_name, thread_id):
|
|
200
|
-
|
|
201
176
|
event = chunk.get("event", "")
|
|
202
177
|
if _thread_id := chunk.get("data", {}).get("thread_id"):
|
|
203
178
|
thread_id = _thread_id
|
|
@@ -234,7 +209,9 @@ class WXOInferenceBackend:
|
|
|
234
209
|
)
|
|
235
210
|
)
|
|
236
211
|
end_time = time.time()
|
|
237
|
-
call_tracker.tool_call.append(
|
|
212
|
+
call_tracker.tool_call.append(
|
|
213
|
+
end_time - start_time
|
|
214
|
+
)
|
|
238
215
|
start_time = end_time
|
|
239
216
|
elif step_detail["type"] == "tool_call":
|
|
240
217
|
# in step details, we could have [tool_response, tool_call]
|
|
@@ -252,7 +229,9 @@ class WXOInferenceBackend:
|
|
|
252
229
|
)
|
|
253
230
|
)
|
|
254
231
|
end_time = time.time()
|
|
255
|
-
call_tracker.tool_call.append(
|
|
232
|
+
call_tracker.tool_call.append(
|
|
233
|
+
end_time - start_time
|
|
234
|
+
)
|
|
256
235
|
start_time = end_time
|
|
257
236
|
elif step_detail["type"] == "tool_response":
|
|
258
237
|
content = json.dumps(step_detail)
|
|
@@ -266,7 +245,9 @@ class WXOInferenceBackend:
|
|
|
266
245
|
)
|
|
267
246
|
)
|
|
268
247
|
end_time = time.time()
|
|
269
|
-
call_tracker.tool_response.append(
|
|
248
|
+
call_tracker.tool_response.append(
|
|
249
|
+
end_time - start_time
|
|
250
|
+
)
|
|
270
251
|
start_time = end_time
|
|
271
252
|
elif content_field := delta.get("content"):
|
|
272
253
|
for val in content_field:
|
|
@@ -285,7 +266,9 @@ class WXOInferenceBackend:
|
|
|
285
266
|
chunk=event,
|
|
286
267
|
)
|
|
287
268
|
end_time = time.time()
|
|
288
|
-
call_tracker.generic.append(
|
|
269
|
+
call_tracker.generic.append(
|
|
270
|
+
end_time - start_time
|
|
271
|
+
)
|
|
289
272
|
start_time = end_time
|
|
290
273
|
|
|
291
274
|
# NOTE: The event here that is parsed is part of the "message.created" event
|
|
@@ -309,10 +292,14 @@ class WXOInferenceBackend:
|
|
|
309
292
|
"""
|
|
310
293
|
|
|
311
294
|
last_message = json.loads(messages[-1].content)
|
|
312
|
-
tool_call_id = last_message.get(
|
|
295
|
+
tool_call_id = last_message.get(
|
|
296
|
+
"tool_call_id", None
|
|
297
|
+
)
|
|
313
298
|
assert tool_call_id is not None
|
|
314
|
-
conversational_search_metadata =
|
|
315
|
-
|
|
299
|
+
conversational_search_metadata = (
|
|
300
|
+
ConversationSearchMetadata(
|
|
301
|
+
tool_call_id=tool_call_id
|
|
302
|
+
)
|
|
316
303
|
)
|
|
317
304
|
conversational_search = (
|
|
318
305
|
self.parse_conversational_search_response(
|
|
@@ -320,7 +307,9 @@ class WXOInferenceBackend:
|
|
|
320
307
|
metadata=conversational_search_metadata,
|
|
321
308
|
)
|
|
322
309
|
)
|
|
323
|
-
conversational_search_data.append(
|
|
310
|
+
conversational_search_data.append(
|
|
311
|
+
conversational_search
|
|
312
|
+
)
|
|
324
313
|
messages.append(
|
|
325
314
|
Message(
|
|
326
315
|
role=role,
|
|
@@ -361,7 +350,11 @@ class WXOInferenceBackend:
|
|
|
361
350
|
f"Recovered {len(messages)} messages from thread_id {thread_id}",
|
|
362
351
|
)
|
|
363
352
|
|
|
364
|
-
return
|
|
353
|
+
return RuntimeResponse(
|
|
354
|
+
messages=messages,
|
|
355
|
+
thread_id=thread_id,
|
|
356
|
+
context={"conversational_search_data": conversational_search_data},
|
|
357
|
+
)
|
|
365
358
|
|
|
366
359
|
def _parse_events(
|
|
367
360
|
self, stream: Generator[bytes, None, None]
|
|
@@ -406,6 +399,13 @@ class WXOInferenceBackend:
|
|
|
406
399
|
tool_json = {"type": "tool_call"}
|
|
407
400
|
tool_json.update(tool)
|
|
408
401
|
content = json.dumps(tool_json)
|
|
402
|
+
# TO-DO: review do we even need the get messages for retry loop anymore?
|
|
403
|
+
if msg_content := entry.get("content"):
|
|
404
|
+
if (
|
|
405
|
+
msg_content[0].get("response_type")
|
|
406
|
+
== "conversational_search"
|
|
407
|
+
):
|
|
408
|
+
continue
|
|
409
409
|
messages.append(
|
|
410
410
|
Message(
|
|
411
411
|
role=role,
|
|
@@ -419,7 +419,9 @@ class WXOInferenceBackend:
|
|
|
419
419
|
content = json.dumps(step_detail)
|
|
420
420
|
messages.append(
|
|
421
421
|
Message(
|
|
422
|
-
role=role,
|
|
422
|
+
role=role,
|
|
423
|
+
content=content,
|
|
424
|
+
type=content_type,
|
|
423
425
|
)
|
|
424
426
|
)
|
|
425
427
|
else:
|
|
@@ -427,7 +429,9 @@ class WXOInferenceBackend:
|
|
|
427
429
|
content_type = ContentType.tool_response
|
|
428
430
|
messages.append(
|
|
429
431
|
Message(
|
|
430
|
-
role=role,
|
|
432
|
+
role=role,
|
|
433
|
+
content=content,
|
|
434
|
+
type=content_type,
|
|
431
435
|
)
|
|
432
436
|
)
|
|
433
437
|
if content_field := entry.get("content"):
|
|
@@ -436,12 +440,19 @@ class WXOInferenceBackend:
|
|
|
436
440
|
if val["response_type"] == ContentType.text:
|
|
437
441
|
messages.append(
|
|
438
442
|
Message(
|
|
439
|
-
role=role,
|
|
443
|
+
role=role,
|
|
444
|
+
content=val["text"],
|
|
445
|
+
type=ContentType.text,
|
|
440
446
|
)
|
|
441
447
|
)
|
|
442
|
-
if
|
|
443
|
-
|
|
444
|
-
|
|
448
|
+
if (
|
|
449
|
+
val["response_type"]
|
|
450
|
+
== ContentType.conversational_search
|
|
451
|
+
):
|
|
452
|
+
conversational_search_metadata = (
|
|
453
|
+
ConversationSearchMetadata(
|
|
454
|
+
tool_call_id=tool_call_id
|
|
455
|
+
)
|
|
445
456
|
)
|
|
446
457
|
messages.append(
|
|
447
458
|
Message(
|
|
@@ -503,94 +514,21 @@ class WXOInferenceBackend:
|
|
|
503
514
|
return None
|
|
504
515
|
|
|
505
516
|
|
|
506
|
-
class EvaluationController:
|
|
507
|
-
def __init__(
|
|
508
|
-
self,
|
|
509
|
-
wxo_inference_backend: WXOInferenceBackend,
|
|
510
|
-
llm_user: LLMUser,
|
|
511
|
-
config: TestConfig,
|
|
512
|
-
):
|
|
513
|
-
self.wxo_inference_backend = wxo_inference_backend
|
|
514
|
-
self.llm_user = llm_user
|
|
515
|
-
self.config = config
|
|
516
|
-
|
|
517
|
-
def run(
|
|
518
|
-
self, task_n, story, agent_name: str, starting_user_input: str = None
|
|
519
|
-
) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
|
|
520
|
-
step = 0
|
|
521
|
-
thread_id = None
|
|
522
|
-
conversation_history: List[Message] = []
|
|
523
|
-
conversational_search_history_data = []
|
|
524
|
-
call_tracker = CallTracker()
|
|
525
|
-
# make this configurable
|
|
526
|
-
while step < 20:
|
|
527
|
-
|
|
528
|
-
if step == 0 and starting_user_input:
|
|
529
|
-
user_input = Message(
|
|
530
|
-
role="user", content=starting_user_input, type=ContentType.text
|
|
531
|
-
)
|
|
532
|
-
else:
|
|
533
|
-
if self.config.enable_manual_user_input == True:
|
|
534
|
-
content = input(
|
|
535
|
-
"[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
|
|
536
|
-
)
|
|
537
|
-
user_input = Message(
|
|
538
|
-
role="user", content=content, type=ContentType.text
|
|
539
|
-
)
|
|
540
|
-
else: # llm
|
|
541
|
-
user_input = self.llm_user.generate_user_input(
|
|
542
|
-
story, conversation_history
|
|
543
|
-
)
|
|
544
|
-
if self.config.enable_verbose_logging:
|
|
545
|
-
rich.print(
|
|
546
|
-
f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
|
|
547
|
-
user_input.content,
|
|
548
|
-
)
|
|
549
|
-
if is_end(user_input):
|
|
550
|
-
break
|
|
551
|
-
conversation_history.append(user_input)
|
|
552
|
-
messages, thread_id, conversational_search_data = (
|
|
553
|
-
self.wxo_inference_backend.stream_messages(
|
|
554
|
-
user_input,
|
|
555
|
-
agent_name=agent_name,
|
|
556
|
-
thread_id=thread_id,
|
|
557
|
-
call_tracker=call_tracker,
|
|
558
|
-
)
|
|
559
|
-
)
|
|
560
|
-
if not messages:
|
|
561
|
-
raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
|
|
562
|
-
if self.config.enable_verbose_logging:
|
|
563
|
-
for message in messages:
|
|
564
|
-
rich.print(
|
|
565
|
-
f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
|
|
566
|
-
message.content,
|
|
567
|
-
)
|
|
568
|
-
conversation_history.extend(messages)
|
|
569
|
-
conversational_search_history_data.extend(conversational_search_data)
|
|
570
|
-
step += 1
|
|
571
|
-
return conversation_history, call_tracker, conversational_search_history_data
|
|
572
|
-
|
|
573
|
-
def get_wxo_client(
|
|
574
|
-
service_url: str, tenant_name: str, token: str = None
|
|
575
|
-
) -> WXOClient:
|
|
576
|
-
if not token:
|
|
577
|
-
token = tenant_setup(service_url, tenant_name)
|
|
578
|
-
wxo_client = WXOClient(service_url=service_url, api_key=token)
|
|
579
|
-
return wxo_client
|
|
580
|
-
|
|
581
|
-
|
|
582
517
|
if __name__ == "__main__":
|
|
583
518
|
wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
|
|
584
|
-
auth_config_path =
|
|
519
|
+
auth_config_path = (
|
|
520
|
+
f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
|
|
521
|
+
)
|
|
585
522
|
with open(auth_config_path, "r") as f:
|
|
586
523
|
auth_config = yaml.safe_load(f)
|
|
524
|
+
|
|
587
525
|
tenant_name = "local"
|
|
588
526
|
token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
|
|
589
527
|
|
|
590
528
|
wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
|
|
591
|
-
inference_backend =
|
|
592
|
-
resp = wxo_client.get("orchestrate/agents")
|
|
529
|
+
inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
|
|
530
|
+
resp = wxo_client.get("v1/orchestrate/agents")
|
|
593
531
|
resp = resp.json()
|
|
594
|
-
|
|
532
|
+
|
|
595
533
|
for agent in resp:
|
|
596
534
|
print(agent["name"], agent["display_name"])
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
|
+
from enum import unique
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Callable, Dict, List, Set, Tuple
|
|
9
|
+
|
|
10
|
+
from rich import print as rich_print
|
|
11
|
+
from rich.progress import Progress
|
|
12
|
+
|
|
13
|
+
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
14
|
+
from wxo_agentic_evaluation.clients import Clients
|
|
15
|
+
from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def discover_tests(
    test_paths: List[str], recursive_search: bool = False
) -> List[str]:
    """
    Discover test case files from the given test paths.

    Each entry of ``test_paths`` may be a directory, a single file, or a glob
    pattern. Directories are searched for ``*.json`` files, either only at
    their top level or, when ``recursive_search`` is enabled, through all
    subdirectories. Files and patterns are expanded with ``glob``.

    Files that do not end in ``.json`` and files whose path ends in
    ``agent.json`` are dropped. When several files share the same stem
    (file name without extension), only the first one found is kept and a
    message is printed for every later duplicate.

    Args:
        test_paths: List of paths or glob patterns to search for test cases
        recursive_search: Whether to search recursively in subdirectories

    Returns:
        List of test case file paths, one per unique test case name (stem),
        in discovery order
    """
    test_cases = []
    for test_path in test_paths:
        # Expand the entry once; the result doubles as the existence check.
        matches = glob.glob(test_path)

        # A literal path that contains glob metacharacters (e.g. "suite[1]")
        # matches nothing when read as a pattern, so also accept it when it
        # exists on disk as written.
        if not matches and not os.path.exists(test_path):
            rich_print(
                f"[bold yellow]Warning: Path '{test_path}' does not exist. Skipping.[/bold yellow]"
            )
            continue

        if os.path.isdir(test_path):
            # Escape the directory so only our own wildcards are expanded.
            base_dir = glob.escape(test_path)
            if recursive_search:
                # Use ** pattern for recursive search
                pattern = os.path.join(base_dir, "**", "*.json")
                mode = "recursive search"
            else:
                pattern = os.path.join(base_dir, "*.json")
                mode = "non-recursive"
            found_files = sorted(glob.glob(pattern, recursive=recursive_search))
            rich_print(
                f"Found {len(found_files)} files in '{test_path}' ({mode})"
            )
        else:
            # A file or file pattern: use the expansion directly, falling back
            # to the literal path when it exists but did not match as a pattern.
            found_files = sorted(matches) if matches else [test_path]

        test_cases.extend(found_files)

    # Filter out non-JSON files and agent.json files.
    # NOTE(review): the suffix check also drops names that merely end in
    # "agent.json" (e.g. "hr_agent.json") - confirm that this is intended.
    filtered_cases = [
        tc
        for tc in test_cases
        if tc.endswith(".json") and not tc.endswith("agent.json")
    ]

    # Map test case name (file stem) to the first file path seen for it.
    unique_files_map: Dict[str, str] = {}

    for f in filtered_cases:
        name = Path(f).stem
        if name not in unique_files_map:
            unique_files_map[name] = f
        else:
            rich_print(
                f"[bold red]Duplicate test case name detected:[/bold red] "
                f"'{name}' (skipping file '{f}')"
            )

    unique_files = list(unique_files_map.values())
    rich_print(
        f"[bold green]Discovered {len(unique_files)} test cases in total[/bold green]"
    )
    return unique_files
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _removesuffix(s: str, suf: str) -> str:
|
|
94
|
+
"""Remove suffix from string (for Python < 3.9 compatibility)"""
|
|
95
|
+
return s[: -len(suf)] if s.endswith(suf) else s
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def get_available_runs(output_dir: str) -> Dict[str, Set[int]]:
    """
    Collect the run numbers that already have a messages file on disk.

    Looks at ``<output_dir>/messages/*.messages.json``. After the fixed
    ``.messages.json`` tail is removed, a file name is read either as
    ``<stem>`` (counted as run 1) or as ``<stem>.run<N>`` (counted as run N).

    Args:
        output_dir: Output directory path

    Returns:
        Mapping (a ``defaultdict``) from test case stem to the set of run
        numbers found for it
    """
    tail = ".messages.json"
    # "<stem>" alone (single run) or "<stem>.runN" (multi-run)
    name_pattern = r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$"

    runs_by_stem = defaultdict(set)
    messages_glob = os.path.join(output_dir, "messages", "*.messages.json")
    for file_path in glob.glob(messages_glob):
        base = os.path.basename(file_path)
        # strip the fixed tail
        trimmed = base[: -len(tail)] if base.endswith(tail) else base
        parsed = re.match(name_pattern, trimmed)
        if parsed is None:
            continue
        run_text = parsed.group("run")
        # no suffix => run 1
        runs_by_stem[parsed.group("stem")].add(int(run_text or 1))

    return runs_by_stem
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def enumerate_jobs(
    test_cases: List[str],
    n_runs: int,
    skip_available_results: bool,
    output_dir: str,
) -> List[Tuple[int, str, int]]:
    """
    Build the list of (test case, run) jobs that still have to be executed.

    Every test case is scheduled ``n_runs`` times. When
    ``skip_available_results`` is set, a (test case, run) pair is left out if
    ``get_available_runs(output_dir)`` already lists that run number for the
    test case's stem, and a message is printed for it.

    Args:
        test_cases: List of test case file paths
        n_runs: Number of runs per test case
        skip_available_results: Whether to skip available results
        output_dir: Output directory path

    Returns:
        List of tuples (task_n, test_case, run_idx). ``task_n`` numbers the
        scheduled jobs consecutively from 0 and ``run_idx`` is zero-based.
    """
    finished = get_available_runs(output_dir) if skip_available_results else {}

    pending = []
    for case_path in test_cases:
        case_stem = Path(case_path).stem
        done_runs = finished.get(case_stem, set())

        for run_idx in range(n_runs):
            # Runs are numbered from 1 on disk, while run_idx is zero-based.
            human_run = run_idx + 1

            # Skip precisely this (test, run) if results exist
            if skip_available_results and human_run in done_runs:
                print(
                    f"Skipping {case_stem} run {human_run} as results already exist."
                )
                continue

            # The running job number equals the count of jobs queued so far.
            pending.append((len(pending), case_path, run_idx))

    return pending
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _collect_job_results(futures, on_done=None) -> List[Any]:
    """Wait for each submitted job in order and merge their results.

    A job that raises is reported (message plus traceback) and skipped, so
    one failing test case does not abort the remaining ones.

    Args:
        futures: List of ``((test_case, run_idx), future)`` pairs.
        on_done: Optional zero-argument callback invoked after every job,
            whether it succeeded or failed.

    Returns:
        The concatenated results of all jobs that completed successfully.
    """
    import traceback

    results = []
    for (test_case, _run_idx), future in futures:
        try:
            results.extend(future.result())
        except Exception as e:
            rich_print(f"test case {test_case} fails with {e}")
            traceback.print_exc()
        finally:
            if on_done is not None:
                on_done()
    return results


def run_jobs(
    jobs: List[Tuple[int, str, int]],
    config: TestConfig,
    clients: Clients,
    process_func: Callable,
    num_workers: int,
) -> List[Any]:
    """
    Run jobs using ThreadPoolExecutor.

    Each job is submitted as ``process_func(task_n, test_case, config,
    clients.inference_backend, clients.resource_map, clients.llm_user,
    clients.llmaaj_provider, run_idx)``. Results are gathered in submission
    order; a progress bar is shown unless ``LOGGING_ENABLED`` is set.

    Side effect: when ``config.num_workers > 1`` and manual user input is
    enabled, ``config.enable_manual_user_input`` is switched to ``False``.

    Args:
        jobs: List of jobs to run, as (task_n, test_case, run_idx) tuples
        config: Test configuration
        clients: Object exposing ``inference_backend``, ``resource_map``,
            ``llm_user`` and ``llmaaj_provider`` attributes
        process_func: Function to process each job; it must return an
            iterable, which is merged into the overall result list
        num_workers: Number of worker threads

    Returns:
        List of results from all jobs that completed without raising
    """

    # NOTE(review): the check reads config.num_workers while the pool below is
    # sized by the num_workers argument - confirm both always agree.
    if config.num_workers > 1 and config.enable_manual_user_input:
        rich_print(
            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
        )
        config.enable_manual_user_input = (
            False  # disable manual user input for parallel execution
        )

    # The context manager shuts the pool down once all results are collected,
    # so worker threads do not outlive this call.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            (
                (test_case, run_idx),
                executor.submit(
                    process_func,
                    task_n,
                    test_case,
                    config,
                    clients.inference_backend,
                    clients.resource_map,
                    clients.llm_user,
                    clients.llmaaj_provider,
                    run_idx,
                ),
            )
            for task_n, test_case, run_idx in jobs
        ]

        if not futures:
            return []

        if LOGGING_ENABLED:
            # No progress bar when logging - just process tasks
            return _collect_job_results(futures)

        with Progress() as progress:
            task1 = progress.add_task(
                f"[purple]Evaluating {len(futures)} tasks...",
                total=len(futures),
            )
            return _collect_job_results(
                futures,
                on_done=lambda: progress.update(task1, advance=1),
            )
|