ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
- wxo_agentic_evaluation/__init__.py +0 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
- wxo_agentic_evaluation/analytics/tools/main.py +163 -0
- wxo_agentic_evaluation/analytics/tools/types.py +130 -0
- wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
- wxo_agentic_evaluation/analyze_run.py +123 -0
- wxo_agentic_evaluation/annotate.py +40 -0
- wxo_agentic_evaluation/arg_configs.py +78 -0
- wxo_agentic_evaluation/batch_annotate.py +181 -0
- wxo_agentic_evaluation/data_annotator.py +253 -0
- wxo_agentic_evaluation/evaluation_package.py +518 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
- wxo_agentic_evaluation/external_agent/types.py +65 -0
- wxo_agentic_evaluation/inference_backend.py +601 -0
- wxo_agentic_evaluation/llm_matching.py +39 -0
- wxo_agentic_evaluation/llm_rag_eval.py +47 -0
- wxo_agentic_evaluation/llm_user.py +38 -0
- wxo_agentic_evaluation/main.py +231 -0
- wxo_agentic_evaluation/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
- wxo_agentic_evaluation/metrics/metrics.py +101 -0
- wxo_agentic_evaluation/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
- wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
- wxo_agentic_evaluation/prompt/template_render.py +90 -0
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
- wxo_agentic_evaluation/record_chat.py +165 -0
- wxo_agentic_evaluation/service_instance.py +179 -0
- wxo_agentic_evaluation/tool_planner.py +228 -0
- wxo_agentic_evaluation/type.py +176 -0
- wxo_agentic_evaluation/utils/__init__.py +6 -0
- wxo_agentic_evaluation/utils/utils.py +233 -0
- wxo_agentic_evaluation/watsonx_provider.py +175 -0
wxo_agentic_evaluation/inference_backend.py
@@ -0,0 +1,601 @@
import requests
import os
import yaml
import json
import rich
import time
from pydantic import BaseModel
from typing import List, Generator, Dict, Tuple, Mapping, Any

from wxo_agentic_evaluation.type import (
    ContentType,
    Message,
    ConversationalSearch,
    ConversationalSearchCitations,
    ConversationalSearchResultMetadata,
    ConversationalConfidenceThresholdScore,
    ConversationalSearchResults,
    ConversationSearchMetadata,
)

from wxo_agentic_evaluation.llm_user import LLMUser
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.service_instance import tenant_setup
from wxo_agentic_evaluation.utils.utils import is_saas_url


def is_end(user_input: Message) -> bool:
    return "END" in user_input.content.strip()


def is_transfer_response(step_detail: Dict) -> bool:
    return step_detail["type"] == "tool_response" and step_detail["name"].startswith(
        "transfer_to_"
    )


class CallTracker(BaseModel):
    tool_call: List = []
    tool_response: List = []
    generic: List = []


class WXOClient:
    def __init__(self, service_url, api_key):
        self.service_url = service_url
        self.api_key = api_key

    def _get_headers(self) -> dict:
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    def post(self, payload: dict, path: str, stream=False):
        url = f"{self.service_url}/{path}"
        return requests.post(
            url=url, headers=self._get_headers(), json=payload, stream=stream
        )

    def get(self, path: str, params: dict = None):
        url = f"{self.service_url}/{path}"
        return requests.get(url, params=params, headers=self._get_headers())

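# Illustrative usage of the thin HTTP wrapper above (values are placeholders,
# not part of the package):
#
#     client = WXOClient(service_url="http://localhost:4321", api_key="<token>")
#     client.get("orchestrate/agents")                      # GET with Bearer auth
#     client.post({"agent_id": "..."}, "orchestrate/runs")  # POST with JSON body
#
# post() and get() join service_url and path with a single "/", so a path that
# already starts with "/" (as in WXOInferenceBackend.run below) produces a
# double slash in the URL, which most HTTP servers tolerate.
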
class WXOInferenceBackend:
    def __init__(self, wxo_client):
        self.wxo_client = wxo_client
        self.enable_saas_mode = is_saas_url(wxo_client.service_url)

    def run(self, user_input: Message, agent_name, thread_id=None):
        agent_id = self.get_agent_id(agent_name)
        payload = {"message": user_input.model_dump(), "agent_id": agent_id}
        if thread_id:
            payload["thread_id"] = thread_id

        if self.enable_saas_mode:
            path = "/v1/orchestrate/runs"
        else:
            path = "/orchestrate/runs"

        response: requests.Response = self.wxo_client.post(payload, path)

        if response.status_code == 200:
            result = response.json()
            return result["thread_id"]
        else:
            response.raise_for_status()

    def _stream_events(
        self, user_input: Message, agent_name: str, thread_id=None
    ) -> Generator[Dict, None, None]:
        agent_id = self.get_agent_id(agent_name)
        payload = {"message": user_input.model_dump(), "agent_id": agent_id}
        if thread_id:
            payload["thread_id"] = thread_id

        if self.enable_saas_mode:
            path = "/v1/orchestrate/runs?stream=true"
        else:
            path = "/orchestrate/runs?stream=true"

        response: requests.Response = self.wxo_client.post(payload, path, stream=True)

        for chunk in self._parse_events(response):
            yield json.loads(chunk.strip())

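    # The stream consumed above carries one JSON document per event. Judging
    # from the fields stream_messages reads below, each decoded chunk looks
    # roughly like the following (shapes inferred from this file, values
    # illustrative, not a documented wire format):
    #
    #     {"event": "...", "data": {"thread_id": "...", "delta": {
    #         "role": "...", "step_details": [...], "content": [...]}}}
    #     {"event": "message.created", "data": {"message": {
    #         "role": "...", "content": [...]}}}
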
    def parse_conversational_search_response(
        self,
        conversational_search: Mapping[str, Any],
        metadata: ConversationSearchMetadata,
    ) -> ConversationalSearch:
        def parse_citations():
            citations = conversational_search["citations"]
            parsed_citations = []
            for citation in citations:
                c = ConversationalSearchCitations(
                    url=citation.get("url", ""),
                    body=citation.get("body", ""),
                    text=citation.get("text", ""),
                    title=citation.get("title", ""),
                    range_start=citation.get("range_start"),
                    range_end=citation.get("range_end"),
                    search_result_idx=citation.get("search_result_idx"),
                )
                parsed_citations.append(c)

            return parsed_citations

        def parsed_search_results():
            search_results = conversational_search["search_results"]
            parsed_search_results = []
            for result in search_results:
                result_metadata = result.get("result_metadata", {})
                result_metadata = ConversationalSearchResultMetadata(
                    score=result_metadata.get("score"),
                    document_retrieval_source=result_metadata.get(
                        "document_retrieval_source"
                    ),
                )
                c = ConversationalSearchResults(
                    url=result.get("url", ""),
                    body=result.get("body", ""),
                    title=result.get("title", ""),
                    result_metadata=result_metadata,
                )
                parsed_search_results.append(c)

            return parsed_search_results

        citations = parse_citations()
        retrieval_context = parsed_search_results()
        citations_title = conversational_search.get("citations_title", "")
        response_length_option = conversational_search.get("response_length_option", "")
        text = conversational_search.get("text", "")

        confidence_scores = ConversationalConfidenceThresholdScore(
            **conversational_search.get("confidence_scores")
        )
        response_type = conversational_search.get("response_type")
        # should always be conversational_search
        assert response_type == ContentType.conversational_search

        conversational_search = ConversationalSearch(
            metadata=metadata,
            response_type=response_type,
            text=text,
            citations=citations,
            search_results=retrieval_context,
            citations_title=citations_title,
            confidence_scores=confidence_scores,
            response_length_option=response_length_option,
        )

        return conversational_search

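    # Note on the parser above: most fields fall back to defaults via .get(),
    # but "citations" and "search_results" are accessed with [] (KeyError if
    # absent) and a missing "confidence_scores" makes the ** expansion raise a
    # TypeError, so those three keys are effectively required. A response_type
    # other than conversational_search trips the assert.
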
    def stream_messages(
        self,
        user_input: Message,
        agent_name: str,
        call_tracker: CallTracker,
        thread_id=None,
    ) -> Tuple[List[Message], str, List[ConversationalSearch]]:
        recover = False
        messages = list()
        conversational_search_data = []

        start_time = time.time()
        for chunk in self._stream_events(user_input, agent_name, thread_id):

            event = chunk.get("event", "")
            if _thread_id := chunk.get("data", {}).get("thread_id"):
                thread_id = _thread_id
            if delta := chunk.get("data", {}).get("delta"):
                role = delta["role"]
                if step_details := delta.get("step_details"):
                    if any(
                        is_transfer_response(step_detail)
                        for step_detail in step_details
                    ):
                        continue
                    for idx, step_detail in enumerate(step_details):
                        if step_detail["type"] == "tool_calls":
                            # in step details, we could have [tool_response, tool_call];
                            # in this case, we skip since we already captured the tool call
                            if idx == 1:
                                continue

                            content_type = ContentType.tool_call
                            for tool in step_detail["tool_calls"]:
                                # Only add "transfer_to_" calls here. Other tool calls are already
                                # captured in the next block; including them here would cause duplication.
                                # if not tool["name"].startswith("transfer_to_"):
                                #     continue
                                tool_json = {"type": "tool_call"}
                                tool_json.update(tool)
                                content = json.dumps(tool_json)
                                messages.append(
                                    Message(
                                        role=role,
                                        content=content,
                                        type=content_type,
                                        event=event,
                                    )
                                )
                                end_time = time.time()
                                call_tracker.tool_call.append(end_time - start_time)
                                start_time = end_time
                        elif step_detail["type"] == "tool_call":
                            # in step details, we could have [tool_response, tool_call];
                            # in this case, we skip since we already captured the tool call
                            if idx == 1:
                                continue
                            content_type = ContentType.tool_call
                            content = json.dumps(step_detail)
                            messages.append(
                                Message(
                                    role=role,
                                    content=content,
                                    type=content_type,
                                    event=event,
                                )
                            )
                            end_time = time.time()
                            call_tracker.tool_call.append(end_time - start_time)
                            start_time = end_time
                        elif step_detail["type"] == "tool_response":
                            content = json.dumps(step_detail)
                            content_type = ContentType.tool_response
                            messages.append(
                                Message(
                                    role=role,
                                    content=content,
                                    type=content_type,
                                    event=event,
                                )
                            )
                            end_time = time.time()
                            call_tracker.tool_response.append(end_time - start_time)
                            start_time = end_time
                elif content_field := delta.get("content"):
                    for val in content_field:
                        response_type = val["response_type"]
                        # TODO: is this ever hit? the event name is "message.created", and it seems the event should be "message.delta"
                        if (
                            response_type == ContentType.text
                            and chunk["event"] == "message_created"
                        ):
                            messages.append(
                                Message(
                                    role=role,
                                    content=val["text"],
                                    type=ContentType.text,
                                    event=event,
                                )
                            )
                            end_time = time.time()
                            call_tracker.generic.append(end_time - start_time)
                            start_time = end_time

            # NOTE: The event parsed here is part of the "message.created" event
            elif message := chunk.get("data", {}).get("message"):
                role = message["role"]
                for content in message["content"]:
                    if (
                        content["response_type"]
                        == ContentType.conversational_search
                    ):
                        end_time = time.time()
                        call_tracker.generic.append(end_time - start_time)
                        start_time = end_time

                        """ This is under the assumption that the flow is (tool call -> tool response -> response back to user).
                        In other words, the tool response is not fed back into the agent.
                        We get the previous message and extract the `tool_call_id`.

                        NOTE: The previous message is a tool call because of how we parse the event stream.
                        NOTE: The conversational search response event does not have a 'tool call id' which can be used to associate it with the 'conversational search response'.
                        """

                        last_message = json.loads(messages[-1].content)
                        tool_call_id = last_message.get("tool_call_id", None)
                        assert tool_call_id is not None
                        conversational_search_metadata = ConversationSearchMetadata(
                            tool_call_id=tool_call_id
                        )
                        conversational_search = (
                            self.parse_conversational_search_response(
                                conversational_search=content,
                                metadata=conversational_search_metadata,
                            )
                        )
                        conversational_search_data.append(conversational_search)
                        messages.append(
                            Message(
                                role=role,
                                content=content["text"],
                                type=ContentType.conversational_search,
                                conversational_search_metadata=conversational_search_metadata,
                                event=event,
                            )
                        )
                    if content["response_type"] == ContentType.text:
                        messages.append(
                            Message(
                                role=role,
                                content=content["text"],
                                type=ContentType.text,
                                event=chunk["event"],
                            )
                        )
                        end_time = time.time()
                        call_tracker.generic.append(end_time - start_time)
                        start_time = end_time
            else:
                # Exit the loop if we lose the thread_id
                recover = True
                break

        if recover and (thread_id is not None):
            rich.print(
                "🔬 [bold][magenta]INFO:[/magenta][/bold]",
                f"Attempting to recover messages from thread_id {thread_id}",
            )
            # If we lose the thread_id, we need to wait for a bit to allow the message to come through
            # before attempting to recover the messages.
            time.sleep(10)
            messages = self.recover_messages(thread_id)
            rich.print(
                "🔬 [bold][magenta]INFO:[/magenta][/bold]",
                f"Recovered {len(messages)} messages from thread_id {thread_id}",
            )

        return messages, thread_id, conversational_search_data

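    # Recovery path in brief: if a chunk arrives with neither a "delta" nor a
    # "message" payload, stream_messages above gives up on the stream, waits
    # ten seconds, and re-reads the thread through recover_messages() below,
    # which fetches the full history and keeps only the turns after the last
    # user message.
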
    def _parse_events(
        self, stream: Generator[bytes, None, None]
    ) -> Generator[bytes, None, None]:
        data = b""
        for chunk in stream:
            for line in chunk.splitlines(True):
                data += line
                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n", b"\n")):
                    yield data
                    data = b""
        if data:
            yield data

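    # Worked example of the framing above: a payload of
    # b'{"event": "a"}\n{"event": "b"}\n' arriving at arbitrary chunk
    # boundaries is buffered until a terminator is seen, yielding the two
    # complete lines separately. Because b"\n" is among the terminators, the
    # framing is effectively line-based (NDJSON); any trailing bytes without a
    # terminator are flushed once the stream ends.
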
    def recover_messages(self, thread_id: str) -> List[Message]:
        messages = self.get_messages(thread_id)
        return self._get_messages_after_last_user(messages)

    def get_messages(self, thread_id) -> List[Message]:
        if self.enable_saas_mode:
            path = f"v1/orchestrate/threads/{thread_id}/messages"
        else:
            path = f"threads/{thread_id}/messages"
        response = self.wxo_client.get(path)
        # raises on 4xx/5xx error responses; otherwise parse the thread history
        response.raise_for_status()
        result = response.json()

        messages = []
        for entry in result:
            tool_call_id = None
            if step_history := entry.get("step_history"):
                for step_message in step_history:
                    role = step_message["role"]
                    if step_details := step_message.get("step_details"):
                        for step_detail in step_details:
                            if step_detail["type"] == "tool_calls":
                                content_type = ContentType.tool_call
                                for tool in step_detail["tool_calls"]:
                                    tool_json = {"type": "tool_call"}
                                    tool_json.update(tool)
                                    content = json.dumps(tool_json)
                                    messages.append(
                                        Message(
                                            role=role,
                                            content=content,
                                            type=content_type,
                                        )
                                    )
                            elif step_detail["type"] == "tool_call":
                                tool_call_id = step_detail["tool_call_id"]
                                content_type = ContentType.tool_call
                                content = json.dumps(step_detail)
                                messages.append(
                                    Message(
                                        role=role, content=content, type=content_type
                                    )
                                )
                            else:
                                content = json.dumps(step_detail)
                                content_type = ContentType.tool_response
                                messages.append(
                                    Message(
                                        role=role, content=content, type=content_type
                                    )
                                )
            if content_field := entry.get("content"):
                role = entry["role"]
                for val in content_field:
                    if val["response_type"] == ContentType.text:
                        messages.append(
                            Message(
                                role=role, content=val["text"], type=ContentType.text
                            )
                        )
                    if val["response_type"] == ContentType.conversational_search:
                        conversational_search_metadata = ConversationSearchMetadata(
                            tool_call_id=tool_call_id
                        )
                        messages.append(
                            Message(
                                role=role,
                                content=val["text"],
                                type=ContentType.text,
                                conversational_search_metadata=conversational_search_metadata,
                            )
                        )

        return messages

    @staticmethod
    def _get_messages_after_last_user(messages: List[Message]) -> List[Message]:
        for i in range(len(messages) - 1, -1, -1):
            if messages[i].role == "user":
                return messages[i + 1 :]
        return messages

    def get_agent_id(self, agent_name: str):
        if self.enable_saas_mode:
            path = "v1/orchestrate/agents"
        else:
            path = "orchestrate/agents"

        response = self.wxo_client.get(path)

        if response.status_code == 200:
            result = response.json()
            for agent in result:
                if agent.get("name", "") == agent_name:
                    return agent.get("id")

            raise Exception(f"Agent with name {agent_name} not found.")

        else:
            response.raise_for_status()


class EvaluationController:
    def __init__(
        self,
        wxo_inference_backend: WXOInferenceBackend,
        llm_user: LLMUser,
        config: TestConfig,
    ):
        self.wxo_inference_backend = wxo_inference_backend
        self.llm_user = llm_user
        self.config = config

    def run(
        self, task_n, story, agent_name: str, starting_user_input: str = None
    ) -> Tuple[List[Message], CallTracker, List[ConversationalSearch]]:
        step = 0
        thread_id = None
        conversation_history: List[Message] = []
        conversational_search_history_data = []
        call_tracker = CallTracker()
        # TODO: make this step cap configurable
        while step < 20:

            if step == 0 and starting_user_input:
                user_input = Message(
                    role="user", content=starting_user_input, type=ContentType.text
                )
            else:
                if self.config.enable_manual_user_input:
                    content = input(
                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
                    )
                    user_input = Message(
                        role="user", content=content, type=ContentType.text
                    )
                else:  # llm
                    user_input = self.llm_user.generate_user_input(
                        story, conversation_history
                    )
            if self.config.enable_verbose_logging:
                rich.print(
                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                    user_input.content,
                )
            if is_end(user_input):
                break
            conversation_history.append(user_input)
            messages, thread_id, conversational_search_data = (
                self.wxo_inference_backend.stream_messages(
                    user_input,
                    agent_name=agent_name,
                    thread_id=thread_id,
                    call_tracker=call_tracker,
                )
            )
            if self.config.enable_verbose_logging:
                for message in messages:
                    rich.print(
                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
                        message.content,
                    )
            conversation_history.extend(messages)
            conversational_search_history_data.extend(conversational_search_data)
            step += 1
        return conversation_history, call_tracker, conversational_search_history_data


def get_wxo_client(service_url: str, token: str):
    wxo_client = WXOClient(service_url=service_url, api_key=token)
    return wxo_client


def get_wxo_inference_backend(
    service_url: str, tenant_name: str, token: str = None
) -> WXOInferenceBackend:
    if not token:
        token = tenant_setup(service_url, tenant_name)
    wxo_client = get_wxo_client(service_url, token)
    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
    return inference_backend


if __name__ == "__main__":
    wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
    llm_user = LLMUser(
        wai_client=wai_client,
        template=LlamaUserTemplateRenderer(
            "src/wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2"
        ),
        user_response_style=None,
    )
    auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
    with open(auth_config_path, "r") as f:
        auth_config = yaml.safe_load(f)
    tenant_name = "local"
    token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]

    wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
    config = TestConfig(
        test_paths=[],
        output_dir="./wxo_agentic_evaluation/results",
        auth_config=auth_config,
        wxo_lite_version="0.1.3",
    )
    evaluation_controller = EvaluationController(
        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
    )
    history, _, _ = evaluation_controller.run(
        0,
        "Your username is nken and you want to find out the timeoff schedule of your reports from 20250101 to 202505",
        agent_name="hr_agent",
    )
    # starting_user_input="my username is nken, i want to know the timeoff schedule for my reports from 20250101 to 202505")

    result = list()
    for message in history:
        result.append(message.model_dump())

    os.makedirs("./wxo_agentic_evaluation/results", exist_ok=True)
    with open("./wxo_agentic_evaluation/results/messages.json", "w") as f:
        json.dump(result, f)

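The __main__ block above doubles as a usage example. A more compact driver, going through the get_wxo_inference_backend factory instead of wiring WXOClient and the token by hand, might look like the sketch below; the service URL, tenant name, and agent name are illustrative values borrowed from that block, not a documented configuration.

from wxo_agentic_evaluation.inference_backend import get_wxo_inference_backend
from wxo_agentic_evaluation.type import ContentType, Message

# With no token passed, the factory calls tenant_setup() to obtain one
# (assumes the "local" tenant is configured as in the __main__ block above).
backend = get_wxo_inference_backend(
    service_url="http://localhost:4321", tenant_name="local"
)
user_msg = Message(role="user", content="hi", type=ContentType.text)
thread_id = backend.run(user_msg, agent_name="hr_agent")  # non-streaming run
history = backend.get_messages(thread_id)  # full thread history as Message objects
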
wxo_agentic_evaluation/llm_matching.py
@@ -0,0 +1,39 @@
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
from wxo_agentic_evaluation.prompt.template_render import (
    KeywordMatchingTemplateRenderer,
    SemanticMatchingTemplateRenderer,
)
from typing import List


class LLMMatcher:
    def __init__(
        self,
        llm_client: WatsonXProvider,
        keyword_template: KeywordMatchingTemplateRenderer,
        semantic_template: SemanticMatchingTemplateRenderer,
    ):
        self.llm_client = llm_client
        self.keyword_template = keyword_template
        self.semantic_template = semantic_template

    def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
        # Return True if no keywords are provided; this allows skipping the
        # keyword check by passing an empty list.
        if len(keywords) == 0:
            return True
        keywords_text = "\n".join(keywords)
        prompt = self.keyword_template.render(
            keywords_text=keywords_text, response_text=response_text
        )
        output = self.llm_client.query(prompt)
        result = output["generated_text"].strip().lower()
        return result.startswith("true")

    def semantic_match(self, prediction: str, ground_truth: str) -> bool:
        prompt = self.semantic_template.render(
            expected_text=ground_truth, actual_text=prediction
        )
        output = self.llm_client.query(prompt)
        result = output["generated_text"].strip().lower()
        return result.startswith("true")

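Both matchers reduce an LLM judgment to a boolean by checking whether the generated text starts with "true". A hypothetical wiring, reusing the provider and template-renderer pattern from inference_backend.py; the template paths are assumptions based on the prompt files in the package listing above.

from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
from wxo_agentic_evaluation.llm_matching import LLMMatcher
from wxo_agentic_evaluation.prompt.template_render import (
    KeywordMatchingTemplateRenderer,
    SemanticMatchingTemplateRenderer,
)

matcher = LLMMatcher(
    llm_client=WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct"),
    keyword_template=KeywordMatchingTemplateRenderer(
        "src/wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2"
    ),
    semantic_template=SemanticMatchingTemplateRenderer(
        "src/wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2"
    ),
)
# keywords_match returns True when the judge model says the keywords are covered
ok = matcher.keywords_match("Your PTO balance is 12 days.", ["PTO", "12"])
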
wxo_agentic_evaluation/llm_rag_eval.py
@@ -0,0 +1,47 @@
from typing import List
import json

from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
from wxo_agentic_evaluation.prompt.template_render import (
    FaithfulnessTemplateRenderer,
    AnswerRelevancyTemplateRenderer,
)
from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy


class LLMJudge:
    def __init__(
        self,
        llm_client: WatsonXProvider,
        faithfulness: FaithfulnessTemplateRenderer,
        answer_relevancy: AnswerRelevancyTemplateRenderer,
    ):
        self.llm_client = llm_client
        self.faithfulness_template = faithfulness
        self.answer_relevancy_template = answer_relevancy

    # TODO: implement callable, and implement decorator to retry the LLM call
    def faithfulness(self, claim, retrieval_context: List[str]) -> Faithfulness:
        retrieval_context = "\n".join(retrieval_context)
        prompt = self.faithfulness_template.render(
            claim=claim, retrieval_context=retrieval_context
        )
        output = self.llm_client.query(prompt)
        result = output["generated_text"].strip().lower()

        faithfulness = Faithfulness.model_validate(json.loads(result))

        return faithfulness

    def answer_relevancy(
        self, question: str, context: str, answer: str
    ) -> AnswerRelevancy:
        prompt = self.answer_relevancy_template.render(
            question=question, context=context, answer=answer
        )
        output = self.llm_client.query(prompt)
        result = output["generated_text"].strip().lower()

        answer_relevancy = AnswerRelevancy(answer_relevancy=json.loads(result))

        return answer_relevancy

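LLMJudge follows the same pattern as LLMMatcher but parses the judge model's output as JSON into the pydantic metric models, so the prompt templates must elicit JSON (note the .lower() on the raw output, which those models have to tolerate). A hypothetical end-to-end call, with template paths again assumed from the package listing above:

from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
from wxo_agentic_evaluation.prompt.template_render import (
    FaithfulnessTemplateRenderer,
    AnswerRelevancyTemplateRenderer,
)

judge = LLMJudge(
    llm_client=WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct"),
    faithfulness=FaithfulnessTemplateRenderer(
        "src/wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2"
    ),
    answer_relevancy=AnswerRelevancyTemplateRenderer(
        "src/wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2"
    ),
)
# faithfulness() joins the retrieval passages into one context string and
# expects the judge model to emit JSON parseable into the Faithfulness model
score = judge.faithfulness(
    claim="The PTO balance is 12 days.",
    retrieval_context=["Employee nken has 12 days of PTO remaining."],
)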