ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
@@ -0,0 +1,601 @@
1
+ import requests
2
+ import os
3
+ import yaml
4
+ import json
5
+ import rich
6
+ import time
7
+ from pydantic import BaseModel
8
+ from typing import List, Generator, Dict, Tuple, Mapping, Any
9
+
10
+ from wxo_agentic_evaluation.type import (
11
+ ContentType,
12
+ Message,
13
+ ConversationalSearch,
14
+ ConversationalSearchCitations,
15
+ ConversationalSearchResultMetadata,
16
+ ConversationalConfidenceThresholdScore,
17
+ ConversationalSearchResults,
18
+ ConversationSearchMetadata,
19
+ )
20
+
21
+ from wxo_agentic_evaluation.llm_user import LLMUser
22
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
23
+ from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
24
+ from wxo_agentic_evaluation.arg_configs import TestConfig
25
+ from wxo_agentic_evaluation.service_instance import tenant_setup
26
+ from wxo_agentic_evaluation.utils.utils import is_saas_url
27
+
28
+
29
def is_end(user_input: Message):
    """Return True when the simulated user's message signals the end of the chat.

    The LLM user emits the sentinel token "END" when it decides the story is
    complete; any message containing that token (after stripping whitespace)
    terminates the conversation loop.
    """
    return "END" in user_input.content.strip()
33
+
34
+
35
def is_transfer_response(step_detail: Dict):
    """Return True for a tool_response step produced by an agent-to-agent transfer.

    Transfer steps are identified by a tool name with the "transfer_to_"
    prefix; the "name" key is only consulted when the step is a tool_response.
    """
    return step_detail["type"] == "tool_response" and step_detail[
        "name"
    ].startswith("transfer_to_")
41
+
42
+
43
class CallTracker(BaseModel):
    """Accumulates per-event latencies (seconds) observed while streaming a turn.

    Each list holds the elapsed time between consecutive stream events of the
    given kind, as measured with ``time.time()`` deltas in
    ``WXOInferenceBackend.stream_messages``.
    """

    # Latency preceding each tool-call event.
    tool_call: List[float] = []
    # Latency preceding each tool-response event.
    tool_response: List[float] = []
    # Latency preceding any other (text / search) event.
    generic: List[float] = []
47
+
48
+
49
class WXOClient:
    """Minimal HTTP client for a watsonx Orchestrate service endpoint."""

    def __init__(self, service_url, api_key):
        self.service_url = service_url
        self.api_key = api_key

    def _get_headers(self) -> dict:
        """Return auth headers; an empty dict when no API key is configured."""
        if not self.api_key:
            return {}
        return {"Authorization": f"Bearer {self.api_key}"}

    def post(self, payload: dict, path: str, stream=False):
        """POST ``payload`` as JSON to ``path`` relative to the service URL."""
        return requests.post(
            url=f"{self.service_url}/{path}",
            headers=self._get_headers(),
            json=payload,
            stream=stream,
        )

    def get(self, path: str, params: dict = None):
        """GET ``path`` relative to the service URL with optional query params."""
        return requests.get(
            f"{self.service_url}/{path}",
            params=params,
            headers=self._get_headers(),
        )
69
+
70
+
71
class WXOInferenceBackend:
    """Drives conversations against a watsonx Orchestrate (WXO) agent.

    Wraps a ``WXOClient`` and translates the service's run/stream API into
    lists of ``Message`` objects, recording inter-event latencies into a
    ``CallTracker``.
    """

    def __init__(self, wxo_client):
        self.wxo_client = wxo_client
        # SaaS deployments serve the API under a versioned "/v1" prefix.
        self.enable_saas_mode = is_saas_url(wxo_client.service_url)

    def run(self, user_input: Message, agent_name, thread_id=None):
        """Send one message (non-streaming) and return the resulting thread id.

        Raises ``requests.HTTPError`` for error responses.
        """
        agent_id = self.get_agent_id(agent_name)
        payload = {"message": user_input.model_dump(), "agent_id": agent_id}
        if thread_id:
            payload["thread_id"] = thread_id

        if self.enable_saas_mode:
            path = "/v1/orchestrate/runs"
        else:
            path = "/orchestrate/runs"

        response: requests.Response = self.wxo_client.post(payload, path)

        if int(response.status_code) == 200:
            result = response.json()
            return result["thread_id"]
        else:
            response.raise_for_status()

    def _stream_events(
        self, user_input: Message, agent_name: str, thread_id=None
    ) -> Generator[Dict, None, None]:
        """Yield parsed JSON events from a streaming run request."""
        agent_id = self.get_agent_id(agent_name)
        payload = {"message": user_input.model_dump(), "agent_id": agent_id}
        if thread_id:
            payload["thread_id"] = thread_id

        if self.enable_saas_mode:
            path = "/v1/orchestrate/runs?stream=true"
        else:
            path = "/orchestrate/runs?stream=true"

        response: requests.Response = self.wxo_client.post(payload, path, stream=True)
        # NOTE: removed a redundant function-local `import json`; the module
        # already imports json at the top level.
        for chunk in self._parse_events(response):
            chunk = json.loads(chunk.strip())
            yield chunk

    def parse_conversational_search_response(
        self,
        conversational_search: Mapping[str, Any],
        metadata: ConversationSearchMetadata,
    ) -> ConversationalSearch:
        """Convert a raw conversational-search payload into a ``ConversationalSearch``.

        Parses citations and search results into their typed models and
        attaches ``metadata`` (the originating tool_call_id).
        """

        def parse_citations():
            # Citations may omit optional fields; default strings keep the model valid.
            citations = conversational_search["citations"]
            parsed_citations = []
            for citation in citations:
                c = ConversationalSearchCitations(
                    url=citation.get("url", ""),
                    body=citation.get("body", ""),
                    text=citation.get("text", ""),
                    title=citation.get("title", ""),
                    range_start=citation.get("range_start"),
                    range_end=citation.get("range_end"),
                    search_result_idx=citation.get("search_result_idx"),
                )
                parsed_citations.append(c)

            return parsed_citations

        def parsed_search_results():
            search_results = conversational_search["search_results"]
            parsed_search_results = []
            for result in search_results:
                result_metadata = result.get("result_metadata", {})
                result_metadata = ConversationalSearchResultMetadata(
                    score=result_metadata.get("score"),
                    document_retrieval_source=result_metadata.get(
                        "document_retrieval_source"
                    ),
                )
                c = ConversationalSearchResults(
                    url=result.get("url", ""),
                    body=result.get("body", ""),
                    title=result.get("title", ""),
                    result_metadata=result_metadata,
                )
                parsed_search_results.append(c)

            return parsed_search_results

        citations = parse_citations()
        retrieval_context = parsed_search_results()
        citations_title = conversational_search.get("citations_title", "")
        response_length_option = conversational_search.get("response_length_option", "")
        text = conversational_search.get("text", "")

        confidence_scores = ConversationalConfidenceThresholdScore(
            **conversational_search.get("confidence_scores")
        )
        response_type = conversational_search.get("response_type")
        # should always be conversational_search
        assert response_type == ContentType.conversational_search

        conversational_search = ConversationalSearch(
            metadata=metadata,
            response_type=response_type,
            text=text,
            citations=citations,
            search_results=retrieval_context,
            citations_title=citations_title,
            confidence_scores=confidence_scores,
            response_length_option=response_length_option,
        )

        return conversational_search

    def stream_messages(
        self,
        user_input: Message,
        agent_name: str,
        call_tracker: CallTracker,
        thread_id=None,
    ) -> Tuple[List[Message], str, List[ConversationalSearch]]:
        """Stream one user turn and collect the agent's messages.

        Returns ``(messages, thread_id, conversational_search_data)``.
        Latencies between consecutive events are appended to ``call_tracker``.
        If an event arrives with neither a delta nor a message payload, the
        turn is recovered by re-fetching the thread's messages.
        """
        recover = False
        messages = list()
        conversational_search_data = []

        start_time = time.time()
        for chunk in self._stream_events(user_input, agent_name, thread_id):

            event = chunk.get("event", "")
            if _thread_id := chunk.get("data", {}).get("thread_id"):
                thread_id = _thread_id
            if delta := chunk.get("data", {}).get("delta"):
                role = delta["role"]
                if step_details := delta.get("step_details"):
                    # Skip agent-to-agent transfer responses entirely.
                    if any(
                        is_transfer_response(step_detail)
                        for step_detail in step_details
                    ):
                        continue
                    for idx, step_detail in enumerate(step_details):
                        if step_detail["type"] == "tool_calls":
                            # in step details, we could have [tool_response, tool_call]
                            # in this case, we skip since we already capture the tool call
                            if idx == 1:
                                continue

                            content_type = ContentType.tool_call
                            for tool in step_detail["tool_calls"]:
                                tool_json = {"type": "tool_call"}
                                tool_json.update(tool)
                                content = json.dumps(tool_json)
                                messages.append(
                                    Message(
                                        role=role,
                                        content=content,
                                        type=content_type,
                                        event=event,
                                    )
                                )
                                end_time = time.time()
                                call_tracker.tool_call.append(end_time - start_time)
                                start_time = end_time
                        elif step_detail["type"] == "tool_call":
                            # in step details, we could have [tool_response, tool_call]
                            # in this case, we skip since we already capture the tool call
                            if idx == 1:
                                continue
                            content_type = ContentType.tool_call
                            content = json.dumps(step_detail)
                            messages.append(
                                Message(
                                    role=role,
                                    content=content,
                                    type=content_type,
                                    event=event,
                                )
                            )
                            end_time = time.time()
                            call_tracker.tool_call.append(end_time - start_time)
                            start_time = end_time
                        elif step_detail["type"] == "tool_response":
                            content = json.dumps(step_detail)
                            content_type = ContentType.tool_response
                            messages.append(
                                Message(
                                    role=role,
                                    content=content,
                                    type=content_type,
                                    event=event,
                                )
                            )
                            end_time = time.time()
                            call_tracker.tool_response.append(end_time - start_time)
                            start_time = end_time
                elif content_field := delta.get("content"):
                    for val in content_field:
                        response_type = val["response_type"]
                        # TODO: is this ever hit? the event name is "message.created", and it seems the event should be "message.delta"
                        if (
                            response_type == ContentType.text
                            and chunk["event"] == "message_created"
                        ):
                            # BUG FIX: `chunk=event` used to be passed as a
                            # keyword to list.append(), which would raise
                            # TypeError if this branch were ever reached.
                            # The event now goes on the Message itself.
                            messages.append(
                                Message(
                                    role=role,
                                    content=val["text"],
                                    type=ContentType.text,
                                    event=event,
                                )
                            )
                            end_time = time.time()
                            call_tracker.generic.append(end_time - start_time)
                            start_time = end_time

            # NOTE: The event here that is parsed is part of the "message.created" event
            elif message := chunk.get("data", {}).get("message"):
                role = message["role"]
                for content in message["content"]:
                    if (
                        content["response_type"]
                        == ContentType.conversational_search
                    ):
                        end_time = time.time()
                        call_tracker.generic.append(end_time - start_time)
                        start_time = end_time

                        """ This is under the assumption the flow is (tool call -> tool response -> response back to user).
                        In other words, the tool response is not fed back in to the agent.
                        We get the previous message and extract the `tool_call_id`.

                        NOTE: The previous message is a tool call because how we parse the event stream.
                        NOTE: The conversational search response event does not have a 'tool call id' which can be used to associate with the 'conversational search response'.
                        """

                        last_message = json.loads(messages[-1].content)
                        tool_call_id = last_message.get("tool_call_id", None)
                        assert tool_call_id is not None
                        conversational_search_metadata = ConversationSearchMetadata(
                            tool_call_id=tool_call_id
                        )
                        conversational_search = (
                            self.parse_conversational_search_response(
                                conversational_search=content,
                                metadata=conversational_search_metadata,
                            )
                        )
                        conversational_search_data.append(conversational_search)
                        messages.append(
                            Message(
                                role=role,
                                content=content["text"],
                                type=ContentType.conversational_search,
                                conversational_search_metadata=conversational_search_metadata,
                                event=event,
                            )
                        )
                    if content["response_type"] == ContentType.text:
                        messages.append(
                            Message(
                                role=role,
                                content=content["text"],
                                type=ContentType.text,
                                event=chunk["event"],
                            )
                        )
                        end_time = time.time()
                        call_tracker.generic.append(end_time - start_time)
                        start_time = end_time
            else:
                # Exit the loop if we lose the thread_id
                recover = True
                break

        if recover and (thread_id is not None):
            rich.print(
                "🔬 [bold][magenta]INFO:[/magenta][/bold]",
                f"Attempting to recover messages from thread_id {thread_id}",
            )
            # If we lose the thread_id, we need to wait for a bit to allow the message to come through
            # before attempting to recover the messages.
            time.sleep(10)
            messages = self.recover_messages(thread_id)
            rich.print(
                "🔬 [bold][magenta]INFO:[/magenta][/bold]",
                f"Recovered {len(messages)} messages from thread_id {thread_id}",
            )

        return messages, thread_id, conversational_search_data

    def _parse_events(
        self, stream: Generator[bytes, None, None]
    ) -> Generator[bytes, None, None]:
        """Re-chunk a raw byte stream into individual SSE-style event payloads.

        Buffers bytes until an event terminator (blank line / newline) is seen,
        then yields the accumulated payload; any trailing partial data is
        yielded at the end of the stream.
        """
        data = b""
        for chunk in stream:
            for line in chunk.splitlines(True):
                data += line
                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n", b"\n")):
                    yield data
                    data = b""
        if data:
            yield data

    def recover_messages(self, thread_id: str) -> List[Message]:
        """Re-fetch a thread and return only the messages after the last user turn."""
        messages = self.get_messages(thread_id)
        return self._get_messages_after_last_user(messages)

    def get_messages(self, thread_id) -> List[Message]:
        """Fetch and flatten all messages recorded for ``thread_id``.

        Raises ``requests.HTTPError`` for error responses.
        """
        if self.enable_saas_mode:
            path = f"v1/orchestrate/threads/{thread_id}/messages"
        else:
            path = f"threads/{thread_id}/messages"
        response = self.wxo_client.get(path)
        # BUG FIX: previously a non-200 status that raise_for_status() does
        # not treat as an error (e.g. a redirect) fell through and left
        # `result` unbound; raising first makes the failure explicit.
        response.raise_for_status()
        result = response.json()

        messages = []
        for entry in result:
            tool_call_id = None
            if step_history := entry.get("step_history"):
                for step_message in step_history:
                    role = step_message["role"]
                    if step_details := step_message.get("step_details"):
                        for step_detail in step_details:
                            if step_detail["type"] == "tool_calls":
                                content_type = ContentType.tool_call
                                for tool in step_detail["tool_calls"]:
                                    tool_json = {"type": "tool_call"}
                                    tool_json.update(tool)
                                    content = json.dumps(tool_json)
                                    messages.append(
                                        Message(
                                            role=role,
                                            content=content,
                                            type=content_type,
                                        )
                                    )
                            elif step_detail["type"] == "tool_call":
                                tool_call_id = step_detail["tool_call_id"]
                                content_type = ContentType.tool_call
                                content = json.dumps(step_detail)
                                messages.append(
                                    Message(
                                        role=role, content=content, type=content_type
                                    )
                                )
                            else:
                                # Anything else in the step history is treated
                                # as a tool response.
                                content = json.dumps(step_detail)
                                content_type = ContentType.tool_response
                                messages.append(
                                    Message(
                                        role=role, content=content, type=content_type
                                    )
                                )
            if content_field := entry.get("content"):
                role = entry["role"]
                for val in content_field:
                    if val["response_type"] == ContentType.text:
                        messages.append(
                            Message(
                                role=role, content=val["text"], type=ContentType.text
                            )
                        )
                    if val["response_type"] == ContentType.conversational_search:
                        conversational_search_metadata = ConversationSearchMetadata(
                            tool_call_id=tool_call_id
                        )
                        messages.append(
                            Message(
                                role=role,
                                content=val["text"],
                                type=ContentType.text,
                                conversational_search_metadata=conversational_search_metadata,
                            )
                        )

        return messages

    @staticmethod
    def _get_messages_after_last_user(messages: List[Message]) -> List[Message]:
        """Return the suffix of ``messages`` strictly after the last user message.

        Returns the full list when no user message is present.
        """
        for i in range(len(messages) - 1, -1, -1):
            if messages[i].role == "user":
                return messages[i + 1 :]
        return messages

    def get_agent_id(self, agent_name: str):
        """Resolve an agent's id from its display name.

        Raises ``requests.HTTPError`` on an error response, or ``Exception``
        when no agent with that name exists.
        """
        if self.enable_saas_mode:
            path = "v1/orchestrate/agents"
        else:
            path = "orchestrate/agents"

        response = self.wxo_client.get(path)
        # BUG FIX: previously a non-200, non-error status silently returned
        # None; raising first guarantees an explicit failure.
        response.raise_for_status()

        for agent in response.json():
            if agent.get("name", "") == agent_name:
                return agent.get("id")

        raise Exception(f"Agent with name {agent_name} not found.")
479
+
480
+
481
class EvaluationController:
    """Drives a full multi-turn conversation between a (simulated or manual)
    user and a WXO agent, collecting the transcript and latency stats."""

    def __init__(
        self,
        wxo_inference_backend: WXOInferenceBackend,
        llm_user: LLMUser,
        config: TestConfig,
    ):
        self.wxo_inference_backend = wxo_inference_backend
        self.llm_user = llm_user
        self.config = config

    def run(
        self,
        task_n,
        story,
        agent_name: str,
        starting_user_input: str = None,
        max_steps: int = 20,
    ) -> Tuple[List[Message], CallTracker, List[ConversationalSearch]]:
        """Run a conversation for up to ``max_steps`` user turns.

        Args:
            task_n: task index, used only for log labelling.
            story: the scenario fed to the LLM user to generate turns.
            agent_name: name of the WXO agent to converse with.
            starting_user_input: optional fixed first user message.
            max_steps: turn limit (was hard-coded to 20; now configurable
                with the same default, so existing callers are unaffected).

        Returns:
            ``(conversation_history, call_tracker, conversational_search_data)``.
            NOTE: the original annotation claimed ``List[CallTracker]`` but a
            single ``CallTracker`` is returned; the annotation is corrected.
        """
        step = 0
        thread_id = None
        conversation_history: List[Message] = []
        conversational_search_history_data = []
        call_tracker = CallTracker()
        while step < max_steps:

            if step == 0 and starting_user_input:
                user_input = Message(
                    role="user", content=starting_user_input, type=ContentType.text
                )
            else:
                # Idiom fix: truthiness instead of `== True`.
                if self.config.enable_manual_user_input:
                    content = input(
                        "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
                    )
                    user_input = Message(
                        role="user", content=content, type=ContentType.text
                    )
                else:  # simulated LLM user
                    user_input = self.llm_user.generate_user_input(
                        story, conversation_history
                    )
            if self.config.enable_verbose_logging:
                rich.print(
                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                    user_input.content,
                )
            # The user signals completion with the END sentinel.
            if is_end(user_input):
                break
            conversation_history.append(user_input)
            messages, thread_id, conversational_search_data = (
                self.wxo_inference_backend.stream_messages(
                    user_input,
                    agent_name=agent_name,
                    thread_id=thread_id,
                    call_tracker=call_tracker,
                )
            )
            if self.config.enable_verbose_logging:
                for message in messages:
                    rich.print(
                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
                        message.content,
                    )
            conversation_history.extend(messages)
            conversational_search_history_data.extend(conversational_search_data)
            step += 1
        return conversation_history, call_tracker, conversational_search_history_data
545
+
546
+
547
def get_wxo_client(service_url: str, token: str):
    """Return a ``WXOClient`` authenticated with ``token`` against ``service_url``."""
    return WXOClient(service_url=service_url, api_key=token)
550
+
551
+
552
def get_wxo_inference_backend(
    service_url: str, tenant_name: str, token: str = None
) -> WXOInferenceBackend:
    """Build a ``WXOInferenceBackend``, minting a tenant token when none is given."""
    if not token:
        token = tenant_setup(service_url, tenant_name)
    return WXOInferenceBackend(wxo_client=get_wxo_client(service_url, token))
560
+
561
+
562
+ if __name__ == "__main__":
563
+ wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
564
+ llm_user = LLMUser(
565
+ wai_client=wai_client,
566
+ template=LlamaUserTemplateRenderer(
567
+ "src/wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2"
568
+ ),
569
+ user_response_style=None,
570
+ )
571
+ auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
572
+ with open(auth_config_path, "r") as f:
573
+ auth_config = yaml.safe_load(f)
574
+ tenant_name = "local"
575
+ token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
576
+
577
+ wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
578
+ inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
579
+ config = TestConfig(
580
+ test_paths=[],
581
+ output_dir="./wxo_agentic_evaluation/results",
582
+ auth_config=auth_config,
583
+ wxo_lite_version="0.1.3",
584
+ )
585
+ evaluation_controller = EvaluationController(
586
+ wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
587
+ )
588
+ history, _, _ = evaluation_controller.run(
589
+ 0,
590
+ "Your username is nken and you want to find out the timeoff schedule of your reports from 20250101 o 202505t",
591
+ agent_name="hr_agent",
592
+ )
593
+ # starting_user_input="my username is nken, i want to know the timeoff schedule for my reports from 20250101 to 202505")
594
+
595
+ result = list()
596
+ for message in history:
597
+ result.append(message.model_dump())
598
+
599
+ os.makedirs("./wxo_agentic_evaluation/results", exist_ok=True)
600
+ with open("./wxo_agentic_evaluation/results/messages.json", "w") as f:
601
+ json.dump(result, f)
@@ -0,0 +1,39 @@
1
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
2
+ from wxo_agentic_evaluation.prompt.template_render import (
3
+ KeywordMatchingTemplateRenderer,
4
+ SemanticMatchingTemplateRenderer,
5
+ )
6
+ from typing import List
7
+
8
+
9
class LLMMatcher:
    """LLM-backed matchers for comparing agent responses against expectations."""

    def __init__(
        self,
        llm_client: WatsonXProvider,
        keyword_template: KeywordMatchingTemplateRenderer,
        semantic_template: SemanticMatchingTemplateRenderer,
    ):
        self.llm_client = llm_client
        self.keyword_template = keyword_template
        self.semantic_template = semantic_template

    def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
        """Ask the LLM whether ``response_text`` contains the given keywords.

        An empty keyword list means "no keyword constraint": succeed
        immediately without consulting the LLM.
        """
        if not keywords:
            return True
        prompt = self.keyword_template.render(
            keywords_text="\n".join(keywords), response_text=response_text
        )
        return self._verdict(prompt)

    def semantic_match(self, prediction: str, ground_truth: str) -> bool:
        """Ask the LLM whether ``prediction`` semantically matches ``ground_truth``."""
        prompt = self.semantic_template.render(
            expected_text=ground_truth, actual_text=prediction
        )
        return self._verdict(prompt)

    def _verdict(self, prompt: str) -> bool:
        """Query the LLM and interpret a leading "true" as a positive match."""
        output = self.llm_client.query(prompt)
        return output["generated_text"].strip().lower().startswith("true")
@@ -0,0 +1,47 @@
1
+ from typing import List
2
+ import json
3
+
4
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
5
+ from wxo_agentic_evaluation.prompt.template_render import (
6
+ FaithfulnessTemplateRenderer,
7
+ AnswerRelevancyTemplateRenderer,
8
+ )
9
+ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
10
+
11
+
12
class LLMJudge:
    """LLM-as-judge scorer for RAG quality metrics (faithfulness, relevancy)."""

    def __init__(
        self,
        llm_client: WatsonXProvider,
        faithfulness: FaithfulnessTemplateRenderer,
        answer_relevancy: AnswerRelevancyTemplateRenderer,
    ):
        self.llm_client = llm_client
        self.faithfulness_template = faithfulness
        self.answer_relevancy_template = answer_relevancy

    # TODO: implement callable, and implement decorator to retry the LLM call
    def faithfulness(self, claim, retrieval_context: List[str]) -> Faithfulness:
        """Judge whether ``claim`` is supported by the joined retrieval context."""
        context_text = "\n".join(retrieval_context)
        prompt = self.faithfulness_template.render(
            claim=claim, retrieval_context=context_text
        )
        raw = self.llm_client.query(prompt)["generated_text"].strip().lower()
        # NOTE(review): lower-casing the raw JSON also lower-cases its string
        # values; presumably the judge output is case-insensitive — confirm.
        return Faithfulness.model_validate(json.loads(raw))

    def answer_relevancy(
        self, question: str, context: str, answer: str
    ) -> AnswerRelevancy:
        """Judge how relevant ``answer`` is to ``question`` given ``context``."""
        prompt = self.answer_relevancy_template.render(
            question=question, context=context, answer=answer
        )
        raw = self.llm_client.query(prompt)["generated_text"].strip().lower()
        return AnswerRelevancy(answer_relevancy=json.loads(raw))