ibm_watsonx_orchestrate_evaluation_framework-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ibm-watsonx-orchestrate-evaluation-framework has been flagged as a potentially problematic release.

Files changed (46)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
wxo_agentic_evaluation/evaluation_package.py
@@ -0,0 +1,518 @@
+ from typing import List
+ import json
+ import os
+ import rich
+
+ from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
+
+ from wxo_agentic_evaluation.type import (
+     ContentType,
+     Message,
+     EvaluationData,
+     ToolCallAndRoutingMetrics,
+     EventTypes,
+     ConversationalSearch,
+     ExtendedMessage,
+ )
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+ from wxo_agentic_evaluation.metrics.metrics import (
+     KnowledgeBaseMetrics,
+     KeywordSemanticSearchMetric,
+ )
+ from wxo_agentic_evaluation.prompt.template_render import (
+     KeywordMatchingTemplateRenderer,
+     SemanticMatchingTemplateRenderer,
+     FaithfulnessTemplateRenderer,
+     AnswerRelevancyTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.llm_matching import LLMMatcher
+ from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+ from wxo_agentic_evaluation import __file__
+
+ root_dir = os.path.dirname(__file__)
+ KEYWORD_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "keyword_matching_prompt.jinja2")
+ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_matching_prompt.jinja2")
+ FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
+ ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
+
+
+ class EvaluationPackage:
+     def __init__(
+         self,
+         test_case_name,
+         ground_truth,
+         messages,
+         conversational_search_data: List[ConversationalSearch] = None,
+         is_analyze_run=False,
+     ):
+         self.tool_dictionary = {
+             goal_detail.name: goal_detail
+             for goal_detail in ground_truth.goal_details
+             if goal_detail.type == ContentType.tool_call
+         }
+         self.text_list = [
+             goal_detail
+             for goal_detail in ground_truth.goal_details
+             if goal_detail.type == ContentType.text
+         ]
+         self.messages = messages
+         self.conversational_search_data = conversational_search_data
+         self.validate_ground_truth(ground_truth, test_case_name)
+         self.ground_truth = ground_truth
+         self.test_case_name = test_case_name
+         self.is_analyze_run = is_analyze_run
+
+         self.matcher = LLMMatcher(
+             llm_client=WatsonXProvider(
+                 model_id="meta-llama/llama-3-405b-instruct",
+                 llm_decode_parameter={
+                     "min_new_tokens": 0,
+                     "decoding_method": "greedy",
+                     "max_new_tokens": 10,
+                 },
+             ),
+             keyword_template=KeywordMatchingTemplateRenderer(
+                 KEYWORD_MATCHING_PROMPT_PATH
+             ),
+             semantic_template=SemanticMatchingTemplateRenderer(
+                 SEMANTIC_MATCHING_PROMPT_PATH
+             ),
+         )
+         self.rag_llm_as_a_judge = LLMJudge(
+             llm_client=WatsonXProvider(
+                 model_id="meta-llama/llama-3-405b-instruct",
+                 llm_decode_parameter={
+                     "min_new_tokens": 0,
+                     "decoding_method": "greedy",
+                     "max_new_tokens": 4096,
+                 },
+             ),
+             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
+             answer_relevancy=AnswerRelevancyTemplateRenderer(
+                 ANSWER_RELEVANCY_PROMPT_PATH
+             ),
+         )
+
+     @staticmethod
+     def is_topological_sort(graph, ordering):
+         position = {node: i for i, node in enumerate(ordering)}
+         for u in graph:
+             for v in graph[u]:
+                 if u not in position or v not in position:
+                     return False
+                 if position[u] >= position[v]:
+                     return False
+         return True
+
+     @staticmethod
+     def validate_ground_truth(ground_truth, test_case_name):
+         if len(ground_truth.agent) == 0:
+             raise ValueError(
+                 f"No agent provided in the ground truth. test_case_name: {test_case_name}"
+             )
+
+         if len(ground_truth.goals) == 0:
+             raise ValueError(
+                 f"No goals provided in the ground truth. test_case_name: {test_case_name}"
+             )
+
+         if len(ground_truth.goal_details) == 0:
+             raise ValueError(
+                 f"No goal details provided in the ground truth. test_case_name: {test_case_name}"
+             )
+
+         if len(ground_truth.story) == 0:
+             raise ValueError(
+                 f"No story provided in the ground truth. test_case_name: {test_case_name}"
+             )
+
+         goals = set()
+
+         for key, value in ground_truth.goals.items():
+             goals.add(key)
+             if isinstance(value, list):
+                 goals.update(value)
+             else:
+                 raise ValueError(
+                     f"The goal '{key}' is not mapping to a list: {value}. test_case_name: {test_case_name}"
+                 )
+
+         for goal_detail in ground_truth.goal_details:
+             if goal_detail.name not in goals:
+                 raise ValueError(
+                     f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
+                 )
+             if goal_detail.name == "summarize":
+                 if len(goal_detail.keywords) == 0 and len(goal_detail.response) == 0:
+                     rich.print(
+                         f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
+                     )
+                 elif len(goal_detail.response) == 0:
+                     rich.print(
+                         f"⚠️‼️ [bold][yellow] WARNING:[/yellow][/bold] Summarize goal has no final response. test_case_name: {test_case_name}"
+                     )
+         if len(ground_truth.goal_details) != len(goals):
+             raise ValueError(
+                 f"Goal details count does not match the goals count: {len(ground_truth.goal_details)} != {len(goals)}. test_case_name: {test_case_name}"
+             )
+
+     def _print_kw_sm(
+         self, keyword_semantic_match_list: List[KeywordSemanticSearchMetric]
+     ):
+         """Prints the keyword match/mismatch, and semantic match/mismatch results.
+         Right now only successful matches are printed.
+         """
+
+         for keyword_semantic_match in keyword_semantic_match_list:
+             if (
+                 keyword_semantic_match.semantic_match
+                 and keyword_semantic_match.keyword_match
+             ):
+                 rich.print(
+                     f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
+                 )
+
+     def traverse(self):
+         labelled_messages = []
+         message_outcomes = []
+         labelled_messages_without_text_step = []
+         # Counters for tool-calling related metrics
+         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+             total_tool_calls=0,
+             expected_tool_calls=0,
+             relevant_tool_calls=0,
+             correct_tool_calls=0,
+             total_routing_calls=0,
+             expected_routing_calls=0,
+         )
+         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
+
+         for message in self.messages:
+             if message.type == ContentType.tool_call:
+                 tool_call_and_routing_metrics.total_tool_calls += 1
+                 msg_tool_call = json.loads(message.content)
+
+                 # Check for transfer_* calls
+                 if msg_tool_call["name"].startswith("transfer_to_"):
+                     tool_call_and_routing_metrics.total_routing_calls += 1
+
+                 # evaluating more than once is fine
+                 # agent could make repeated calls with the same function signature
+                 # in our is_topological_sort algorithm, the most recent occurrence is evaluated
+                 matching_goal_details = [
+                     goal_detail
+                     for goal_detail in self.tool_dictionary.values()
+                     if goal_detail.tool_name == msg_tool_call["name"]
+                 ]
+                 if len(matching_goal_details) > 0:
+                     # tool name matches one of the expected tool names, as defined in the ground truth
+                     tool_call_and_routing_metrics.relevant_tool_calls += 1
+                     found = False
+                     possible_ground_truth = []
+                     for goal_detail in matching_goal_details:
+                         if (
+                             is_transfer := msg_tool_call["name"].startswith(
+                                 "transfer_to_"
+                             )
+                         ) or msg_tool_call["args"] == goal_detail.args:
+                             labelled_messages.append(goal_detail.name)
+                             labelled_messages_without_text_step.append(goal_detail.name)
+                             if is_transfer:
+                                 tool_call_and_routing_metrics.expected_routing_calls += 1
+                             else:
+                                 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                                 tool_call_and_routing_metrics.correct_tool_calls += 1
+                             found = True
+                             message_outcome = ExtendedMessage(message=message)
+                             message_outcomes.append(message_outcome)
+                             break
+                         else:
+                             possible_ground_truth.append(goal_detail.args)
+
+                     if not found:
+                         message_outcome = ExtendedMessage(message=message)
+                         message_outcome.reason = {
+                             "reason": "incorrect parameter",
+                             "actual": msg_tool_call["args"],
+                             "expected": possible_ground_truth,
+                         }
+                         message_outcomes.append(message_outcome)
+                         rich.print(
+                             f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
+                             f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+                         )
+                         labelled_messages.append(
+                             msg_tool_call["name"] + "_WRONG_PARAMETERS"
+                         )
+                 else:
+                     # TO-DO: we need a way to backtrack agent/tool pairs.
+                     # if we route to an agent without the right toolset, that makes it a routing error.
+                     # this will remove the need to label routing calls explicitly
+                     if not msg_tool_call["name"].startswith("transfer_to_"):
+                         rich.print(
+                             f"[red][ERROR] Wrong function call: {msg_tool_call['name']}[/red]"
+                         )
+                         labelled_messages.append(
+                             msg_tool_call["name"] + "_WRONG_FUNCTION_CALL"
+                         )
+                         message_outcome = ExtendedMessage(message=message)
+                         message_outcome.reason = {"reason": "irrelevant tool call"}
+                         message_outcomes.append(message_outcome)
+
+             elif message.type == ContentType.tool_response:
+                 found = False
+                 for keyword in ERROR_KEYWORDS:
+                     if keyword in message.content.lower():
+                         message_outcome = ExtendedMessage(message=message)
+                         message_outcome.reason = {"reason": "runtime error"}
+                         message_outcomes.append(message_outcome)
+                         found = True
+                         break
+                 if not found:
+                     message_outcome = ExtendedMessage(message=message)
+                     message_outcomes.append(message_outcome)
+             else:
+                 message_outcome = ExtendedMessage(message=message)
+                 message_outcomes.append(message_outcome)
+         assistant_responses = [
+             message
+             for message in self.messages
+             if message.event == EventTypes.message_created
+             and message.role == "assistant"
+         ]
+         keyword_semantic_list = []
+         for message in assistant_responses:
+             for goal_detail in self.text_list:
+                 if goal_detail.name not in labelled_messages:
+                     keyword_match: bool = self.matcher.keywords_match(
+                         message.content, goal_detail.keywords
+                     )
+                     semantic_match: bool = self.matcher.semantic_match(
+                         message.content, goal_detail.response
+                     )
+                     keyword_semantic_match = KeywordSemanticSearchMetric(
+                         keyword_match=keyword_match,
+                         semantic_match=semantic_match,
+                         message=message.content,
+                         goal_detail=goal_detail.name,
+                     )
+                     if keyword_match and semantic_match:
+                         labelled_messages.append(goal_detail.name)
+                         keyword_semantic_list.append(keyword_semantic_match)
+                         break
+
+         # only prints when the semantic and keyword matched
+         self._print_kw_sm(keyword_semantic_list)
+
+         return (
+             labelled_messages,
+             labelled_messages_without_text_step,
+             keyword_semantic_list,
+             tool_call_and_routing_metrics,
+             message_outcomes,
+         )
+
+     def _is_text_match(
+         self, keyword_semantic_match_list: List[KeywordSemanticSearchMetric]
+     ):
+         if len(self.text_list) == 0:
+             return "NA"
+         elif len(self.text_list) == len(keyword_semantic_match_list):
+             return "Summary Matched"
+         else:
+             return "Summary MisMatched"
+
+     def generate_summary(self):
+         llm_steps = 0
+         total_step = 0
+         (
+             labelled_messages,
+             labelled_messages_without_text_step,
+             matches,
+             metrics,
+             message_with_reasons,
+         ) = self.traverse()
+         if self.is_analyze_run:
+             print(labelled_messages)
+         wrong_call_count = sum(
+             1 for msg in labelled_messages if "_WRONG_FUNCTION_CALL" in msg
+         )
+         is_success = self.is_topological_sort(
+             self.ground_truth.goals, labelled_messages
+         )
+         match = self._is_text_match(matches)
+
+         for message in self.messages:
+             if message.role == "assistant" and (
+                 message.type
+                 in (
+                     ContentType.text,
+                     ContentType.conversational_search,
+                     ContentType.tool_call,
+                 )
+             ):
+                 llm_steps += 1
+             total_step += 1
+
+         knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
+         # TO-DO: the table is not printing properly anymore with the new columns introduced
+         # we need to introduce a separate table for these.
+         data = {
+             "Dataset": self.test_case_name,
+             "Total Step": total_step,
+             "Agent Step": llm_steps,
+             "Ground Truth Calls": len(self.tool_dictionary),
+             "Wrong Function Calls": wrong_call_count,
+             # "Bad Calls": 0,
+             "Wrong Parameters": sum(
+                 1 for msg in labelled_messages if "_WRONG_PARAMETERS" in msg
+             ),
+             "Wrong Routing Calls": sum(
+                 1 for msg in labelled_messages if "_WRONG_ROUTING_CALL" in msg
+             ),
+             "Text Match": match,
+             "Journey Success": is_success,
+             # "Tool Call Accuracy": metrics.tool_call_accuracy,
+             # "Tool Call Relevancy": metrics.tool_call_relevancy,
+             # "Agent Routing Accuracy": metrics.agent_routing_accuracy
+         }
+
+         return (
+             data,
+             matches,
+             knowledge_base_metric_summary,
+             message_with_reasons,
+             metrics,
+         )
+
+     def _get_messages_by_role_before_cs(
+         self, idx_conversational_search: int, role: str, type: str = "text"
+     ):
+         """Utility method to filter `self.messages` for messages with a given role
+         that occur before the conversational search message index.
+         """
+
+         filtered_messages = [
+             message
+             for idx, message in enumerate(self.messages)
+             if idx < idx_conversational_search
+             and message.role == role
+             and message.type == type
+         ]
+
+         return filtered_messages
+
+     def _weave_user_assistant_messages(self, user_messages, assistant_messages):
+         weave = []
+         for user, assistant in zip(user_messages, assistant_messages):
+             msg = f"User: {user.content}\nAssistant: {assistant.content}\n\n"
+             weave.append(msg)
+
+         return " ".join(weave)
+
+     def _find_tool_call_name(self, tool_call_id):
+         for message in self.messages:
+             if message.type == ContentType.tool_call:
+                 content = json.loads(message.content)
+                 id = content.get("tool_call_id", "")
+                 if id == tool_call_id:
+                     return content.get("name")
+
+         raise Exception(f"'{tool_call_id}' not found in messages")
+
+     def generate_knowledge_base_metric_summary(self) -> List[KnowledgeBaseMetrics]:
+         idx_conv_search = [
+             idx
+             for idx, message in enumerate(self.messages)
+             if message.type == ContentType.conversational_search
+         ]
+         metrics = []
+
+         for search_index in idx_conv_search:
+             user_messages = self._get_messages_by_role_before_cs(
+                 role="user", idx_conversational_search=search_index
+             )
+             assistant_messages = self._get_messages_by_role_before_cs(
+                 role="assistant",
+                 idx_conversational_search=search_index,
+                 type=ContentType.text,
+             )
+
+             context = self._weave_user_assistant_messages(
+                 user_messages, assistant_messages
+             )
+             most_recent_user_message = user_messages[-1]
+             search_message = self.messages[search_index]
+
+             # find the conversational search metadata associated with this message
+             conversational_search_data = None
+             if self.conversational_search_data:
+                 for cs_metadata in self.conversational_search_data:
+                     if (
+                         search_message.conversational_search_metadata.tool_call_id
+                         == cs_metadata.metadata.tool_call_id
+                     ):
+                         conversational_search_data = cs_metadata
+
+             tool_name = self._find_tool_call_name(
+                 conversational_search_data.metadata.tool_call_id
+             )  # name of knowledge base
+
+             search_results = [
+                 result.body for result in conversational_search_data.search_results
+             ]
+             faithfulness = self.rag_llm_as_a_judge.faithfulness(
+                 conversational_search_data.text, search_results
+             )
+             answer_relevancy = self.rag_llm_as_a_judge.answer_relevancy(
+                 question=most_recent_user_message.content,
+                 context=context,
+                 answer=search_message.content,
+             )
+             knowledge_base_metrics = KnowledgeBaseMetrics(
+                 dataset_name=self.test_case_name,
+                 knowledge_base_name=tool_name,
+                 tool_call_id=search_message.conversational_search_metadata.tool_call_id,
+                 faithfulness=faithfulness,
+                 answer_relevancy=answer_relevancy,
+                 confidence_scores=conversational_search_data.confidence_scores,
+             )
+
+             metrics.append(knowledge_base_metrics)
+
+         return metrics
+
+
+ if __name__ == "__main__":
+     messages = []
+
+     with open(
+         "./benchmarks/workday_tools/concise/result/llama/messages/data18.messages.json",
+         "r",
+         encoding="utf-8",
+     ) as f:
+         temp = json.load(f)
+
+     for message in temp:
+         messages.append(Message.model_validate(message))
+
+     for message in messages:
+         if message.role == "user":
+             rich.print("[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content)
+         else:
+             rich.print("[orange3]WXO:[/orange3]", message.content)
+
+     with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
+         ground_truth = EvaluationData.model_validate(json.load(f))
+
+     evaluate_package = EvaluationPackage(
+         test_case_name="data1.messages.json",
+         ground_truth=ground_truth,
+         messages=messages,
+     )
+     print(evaluate_package.generate_summary())
+     # print(evaluate_package.traverse())
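
Note on the "Journey Success" figure above: `generate_summary` treats the ground-truth `goals` mapping as a dependency graph (goal -> goals that must come after it) and passes the run only if the labelled goal sequence from `traverse` respects every edge. A minimal standalone sketch of that check; the function body is copied from the diff, while the `goals` graph and the two orderings are hypothetical:

# Journey-success check from EvaluationPackage, as a free function.
def is_topological_sort(graph: dict, ordering: list) -> bool:
    position = {node: i for i, node in enumerate(ordering)}
    for u in graph:
        for v in graph[u]:
            if u not in position or v not in position:
                return False  # an expected goal never appeared in the run
            if position[u] >= position[v]:
                return False  # goals appeared out of order
    return True

# Hypothetical ground truth: "get_user" must precede "lookup_pto",
# and both must precede the final "summarize" text goal.
goals = {"get_user": ["lookup_pto", "summarize"], "lookup_pto": ["summarize"]}

print(is_topological_sort(goals, ["get_user", "lookup_pto", "summarize"]))  # True
print(is_topological_sort(goals, ["lookup_pto", "get_user", "summarize"]))  # False

Because only relative positions are compared, repeated or extra labelled steps do not break the check as long as every required goal appears in a valid order.
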
wxo_agentic_evaluation/external_agent/external_validate.py
@@ -0,0 +1,69 @@
+ from langchain_core.messages import AIMessageChunk, ToolCallChunk, BaseMessage, AIMessage, ToolMessage, HumanMessage
+ from langchain_openai.chat_models.base import _convert_message_to_dict, _convert_dict_to_message
+ from wxo_agentic_evaluation.external_agent.types import UniversalData
+ import yaml
+ import requests
+ from typing import Generator
+ import json
+
+
+ MESSAGES = [AIMessage(content="how can i help you"), HumanMessage("what's the holiday on June 13th in the US?"),
+             ToolMessage(content="{tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}", tool_call_id="11111"),
+             AIMessage(content="it's National Sewing Machine Day")]
+
+
+ class ExternalAgentValidation:
+     def __init__(self, credential, auth_scheme, service_url):
+         self.credential = credential
+         self.auth_scheme = auth_scheme
+         self.service_url = service_url
+
+     def get_auth_header(self):
+         if self.auth_scheme == "API_KEY":
+             header = {"x-api-key": self.credential}
+
+         elif self.auth_scheme == "BEARER_TOKEN":
+             header = {"Authorization": f"Bearer {self.credential}"}
+
+         else:
+             raise Exception(f"Auth scheme: {self.auth_scheme} is not supported")
+
+         return header
+
+     def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
+         # Accumulate SSE "data:" payloads; a blank-line terminator ends one event.
+         data = b''
+         for chunk in resp:
+             for line in chunk.splitlines(True):
+                 if line.startswith(b'data:'):
+                     line = line.replace(b'data:', b'')
+                 if line.strip() == b'[DONE]':
+                     return
+                 data += line
+                 if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
+                     yield data
+                     data = b''
+         if data:
+             yield data
+
+     def call_validation(self, input: str):
+         header = {"Content-Type": "application/json"}
+         header.update(self.get_auth_header())
+
+         messages = [_convert_message_to_dict(message=message) for message in MESSAGES]
+         messages.append(_convert_message_to_dict(HumanMessage(input)))
+
+         payload = {"messages": messages}
+
+         resp = requests.post(url=self.service_url, headers=header, json=payload, stream=True)
+         results = []
+         for json_str in self._parse_streaming_events(resp):
+             json_dict = None
+             try:
+                 json_dict = json.loads(json_str)
+                 UniversalData(**json_dict)
+                 results.append(json_dict)
+             except Exception as e:
+                 print(f"event parsing failed with {e}")
+                 raise e
+
+         return results
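
`call_validation` relies on `_parse_streaming_events` to turn the chunked SSE response body into one byte string per event: `data:` prefixes are stripped, `[DONE]` ends the stream, and a blank-line terminator flushes the buffered event. A self-contained sketch of that loop; the parsing logic mirrors the diff, while the sample `chunks` are hypothetical:

# SSE parsing from ExternalAgentValidation, as a free function,
# driven by an in-memory list of byte chunks instead of a live response.
def parse_streaming_events(resp):
    data = b''
    for chunk in resp:
        for line in chunk.splitlines(True):
            if line.startswith(b'data:'):
                line = line.replace(b'data:', b'')
            if line.strip() == b'[DONE]':
                return  # sentinel: end of stream
            data += line
            if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
                yield data  # blank line terminates one event
                data = b''
    if data:
        yield data  # flush any trailing partial event

chunks = [
    b'data: {"object": "thread.message.delta"}\n\n',
    b'data: {"object": "thread.run.step.delta"}\n\ndata: [DONE]\n\n',
]
for event in parse_streaming_events(chunks):
    print(event.strip())
# b'{"object": "thread.message.delta"}'
# b'{"object": "thread.run.step.delta"}'

Note that events may span chunk boundaries; the buffer is only flushed on a terminator, so a JSON payload split across two chunks is still yielded whole.
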
wxo_agentic_evaluation/external_agent/types.py
@@ -0,0 +1,65 @@
+ from pydantic import BaseModel
+ from typing import List, Union, Literal
+
+
+ class ThinkingStepDetails(BaseModel):
+     type: Literal["thinking"]
+     content: str
+
+
+ class ToolCall(BaseModel):
+     name: str
+     args: dict
+     id: str
+
+
+ class ToolCallsStepDetails(BaseModel):
+     type: Literal["tool_calls"]
+     tool_calls: List[ToolCall]
+
+
+ class ToolResponseStepDetails(BaseModel):
+     type: Literal["tool_response"]
+     content: str  # could also be List[dict], if pre-parsed
+     name: str
+     tool_call_id: str
+
+
+ StepDetails = Union[ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails]
+
+
+ class DeltaMessageChoice(BaseModel):
+     delta: dict
+
+
+ class ThreadMessageDeltaChoice(BaseModel):
+     delta: dict
+
+
+ class ThreadRunStepDeltaChoice(BaseModel):
+     delta: dict
+
+
+ class BaseEventData(BaseModel):
+     id: str
+     object: str
+     thread_id: str
+     model: str | None = None
+     created: int | None = None
+
+
+ class ThreadMessageDeltaData(BaseEventData):
+     object: Literal["thread.message.delta"]
+     choices: List[ThreadMessageDeltaChoice]
+
+
+ class ThreadRunStepDeltaData(BaseEventData):
+     object: Literal["thread.run.step.delta"]
+     choices: List[dict]
+
+
+ class UniversalData(BaseEventData):
+     object: Literal[
+         "thread.message.delta",
+         "thread.run.step.delta",
+         "thread.run.step.created",
+         "thread.run.step.completed",
+     ]
+     choices: List[Union[ThreadMessageDeltaChoice, dict]]
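
`UniversalData` is the catch-all model that `call_validation` applies to each parsed event: any of the four `object` types is accepted, and each entry in `choices` is coerced to a `ThreadMessageDeltaChoice` where possible, falling back to a plain dict. A quick sketch of validating one event; it assumes the package is installed under this import path, and the field values are illustrative:

# Hypothetical streamed event, validated against UniversalData.
from wxo_agentic_evaluation.external_agent.types import UniversalData

event = {
    "id": "evt-1",
    "object": "thread.message.delta",
    "thread_id": "thread-1",
    "choices": [{"delta": {"content": "it's National Sewing Machine Day"}}],
}

validated = UniversalData(**event)
print(validated.object)   # thread.message.delta
print(validated.choices)  # one ThreadMessageDeltaChoice wrapping the delta dict

An event with an unknown `object` value or a missing required field (`id`, `thread_id`, `choices`) raises a pydantic ValidationError, which is what makes the model useful as a wire-format check during external-agent validation.
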