ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
- wxo_agentic_evaluation/__init__.py +0 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
- wxo_agentic_evaluation/analytics/tools/main.py +163 -0
- wxo_agentic_evaluation/analytics/tools/types.py +130 -0
- wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
- wxo_agentic_evaluation/analyze_run.py +123 -0
- wxo_agentic_evaluation/annotate.py +40 -0
- wxo_agentic_evaluation/arg_configs.py +78 -0
- wxo_agentic_evaluation/batch_annotate.py +181 -0
- wxo_agentic_evaluation/data_annotator.py +253 -0
- wxo_agentic_evaluation/evaluation_package.py +518 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
- wxo_agentic_evaluation/external_agent/types.py +65 -0
- wxo_agentic_evaluation/inference_backend.py +601 -0
- wxo_agentic_evaluation/llm_matching.py +39 -0
- wxo_agentic_evaluation/llm_rag_eval.py +47 -0
- wxo_agentic_evaluation/llm_user.py +38 -0
- wxo_agentic_evaluation/main.py +231 -0
- wxo_agentic_evaluation/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
- wxo_agentic_evaluation/metrics/metrics.py +101 -0
- wxo_agentic_evaluation/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
- wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
- wxo_agentic_evaluation/prompt/template_render.py +90 -0
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
- wxo_agentic_evaluation/record_chat.py +165 -0
- wxo_agentic_evaluation/service_instance.py +179 -0
- wxo_agentic_evaluation/tool_planner.py +228 -0
- wxo_agentic_evaluation/type.py +176 -0
- wxo_agentic_evaluation/utils/__init__.py +6 -0
- wxo_agentic_evaluation/utils/utils.py +233 -0
- wxo_agentic_evaluation/watsonx_provider.py +175 -0
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import rich
|
|
5
|
+
|
|
6
|
+
from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
|
|
7
|
+
|
|
8
|
+
from wxo_agentic_evaluation.type import (
|
|
9
|
+
ContentType,
|
|
10
|
+
Message,
|
|
11
|
+
EvaluationData,
|
|
12
|
+
ToolCallAndRoutingMetrics,
|
|
13
|
+
EventTypes,
|
|
14
|
+
ConversationalSearch,
|
|
15
|
+
ExtendedMessage,
|
|
16
|
+
)
|
|
17
|
+
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
|
|
18
|
+
from wxo_agentic_evaluation.metrics.metrics import (
|
|
19
|
+
KnowledgeBaseMetrics,
|
|
20
|
+
KeywordSemanticSearchMetric,
|
|
21
|
+
)
|
|
22
|
+
from wxo_agentic_evaluation.prompt.template_render import (
|
|
23
|
+
KeywordMatchingTemplateRenderer,
|
|
24
|
+
SemanticMatchingTemplateRenderer,
|
|
25
|
+
FaithfulnessTemplateRenderer,
|
|
26
|
+
AnswerRelevancyTemplateRenderer,
|
|
27
|
+
)
|
|
28
|
+
from wxo_agentic_evaluation.llm_matching import LLMMatcher
|
|
29
|
+
from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
|
|
30
|
+
from wxo_agentic_evaluation import __file__
|
|
31
|
+
|
|
32
|
+
# Directory containing this package; used to locate the bundled Jinja2
# prompt templates shipped alongside the code.
root_dir = os.path.dirname(__file__)
# Prompt templates for the LLM-based matchers and RAG judges.
KEYWORD_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "keyword_matching_prompt.jinja2")
SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_matching_prompt.jinja2")
FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class EvaluationPackage:
    """Scores a recorded agent conversation against a ground-truth test case.

    Produces tool-call and routing metrics, keyword/semantic matching of
    final text responses, and knowledge-base (RAG) metrics computed by an
    LLM-as-a-judge.
    """

    def __init__(
        self,
        test_case_name,
        ground_truth,
        messages,
        # NOTE(review): annotated List[ConversationalSearch] but defaults to
        # None — effectively Optional[List[ConversationalSearch]].
        conversational_search_data: List[ConversationalSearch] = None,
        is_analyze_run=False,
    ):
        # Expected tool calls from the ground truth, keyed by goal name.
        self.tool_dictionary = {
            goal_detail.name: goal_detail
            for goal_detail in ground_truth.goal_details
            if goal_detail.type == ContentType.tool_call
        }
        # Ground-truth text goals (e.g. final summaries) to be matched
        # against assistant responses.
        self.text_list = [
            goal_detail
            for goal_detail in ground_truth.goal_details
            if goal_detail.type == ContentType.text
        ]
        self.messages = messages
        self.conversational_search_data = conversational_search_data
        # Fail fast on malformed ground truth before any scoring happens.
        self.validate_ground_truth(ground_truth, test_case_name)
        self.ground_truth = ground_truth
        self.test_case_name = test_case_name
        self.is_analyze_run = is_analyze_run

        # Keyword / semantic matcher: short greedy generations are enough for
        # the yes/no style verdicts it produces.
        self.matcher = LLMMatcher(
            llm_client=WatsonXProvider(
                model_id="meta-llama/llama-3-405b-instruct",
                llm_decode_parameter={
                    "min_new_tokens": 0,
                    "decoding_method": "greedy",
                    "max_new_tokens": 10,
                },
            ),
            keyword_template=KeywordMatchingTemplateRenderer(
                KEYWORD_MATCHING_PROMPT_PATH
            ),
            semantic_template=SemanticMatchingTemplateRenderer(
                SEMANTIC_MATCHING_PROMPT_PATH
            ),
        )
        # RAG judge: faithfulness / answer-relevancy verdicts need longer
        # generations, hence the larger max_new_tokens.
        self.rag_llm_as_a_judge = LLMJudge(
            llm_client=WatsonXProvider(
                model_id="meta-llama/llama-3-405b-instruct",
                llm_decode_parameter={
                    "min_new_tokens": 0,
                    "decoding_method": "greedy",
                    "max_new_tokens": 4096,
                },
            ),
            faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
            answer_relevancy=AnswerRelevancyTemplateRenderer(
                ANSWER_RELEVANCY_PROMPT_PATH
            ),
        )
|
|
95
|
+
|
|
96
|
+
@staticmethod
|
|
97
|
+
def is_topological_sort(graph, ordering):
|
|
98
|
+
position = {node: i for i, node in enumerate(ordering)}
|
|
99
|
+
for u in graph:
|
|
100
|
+
for v in graph[u]:
|
|
101
|
+
if u not in position or v not in position:
|
|
102
|
+
return False
|
|
103
|
+
if position[u] >= position[v]:
|
|
104
|
+
return False
|
|
105
|
+
return True
|
|
106
|
+
|
|
107
|
+
@staticmethod
|
|
108
|
+
def validate_ground_truth(ground_truth, test_case_name):
|
|
109
|
+
if len(ground_truth.agent) == 0:
|
|
110
|
+
raise ValueError(
|
|
111
|
+
f"No agent provided in the ground truth. test_case_name: {test_case_name}"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
if len(ground_truth.goals) == 0:
|
|
115
|
+
raise ValueError(
|
|
116
|
+
f"No goals provided in the ground truth. test_case_name: {test_case_name}"
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
if len(ground_truth.goal_details) == 0:
|
|
120
|
+
raise ValueError(
|
|
121
|
+
f"No goal details provided in the ground truth. test_case_name: {test_case_name}"
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
if len(ground_truth.story) == 0:
|
|
125
|
+
raise ValueError(
|
|
126
|
+
f"No story provided in the ground truth. test_case_name: {test_case_name}"
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
goals = set()
|
|
130
|
+
|
|
131
|
+
for key, value in ground_truth.goals.items():
|
|
132
|
+
goals.add(key)
|
|
133
|
+
if isinstance(value, list):
|
|
134
|
+
goals.update(value)
|
|
135
|
+
else:
|
|
136
|
+
raise ValueError(
|
|
137
|
+
f"The goal '{key}' is not mapping to a list: {value}. test_case_name: {test_case_name}"
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
for goal_detail in ground_truth.goal_details:
|
|
141
|
+
if goal_detail.name not in goals:
|
|
142
|
+
raise ValueError(
|
|
143
|
+
f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
|
|
144
|
+
)
|
|
145
|
+
if goal_detail.name == "summarize":
|
|
146
|
+
if len(goal_detail.keywords) == 0 and len(goal_detail.response) == 0:
|
|
147
|
+
rich.print(
|
|
148
|
+
f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
|
|
149
|
+
)
|
|
150
|
+
elif len(goal_detail.response) == 0:
|
|
151
|
+
rich.print(
|
|
152
|
+
f"⚠️‼️ [bold][yellow] WARNING:[/yellow][/bold] Summarize goal has no final response. test_case_name: {test_case_name}"
|
|
153
|
+
)
|
|
154
|
+
if len(ground_truth.goal_details) != len(goals):
|
|
155
|
+
raise ValueError(
|
|
156
|
+
f"Goal details count does not match the goals count: {len(ground_truth.goal_details)} != {len(goals)}. test_case_name: {test_case_name}"
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
def _print_kw_sm(
|
|
160
|
+
self, keyword_semantic_match_list: List[KeywordSemanticSearchMetric]
|
|
161
|
+
):
|
|
162
|
+
"""Prints the keyword match/mismatch, and semantic match/mismatch results
|
|
163
|
+
Right now only successful matches are printed
|
|
164
|
+
"""
|
|
165
|
+
|
|
166
|
+
for keyword_semantic_match in keyword_semantic_match_list:
|
|
167
|
+
if (
|
|
168
|
+
keyword_semantic_match.semantic_match
|
|
169
|
+
and keyword_semantic_match.keyword_match
|
|
170
|
+
):
|
|
171
|
+
rich.print(
|
|
172
|
+
f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
    def traverse(self):
        """Walk the recorded conversation and score it against the ground truth.

        Returns a 5-tuple:
            labelled_messages: goal names (plus ``*_WRONG_PARAMETERS`` /
                ``*_WRONG_FUNCTION_CALL`` markers) in transcript order,
                including matched text goals.
            labelled_messages_without_text_step: same, but tool-call goals only.
            keyword_semantic_list: KeywordSemanticSearchMetric per matched text goal.
            tool_call_and_routing_metrics: aggregate tool/routing counters.
            message_outcomes: ExtendedMessage per message, with failure reasons.
        """
        labelled_messages = []
        message_outcomes = []
        labelled_messages_without_text_step = []
        # Counters for tool-calling related metrics
        tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
            total_tool_calls=0,
            expected_tool_calls=0,
            relevant_tool_calls=0,
            correct_tool_calls=0,
            total_routing_calls=0,
            expected_routing_calls=0,
        )
        tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)

        for message in self.messages:
            if message.type == ContentType.tool_call:
                tool_call_and_routing_metrics.total_tool_calls += 1
                # tool_call message content is a JSON object with at least
                # "name" and "args" keys.
                msg_tool_call = json.loads(message.content)

                # Check for transfer_* calls
                if msg_tool_call["name"].startswith("transfer_to_"):
                    tool_call_and_routing_metrics.total_routing_calls += 1

                # evaluating more than once is fine
                # agent could make repeated calls with the same function signature
                # in our is_topological_sort algorithm, the most recent occurrence is evaluated
                matching_goal_details = [
                    goal_detail
                    for goal_detail in self.tool_dictionary.values()
                    if goal_detail.tool_name == msg_tool_call["name"]
                ]
                if len(matching_goal_details) > 0:
                    tool_call_and_routing_metrics.relevant_tool_calls += 1  # tool name matches one of the expected tool names, as defined in the ground truth
                    found = False
                    possible_ground_truth = []
                    for goal_detail in matching_goal_details:
                        # Routing calls (transfer_to_*) match on name alone;
                        # ordinary tool calls must also match the expected args.
                        if (
                            is_transfer := msg_tool_call["name"].startswith(
                                "transfer_to_"
                            )
                        ) or msg_tool_call["args"] == goal_detail.args:
                            labelled_messages.append(goal_detail.name)
                            labelled_messages_without_text_step.append(goal_detail.name)
                            if is_transfer:
                                tool_call_and_routing_metrics.expected_routing_calls += (
                                    1
                                )
                            else:
                                tool_call_and_routing_metrics.correct_tool_calls += 1  # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                            found = True
                            message_outcome = ExtendedMessage(message=message)
                            message_outcomes.append(message_outcome)
                            break
                        else:
                            # Remember the args we compared against for the
                            # error report below.
                            possible_ground_truth.append(goal_detail.args)

                    if not found:
                        # Right tool, wrong arguments.
                        message_outcome = ExtendedMessage(message=message)
                        message_outcome.reason = {
                            "reason": "incorrect parameter",
                            "actual": msg_tool_call["args"],
                            "expected": possible_ground_truth,
                        }
                        message_outcomes.append(message_outcome)
                        rich.print(
                            f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
                            f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
                        )
                        labelled_messages.append(
                            msg_tool_call["name"] + "_WRONG_PARAMETERS"
                        )
                else:
                    # TO-DO: we need a way to backtrack agent/tool pairs.
                    # if we route to an agent without the right toolset, that makes it a routing error.
                    # this will remove the need to label routing calls explicitly
                    if not msg_tool_call["name"].startswith("transfer_to_"):
                        rich.print(
                            f"[red][ERROR] Wrong function call: {msg_tool_call['name']}[/red]"
                        )
                        labelled_messages.append(
                            msg_tool_call["name"] + "_WRONG_FUNCTION_CALL"
                        )
                        message_outcome = ExtendedMessage(message=message)
                        message_outcome.reason = {"reason": "irrelevant tool call"}
                        message_outcomes.append(message_outcome)

            elif message.type == ContentType.tool_response:
                # A tool response containing any known error keyword is
                # recorded as a runtime error outcome.
                found = False
                for keyword in ERROR_KEYWORDS:
                    if keyword in message.content.lower():
                        message_outcome = ExtendedMessage(message=message)
                        message_outcome.reason = {"reason": "runtime error"}
                        message_outcomes.append(message_outcome)
                        found = True
                        break
                if not found:
                    message_outcome = ExtendedMessage(message=message)
                    message_outcomes.append(message_outcome)
            else:
                # Any other message type is recorded without a reason.
                message_outcome = ExtendedMessage(message=message)
                message_outcomes.append(message_outcome)
        # Final text goals are matched against assistant-authored messages only.
        assistant_responses = [
            message
            for message in self.messages
            if message.event == EventTypes.message_created
            and message.role == "assistant"
        ]
        keyword_semantic_list = []
        for message in assistant_responses:
            for goal_detail in self.text_list:
                # Each text goal is matched at most once.
                if goal_detail.name not in labelled_messages:
                    keyword_match: bool = self.matcher.keywords_match(
                        message.content, goal_detail.keywords
                    )
                    semantic_match: bool = self.matcher.semantic_match(
                        message.content, goal_detail.response
                    )
                    keyword_semantic_match = KeywordSemanticSearchMetric(
                        keyword_match=keyword_match,
                        semantic_match=semantic_match,
                        message=message.content,
                        goal_detail=goal_detail.name,
                    )
                    # A text goal counts as matched only when BOTH checks pass.
                    if keyword_match and semantic_match:
                        labelled_messages.append(goal_detail.name)
                        keyword_semantic_list.append(keyword_semantic_match)
                        break

        # only prints when the semantic and keyword matched
        self._print_kw_sm(keyword_semantic_list)

        return (
            labelled_messages,
            labelled_messages_without_text_step,
            keyword_semantic_list,
            tool_call_and_routing_metrics,
            message_outcomes,
        )
|
|
315
|
+
|
|
316
|
+
def _is_text_match(
|
|
317
|
+
self, keyword_semantic_match_list: List[KeywordSemanticSearchMetric]
|
|
318
|
+
):
|
|
319
|
+
|
|
320
|
+
if len(self.text_list) == 0:
|
|
321
|
+
return "NA"
|
|
322
|
+
elif len(self.text_list) == len(keyword_semantic_match_list):
|
|
323
|
+
return "Summary Matched"
|
|
324
|
+
else:
|
|
325
|
+
return "Summary MisMatched"
|
|
326
|
+
|
|
327
|
+
    def generate_summary(self):
        """Aggregate per-test-case results into a reporting row.

        Returns a 5-tuple: (summary-row dict, keyword/semantic matches,
        knowledge-base metric summaries, messages with failure reasons,
        tool-call/routing metrics).
        """
        llm_steps = 0
        total_step = 0
        (
            labelled_messages,
            labelled_messages_without_text_step,
            matches,
            metrics,
            message_with_reasons,
        ) = self.traverse()
        if self.is_analyze_run:
            print(labelled_messages)
        wrong_call_count = sum(
            1 for msg in labelled_messages if "_WRONG_FUNCTION_CALL" in msg
        )
        # Journey succeeds when the labelled steps respect the ground-truth
        # goal ordering constraints.
        is_success = self.is_topological_sort(
            self.ground_truth.goals, labelled_messages
        )
        match = self._is_text_match(matches)

        # Count agent-authored steps (text / conversational search / tool
        # calls) versus all messages in the transcript.
        for message in self.messages:
            if message.role == "assistant" and (
                message.type
                in (
                    ContentType.text,
                    ContentType.conversational_search,
                    ContentType.tool_call,
                )
            ):
                llm_steps += 1
            total_step += 1

        knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
        # TO-DO: the table is not printing properly anymore with the new columns introduced
        # we need to introduce a separate table for these.
        data = {
            "Dataset": self.test_case_name,
            "Total Step": total_step,
            "Agent Step": llm_steps,
            "Ground Truth Calls": len(self.tool_dictionary),
            "Wrong Function Calls": wrong_call_count,
            # "Bad Calls": 0,
            "Wrong Parameters": sum(
                1 for msg in labelled_messages if "_WRONG_PARAMETERS" in msg
            ),
            # NOTE(review): traverse() as seen here never appends a
            # "_WRONG_ROUTING_CALL" label, so this count appears to always be
            # 0 — confirm whether another code path produces these labels.
            "Wrong Routing Calls": sum(
                1 for msg in labelled_messages if "_WRONG_ROUTING_CALL" in msg
            ),
            "Text Match": match,
            "Journey Success": is_success,
            # "Tool Call Accuracy": metrics.tool_call_accuracy,
            # "Tool Call Relevancy": metrics.tool_call_relevancy,
            # "Agent Routing Accuracy": metrics.agent_routing_accuracy
        }

        return (
            data,
            matches,
            knowledge_base_metric_summary,
            message_with_reasons,
            metrics,
        )
|
|
389
|
+
|
|
390
|
+
def _get_messages_by_role_before_cs(
|
|
391
|
+
self, idx_conversational_search: int, role: str, type: str = "text"
|
|
392
|
+
):
|
|
393
|
+
"""Utility method to filter `self.messages` for messages with a given role
|
|
394
|
+
that occur before the conversational search message index
|
|
395
|
+
"""
|
|
396
|
+
|
|
397
|
+
filtered_messages = [
|
|
398
|
+
message
|
|
399
|
+
for idx, message in enumerate(self.messages)
|
|
400
|
+
if idx < idx_conversational_search
|
|
401
|
+
and message.role == role
|
|
402
|
+
and message.type == type
|
|
403
|
+
]
|
|
404
|
+
|
|
405
|
+
return filtered_messages
|
|
406
|
+
|
|
407
|
+
def _weave_user_assistant_messages(self, user_messages, assistant_messages):
|
|
408
|
+
weave = []
|
|
409
|
+
for user, assistant in zip(user_messages, assistant_messages):
|
|
410
|
+
msg = f"User: {user.content}\nAssistant: {assistant.content}\n\n"
|
|
411
|
+
weave.append(msg)
|
|
412
|
+
|
|
413
|
+
return " ".join(weave)
|
|
414
|
+
|
|
415
|
+
def _find_tool_call_name(self, tool_call_id):
|
|
416
|
+
for message in self.messages:
|
|
417
|
+
if message.type == ContentType.tool_call:
|
|
418
|
+
content = json.loads(message.content)
|
|
419
|
+
id = content.get("tool_call_id", "")
|
|
420
|
+
if id == tool_call_id:
|
|
421
|
+
return content.get("name")
|
|
422
|
+
|
|
423
|
+
raise Exception(f"'{tool_call_id}' not found in messages")
|
|
424
|
+
|
|
425
|
+
    def generate_knowledge_base_metric_summary(self) -> List[KnowledgeBaseMetrics]:
        """Compute RAG metrics (faithfulness, answer relevancy) for every
        conversational-search message in the transcript.

        Returns:
            A list with one KnowledgeBaseMetrics per conversational search.
        """
        idx_conv_search = [
            idx
            for idx, message in enumerate(self.messages)
            if message.type == ContentType.conversational_search
        ]
        metrics = []

        for search_index in idx_conv_search:
            # Conversation history (before the search) forms the judge context.
            user_messages = self._get_messages_by_role_before_cs(
                role="user", idx_conversational_search=search_index
            )
            assistant_messages = self._get_messages_by_role_before_cs(
                role="assistant",
                idx_conversational_search=search_index,
                type=ContentType.text,
            )

            context = self._weave_user_assistant_messages(
                user_messages, assistant_messages
            )
            # NOTE(review): raises IndexError if no user message precedes the
            # search — presumably one always does; confirm with callers.
            most_recent_user_message = user_messages[-1]
            search_message = self.messages[search_index]

            # find the conversational search metadata associated with this message
            conversational_search_data = None
            if self.conversational_search_data:
                for cs_metadata in self.conversational_search_data:
                    if (
                        search_message.conversational_search_metadata.tool_call_id
                        == cs_metadata.metadata.tool_call_id
                    ):
                        conversational_search_data = cs_metadata

            # NOTE(review): if no metadata matched (or none was supplied),
            # conversational_search_data is None here and the attribute access
            # below raises AttributeError — confirm metadata is always present.
            tool_name = self._find_tool_call_name(
                conversational_search_data.metadata.tool_call_id
            )  # name of knowledge base

            search_results = [
                result.body for result in conversational_search_data.search_results
            ]
            faithfulness = self.rag_llm_as_a_judge.faithfulness(
                conversational_search_data.text, search_results
            )
            answer_relevancy = self.rag_llm_as_a_judge.answer_relevancy(
                question=most_recent_user_message.content,
                context=context,
                answer=search_message.content,
            )
            knowledge_base_metrics = KnowledgeBaseMetrics(
                dataset_name=self.test_case_name,
                knowledge_base_name=tool_name,
                tool_call_id=search_message.conversational_search_metadata.tool_call_id,
                faithfulness=faithfulness,
                answer_relevancy=answer_relevancy,
                confidence_scores=conversational_search_data.confidence_scores,
            )

            metrics.append(knowledge_base_metrics)

        return metrics
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
if __name__ == "__main__":
    # Ad-hoc smoke test: replay a recorded conversation against its ground
    # truth and print the summary metrics.
    messages = []

    with open(
        "./benchmarks/workday_tools/concise/result/llama/messages/data18.messages.json",
        "r",
        encoding="utf-8",
    ) as f:

        temp = json.load(f)

    for message in temp:
        messages.append(Message.model_validate(message))

    for message in messages:
        if message.role == "user":
            rich.print("[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content)
        else:
            rich.print("[orange3]WXO:[/orange3]", message.content)

    # Fix: use the same explicit encoding as the first open(), and label the
    # test case after the data18 files actually loaded (was "data1").
    with open(
        "./benchmarks/workday_tools/data/data18.json", "r", encoding="utf-8"
    ) as f:
        ground_truth = EvaluationData.model_validate(json.load(f))

    evaluate_package = EvaluationPackage(
        test_case_name="data18.messages.json",
        ground_truth=ground_truth,
        messages=messages,
    )
    print(evaluate_package.generate_summary())
    # print(evaluate_package.traverse())
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from langchain_core.messages import AIMessageChunk, ToolCallChunk, BaseMessage, AIMessage, ToolMessage, HumanMessage
|
|
2
|
+
from langchain_openai.chat_models.base import _convert_message_to_dict, _convert_dict_to_message
|
|
3
|
+
from wxo_agentic_evaluation.external_agent.types import UniversalData
|
|
4
|
+
import yaml
|
|
5
|
+
import requests
|
|
6
|
+
from typing import Generator
|
|
7
|
+
import json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Canned conversation history sent as the validation payload.  The contents
# are mock data (the ToolMessage body is not real JSON and the holiday name
# contains a typo); the validator only checks the endpoint's event stream.
MESSAGES = [AIMessage(content="how can i help you"), HumanMessage("what's the holiday is June 13th in us?"),
            ToolMessage(content="{tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}", tool_call_id="11111"),
            AIMessage(content="it's National Sweing Machine Day")]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ExternalAgentValidation:
    """Validates an external agent endpoint by sending a canned conversation
    and checking that every streamed event parses as ``UniversalData``.
    """

    def __init__(self, credential, auth_scheme, service_url):
        self.credential = credential
        self.auth_scheme = auth_scheme
        self.service_url = service_url

    def get_auth_header(self):
        """Build the auth header for the configured scheme.

        Raises:
            Exception: when the scheme is neither API_KEY nor BEARER_TOKEN.
        """
        if self.auth_scheme == "API_KEY":
            header = {"x-api-key": self.credential}

        elif self.auth_scheme == "BEARER_TOKEN":
            header = {"Authorization": f"Bearer {self.credential}"}

        else:
            raise Exception(f"Auth scheme: {self.auth_scheme} is not supported")

        return header

    def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
        """Reassemble server-sent events from a raw byte-chunk stream.

        Yields one accumulated event payload (bytes) per blank-line-terminated
        event; stops at the ``[DONE]`` sentinel.  (Renamed from the original
        ``_parse_streaming_evenst`` typo.)
        """
        data = b''
        for chunk in resp:
            for line in chunk.splitlines(True):
                if line.startswith(b'data:'):
                    # Fix: strip only the leading "data:" prefix; replace()
                    # would also mangle any "data:" occurring in the payload.
                    line = line[len(b'data:'):]
                if line.strip() == b'[DONE]':
                    return
                data += line
                # A blank line (CRLF variants included) terminates an event.
                if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
                    yield data
                    data = b''
        if data:
            yield data

    def call_validation(self, input: str):
        """POST the canned history plus ``input`` to the service and return the
        parsed stream events.

        Raises:
            Exception: re-raised when an event fails JSON parsing or does not
            validate against ``UniversalData``.
        """
        header = {"Content-Type": "application/json"}
        header.update(self.get_auth_header())

        messages = [_convert_message_to_dict(message=message) for message in MESSAGES]
        messages.append(_convert_message_to_dict(HumanMessage(input)))

        payload = {"messages": messages}

        resp = requests.post(url=self.service_url, headers=header, json=payload, stream=True)
        results = []
        for json_str in self._parse_streaming_events(resp):
            json_dict = None
            try:
                json_dict = json.loads(json_str)
                # Schema check only; the validated model is discarded.
                UniversalData(**json_dict)
                results.append(json_dict)
            except Exception as e:
                print(f"event parsing failed with {e}")
                raise e

        return results
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
from typing import List, Union, Literal
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ThinkingStepDetails(BaseModel):
    """Details of a "thinking" (reasoning) run step."""

    type: Literal["thinking"]
    content: str
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ToolCall(BaseModel):
    """A single tool invocation requested by the agent."""

    name: str
    args: dict
    id: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ToolCallsStepDetails(BaseModel):
    """Details of a run step that issues one or more tool calls."""

    type: Literal["tool_calls"]
    tool_calls: List[ToolCall]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ToolResponseStepDetails(BaseModel):
    """Details of a run step carrying a tool's response."""

    type: Literal["tool_response"]
    content: str  # could also be List[dict], if pre-parsed
    name: str
    tool_call_id: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Discriminated by the ``type`` Literal on each member model.
StepDetails = Union[ThinkingStepDetails, ToolCallsStepDetails, ToolResponseStepDetails]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DeltaMessageChoice(BaseModel):
    """A streamed choice carrying a delta payload.

    NOTE(review): structurally identical to ThreadMessageDeltaChoice /
    ThreadRunStepDeltaChoice and not referenced elsewhere in this module —
    possibly redundant; confirm before removing.
    """

    delta: dict
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ThreadMessageDeltaChoice(BaseModel):
    """A choice within a ``thread.message.delta`` event."""

    delta: dict
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ThreadRunStepDeltaChoice(BaseModel):
    """A choice within a ``thread.run.step.delta`` event."""

    delta: dict
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class BaseEventData(BaseModel):
    """Fields common to every streamed event payload."""

    id: str
    object: str
    thread_id: str
    model: str | None = None
    created: int | None = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ThreadMessageDeltaData(BaseEventData):
    """Payload of a ``thread.message.delta`` streaming event."""

    object: Literal["thread.message.delta"]
    choices: List[ThreadMessageDeltaChoice]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ThreadRunStepDeltaData(BaseEventData):
    """Payload of a ``thread.run.step.delta`` streaming event."""

    object: Literal["thread.run.step.delta"]
    choices: List[dict]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class UniversalData(BaseEventData):
    """Catch-all event payload accepting any of the streamed object types.

    Used to schema-validate arbitrary events from an external agent stream.
    """

    object: Union[Literal["thread.message.delta"], Literal["thread.run.step.delta"],
                  Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
    # Fix: ``choices`` was declared twice; only the last annotation took
    # effect, so the dead narrower declaration is removed.
    choices: List[Union[ThreadMessageDeltaChoice, dict]]
|