ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic by the registry.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
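The file list above is the registry's per-file summary of added and removed lines. To reproduce a comparison like this locally, a rough sketch is shown below. It assumes both wheels have already been downloaded next to the script (for example with `pip download ibm-watsonx-orchestrate-evaluation-framework==1.1.0 --no-deps` and the same for 1.1.2); the local file names are the standard wheel names and are assumptions, not something taken from this page.

import difflib
import zipfile

# Assumed local file names for the two downloaded wheels.
OLD_WHEEL = "ibm_watsonx_orchestrate_evaluation_framework-1.1.0-py3-none-any.whl"
NEW_WHEEL = "ibm_watsonx_orchestrate_evaluation_framework-1.1.2-py3-none-any.whl"


def wheel_sources(path):
    # A wheel is a zip archive; collect the text of every Python file inside it.
    with zipfile.ZipFile(path) as wheel:
        return {
            name: wheel.read(name).decode("utf-8", errors="replace").splitlines()
            for name in wheel.namelist()
            if name.endswith(".py")
        }


old, new = wheel_sources(OLD_WHEEL), wheel_sources(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    diff = list(
        difflib.unified_diff(old.get(name, []), new.get(name, []), name, name, lineterm="")
    )
    if diff:
        added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
        removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
        print(f"{name} +{added} -{removed}")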
wxo_agentic_evaluation/evaluation_package.py

@@ -1,46 +1,53 @@
-from typing import List
 import json
-import os
+import os
+from typing import List
+
 import rich

+from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-
-from wxo_agentic_evaluation.
-    ContentType,
-    Message,
-    EvaluationData,
-    EventTypes,
-    ConversationalSearch,
-    ExtendedMessage,
-)
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.llm_matching import LLMMatcher
+from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
 from wxo_agentic_evaluation.metrics.metrics import (
-    KnowledgeBaseMetrics,
     KeywordSemanticSearchMetric,
+    KnowledgeBaseMetrics,
+    TextMatchType,
     ToolCallAndRoutingMetrics,
-    TextMatchType
 )
 from wxo_agentic_evaluation.prompt.template_render import (
+    AnswerRelevancyTemplateRenderer,
+    FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
-    FaithfulnessTemplateRenderer,
-    AnswerRelevancyTemplateRenderer,
 )
-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation import
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import (
+    ContentType,
+    ConversationalSearch,
+    EvaluationData,
+    EventTypes,
+    ExtendedMessage,
+    Message,
+)

 root_dir = os.path.dirname(__file__)
-KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
-
-
-
+KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "keyword_matching_prompt.jinja2"
+)
+SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "semantic_matching_prompt.jinja2"
+)
+FAITHFULNESS_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "faithfulness_prompt.jinja2"
+)
+ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "answer_relevancy_prompt.jinja2"
+)

 RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
-    "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS",
-
-)
+    "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
+)

 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name

@@ -50,6 +57,7 @@ single, summary step goals.
 """
 DUMMY_GRAPH_NODE_NAME = "dummy-goal"

+
 class EvaluationPackage:
     def __init__(
         self,

@@ -76,14 +84,18 @@ class EvaluationPackage:
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
         self.resource_map = resource_map
-
+
         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)

         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
-                params={
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 10,
+                },
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH

@@ -94,20 +106,24 @@ class EvaluationPackage:
         )
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
-
-
-
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+            ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
-
+
     @staticmethod
     def find_ground_node(graph, start_node):
-        """
+        """Simple implementation. Should be fixed in the future

-        Assumes that there is a single graph node that does not have children
+        Assumes that there is a single graph node that does not have children
         """

         stack = [start_node]

@@ -117,21 +133,23 @@ class EvaluationPackage:
             node = stack.pop()
             if node not in visited_set:
                 visited_set.add(node)
-
+
                 # check for children
                 # improvement for future: add the ground nodes here
                 # right now, just return the first one
                 if not graph.get(node):
                     return node
-
+
                 stack.extend(graph[node])
-
+
         return None

     @staticmethod
     def is_topological_sort(graph, ordering):
         position = {node: i for i, node in enumerate(ordering)}
-        ground_node = EvaluationPackage.find_ground_node(
+        ground_node = EvaluationPackage.find_ground_node(
+            graph, list(graph.keys())[0]
+        )

         if ground_node is not None:
             graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]

@@ -187,7 +205,11 @@ class EvaluationPackage:
                     f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
                 )
             if goal_detail.name == "summarize":
-                if (
+                if (
+                    not goal_detail.keywords or len(goal_detail.keywords) == 0
+                ) and (
+                    not goal_detail.response or len(goal_detail.response) == 0
+                ):
                     rich.print(
                         f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
                     )

@@ -215,11 +237,10 @@ class EvaluationPackage:
             rich.print(
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )
-
+
     @staticmethod
     def _check_if_args_match_with_ignore(
-
-        expected_args: dict[str, str]
+        actual_args: dict[str, str], expected_args: dict[str, str]
     ) -> bool:
         """
         This function checks if a registered tool call matches with the goal node when:

@@ -230,15 +251,15 @@ class EvaluationPackage:
         Returns:
             bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
         """
-
-        if(
-            set(actual_args.keys()) != set(expected_args.keys())
-        ):
+
+        if set(actual_args.keys()) != set(expected_args.keys()):
             return False
-
+
         for key in actual_args:
-            if
-
+            if (
+                actual_args[key] != expected_args[key]
+                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+            ):
                 return False

         return True

@@ -248,18 +269,26 @@ class EvaluationPackage:
         message_outcomes = []
         labelled_messages_without_text_step = []
         # Counters for tool-calling related metrics
-        tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+        tool_call_and_routing_metrics = ToolCallAndRoutingMetrics()
+        tool_call_and_routing_metrics.expected_tool_calls = len(
+            self.tool_dictionary
         )
-
-
+        correct_tool_calls = (
+            set()
+        )  # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:

                 msg_tool_call = json.loads(message.content)
-                if
+                if (
+                    self.resource_map
+                    and msg_tool_call["name"] in self.resource_map.agent2tools
+                ):
                     tool_call_and_routing_metrics.total_routing_calls += 1
                     relevant = False
-                    for tool in self.resource_map.agent2tools[
+                    for tool in self.resource_map.agent2tools[
+                        msg_tool_call["name"]
+                    ]:
                         for goal_detail in self.tool_dictionary.values():
                             if goal_detail.tool_name == tool:
                                 relevant = True

@@ -268,7 +297,9 @@ class EvaluationPackage:
                                 break

                     if relevant:
-                        tool_call_and_routing_metrics.relevant_routing_calls +=
+                        tool_call_and_routing_metrics.relevant_routing_calls += (
+                            1
+                        )
                     else:
                         message_outcome = ExtendedMessage(message=message)
                         message_outcome.reason = {

@@ -294,21 +325,26 @@ class EvaluationPackage:
                     possible_ground_truth_for_analysis = []
                     for goal_detail in matching_goal_details:
                         # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
-                        if goal_detail.args == {"IGNORE": None} or (
-
-
-                            goal_detail.args
-                        )
+                        if goal_detail.args == {"IGNORE": None} or (
+                            msg_tool_call["args"] == goal_detail.args
+                            or self._check_if_args_match_with_ignore(
+                                msg_tool_call["args"], goal_detail.args
+                            )
+                        ):
                             labelled_messages.append(goal_detail.name)
-                            labelled_messages_without_text_step.append(
+                            labelled_messages_without_text_step.append(
+                                goal_detail.name
+                            )
                             correct_tool_calls.add(goal_detail.name)
-                            #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                            # tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                             found = True
                             message_outcome = ExtendedMessage(message=message)
                             message_outcomes.append(message_outcome)
                             break
                         else:
-                            possible_ground_truth_for_analysis.append(
+                            possible_ground_truth_for_analysis.append(
+                                goal_detail.args
+                            )

                 if not found:
                     message_outcome = ExtendedMessage(message=message)

@@ -324,7 +360,7 @@ class EvaluationPackage:
                         f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
                     )
                 else:
-
+
                     if not self.is_attack_evaluation:
                         rich.print(
                             f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"

@@ -350,7 +386,9 @@ class EvaluationPackage:
                     message_outcome = ExtendedMessage(message=message)
                     message_outcomes.append(message_outcome)

-        tool_call_and_routing_metrics.correct_tool_calls = len(
+        tool_call_and_routing_metrics.correct_tool_calls = len(
+            correct_tool_calls
+        )

         assistant_responses = [
             message

@@ -430,7 +468,9 @@ class EvaluationPackage:
                     llm_steps += 1
                 total_step += 1

-        knowledge_base_metric_summary =
+        knowledge_base_metric_summary = (
+            self.generate_knowledge_base_metric_summary()
+        )
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.

@@ -524,7 +564,8 @@ class EvaluationPackage:
             ) # name of knowledge base

             search_results = [
-                result.body
+                result.body
+                for result in conversational_search_data.search_results
             ]
             faithfulness = self.rag_llm_as_a_judge.faithfulness(
                 conversational_search_data.text, search_results

@@ -547,6 +588,7 @@ class EvaluationPackage:

         return metrics

+
 if __name__ == "__main__":

     messages = []

@@ -564,7 +606,9 @@ if __name__ == "__main__":

     for message in messages:
         if message.role == "user":
-            rich.print(
+            rich.print(
+                "[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content
+            )
         else:
             rich.print("[orange3]WXO:[/orange3]", message.content)

@@ -574,7 +618,7 @@ if __name__ == "__main__":
     evaluate_package = EvaluationPackage(
         test_case_name="data1.messages.json",
         ground_truth=ground_truth,
-        messages=messages
+        messages=messages,
     )
     print(evaluate_package.generate_summary())
     # print(evaluate_package.traverse())

wxo_agentic_evaluation/external_agent/__init__.py

@@ -1,21 +1,28 @@
 import importlib.resources
 import json
+
 import rich

-from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
-from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
 from wxo_agentic_evaluation import prompt
+from wxo_agentic_evaluation.prompt.template_render import (
+    StoryGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import ProviderConfig, get_provider

 console = rich.console.Console()

+
 def starting_sentence_generation_prompt():
-    with importlib.resources.path(
+    with importlib.resources.path(
+        prompt, "starting_sentence_generation_prompt.jinja2"
+    ) as fp:
         # reuse the StoryGenerationTemplateRenderer class, even though we are generating a "starting_sentence" instead of a "story"
         # the starting sentence generation prompts uses the same input variable
         render = StoryGenerationTemplateRenderer(str(fp))
-
+
     return render

+
 def generate_starting_sentence(annotated_data: dict):
     renderer = starting_sentence_generation_prompt()
     llm_decode_parameter = {

@@ -23,7 +30,9 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(
+    wai_client = get_provider(
+        model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter
+    )
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()

@@ -33,5 +42,7 @@ def generate_starting_sentence(annotated_data: dict):
         res = json.loads(res)
         return res["starting_sentence"]
     except Exception:
-        console.log(
-
+        console.log(
+            f"The generated `starting_sentence` had incorrect format: '{res}'"
+        )
+        return res

wxo_agentic_evaluation/external_agent/external_validate.py

@@ -1,15 +1,21 @@
+import json
 from typing import Generator
+
 import requests
-import json
 import rich

-from wxo_agentic_evaluation.external_agent.types import
-
+from wxo_agentic_evaluation.external_agent.types import (
+    SchemaValidationResults,
+    UniversalData,
+)

 MESSAGES = [
     {"role": "user", "content": "what's the holiday is June 13th in us?"},
-    {
-
+    {
+        "role": "assistant",
+        "content": 'tool_name: calendar_lookup, args {"location": "USA", "data": "06-13-2025"}}',
+    },
+    {"role": "assistant", "content": "it's National Sewing Machine Day"},
 ]


@@ -18,7 +24,7 @@ class ExternalAgentValidation:
         self.credential = credential
         self.auth_scheme = auth_scheme
         self.service_url = service_url
-
+
     @property
     def header(self):
         header = {"Content-Type": "application/json"}

@@ -32,23 +38,23 @@ class ExternalAgentValidation:
         return header

     def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
-        data = b
+        data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b
-                    line = line.replace(b
-                if line.strip() == b
+                if line.startswith(b"data:"):
+                    line = line.replace(b"data:", b"")
+                if line.strip() == b"[DONE]":
                     return
                 data += line
-                if data.endswith((b
+                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
-                    if data.startswith(b
-                        data = data.replace(b
+                    if data.startswith(b"data:"):
+                        data = data.replace(b"data:", b"")
                     yield data
-                    data = b
+                    data = b""
         if data:
             yield data
-
+
     def _validate_streaming_response(self, resp):
         success = True
         logged_events = []

@@ -61,52 +67,57 @@ class ExternalAgentValidation:
             except Exception as e:
                 success = False
                 break
-
+
         return success, logged_events

     def _validate_schema_compliance(self, messages):
         payload = {"stream": True}
         payload["messages"] = messages
-        resp = requests.post(
+        resp = requests.post(
+            url=self.service_url, headers=self.header, json=payload
+        )
         success, logged_events = self._validate_streaming_response(resp)

         msg = ", ".join([msg["content"] for msg in payload["messages"]])

         if success:
-            rich.print(
+            rich.print(
+                f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'."
+            )
         else:
-            rich.print(
+            rich.print(
+                f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n"
+            )

         return success, logged_events

-    def call_validation(
+    def call_validation(
+        self, input_str: str, add_context: bool = False
+    ) -> SchemaValidationResults:
         if add_context:
             return self.block_validation(input_str)

-        msg = {
-
-            "content": input_str
-        }
-
+        msg = {"role": "user", "content": input_str}
+
         success, logged_events = self._validate_schema_compliance([msg])
-        results = SchemaValidationResults(
+        results = SchemaValidationResults(
+            success=success, logged_events=logged_events, messages=[msg]
+        )

         return results.model_dump()
-
+
     def block_validation(self, input_str: str) -> SchemaValidationResults:
-        """
-        """
+        """Tests a block of messages"""
         rich.print(
             f"[gold3]The following prebuilt messages, '{MESSAGES}' is prepended to the input message, '{input_str}'"
         )

-        msg = {
-            "role": "user",
-            "content": input_str
-        }
+        msg = {"role": "user", "content": input_str}

         messages = MESSAGES + [msg]
         success, logged_events = self._validate_schema_compliance(messages)
-        results = SchemaValidationResults(
+        results = SchemaValidationResults(
+            success=success, logged_events=logged_events, messages=messages
+        )

-        return results.model_dump()
+        return results.model_dump()

wxo_agentic_evaluation/external_agent/performance_test.py

@@ -1,10 +1,15 @@
-from typing import List, Mapping
+from typing import Any, List, Mapping
+
 from rich.console import Console

-from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.data_annotator import (
+    KeywordsGenerationLLM,
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.service_provider import get_provider
-
+

 class ExternalAgentPerformanceTest:
     def __init__(self, agent_name: str, test_data: List[str]):

@@ -12,8 +17,7 @@ class ExternalAgentPerformanceTest:
         self.goal_template = {
             "agent": agent_name,
             "goals": {"summarize": []},
-            "goal_details": [
-            ],
+            "goal_details": [],
             "story": "<placeholder>",
         }

@@ -24,42 +28,50 @@ class ExternalAgentPerformanceTest:
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         }
-        wai_client = get_provider(
-
+        wai_client = get_provider(
+            model_id=kw_gen_config.model_id, params=llm_decode_parameter
+        )
+
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
             template=LlamaKeywordsGenerationTemplateRenderer(
                 kw_gen_config.prompt_config
-
-
-
+            ),
+        )
+
     def generate_tests(self) -> List[Mapping[str, Any]]:
         console = Console()
         goal_templates = []

-        with console.status(
+        with console.status(
+            "[gold3]Creating starting sentence for user story from input file for performance testing"
+        ) as status:
             for sentence, response in self.test_data:
                 goal_temp = self.goal_template.copy()
                 goal_temp["story"] = sentence

                 keywords = self.kw_gen.genereate_keywords(response)
                 summarize_step = {
-
-
-
-
-
+                    "name": "summarize",
+                    "type": "text",
+                    "response": response,
+                    "keywords": keywords,
+                }
                 goal_temp["goal_details"] = [summarize_step]
-                goal_temp["starting_sentence"] = generate_starting_sentence(
+                goal_temp["starting_sentence"] = generate_starting_sentence(
+                    goal_temp
+                )

                 goal_templates.append(goal_temp)
-
+
             status.stop()
-        console.print(
+        console.print(
+            "[bold green]Done creating starting sentence from provided input data"
+        )

         return goal_templates


 if __name__ == "__main__":
     t = ExternalAgentPerformanceTest("test")
-    t.generate_tests()
+    t.generate_tests()