ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_package.py
@@ -1,46 +1,53 @@
- from typing import List
  import json
- import os
+ import os
+ from typing import List
+
  import rich

+ from wxo_agentic_evaluation import __file__
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-
- from wxo_agentic_evaluation.type import (
-     ContentType,
-     Message,
-     EvaluationData,
-     EventTypes,
-     ConversationalSearch,
-     ExtendedMessage,
- )
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.llm_matching import LLMMatcher
+ from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
  from wxo_agentic_evaluation.metrics.metrics import (
-     KnowledgeBaseMetrics,
      KeywordSemanticSearchMetric,
+     KnowledgeBaseMetrics,
+     TextMatchType,
      ToolCallAndRoutingMetrics,
-     TextMatchType
  )
  from wxo_agentic_evaluation.prompt.template_render import (
+     AnswerRelevancyTemplateRenderer,
+     FaithfulnessTemplateRenderer,
      KeywordMatchingTemplateRenderer,
      SemanticMatchingTemplateRenderer,
-     FaithfulnessTemplateRenderer,
-     AnswerRelevancyTemplateRenderer,
  )
- from wxo_agentic_evaluation.llm_matching import LLMMatcher
- from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
- from wxo_agentic_evaluation import __file__
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.type import (
+     ContentType,
+     ConversationalSearch,
+     EvaluationData,
+     EventTypes,
+     ExtendedMessage,
+     Message,
+ )

  root_dir = os.path.dirname(__file__)
- KEYWORD_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "keyword_matching_prompt.jinja2")
- SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_matching_prompt.jinja2")
- FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
- ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
+ KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "keyword_matching_prompt.jinja2"
+ )
+ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "semantic_matching_prompt.jinja2"
+ )
+ FAITHFULNESS_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "faithfulness_prompt.jinja2"
+ )
+ ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(
+     root_dir, "prompt", "answer_relevancy_prompt.jinja2"
+ )

  RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
-     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS",
-     "<IGNORE>"
- )
+     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
+ )

  """
  - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
@@ -50,6 +57,7 @@ single, summary step goals.
  """
  DUMMY_GRAPH_NODE_NAME = "dummy-goal"

+
  class EvaluationPackage:
      def __init__(
          self,
@@ -76,14 +84,18 @@ class EvaluationPackage:
          self.ground_truth = ground_truth
          self.test_case_name = test_case_name
          self.resource_map = resource_map
-
+
          if not self.is_attack_evaluation:
              self.validate_ground_truth(self.ground_truth, self.test_case_name)

          self.matcher = LLMMatcher(
              llm_client=get_provider(
                  model_id="meta-llama/llama-3-405b-instruct",
-                 params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 10},
+                 params={
+                     "min_new_tokens": 0,
+                     "decoding_method": "greedy",
+                     "max_new_tokens": 10,
+                 },
              ),
              keyword_template=KeywordMatchingTemplateRenderer(
                  KEYWORD_MATCHING_PROMPT_PATH
@@ -94,20 +106,24 @@ class EvaluationPackage:
          )
          self.rag_llm_as_a_judge = LLMJudge(
              llm_client=get_provider(
-                 model_id="meta-llama/llama-3-405b-instruct",
-                 params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
-                 ),
+                 model_id="meta-llama/llama-3-405b-instruct",
+                 params={
+                     "min_new_tokens": 0,
+                     "decoding_method": "greedy",
+                     "max_new_tokens": 4096,
+                 },
+             ),
              faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
              answer_relevancy=AnswerRelevancyTemplateRenderer(
                  ANSWER_RELEVANCY_PROMPT_PATH
              ),
          )
-
+
      @staticmethod
      def find_ground_node(graph, start_node):
-         """ Simple implementation. Should be fixed in the future
+         """Simple implementation. Should be fixed in the future

-         Assumes that there is a single graph node that does not have children
+         Assumes that there is a single graph node that does not have children
          """

          stack = [start_node]
@@ -117,21 +133,23 @@ class EvaluationPackage:
              node = stack.pop()
              if node not in visited_set:
                  visited_set.add(node)
-
+
                  # check for children
                  # improvement for future: add the ground nodes here
                  # right now, just return the first one
                  if not graph.get(node):
                      return node
-
+
                  stack.extend(graph[node])
-
+
          return None

      @staticmethod
      def is_topological_sort(graph, ordering):
          position = {node: i for i, node in enumerate(ordering)}
-         ground_node = EvaluationPackage.find_ground_node(graph, list(graph.keys())[0])
+         ground_node = EvaluationPackage.find_ground_node(
+             graph, list(graph.keys())[0]
+         )

          if ground_node is not None:
              graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
@@ -187,7 +205,11 @@ class EvaluationPackage:
                      f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
                  )
              if goal_detail.name == "summarize":
-                 if (not goal_detail.keywords or len(goal_detail.keywords) == 0) and (not goal_detail.response or len(goal_detail.response) == 0):
+                 if (
+                     not goal_detail.keywords or len(goal_detail.keywords) == 0
+                 ) and (
+                     not goal_detail.response or len(goal_detail.response) == 0
+                 ):
                      rich.print(
                          f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
                      )
@@ -215,11 +237,10 @@ class EvaluationPackage:
              rich.print(
                  f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
              )
-
+
      @staticmethod
      def _check_if_args_match_with_ignore(
-         actual_args: dict[str, str],
-         expected_args: dict[str, str]
+         actual_args: dict[str, str], expected_args: dict[str, str]
      ) -> bool:
          """
          This function checks if a registered tool call matches with the goal node when:
@@ -230,15 +251,15 @@ class EvaluationPackage:
          Returns:
              bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
          """
-
-         if(
-             set(actual_args.keys()) != set(expected_args.keys())
-         ):
+
+         if set(actual_args.keys()) != set(expected_args.keys()):
              return False
-
+
          for key in actual_args:
-             if actual_args[key] != expected_args[key] \
-                 and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+             if (
+                 actual_args[key] != expected_args[key]
+                 and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+             ):
                  return False

          return True
@@ -248,18 +269,26 @@ class EvaluationPackage:
          message_outcomes = []
          labelled_messages_without_text_step = []
          # Counters for tool-calling related metrics
-         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics()
+         tool_call_and_routing_metrics.expected_tool_calls = len(
+             self.tool_dictionary
          )
-         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-         correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
+         correct_tool_calls = (
+             set()
+         ) # sometimes, tool with the same signature can be called more than once
          for message in self.messages:
              if message.type == ContentType.tool_call:

                  msg_tool_call = json.loads(message.content)
-                 if self.resource_map and msg_tool_call["name"] in self.resource_map.agent2tools:
+                 if (
+                     self.resource_map
+                     and msg_tool_call["name"] in self.resource_map.agent2tools
+                 ):
                      tool_call_and_routing_metrics.total_routing_calls += 1
                      relevant = False
-                     for tool in self.resource_map.agent2tools[msg_tool_call["name"]]:
+                     for tool in self.resource_map.agent2tools[
+                         msg_tool_call["name"]
+                     ]:
                          for goal_detail in self.tool_dictionary.values():
                              if goal_detail.tool_name == tool:
                                  relevant = True
@@ -268,7 +297,9 @@ class EvaluationPackage:
                              break

                      if relevant:
-                         tool_call_and_routing_metrics.relevant_routing_calls += 1
+                         tool_call_and_routing_metrics.relevant_routing_calls += (
+                             1
+                         )
                      else:
                          message_outcome = ExtendedMessage(message=message)
                          message_outcome.reason = {
@@ -294,21 +325,26 @@ class EvaluationPackage:
                      possible_ground_truth_for_analysis = []
                      for goal_detail in matching_goal_details:
                          # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
-                         if goal_detail.args == {"IGNORE": None} or (msg_tool_call["args"] == goal_detail.args or
-                             self._check_if_args_match_with_ignore(
-                                 msg_tool_call["args"],
-                                 goal_detail.args
-                             )):
+                         if goal_detail.args == {"IGNORE": None} or (
+                             msg_tool_call["args"] == goal_detail.args
+                             or self._check_if_args_match_with_ignore(
+                                 msg_tool_call["args"], goal_detail.args
+                             )
+                         ):
                              labelled_messages.append(goal_detail.name)
-                             labelled_messages_without_text_step.append(goal_detail.name)
+                             labelled_messages_without_text_step.append(
+                                 goal_detail.name
+                             )
                              correct_tool_calls.add(goal_detail.name)
-                             #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                             # tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                              found = True
                              message_outcome = ExtendedMessage(message=message)
                              message_outcomes.append(message_outcome)
                              break
                          else:
-                             possible_ground_truth_for_analysis.append(goal_detail.args)
+                             possible_ground_truth_for_analysis.append(
+                                 goal_detail.args
+                             )

                  if not found:
                      message_outcome = ExtendedMessage(message=message)
@@ -324,7 +360,7 @@ class EvaluationPackage:
                              f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
                          )
                  else:
-
+
                      if not self.is_attack_evaluation:
                          rich.print(
                              f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
@@ -350,7 +386,9 @@ class EvaluationPackage:
                  message_outcome = ExtendedMessage(message=message)
                  message_outcomes.append(message_outcome)

-         tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+         tool_call_and_routing_metrics.correct_tool_calls = len(
+             correct_tool_calls
+         )

          assistant_responses = [
              message
@@ -430,7 +468,9 @@ class EvaluationPackage:
                  llm_steps += 1
                  total_step += 1

-         knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
+         knowledge_base_metric_summary = (
+             self.generate_knowledge_base_metric_summary()
+         )
          # TO-DO: the table is not printing properly anymore with the new columns introduced
          # we need to introduce a separate table for these.

@@ -524,7 +564,8 @@ class EvaluationPackage:
              ) # name of knowledge base

              search_results = [
-                 result.body for result in conversational_search_data.search_results
+                 result.body
+                 for result in conversational_search_data.search_results
              ]
              faithfulness = self.rag_llm_as_a_judge.faithfulness(
                  conversational_search_data.text, search_results
@@ -547,6 +588,7 @@ class EvaluationPackage:

          return metrics

+
  if __name__ == "__main__":

      messages = []
@@ -564,7 +606,9 @@ if __name__ == "__main__":

      for message in messages:
          if message.role == "user":
-             rich.print("[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content)
+             rich.print(
+                 "[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content
+             )
          else:
              rich.print("[orange3]WXO:[/orange3]", message.content)

@@ -574,7 +618,7 @@ if __name__ == "__main__":
      evaluate_package = EvaluationPackage(
          test_case_name="data1.messages.json",
          ground_truth=ground_truth,
-         messages=messages
+         messages=messages,
      )
      print(evaluate_package.generate_summary())
      # print(evaluate_package.traverse())
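Editor's note on the argument-matching logic reformatted above: `_check_if_args_match_with_ignore` treats the reserved ground-truth value `<IGNORE>` (configurable through the RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS environment variable) as a wildcard for a parameter's value while still requiring the parameter names to match exactly. A minimal standalone sketch of that rule, with invented argument values that are not from the package's test data:

    RESERVED = "<IGNORE>"

    def args_match_with_ignore(actual_args: dict, expected_args: dict) -> bool:
        # the set of parameter names must match exactly
        if set(actual_args.keys()) != set(expected_args.keys()):
            return False
        # a value only has to match when the ground truth does not use the wildcard
        for key, expected in expected_args.items():
            if expected != RESERVED and actual_args[key] != expected:
                return False
        return True

    assert args_match_with_ignore({"location": "USA"}, {"location": "<IGNORE>"})
    assert not args_match_with_ignore({"location": "USA"}, {"city": "<IGNORE>"})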
wxo_agentic_evaluation/external_agent/__init__.py
@@ -1,21 +1,28 @@
  import importlib.resources
  import json
+
  import rich

- from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
- from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
  from wxo_agentic_evaluation import prompt
+ from wxo_agentic_evaluation.prompt.template_render import (
+     StoryGenerationTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.service_provider import ProviderConfig, get_provider

  console = rich.console.Console()

+
  def starting_sentence_generation_prompt():
-     with importlib.resources.path(prompt, "starting_sentence_generation_prompt.jinja2") as fp:
+     with importlib.resources.path(
+         prompt, "starting_sentence_generation_prompt.jinja2"
+     ) as fp:
          # reuse the StoryGenerationTemplateRenderer class, even though we are generating a "starting_sentence" instead of a "story"
          # the starting sentence generation prompts uses the same input variable
          render = StoryGenerationTemplateRenderer(str(fp))
-
+
      return render

+
  def generate_starting_sentence(annotated_data: dict):
      renderer = starting_sentence_generation_prompt()
      llm_decode_parameter = {
@@ -23,7 +30,9 @@ def generate_starting_sentence(annotated_data: dict):
          "decoding_method": "greedy",
          "max_new_tokens": 4096,
      }
-     wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
+     wai_client = get_provider(
+         model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter
+     )
      prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
      res = wai_client.query(prompt)
      res = res.strip()
@@ -33,5 +42,7 @@ def generate_starting_sentence(annotated_data: dict):
          res = json.loads(res)
          return res["starting_sentence"]
      except Exception:
-         console.log(f"The generated `starting_sentence` had incorrect format: '{res}'")
-         return res
+         console.log(
+             f"The generated `starting_sentence` had incorrect format: '{res}'"
+         )
+         return res
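As the hunks above show, generate_starting_sentence expects the model to return a JSON object with a "starting_sentence" field and falls back to the raw, stripped model output when parsing fails. A simplified sketch of that fallback behaviour, with hypothetical strings:

    import json

    def extract_starting_sentence(raw: str) -> str:
        raw = raw.strip()
        try:
            # happy path: the model returned the requested JSON object
            return json.loads(raw)["starting_sentence"]
        except Exception:
            # fall back to the raw text, as the real function does after logging
            return raw

    assert extract_starting_sentence('{"starting_sentence": "Hi there"}') == "Hi there"
    assert extract_starting_sentence("not valid json") == "not valid json"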
wxo_agentic_evaluation/external_agent/external_validate.py
@@ -1,15 +1,21 @@
+ import json
  from typing import Generator
+
  import requests
- import json
  import rich

- from wxo_agentic_evaluation.external_agent.types import UniversalData, SchemaValidationResults
-
+ from wxo_agentic_evaluation.external_agent.types import (
+     SchemaValidationResults,
+     UniversalData,
+ )

  MESSAGES = [
      {"role": "user", "content": "what's the holiday is June 13th in us?"},
-     {"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
-     {"role": "assistant", "content":"it's National Sewing Machine Day"}
+     {
+         "role": "assistant",
+         "content": 'tool_name: calendar_lookup, args {"location": "USA", "data": "06-13-2025"}}',
+     },
+     {"role": "assistant", "content": "it's National Sewing Machine Day"},
  ]


@@ -18,7 +24,7 @@ class ExternalAgentValidation:
          self.credential = credential
          self.auth_scheme = auth_scheme
          self.service_url = service_url
-
+
      @property
      def header(self):
          header = {"Content-Type": "application/json"}
@@ -32,23 +38,23 @@ class ExternalAgentValidation:
          return header

      def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
-         data = b''
+         data = b""
          for chunk in resp:
              for line in chunk.splitlines(True):
-                 if line.startswith(b'data:'):
-                     line = line.replace(b'data:', b'')
-                     if line.strip() == b'[DONE]':
+                 if line.startswith(b"data:"):
+                     line = line.replace(b"data:", b"")
+                     if line.strip() == b"[DONE]":
                          return
                  data += line
-                 if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
+                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                      # NOTE: edge case, "data" can be sent in two different chunks
-                     if data.startswith(b'data:'):
-                         data = data.replace(b'data:', b'')
+                     if data.startswith(b"data:"):
+                         data = data.replace(b"data:", b"")
                      yield data
-                     data = b''
+                     data = b""
          if data:
              yield data
-
+
      def _validate_streaming_response(self, resp):
          success = True
          logged_events = []
@@ -61,52 +67,57 @@ class ExternalAgentValidation:
          except Exception as e:
              success = False
              break
-
+
          return success, logged_events

      def _validate_schema_compliance(self, messages):
          payload = {"stream": True}
          payload["messages"] = messages
-         resp = requests.post(url=self.service_url, headers=self.header, json=payload)
+         resp = requests.post(
+             url=self.service_url, headers=self.header, json=payload
+         )
          success, logged_events = self._validate_streaming_response(resp)

          msg = ", ".join([msg["content"] for msg in payload["messages"]])

          if success:
-             rich.print(f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'.")
+             rich.print(
+                 f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'."
+             )
          else:
-             rich.print(f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n")
+             rich.print(
+                 f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n"
+             )

          return success, logged_events

-     def call_validation(self, input_str: str, add_context: bool = False) -> SchemaValidationResults:
+     def call_validation(
+         self, input_str: str, add_context: bool = False
+     ) -> SchemaValidationResults:
          if add_context:
              return self.block_validation(input_str)

-         msg = {
-             "role": "user",
-             "content": input_str
-         }
-
+         msg = {"role": "user", "content": input_str}
+
          success, logged_events = self._validate_schema_compliance([msg])
-         results = SchemaValidationResults(success=success, logged_events=logged_events, messages=[msg])
+         results = SchemaValidationResults(
+             success=success, logged_events=logged_events, messages=[msg]
+         )

          return results.model_dump()
-
+
      def block_validation(self, input_str: str) -> SchemaValidationResults:
-         """ Tests a block of messages
-         """
+         """Tests a block of messages"""
          rich.print(
              f"[gold3]The following prebuilt messages, '{MESSAGES}' is prepended to the input message, '{input_str}'"
          )

-         msg = {
-             "role": "user",
-             "content": input_str
-         }
+         msg = {"role": "user", "content": input_str}

          messages = MESSAGES + [msg]
          success, logged_events = self._validate_schema_compliance(messages)
-         results = SchemaValidationResults(success=success, logged_events=logged_events, messages=messages)
+         results = SchemaValidationResults(
+             success=success, logged_events=logged_events, messages=messages
+         )

-         return results.model_dump()
+         return results.model_dump()
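The quoting changes in `_parse_streaming_events` above do not alter its behaviour: it buffers bytes line by line, strips `data:` prefixes, stops at a `[DONE]` marker, and yields one event per blank-line boundary. A simplified, self-contained sketch of that framing (the sample chunks are invented, and the second `data:` edge-case strip is omitted):

    def parse_sse(chunks):
        data = b""
        for chunk in chunks:
            for line in chunk.splitlines(True):
                if line.startswith(b"data:"):
                    line = line.replace(b"data:", b"")
                    if line.strip() == b"[DONE]":
                        return
                data += line
                # a blank line terminates one server-sent event
                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                    yield data
                    data = b""
        if data:
            yield data

    events = list(parse_sse([b'data: {"delta": "Hel"}\n\n', b'data: {"delta": "lo"}\n\ndata: [DONE]\n\n']))
    assert len(events) == 2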
wxo_agentic_evaluation/external_agent/performance_test.py
@@ -1,10 +1,15 @@
- from typing import List, Mapping, Any
+ from typing import Any, List, Mapping
+
  from rich.console import Console

- from wxo_agentic_evaluation.external_agent import generate_starting_sentence
  from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+ from wxo_agentic_evaluation.data_annotator import (
+     KeywordsGenerationLLM,
+     LlamaKeywordsGenerationTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.external_agent import generate_starting_sentence
  from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
+

  class ExternalAgentPerformanceTest:
      def __init__(self, agent_name: str, test_data: List[str]):
@@ -12,8 +17,7 @@ class ExternalAgentPerformanceTest:
          self.goal_template = {
              "agent": agent_name,
              "goals": {"summarize": []},
-             "goal_details": [
-             ],
+             "goal_details": [],
              "story": "<placeholder>",
          }

@@ -24,42 +28,50 @@ class ExternalAgentPerformanceTest:
              "decoding_method": "greedy",
              "max_new_tokens": 256,
          }
-         wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
-
+         wai_client = get_provider(
+             model_id=kw_gen_config.model_id, params=llm_decode_parameter
+         )
+
          self.kw_gen = KeywordsGenerationLLM(
              provider=wai_client,
              template=LlamaKeywordsGenerationTemplateRenderer(
                  kw_gen_config.prompt_config
-                 ),
-             )
-
+             ),
+         )
+
      def generate_tests(self) -> List[Mapping[str, Any]]:
          console = Console()
          goal_templates = []

-         with console.status("[gold3]Creating starting sentence for user story from input file for performance testing") as status:
+         with console.status(
+             "[gold3]Creating starting sentence for user story from input file for performance testing"
+         ) as status:
              for sentence, response in self.test_data:
                  goal_temp = self.goal_template.copy()
                  goal_temp["story"] = sentence

                  keywords = self.kw_gen.genereate_keywords(response)
                  summarize_step = {
-                         "name": "summarize",
-                         "type": "text",
-                         "response": response,
-                         "keywords": keywords
-                     }
+                     "name": "summarize",
+                     "type": "text",
+                     "response": response,
+                     "keywords": keywords,
+                 }
                  goal_temp["goal_details"] = [summarize_step]
-                 goal_temp["starting_sentence"] = generate_starting_sentence(goal_temp)
+                 goal_temp["starting_sentence"] = generate_starting_sentence(
+                     goal_temp
+                 )

                  goal_templates.append(goal_temp)
-
+
              status.stop()
-             console.print("[bold green]Done creating starting sentence from provided input data")
+             console.print(
+                 "[bold green]Done creating starting sentence from provided input data"
+             )

          return goal_templates


  if __name__ == "__main__":
      t = ExternalAgentPerformanceTest("test")
-     t.generate_tests()
+     t.generate_tests()
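For context on the template being populated above: generate_tests turns each (story sentence, expected response) pair from test_data into one goal template whose only goal is a "summarize" text step. The dictionary below illustrates the resulting shape with invented values; in the package the keywords come from the LLM keyword generator and the starting sentence from generate_starting_sentence:

    example_goal_template = {
        "agent": "my_external_agent",  # hypothetical agent name
        "goals": {"summarize": []},
        "goal_details": [
            {
                "name": "summarize",
                "type": "text",
                "response": "It's National Sewing Machine Day",
                "keywords": ["National Sewing Machine Day"],  # illustrative keywords
            }
        ],
        "story": "what's the holiday is June 13th in us?",
        "starting_sentence": "Hi, can you tell me what holiday falls on June 13th in the US?",
    }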