ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (63)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +65 -20
  8. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  9. wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
  10. wxo_agentic_evaluation/inference_backend.py +117 -14
  11. wxo_agentic_evaluation/llm_user.py +2 -1
  12. wxo_agentic_evaluation/main.py +5 -0
  13. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  14. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  15. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  16. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  17. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  19. wxo_agentic_evaluation/quick_eval.py +342 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  21. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  22. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  23. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  24. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  46. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  47. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  48. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  49. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  50. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  51. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  52. wxo_agentic_evaluation/service_instance.py +2 -2
  53. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  54. wxo_agentic_evaluation/tool_planner.py +3 -1
  55. wxo_agentic_evaluation/type.py +33 -2
  56. wxo_agentic_evaluation/utils/__init__.py +0 -1
  57. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  58. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  59. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  60. wxo_agentic_evaluation/utils/utils.py +167 -5
  61. ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD +0 -56
  62. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  63. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
@@ -37,6 +37,11 @@ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_match
 FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
 ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
 
+RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
+    "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS",
+    "<IGNORE>"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -52,8 +57,8 @@ class EvaluationPackage:
         ground_truth,
         messages,
         conversational_search_data: List[ConversationalSearch] = None,
-        is_analyze_run=False,
         resource_map: ResourceMap = None,
+        is_attack_evaluation: bool = False,
     ):
         self.tool_dictionary = {
             goal_detail.name: goal_detail
@@ -67,10 +72,13 @@ class EvaluationPackage:
         ]
         self.messages = messages
         self.conversational_search_data = conversational_search_data
-        self.validate_ground_truth(ground_truth, test_case_name)
+        self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
-        self.is_analyze_run = is_analyze_run
+        self.resource_map = resource_map
+
+        if not self.is_attack_evaluation:
+            self.validate_ground_truth(self.ground_truth, self.test_case_name)
 
         self.matcher = LLMMatcher(
             llm_client=get_provider(
@@ -94,8 +102,6 @@ class EvaluationPackage:
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
-
-        self.resource_map = resource_map
 
     @staticmethod
     def find_ground_node(graph, start_node):
@@ -209,6 +215,33 @@ class EvaluationPackage:
             rich.print(
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )
+
+    @staticmethod
+    def _check_if_args_match_with_ignore(
+        actual_args: dict[str, str],
+        expected_args: dict[str, str]
+    ) -> bool:
+        """
+        This function checks if a registered tool call matches with the goal node when:
+        - the arg value marked as wrong is labelled with the "<IGNORE>" value in the corresponding ground truth
+        Args:
+            actual_args (dict): Made during inference.
+            expected_args (dict): Defined in the test case/ground truth.
+        Returns:
+            bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
+        """
+
+        if(
+            set(actual_args.keys()) != set(expected_args.keys())
+        ):
+            return False
+
+        for key in actual_args:
+            if actual_args[key] != expected_args[key] \
+                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return False
+
+        return True
 
     def traverse(self):
         labelled_messages = []
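
For context, the new `_check_if_args_match_with_ignore` helper lets a ground-truth test case mark individual argument values with `<IGNORE>` so that only the key set and the remaining values are compared. Below is a minimal standalone sketch of that comparison rule; the argument dictionaries are hypothetical examples, not data from the package.

```python
# Standalone sketch of the "<IGNORE>" matching rule introduced in this release.
# The argument dictionaries below are hypothetical, not taken from package test data.
RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = "<IGNORE>"

def args_match_with_ignore(actual_args: dict, expected_args: dict) -> bool:
    # Key sets must be identical; values labelled "<IGNORE>" are skipped during comparison.
    if set(actual_args) != set(expected_args):
        return False
    return all(
        expected_args[key] == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
        or actual_args[key] == expected_args[key]
        for key in actual_args
    )

# The ticket id differs but is ignored; the employee id still has to match exactly.
expected = {"employee_id": "E123", "ticket_id": "<IGNORE>"}
print(args_match_with_ignore({"employee_id": "E123", "ticket_id": "T-42"}, expected))  # True
print(args_match_with_ignore({"employee_id": "E999", "ticket_id": "T-42"}, expected))  # False
print(args_match_with_ignore({"employee_id": "E123"}, expected))                       # False (missing key)
```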
@@ -218,7 +251,7 @@ class EvaluationPackage:
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+        correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:
 
@@ -244,6 +277,7 @@ class EvaluationPackage:
 
                     continue
 
+                # TO-DO: re-think how deduplication works in the context of precision & recall
                 tool_call_and_routing_metrics.total_tool_calls += 1
 
                 # evaluating more than once is fine
@@ -259,11 +293,16 @@ class EvaluationPackage:
                     found = False
                     possible_ground_truth_for_analysis = []
                     for goal_detail in matching_goal_details:
-                        if msg_tool_call["args"] == goal_detail.args:
+                        # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
+                        if goal_detail.args == {"IGNORE": None} or (msg_tool_call["args"] == goal_detail.args or
+                            self._check_if_args_match_with_ignore(
+                                msg_tool_call["args"],
+                                goal_detail.args
+                            )):
                             labelled_messages.append(goal_detail.name)
                             labelled_messages_without_text_step.append(goal_detail.name)
-
-                            tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                            correct_tool_calls.add(goal_detail.name)
+                            #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                             found = True
                             message_outcome = ExtendedMessage(message=message)
                             message_outcomes.append(message_outcome)
@@ -279,15 +318,17 @@ class EvaluationPackage:
                             "expected": possible_ground_truth_for_analysis,
                         }
                         message_outcomes.append(message_outcome)
+                        if not self.is_attack_evaluation:
+                            rich.print(
+                                f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
+                                f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+                            )
+                else:
+
+                    if not self.is_attack_evaluation:
                         rich.print(
-                            f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
-                            f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+                            f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
                         )
-                else:
-
-                    rich.print(
-                        f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
-                    )
                     # note: this is incorrect after the 1.6 change
                     message_outcome = ExtendedMessage(message=message)
                     message_outcome.reason = {"reason": "irrelevant tool call"}
@@ -308,6 +349,9 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+
         assistant_responses = [
             message
             for message in self.messages
@@ -368,8 +412,6 @@ class EvaluationPackage:
             metrics,
             message_with_reasons,
         ) = self.traverse()
-        if self.is_analyze_run:
-            print(labelled_messages)
 
         is_success = self.is_topological_sort(
             self.ground_truth.goals, labelled_messages
@@ -433,7 +475,11 @@ class EvaluationPackage:
         for message in self.messages:
             if message.type == ContentType.tool_call:
                 content = json.loads(message.content)
-                id = content.get("tool_call_id", "")
+                """
+                - In ADK 1.9, for tool call events, the "tool_call_id" is now "id"
+                - still parse out "tool_call_id" for backwards compatibility
+                """
+                id = content.get("tool_call_id") or content.get("id")
                 if id == tool_call_id:
                     return content.get("name")
 
@@ -501,7 +547,6 @@ class EvaluationPackage:
 
         return metrics
 
-
 if __name__ == "__main__":
 
     messages = []
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(config=ProviderConfig(), params=llm_decode_parameter)
+    wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
@@ -3,7 +3,7 @@ from rich.console import Console
 
 from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
 
 class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
             "min_new_tokens": 0,
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         }
-        wai_client = get_provider(config=provider_config, params=llm_decode_parameter)
+        wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
 
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
@@ -6,6 +6,8 @@ import rich
 import time
 from pydantic import BaseModel
 from typing import List, Generator, Dict, Tuple, Mapping, Any
+from enum import Enum
+from collections import deque
 
 from wxo_agentic_evaluation.type import (
     ContentType,
@@ -21,14 +23,39 @@ from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
-from wxo_agentic_evaluation.utils.utils import is_saas_url
-
-
-def is_end(user_input: Message):
-    if "END" in user_input.content.strip():
-        return True
-    return False
+from wxo_agentic_evaluation.utils.utils import (
+    is_saas_url,
+    safe_divide,
+    Tokenizer
+)
 
+tokenizer = Tokenizer()
+
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
+
+def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+
+    words_in_first_message = tokenizer(first_message_text)
+    words_in_second_message = tokenizer(second_message_text)
+
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+
+    unique_words_count = len(unique_words)
+    common_words_count = len(common_words)
+
+    return safe_divide(
+        common_words_count,
+        unique_words_count
+    )
 
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
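
The new `calculate_word_overlap_similarity_score` is essentially a Jaccard similarity over word sets (shared words divided by all distinct words), computed with the package's `Tokenizer` and `safe_divide` utilities. A rough standalone approximation is sketched below; it assumes a plain lowercase whitespace tokenizer, whereas the package's `Tokenizer` may normalize text differently.

```python
# Standalone approximation of the word-overlap score added above.
# Assumes a plain lowercase/whitespace tokenizer; the package's Tokenizer may differ.
def word_overlap_similarity(first_text: str, second_text: str) -> float:
    first_words = set(first_text.lower().split())
    second_words = set(second_text.lower().split())
    union = first_words | second_words
    if not union:  # mirrors safe_divide guarding against division by zero
        return 0.0
    return len(first_words & second_words) / len(union)

print(word_overlap_similarity("please reset my password", "please reset my password now"))  # 0.8
print(word_overlap_similarity("please reset my password", "book a meeting room"))           # 0.0
```

With the default `MESSAGE_SIMILARITY_THRESHOLD` of 0.98 introduced further down, only near-identical turns count toward the repetition check.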
@@ -394,6 +421,7 @@ class WXOInferenceBackend:
 
         messages = []
         for entry in result:
+
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:
@@ -406,6 +434,10 @@ class WXOInferenceBackend:
                     tool_json = {"type": "tool_call"}
                     tool_json.update(tool)
                     content = json.dumps(tool_json)
+                    # TO-DO: review do we even need the get messages for retry loop anymore?
+                    if msg_content := entry.get("content"):
+                        if msg_content[0].get("response_type") == "conversational_search":
+                            continue
                     messages.append(
                         Message(
                             role=role,
@@ -504,6 +536,11 @@ class WXOInferenceBackend:
 
 
 class EvaluationController:
+
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+
     def __init__(
         self,
         wxo_inference_backend: WXOInferenceBackend,
@@ -513,18 +550,24 @@ class EvaluationController:
         self.wxo_inference_backend = wxo_inference_backend
         self.llm_user = llm_user
         self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
 
     def run(
-        self, task_n, story, agent_name: str, starting_user_input: str = None
+        self, task_n, story, agent_name: str, starting_user_input: str = None, attack_instructions: str = None
     ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
         step = 0
         thread_id = None
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
 
+        # make this configurable
+        while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
                     role="user", content=starting_user_input, type=ContentType.text
@@ -539,16 +582,22 @@ class EvaluationController:
                 )
             else: # llm
                 user_input = self.llm_user.generate_user_input(
-                    story, conversation_history
+                    story, conversation_history, attack_instructions=attack_instructions
                 )
             if self.config.enable_verbose_logging:
                 rich.print(
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                     user_input.content,
                 )
-            if is_end(user_input):
+
+            if self._is_end(user_input):
                 break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
             conversation_history.append(user_input)
+
             messages, thread_id, conversational_search_data = (
                 self.wxo_inference_backend.stream_messages(
                     user_input,
@@ -559,16 +608,70 @@ class EvaluationController:
                 )
             if not messages:
                 raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-            if self.config.enable_verbose_logging:
-                for message in messages:
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
                     rich.print(
                         f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
                         message.content,
                     )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(conversational_search_data)
+
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): Defines the message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks if the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if (self._is_looping(self.recent_user_messages) or
+                self._is_looping(self.recent_assistant_messages)):
+                return True
+
+        return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
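
Taken together, the controller now caches the last `MAX_REPEATING_MESSAGES` user and assistant turns in bounded deques and ends the run early when the oldest cached turn keeps reappearing, either verbatim or above the similarity threshold. The following is a hedged, self-contained sketch of that termination logic; the default values mirror the environment-variable fallbacks in the diff, the similarity function is a plain word-overlap stand-in, and the sample turns are invented.

```python
from collections import deque

# Hypothetical defaults mirroring the environment-variable fallbacks shown in the diff.
MAX_REPEATING_MESSAGES = 3
MESSAGE_SIMILARITY_THRESHOLD = 0.98

def similarity(a: str, b: str) -> float:
    # Plain word-overlap (Jaccard) stand-in for the package's tokenizer-based helper.
    first, second = set(a.lower().split()), set(b.lower().split())
    union = first | second
    return len(first & second) / len(union) if union else 0.0

def is_looping(messages: deque) -> bool:
    # True when the oldest cached turn keeps reappearing (exactly or near-identically).
    if len(messages) < MAX_REPEATING_MESSAGES:
        return False
    oldest = messages[0]
    repeats = sum(
        1 for later in list(messages)[1:]
        if later == oldest or similarity(oldest, later) > MESSAGE_SIMILARITY_THRESHOLD
    )
    return repeats >= MAX_REPEATING_MESSAGES - 1

recent_user_messages = deque(maxlen=MAX_REPEATING_MESSAGES)
for turn in ["what is my balance?", "what is my balance?", "what is my balance?"]:
    recent_user_messages.append(turn)
print(is_looping(recent_user_messages))  # True -> the run would end before MAX_CONVERSATION_STEPS
```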
@@ -17,7 +17,7 @@ class LLMUser:
         )
 
     def generate_user_input(
-        self, user_story, conversation_history: List[Message]
+        self, user_story, conversation_history: List[Message], attack_instructions: str = None
     ) -> Message | None:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
@@ -28,6 +28,7 @@ class LLMUser:
             ],
             user_story=user_story,
             user_response_style=self.user_response_style,
+            attack_instructions=attack_instructions,
         )
         user_input = self.wai_client.query(prompt_input)
         user_input = Message(
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
+        config.enable_manual_user_input = False # disable manual user input for parallel execution
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
@@ -1,5 +1,5 @@
 import math
-from typing import List, Mapping, Any
+from typing import List, Mapping, Any, Tuple, Optional
 from enum import Enum
 
 from pydantic import BaseModel, computed_field
@@ -166,3 +166,24 @@ class ToolCallAndRoutingMetrics(BaseModel):
             ),
             2,
         )
+
+class FailedStaticTestCases(BaseModel):
+    metric_name: str
+    description: str
+    explanation: str
+
+class FailedSemanticTestCases(BaseModel):
+    metric_name: str
+    evidence: str
+    explanation: str
+    output: int
+    confidence: float
+
+class ReferenceLessEvalMetrics(BaseModel):
+    dataset_name: str
+    number_of_tool_calls: int
+    number_of_successful_tool_calls: int
+    number_of_static_failed_tool_calls: int
+    number_of_semantic_failed_tool_calls: int
+    failed_static_tool_calls: Optional[List[Tuple[int, List[FailedStaticTestCases]]]]
+    failed_semantic_tool_calls: Optional[List[Tuple[int, List[FailedSemanticTestCases]]]]
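
For readers of the new models: `failed_static_tool_calls` and `failed_semantic_tool_calls` appear to pair a tool-call index with the list of failed checks for that call. A hedged illustration follows, with the model definitions copied from the diff so it runs on its own; the dataset name, metric name, and numbers are made up.

```python
from typing import List, Optional, Tuple
from pydantic import BaseModel

# Model definitions copied from the diff above so this example is self-contained.
class FailedStaticTestCases(BaseModel):
    metric_name: str
    description: str
    explanation: str

class FailedSemanticTestCases(BaseModel):
    metric_name: str
    evidence: str
    explanation: str
    output: int
    confidence: float

class ReferenceLessEvalMetrics(BaseModel):
    dataset_name: str
    number_of_tool_calls: int
    number_of_successful_tool_calls: int
    number_of_static_failed_tool_calls: int
    number_of_semantic_failed_tool_calls: int
    failed_static_tool_calls: Optional[List[Tuple[int, List[FailedStaticTestCases]]]]
    failed_semantic_tool_calls: Optional[List[Tuple[int, List[FailedSemanticTestCases]]]]

# Made-up values: the second tool call (index 1) failed one static check.
metrics = ReferenceLessEvalMetrics(
    dataset_name="sample_dataset",
    number_of_tool_calls=3,
    number_of_successful_tool_calls=2,
    number_of_static_failed_tool_calls=1,
    number_of_semantic_failed_tool_calls=0,
    failed_static_tool_calls=[(1, [FailedStaticTestCases(
        metric_name="required_parameters_present",
        description="Checks that all required parameters are supplied",
        explanation="Parameter 'employee_id' was missing from the call",
    )])],
    failed_semantic_tool_calls=None,
)
print(metrics.model_dump_json(indent=2))
```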
@@ -0,0 +1,178 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are an expert at evaluating tool descriptions for AI agents.
+Your task is to analyze how well a tool description serves as documentation for a specific function, helping an AI agent understand when and how to use that tool effectively.
+
+## EVALUATION APPROACH
+
+- You will evaluate the tool description across **five distinct criteria**.
+- Each criterion examines a different aspect of description quality.
+- Consider the function name and parameters as important context, they tell you what the tool actually does, and the description should work together with this information.
+
+## EVALUATION CRITERIA
+
+1. **Uses Vague Language**
+Evaluate whether the description contains unclear, ambiguous, or non-specific terms that make it difficult to understand what the tool does.
+
+Signs of vague language:
+- Generic words that could apply to anything ("tool", "data", "information" without specifics)
+- Unclear abbreviations without context ("comp" instead of "compensation")
+- Ambiguous pronouns ("it", "this", "that" without clear references)
+- Non-specific qualifiers ("some", "various", "different" without elaboration)
+
+**2. Contains Redundant Information**
+Evaluate whether the description repeats the same information multiple times within itself, making it unnecessarily verbose.
+
+Signs of redundancy:
+- Providing the same information in different ways within the description itself
+- Repeating key terms or concepts multiple times unnecessarily
+- Including duplicate/unnecessary details that don't add clarity
+
+**3. Provides No New Information**
+Evaluate whether the description adds meaningful details beyond what you can already understand from the function name and parameters alone.
+In order to assess if a description provides meaningful, new information - ask yourself: If someone only saw the function name and parameter names, would this description teach them anything new about what the tool does, when to use it, or how it works?
+
+**4. Does Not Convey Tool Purpose**
+Evaluate whether the description fails to clearly explain what the tool actually does or accomplishes.
+
+Signs of unclear purpose:
+- Reader would be confused about what happens when they call this function
+- Description doesn't explain the tool's core functionality
+- Unclear what problem this tool solves or what outcome it produces
+
+**5. Does Not Help in Identifying Tool Uniquely**
+Evaluate whether the description is so generic that it could apply to many different tools, making it difficult for an agent to choose the right tool for a specific task.
+
+Signs of non-unique descriptions:
+- Description could accurately describe dozens of different functions
+- Lacks specific details that distinguish this tool from similar ones
+- Doesn't highlight what makes this tool different from alternatives
+
+Here are some example evaluations which isolate the aforementioned criteria for your reference:
+
+## EXAMPLES
+
+**Example 1 - Uses Vague Language:**
+**Tool Name:** get_employee_compensation_details
+**Description:** "Retrieves relevant employee data from the system"
+**Parameters:** employee_id, include_historical
+
+Expected Response:
+```json
+{
+"uses_vague_language": "TRUE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Uses vague term 'relevant employee data' which doesn't indicate this tool specifically handles compensation information"
+}
+```
+
+**Example 2 - Contains Redundant Information:**
+**Tool Name:** update_employee_phone
+**Description:** "Updates and modifies the employee's phone number by changing their phone contact information"
+**Parameters:** employee_id, new_phone_number
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "TRUE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Unnecessarily repeats the concept of updating/changing/modifying phone information multiple times"
+}
+```
+
+**Example 3 - Provides No New Information:**
+**Tool Name:** get_holiday_calendar
+**Description:** "Gets holiday calendar"
+**Parameters:** country_code, year
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "TRUE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Description exactly mirrors the function name without explaining what the calendar contains or how parameters are used"
+}
+```
+
+**Example 4 - Does Not Convey Tool Purpose:**
+**Tool Name:** initiate_promotion_workflow
+**Description:** "Creates a workflow entry in the system"
+**Parameters:** employee_id, new_position, effective_date, manager_approval
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "TRUE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Describes the technical implementation (creating a workflow entry) rather than the business purpose or outcome of initiating a promotion process"
+}
+```
+
+**Example 5 - Does Not Help Identify Tool Uniquely:**
+**Tool Name:** get_employee_contact_details
+**Description:** "Retrieves employee information from the HR system"
+**Parameters:** employee_id
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "TRUE",
+"reason": "Generic description could apply to any employee data retrieval function - doesn't specify that it returns contact details"
+}
+```
+
+**Here are some instructions on how you should respond, whenever you are asked to evaluate a tool description:**
+
+## REQUIRED RESPONSE FORMAT
+
+Respond with a single JSON object containing your evaluation with the following keys:
+- uses_vague_language: (TRUE/FALSE) your assessment of whether the description uses vague language.
+- contains_redundant_information: (TRUE/FALSE) your assessment of whether the description contains redundant information.
+- provides_no_new_information: (TRUE/FALSE) your assessment of whether the description provides additional insight not observed in the function name and parameters.
+- does_not_convey_tool_purpose: (TRUE/FALSE) your assessment of whether the description clarifies tool purpose and usage.
+- does_not_help_in_identifying_tool_uniquely: (TRUE/FALSE) your assessment of whether the description will help identify it uniquely amongst other possible tools in an agent's toolkit.
+
+Here is a sample JSON object embedded inside a code block for reference:
+
+```json
+{
+"uses_vague_language": "TRUE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "TRUE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "TRUE",
+"reason": "Brief explanation of the main issues found"
+}
+```
+
+**IMPORTANT:**
+- Follow the same syntax for your JSON response as you saw in the sample provided within the code block.
+- Use only "TRUE" or "FALSE" for each criterion, except for the "reason" field in the JSON object.
+- Do NOT add any text, comments or explanations outside the JSON object.
+- Do NOT use any markdown formatting.
+- Provide a concise explanation only inside the "reason" field of the JSON object, and nowhere else.
+
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+**Evaluate this tool's description using the aforementioned criteria:**
+
+**Tool Name:** {{ tool_definition.tool_name }}
+**Description:** "{{ tool_definition.tool_description }}"
+**Parameters:** {{ tool_definition.tool_params | join(', ') }}
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
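
The template is rendered against a `tool_definition` object exposing `tool_name`, `tool_description`, and `tool_params` (see the variables in the user turn above). A hedged sketch of filling it in with jinja2 follows; the file path and the sample tool data are assumptions for illustration, and the framework's own `template_render.py` may load and render the template differently.

```python
from types import SimpleNamespace
from jinja2 import Environment, FileSystemLoader

# Path and tool data are assumptions for illustration; the sample tool mirrors
# Example 5 from the prompt itself rather than real evaluation input.
env = Environment(loader=FileSystemLoader("wxo_agentic_evaluation/prompt"))
template = env.get_template("bad_tool_descriptions_prompt.jinja2")

tool_definition = SimpleNamespace(
    tool_name="get_employee_contact_details",
    tool_description="Retrieves employee information from the HR system",
    tool_params=["employee_id"],
)

prompt = template.render(tool_definition=tool_definition)
print(prompt[-400:])  # tail of the prompt: the rendered user turn with name, description, and parameters
```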