ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py (new file)
@@ -0,0 +1,303 @@
+import os
+from collections import deque
+from typing import List, Tuple
+
+import rich
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
+    RuntimeAdapter,
+)
+from wxo_agentic_evaluation.type import (
+    CallTracker,
+    ContentType,
+    ConversationalSearch,
+    Message,
+    Roles,
+)
+from wxo_agentic_evaluation.utils.utils import Tokenizer, safe_divide
+
+tokenizer = Tokenizer()
+
+
+def calculate_word_overlap_similarity_score(
+    first_message_text: str, second_message_text: str
+) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+
+    words_in_first_message = tokenizer(first_message_text)
+    words_in_second_message = tokenizer(second_message_text)
+
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+
+    unique_words_count = len(unique_words)
+    common_words_count = len(common_words)
+
+    return safe_divide(common_words_count, unique_words_count)
+
+
+def _generate_user_input(
+    user_turn: int,
+    story: str,
+    conversation_history: list[Message],
+    llm_user: LLMUser,
+    enable_manual_user_input: bool = False,
+    starting_user_input: str | None = None,
+    attack_instructions: str | None = None,
+) -> Message:
+    """Generates the user input for the current turn."""
+
+    if user_turn == 0 and starting_user_input is not None:
+        return Message(
+            role="user",
+            content=starting_user_input,
+            type=ContentType.text,
+        )
+
+    if enable_manual_user_input:
+        content = input("[medium_orchid1]Enter your input[/medium_orchid1] ✍️: ")
+        return Message(role="user", content=content, type=ContentType.text)
+
+    # LLM-generated user input
+    return llm_user.generate_user_input(
+        story,
+        conversation_history,
+        attack_instructions=attack_instructions,
+    )
+
+
+class EvaluationController:
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(
+        os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
+    )  # if recent messages are more than 98% similar, the inference loop is terminated
+    MAX_REPEATING_MESSAGES = int(
+        os.getenv("MAX_REPEATING_MESSAGES", 3)
+    )  # maximum number of repeated user or assistant messages before the inference loop is terminated
+
+    def __init__(
+        self,
+        runtime: RuntimeAdapter,
+        llm_user: LLMUser,
+        config: TestConfig,
+    ):
+        self.runtime = runtime
+        self.llm_user = llm_user
+        self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(
+                maxlen=self.MAX_REPEATING_MESSAGES
+            )
+            self.recent_assistant_messages = deque(
+                maxlen=self.MAX_REPEATING_MESSAGES
+            )
+
+    def run(
+        self,
+        task_n,
+        story,
+        agent_name: str,
+        starting_user_input: str | None = None,
+        attack_instructions: str | None = None,
+        max_user_turns: int | None = None,
+    ) -> Tuple[List[Message], CallTracker, List[ConversationalSearch], str]:
+        thread_id = None
+        conversation_history: List[Message] = []
+        conversational_search_history_data = []
+        call_tracker = CallTracker()
+
+        max_turns = (
+            self.MAX_CONVERSATION_STEPS
+            if max_user_turns is None
+            else max_user_turns
+        )
+
+        for user_turn in range(max_turns):
+            user_input = _generate_user_input(
+                user_turn=user_turn,
+                story=story,
+                conversation_history=conversation_history,
+                llm_user=self.llm_user,
+                enable_manual_user_input=self.config.enable_manual_user_input,
+                starting_user_input=starting_user_input,
+                attack_instructions=attack_instructions,
+            )
+
+            if self.config.enable_verbose_logging:
+                rich.print(
+                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
+                    user_input.content,
+                )
+
+            if self._is_end(user_input):
+                break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
+            conversation_history.append(user_input)
+
+            # (
+            #     messages,
+            #     thread_id,
+            #     conversational_search_data,
+            # )
+            resp = self.runtime.run(
+                user_input,
+                context={
+                    "agent_name": agent_name,
+                    "call_tracker": call_tracker,
+                },
+                thread_id=thread_id,
+            )
+            messages = resp.messages
+            thread_id = resp.thread_id
+            call_tracker.metadata = {"thread_id": thread_id}
+            conversational_search_data = resp.context.get("conversational_search_data", [])
+            if not messages:
+                raise RuntimeError(
+                    f"[Task-{task_n}] No messages were produced. Exiting task."
+                )
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if (
+                        message.role == Roles.ASSISTANT
+                        and message.type == ContentType.text
+                    ):
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
+                    rich.print(
+                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
+                        message.content,
+                    )
+
+                # hook for subclasses
+                if self._post_message_hook(
+                    task_n=task_n,
+                    step=user_turn,
+                    message=message,
+                    conversation_history=conversation_history,
+                ):
+                    return (
+                        conversation_history,
+                        call_tracker,
+                        conversational_search_history_data,
+                        thread_id,
+                    )
+
+            conversation_history.extend(messages)
+            conversational_search_history_data.extend(
+                conversational_search_data
+            )
+
+        return (
+            conversation_history,
+            call_tracker,
+            conversational_search_history_data,
+            thread_id,
+        )
+
+    def _post_message_hook(self, **kwargs) -> bool:
+        """
+        Hook for subclasses to extend behavior.
+        Return True to break the loop early.
+        """
+        return False
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): The message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif (
+                    calculate_word_overlap_similarity_score(
+                        oldest_cached_message, old_message
+                    )
+                    > self.MESSAGE_SIMILARITY_THRESHOLD
+                ):
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks whether the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if self._is_looping(self.recent_user_messages) or self._is_looping(
+                self.recent_assistant_messages
+            ):
+                return True
+
+        # The final fallback for termination is the main inference loop, which is bounded by MAX_CONVERSATION_STEPS
+        return False
+
+
+class AttackEvaluationController(EvaluationController):
+    def __init__(
+        self, *args, attack_data=None, attack_evaluator=None, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.attack_data = attack_data
+        self.attack_evaluator = attack_evaluator
+
+    def _post_message_hook(
+        self, task_n, step, message, conversation_history
+    ) -> bool:
+        """Override hook to add live attack evaluation."""
+        if self.attack_evaluator and self.attack_data:
+            success = self.attack_evaluator.evaluate(
+                self.attack_data, conversation_history + [message]
+            )
+            if success:
+                rich.print(
+                    f"[bold green]Attack for [Task-{task_n}] succeeded early at step {step}! Stopping simulation.[/bold green]"
+                )
+                # persist the live result so the aggregator can pick it up later
+                try:
+                    self.attack_evaluator.save_evaluation_result(
+                        self.attack_data, True
+                    )
+                except Exception:
+                    pass
+                conversation_history.append(message)
+                return True
+        return False
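
Note on the new termination heuristic: the least obvious piece of the EvaluationController added above is the pairing of calculate_word_overlap_similarity_score with _is_looping. The standalone sketch below reproduces that idea outside the package, using plain whitespace tokenization as a stand-in for the framework's Tokenizer and the default threshold/window values shown in the diff; it is an illustration of the heuristic, not the framework's implementation.

# Standalone sketch of the repeat-detection heuristic (assumption: whitespace
# tokenization approximates the package's Tokenizer). The oldest cached message
# is compared against every later one; the loop is flagged once enough exact or
# near-duplicate matches accumulate.
from collections import deque

MESSAGE_SIMILARITY_THRESHOLD = 0.98
MAX_REPEATING_MESSAGES = 3


def word_overlap_similarity(a: str, b: str) -> float:
    """Jaccard-style overlap: shared words divided by the union of words."""
    words_a, words_b = set(a.split()), set(b.split())
    union = words_a | words_b
    return len(words_a & words_b) / len(union) if union else 0.0


def is_looping(messages: deque) -> bool:
    """True when the cache is full and the later messages (near-)match the oldest one."""
    if len(messages) < MAX_REPEATING_MESSAGES:
        return False
    oldest = messages[0]
    sim_count = sum(
        1
        for m in list(messages)[1:]
        if m == oldest
        or word_overlap_similarity(oldest, m) > MESSAGE_SIMILARITY_THRESHOLD
    )
    return sim_count >= MAX_REPEATING_MESSAGES - 1


cache = deque(maxlen=MAX_REPEATING_MESSAGES)
for reply in 3 * ["I cannot help with that."]:
    cache.append(reply)
print(is_looping(cache))  # True: three identical assistant replies trip the guard

With the default settings, three consecutive identical (or >98% word-overlapping) user or assistant messages fill the cache and trip the guard, which is what _is_end relies on to cut a simulated conversation short before MAX_CONVERSATION_STEPS is reached.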