ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from collections import deque
|
|
3
|
+
from typing import List, Tuple
|
|
4
|
+
|
|
5
|
+
import rich
|
|
6
|
+
|
|
7
|
+
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
8
|
+
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
9
|
+
from wxo_agentic_evaluation.runtime_adapter.runtime_adapter import (
|
|
10
|
+
RuntimeAdapter,
|
|
11
|
+
)
|
|
12
|
+
from wxo_agentic_evaluation.type import (
|
|
13
|
+
CallTracker,
|
|
14
|
+
ContentType,
|
|
15
|
+
ConversationalSearch,
|
|
16
|
+
Message,
|
|
17
|
+
Roles,
|
|
18
|
+
)
|
|
19
|
+
from wxo_agentic_evaluation.utils.utils import Tokenizer, safe_divide
|
|
20
|
+
|
|
21
|
+
# Module-level tokenizer instance, created once at import time and reused by
# calculate_word_overlap_similarity_score to tokenize message text.
tokenizer = Tokenizer()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def calculate_word_overlap_similarity_score(
    first_message_text: str, second_message_text: str
) -> float:
    """Score how much two message texts overlap in vocabulary.

    Both texts are tokenized with the module-level ``tokenizer``. The score is
    the number of distinct tokens present in both texts divided by the number
    of distinct tokens present in either text (intersection over union).

    Args:
        first_message_text (str): The .content field of the first message.
        second_message_text (str): The .content field of the second message.

    Returns:
        float: The result of ``safe_divide(shared, total)``.
    """
    first_vocabulary = set(tokenizer(first_message_text))
    second_vocabulary = set(tokenizer(second_message_text))

    shared_token_count = len(first_vocabulary & second_vocabulary)
    total_token_count = len(first_vocabulary | second_vocabulary)

    # NOTE(review): safe_divide presumably guards the empty/empty case where
    # the denominator is zero -- confirm against utils.safe_divide.
    return safe_divide(shared_token_count, total_token_count)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _generate_user_input(
    user_turn: int,
    story: str,
    conversation_history: list[Message],
    llm_user: LLMUser,
    enable_manual_user_input: bool = False,
    starting_user_input: str | None = None,
    attack_instructions: str | None = None,
) -> Message:
    """Produce the user message for the current turn.

    The source of the message is chosen in this order:

    1. ``starting_user_input``, when this is the first turn
       (``user_turn == 0``) and a starting input was supplied.
    2. Text typed on the console, when ``enable_manual_user_input`` is set.
    3. Otherwise, whatever ``llm_user.generate_user_input`` returns.

    Args:
        user_turn (int): Zero-based index of the current user turn.
        story (str): Scenario description forwarded to the LLM user.
        conversation_history (list[Message]): Messages exchanged so far,
            forwarded to the LLM user.
        llm_user (LLMUser): Simulated user used when no fixed or manual input
            applies.
        enable_manual_user_input (bool): Read the message from the console
            instead of generating it.
        starting_user_input (str | None): Fixed text for the very first turn.
        attack_instructions (str | None): Forwarded unchanged to the LLM user.

    Returns:
        Message: The user message for this turn.
    """
    if user_turn == 0 and starting_user_input is not None:
        return Message(
            role="user",
            content=starting_user_input,
            type=ContentType.text,
        )

    if enable_manual_user_input:
        # The prompt carries rich console markup. The builtin input() would
        # echo the tags verbatim, so prompt through the rich console, which
        # renders the markup before reading the line.
        content = rich.get_console().input(
            "[medium_orchid1]Enter your input[/medium_orchid1] ✍️: "
        )
        return Message(role="user", content=content, type=ContentType.text)

    # llm generated user input
    return llm_user.generate_user_input(
        story,
        conversation_history,
        attack_instructions=attack_instructions,
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EvaluationController:
    """Drives a simulated conversation between a user and an agent runtime.

    Each turn obtains a user message (fixed, manual or LLM generated), sends
    it to the runtime and records the returned messages. The conversation
    stops when the user message contains ``END``, when the user or the
    assistant keeps repeating itself, when a subclass hook asks to stop, or
    when the turn budget is exhausted.
    """

    # Default upper bound on user turns per run.
    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
    # Two cached messages whose word-overlap score exceeds this value are
    # treated as repeats of each other.
    MESSAGE_SIMILARITY_THRESHOLD = float(
        os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)
    )
    # Size of the per-role message cache. The run ends once the cache is full
    # and every cached message repeats the oldest one. Values below 2 disable
    # the repetition check.
    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3))

    def __init__(
        self,
        runtime: RuntimeAdapter,
        llm_user: LLMUser,
        config: TestConfig,
    ):
        """Store the collaborators and set up the repetition caches.

        Args:
            runtime (RuntimeAdapter): Adapter whose ``run`` method sends a
                user message to the agent.
            llm_user (LLMUser): Simulated user that generates user messages.
            config (TestConfig): Run configuration; this class reads
                ``enable_manual_user_input`` and ``enable_verbose_logging``.
        """
        self.runtime = runtime
        self.llm_user = llm_user
        self.config = config
        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2

        if self.repeating_output_detection:
            # Bounded deques drop the oldest entry automatically on append.
            self.recent_user_messages = deque(
                maxlen=self.MAX_REPEATING_MESSAGES
            )
            self.recent_assistant_messages = deque(
                maxlen=self.MAX_REPEATING_MESSAGES
            )

    def _reset_repetition_caches(self) -> None:
        """Empty the per-role message caches used for loop detection."""
        if self.repeating_output_detection:
            self.recent_user_messages.clear()
            self.recent_assistant_messages.clear()

    def run(
        self,
        task_n,
        story,
        agent_name: str,
        starting_user_input: str | None = None,
        attack_instructions: str | None = None,
        max_user_turns: int | None = None,
    ) -> Tuple[
        List[Message], CallTracker, List[ConversationalSearch], str | None
    ]:
        """Simulate one conversation and collect everything it produced.

        Args:
            task_n: Task identifier, used in log lines and error messages.
            story: Scenario description handed to the simulated user.
            agent_name (str): Passed to the runtime in the call context.
            starting_user_input (str | None): Fixed text for the first turn.
            attack_instructions (str | None): Forwarded to the simulated user.
            max_user_turns (int | None): Turn budget for this run; defaults
                to ``MAX_CONVERSATION_STEPS`` when ``None``.

        Returns:
            A tuple ``(conversation_history, call_tracker,
            conversational_search_history_data, thread_id)``. ``thread_id``
            is ``None`` when the runtime was never called.

        Raises:
            RuntimeError: If the runtime returns no messages for a turn.
        """
        # Every run starts a new thread, so messages cached by a previous run
        # on this controller must not count towards loop detection.
        self._reset_repetition_caches()

        thread_id = None
        conversation_history: List[Message] = []
        conversational_search_history_data = []
        call_tracker = CallTracker()

        max_turns = (
            self.MAX_CONVERSATION_STEPS
            if max_user_turns is None
            else max_user_turns
        )

        for user_turn in range(max_turns):
            user_input = _generate_user_input(
                user_turn=user_turn,
                story=story,
                conversation_history=conversation_history,
                llm_user=self.llm_user,
                enable_manual_user_input=self.config.enable_manual_user_input,
                starting_user_input=starting_user_input,
                attack_instructions=attack_instructions,
            )

            if self.config.enable_verbose_logging:
                rich.print(
                    f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                    user_input.content,
                )

            # The terminating user message is not added to the history.
            if self._is_end(user_input):
                break

            if self.repeating_output_detection:
                self.recent_user_messages.append(user_input.content)

            conversation_history.append(user_input)

            resp = self.runtime.run(
                user_input,
                context={
                    "agent_name": agent_name,
                    "call_tracker": call_tracker,
                },
                thread_id=thread_id,
            )
            messages = resp.messages
            # Reuse the thread id returned by the runtime on the next turn.
            thread_id = resp.thread_id
            call_tracker.metadata = {"thread_id": thread_id}
            conversational_search_data = resp.context.get(
                "conversational_search_data", []
            )
            if not messages:
                raise RuntimeError(
                    f"[Task-{task_n}] No messages is produced. Exiting task."
                )

            for message in messages:
                if (
                    self.repeating_output_detection
                    and message.role == Roles.ASSISTANT
                    and message.type == ContentType.text
                ):
                    self.recent_assistant_messages.append(message.content)

                if self.config.enable_verbose_logging:
                    rich.print(
                        f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
                        message.content,
                    )

                # hook for subclasses; a truthy result ends the run at once
                if self._post_message_hook(
                    task_n=task_n,
                    step=user_turn,
                    message=message,
                    conversation_history=conversation_history,
                ):
                    return (
                        conversation_history,
                        call_tracker,
                        conversational_search_history_data,
                        thread_id,
                    )

            conversation_history.extend(messages)
            conversational_search_history_data.extend(
                conversational_search_data
            )

        return (
            conversation_history,
            call_tracker,
            conversational_search_history_data,
            thread_id,
        )

    def _post_message_hook(self, **kwargs) -> bool:
        """
        Hook for subclasses to extend behavior.

        Called once per runtime message with the keyword arguments
        ``task_n``, ``step``, ``message`` and ``conversation_history``.
        Return True to break the loop early.
        """
        return False

    def _is_looping(self, messages: deque) -> bool:
        """Checks whether the user or assistant is stuck in a loop.

        Once the cache holds at least ``MAX_REPEATING_MESSAGES`` entries,
        every later entry is compared with the oldest one. An entry counts as
        a repeat when it is equal to the oldest entry or when their
        word-overlap score exceeds ``MESSAGE_SIMILARITY_THRESHOLD``.

        Args:
            messages (deque): Defines the message cache to be assessed for similarity.
        Returns:
            bool: True if stuck in a loop, False otherwise.
        """
        sim_count = 0

        if len(messages) >= self.MAX_REPEATING_MESSAGES:
            oldest_cached_message = messages[0]
            for i, old_message in enumerate(messages):
                if i == 0:
                    # the oldest entry is the reference, not a candidate
                    continue
                if oldest_cached_message == old_message:
                    sim_count += 1
                elif (
                    calculate_word_overlap_similarity_score(
                        oldest_cached_message, old_message
                    )
                    > self.MESSAGE_SIMILARITY_THRESHOLD
                ):
                    sim_count += 1

        return sim_count >= self.MAX_REPEATING_MESSAGES - 1

    def _is_end(self, current_user_input: Message) -> bool:
        """
        Check if the user input indicates the end of the conversation.

        - This function checks if the user input contains 'END'.
        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
        - The similarity check only runs when repeating-output detection is
          enabled, i.e. when MAX_REPEATING_MESSAGES is at least 2.
        Args:
            current_user_input (Message): The user message.
        Returns:
            bool: True if the user input indicates an END, False otherwise.
        """
        current_user_message_content = current_user_input.content.strip()

        # Check if the user message contains 'END'
        if "END" in current_user_message_content:
            return True

        if self.repeating_output_detection:
            # Check for repeating user or assistant messages
            if self._is_looping(self.recent_user_messages) or self._is_looping(
                self.recent_assistant_messages
            ):
                return True

        # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
        return False
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class AttackEvaluationController(EvaluationController):
    """Evaluation controller that checks for attack success after each message.

    When both an attack evaluator and attack data are supplied, every runtime
    message is evaluated as soon as it arrives, and the simulation stops at
    the first message for which the evaluator reports success.
    """

    def __init__(
        self, *args, attack_data=None, attack_evaluator=None, **kwargs
    ):
        """Set up the base controller and keep the attack collaborators.

        Args:
            *args: Positional arguments forwarded to ``EvaluationController``.
            attack_data: Attack description passed to the evaluator.
            attack_evaluator: Object providing ``evaluate`` and
                ``save_evaluation_result``.
            **kwargs: Keyword arguments forwarded to ``EvaluationController``.
        """
        super().__init__(*args, **kwargs)
        self.attack_data = attack_data
        self.attack_evaluator = attack_evaluator

    def _post_message_hook(
        self, task_n, step, message, conversation_history
    ) -> bool:
        """Override hook to add live attack evaluation.

        Args:
            task_n: Task identifier used in the log line.
            step: Index of the current user turn.
            message: The runtime message that was just received.
            conversation_history: History collected so far. On success
                ``message`` is appended to it in place.

        Returns:
            bool: True when the attack succeeded and the simulation should
            stop, False otherwise.
        """
        if self.attack_evaluator and self.attack_data:
            success = self.attack_evaluator.evaluate(
                self.attack_data, conversation_history + [message]
            )
            if success:
                rich.print(
                    f"[bold green]Attack for [Task-{task_n}] succeeded early at step {step}! Stopping simulation.[/bold green]"
                )
                # persist the live result so the aggregator can pick it up later
                try:
                    self.attack_evaluator.save_evaluation_result(
                        self.attack_data, True
                    )
                except Exception as exc:
                    # Persisting is best effort and must not abort the run,
                    # but the failure should be visible. Markup is disabled
                    # because the exception text may contain square brackets.
                    rich.get_console().print(
                        f"Warning: could not save the live attack result for Task-{task_n}: {exc!r}",
                        style="bold yellow",
                        markup=False,
                    )
                conversation_history.append(message)
                return True
        return False
|