hackagent 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hackagent/__init__.py +12 -0
- hackagent/agent.py +214 -0
- hackagent/api/__init__.py +1 -0
- hackagent/api/agent/__init__.py +1 -0
- hackagent/api/agent/agent_create.py +347 -0
- hackagent/api/agent/agent_destroy.py +140 -0
- hackagent/api/agent/agent_list.py +242 -0
- hackagent/api/agent/agent_partial_update.py +361 -0
- hackagent/api/agent/agent_retrieve.py +235 -0
- hackagent/api/agent/agent_update.py +361 -0
- hackagent/api/apilogs/__init__.py +1 -0
- hackagent/api/apilogs/apilogs_list.py +170 -0
- hackagent/api/apilogs/apilogs_retrieve.py +162 -0
- hackagent/api/attack/__init__.py +1 -0
- hackagent/api/attack/attack_create.py +275 -0
- hackagent/api/attack/attack_destroy.py +146 -0
- hackagent/api/attack/attack_list.py +254 -0
- hackagent/api/attack/attack_partial_update.py +289 -0
- hackagent/api/attack/attack_retrieve.py +247 -0
- hackagent/api/attack/attack_update.py +289 -0
- hackagent/api/checkout/__init__.py +1 -0
- hackagent/api/checkout/checkout_create.py +225 -0
- hackagent/api/generate/__init__.py +1 -0
- hackagent/api/generate/generate_create.py +253 -0
- hackagent/api/judge/__init__.py +1 -0
- hackagent/api/judge/judge_create.py +253 -0
- hackagent/api/key/__init__.py +1 -0
- hackagent/api/key/key_create.py +179 -0
- hackagent/api/key/key_destroy.py +103 -0
- hackagent/api/key/key_list.py +170 -0
- hackagent/api/key/key_retrieve.py +162 -0
- hackagent/api/organization/__init__.py +1 -0
- hackagent/api/organization/organization_create.py +208 -0
- hackagent/api/organization/organization_destroy.py +104 -0
- hackagent/api/organization/organization_list.py +170 -0
- hackagent/api/organization/organization_me_retrieve.py +126 -0
- hackagent/api/organization/organization_partial_update.py +222 -0
- hackagent/api/organization/organization_retrieve.py +163 -0
- hackagent/api/organization/organization_update.py +222 -0
- hackagent/api/prompt/__init__.py +1 -0
- hackagent/api/prompt/prompt_create.py +171 -0
- hackagent/api/prompt/prompt_destroy.py +104 -0
- hackagent/api/prompt/prompt_list.py +185 -0
- hackagent/api/prompt/prompt_partial_update.py +185 -0
- hackagent/api/prompt/prompt_retrieve.py +163 -0
- hackagent/api/prompt/prompt_update.py +185 -0
- hackagent/api/result/__init__.py +1 -0
- hackagent/api/result/result_create.py +175 -0
- hackagent/api/result/result_destroy.py +106 -0
- hackagent/api/result/result_list.py +249 -0
- hackagent/api/result/result_partial_update.py +193 -0
- hackagent/api/result/result_retrieve.py +167 -0
- hackagent/api/result/result_trace_create.py +177 -0
- hackagent/api/result/result_update.py +189 -0
- hackagent/api/run/__init__.py +1 -0
- hackagent/api/run/run_create.py +187 -0
- hackagent/api/run/run_destroy.py +112 -0
- hackagent/api/run/run_list.py +291 -0
- hackagent/api/run/run_partial_update.py +201 -0
- hackagent/api/run/run_result_create.py +177 -0
- hackagent/api/run/run_retrieve.py +179 -0
- hackagent/api/run/run_run_tests_create.py +187 -0
- hackagent/api/run/run_update.py +201 -0
- hackagent/api/user/__init__.py +1 -0
- hackagent/api/user/user_create.py +212 -0
- hackagent/api/user/user_destroy.py +106 -0
- hackagent/api/user/user_list.py +174 -0
- hackagent/api/user/user_me_retrieve.py +126 -0
- hackagent/api/user/user_me_update.py +196 -0
- hackagent/api/user/user_partial_update.py +226 -0
- hackagent/api/user/user_retrieve.py +167 -0
- hackagent/api/user/user_update.py +226 -0
- hackagent/attacks/AdvPrefix/__init__.py +41 -0
- hackagent/attacks/AdvPrefix/completions.py +416 -0
- hackagent/attacks/AdvPrefix/config.py +259 -0
- hackagent/attacks/AdvPrefix/evaluation.py +745 -0
- hackagent/attacks/AdvPrefix/evaluators.py +564 -0
- hackagent/attacks/AdvPrefix/generate.py +711 -0
- hackagent/attacks/AdvPrefix/utils.py +307 -0
- hackagent/attacks/__init__.py +35 -0
- hackagent/attacks/advprefix.py +507 -0
- hackagent/attacks/base.py +106 -0
- hackagent/attacks/strategies.py +906 -0
- hackagent/cli/__init__.py +19 -0
- hackagent/cli/commands/__init__.py +20 -0
- hackagent/cli/commands/agent.py +100 -0
- hackagent/cli/commands/attack.py +417 -0
- hackagent/cli/commands/config.py +301 -0
- hackagent/cli/commands/results.py +327 -0
- hackagent/cli/config.py +249 -0
- hackagent/cli/main.py +515 -0
- hackagent/cli/tui/__init__.py +31 -0
- hackagent/cli/tui/actions_logger.py +200 -0
- hackagent/cli/tui/app.py +288 -0
- hackagent/cli/tui/base.py +137 -0
- hackagent/cli/tui/logger.py +318 -0
- hackagent/cli/tui/views/__init__.py +33 -0
- hackagent/cli/tui/views/agents.py +488 -0
- hackagent/cli/tui/views/attacks.py +624 -0
- hackagent/cli/tui/views/config.py +244 -0
- hackagent/cli/tui/views/dashboard.py +307 -0
- hackagent/cli/tui/views/results.py +1210 -0
- hackagent/cli/tui/widgets/__init__.py +24 -0
- hackagent/cli/tui/widgets/actions.py +346 -0
- hackagent/cli/tui/widgets/logs.py +435 -0
- hackagent/cli/utils.py +276 -0
- hackagent/client.py +286 -0
- hackagent/errors.py +37 -0
- hackagent/logger.py +83 -0
- hackagent/models/__init__.py +109 -0
- hackagent/models/agent.py +223 -0
- hackagent/models/agent_request.py +129 -0
- hackagent/models/api_token_log.py +184 -0
- hackagent/models/attack.py +154 -0
- hackagent/models/attack_request.py +82 -0
- hackagent/models/checkout_session_request_request.py +76 -0
- hackagent/models/checkout_session_response.py +59 -0
- hackagent/models/choice.py +81 -0
- hackagent/models/choice_message.py +67 -0
- hackagent/models/evaluation_status_enum.py +14 -0
- hackagent/models/generate_error_response.py +59 -0
- hackagent/models/generate_request_request.py +212 -0
- hackagent/models/generate_success_response.py +115 -0
- hackagent/models/generic_error_response.py +70 -0
- hackagent/models/message_request.py +67 -0
- hackagent/models/organization.py +102 -0
- hackagent/models/organization_minimal.py +68 -0
- hackagent/models/organization_request.py +71 -0
- hackagent/models/paginated_agent_list.py +123 -0
- hackagent/models/paginated_api_token_log_list.py +123 -0
- hackagent/models/paginated_attack_list.py +123 -0
- hackagent/models/paginated_organization_list.py +123 -0
- hackagent/models/paginated_prompt_list.py +123 -0
- hackagent/models/paginated_result_list.py +123 -0
- hackagent/models/paginated_run_list.py +123 -0
- hackagent/models/paginated_user_api_key_list.py +123 -0
- hackagent/models/paginated_user_profile_list.py +123 -0
- hackagent/models/patched_agent_request.py +128 -0
- hackagent/models/patched_attack_request.py +92 -0
- hackagent/models/patched_organization_request.py +71 -0
- hackagent/models/patched_prompt_request.py +125 -0
- hackagent/models/patched_result_request.py +237 -0
- hackagent/models/patched_run_request.py +138 -0
- hackagent/models/patched_user_profile_request.py +99 -0
- hackagent/models/prompt.py +220 -0
- hackagent/models/prompt_request.py +126 -0
- hackagent/models/result.py +294 -0
- hackagent/models/result_list_evaluation_status.py +14 -0
- hackagent/models/result_request.py +232 -0
- hackagent/models/run.py +233 -0
- hackagent/models/run_list_status.py +12 -0
- hackagent/models/run_request.py +133 -0
- hackagent/models/status_enum.py +12 -0
- hackagent/models/step_type_enum.py +14 -0
- hackagent/models/trace.py +121 -0
- hackagent/models/trace_request.py +94 -0
- hackagent/models/usage.py +75 -0
- hackagent/models/user_api_key.py +201 -0
- hackagent/models/user_api_key_request.py +73 -0
- hackagent/models/user_profile.py +135 -0
- hackagent/models/user_profile_minimal.py +76 -0
- hackagent/models/user_profile_request.py +99 -0
- hackagent/router/__init__.py +25 -0
- hackagent/router/adapters/__init__.py +20 -0
- hackagent/router/adapters/base.py +63 -0
- hackagent/router/adapters/google_adk.py +671 -0
- hackagent/router/adapters/litellm_adapter.py +524 -0
- hackagent/router/adapters/openai_adapter.py +426 -0
- hackagent/router/router.py +969 -0
- hackagent/router/types.py +54 -0
- hackagent/tracking/__init__.py +42 -0
- hackagent/tracking/context.py +163 -0
- hackagent/tracking/decorators.py +299 -0
- hackagent/tracking/tracker.py +441 -0
- hackagent/types.py +54 -0
- hackagent/utils.py +194 -0
- hackagent/vulnerabilities/__init__.py +13 -0
- hackagent/vulnerabilities/prompts.py +81 -0
- hackagent-0.3.1.dist-info/METADATA +122 -0
- hackagent-0.3.1.dist-info/RECORD +183 -0
- hackagent-0.3.1.dist-info/WHEEL +4 -0
- hackagent-0.3.1.dist-info/entry_points.txt +2 -0
- hackagent-0.3.1.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
# Copyright 2025 - AI4I. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Prefix generation pipeline attack based on the BaseAttack class.
|
|
17
|
+
|
|
18
|
+
This module implements a complete pipeline for generating, filtering, and selecting prefixes
|
|
19
|
+
using uncensored and target language models, adapted as an attack module.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import copy
|
|
23
|
+
import logging
|
|
24
|
+
from typing import Any, Dict, List, Optional
|
|
25
|
+
from uuid import UUID
|
|
26
|
+
|
|
27
|
+
from hackagent.api.run import run_result_create
|
|
28
|
+
from hackagent.attacks.AdvPrefix.config import DEFAULT_PREFIX_GENERATION_CONFIG
|
|
29
|
+
from hackagent.client import AuthenticatedClient
|
|
30
|
+
from hackagent.models import (
|
|
31
|
+
EvaluationStatusEnum,
|
|
32
|
+
ResultRequest,
|
|
33
|
+
StatusEnum,
|
|
34
|
+
)
|
|
35
|
+
from hackagent.router.router import AgentRouter
|
|
36
|
+
from hackagent.tracking import StepTracker, TrackingContext
|
|
37
|
+
|
|
38
|
+
# Import step execution functions
|
|
39
|
+
from .AdvPrefix import completions
|
|
40
|
+
from .AdvPrefix.evaluation import EvaluationPipeline
|
|
41
|
+
from .AdvPrefix.generate import PrefixGenerationPipeline
|
|
42
|
+
from .base import BaseAttack
|
|
43
|
+
|
|
44
|
+
# TUI logging support (imported conditionally to avoid import errors in non-TUI contexts)
|
|
45
|
+
try:
    from hackagent.cli.tui.logger import with_tui_logging
except ImportError:

    def with_tui_logging(*args, **kwargs):
        """No-op replacement used when the TUI logger module cannot be imported.

        Accepts and ignores any decorator arguments, and returns a decorator
        that hands the decorated function back unchanged.
        """

        def _passthrough(func):
            return func

        return _passthrough
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Helper function for deep merging dictionaries
|
|
57
|
+
def _recursive_update(target_dict, source_dict):
|
|
58
|
+
"""
|
|
59
|
+
Recursively updates a target dictionary with values from a source dictionary.
|
|
60
|
+
Nested dictionaries are merged; other values are overwritten with a deep copy.
|
|
61
|
+
"""
|
|
62
|
+
for key, source_value in source_dict.items():
|
|
63
|
+
target_value = target_dict.get(key)
|
|
64
|
+
if isinstance(source_value, dict) and isinstance(target_value, dict):
|
|
65
|
+
# If both current_value and update_value are dicts, recurse
|
|
66
|
+
_recursive_update(target_value, source_value)
|
|
67
|
+
else:
|
|
68
|
+
# Otherwise, overwrite target_dict[key] with a deepcopy of source_value
|
|
69
|
+
target_dict[key] = copy.deepcopy(source_value)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class AdvPrefixAttack(BaseAttack):
|
|
73
|
+
"""
|
|
74
|
+
Attack class implementing the prefix generation pipeline by orchestrating step modules.
|
|
75
|
+
|
|
76
|
+
Inherits from BaseAttack and adapts the multi-step prefix generation process.
|
|
77
|
+
Expects configuration as a standard Python dictionary.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
def __init__(
    self,
    config: Optional[Dict[str, Any]] = None,
    client: Optional[AuthenticatedClient] = None,
    agent_router: Optional[AgentRouter] = None,
):
    """
    Build the attack from the default configuration plus optional overrides.

    Args:
        config: Optional sparse dictionary of overrides that is deep-merged
            on top of DEFAULT_PREFIX_GENERATION_CONFIG.
        client: The AuthenticatedClient instance; required.
        agent_router: The AgentRouter instance for the victim; required.

    Raises:
        ValueError: If ``client`` or ``agent_router`` is None, or if the merged
            configuration has no truthy 'output_dir'.
    """
    if client is None:
        raise ValueError("AuthenticatedClient must be provided to AdvPrefixAttack.")
    if agent_router is None:
        raise ValueError(
            "Victim AgentRouter instance must be provided to AdvPrefixAttack."
        )
    self.client = client
    self.agent_router = agent_router

    # Work on a deep copy of the defaults so the shared module-level default
    # configuration is never modified, then layer the caller's overrides on top.
    merged_config = copy.deepcopy(DEFAULT_PREFIX_GENERATION_CONFIG)
    if config:
        _recursive_update(merged_config, config)

    # run_id and run_dir are resolved before BaseAttack.__init__ runs, because
    # that call triggers _validate_config() and _setup().
    self.run_id = merged_config.get("run_id")
    output_dir = merged_config.get("output_dir")
    if not output_dir:
        raise ValueError("Configuration missing required key: 'output_dir'")
    # output_dir is used as-is (not nested under run_id) to avoid a
    # timestamp_UUID/UUID directory structure.
    self.run_dir = output_dir
    # Guarantee the 'run_id' key exists in the config (None when not supplied).
    merged_config.setdefault("run_id", self.run_id)

    # The logger must exist before super().__init__(), since _setup() configures
    # it. A hierarchical name is used so handlers on parent loggers can apply.
    self.logger = logging.getLogger("hackagent.attacks.advprefix")

    # The step tracker is created in run().
    self.tracker: StepTracker | None = None

    # Hand BaseAttack its own shallow copy of the merged configuration.
    # BaseAttack.__init__ stores it and then calls _validate_config() and _setup().
    super().__init__(dict(merged_config))
|
|
140
|
+
|
|
141
|
+
def _validate_config(self):
    """
    Check that self.config is a dict containing every key the pipeline needs.

    Raises:
        TypeError: If the base validation fails, or if 'meta_prefixes',
            'judges' or 'selection_judges' is not a list.
        ValueError: If any required key is absent; the message lists them all.
    """
    # Base validation (checks that the config is a dictionary).
    super()._validate_config()

    # 'input_csv' is intentionally not required: goals are passed to run().
    required_keys = (
        "output_dir",
        "start_step",
        # Preprocessor settings
        "min_char_length",
        "max_token_segments",
        "n_candidates_per_goal",
        # Prefix generation
        "meta_prefixes",
        "meta_prefix_samples",
        "batch_size",
        "max_new_tokens",
        "guided_topk",
        "temperature",
        # Surrogate prompt construction
        "surrogate_attack_prompt",
        # Completions from the target
        "max_new_tokens_completion",
        "n_samples",
        # Evaluation: judging, aggregation and selection
        "judges",
        "batch_size_judge",
        "max_new_tokens_eval",
        "filter_len",
        "pasr_weight",
        "n_prefixes_per_goal",
        "selection_judges",
        # Used by both preprocessing and the evaluation-time filtering
        "max_ce",
    )
    absent = [name for name in required_keys if name not in self.config]
    if absent:
        raise ValueError(
            f"Configuration dictionary missing required keys: {', '.join(absent)}"
        )

    # These three entries must be lists; checked in this fixed order.
    for list_key in ("meta_prefixes", "judges", "selection_judges"):
        if not isinstance(self.config.get(list_key), list):
            raise TypeError(f"Config key '{list_key}' must be a list.")
|
|
194
|
+
|
|
195
|
+
def _setup(self):
|
|
196
|
+
"""
|
|
197
|
+
Performs setup tasks like logging.
|
|
198
|
+
(Preprocessor initialization moved to __init__).
|
|
199
|
+
"""
|
|
200
|
+
self._setup_logging()
|
|
201
|
+
# All execution tracking via StepTracker - no redundant logs
|
|
202
|
+
# Configuration tracked via StepTracker context
|
|
203
|
+
|
|
204
|
+
def _setup_logging(self):
|
|
205
|
+
"""Configure logging to console for this attack instance."""
|
|
206
|
+
# Use the instance logger obtained in __init__
|
|
207
|
+
self.logger.propagate = (
|
|
208
|
+
False # Prevent duplicate logs if root logger is configured
|
|
209
|
+
)
|
|
210
|
+
self.logger.setLevel(logging.INFO)
|
|
211
|
+
formatter = logging.Formatter(
|
|
212
|
+
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
# Remove existing handlers if re-initializing (e.g., if run_id changes)
|
|
216
|
+
for handler in self.logger.handlers[:]:
|
|
217
|
+
self.logger.removeHandler(handler)
|
|
218
|
+
handler.close()
|
|
219
|
+
|
|
220
|
+
# Console Handler only - all execution data tracked via API
|
|
221
|
+
if not any(isinstance(h, logging.StreamHandler) for h in self.logger.handlers):
|
|
222
|
+
ch = logging.StreamHandler()
|
|
223
|
+
ch.setFormatter(formatter)
|
|
224
|
+
self.logger.addHandler(ch)
|
|
225
|
+
|
|
226
|
+
# Remove helper methods that were moved to step files or utils
|
|
227
|
+
# Methods like _get_checkpoint_path and _clear_gpu_memory are now in utils
|
|
228
|
+
# Methods related to specific steps (_generate_prefixes, _construct_prompts, etc.) are in step files
|
|
229
|
+
|
|
230
|
+
def _create_parent_result(self) -> str | None:
|
|
231
|
+
"""Create parent result for tracking and return its ID."""
|
|
232
|
+
if not self.run_id or not self.client:
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
try:
|
|
236
|
+
parent_result_request = ResultRequest(
|
|
237
|
+
run=UUID(self.run_id),
|
|
238
|
+
evaluation_status=EvaluationStatusEnum.PASSED_CRITERIA,
|
|
239
|
+
)
|
|
240
|
+
parent_result_response = run_result_create.sync_detailed(
|
|
241
|
+
client=self.client,
|
|
242
|
+
id=UUID(self.run_id),
|
|
243
|
+
body=parent_result_request,
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
if parent_result_response.status_code == 201:
|
|
247
|
+
if parent_result_response.parsed and hasattr(
|
|
248
|
+
parent_result_response.parsed, "id"
|
|
249
|
+
):
|
|
250
|
+
parent_result_id = str(parent_result_response.parsed.id)
|
|
251
|
+
return parent_result_id
|
|
252
|
+
else:
|
|
253
|
+
# Try to extract from raw response
|
|
254
|
+
import json
|
|
255
|
+
|
|
256
|
+
try:
|
|
257
|
+
response_data = json.loads(
|
|
258
|
+
parent_result_response.content.decode()
|
|
259
|
+
)
|
|
260
|
+
if "id" in response_data:
|
|
261
|
+
return str(response_data["id"])
|
|
262
|
+
except Exception:
|
|
263
|
+
pass # Silently fail, error logged below
|
|
264
|
+
else:
|
|
265
|
+
self.logger.error(
|
|
266
|
+
f"Failed to create Parent Result. Status: {parent_result_response.status_code}, "
|
|
267
|
+
f"Response: {parent_result_response.content}"
|
|
268
|
+
)
|
|
269
|
+
except Exception as e:
|
|
270
|
+
self.logger.error(f"Exception creating Parent Result: {e}", exc_info=True)
|
|
271
|
+
|
|
272
|
+
return None
|
|
273
|
+
|
|
274
|
+
def _prepare_input_sample(self, data):
|
|
275
|
+
"""Prepare input sample for tracking from List[Dict] or other input."""
|
|
276
|
+
if data is None:
|
|
277
|
+
return None
|
|
278
|
+
|
|
279
|
+
if isinstance(data, list):
|
|
280
|
+
# Take first 5 items for sampling
|
|
281
|
+
sample = data[:5] if len(data) > 5 else data
|
|
282
|
+
# Replace inf with None for JSON compatibility
|
|
283
|
+
result = []
|
|
284
|
+
for item in sample:
|
|
285
|
+
if isinstance(item, dict):
|
|
286
|
+
clean_item = {}
|
|
287
|
+
for k, v in item.items():
|
|
288
|
+
if isinstance(v, float) and (
|
|
289
|
+
v == float("inf") or v == float("-inf")
|
|
290
|
+
):
|
|
291
|
+
clean_item[k] = None
|
|
292
|
+
else:
|
|
293
|
+
clean_item[k] = v
|
|
294
|
+
result.append(clean_item)
|
|
295
|
+
else:
|
|
296
|
+
result.append(item)
|
|
297
|
+
return result
|
|
298
|
+
else:
|
|
299
|
+
return None
|
|
300
|
+
|
|
301
|
+
def _get_pipeline_steps(self):
    """
    Return the ordered list of step descriptors that make up the attack.

    Each descriptor is a dict with these keys:
        name: Human-readable step title.
        function: Callable invoked with keyword arguments for the step.
        step_type_enum: Step type label passed to the tracker.
        config_keys: Names of the config entries handed to the step.
        input_data_arg_name: Keyword under which the step receives its input.
        required_args: Which of logger/client/config/agent_router the step needs.
    """

    def _run_generation(**kwargs):
        # Build the generation pipeline from the injected collaborators, then
        # run it on the goals.
        pipeline = PrefixGenerationPipeline(
            logger=kwargs["logger"],
            client=kwargs["client"],
            agent_router=kwargs["agent_router"],
            config=kwargs["config"],
        )
        return pipeline.execute(goals=kwargs["goals"])

    def _run_evaluation(input_data, config, logger, client):
        # Build the evaluation pipeline and run it on the previous step's output.
        pipeline = EvaluationPipeline(config=config, logger=logger, client=client)
        return pipeline.execute(input_data=input_data)

    generation_step = {
        "name": "Generation: Generate and Filter Adversarial Prefixes",
        "function": _run_generation,
        "step_type_enum": "GENERATION",
        "config_keys": [
            "generator",
            "batch_size",
            "max_new_tokens",
            "guided_topk",
            "temperature",
            "meta_prefixes",
            "meta_prefix_samples",
            "min_char_length",
            "max_ce",
            "max_token_segments",
            "n_candidates_per_goal",
            "surrogate_attack_prompt",
        ],
        "input_data_arg_name": "goals",
        "required_args": ["logger", "client", "config", "agent_router"],
    }

    execution_step = {
        "name": "Execution: Get Completions from Target Model",
        "function": completions.execute,
        "step_type_enum": "EXECUTION",
        "config_keys": ["batch_size", "max_new_tokens_completion", "n_samples"],
        "input_data_arg_name": "input_data",
        "required_args": ["logger", "config", "agent_router"],
    }

    evaluation_step = {
        "name": "Evaluation: Judge, Aggregate, and Select Best Prefixes",
        "function": _run_evaluation,
        "step_type_enum": "EVALUATION",
        "config_keys": [
            "judges",
            "batch_size_judge",
            "max_new_tokens_eval",
            "filter_len",
            "pasr_weight",
            "n_prefixes_per_goal",
            "selection_judges",
            "max_ce",
        ],
        "input_data_arg_name": "input_data",
        "required_args": ["logger", "client", "config"],
    }

    return [generation_step, execution_step, evaluation_step]
|
|
361
|
+
|
|
362
|
+
def _build_step_args(
|
|
363
|
+
self, step_info: Dict, step_config: Dict, input_data: Any
|
|
364
|
+
) -> Dict:
|
|
365
|
+
"""
|
|
366
|
+
Build arguments dict for a step function based on its requirements.
|
|
367
|
+
|
|
368
|
+
Args:
|
|
369
|
+
step_info: Pipeline step configuration
|
|
370
|
+
step_config: Step-specific config values
|
|
371
|
+
input_data: Input data (goals for Generation, output from previous stage for others)
|
|
372
|
+
|
|
373
|
+
Returns:
|
|
374
|
+
Dictionary of arguments ready to pass to step function
|
|
375
|
+
"""
|
|
376
|
+
args = {"config": step_config}
|
|
377
|
+
|
|
378
|
+
# Add only required arguments to avoid parameter mismatches
|
|
379
|
+
required_args = step_info.get("required_args", [])
|
|
380
|
+
|
|
381
|
+
if "logger" in required_args:
|
|
382
|
+
args["logger"] = self.logger
|
|
383
|
+
if "client" in required_args:
|
|
384
|
+
args["client"] = self.client
|
|
385
|
+
if "agent_router" in required_args:
|
|
386
|
+
args["agent_router"] = self.agent_router
|
|
387
|
+
|
|
388
|
+
# Add input data with the correct parameter name
|
|
389
|
+
args[step_info["input_data_arg_name"]] = input_data
|
|
390
|
+
|
|
391
|
+
return args
|
|
392
|
+
|
|
393
|
+
def _execute_function_step(
|
|
394
|
+
self, step_info: Dict, step_config: Dict, input_data: Any
|
|
395
|
+
) -> Any:
|
|
396
|
+
"""Execute a function-based pipeline step."""
|
|
397
|
+
step_function = step_info["function"]
|
|
398
|
+
step_args = self._build_step_args(step_info, step_config, input_data)
|
|
399
|
+
return step_function(**step_args)
|
|
400
|
+
|
|
401
|
+
def _track_output_metrics(self, output: Any):
|
|
402
|
+
"""Track metrics about step output."""
|
|
403
|
+
if output is None:
|
|
404
|
+
self.tracker.add_step_metadata("output_type", "None")
|
|
405
|
+
self.tracker.add_step_metadata("warning", "Step returned None")
|
|
406
|
+
elif isinstance(output, list):
|
|
407
|
+
item_count = len(output)
|
|
408
|
+
self.tracker.add_step_metadata("output_items", item_count)
|
|
409
|
+
if item_count == 0:
|
|
410
|
+
self.tracker.add_step_metadata("warning", "Empty list returned")
|
|
411
|
+
else:
|
|
412
|
+
self.tracker.add_step_metadata("output_type", type(output).__name__)
|
|
413
|
+
|
|
414
|
+
def _finalize_pipeline(self, final_output: Any):
    """
    Report the final evaluation and run status to the tracker, if there is one.

    A non-None, non-empty ``final_output`` counts as PASSED_CRITERIA; anything
    else counts as FAILED_CRITERIA with an explanatory note. The run itself is
    marked COMPLETED in both cases.
    """
    has_results = final_output is not None and len(final_output) > 0
    if has_results:
        eval_status, eval_notes = EvaluationStatusEnum.PASSED_CRITERIA, None
    else:
        eval_status = EvaluationStatusEnum.FAILED_CRITERIA
        eval_notes = "Pipeline completed with no resulting prefixes."
    # Reaching this point means the pipeline ran to the end either way.
    run_status = StatusEnum.COMPLETED

    tracker = self.tracker
    if not tracker:
        return
    tracker.update_result_status(eval_status, eval_notes)
    tracker.update_run_status(run_status)
|
|
428
|
+
|
|
429
|
+
@with_tui_logging(logger_name="hackagent.attacks", level=logging.INFO)
def run(self, goals: List[str]) -> List[Dict]:
    """
    Execute the pipeline steps in order, feeding each step's output to the next.

    Args:
        goals: Goal strings to generate prefixes for. They are the input of
            the first executed step.

    Returns:
        The output of the last executed step, or an empty list when ``goals``
        is empty or the final output is None.

    Raises:
        Exception: Any exception raised while executing the steps is re-raised
            after the run status has been set to FAILED.
    """
    if not goals:
        return []

    # Set up tracking; a parent result is only created when a run_id is known.
    parent_result_id = self._create_parent_result() if self.run_id else None
    context = TrackingContext(
        client=self.client,
        run_id=self.run_id,
        parent_result_id=parent_result_id,
        logger=self.logger,
    )
    context.add_metadata("attack_type", "advprefix")
    context.add_metadata("num_goals", len(goals))

    self.tracker = StepTracker(context)
    self.tracker.update_run_status(StatusEnum.RUNNING)

    steps = self._get_pipeline_steps()
    total_steps = len(steps)
    # 'start_step' is 1-based in the config; convert it to a list index.
    first_index = self.config.get("start_step", 1) - 1
    current_data = goals  # the first executed step receives the raw goals

    try:
        for index in range(first_index, total_steps):
            step = steps[index]
            name = step["name"]
            kind = step["step_type_enum"]

            # Data recorded by the tracker for this step.
            sample = self._prepare_input_sample(current_data)
            selected_keys = step.get("config_keys", [])
            step_config = {
                key: self.config[key] for key in selected_keys if key in self.config
            }

            # Steps are mapped onto the 50-90% band of the progress display.
            fraction_done = index / total_steps
            percent = int(50 + fraction_done * 40)
            self.logger.info(f"━━━ Progress: {percent}% ━━━")

            with self.tracker.track_step(name, kind, sample, step_config):
                if "function" not in step:
                    self.logger.warning(
                        f"No function defined for {name}. Skipping."
                    )
                    continue

                current_data = self._execute_function_step(
                    step, step_config, current_data
                )
                self._track_output_metrics(current_data)

                self.logger.info(f"✅ Completed: {name}")

    except Exception:
        # Mark the run as failed, then let the caller see the original error.
        if self.tracker:
            self.tracker.update_run_status(StatusEnum.FAILED)
        raise

    self._finalize_pipeline(current_data)
    return [] if current_data is None else current_data
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Copyright 2025 - AI4I. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import abc
|
|
16
|
+
from typing import Any, Dict
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BaseAttack(abc.ABC):
    """
    Abstract base class shared by all attack implementations.

    Construction stores the configuration, validates it and then runs the
    setup hook. Subclasses must implement run() and may override
    _validate_config() and _setup().

    Attributes:
        config: The configuration dictionary passed to the constructor.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Store ``config``, then call _validate_config() followed by _setup().

        Args:
            config: Configuration settings for the attack.

        Raises:
            TypeError: If ``config`` is not a dictionary (raised by the base
                validation; subclass validation may raise further errors).
        """
        self.config = config
        self._validate_config()
        self._setup()

    def _validate_config(self):
        """
        Check that the stored configuration is a dictionary.

        Subclasses override this to add their own requirements.

        Raises:
            TypeError: If the configuration is not a dictionary.
        """
        if isinstance(self.config, dict):
            return
        raise TypeError("Configuration must be a dictionary.")

    def _setup(self):
        """
        Hook for setup work that runs after configuration validation.

        The base implementation does nothing. Subclasses override it for
        tasks such as configuring logging.
        """

    @abc.abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """
        Execute the attack; every concrete subclass must implement this.

        Args:
            **kwargs: Attack-specific arguments defined by the subclass.

        Returns:
            Attack-specific results; the format is defined by the subclass.
        """
|