hackagent 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. hackagent/__init__.py +12 -0
  2. hackagent/agent.py +214 -0
  3. hackagent/api/__init__.py +1 -0
  4. hackagent/api/agent/__init__.py +1 -0
  5. hackagent/api/agent/agent_create.py +347 -0
  6. hackagent/api/agent/agent_destroy.py +140 -0
  7. hackagent/api/agent/agent_list.py +242 -0
  8. hackagent/api/agent/agent_partial_update.py +361 -0
  9. hackagent/api/agent/agent_retrieve.py +235 -0
  10. hackagent/api/agent/agent_update.py +361 -0
  11. hackagent/api/apilogs/__init__.py +1 -0
  12. hackagent/api/apilogs/apilogs_list.py +170 -0
  13. hackagent/api/apilogs/apilogs_retrieve.py +162 -0
  14. hackagent/api/attack/__init__.py +1 -0
  15. hackagent/api/attack/attack_create.py +275 -0
  16. hackagent/api/attack/attack_destroy.py +146 -0
  17. hackagent/api/attack/attack_list.py +254 -0
  18. hackagent/api/attack/attack_partial_update.py +289 -0
  19. hackagent/api/attack/attack_retrieve.py +247 -0
  20. hackagent/api/attack/attack_update.py +289 -0
  21. hackagent/api/checkout/__init__.py +1 -0
  22. hackagent/api/checkout/checkout_create.py +225 -0
  23. hackagent/api/generate/__init__.py +1 -0
  24. hackagent/api/generate/generate_create.py +253 -0
  25. hackagent/api/judge/__init__.py +1 -0
  26. hackagent/api/judge/judge_create.py +253 -0
  27. hackagent/api/key/__init__.py +1 -0
  28. hackagent/api/key/key_create.py +179 -0
  29. hackagent/api/key/key_destroy.py +103 -0
  30. hackagent/api/key/key_list.py +170 -0
  31. hackagent/api/key/key_retrieve.py +162 -0
  32. hackagent/api/organization/__init__.py +1 -0
  33. hackagent/api/organization/organization_create.py +208 -0
  34. hackagent/api/organization/organization_destroy.py +104 -0
  35. hackagent/api/organization/organization_list.py +170 -0
  36. hackagent/api/organization/organization_me_retrieve.py +126 -0
  37. hackagent/api/organization/organization_partial_update.py +222 -0
  38. hackagent/api/organization/organization_retrieve.py +163 -0
  39. hackagent/api/organization/organization_update.py +222 -0
  40. hackagent/api/prompt/__init__.py +1 -0
  41. hackagent/api/prompt/prompt_create.py +171 -0
  42. hackagent/api/prompt/prompt_destroy.py +104 -0
  43. hackagent/api/prompt/prompt_list.py +185 -0
  44. hackagent/api/prompt/prompt_partial_update.py +185 -0
  45. hackagent/api/prompt/prompt_retrieve.py +163 -0
  46. hackagent/api/prompt/prompt_update.py +185 -0
  47. hackagent/api/result/__init__.py +1 -0
  48. hackagent/api/result/result_create.py +175 -0
  49. hackagent/api/result/result_destroy.py +106 -0
  50. hackagent/api/result/result_list.py +249 -0
  51. hackagent/api/result/result_partial_update.py +193 -0
  52. hackagent/api/result/result_retrieve.py +167 -0
  53. hackagent/api/result/result_trace_create.py +177 -0
  54. hackagent/api/result/result_update.py +189 -0
  55. hackagent/api/run/__init__.py +1 -0
  56. hackagent/api/run/run_create.py +187 -0
  57. hackagent/api/run/run_destroy.py +112 -0
  58. hackagent/api/run/run_list.py +291 -0
  59. hackagent/api/run/run_partial_update.py +201 -0
  60. hackagent/api/run/run_result_create.py +177 -0
  61. hackagent/api/run/run_retrieve.py +179 -0
  62. hackagent/api/run/run_run_tests_create.py +187 -0
  63. hackagent/api/run/run_update.py +201 -0
  64. hackagent/api/user/__init__.py +1 -0
  65. hackagent/api/user/user_create.py +212 -0
  66. hackagent/api/user/user_destroy.py +106 -0
  67. hackagent/api/user/user_list.py +174 -0
  68. hackagent/api/user/user_me_retrieve.py +126 -0
  69. hackagent/api/user/user_me_update.py +196 -0
  70. hackagent/api/user/user_partial_update.py +226 -0
  71. hackagent/api/user/user_retrieve.py +167 -0
  72. hackagent/api/user/user_update.py +226 -0
  73. hackagent/attacks/AdvPrefix/__init__.py +41 -0
  74. hackagent/attacks/AdvPrefix/completions.py +416 -0
  75. hackagent/attacks/AdvPrefix/config.py +259 -0
  76. hackagent/attacks/AdvPrefix/evaluation.py +745 -0
  77. hackagent/attacks/AdvPrefix/evaluators.py +564 -0
  78. hackagent/attacks/AdvPrefix/generate.py +711 -0
  79. hackagent/attacks/AdvPrefix/utils.py +307 -0
  80. hackagent/attacks/__init__.py +35 -0
  81. hackagent/attacks/advprefix.py +507 -0
  82. hackagent/attacks/base.py +106 -0
  83. hackagent/attacks/strategies.py +906 -0
  84. hackagent/cli/__init__.py +19 -0
  85. hackagent/cli/commands/__init__.py +20 -0
  86. hackagent/cli/commands/agent.py +100 -0
  87. hackagent/cli/commands/attack.py +417 -0
  88. hackagent/cli/commands/config.py +301 -0
  89. hackagent/cli/commands/results.py +327 -0
  90. hackagent/cli/config.py +249 -0
  91. hackagent/cli/main.py +515 -0
  92. hackagent/cli/tui/__init__.py +31 -0
  93. hackagent/cli/tui/actions_logger.py +200 -0
  94. hackagent/cli/tui/app.py +288 -0
  95. hackagent/cli/tui/base.py +137 -0
  96. hackagent/cli/tui/logger.py +318 -0
  97. hackagent/cli/tui/views/__init__.py +33 -0
  98. hackagent/cli/tui/views/agents.py +488 -0
  99. hackagent/cli/tui/views/attacks.py +624 -0
  100. hackagent/cli/tui/views/config.py +244 -0
  101. hackagent/cli/tui/views/dashboard.py +307 -0
  102. hackagent/cli/tui/views/results.py +1210 -0
  103. hackagent/cli/tui/widgets/__init__.py +24 -0
  104. hackagent/cli/tui/widgets/actions.py +346 -0
  105. hackagent/cli/tui/widgets/logs.py +435 -0
  106. hackagent/cli/utils.py +276 -0
  107. hackagent/client.py +286 -0
  108. hackagent/errors.py +37 -0
  109. hackagent/logger.py +83 -0
  110. hackagent/models/__init__.py +109 -0
  111. hackagent/models/agent.py +223 -0
  112. hackagent/models/agent_request.py +129 -0
  113. hackagent/models/api_token_log.py +184 -0
  114. hackagent/models/attack.py +154 -0
  115. hackagent/models/attack_request.py +82 -0
  116. hackagent/models/checkout_session_request_request.py +76 -0
  117. hackagent/models/checkout_session_response.py +59 -0
  118. hackagent/models/choice.py +81 -0
  119. hackagent/models/choice_message.py +67 -0
  120. hackagent/models/evaluation_status_enum.py +14 -0
  121. hackagent/models/generate_error_response.py +59 -0
  122. hackagent/models/generate_request_request.py +212 -0
  123. hackagent/models/generate_success_response.py +115 -0
  124. hackagent/models/generic_error_response.py +70 -0
  125. hackagent/models/message_request.py +67 -0
  126. hackagent/models/organization.py +102 -0
  127. hackagent/models/organization_minimal.py +68 -0
  128. hackagent/models/organization_request.py +71 -0
  129. hackagent/models/paginated_agent_list.py +123 -0
  130. hackagent/models/paginated_api_token_log_list.py +123 -0
  131. hackagent/models/paginated_attack_list.py +123 -0
  132. hackagent/models/paginated_organization_list.py +123 -0
  133. hackagent/models/paginated_prompt_list.py +123 -0
  134. hackagent/models/paginated_result_list.py +123 -0
  135. hackagent/models/paginated_run_list.py +123 -0
  136. hackagent/models/paginated_user_api_key_list.py +123 -0
  137. hackagent/models/paginated_user_profile_list.py +123 -0
  138. hackagent/models/patched_agent_request.py +128 -0
  139. hackagent/models/patched_attack_request.py +92 -0
  140. hackagent/models/patched_organization_request.py +71 -0
  141. hackagent/models/patched_prompt_request.py +125 -0
  142. hackagent/models/patched_result_request.py +237 -0
  143. hackagent/models/patched_run_request.py +138 -0
  144. hackagent/models/patched_user_profile_request.py +99 -0
  145. hackagent/models/prompt.py +220 -0
  146. hackagent/models/prompt_request.py +126 -0
  147. hackagent/models/result.py +294 -0
  148. hackagent/models/result_list_evaluation_status.py +14 -0
  149. hackagent/models/result_request.py +232 -0
  150. hackagent/models/run.py +233 -0
  151. hackagent/models/run_list_status.py +12 -0
  152. hackagent/models/run_request.py +133 -0
  153. hackagent/models/status_enum.py +12 -0
  154. hackagent/models/step_type_enum.py +14 -0
  155. hackagent/models/trace.py +121 -0
  156. hackagent/models/trace_request.py +94 -0
  157. hackagent/models/usage.py +75 -0
  158. hackagent/models/user_api_key.py +201 -0
  159. hackagent/models/user_api_key_request.py +73 -0
  160. hackagent/models/user_profile.py +135 -0
  161. hackagent/models/user_profile_minimal.py +76 -0
  162. hackagent/models/user_profile_request.py +99 -0
  163. hackagent/router/__init__.py +25 -0
  164. hackagent/router/adapters/__init__.py +20 -0
  165. hackagent/router/adapters/base.py +63 -0
  166. hackagent/router/adapters/google_adk.py +671 -0
  167. hackagent/router/adapters/litellm_adapter.py +524 -0
  168. hackagent/router/adapters/openai_adapter.py +426 -0
  169. hackagent/router/router.py +969 -0
  170. hackagent/router/types.py +54 -0
  171. hackagent/tracking/__init__.py +42 -0
  172. hackagent/tracking/context.py +163 -0
  173. hackagent/tracking/decorators.py +299 -0
  174. hackagent/tracking/tracker.py +441 -0
  175. hackagent/types.py +54 -0
  176. hackagent/utils.py +194 -0
  177. hackagent/vulnerabilities/__init__.py +13 -0
  178. hackagent/vulnerabilities/prompts.py +81 -0
  179. hackagent-0.3.1.dist-info/METADATA +122 -0
  180. hackagent-0.3.1.dist-info/RECORD +183 -0
  181. hackagent-0.3.1.dist-info/WHEEL +4 -0
  182. hackagent-0.3.1.dist-info/entry_points.txt +2 -0
  183. hackagent-0.3.1.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,507 @@
1
+ # Copyright 2025 - AI4I. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Prefix generation pipeline attack based on the BaseAttack class.
17
+
18
+ This module implements a complete pipeline for generating, filtering, and selecting prefixes
19
+ using uncensored and target language models, adapted as an attack module.
20
+ """
21
+
22
+ import copy
23
+ import logging
24
+ from typing import Any, Dict, List, Optional
25
+ from uuid import UUID
26
+
27
+ from hackagent.api.run import run_result_create
28
+ from hackagent.attacks.AdvPrefix.config import DEFAULT_PREFIX_GENERATION_CONFIG
29
+ from hackagent.client import AuthenticatedClient
30
+ from hackagent.models import (
31
+ EvaluationStatusEnum,
32
+ ResultRequest,
33
+ StatusEnum,
34
+ )
35
+ from hackagent.router.router import AgentRouter
36
+ from hackagent.tracking import StepTracker, TrackingContext
37
+
38
+ # Import step execution functions
39
+ from .AdvPrefix import completions
40
+ from .AdvPrefix.evaluation import EvaluationPipeline
41
+ from .AdvPrefix.generate import PrefixGenerationPipeline
42
+ from .base import BaseAttack
43
+
44
+ # TUI logging support (imported conditionally to avoid import errors in non-TUI contexts)
45
try:
    from hackagent.cli.tui.logger import with_tui_logging
except ImportError:

    def with_tui_logging(*args, **kwargs):
        """
        No-op stand-in used when the TUI logger module cannot be imported.

        Accepts and ignores any arguments, and returns a decorator that hands
        the decorated function back unchanged.
        """

        def _identity(func):
            return func

        return _identity
54
+
55
+
56
+ # Helper function for deep merging dictionaries
57
+ def _recursive_update(target_dict, source_dict):
58
+ """
59
+ Recursively updates a target dictionary with values from a source dictionary.
60
+ Nested dictionaries are merged; other values are overwritten with a deep copy.
61
+ """
62
+ for key, source_value in source_dict.items():
63
+ target_value = target_dict.get(key)
64
+ if isinstance(source_value, dict) and isinstance(target_value, dict):
65
+ # If both current_value and update_value are dicts, recurse
66
+ _recursive_update(target_value, source_value)
67
+ else:
68
+ # Otherwise, overwrite target_dict[key] with a deepcopy of source_value
69
+ target_dict[key] = copy.deepcopy(source_value)
70
+
71
+
72
+ class AdvPrefixAttack(BaseAttack):
73
+ """
74
+ Attack class implementing the prefix generation pipeline by orchestrating step modules.
75
+
76
+ Inherits from BaseAttack and adapts the multi-step prefix generation process.
77
+ Expects configuration as a standard Python dictionary.
78
+ """
79
+
80
def __init__(
    self,
    config: Optional[Dict[str, Any]] = None,
    client: Optional[AuthenticatedClient] = None,
    agent_router: Optional[AgentRouter] = None,
):
    """
    Build the attack from default settings plus caller overrides.

    Args:
        config: Optional sparse dictionary of overrides that is deep-merged
            on top of DEFAULT_PREFIX_GENERATION_CONFIG.
        client: AuthenticatedClient instance; required despite the None default.
        agent_router: AgentRouter instance for the victim agent; required
            despite the None default.

    Raises:
        ValueError: If `client` or `agent_router` is None, or if the merged
            configuration has no truthy 'output_dir'.
    """
    if client is None:
        raise ValueError("AuthenticatedClient must be provided to AdvPrefixAttack.")
    if agent_router is None:
        raise ValueError(
            "Victim AgentRouter instance must be provided to AdvPrefixAttack."
        )
    self.client = client
    self.agent_router = agent_router

    # Work on a deep copy of the defaults so the module-level default
    # dictionary is never modified by the merge below.
    merged_config = copy.deepcopy(DEFAULT_PREFIX_GENERATION_CONFIG)
    if config:
        _recursive_update(merged_config, config)

    # run_id and run_dir are resolved before super().__init__() runs.
    self.run_id = merged_config.get("run_id")
    output_dir = merged_config.get("output_dir")
    if not output_dir:
        raise ValueError("Configuration missing required key: 'output_dir'")
    # output_dir is used as-is; it is not nested under run_id.
    self.run_dir = output_dir
    # Guarantee the 'run_id' key exists in the config (possibly as None).
    merged_config["run_id"] = self.run_id

    # The logger must exist before super().__init__(), because that call
    # triggers _setup(), which configures self.logger.
    self.logger = logging.getLogger("hackagent.attacks.advprefix")

    # The tracker is created later, inside run().
    self.tracker: StepTracker | None = None

    # Hand the base class a shallow copy of the merged configuration;
    # BaseAttack.__init__ stores it and calls _validate_config() and _setup().
    super().__init__(merged_config.copy())
140
+
141
def _validate_config(self):
    """
    Check that self.config contains every key the pipeline needs.

    Runs the base-class validation first, then verifies that all required
    keys are present and that the list-valued settings really are lists.

    Raises:
        ValueError: If one or more required keys are missing; the message
            lists them in declaration order.
        TypeError: If 'meta_prefixes', 'judges' or 'selection_judges' is not
            a list.
    """
    super()._validate_config()  # Base validation (checks that config is a dict)

    # Goals are passed to run(), so no input file key is required here.
    required_keys = (
        "output_dir",
        "start_step",
        # Prefix preprocessing
        "min_char_length",
        "max_token_segments",
        "n_candidates_per_goal",
        # Prefix generation
        "meta_prefixes",
        "meta_prefix_samples",
        "batch_size",
        "max_new_tokens",
        "guided_topk",
        "temperature",
        # Surrogate prompt
        "surrogate_attack_prompt",
        # Completions from the target
        "max_new_tokens_completion",
        "n_samples",
        # Evaluation: judging, aggregation and selection
        "judges",
        "batch_size_judge",
        "max_new_tokens_eval",
        "filter_len",
        "pasr_weight",
        "n_prefixes_per_goal",
        "selection_judges",
        "max_ce",  # Used by preprocessing and by the evaluation filtering
    )
    missing_keys = [name for name in required_keys if name not in self.config]
    if missing_keys:
        raise ValueError(
            f"Configuration dictionary missing required keys: {', '.join(missing_keys)}"
        )

    # Settings that must be lists, checked in this order.
    list_valued = (
        ("meta_prefixes", "Config key 'meta_prefixes' must be a list."),
        ("judges", "Config key 'judges' must be a list."),
        ("selection_judges", "Config key 'selection_judges' must be a list."),
    )
    for name, message in list_valued:
        if not isinstance(self.config.get(name), list):
            raise TypeError(message)
194
+
195
+ def _setup(self):
196
+ """
197
+ Performs setup tasks like logging.
198
+ (Preprocessor initialization moved to __init__).
199
+ """
200
+ self._setup_logging()
201
+ # All execution tracking via StepTracker - no redundant logs
202
+ # Configuration tracked via StepTracker context
203
+
204
+ def _setup_logging(self):
205
+ """Configure logging to console for this attack instance."""
206
+ # Use the instance logger obtained in __init__
207
+ self.logger.propagate = (
208
+ False # Prevent duplicate logs if root logger is configured
209
+ )
210
+ self.logger.setLevel(logging.INFO)
211
+ formatter = logging.Formatter(
212
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
213
+ )
214
+
215
+ # Remove existing handlers if re-initializing (e.g., if run_id changes)
216
+ for handler in self.logger.handlers[:]:
217
+ self.logger.removeHandler(handler)
218
+ handler.close()
219
+
220
+ # Console Handler only - all execution data tracked via API
221
+ if not any(isinstance(h, logging.StreamHandler) for h in self.logger.handlers):
222
+ ch = logging.StreamHandler()
223
+ ch.setFormatter(formatter)
224
+ self.logger.addHandler(ch)
225
+
226
+ # Remove helper methods that were moved to step files or utils
227
+ # Methods like _get_checkpoint_path and _clear_gpu_memory are now in utils
228
+ # Methods related to specific steps (_generate_prefixes, _construct_prompts, etc.) are in step files
229
+
230
+ def _create_parent_result(self) -> str | None:
231
+ """Create parent result for tracking and return its ID."""
232
+ if not self.run_id or not self.client:
233
+ return None
234
+
235
+ try:
236
+ parent_result_request = ResultRequest(
237
+ run=UUID(self.run_id),
238
+ evaluation_status=EvaluationStatusEnum.PASSED_CRITERIA,
239
+ )
240
+ parent_result_response = run_result_create.sync_detailed(
241
+ client=self.client,
242
+ id=UUID(self.run_id),
243
+ body=parent_result_request,
244
+ )
245
+
246
+ if parent_result_response.status_code == 201:
247
+ if parent_result_response.parsed and hasattr(
248
+ parent_result_response.parsed, "id"
249
+ ):
250
+ parent_result_id = str(parent_result_response.parsed.id)
251
+ return parent_result_id
252
+ else:
253
+ # Try to extract from raw response
254
+ import json
255
+
256
+ try:
257
+ response_data = json.loads(
258
+ parent_result_response.content.decode()
259
+ )
260
+ if "id" in response_data:
261
+ return str(response_data["id"])
262
+ except Exception:
263
+ pass # Silently fail, error logged below
264
+ else:
265
+ self.logger.error(
266
+ f"Failed to create Parent Result. Status: {parent_result_response.status_code}, "
267
+ f"Response: {parent_result_response.content}"
268
+ )
269
+ except Exception as e:
270
+ self.logger.error(f"Exception creating Parent Result: {e}", exc_info=True)
271
+
272
+ return None
273
+
274
+ def _prepare_input_sample(self, data):
275
+ """Prepare input sample for tracking from List[Dict] or other input."""
276
+ if data is None:
277
+ return None
278
+
279
+ if isinstance(data, list):
280
+ # Take first 5 items for sampling
281
+ sample = data[:5] if len(data) > 5 else data
282
+ # Replace inf with None for JSON compatibility
283
+ result = []
284
+ for item in sample:
285
+ if isinstance(item, dict):
286
+ clean_item = {}
287
+ for k, v in item.items():
288
+ if isinstance(v, float) and (
289
+ v == float("inf") or v == float("-inf")
290
+ ):
291
+ clean_item[k] = None
292
+ else:
293
+ clean_item[k] = v
294
+ result.append(clean_item)
295
+ else:
296
+ result.append(item)
297
+ return result
298
+ else:
299
+ return None
300
+
301
def _get_pipeline_steps(self):
    """
    Describe the attack pipeline as an ordered list of step dictionaries.

    Each dictionary carries: 'name' (display name), 'function' (the callable
    to run), 'step_type_enum' (step type label), 'config_keys' (the config
    entries copied into the step's config), 'input_data_arg_name' (keyword
    under which the input data is passed) and 'required_args' (which of
    logger/client/agent_router the callable receives).
    """

    def _run_generation(**kwargs):
        # Build the generation pipeline from the supplied dependencies,
        # then run it on the goals.
        return PrefixGenerationPipeline(
            logger=kwargs["logger"],
            client=kwargs["client"],
            agent_router=kwargs["agent_router"],
            config=kwargs["config"],
        ).execute(goals=kwargs["goals"])

    def _run_evaluation(input_data, config, logger, client):
        # Build the evaluation pipeline, then run it on the previous step's output.
        return EvaluationPipeline(
            config=config, logger=logger, client=client
        ).execute(input_data=input_data)

    generation_step = {
        "name": "Generation: Generate and Filter Adversarial Prefixes",
        "function": _run_generation,
        "step_type_enum": "GENERATION",
        "config_keys": [
            "generator",
            "batch_size",
            "max_new_tokens",
            "guided_topk",
            "temperature",
            "meta_prefixes",
            "meta_prefix_samples",
            "min_char_length",
            "max_ce",
            "max_token_segments",
            "n_candidates_per_goal",
            "surrogate_attack_prompt",
        ],
        "input_data_arg_name": "goals",
        "required_args": ["logger", "client", "config", "agent_router"],
    }

    execution_step = {
        "name": "Execution: Get Completions from Target Model",
        "function": completions.execute,
        "step_type_enum": "EXECUTION",
        "config_keys": ["batch_size", "max_new_tokens_completion", "n_samples"],
        "input_data_arg_name": "input_data",
        "required_args": ["logger", "config", "agent_router"],
    }

    evaluation_step = {
        "name": "Evaluation: Judge, Aggregate, and Select Best Prefixes",
        "function": _run_evaluation,
        "step_type_enum": "EVALUATION",
        "config_keys": [
            "judges",
            "batch_size_judge",
            "max_new_tokens_eval",
            "filter_len",
            "pasr_weight",
            "n_prefixes_per_goal",
            "selection_judges",
            "max_ce",
        ],
        "input_data_arg_name": "input_data",
        "required_args": ["logger", "client", "config"],
    }

    return [generation_step, execution_step, evaluation_step]
361
+
362
+ def _build_step_args(
363
+ self, step_info: Dict, step_config: Dict, input_data: Any
364
+ ) -> Dict:
365
+ """
366
+ Build arguments dict for a step function based on its requirements.
367
+
368
+ Args:
369
+ step_info: Pipeline step configuration
370
+ step_config: Step-specific config values
371
+ input_data: Input data (goals for Generation, output from previous stage for others)
372
+
373
+ Returns:
374
+ Dictionary of arguments ready to pass to step function
375
+ """
376
+ args = {"config": step_config}
377
+
378
+ # Add only required arguments to avoid parameter mismatches
379
+ required_args = step_info.get("required_args", [])
380
+
381
+ if "logger" in required_args:
382
+ args["logger"] = self.logger
383
+ if "client" in required_args:
384
+ args["client"] = self.client
385
+ if "agent_router" in required_args:
386
+ args["agent_router"] = self.agent_router
387
+
388
+ # Add input data with the correct parameter name
389
+ args[step_info["input_data_arg_name"]] = input_data
390
+
391
+ return args
392
+
393
+ def _execute_function_step(
394
+ self, step_info: Dict, step_config: Dict, input_data: Any
395
+ ) -> Any:
396
+ """Execute a function-based pipeline step."""
397
+ step_function = step_info["function"]
398
+ step_args = self._build_step_args(step_info, step_config, input_data)
399
+ return step_function(**step_args)
400
+
401
+ def _track_output_metrics(self, output: Any):
402
+ """Track metrics about step output."""
403
+ if output is None:
404
+ self.tracker.add_step_metadata("output_type", "None")
405
+ self.tracker.add_step_metadata("warning", "Step returned None")
406
+ elif isinstance(output, list):
407
+ item_count = len(output)
408
+ self.tracker.add_step_metadata("output_items", item_count)
409
+ if item_count == 0:
410
+ self.tracker.add_step_metadata("warning", "Empty list returned")
411
+ else:
412
+ self.tracker.add_step_metadata("output_type", type(output).__name__)
413
+
414
def _finalize_pipeline(self, final_output: Any):
    """
    Report the pipeline outcome to the tracker, if one is set.

    A non-None output with at least one item is reported as PASSED_CRITERIA;
    anything else is reported as FAILED_CRITERIA with an explanatory note.
    The run itself is marked COMPLETED in both cases.

    Args:
        final_output: The output of the last executed step.
    """
    produced_results = final_output is not None and len(final_output) > 0

    if produced_results:
        eval_status, eval_notes = EvaluationStatusEnum.PASSED_CRITERIA, None
    else:
        eval_status = EvaluationStatusEnum.FAILED_CRITERIA
        eval_notes = "Pipeline completed with no resulting prefixes."

    # Reaching this point means the pipeline ran to the end, with or
    # without resulting prefixes.
    run_status = StatusEnum.COMPLETED

    if not self.tracker:
        return
    self.tracker.update_result_status(eval_status, eval_notes)
    self.tracker.update_run_status(run_status)
428
+
429
@with_tui_logging(logger_name="hackagent.attacks", level=logging.INFO)
def run(self, goals: List[str]) -> List[Dict]:
    """
    Executes the prefix generation pipeline from the configured start step onward.

    Args:
        goals: A list of goal strings to generate prefixes for.

    Returns:
        The output of the last executed step, or an empty list when `goals`
        is empty or the last step returned None.

    Raises:
        TypeError: If config['start_step'] is not an integer.
        ValueError: If config['start_step'] is smaller than 1.
        Exception: Any error raised while the steps run is re-raised after
            the run status has been set to FAILED.
    """
    if not goals:
        return []

    # Initialize tracking
    parent_result_id = self._create_parent_result() if self.run_id else None
    tracking_context = TrackingContext(
        client=self.client,
        run_id=self.run_id,
        parent_result_id=parent_result_id,
        logger=self.logger,
    )
    tracking_context.add_metadata("attack_type", "advprefix")
    tracking_context.add_metadata("num_goals", len(goals))

    self.tracker = StepTracker(tracking_context)
    self.tracker.update_run_status(StatusEnum.RUNNING)

    # Initialize pipeline
    pipeline_steps = self._get_pipeline_steps()
    last_step_output = goals  # Start with raw goals

    try:
        # Validated inside the try block so that a bad value marks the run
        # as FAILED instead of leaving it in RUNNING.
        start_step = self.config.get("start_step", 1)
        if not isinstance(start_step, int):
            raise TypeError(
                f"Config key 'start_step' must be an integer, got {type(start_step).__name__}."
            )
        if start_step < 1:
            # A value below 1 would give a negative start index, and
            # range(-1, n) would run the last step first on the raw goals.
            raise ValueError(
                f"Config key 'start_step' must be >= 1, got {start_step}."
            )
        if start_step > len(pipeline_steps):
            self.logger.warning(
                f"start_step {start_step} exceeds the number of pipeline steps "
                f"({len(pipeline_steps)}); no step will be executed."
            )

        for i in range(start_step - 1, len(pipeline_steps)):
            step_info = pipeline_steps[i]
            step_name = step_info["name"]
            step_type = step_info["step_type_enum"]

            # Prepare tracking data
            input_sample = self._prepare_input_sample(last_step_output)
            step_config = {
                k: self.config[k]
                for k in step_info.get("config_keys", [])
                if k in self.config
            }

            # Calculate and log progress
            step_progress = int(50 + (i / len(pipeline_steps)) * 40)  # 50-90%
            self.logger.info(f"━━━ Progress: {step_progress}% ━━━")

            with self.tracker.track_step(
                step_name, step_type, input_sample, step_config
            ):
                # Execute the step
                if "function" in step_info:
                    last_step_output = self._execute_function_step(
                        step_info, step_config, last_step_output
                    )
                else:
                    self.logger.warning(
                        f"No function defined for {step_name}. Skipping."
                    )
                    continue

                self._track_output_metrics(last_step_output)

            # Log step completion
            self.logger.info(f"✅ Completed: {step_name}")

    except Exception:
        if self.tracker:
            self.tracker.update_run_status(StatusEnum.FAILED)
        raise

    # Finalize and return results
    self._finalize_pipeline(last_step_output)
    return last_step_output if last_step_output is not None else []
@@ -0,0 +1,106 @@
1
+ # Copyright 2025 - AI4I. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import abc
16
+ from typing import Any, Dict
17
+
18
+
19
class BaseAttack(abc.ABC):
    """
    Abstract base class for black-box attacks against language models.

    Defines the construction sequence shared by all attacks (store the
    configuration, validate it, run setup) and the `run` entry point that
    every concrete attack has to implement.

    Attributes:
        config: A dictionary containing configuration settings for the attack.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Store the configuration, then validate it and run setup, in that order.

        Args:
            config: A dictionary containing configuration settings for the attack.

        Raises:
            TypeError: If config is not a dictionary.
        """
        self.config = config
        self._validate_config()
        self._setup()

    def _validate_config(self):
        """
        Validate the stored configuration.

        The base implementation only checks that the configuration is a
        dictionary. Subclasses override this to enforce their own
        requirements.

        Raises:
            TypeError: If the configuration is not a dictionary.
        """
        if isinstance(self.config, dict):
            return
        raise TypeError("Configuration must be a dictionary.")

    def _setup(self):
        """
        Hook called after validation for subclass-specific setup.

        The base implementation does nothing. Subclasses override it for
        tasks such as configuring logging.
        """

    @abc.abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """
        Execute the attack logic.

        Must be implemented by every concrete attack; the base class provides
        no behaviour.

        Args:
            **kwargs: Attack-specific arguments that vary by implementation.

        Returns:
            Attack-specific results; the format is defined by each subclass.
        """