hackagent 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. hackagent/__init__.py +12 -0
  2. hackagent/agent.py +214 -0
  3. hackagent/api/__init__.py +1 -0
  4. hackagent/api/agent/__init__.py +1 -0
  5. hackagent/api/agent/agent_create.py +347 -0
  6. hackagent/api/agent/agent_destroy.py +140 -0
  7. hackagent/api/agent/agent_list.py +242 -0
  8. hackagent/api/agent/agent_partial_update.py +361 -0
  9. hackagent/api/agent/agent_retrieve.py +235 -0
  10. hackagent/api/agent/agent_update.py +361 -0
  11. hackagent/api/apilogs/__init__.py +1 -0
  12. hackagent/api/apilogs/apilogs_list.py +170 -0
  13. hackagent/api/apilogs/apilogs_retrieve.py +162 -0
  14. hackagent/api/attack/__init__.py +1 -0
  15. hackagent/api/attack/attack_create.py +275 -0
  16. hackagent/api/attack/attack_destroy.py +146 -0
  17. hackagent/api/attack/attack_list.py +254 -0
  18. hackagent/api/attack/attack_partial_update.py +289 -0
  19. hackagent/api/attack/attack_retrieve.py +247 -0
  20. hackagent/api/attack/attack_update.py +289 -0
  21. hackagent/api/checkout/__init__.py +1 -0
  22. hackagent/api/checkout/checkout_create.py +225 -0
  23. hackagent/api/generate/__init__.py +1 -0
  24. hackagent/api/generate/generate_create.py +253 -0
  25. hackagent/api/judge/__init__.py +1 -0
  26. hackagent/api/judge/judge_create.py +253 -0
  27. hackagent/api/key/__init__.py +1 -0
  28. hackagent/api/key/key_create.py +179 -0
  29. hackagent/api/key/key_destroy.py +103 -0
  30. hackagent/api/key/key_list.py +170 -0
  31. hackagent/api/key/key_retrieve.py +162 -0
  32. hackagent/api/organization/__init__.py +1 -0
  33. hackagent/api/organization/organization_create.py +208 -0
  34. hackagent/api/organization/organization_destroy.py +104 -0
  35. hackagent/api/organization/organization_list.py +170 -0
  36. hackagent/api/organization/organization_me_retrieve.py +126 -0
  37. hackagent/api/organization/organization_partial_update.py +222 -0
  38. hackagent/api/organization/organization_retrieve.py +163 -0
  39. hackagent/api/organization/organization_update.py +222 -0
  40. hackagent/api/prompt/__init__.py +1 -0
  41. hackagent/api/prompt/prompt_create.py +171 -0
  42. hackagent/api/prompt/prompt_destroy.py +104 -0
  43. hackagent/api/prompt/prompt_list.py +185 -0
  44. hackagent/api/prompt/prompt_partial_update.py +185 -0
  45. hackagent/api/prompt/prompt_retrieve.py +163 -0
  46. hackagent/api/prompt/prompt_update.py +185 -0
  47. hackagent/api/result/__init__.py +1 -0
  48. hackagent/api/result/result_create.py +175 -0
  49. hackagent/api/result/result_destroy.py +106 -0
  50. hackagent/api/result/result_list.py +249 -0
  51. hackagent/api/result/result_partial_update.py +193 -0
  52. hackagent/api/result/result_retrieve.py +167 -0
  53. hackagent/api/result/result_trace_create.py +177 -0
  54. hackagent/api/result/result_update.py +189 -0
  55. hackagent/api/run/__init__.py +1 -0
  56. hackagent/api/run/run_create.py +187 -0
  57. hackagent/api/run/run_destroy.py +112 -0
  58. hackagent/api/run/run_list.py +291 -0
  59. hackagent/api/run/run_partial_update.py +201 -0
  60. hackagent/api/run/run_result_create.py +177 -0
  61. hackagent/api/run/run_retrieve.py +179 -0
  62. hackagent/api/run/run_run_tests_create.py +187 -0
  63. hackagent/api/run/run_update.py +201 -0
  64. hackagent/api/user/__init__.py +1 -0
  65. hackagent/api/user/user_create.py +212 -0
  66. hackagent/api/user/user_destroy.py +106 -0
  67. hackagent/api/user/user_list.py +174 -0
  68. hackagent/api/user/user_me_retrieve.py +126 -0
  69. hackagent/api/user/user_me_update.py +196 -0
  70. hackagent/api/user/user_partial_update.py +226 -0
  71. hackagent/api/user/user_retrieve.py +167 -0
  72. hackagent/api/user/user_update.py +226 -0
  73. hackagent/attacks/AdvPrefix/__init__.py +41 -0
  74. hackagent/attacks/AdvPrefix/completions.py +416 -0
  75. hackagent/attacks/AdvPrefix/config.py +259 -0
  76. hackagent/attacks/AdvPrefix/evaluation.py +745 -0
  77. hackagent/attacks/AdvPrefix/evaluators.py +564 -0
  78. hackagent/attacks/AdvPrefix/generate.py +711 -0
  79. hackagent/attacks/AdvPrefix/utils.py +307 -0
  80. hackagent/attacks/__init__.py +35 -0
  81. hackagent/attacks/advprefix.py +507 -0
  82. hackagent/attacks/base.py +106 -0
  83. hackagent/attacks/strategies.py +906 -0
  84. hackagent/cli/__init__.py +19 -0
  85. hackagent/cli/commands/__init__.py +20 -0
  86. hackagent/cli/commands/agent.py +100 -0
  87. hackagent/cli/commands/attack.py +417 -0
  88. hackagent/cli/commands/config.py +301 -0
  89. hackagent/cli/commands/results.py +327 -0
  90. hackagent/cli/config.py +249 -0
  91. hackagent/cli/main.py +515 -0
  92. hackagent/cli/tui/__init__.py +31 -0
  93. hackagent/cli/tui/actions_logger.py +200 -0
  94. hackagent/cli/tui/app.py +288 -0
  95. hackagent/cli/tui/base.py +137 -0
  96. hackagent/cli/tui/logger.py +318 -0
  97. hackagent/cli/tui/views/__init__.py +33 -0
  98. hackagent/cli/tui/views/agents.py +488 -0
  99. hackagent/cli/tui/views/attacks.py +624 -0
  100. hackagent/cli/tui/views/config.py +244 -0
  101. hackagent/cli/tui/views/dashboard.py +307 -0
  102. hackagent/cli/tui/views/results.py +1210 -0
  103. hackagent/cli/tui/widgets/__init__.py +24 -0
  104. hackagent/cli/tui/widgets/actions.py +346 -0
  105. hackagent/cli/tui/widgets/logs.py +435 -0
  106. hackagent/cli/utils.py +276 -0
  107. hackagent/client.py +286 -0
  108. hackagent/errors.py +37 -0
  109. hackagent/logger.py +83 -0
  110. hackagent/models/__init__.py +109 -0
  111. hackagent/models/agent.py +223 -0
  112. hackagent/models/agent_request.py +129 -0
  113. hackagent/models/api_token_log.py +184 -0
  114. hackagent/models/attack.py +154 -0
  115. hackagent/models/attack_request.py +82 -0
  116. hackagent/models/checkout_session_request_request.py +76 -0
  117. hackagent/models/checkout_session_response.py +59 -0
  118. hackagent/models/choice.py +81 -0
  119. hackagent/models/choice_message.py +67 -0
  120. hackagent/models/evaluation_status_enum.py +14 -0
  121. hackagent/models/generate_error_response.py +59 -0
  122. hackagent/models/generate_request_request.py +212 -0
  123. hackagent/models/generate_success_response.py +115 -0
  124. hackagent/models/generic_error_response.py +70 -0
  125. hackagent/models/message_request.py +67 -0
  126. hackagent/models/organization.py +102 -0
  127. hackagent/models/organization_minimal.py +68 -0
  128. hackagent/models/organization_request.py +71 -0
  129. hackagent/models/paginated_agent_list.py +123 -0
  130. hackagent/models/paginated_api_token_log_list.py +123 -0
  131. hackagent/models/paginated_attack_list.py +123 -0
  132. hackagent/models/paginated_organization_list.py +123 -0
  133. hackagent/models/paginated_prompt_list.py +123 -0
  134. hackagent/models/paginated_result_list.py +123 -0
  135. hackagent/models/paginated_run_list.py +123 -0
  136. hackagent/models/paginated_user_api_key_list.py +123 -0
  137. hackagent/models/paginated_user_profile_list.py +123 -0
  138. hackagent/models/patched_agent_request.py +128 -0
  139. hackagent/models/patched_attack_request.py +92 -0
  140. hackagent/models/patched_organization_request.py +71 -0
  141. hackagent/models/patched_prompt_request.py +125 -0
  142. hackagent/models/patched_result_request.py +237 -0
  143. hackagent/models/patched_run_request.py +138 -0
  144. hackagent/models/patched_user_profile_request.py +99 -0
  145. hackagent/models/prompt.py +220 -0
  146. hackagent/models/prompt_request.py +126 -0
  147. hackagent/models/result.py +294 -0
  148. hackagent/models/result_list_evaluation_status.py +14 -0
  149. hackagent/models/result_request.py +232 -0
  150. hackagent/models/run.py +233 -0
  151. hackagent/models/run_list_status.py +12 -0
  152. hackagent/models/run_request.py +133 -0
  153. hackagent/models/status_enum.py +12 -0
  154. hackagent/models/step_type_enum.py +14 -0
  155. hackagent/models/trace.py +121 -0
  156. hackagent/models/trace_request.py +94 -0
  157. hackagent/models/usage.py +75 -0
  158. hackagent/models/user_api_key.py +201 -0
  159. hackagent/models/user_api_key_request.py +73 -0
  160. hackagent/models/user_profile.py +135 -0
  161. hackagent/models/user_profile_minimal.py +76 -0
  162. hackagent/models/user_profile_request.py +99 -0
  163. hackagent/router/__init__.py +25 -0
  164. hackagent/router/adapters/__init__.py +20 -0
  165. hackagent/router/adapters/base.py +63 -0
  166. hackagent/router/adapters/google_adk.py +671 -0
  167. hackagent/router/adapters/litellm_adapter.py +524 -0
  168. hackagent/router/adapters/openai_adapter.py +426 -0
  169. hackagent/router/router.py +969 -0
  170. hackagent/router/types.py +54 -0
  171. hackagent/tracking/__init__.py +42 -0
  172. hackagent/tracking/context.py +163 -0
  173. hackagent/tracking/decorators.py +299 -0
  174. hackagent/tracking/tracker.py +441 -0
  175. hackagent/types.py +54 -0
  176. hackagent/utils.py +194 -0
  177. hackagent/vulnerabilities/__init__.py +13 -0
  178. hackagent/vulnerabilities/prompts.py +81 -0
  179. hackagent-0.3.1.dist-info/METADATA +122 -0
  180. hackagent-0.3.1.dist-info/RECORD +183 -0
  181. hackagent-0.3.1.dist-info/WHEEL +4 -0
  182. hackagent-0.3.1.dist-info/entry_points.txt +2 -0
  183. hackagent-0.3.1.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,906 @@
1
+ # Copyright 2025 - AI4I. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Attack strategy implementations using the Strategy pattern.
17
+
18
+ This module provides different attack strategies that can be executed against victim agents.
19
+ The Strategy pattern allows for dynamic selection and execution of various attack methodologies,
20
+ each with their own specific configurations and execution logic.
21
+
22
+ The module includes:
23
+ - Abstract base class `AttackStrategy` defining the interface
24
+ - Concrete implementations like `AdvPrefix` for adversarial prefix attacks
25
+ - Helper methods for HTTP response handling and data parsing
26
+ - Integration with the HackAgent backend API for attack execution and result tracking
27
+ """
28
+
29
+ import abc
30
+ import json # For ManagedAttackStrategy
31
+ import logging
32
+ import os # Added for path joining
33
+ from http import HTTPStatus # Added for checking 201 status
34
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
35
+ from uuid import UUID # Added import
36
+
37
+ import httpx # Added for manual HTTP call in AdvPrefix
38
+ import pandas as pd # For AdvPrefix
39
+
40
+ # Imports for specific strategies, moved from agent.py or direct_test_executor.py
41
+ from hackagent import errors # Import the errors module
42
+ from hackagent.api.attack.attack_create import (
43
+ sync_detailed as attacks_create_sync_detailed,
44
+ )
45
+ from hackagent.api.run import run_run_tests_create
46
+ from hackagent.attacks.advprefix import (
47
+ AdvPrefixAttack,
48
+ ) # Used by LocalPrefix
49
+ from hackagent.errors import HackAgentError
50
+ from hackagent.models import Run
51
+ from hackagent.models.attack_request import (
52
+ AttackRequest,
53
+ ) # For creating attacks via attacks_create API
54
+ from hackagent.models.run_request import RunRequest
55
+
56
+ if TYPE_CHECKING:
57
+ from hackagent.agent import HackAgent
58
+
59
+ logger = logging.getLogger(__name__)
60
+
61
+ # --- Strategy Pattern for Attacks ---
62
+
63
+
64
class AttackStrategy(abc.ABC):
    """
    Abstract base class for attack strategies (Strategy pattern).

    Concrete strategies implement `execute`. This base class supplies the
    shared helpers that decode an attack-creation response, parse its JSON
    body and extract the attack ID (and optional run ID) from it.

    Attributes:
        hack_agent: Reference to the HackAgent instance that owns this strategy.
        client: The client object taken from `hack_agent.client`, used for
            API communication.
    """

    def __init__(self, hack_agent: "HackAgent"):
        """
        Initialize the attack strategy with a reference to the parent HackAgent.

        Args:
            hack_agent: The HackAgent instance that will use this strategy.
                Its `client` attribute is stored on the strategy as well.
        """
        self.hack_agent = hack_agent
        self.client = hack_agent.client

    @abc.abstractmethod
    def execute(
        self,
        attack_config: Dict[str, Any],
        run_config_override: Optional[Dict[str, Any]],
        fail_on_run_error: bool,
        max_wait_time_seconds: Optional[int] = None,
        poll_interval_seconds: Optional[int] = None,
        _tui_app: Optional[Any] = None,
        _tui_log_callback: Optional[Any] = None,
    ) -> Any:
        """
        Execute the attack strategy with the provided configuration.

        This abstract method must be implemented by all concrete strategy
        classes to define their specific attack execution logic.

        Args:
            attack_config: Configuration dictionary containing attack-specific
                parameters.
            run_config_override: Optional configuration overrides for the
                attack run.
            fail_on_run_error: Whether to raise an exception if the attack
                run fails.
            max_wait_time_seconds: Maximum time to wait for attack completion.
                Not used by all strategies.
            poll_interval_seconds: Interval for polling attack status.
                Not used by all strategies.
            _tui_app: Optional TUI application object for log integration.
                Internal parameter used when running attacks from the TUI.
            _tui_log_callback: Optional callback for TUI log handling.
                Internal parameter used when running attacks from the TUI.

        Returns:
            Strategy-specific results; the format varies by implementation.
        """
        pass

    def _decode_response_content(self, response: httpx.Response) -> str:
        """
        Decode the response body to text.

        Args:
            response: The response object whose `content` bytes are decoded.

        Returns:
            The body decoded as UTF-8 (undecodable bytes are replaced rather
            than raising), or the literal 'N/A' when the body is empty/None.
        """
        if not response.content:
            return "N/A"
        return response.content.decode("utf-8", errors="replace")

    def _parse_json_from_response_data(
        self,
        response: httpx.Response,
        decoded_content: str,
        attack_type_for_error_msg: str,
    ) -> Optional[Dict[str, Any]]:
        """
        Obtain a dictionary from a response, via its body or pre-parsed data.

        The body is parsed as JSON first. Only a JSON *object* is accepted:
        a syntactically valid body of another JSON type (list, string,
        number, ...) is ignored with a warning, because callers use the
        result as a dictionary. When the body yields no dictionary, the
        response's `parsed` attribute is tried as a fallback.

        Args:
            response: The response object to inspect.
            decoded_content: The already decoded string content of the response.
            attack_type_for_error_msg: Descriptive string for log/error messages.

        Returns:
            The parsed dictionary, or None when no dictionary could be obtained.

        Raises:
            HackAgentError: If the status is 201 and a body is present but
                is not valid JSON.
        """
        parsed_data_dict: Optional[Dict[str, Any]] = None

        if response.content:
            try:
                body_json = json.loads(decoded_content)
            except json.JSONDecodeError as jde:
                # A 201 with an undecodable body is treated as critical.
                if response.status_code == 201:
                    logger.error(
                        f"Failed to parse JSON for {attack_type_for_error_msg} (201 response with content): {jde}. Content: {decoded_content}"
                    )
                    raise HackAgentError(
                        f"Failed to parse 201 response JSON for {attack_type_for_error_msg} (content present): {jde}"
                    ) from jde
                # Non-critical: keep the log terse and fall through to the
                # pre-parsed attribute below.
                logger.warning(
                    f"Could not parse JSON from response body for {attack_type_for_error_msg} (status {response.status_code}). Content: {decoded_content}",
                    exc_info=False,
                )
            else:
                if isinstance(body_json, dict):
                    parsed_data_dict = body_json
                else:
                    # Valid JSON that is not an object would otherwise be
                    # returned and later fail on `.get(...)`.
                    logger.warning(
                        f"JSON body for {attack_type_for_error_msg} is not an object (got {type(body_json).__name__}); ignoring it."
                    )

        # Fall back to pre-parsed data when the body gave no usable dictionary.
        pre_parsed = getattr(response, "parsed", None)
        if not parsed_data_dict and pre_parsed:
            logger.debug(
                f"Attempting to use pre-parsed attribute for {attack_type_for_error_msg}"
            )
            extra_props = getattr(pre_parsed, "additional_properties", None)
            if isinstance(extra_props, dict):
                parsed_data_dict = extra_props
            elif isinstance(pre_parsed, dict):
                parsed_data_dict = pre_parsed
            else:
                logger.warning(
                    f"Response has 'parsed' attribute but it's not a usable dict for {attack_type_for_error_msg}. Type: {type(pre_parsed)}"
                )

        return parsed_data_dict

    def _get_parsed_data_from_initiate_response(
        self,
        response: httpx.Response,
        decoded_content: str,
        attack_type_for_error_msg: str,
    ) -> Dict[str, Any]:
        """
        Validate an attack-initiation response and return its parsed data.

        Args:
            response: The response object from an attack initiation request.
            decoded_content: Pre-decoded string content of the response.
            attack_type_for_error_msg: Descriptive string for error messages.

        Returns:
            A non-empty dictionary containing the parsed response data.

        Raises:
            HackAgentError: If the status is >= 300, or if no non-empty
                dictionary could be obtained for a 201 or any other status
                below 300.
        """
        parsed_data_dict = self._parse_json_from_response_data(
            response, decoded_content, attack_type_for_error_msg
        )

        if response.status_code == 201:
            if not parsed_data_dict:
                # Reached when the body was empty or not a JSON object and the
                # pre-parsed attribute gave nothing usable either. Invalid JSON
                # on a 201 has already raised inside the parser.
                logger.error(
                    f"201 for {attack_type_for_error_msg} but no parsable dictionary body was found. Decoded content: '{decoded_content}', Pre-parsed type: {type(getattr(response, 'parsed', None))}"
                )
                raise HackAgentError(
                    f"201 for {attack_type_for_error_msg} but no parsable dictionary body was found."
                )

        elif response.status_code >= 300:
            err_text = f"Failed to initiate {attack_type_for_error_msg}. Status: {response.status_code}, Body: {decoded_content}"
            logger.error(err_text)
            raise HackAgentError(err_text)

        else:
            # Any other status below 300 (e.g. 200 instead of 201): tolerated,
            # but only when usable data came back.
            logger.warning(
                f"Unexpected success status {response.status_code} from initiate_{attack_type_for_error_msg}. Content: {decoded_content}"
            )
            if not parsed_data_dict:
                err_text = (
                    f"Could not obtain parsable data from initiate_{attack_type_for_error_msg} response with unexpected status {response.status_code}. "
                    f"Content: {decoded_content}"
                )
                logger.error(err_text)
                raise HackAgentError(err_text)

        # Every branch above raises when no data was obtained, so the value
        # is a non-empty dictionary here.
        return parsed_data_dict

    def _extract_ids_from_data_dict(
        self,
        parsed_data_dict: Dict[str, Any],
        attack_type_for_error_msg: str,
        original_content: str,
    ) -> Tuple[str, Optional[str]]:
        """
        Extract the attack ID and optional run ID from parsed response data.

        Reads the mandatory 'id' key (attack ID) and the optional
        'associated_run_id' key; both are converted with `str()` when present.

        Args:
            parsed_data_dict: Dictionary containing parsed response data.
            attack_type_for_error_msg: Descriptive string for error messages.
            original_content: Original response content, used in the error text.

        Returns:
            A tuple (attack_id, run_id); run_id is None when the response
            carries no 'associated_run_id'.

        Raises:
            HackAgentError: If 'id' is missing or None.
        """
        raw_attack_id = parsed_data_dict.get("id")
        if raw_attack_id is None:
            err_detail = (
                f"Could not extract mandatory attack_id ('{raw_attack_id}') "
                f"from initiate_{attack_type_for_error_msg} response. "
                f"Source dict: {parsed_data_dict}, Original Decoded Content: '{original_content}'"
            )
            logger.error(err_detail)
            raise HackAgentError(err_detail)
        attack_id_str = str(raw_attack_id)

        raw_run_id = parsed_data_dict.get("associated_run_id")
        run_id_str = str(raw_run_id) if raw_run_id is not None else None

        logger.info(
            f"Extracted Attack ID: {attack_id_str} and optional server-associated Run ID: {run_id_str if run_id_str else 'Not Provided'} for {attack_type_for_error_msg}."
        )
        return attack_id_str, run_id_str

    def extract_attack_and_run_ids_from_initiate_response(
        self, response: httpx.Response, attack_type_for_error_msg: str = "attack"
    ) -> Tuple[str, Optional[str]]:
        """
        Extract the attack and run IDs from an attack creation response.

        Entry point that chains the helpers: decode the body, obtain the
        parsed dictionary (validating the status code), then read the IDs.

        Args:
            response: The response object from an attack creation API call.
            attack_type_for_error_msg: Descriptive string for error messages,
                defaults to "attack".

        Returns:
            A tuple (attack_id, run_id); run_id may be None.

        Raises:
            HackAgentError: If the response indicates an error, carries no
                usable data, or lacks the attack ID.
        """
        logger.debug(
            f"Attempting to extract Attack/Run IDs for '{attack_type_for_error_msg}' from response (status: {response.status_code})"
        )
        decoded_content = self._decode_response_content(response)
        parsed_data_dict = self._get_parsed_data_from_initiate_response(
            response, decoded_content, attack_type_for_error_msg
        )
        return self._extract_ids_from_data_dict(
            parsed_data_dict, attack_type_for_error_msg, decoded_content
        )
363
+
364
+
365
+ class AdvPrefix(AttackStrategy):
366
+ """
367
+ Strategy implementation for AdvPrefix (Adversarial Prefix) attacks.
368
+
369
+ This strategy implements adversarial prefix generation attacks that use
370
+ uncensored models to generate prefixes that can elicit harmful responses
371
+ from target models. The attack follows a multi-stage pipeline including
372
+ prefix generation, cross-entropy computation, completion generation,
373
+ evaluation, and final selection.
374
+
375
+ The strategy integrates with the HackAgent backend to track attack
376
+ progress and results while executing the local AdvPrefix pipeline.
377
+ """
378
+
379
+ def _prepare_and_validate_attack_params(
380
+ self,
381
+ attack_config: Dict[str, Any],
382
+ ) -> List[Any]:
383
+ """
384
+ Validate and extract necessary parameters from the attack configuration.
385
+
386
+ This method ensures that the attack configuration contains all required
387
+ parameters for the AdvPrefix attack execution.
388
+
389
+ Args:
390
+ attack_config: Dictionary containing attack configuration parameters.
391
+ Must include a 'goals' key with a list of target goals.
392
+
393
+ Returns:
394
+ A list of goals extracted from the attack configuration.
395
+
396
+ Raises:
397
+ ValueError: If the 'goals' key is missing or is not a list.
398
+ """
399
+ goals = attack_config.get("goals")
400
+ if not isinstance(goals, list):
401
+ raise ValueError(
402
+ "'attack_config' must contain 'goals' list for AdvPrefixAttack."
403
+ )
404
+
405
+ return goals
406
+
407
def _create_server_attack_record(
    self,
    victim_agent_id: UUID,
    organization_id: UUID,
    attack_config: Dict[str, Any],
) -> str:
    """
    Create an Attack record on the HackAgent server and return its ID.

    Builds an `AttackRequest` of type "advprefix" from the given IDs and
    configuration, sends it through the attack-create API call, and reads
    the attack ID from the response.

    Args:
        victim_agent_id: UUID of the target agent; sent as a string.
        organization_id: UUID of the organization; sent as a string.
        attack_config: Attack configuration, sent under 'configuration'.

    Returns:
        The string ID of the created attack record.

    Raises:
        HackAgentError: If building or sending the request fails, or if the
            attack ID cannot be extracted from the response.
    """
    attack_type = "advprefix"
    logger.info("Creating Attack record on the server.")

    request_fields = dict(
        type=attack_type,
        agent=str(victim_agent_id),
        organization=str(organization_id),
        configuration=attack_config,
    )

    try:
        request_body = AttackRequest.from_dict(request_fields)
        logger.debug(
            f"Attempting to create Attack record with payload: {request_body.to_dict()}"
        )
        creation_response = attacks_create_sync_detailed(
            client=self.client, body=request_body
        )
    except Exception as exc:
        # Both model construction and the API call are reported the same way.
        logger.error(
            f"Failed to construct/send AttackRequest for {attack_type} record: {exc}",
            exc_info=True,
        )
        raise HackAgentError(
            f"Failed to send AttackRequest for {attack_type} record: {exc}"
        ) from exc

    # Only the attack ID matters here; any server-associated run ID is dropped.
    id_pair = self.extract_attack_and_run_ids_from_initiate_response(
        response=creation_response, attack_type_for_error_msg=attack_type
    )
    created_attack_id = id_pair[0]
    logger.info(f"Attack record created on server. Attack ID: {created_attack_id}.")
    return created_attack_id
463
+
464
def _create_server_run_record(
    self,
    attack_id: str,
    victim_agent_id: str,
    run_config_override: Optional[Dict[str, Any]],
) -> str:
    """
    Create a Run record on the HackAgent server and return its ID.

    Sends a `RunRequest` through `run_run_tests_create.sync_detailed`. When
    the response is 201 (CREATED) but carries no pre-parsed `Run`, the body
    is parsed manually with `Run.from_dict`.

    Args:
        attack_id: String ID of the attack record this run belongs to.
        victim_agent_id: String ID of the target agent being attacked.
        run_config_override: Optional run configuration; an empty dict is
            sent when it is None or empty.

    Returns:
        The string ID of the created run record.

    Raises:
        HackAgentError: If the API call fails, the client reports an
            unexpected status, or no Run with a usable ID can be obtained
            from the response.
    """
    logger.info(
        f"Attempting to explicitly create a Run record for Attack ID: {attack_id}"
    )
    payload = RunRequest(
        attack=attack_id,
        agent=victim_agent_id,
        run_config=run_config_override if run_config_override else {},
    )
    try:
        # response_obj is the custom hackagent.types.Response[Run]
        response_obj = run_run_tests_create.sync_detailed(
            client=self.client, body=payload
        )

        created_run: Optional[Run] = response_obj.parsed

        # The generated client may leave `parsed` empty for a 201; in that
        # case try to build the Run from the raw body.
        if created_run is None and response_obj.status_code == HTTPStatus.CREATED:
            logger.info(
                f"Run creation returned 201 (CREATED), attempting to manually parse response content for Attack ID: {attack_id}"
            )
            if response_obj.content:
                try:
                    created_run_data = json.loads(
                        response_obj.content.decode("utf-8")
                    )
                    created_run = Run.from_dict(created_run_data)
                    logger.info(
                        f"Manually parsed Run object from 201 response for Attack ID {attack_id}. Run ID: {created_run.id if created_run and hasattr(created_run, 'id') else 'Parse_Failed_Or_No_ID'}"
                    )
                except json.JSONDecodeError as jde:
                    # created_run stays None; reported by the check below.
                    logger.error(
                        f"Failed to manually parse JSON from 201 response content for Attack ID {attack_id}: {jde}. Content: {response_obj.content.decode('utf-8', errors='replace')}",
                        exc_info=True,
                    )
                except Exception as e:
                    # created_run stays None; reported by the check below.
                    logger.error(
                        f"Unexpected error manually parsing 201 response content for Attack ID {attack_id}: {e}",
                        exc_info=True,
                    )
            else:
                logger.warning(
                    f"Run creation returned 201 (CREATED) but response content was empty for Attack ID: {attack_id}. Cannot manually parse."
                )

        if not created_run or not getattr(created_run, "id", None):
            status_code_val = getattr(response_obj, "status_code", "Unknown Status")
            raw_content = getattr(response_obj, "content", None)
            content_val = (
                raw_content.decode("utf-8", errors="replace")
                if raw_content
                else "No content"
            )

            logger.error(
                f"Failed to get valid Run ID from run creation for Attack {attack_id}. "
                f"Status: {status_code_val}, Parsed: {created_run}, Content: {content_val}"
            )
            raise HackAgentError(
                f"Server API for Run creation returned status {status_code_val} "
                f"but response parsing failed, lacked Run ID, or an error occurred. Content: {content_val}"
            )

        run_id = str(created_run.id)
        logger.info(
            f"Successfully created Run ID: {run_id} for Attack ID: {attack_id}"
        )
        return run_id

    except errors.UnexpectedStatus as use:
        # Handled first so it keeps its dedicated message even if it happens
        # to derive from HackAgentError. Presumably raised by the generated
        # client for undocumented status codes when raise_on_unexpected_status
        # is enabled - confirm against the client implementation.
        error_content = (
            use.content.decode("utf-8", errors="replace")
            if use.content
            else "No content"
        )
        logger.error(
            f"API error (UnexpectedStatus {use.status_code}) creating Run for Attack {attack_id}: {error_content}",
            exc_info=True,
        )
        raise HackAgentError(
            f"Failed to create Run for Attack {attack_id} (API status {use.status_code}): {error_content}"
        ) from use
    except HackAgentError:
        # Already a domain error (e.g. the missing-Run-ID error raised and
        # logged above): propagate unchanged instead of logging it a second
        # time and wrapping it in another HackAgentError.
        raise
    except Exception as e:
        logger.error(
            f"Error creating Run for Attack {attack_id}: {e}", exc_info=True
        )
        raise HackAgentError(
            f"Failed to create Run for Attack {attack_id}: {e}"
        ) from e
586
+
587
def _prepare_attack_config(
    self,
    attack_config: Dict[str, Any],
    run_id: str,
    attack_id: str,
) -> Dict[str, Any]:
    """
    Build the flat configuration dictionary used for a local AdvPrefixAttack run.

    The caller's ``attack_config`` is copied through a JSON round-trip, so it
    is never mutated here and must be JSON-serializable. The copy then has
    the server identifiers and the client/router objects written into it.

    Args:
        attack_config: Attack configuration provided by the user.
        run_id: Server-generated run ID; always overrides any 'run_id'
            already present in the configuration.
        attack_id: Server-generated attack ID for this attack instance.

    Returns:
        A new dictionary holding the user's settings plus the keys 'run_id',
        'hackagent_client', 'agent_router', 'attack_id' and 'output_dir'
        (the latter defaulted when the user did not supply one).
    """
    logger.debug(f"Preparing local attack config for Run ID: {run_id}")

    # JSON round-trip yields an independent copy of the user's configuration.
    config = json.loads(json.dumps(attack_config))

    # The server-generated run ID always wins over a user-supplied one.
    user_run_id = config.get("run_id")
    config["run_id"] = run_id
    if not user_run_id:
        logger.info(
            f"Set 'run_id' in attack_config to server Run ID '{run_id}' for AdvPrefixAttack."
        )
    elif user_run_id != run_id:
        logger.info(
            f"Overriding 'run_id' in attack_config from '{user_run_id}' to server Run ID '{run_id}' for AdvPrefixAttack."
        )

    # Objects and identifiers the local execution step reads back out.
    config["hackagent_client"] = self.client
    config["agent_router"] = self.hack_agent.router
    config["attack_id"] = attack_id

    if "output_dir" in config:
        return config

    # No output directory supplied: derive a timestamped default of the form
    # ./logs/runs/YYYY-MM-DD/HH-MM-SS_<first 8 chars of attack_id>
    from datetime import datetime

    stamp = datetime.now()
    day = stamp.strftime("%Y-%m-%d")
    clock = stamp.strftime("%H-%M-%S")
    short_id = attack_id[:8]  # slicing already leaves shorter IDs untouched
    default_dir = f"./logs/runs/{day}/{clock}_{short_id}"
    config["output_dir"] = default_dir
    logger.warning(
        f"'output_dir' not in attack_config for AdvPrefixAttack, defaulting to {default_dir}"
    )
    return config
656
+
657
def _execute_local_prefix_attack(
    self,
    attack_config: Dict[str, Any],
    goals: List[Any],
    run_id: str,  # Server run_id
    attack_id: str,
    _tui_app: Optional[Any] = None,
    _tui_log_callback: Optional[Any] = None,
) -> Optional[pd.DataFrame]:
    """
    Run the local AdvPrefix attack and return its results.

    The configuration is prepared via ``_prepare_attack_config``, the client
    and router objects are pulled out of it and handed to ``AdvPrefixAttack``
    as separate arguments, and the attack is then run against ``goals``.

    Args:
        attack_config: Attack configuration dictionary containing pipeline parameters.
        goals: List of target goals passed to ``AdvPrefixAttack.run``.
        run_id: Server-generated run ID for tracking this execution.
        attack_id: Server-generated attack ID for this attack instance.
        _tui_app: Optional TUI application object. A TUI log handler is
            attached only when both this and ``_tui_log_callback`` are truthy.
        _tui_log_callback: Optional callback forwarded to the TUI log handler.

    Returns:
        Whatever ``AdvPrefixAttack.run`` returns (annotated as a pandas
        DataFrame) on success, or None if any exception is raised while
        preparing or running the attack.

    Note:
        Exceptions are logged and swallowed here; callers detect failure by
        checking for a None return value.
    """
    logger.info(
        f"Executing local prefix attack for Attack ID {attack_id}, Server Run ID {run_id}."
    )
    try:
        # Flat dictionary: pipeline params plus the client and router objects.
        config = self._prepare_attack_config(attack_config, run_id, attack_id)

        # AdvPrefixAttack takes the client and router as direct arguments,
        # so they are removed from the config dictionary.
        client = config.pop("hackagent_client")
        router = config.pop("agent_router")

        # Strategy-level keys that are not part of AdvPrefixAttack's config.
        for strategy_key in ("attack_type", "goals"):
            config.pop(strategy_key, None)

        # What remains (user settings, run_id, attack_id, output_dir, ...)
        # is passed as the attack's 'config' parameter.
        attack = AdvPrefixAttack(
            config=config,
            client=client,
            agent_router=router,
        )

        # TUI logging is optional: a missing TUI module only costs the log view.
        if _tui_app and _tui_log_callback:
            try:
                from hackagent.cli.tui.logger import attach_tui_handler

                attach_tui_handler(
                    attack_instance=attack,
                    app=_tui_app,
                    callback=_tui_log_callback,
                )
                logger.info("TUI log handler attached to attack instance")
            except ImportError:
                logger.warning(
                    "Failed to import TUI logger, logs will not be shown in TUI"
                )

        # The run ID is supplied through config["run_id"], not as an argument.
        results = attack.run(goals=goals)
        logger.info(
            f"Local prefix attack completed for Attack ID {attack_id}, Server Run ID {run_id}."
        )
        return results
    except Exception as exc:
        logger.error(
            f"Error during local prefix attack execution for Attack ID {attack_id}, Server Run ID {run_id}: {exc}",
            exc_info=True,
        )
        return None  # Failure is signalled to the caller by None
750
+
751
def _log_local_run_persistence_info(
    self,
    attack_config: Dict[str, Any],
    attack_id: str,
    run_id: str,
    fail_on_run_error: bool,  # Accepted for interface symmetry; not acted upon
):
    """
    Log where local run data is expected to be found.

    Only logging is performed: a directory path and a CSV file name hint are
    derived from ``attack_config`` and written to the log. Nothing is
    created, read, or uploaded.

    Args:
        attack_config: Attack configuration; 'output_dir' and
            'input_csv_for_model_persistence' are read when present.
        attack_id: String ID of the attack record.
        run_id: String ID of the run record.
        fail_on_run_error: Currently has no effect. Errors raised while
            building the log message are logged and never re-raised.

    Note:
        NOTE(review): the directory logged here is
        ``<output_dir>/run_<run_id>`` with a default base of
        ``./hackagent_local_runs/<attack_id>``, whereas
        ``_prepare_attack_config`` defaults 'output_dir' to a timestamped
        path under ``./logs/runs`` and its comment says no run_id
        subdirectory is used. The logged path may therefore not match the
        real output location - confirm against AdvPrefixAttack.
    """
    try:
        run_dir = os.path.join(
            attack_config.get("output_dir", f"./hackagent_local_runs/{attack_id}"),
            f"run_{run_id}",
        )
        csv_hint = attack_config.get(
            "input_csv_for_model_persistence", "step9_output.csv"
        )
        logger.info(
            f"Local run data (for potential Pydantic model persistence/Step10): Dir='{run_dir}', CSV hint='{csv_hint}'."
        )
    except Exception as exc:
        # Purely informational step: log the problem and carry on,
        # regardless of fail_on_run_error.
        logger.error(
            f"Error preparing local run persistence info for Attack {attack_id}: {exc}",
            exc_info=True,
        )
798
+
799
def execute(
    self,
    attack_config: Dict[str, Any],
    run_config_override: Optional[Dict[str, Any]],
    fail_on_run_error: bool,
    max_wait_time_seconds: Optional[int] = None,
    poll_interval_seconds: Optional[int] = None,
    _tui_app: Optional[Any] = None,
    _tui_log_callback: Optional[Any] = None,
) -> Any:
    """
    Execute the complete AdvPrefix attack workflow.

    The workflow is:

    1. Validate that identifiers and 'goals' are available
    2. Create an Attack record on the HackAgent server for tracking
    3. Create a Run record associated with the Attack for this execution
    4. Execute the local AdvPrefix pipeline with the target goals
    5. Log information about where local run data is expected to live

    Args:
        attack_config: Configuration dictionary containing attack parameters.
            Must include a non-empty 'goals' entry. May include 'output_dir'
            and other AdvPrefix pipeline parameters.
        run_config_override: Optional configuration overrides, forwarded to
            the Run record creation step.
        fail_on_run_error: Whether to raise an exception if the local attack
            execution fails. If False, None is returned for failed
            executions instead.
        max_wait_time_seconds: Accepted but not used by this method.
        poll_interval_seconds: Accepted but not used by this method.
        _tui_app: Optional TUI application object, forwarded to the local
            execution step.
        _tui_log_callback: Optional TUI log callback, forwarded to the local
            execution step.

    Returns:
        The result of the local AdvPrefix execution (a pandas DataFrame per
        the local execution step's annotation) if successful. Returns None
        if the attack fails and fail_on_run_error is False.

    Raises:
        HackAgentError: If victim agent ID or organization ID is not available,
            if server record creation fails, or if local execution fails and
            fail_on_run_error is True.
        ValueError: If 'goals' is missing from attack_config or empty. This
            is raised before any server record is created.

    Note:
        Server-side records are created for tracking, but the attack itself
        runs locally; this method does not upload results.
    """
    victim_agent_id: UUID = self.hack_agent.router.backend_agent.id
    organization_id: UUID = self.hack_agent.router.organization_id

    if not victim_agent_id or not organization_id:
        raise HackAgentError(
            "Victim agent ID or Organization ID is not available. Ensure agent is initialized."
        )

    # 1. Validate the configuration BEFORE touching the server, so that an
    #    invalid config does not leave orphaned Attack/Run records behind.
    goals = attack_config.get("goals")
    if not goals:
        raise ValueError("AdvPrefix attack requires 'goals' in attack_config.")

    # 2. Create Attack record on the server
    attack_id = self._create_server_attack_record(
        victim_agent_id=victim_agent_id,
        organization_id=organization_id,
        attack_config=attack_config,  # Pass for summary or details
    )
    logger.info(f"AdvPrefix server Attack record created with ID: {attack_id}")

    # 3. Create Run record on the server
    run_id = self._create_server_run_record(
        attack_id=attack_id,
        victim_agent_id=victim_agent_id,
        run_config_override=run_config_override,
    )
    logger.info(
        f"AdvPrefix server Run record created with ID: {run_id} for Attack ID: {attack_id}"
    )

    # 4. Execute the local AdvPrefix attack logic (synchronous call;
    #    returns None on failure instead of raising).
    local_results_df = self._execute_local_prefix_attack(
        attack_config=attack_config,
        goals=goals,
        run_id=run_id,
        attack_id=attack_id,
        _tui_app=_tui_app,
        _tui_log_callback=_tui_log_callback,
    )

    # 5. Log where local run data is expected to live. This runs even when
    #    the local execution failed, before the failure is escalated below.
    self._log_local_run_persistence_info(
        attack_config, attack_id, run_id, fail_on_run_error
    )

    if local_results_df is None and fail_on_run_error:
        raise HackAgentError(
            f"AdvPrefix local execution failed for Attack ID {attack_id} and Run ID {run_id}."
        )

    logger.info(f"AdvPrefix attack execution completed for Attack ID {attack_id}.")
    # The local execution result is the primary return value for now.
    return local_results_df
904
+
905
+
906
+ # --- End Strategy Pattern ---