hackagent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. hackagent/__init__.py +23 -0
  2. hackagent/agent.py +193 -0
  3. hackagent/api/__init__.py +1 -0
  4. hackagent/api/agent/__init__.py +1 -0
  5. hackagent/api/agent/agent_create.py +340 -0
  6. hackagent/api/agent/agent_destroy.py +136 -0
  7. hackagent/api/agent/agent_list.py +234 -0
  8. hackagent/api/agent/agent_partial_update.py +354 -0
  9. hackagent/api/agent/agent_retrieve.py +227 -0
  10. hackagent/api/agent/agent_update.py +354 -0
  11. hackagent/api/attack/__init__.py +1 -0
  12. hackagent/api/attack/attack_create.py +264 -0
  13. hackagent/api/attack/attack_destroy.py +140 -0
  14. hackagent/api/attack/attack_list.py +242 -0
  15. hackagent/api/attack/attack_partial_update.py +278 -0
  16. hackagent/api/attack/attack_retrieve.py +235 -0
  17. hackagent/api/attack/attack_update.py +278 -0
  18. hackagent/api/key/__init__.py +1 -0
  19. hackagent/api/key/key_create.py +168 -0
  20. hackagent/api/key/key_destroy.py +97 -0
  21. hackagent/api/key/key_list.py +158 -0
  22. hackagent/api/key/key_retrieve.py +150 -0
  23. hackagent/api/prompt/__init__.py +1 -0
  24. hackagent/api/prompt/prompt_create.py +160 -0
  25. hackagent/api/prompt/prompt_destroy.py +98 -0
  26. hackagent/api/prompt/prompt_list.py +173 -0
  27. hackagent/api/prompt/prompt_partial_update.py +174 -0
  28. hackagent/api/prompt/prompt_retrieve.py +151 -0
  29. hackagent/api/prompt/prompt_update.py +174 -0
  30. hackagent/api/result/__init__.py +1 -0
  31. hackagent/api/result/result_create.py +160 -0
  32. hackagent/api/result/result_destroy.py +98 -0
  33. hackagent/api/result/result_list.py +233 -0
  34. hackagent/api/result/result_partial_update.py +178 -0
  35. hackagent/api/result/result_retrieve.py +151 -0
  36. hackagent/api/result/result_trace_create.py +178 -0
  37. hackagent/api/result/result_update.py +174 -0
  38. hackagent/api/run/__init__.py +1 -0
  39. hackagent/api/run/run_create.py +172 -0
  40. hackagent/api/run/run_destroy.py +104 -0
  41. hackagent/api/run/run_list.py +260 -0
  42. hackagent/api/run/run_partial_update.py +186 -0
  43. hackagent/api/run/run_result_create.py +178 -0
  44. hackagent/api/run/run_retrieve.py +163 -0
  45. hackagent/api/run/run_run_tests_create.py +172 -0
  46. hackagent/api/run/run_update.py +186 -0
  47. hackagent/attacks/AdvPrefix/README.md +7 -0
  48. hackagent/attacks/AdvPrefix/__init__.py +0 -0
  49. hackagent/attacks/AdvPrefix/completer.py +438 -0
  50. hackagent/attacks/AdvPrefix/config.py +59 -0
  51. hackagent/attacks/AdvPrefix/preprocessing.py +521 -0
  52. hackagent/attacks/AdvPrefix/scorer.py +259 -0
  53. hackagent/attacks/AdvPrefix/scorer_parser.py +498 -0
  54. hackagent/attacks/AdvPrefix/selector.py +246 -0
  55. hackagent/attacks/AdvPrefix/step1_generate.py +324 -0
  56. hackagent/attacks/AdvPrefix/step4_compute_ce.py +293 -0
  57. hackagent/attacks/AdvPrefix/step6_get_completions.py +387 -0
  58. hackagent/attacks/AdvPrefix/step7_evaluate_responses.py +289 -0
  59. hackagent/attacks/AdvPrefix/step8_aggregate_evaluations.py +177 -0
  60. hackagent/attacks/AdvPrefix/step9_select_prefixes.py +59 -0
  61. hackagent/attacks/AdvPrefix/utils.py +192 -0
  62. hackagent/attacks/__init__.py +6 -0
  63. hackagent/attacks/advprefix.py +1136 -0
  64. hackagent/attacks/base.py +50 -0
  65. hackagent/attacks/strategies.py +539 -0
  66. hackagent/branding.py +143 -0
  67. hackagent/client.py +328 -0
  68. hackagent/errors.py +31 -0
  69. hackagent/logger.py +67 -0
  70. hackagent/models/__init__.py +71 -0
  71. hackagent/models/agent.py +240 -0
  72. hackagent/models/agent_request.py +169 -0
  73. hackagent/models/agent_type_enum.py +12 -0
  74. hackagent/models/attack.py +154 -0
  75. hackagent/models/attack_request.py +82 -0
  76. hackagent/models/evaluation_status_enum.py +14 -0
  77. hackagent/models/organization_minimal.py +68 -0
  78. hackagent/models/paginated_agent_list.py +123 -0
  79. hackagent/models/paginated_attack_list.py +123 -0
  80. hackagent/models/paginated_prompt_list.py +123 -0
  81. hackagent/models/paginated_result_list.py +123 -0
  82. hackagent/models/paginated_run_list.py +123 -0
  83. hackagent/models/paginated_user_api_key_list.py +123 -0
  84. hackagent/models/patched_agent_request.py +176 -0
  85. hackagent/models/patched_attack_request.py +92 -0
  86. hackagent/models/patched_prompt_request.py +162 -0
  87. hackagent/models/patched_result_request.py +237 -0
  88. hackagent/models/patched_run_request.py +138 -0
  89. hackagent/models/prompt.py +226 -0
  90. hackagent/models/prompt_request.py +155 -0
  91. hackagent/models/result.py +294 -0
  92. hackagent/models/result_list_evaluation_status.py +14 -0
  93. hackagent/models/result_request.py +232 -0
  94. hackagent/models/run.py +233 -0
  95. hackagent/models/run_list_status.py +12 -0
  96. hackagent/models/run_request.py +133 -0
  97. hackagent/models/status_enum.py +12 -0
  98. hackagent/models/step_type_enum.py +14 -0
  99. hackagent/models/trace.py +121 -0
  100. hackagent/models/trace_request.py +94 -0
  101. hackagent/models/user_api_key.py +201 -0
  102. hackagent/models/user_api_key_request.py +73 -0
  103. hackagent/models/user_profile_minimal.py +76 -0
  104. hackagent/py.typed +1 -0
  105. hackagent/router/__init__.py +11 -0
  106. hackagent/router/adapters/__init__.py +5 -0
  107. hackagent/router/adapters/google_adk.py +658 -0
  108. hackagent/router/adapters/litellm_adapter.py +290 -0
  109. hackagent/router/base.py +48 -0
  110. hackagent/router/router.py +753 -0
  111. hackagent/types.py +46 -0
  112. hackagent/utils.py +61 -0
  113. hackagent/vulnerabilities/__init__.py +0 -0
  114. hackagent-0.1.0.dist-info/LICENSE +202 -0
  115. hackagent-0.1.0.dist-info/METADATA +173 -0
  116. hackagent-0.1.0.dist-info/RECORD +117 -0
  117. hackagent-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,50 @@
1
+ import abc
2
+ from typing import Any, Dict
3
+
4
+
5
class BaseAttack(abc.ABC):
    """
    Common skeleton for black-box attacks against language models.

    Construction stores the configuration, then runs the two overridable
    hooks in order: ``_validate_config`` followed by ``_setup``. Concrete
    attacks must implement ``run``.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Stores ``config`` and triggers validation and setup.

        Args:
            config: A dictionary containing configuration settings for the attack.

        Raises:
            TypeError: If ``config`` is not a dictionary (raised by the
                default ``_validate_config``).
        """
        self.config = config
        # Validation runs before setup so that setup can rely on a checked config.
        self._validate_config()
        self._setup()

    def _validate_config(self) -> None:
        """
        Checks that the stored configuration is usable.

        The base implementation only verifies that it is a ``dict``;
        subclasses can override this to enforce attack-specific requirements.

        Raises:
            TypeError: If ``self.config`` is not a dictionary.
        """
        if isinstance(self.config, dict):
            return
        raise TypeError("Configuration must be a dictionary.")

    def _setup(self) -> None:
        """
        Hook for setup work driven by the configuration.

        The base implementation does nothing; subclasses can override it for
        tasks such as loading models or data, or preparing logging and
        directories.
        """
        return None

    @abc.abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """
        Executes the attack logic.

        Args:
            **kwargs: Attack-specific arguments (e.g., input prompts, goals, dataset).

        Returns:
            Attack-specific results (e.g., adversarial examples, success metrics, report).
        """
@@ -0,0 +1,539 @@
1
+ import logging
2
+ import abc
3
+ import json # For parsing API response bodies and copying attack configs
4
+ import pandas as pd # For AdvPrefix
5
+ import os # Added for path joining
6
+ import httpx # Added for manual HTTP call in AdvPrefix
7
+ from http import HTTPStatus # Added for checking 201 status
8
+ from typing import Any, Optional, List, Dict, Tuple, TYPE_CHECKING
9
+
10
+ # Imports for specific strategies, moved from agent.py or direct_test_executor.py
11
+ from hackagent import errors # Import the errors module
12
+ from hackagent.api.run import run_run_tests_create
13
+ from hackagent.api.attack.attack_create import (
14
+ sync_detailed as attacks_create_sync_detailed,
15
+ )
16
+ from hackagent.models import Run
17
+ from hackagent.models.run_request import RunRequest
18
+ from hackagent.models.attack_request import (
19
+ AttackRequest,
20
+ ) # For creating attacks via attacks_create API
21
+ from hackagent.errors import HackAgentError
22
+ from hackagent.attacks.advprefix import (
23
+ AdvPrefixAttack,
24
+ ) # Used by the AdvPrefix strategy
25
+
26
+ if TYPE_CHECKING:
27
+ from hackagent.agent import HackAgent
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # --- Strategy Pattern for Attacks ---
32
+
33
+
34
class AttackStrategy(abc.ABC):
    """Abstract base class for an attack strategy.

    Keeps a reference to the owning ``HackAgent`` and its API client, declares
    the ``execute`` entry point that concrete strategies implement, and offers
    shared helpers that pull the attack ID (and the optional associated run
    ID) out of an attack-creation HTTP response.
    """

    def __init__(self, hack_agent: "HackAgent"):
        """
        Stores the agent and a shortcut to its client.

        Args:
            hack_agent: The HackAgent this strategy operates on. Its
                ``client`` attribute is also kept as ``self.client``.
        """
        self.hack_agent = hack_agent
        self.client = hack_agent.client

    @abc.abstractmethod
    def execute(
        self,
        attack_config: Dict[str, Any],
        run_config_override: Optional[Dict[str, Any]],
        fail_on_run_error: bool,
        max_wait_time_seconds: Optional[int] = None,
        poll_interval_seconds: Optional[int] = None,
    ) -> Any:
        """Executes the attack strategy.

        Args:
            attack_config: Strategy-specific attack configuration.
            run_config_override: Optional overrides for the run configuration.
            fail_on_run_error: Whether errors during the run should be raised.
            max_wait_time_seconds: Optional upper bound on waiting time.
            poll_interval_seconds: Optional polling interval.

        Returns:
            Strategy-specific results.
        """
        pass

    def _decode_response_content(self, response: httpx.Response) -> str:
        """
        Decodes the HTTP response content to a string.

        Invalid UTF-8 bytes are replaced rather than raising.

        Args:
            response: The httpx.Response object.

        Returns:
            The decoded content as a string, or 'N/A' if the content is empty.
        """
        if not response.content:
            return "N/A"
        return response.content.decode("utf-8", errors="replace")

    def _parse_json_from_response_data(
        self,
        response: httpx.Response,
        decoded_content: str,
        attack_type_for_error_msg: str,
    ) -> Optional[Dict[str, Any]]:
        """
        Tries to obtain a JSON object (dict) from an HTTP response.

        The decoded body is parsed first. If that yields nothing usable, the
        response's pre-parsed ``parsed`` attribute (when present) is consulted.

        Args:
            response: The httpx.Response object.
            decoded_content: The already decoded string content of the response.
            attack_type_for_error_msg: A string describing the attack type, for error messages.

        Returns:
            A dictionary if one could be obtained, None otherwise.

        Raises:
            HackAgentError: If the status is 201 and a body is present but is not valid JSON.
        """
        parsed_data_dict: Optional[Dict[str, Any]] = None

        if response.content:
            try:
                candidate = json.loads(decoded_content)
            except json.JSONDecodeError as jde:
                if (
                    response.status_code == 201
                ):  # Critical for 201 if body exists but is bad JSON
                    logger.error(
                        f"Failed to parse JSON for {attack_type_for_error_msg} (201 response with content): {jde}. Content: {decoded_content}"
                    )
                    raise HackAgentError(
                        f"Failed to parse 201 response JSON for {attack_type_for_error_msg} (content present): {jde}"
                    ) from jde
                logger.warning(
                    f"Could not parse JSON from response body for {attack_type_for_error_msg} (status {response.status_code}). Content: {decoded_content}",
                    exc_info=False,
                )  # exc_info=False to avoid verbose log for non-critical parse fail
                # Do not return yet; the pre-parsed attribute is tried next.
            else:
                # json.loads can also return a list, string, number, bool or
                # None. Only an object is usable here: callers look keys up
                # with .get(), which would fail on anything else.
                if isinstance(candidate, dict):
                    parsed_data_dict = candidate
                else:
                    logger.warning(
                        f"JSON body for {attack_type_for_error_msg} is not an object (got {type(candidate).__name__}); ignoring it."
                    )

        # Fall back to the pre-parsed attribute when the body gave no data.
        pre_parsed = getattr(response, "parsed", None)
        if not parsed_data_dict and pre_parsed:
            logger.debug(
                f"Attempting to use pre-parsed attribute for {attack_type_for_error_msg}"
            )
            extra_props = getattr(pre_parsed, "additional_properties", None)
            if isinstance(extra_props, dict):
                parsed_data_dict = extra_props
            elif isinstance(pre_parsed, dict):
                parsed_data_dict = pre_parsed
            else:
                logger.warning(
                    f"Response has 'parsed' attribute but it's not a usable dict for {attack_type_for_error_msg}. Type: {type(pre_parsed)}"
                )

        return parsed_data_dict

    def _get_parsed_data_from_initiate_response(
        self,
        response: httpx.Response,
        decoded_content: str,
        attack_type_for_error_msg: str,
    ) -> Dict[str, Any]:
        """
        Retrieves the parsed data dictionary from an attack initiation
        response, applying status-code specific checks.

        * 201: a non-empty dictionary is required.
        * 300 and above: always treated as a failure.
        * Any other status: logged as unexpected; a non-empty dictionary is
          still required.

        Args:
            response: The httpx.Response object.
            decoded_content: Decoded string content of the response.
            attack_type_for_error_msg: String describing attack type for errors.

        Returns:
            The parsed (non-empty) data dictionary.

        Raises:
            HackAgentError: If the response indicates failure or no usable
                dictionary could be obtained.
        """
        parsed_data_dict = self._parse_json_from_response_data(
            response, decoded_content, attack_type_for_error_msg
        )
        status = response.status_code

        if status == 201:
            if not parsed_data_dict:
                # Either there was no content, the content was not a JSON
                # object, or the pre-parsed attribute was unusable. Invalid
                # JSON with content present has already raised in the parser.
                pre_parsed = getattr(response, "parsed", None)
                logger.error(
                    f"201 for {attack_type_for_error_msg} but no parsable dictionary body was found. Decoded content: '{decoded_content}', Pre-parsed type: {type(pre_parsed)}"
                )
                raise HackAgentError(
                    f"201 for {attack_type_for_error_msg} but no parsable dictionary body was found."
                )

        elif status >= 300:
            err_text = f"Failed to initiate {attack_type_for_error_msg}. Status: {response.status_code}, Body: {decoded_content}"
            logger.error(err_text)
            raise HackAgentError(err_text)

        else:  # Unexpected success status codes (e.g., 200 OK instead of 201 Created, or other 2xx)
            logger.warning(
                f"Unexpected success status {response.status_code} from initiate_{attack_type_for_error_msg}. Content: {decoded_content}"
            )
            if not parsed_data_dict:
                err_text = (
                    f"Could not obtain parsable data from initiate_{attack_type_for_error_msg} response with unexpected status {response.status_code}. "
                    f"Content: {decoded_content}"
                )
                logger.error(err_text)
                raise HackAgentError(err_text)

        # Every branch above raises when no data was obtained, so the
        # dictionary is guaranteed to be non-empty here.
        return parsed_data_dict

    def _extract_ids_from_data_dict(
        self,
        parsed_data_dict: Dict[str, Any],
        attack_type_for_error_msg: str,
        original_content: str,
    ) -> Tuple[str, Optional[str]]:
        """
        Extracts 'id' (attack_id) and optionally 'associated_run_id' from a
        parsed data dictionary. Both are converted to strings when present.

        Args:
            parsed_data_dict: Dictionary parsed from the attack creation response.
            attack_type_for_error_msg: String describing attack type for messages.
            original_content: Decoded response body, included in the error text.

        Returns:
            A ``(attack_id, run_id)`` tuple; ``run_id`` is None when the
            response carries no 'associated_run_id'.

        Raises:
            HackAgentError: If the mandatory 'id' is missing or None.
        """
        raw_attack_id = parsed_data_dict.get("id")
        attack_id_str = str(raw_attack_id) if raw_attack_id is not None else None

        if attack_id_str is None:
            err_detail = (
                f"Could not extract mandatory attack_id ('{attack_id_str}') "
                f"from initiate_{attack_type_for_error_msg} response. "
                f"Source dict: {parsed_data_dict}, Original Decoded Content: '{original_content}'"
            )
            logger.error(err_detail)
            raise HackAgentError(err_detail)

        raw_run_id = parsed_data_dict.get("associated_run_id")
        run_id_str = str(raw_run_id) if raw_run_id is not None else None

        logger.info(
            f"Extracted Attack ID: {attack_id_str} and optional server-associated Run ID: {run_id_str if run_id_str else 'Not Provided'} for {attack_type_for_error_msg}."
        )
        return attack_id_str, run_id_str

    def extract_attack_and_run_ids_from_initiate_response(
        self, response: httpx.Response, attack_type_for_error_msg: str = "attack"
    ) -> Tuple[str, Optional[str]]:
        """Orchestrates the extraction of attack_id and optionally associated_run_id from an Attack creation response.

        Args:
            response: The HTTP response returned by the attack creation call.
            attack_type_for_error_msg: String describing attack type for messages.

        Returns:
            A ``(attack_id, run_id)`` tuple; ``run_id`` may be None.

        Raises:
            HackAgentError: If the response signals failure, carries no usable
                dictionary, or lacks the mandatory attack ID.
        """
        logger.debug(
            f"Attempting to extract Attack/Run IDs for '{attack_type_for_error_msg}' from response (status: {response.status_code})"
        )
        decoded_content = self._decode_response_content(response)
        parsed_data_dict = self._get_parsed_data_from_initiate_response(
            response, decoded_content, attack_type_for_error_msg
        )
        return self._extract_ids_from_data_dict(
            parsed_data_dict, attack_type_for_error_msg, decoded_content
        )
239
+
240
+
241
class AdvPrefix(AttackStrategy):
    """Strategy for 'advprefix' attacks.

    Creates an Attack record and a Run record on the server, then runs
    ``AdvPrefixAttack`` locally against the agent's router, tagged with the
    server-side run ID.
    """

    def _prepare_and_validate_attack_params(
        self,
        attack_config: Dict[str, Any],
    ) -> List[Any]:
        """Validates and extracts necessary parameters from attack_config.

        Args:
            attack_config: Attack configuration; must hold a list under 'goals'.

        Returns:
            The 'goals' list.

        Raises:
            ValueError: If 'goals' is missing or is not a list.
        """
        goals = attack_config.get("goals")
        if not isinstance(goals, list):
            raise ValueError(
                "'attack_config' must contain 'goals' list for AdvPrefixAttack."
            )
        return goals

    def _create_server_attack_record(
        self,
        victim_agent_id: str,
        organization_id: str,
        attack_config: Dict[str, Any],  # Sent to the server as 'configuration'
    ) -> str:
        """Creates the Attack record on the server and returns the attack_id.

        Args:
            victim_agent_id: ID of the agent under attack.
            organization_id: ID of the owning organization.
            attack_config: Attack configuration stored with the record.

        Returns:
            The ID of the created attack, as a string.

        Raises:
            HackAgentError: If the request cannot be built or sent, or if the
                response does not yield an attack ID.
        """
        logger.info("Creating Attack record on the server.")
        attack_type = "advprefix"

        payload = {
            "type": attack_type,
            "agent": victim_agent_id,
            "organization": organization_id,
            "configuration": attack_config,
        }
        try:
            attack_req_obj = AttackRequest.from_dict(payload)
            logger.debug(
                f"Attempting to create Attack record with payload: {attack_req_obj.to_dict()}"
            )
            response = attacks_create_sync_detailed(
                client=self.client, body=attack_req_obj
            )
        except Exception as e:
            logger.error(
                f"Failed to construct/send AttackRequest for {attack_type} record: {e}",
                exc_info=True,
            )
            raise HackAgentError(
                f"Failed to send AttackRequest for {attack_type} record: {e}"
            ) from e

        attack_id, _ = self.extract_attack_and_run_ids_from_initiate_response(
            response=response, attack_type_for_error_msg=attack_type
        )
        logger.info(f"Attack record created on server. Attack ID: {attack_id}.")
        return attack_id

    def _create_server_run_record(
        self,
        attack_id: str,
        victim_agent_id: str,
        run_config_override: Optional[Dict[str, Any]],
    ) -> str:
        """Explicitly creates a Run record on the server and returns the run_id.

        If the generated client returns 201 without a parsed ``Run``, the
        response body is parsed manually as a fallback.

        Args:
            attack_id: ID of the attack the run belongs to.
            victim_agent_id: ID of the agent under attack.
            run_config_override: Optional run configuration; an empty dict is
                sent when it is None or empty.

        Returns:
            The ID of the created run, as a string.

        Raises:
            HackAgentError: If the API call fails or no run ID can be obtained.
        """
        logger.info(
            f"Attempting to explicitly create a Run record for Attack ID: {attack_id}"
        )
        payload = RunRequest(
            attack=attack_id,
            agent=victim_agent_id,
            run_config=run_config_override if run_config_override else {},
        )
        try:
            # response_obj is the custom hackagent.types.Response[Run]
            response_obj = run_run_tests_create.sync_detailed(
                client=self.client, body=payload
            )

            created_run: Optional[Run] = response_obj.parsed

            # If the auto-generated client didn't parse for 201, but it's a success, try manual parsing.
            if created_run is None and response_obj.status_code == HTTPStatus.CREATED:
                logger.info(
                    f"Run creation returned 201 (CREATED), attempting to manually parse response content for Attack ID: {attack_id}"
                )
                if response_obj.content:
                    try:
                        created_run_data = json.loads(
                            response_obj.content.decode("utf-8")
                        )
                        created_run = Run.from_dict(
                            created_run_data
                        )  # Use the Run model's from_dict
                        logger.info(
                            f"Manually parsed Run object from 201 response for Attack ID {attack_id}. Run ID: {created_run.id if created_run and hasattr(created_run, 'id') else 'Parse_Failed_Or_No_ID'}"
                        )
                    except json.JSONDecodeError as jde:
                        logger.error(
                            f"Failed to manually parse JSON from 201 response content for Attack ID {attack_id}: {jde}. Content: {response_obj.content.decode('utf-8', errors='replace')}",
                            exc_info=True,
                        )
                        # created_run remains None, will be caught by the check below
                    except Exception as e:
                        logger.error(
                            f"Unexpected error manually parsing 201 response content for Attack ID {attack_id}: {e}",
                            exc_info=True,
                        )
                        # created_run remains None, will be caught by the check below
                else:
                    logger.warning(
                        f"Run creation returned 201 (CREATED) but response content was empty for Attack ID: {attack_id}. Cannot manually parse."
                    )

            if not created_run or not hasattr(created_run, "id") or not created_run.id:
                status_code_val = getattr(
                    response_obj, "status_code", "Unknown Status"
                )
                raw_content = getattr(response_obj, "content", None)
                content_val = (
                    raw_content.decode("utf-8", errors="replace")
                    if raw_content
                    else "No content"
                )

                logger.error(
                    f"Failed to get valid Run ID from run creation for Attack {attack_id}. "
                    f"Status: {status_code_val}, Parsed: {created_run}, Content: {content_val}"
                )
                raise HackAgentError(
                    f"Server API for Run creation returned status {status_code_val} "
                    f"but response parsing failed, lacked Run ID, or an error occurred. Content: {content_val}"
                )

            run_id = str(created_run.id)
            logger.info(
                f"Successfully created Run ID: {run_id} for Attack ID: {attack_id}"
            )
            return run_id

        except errors.UnexpectedStatus as use:
            # This is caught if client.raise_on_unexpected_status is True and server returns non-200
            error_content = (
                use.content.decode("utf-8", errors="replace")
                if use.content
                else "No content"
            )
            logger.error(
                f"API error (UnexpectedStatus {use.status_code}) creating Run for Attack {attack_id}: {error_content}",
                exc_info=True,
            )
            raise HackAgentError(
                f"Failed to create Run for Attack {attack_id} (API status {use.status_code}): {error_content}"
            ) from use
        except HackAgentError:
            # Raised (and already logged) by the run-ID check above. Let it
            # through untouched so the generic handler below does not log it
            # a second time and wrap it in another HackAgentError.
            raise
        except Exception as e:
            logger.error(
                f"Error creating Run for Attack {attack_id}: {e}", exc_info=True
            )
            raise HackAgentError(
                f"Failed to create Run for Attack {attack_id}: {e}"
            ) from e

    def _prepare_attack_config(
        self,
        attack_config: Dict[str, Any],
        run_id: str,
        attack_id: str,
    ) -> Dict[str, Any]:
        """Prepares the configuration for the local AdvPrefixAttack.

        Works on a copy of ``attack_config``: 'run_id' is set to the server
        run ID, and 'output_dir' receives a default when absent.

        Args:
            attack_config: The caller's attack configuration (left unmodified).
            run_id: Server-side run ID to place in the config.
            attack_id: Attack ID, used for the default output directory.

        Returns:
            The adjusted copy of the configuration.
        """
        logger.debug(f"Preparing local attack config for Run ID: {run_id}")
        # Deep copy through a JSON round trip; this requires the config to be
        # JSON-serializable.
        current_config = json.loads(json.dumps(attack_config))

        original_run_id = current_config.get("run_id")
        current_config["run_id"] = run_id
        if original_run_id and original_run_id != run_id:
            logger.info(
                f"Updated 'run_id' in attack_config from '{original_run_id}' to server Run ID '{run_id}'."
            )
        elif not original_run_id:
            logger.info(f"Set 'run_id' in attack_config to server Run ID '{run_id}'.")

        if "output_dir" not in current_config:
            current_config["output_dir"] = f"./hackagent_local_runs/{attack_id}"
            logger.warning(
                f"'output_dir' not in attack_config, defaulting to {current_config['output_dir']}"
            )

        return current_config

    async def _execute_local_prefix_attack(
        self,
        attack_config: Dict[str, Any],
        goals: List[Any],
        run_id: str,  # For logging and passed to the runner as initial_run_id
        attack_id: str,  # For logging
    ) -> Optional[pd.DataFrame]:
        """Executes the AdvPrefixAttack locally.

        Args:
            attack_config: Prepared configuration for the local attack.
            goals: Goals handed to the attack runner.
            run_id: Server-side run ID, forwarded as ``initial_run_id``.
            attack_id: Attack ID, used in log messages.

        Returns:
            Whatever the runner's ``run`` coroutine returns.
        """
        logger.info(
            f"Starting local AdvPrefixAttack for Attack ID {attack_id} (Run ID: {run_id})..."
        )
        runner = AdvPrefixAttack(
            config=attack_config,
            client=self.hack_agent.client,  # Pass existing client
            agent_router=self.hack_agent.router,  # Pass main victim router
        )
        results_df = await runner.run(goals=goals, initial_run_id=run_id)
        logger.info(f"Local AdvPrefixAttack completed for Attack ID {attack_id}.")
        return results_df

    def _log_local_run_persistence_info(
        self,
        attack_config: Dict[str, Any],
        attack_id: str,
        run_id: str,
        fail_on_run_error: bool,  # Accepted for interface stability; not acted upon
    ):
        """Logs information about where local run data (like CSVs for Step10) would be.

        This step only logs. Any error while building the message is logged
        and otherwise ignored, regardless of ``fail_on_run_error``.

        Args:
            attack_config: Prepared attack configuration.
            attack_id: Attack ID, used for the default output directory.
            run_id: Run ID, used for the run sub-directory name.
            fail_on_run_error: Currently has no effect on this method.
        """
        try:
            base_output_dir = attack_config.get(
                "output_dir", f"./hackagent_local_runs/{attack_id}"
            )
            actual_run_output_dir = os.path.join(base_output_dir, f"run_{run_id}")
            input_csv_hint = attack_config.get(
                "input_csv_for_model_persistence", "step9_output.csv"
            )
            logger.info(
                f"Local run data (for potential Pydantic model persistence/Step10): Dir='{actual_run_output_dir}', CSV hint='{input_csv_hint}'."
            )
        except Exception as e:
            # Purely informational step: log and continue rather than fail the run.
            logger.error(
                f"Error preparing local run persistence info for Attack {attack_id}: {e}",
                exc_info=True,
            )

    async def execute(
        self,
        attack_config: Dict[str, Any],
        run_config_override: Optional[Dict[str, Any]],
        fail_on_run_error: bool,
        max_wait_time_seconds: Optional[int] = None,
        poll_interval_seconds: Optional[int] = None,
    ) -> Any:
        """Runs the full AdvPrefix flow: server records, then the local attack.

        Note that this is a coroutine and must be awaited.

        Args:
            attack_config: Attack configuration; must hold a 'goals' list.
            run_config_override: Optional run configuration for the Run record.
            fail_on_run_error: If True, any failure is raised as
                HackAgentError; otherwise it is logged and None is returned.
            max_wait_time_seconds: Accepted so the signature matches
                ``AttackStrategy.execute``; ignored by this strategy.
            poll_interval_seconds: Accepted so the signature matches
                ``AttackStrategy.execute``; ignored by this strategy.

        Returns:
            The result of the local attack, or None if an error occurred and
            ``fail_on_run_error`` is False.

        Raises:
            HackAgentError: If a step fails and ``fail_on_run_error`` is True.
        """
        logger.info("Executing AdvPrefix.")
        router = self.hack_agent.router
        attack_id_str: Optional[str] = None

        try:
            goals = self._prepare_and_validate_attack_params(attack_config)

            attack_id_str = self._create_server_attack_record(
                victim_agent_id=str(router.backend_agent.id),
                organization_id=str(router.organization_id),
                attack_config=attack_config,
            )

            run_id_for_local_ops = self._create_server_run_record(
                attack_id=attack_id_str,
                victim_agent_id=str(router.backend_agent.id),
                run_config_override=run_config_override,
            )

            current_attack_config = self._prepare_attack_config(
                attack_config=attack_config,
                run_id=run_id_for_local_ops,
                attack_id=attack_id_str,
            )

            local_results_df = await self._execute_local_prefix_attack(
                attack_config=current_attack_config,
                goals=goals,
                run_id=run_id_for_local_ops,
                attack_id=attack_id_str,
            )

            self._log_local_run_persistence_info(
                attack_config=current_attack_config,
                attack_id=attack_id_str,
                run_id=run_id_for_local_ops,
                fail_on_run_error=fail_on_run_error,
            )

            return local_results_df  # Return the DataFrame as per original behavior

        except Exception as e:
            # attack_id_str is still None if the failure happened before the
            # Attack record was created.
            log_attack_id = attack_id_str or "PRE-ATTACK_CREATION"
            logger.error(
                f"Error in AdvPrefix for Attack ID '{log_attack_id}': {e}",
                exc_info=True,
            )
            if fail_on_run_error:
                raise HackAgentError(
                    f"AdvPrefix failed for Attack ID {log_attack_id}: {e}"
                ) from e
            return None  # Return None if not failing on error and an error occurred
537
+
538
+
539
+ # --- End Strategy Pattern ---