hackagent 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hackagent/__init__.py +23 -0
- hackagent/agent.py +193 -0
- hackagent/api/__init__.py +1 -0
- hackagent/api/agent/__init__.py +1 -0
- hackagent/api/agent/agent_create.py +340 -0
- hackagent/api/agent/agent_destroy.py +136 -0
- hackagent/api/agent/agent_list.py +234 -0
- hackagent/api/agent/agent_partial_update.py +354 -0
- hackagent/api/agent/agent_retrieve.py +227 -0
- hackagent/api/agent/agent_update.py +354 -0
- hackagent/api/attack/__init__.py +1 -0
- hackagent/api/attack/attack_create.py +264 -0
- hackagent/api/attack/attack_destroy.py +140 -0
- hackagent/api/attack/attack_list.py +242 -0
- hackagent/api/attack/attack_partial_update.py +278 -0
- hackagent/api/attack/attack_retrieve.py +235 -0
- hackagent/api/attack/attack_update.py +278 -0
- hackagent/api/key/__init__.py +1 -0
- hackagent/api/key/key_create.py +168 -0
- hackagent/api/key/key_destroy.py +97 -0
- hackagent/api/key/key_list.py +158 -0
- hackagent/api/key/key_retrieve.py +150 -0
- hackagent/api/prompt/__init__.py +1 -0
- hackagent/api/prompt/prompt_create.py +160 -0
- hackagent/api/prompt/prompt_destroy.py +98 -0
- hackagent/api/prompt/prompt_list.py +173 -0
- hackagent/api/prompt/prompt_partial_update.py +174 -0
- hackagent/api/prompt/prompt_retrieve.py +151 -0
- hackagent/api/prompt/prompt_update.py +174 -0
- hackagent/api/result/__init__.py +1 -0
- hackagent/api/result/result_create.py +160 -0
- hackagent/api/result/result_destroy.py +98 -0
- hackagent/api/result/result_list.py +233 -0
- hackagent/api/result/result_partial_update.py +178 -0
- hackagent/api/result/result_retrieve.py +151 -0
- hackagent/api/result/result_trace_create.py +178 -0
- hackagent/api/result/result_update.py +174 -0
- hackagent/api/run/__init__.py +1 -0
- hackagent/api/run/run_create.py +172 -0
- hackagent/api/run/run_destroy.py +104 -0
- hackagent/api/run/run_list.py +260 -0
- hackagent/api/run/run_partial_update.py +186 -0
- hackagent/api/run/run_result_create.py +178 -0
- hackagent/api/run/run_retrieve.py +163 -0
- hackagent/api/run/run_run_tests_create.py +172 -0
- hackagent/api/run/run_update.py +186 -0
- hackagent/attacks/AdvPrefix/README.md +7 -0
- hackagent/attacks/AdvPrefix/__init__.py +0 -0
- hackagent/attacks/AdvPrefix/completer.py +438 -0
- hackagent/attacks/AdvPrefix/config.py +59 -0
- hackagent/attacks/AdvPrefix/preprocessing.py +521 -0
- hackagent/attacks/AdvPrefix/scorer.py +259 -0
- hackagent/attacks/AdvPrefix/scorer_parser.py +498 -0
- hackagent/attacks/AdvPrefix/selector.py +246 -0
- hackagent/attacks/AdvPrefix/step1_generate.py +324 -0
- hackagent/attacks/AdvPrefix/step4_compute_ce.py +293 -0
- hackagent/attacks/AdvPrefix/step6_get_completions.py +387 -0
- hackagent/attacks/AdvPrefix/step7_evaluate_responses.py +289 -0
- hackagent/attacks/AdvPrefix/step8_aggregate_evaluations.py +177 -0
- hackagent/attacks/AdvPrefix/step9_select_prefixes.py +59 -0
- hackagent/attacks/AdvPrefix/utils.py +192 -0
- hackagent/attacks/__init__.py +6 -0
- hackagent/attacks/advprefix.py +1136 -0
- hackagent/attacks/base.py +50 -0
- hackagent/attacks/strategies.py +539 -0
- hackagent/branding.py +143 -0
- hackagent/client.py +328 -0
- hackagent/errors.py +31 -0
- hackagent/logger.py +67 -0
- hackagent/models/__init__.py +71 -0
- hackagent/models/agent.py +240 -0
- hackagent/models/agent_request.py +169 -0
- hackagent/models/agent_type_enum.py +12 -0
- hackagent/models/attack.py +154 -0
- hackagent/models/attack_request.py +82 -0
- hackagent/models/evaluation_status_enum.py +14 -0
- hackagent/models/organization_minimal.py +68 -0
- hackagent/models/paginated_agent_list.py +123 -0
- hackagent/models/paginated_attack_list.py +123 -0
- hackagent/models/paginated_prompt_list.py +123 -0
- hackagent/models/paginated_result_list.py +123 -0
- hackagent/models/paginated_run_list.py +123 -0
- hackagent/models/paginated_user_api_key_list.py +123 -0
- hackagent/models/patched_agent_request.py +176 -0
- hackagent/models/patched_attack_request.py +92 -0
- hackagent/models/patched_prompt_request.py +162 -0
- hackagent/models/patched_result_request.py +237 -0
- hackagent/models/patched_run_request.py +138 -0
- hackagent/models/prompt.py +226 -0
- hackagent/models/prompt_request.py +155 -0
- hackagent/models/result.py +294 -0
- hackagent/models/result_list_evaluation_status.py +14 -0
- hackagent/models/result_request.py +232 -0
- hackagent/models/run.py +233 -0
- hackagent/models/run_list_status.py +12 -0
- hackagent/models/run_request.py +133 -0
- hackagent/models/status_enum.py +12 -0
- hackagent/models/step_type_enum.py +14 -0
- hackagent/models/trace.py +121 -0
- hackagent/models/trace_request.py +94 -0
- hackagent/models/user_api_key.py +201 -0
- hackagent/models/user_api_key_request.py +73 -0
- hackagent/models/user_profile_minimal.py +76 -0
- hackagent/py.typed +1 -0
- hackagent/router/__init__.py +11 -0
- hackagent/router/adapters/__init__.py +5 -0
- hackagent/router/adapters/google_adk.py +658 -0
- hackagent/router/adapters/litellm_adapter.py +290 -0
- hackagent/router/base.py +48 -0
- hackagent/router/router.py +753 -0
- hackagent/types.py +46 -0
- hackagent/utils.py +61 -0
- hackagent/vulnerabilities/__init__.py +0 -0
- hackagent-0.1.0.dist-info/LICENSE +202 -0
- hackagent-0.1.0.dist-info/METADATA +173 -0
- hackagent-0.1.0.dist-info/RECORD +117 -0
- hackagent-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from rich.progress import (
|
|
7
|
+
Progress,
|
|
8
|
+
BarColumn,
|
|
9
|
+
TextColumn,
|
|
10
|
+
TimeRemainingColumn,
|
|
11
|
+
MofNCompleteColumn,
|
|
12
|
+
SpinnerColumn,
|
|
13
|
+
)
|
|
14
|
+
from .utils import call_litellm_completion # Import the utility
|
|
15
|
+
|
|
16
|
+
# Configure LiteLLM logging (optional)
|
|
17
|
+
# litellm.set_verbose = True
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# --- Configuration Classes ---
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ScorerConfig:
    """Settings shared by every scorer.

    Attributes:
        model_id: Identifier (LiteLLM model string) of the model used for
            scoring. Required; it has no default.
        batch_size: Batch size; defaults to 1 for LiteLLM scorer processing.
    """

    model_id: str
    batch_size: int = 1
    # NOTE: a `surrogate_attack_prompt: str = ""` field is intentionally left
    # disabled here; it may be needed if this base config ever formats prompts.
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class LiteLLMAPIScoreConfig(ScorerConfig):
    """Scorer settings specific to LiteLLM API-based scoring.

    Attributes:
        endpoint: Optional endpoint passed to the API call; ``None`` by default.
        api_key: Optional API key setting; ``None`` by default.
        surrogate_attack_prompt: Prompt template used for the API call when
            one is needed; empty by default.
        logprob_token_buffer: How many extra tokens to request beyond the
            estimated prefix length.
        request_timeout: Timeout for API calls.
    """

    endpoint: Optional[str] = None
    api_key: Optional[str] = None
    surrogate_attack_prompt: str = ""
    logprob_token_buffer: int = 5
    request_timeout: int = 120
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# --- Base Scorer Class (Optional but good practice) ---
|
|
47
|
+
class BaseScorer:
    """Common parent for scorers; subclasses provide ``calculate_score``."""

    def __init__(self, config: ScorerConfig):
        """Keep *config* and create a logger named after the concrete class."""
        self.logger = logging.getLogger(type(self).__name__)
        self.config = config

    def calculate_score(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate scores (e.g., NLL) for prefixes.

        Raises:
            NotImplementedError: Always; this base class has no scoring logic.
        """
        raise NotImplementedError()

    def __del__(self):
        """Cleanup hook; the base class has nothing to release."""
        return None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class LiteLLMAPIScorer(BaseScorer):
    """
    Calculate an approximate NLL score for prefixes using LiteLLM APIs that support logprobs.

    For every (goal, prefix) row the goal is sent as a single user message with
    ``temperature=0.0`` and ``logprobs=True``. When the returned text lines up
    with the prefix, the negated sum of the returned token logprobs becomes the
    score; rows that cannot be scored keep ``float("inf")``.

    Note: This is NOT equivalent to precise CE calculation.
    """

    def __init__(self, config: LiteLLMAPIScoreConfig):
        """Store the config and resolve the API key from the environment.

        Args:
            config: Scorer settings. ``config.api_key`` is used as the *name*
                of an environment variable holding the key; a warning is
                logged when that variable is unset or empty.
        """
        super().__init__(config)
        self.config: LiteLLMAPIScoreConfig  # Type hint
        self.api_key = None
        if self.config.api_key:
            self.api_key = os.environ.get(self.config.api_key)
            if not self.api_key:
                self.logger.warning(
                    f"Environment variable {self.config.api_key} not set for API key."
                )

        self.logger.info(
            f"LiteLLMAPIScorer initialized for model {self.config.model_id}. Token counts will be estimated based on character length."
        )

    def _estimate_token_count(self, text: str) -> int:
        """Estimate token count using character count."""
        # Rough estimate: 4 chars per token (adjust if needed)
        return (len(text) // 4) + 1

    @staticmethod
    def _entry_field(entry, name):
        """Read *name* from a logprob entry that is mapping-like or an object."""
        # Prefer ``.get`` (the access the scorer has always used); fall back to
        # attribute access so plain objects are not rejected.
        getter = getattr(entry, "get", None)
        if callable(getter):
            return getter(name)
        return getattr(entry, name, None)

    def _nll_from_logprob_entries(self, index, entries, target_prefix: str) -> float:
        """Turn returned per-token logprobs into an approximate NLL.

        Args:
            index: Row label, used only in log messages.
            entries: Non-empty list of per-token logprob entries, each exposing
                ``token`` and ``logprob``.
            target_prefix: The prefix being scored, already left-stripped.

        Returns:
            The negated logprob sum, or ``float("inf")`` when the generated
            text does not line up with the prefix or no logprob is usable.
        """
        returned_tokens = [self._entry_field(item, "token") for item in entries]
        returned_logprobs = [self._entry_field(item, "logprob") for item in entries]
        generated_text = "".join(t for t in returned_tokens if t is not None)

        if generated_text.startswith(target_prefix):
            # Sum over the tokens that actually spell out the prefix. The text
            # is the concatenation of the tokens, so cumulative token lengths
            # say exactly when the prefix has been covered; a character-based
            # token estimate would stop too early or run past it. The last
            # token may straddle the prefix boundary and is counted in full.
            covered_chars = 0
            count = 0
            summed_logprob = 0.0
            for tok, lp in zip(returned_tokens, returned_logprobs):
                if tok is not None:
                    covered_chars += len(tok)
                if lp is not None:
                    summed_logprob += lp
                    count += 1
                if covered_chars >= len(target_prefix):
                    break
            if count > 0:
                return -summed_logprob
            self.logger.warning(
                f"Logprob alignment for item {index}: No valid logprobs found."
            )
            return float("inf")

        if target_prefix.startswith(generated_text) and len(generated_text) > 0:
            # Generation stopped before the prefix was complete: score what
            # was returned.
            self.logger.warning(
                f"Logprob alignment for item {index}: API generated only '{generated_text[:50]}...' which is a prefix of target."
            )
            usable = [lp for lp in returned_logprobs if lp is not None]
            if usable:
                return -sum(usable)
            return float("inf")

        self.logger.warning(
            f"Logprob alignment for item {index}: Generated text '{generated_text[:50]}...' does not match target prefix '{target_prefix[:50]}...'."
        )
        return float("inf")

    def _score_row(self, index, goal, prefix) -> float:
        """Request a completion for *goal* and score *prefix* against it.

        Returns ``float("inf")`` when the call reports an error or returns no
        usable logprobs. Exceptions raised here are handled by the caller.
        """
        # 1. Format the goal as the prompt
        formatted_goal = goal
        if self.config.surrogate_attack_prompt:
            try:
                formatted_goal += self.config.surrogate_attack_prompt.format(prefix="")
            except Exception as fmt_e:
                # Best effort: fall back to the unformatted goal.
                self.logger.warning(
                    f"Failed to format surrogate prompt for goal scoring, using original. Error: {fmt_e}"
                )

        messages = [{"role": "user", "content": formatted_goal}]

        # 2. Estimate prefix length and set max_tokens
        target_prefix = prefix.lstrip()
        num_prefix_tokens = self._estimate_token_count(target_prefix)
        max_tokens_to_request = num_prefix_tokens + self.config.logprob_token_buffer

        # 3. Call the utility function requesting logprobs
        _content, logprob_info, error = call_litellm_completion(
            model_id=self.config.model_id,
            messages=messages,
            endpoint=self.config.endpoint,
            api_key=self.api_key,
            timeout=self.config.request_timeout,
            temperature=0.0,  # Deterministic for scoring
            max_tokens=max_tokens_to_request,
            logprobs=True,  # Request logprobs
            logger=self.logger,
        )

        # 4. Handle error or process logprobs
        if error:
            self.logger.error(
                f"LiteLLM call failed during NLL calculation for item {index}: {error}"
            )
            return float("inf")
        if not logprob_info:
            # Utility function already logs warning if logprobs requested but not found
            return float("inf")

        entries = getattr(logprob_info, "content", None)
        if not (entries and isinstance(entries, list)):
            self.logger.warning(
                f"Logprobs returned for item {index}, but in unexpected format: {logprob_info}"
            )
            return float("inf")

        return self._nll_from_logprob_entries(index, entries, target_prefix)

    def calculate_score(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate approximate NLL score using litellm.completion logprobs via utility.

        Args:
            df: DataFrame with 'goal' and 'prefix' columns.

        Returns:
            A copy of ``df`` with a 'prefix_nll' column (approximate score);
            rows that could not be scored hold ``float("inf")``.

        Raises:
            KeyError: If a row lacks the 'goal' or 'prefix' column.
        """
        self.logger.info(
            f"Calculating approximate NLL (LiteLLM) for {len(df)} prefixes using {self.config.model_id}"
        )

        result_df = df.copy()
        approx_nlls = []
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.1f}%"),
            TimeRemainingColumn(),
        ) as progress_bar:
            task = progress_bar.add_task(
                "[cyan]Calculating Approx NLL (LiteLLM)...", total=len(df)
            )
            for index, row in df.iterrows():
                # Column lookups stay outside the try so a missing column
                # surfaces to the caller instead of being scored as inf.
                goal = row["goal"]
                prefix = row["prefix"]

                try:
                    current_nll = self._score_row(index, goal, prefix)
                except Exception as e:  # Catch errors in the calling code (e.g., formatting, estimate_token_count)
                    self.logger.error(
                        f"Error calculating approx NLL for item {index} outside LiteLLM call: {e}",
                        exc_info=True,
                    )
                    current_nll = float("inf")  # Default to bad score

                approx_nlls.append(current_nll)
                progress_bar.update(task, advance=1)

        # Exactly one score was appended per row, so positional assignment is safe.
        result_df["prefix_nll"] = approx_nlls

        self.logger.info(
            f"Finished calculating approximate NLL (LiteLLM) for {len(result_df)} prefixes."
        )
        return result_df

    def __del__(self):
        """Log that the scorer is being released; there is nothing to clean up."""
        # __del__ may run on a partially built instance or during interpreter
        # shutdown, so the log call must never raise from here.
        instance_logger = getattr(self, "logger", None)
        if instance_logger is None:
            return
        try:
            instance_logger.info(
                "LiteLLMAPIScorer resources released (no explicit cleanup needed)."
            )
        except Exception:
            pass
|