hackagent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. hackagent/__init__.py +23 -0
  2. hackagent/agent.py +193 -0
  3. hackagent/api/__init__.py +1 -0
  4. hackagent/api/agent/__init__.py +1 -0
  5. hackagent/api/agent/agent_create.py +340 -0
  6. hackagent/api/agent/agent_destroy.py +136 -0
  7. hackagent/api/agent/agent_list.py +234 -0
  8. hackagent/api/agent/agent_partial_update.py +354 -0
  9. hackagent/api/agent/agent_retrieve.py +227 -0
  10. hackagent/api/agent/agent_update.py +354 -0
  11. hackagent/api/attack/__init__.py +1 -0
  12. hackagent/api/attack/attack_create.py +264 -0
  13. hackagent/api/attack/attack_destroy.py +140 -0
  14. hackagent/api/attack/attack_list.py +242 -0
  15. hackagent/api/attack/attack_partial_update.py +278 -0
  16. hackagent/api/attack/attack_retrieve.py +235 -0
  17. hackagent/api/attack/attack_update.py +278 -0
  18. hackagent/api/key/__init__.py +1 -0
  19. hackagent/api/key/key_create.py +168 -0
  20. hackagent/api/key/key_destroy.py +97 -0
  21. hackagent/api/key/key_list.py +158 -0
  22. hackagent/api/key/key_retrieve.py +150 -0
  23. hackagent/api/prompt/__init__.py +1 -0
  24. hackagent/api/prompt/prompt_create.py +160 -0
  25. hackagent/api/prompt/prompt_destroy.py +98 -0
  26. hackagent/api/prompt/prompt_list.py +173 -0
  27. hackagent/api/prompt/prompt_partial_update.py +174 -0
  28. hackagent/api/prompt/prompt_retrieve.py +151 -0
  29. hackagent/api/prompt/prompt_update.py +174 -0
  30. hackagent/api/result/__init__.py +1 -0
  31. hackagent/api/result/result_create.py +160 -0
  32. hackagent/api/result/result_destroy.py +98 -0
  33. hackagent/api/result/result_list.py +233 -0
  34. hackagent/api/result/result_partial_update.py +178 -0
  35. hackagent/api/result/result_retrieve.py +151 -0
  36. hackagent/api/result/result_trace_create.py +178 -0
  37. hackagent/api/result/result_update.py +174 -0
  38. hackagent/api/run/__init__.py +1 -0
  39. hackagent/api/run/run_create.py +172 -0
  40. hackagent/api/run/run_destroy.py +104 -0
  41. hackagent/api/run/run_list.py +260 -0
  42. hackagent/api/run/run_partial_update.py +186 -0
  43. hackagent/api/run/run_result_create.py +178 -0
  44. hackagent/api/run/run_retrieve.py +163 -0
  45. hackagent/api/run/run_run_tests_create.py +172 -0
  46. hackagent/api/run/run_update.py +186 -0
  47. hackagent/attacks/AdvPrefix/README.md +7 -0
  48. hackagent/attacks/AdvPrefix/__init__.py +0 -0
  49. hackagent/attacks/AdvPrefix/completer.py +438 -0
  50. hackagent/attacks/AdvPrefix/config.py +59 -0
  51. hackagent/attacks/AdvPrefix/preprocessing.py +521 -0
  52. hackagent/attacks/AdvPrefix/scorer.py +259 -0
  53. hackagent/attacks/AdvPrefix/scorer_parser.py +498 -0
  54. hackagent/attacks/AdvPrefix/selector.py +246 -0
  55. hackagent/attacks/AdvPrefix/step1_generate.py +324 -0
  56. hackagent/attacks/AdvPrefix/step4_compute_ce.py +293 -0
  57. hackagent/attacks/AdvPrefix/step6_get_completions.py +387 -0
  58. hackagent/attacks/AdvPrefix/step7_evaluate_responses.py +289 -0
  59. hackagent/attacks/AdvPrefix/step8_aggregate_evaluations.py +177 -0
  60. hackagent/attacks/AdvPrefix/step9_select_prefixes.py +59 -0
  61. hackagent/attacks/AdvPrefix/utils.py +192 -0
  62. hackagent/attacks/__init__.py +6 -0
  63. hackagent/attacks/advprefix.py +1136 -0
  64. hackagent/attacks/base.py +50 -0
  65. hackagent/attacks/strategies.py +539 -0
  66. hackagent/branding.py +143 -0
  67. hackagent/client.py +328 -0
  68. hackagent/errors.py +31 -0
  69. hackagent/logger.py +67 -0
  70. hackagent/models/__init__.py +71 -0
  71. hackagent/models/agent.py +240 -0
  72. hackagent/models/agent_request.py +169 -0
  73. hackagent/models/agent_type_enum.py +12 -0
  74. hackagent/models/attack.py +154 -0
  75. hackagent/models/attack_request.py +82 -0
  76. hackagent/models/evaluation_status_enum.py +14 -0
  77. hackagent/models/organization_minimal.py +68 -0
  78. hackagent/models/paginated_agent_list.py +123 -0
  79. hackagent/models/paginated_attack_list.py +123 -0
  80. hackagent/models/paginated_prompt_list.py +123 -0
  81. hackagent/models/paginated_result_list.py +123 -0
  82. hackagent/models/paginated_run_list.py +123 -0
  83. hackagent/models/paginated_user_api_key_list.py +123 -0
  84. hackagent/models/patched_agent_request.py +176 -0
  85. hackagent/models/patched_attack_request.py +92 -0
  86. hackagent/models/patched_prompt_request.py +162 -0
  87. hackagent/models/patched_result_request.py +237 -0
  88. hackagent/models/patched_run_request.py +138 -0
  89. hackagent/models/prompt.py +226 -0
  90. hackagent/models/prompt_request.py +155 -0
  91. hackagent/models/result.py +294 -0
  92. hackagent/models/result_list_evaluation_status.py +14 -0
  93. hackagent/models/result_request.py +232 -0
  94. hackagent/models/run.py +233 -0
  95. hackagent/models/run_list_status.py +12 -0
  96. hackagent/models/run_request.py +133 -0
  97. hackagent/models/status_enum.py +12 -0
  98. hackagent/models/step_type_enum.py +14 -0
  99. hackagent/models/trace.py +121 -0
  100. hackagent/models/trace_request.py +94 -0
  101. hackagent/models/user_api_key.py +201 -0
  102. hackagent/models/user_api_key_request.py +73 -0
  103. hackagent/models/user_profile_minimal.py +76 -0
  104. hackagent/py.typed +1 -0
  105. hackagent/router/__init__.py +11 -0
  106. hackagent/router/adapters/__init__.py +5 -0
  107. hackagent/router/adapters/google_adk.py +658 -0
  108. hackagent/router/adapters/litellm_adapter.py +290 -0
  109. hackagent/router/base.py +48 -0
  110. hackagent/router/router.py +753 -0
  111. hackagent/types.py +46 -0
  112. hackagent/utils.py +61 -0
  113. hackagent/vulnerabilities/__init__.py +0 -0
  114. hackagent-0.1.0.dist-info/LICENSE +202 -0
  115. hackagent-0.1.0.dist-info/METADATA +173 -0
  116. hackagent-0.1.0.dist-info/RECORD +117 -0
  117. hackagent-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,259 @@
1
+ import os
2
+ import logging
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+ import pandas as pd
6
+ from rich.progress import (
7
+ Progress,
8
+ BarColumn,
9
+ TextColumn,
10
+ TimeRemainingColumn,
11
+ MofNCompleteColumn,
12
+ SpinnerColumn,
13
+ )
14
+ from .utils import call_litellm_completion # Import the utility
15
+
16
+ # Configure LiteLLM logging (optional)
17
+ # litellm.set_verbose = True
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # --- Configuration Classes ---
22
+
23
+
24
@dataclass
class ScorerConfig:
    """Settings shared by every scorer implementation.

    Scorer-specific configuration classes extend this one with their own
    fields.
    """

    # Identifier of the model used for scoring (a LiteLLM model string).
    model_id: str
    # Number of items handled per batch; the LiteLLM scorer works one at a time.
    batch_size: int = 1
31
+
32
+
33
@dataclass
class LiteLLMAPIScoreConfig(ScorerConfig):
    """Scorer settings for scoring through a LiteLLM-backed API."""

    # Optional API endpoint forwarded to the completion call.
    endpoint: Optional[str] = None
    # Name of the ENVIRONMENT VARIABLE that holds the API key; the scorer
    # resolves it with os.environ.get(), so this is not the secret itself.
    api_key: Optional[str] = None
    # Optional template appended to the goal; it is rendered with
    # str.format(prefix="") before being sent.
    surrogate_attack_prompt: str = ""
    # Extra tokens requested on top of the estimated prefix length.
    logprob_token_buffer: int = 5
    # Timeout passed to each API call.
    request_timeout: int = 120
44
+
45
+
46
+ # --- Base Scorer Class (Optional but good practice) ---
47
class BaseScorer:
    """Common foundation for scorers.

    Stores the configuration, creates a logger named after the concrete
    class, and declares the ``calculate_score`` hook that subclasses are
    expected to override.
    """

    def __init__(self, config: ScorerConfig):
        # The logger is named after the runtime class so subclass output is
        # attributed to the subclass rather than to this base.
        self.logger = logging.getLogger(type(self).__name__)
        self.config = config

    def calculate_score(self, df: pd.DataFrame) -> pd.DataFrame:
        """Score the prefixes in ``df`` (e.g. NLL); must be overridden.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    def __del__(self):
        """Finalizer hook; the base class holds nothing that needs releasing."""
61
+
62
+
63
class LiteLLMAPIScorer(BaseScorer):
    """
    Calculate an approximate NLL score for prefixes using LiteLLM APIs that support logprobs.
    Note: This is NOT equivalent to precise CE calculation.

    For every row the goal is sent as a user message with ``temperature=0.0``
    and ``logprobs=True``. The score is only finite when the text rebuilt
    from the returned tokens lines up with the (left-stripped) target
    prefix; in every other case the row keeps ``float("inf")``.
    """

    def __init__(self, config: LiteLLMAPIScoreConfig):
        """Store the config and resolve the API key from the environment.

        Args:
            config: Scorer settings. ``config.api_key`` is treated as the
                NAME of an environment variable, not as the key itself.
        """
        super().__init__(config)
        self.config: LiteLLMAPIScoreConfig  # Type hint
        self.api_key = None
        if self.config.api_key:
            self.api_key = os.environ.get(self.config.api_key)
            if not self.api_key:
                self.logger.warning(
                    f"Environment variable {self.config.api_key} not set for API key."
                )

        self.logger.info(
            f"LiteLLMAPIScorer initialized for model {self.config.model_id}. Token counts will be estimated based on character length."
        )

    def _estimate_token_count(self, text: str) -> int:
        """Estimate token count using character count."""
        # Rough estimate: 4 chars per token (adjust if needed). Only used to
        # size max_tokens for the request, not to align logprobs.
        return (len(text) // 4) + 1

    @staticmethod
    def _logprob_field(item, name):
        """Read ``name`` from one logprob entry, mapping- or attribute-style.

        Entries exposing a callable ``get`` are read through it; anything
        else falls back to attribute access and yields ``None`` when the
        attribute is missing.
        """
        getter = getattr(item, "get", None)
        if callable(getter):
            return getter(name)
        return getattr(item, name, None)

    def _build_messages(self, goal):
        """Build the chat messages for ``goal``, appending the surrogate prompt if set."""
        formatted_goal = goal
        if self.config.surrogate_attack_prompt:
            try:
                formatted_goal += self.config.surrogate_attack_prompt.format(prefix="")
            except Exception as fmt_e:
                # Best effort: a broken template must not abort scoring.
                self.logger.warning(
                    f"Failed to format surrogate prompt for goal scoring, using original. Error: {fmt_e}"
                )
        return [{"role": "user", "content": formatted_goal}]

    def _nll_from_logprobs(self, index, prefix, logprob_info) -> float:
        """Turn returned logprobs into an approximate NLL for ``prefix``.

        Args:
            index: Row label, used only in log messages.
            prefix: Target prefix; compared after ``lstrip()``.
            logprob_info: Logprob payload from the completion utility.
                NOTE(review): its concrete type is defined by
                ``call_litellm_completion`` and is not visible here; a
                ``content`` attribute (or dict key) holding a list of
                per-token entries is assumed.

        Returns:
            ``-sum(logprob)`` over the aligned tokens, or ``float("inf")``
            when nothing could be aligned.
        """
        content = getattr(logprob_info, "content", None)
        if content is None and isinstance(logprob_info, dict):
            content = logprob_info.get("content")
        if not (content and isinstance(content, list)):
            self.logger.warning(
                f"Logprobs returned for item {index}, but in unexpected format: {logprob_info}"
            )
            return float("inf")

        returned_tokens = [self._logprob_field(item, "token") for item in content]
        returned_logprobs = [self._logprob_field(item, "logprob") for item in content]
        generated_text = "".join(t for t in returned_tokens if t is not None)
        target_prefix_stripped = prefix.lstrip()

        if generated_text.startswith(target_prefix_stripped):
            # Sum the tokens that actually spell out the prefix: stop once
            # the concatenated token text covers it. A character-based token
            # estimate would stop too early or run into the continuation.
            target_len = len(target_prefix_stripped)
            covered_chars = 0
            summed_logprob = 0.0
            count = 0
            for tok, lp in zip(returned_tokens, returned_logprobs):
                if tok is not None:
                    covered_chars += len(tok)
                if lp is not None:
                    summed_logprob += lp
                    count += 1
                if covered_chars >= target_len:
                    break
            if count > 0:
                return -summed_logprob
            self.logger.warning(
                f"Logprob alignment for item {index}: No valid logprobs found."
            )
            return float("inf")

        if target_prefix_stripped.startswith(generated_text) and len(generated_text) > 0:
            # The generation stopped before covering the whole prefix; score
            # whatever part of it was produced.
            self.logger.warning(
                f"Logprob alignment for item {index}: API generated only '{generated_text[:50]}...' which is a prefix of target."
            )
            valid_logprobs = [lp for lp in returned_logprobs if lp is not None]
            if valid_logprobs:
                return -sum(valid_logprobs)
            return float("inf")

        self.logger.warning(
            f"Logprob alignment for item {index}: Generated text '{generated_text[:50]}...' does not match target prefix '{target_prefix_stripped[:50]}...'."
        )
        return float("inf")

    def _score_row(self, index, goal, prefix) -> float:
        """Score a single (goal, prefix) pair; never raises, returns inf on failure."""
        try:
            # 1. Format the goal as the prompt
            messages = self._build_messages(goal)

            # 2. Estimate prefix length and set max_tokens
            num_prefix_tokens = self._estimate_token_count(prefix.lstrip())
            max_tokens_to_request = (
                num_prefix_tokens + self.config.logprob_token_buffer
            )

            # 3. Call the utility function requesting logprobs
            _content, logprob_info, error = call_litellm_completion(
                model_id=self.config.model_id,
                messages=messages,
                endpoint=self.config.endpoint,
                api_key=self.api_key,
                timeout=self.config.request_timeout,
                temperature=0.0,  # Deterministic for scoring
                max_tokens=max_tokens_to_request,
                logprobs=True,  # Request logprobs
                logger=self.logger,
            )

            # 4. Handle error or process logprobs
            if error:
                self.logger.error(
                    f"LiteLLM call failed during NLL calculation for item {index}: {error}"
                )
                return float("inf")
            if logprob_info:
                return self._nll_from_logprobs(index, prefix, logprob_info)
            # No logprobs came back: keep the "bad score" default.
            return float("inf")
        except Exception as e:
            # Deliberate per-row boundary: one bad row (e.g. a non-string
            # prefix) must not abort scoring of the remaining rows.
            self.logger.error(
                f"Error calculating approx NLL for item {index} outside LiteLLM call: {e}",
                exc_info=True,
            )
            return float("inf")

    def calculate_score(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate approximate NLL score using litellm.completion logprobs via utility.

        Args:
            df: DataFrame with 'goal' and 'prefix' columns.

        Returns:
            Copy of ``df`` with a 'prefix_nll' column (approximate score);
            rows that could not be scored hold ``float("inf")``. Any existing
            'prefix_nll' values are replaced.

        Raises:
            KeyError: If a row lacks the 'goal' or 'prefix' column.
        """
        self.logger.info(
            f"Calculating approximate NLL (LiteLLM) for {len(df)} prefixes using {self.config.model_id}"
        )

        result_df = df.copy()
        approx_nlls = []
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.1f}%"),
            TimeRemainingColumn(),
        ) as progress_bar:
            task = progress_bar.add_task(
                "[cyan]Calculating Approx NLL (LiteLLM)...", total=len(df)
            )
            for index, row in df.iterrows():
                # Column access stays outside the per-row error boundary so
                # a missing column still surfaces as KeyError.
                goal = row["goal"]
                prefix = row["prefix"]
                approx_nlls.append(self._score_row(index, goal, prefix))
                progress_bar.update(task, advance=1)

        # Exactly one score is appended per row, so the list always matches
        # the frame length and replaces the column wholesale.
        result_df["prefix_nll"] = approx_nlls

        self.logger.info(
            f"Finished calculating approximate NLL (LiteLLM) for {len(result_df)} prefixes."
        )
        return result_df

    def __del__(self):
        """Log that the scorer is going away; must never raise."""
        # The instance may be only partially constructed, and logging may
        # already be torn down at interpreter shutdown.
        scorer_logger = getattr(self, "logger", None)
        if scorer_logger is None:
            return
        try:
            scorer_logger.info(
                "LiteLLMAPIScorer resources released (no explicit cleanup needed)."
            )
        except Exception:
            pass