agenthacker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenthacker-0.1.0.dist-info/METADATA +403 -0
- agenthacker-0.1.0.dist-info/RECORD +30 -0
- agenthacker-0.1.0.dist-info/WHEEL +4 -0
- agenthacker-0.1.0.dist-info/licenses/LICENSE +201 -0
- agenthacker-0.1.0.dist-info/licenses/NOTICE +6 -0
- firewall_sdk/__init__.py +100 -0
- firewall_sdk/agent_helpers.py +128 -0
- firewall_sdk/alignment_check.py +113 -0
- firewall_sdk/anomaly.py +462 -0
- firewall_sdk/client.py +676 -0
- firewall_sdk/cloud_client.py +753 -0
- firewall_sdk/constants.py +21 -0
- firewall_sdk/context_summarizer.py +164 -0
- firewall_sdk/event_store.py +660 -0
- firewall_sdk/features.py +128 -0
- firewall_sdk/intent_gate.py +325 -0
- firewall_sdk/intent_guard.py +373 -0
- firewall_sdk/intent_splitter.py +114 -0
- firewall_sdk/invariant.py +113 -0
- firewall_sdk/lang.py +311 -0
- firewall_sdk/llm_guard.py +318 -0
- firewall_sdk/llm_judge.py +92 -0
- firewall_sdk/logger.py +273 -0
- firewall_sdk/output_guard.py +150 -0
- firewall_sdk/py.typed +0 -0
- firewall_sdk/scan_engine.py +569 -0
- firewall_sdk/schemas.py +25 -0
- firewall_sdk/tool_guard.py +67 -0
- firewall_sdk/trace.py +68 -0
- firewall_sdk/translate_guard.py +188 -0
firewall_sdk/client.py
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 AgentHacker
|
|
3
|
+
|
|
4
|
+
"""High-level ``Firewall`` client — the simplest way to protect an agent.
|
|
5
|
+
|
|
6
|
+
This is the *cloud tier* (Tier 1): a thin, friendly wrapper over the backend
|
|
7
|
+
API. You instantiate it once, call :meth:`Firewall.check` before your LLM, and
|
|
8
|
+
:meth:`Firewall.log` after. No local ML, no boilerplate.
|
|
9
|
+
|
|
10
|
+
Constructed **without an API key** it runs in **local mode**: the four local
|
|
11
|
+
checkpoints (``scan_input``/``scan_data``/``scan_tool_call``/``scan_output``)
|
|
12
|
+
work fully offline, ``check()`` falls back to the deterministic CP-1 input scan,
|
|
13
|
+
and the cloud-only features degrade gracefully. Set ``AGENTHACKER_API_KEY`` (or
|
|
14
|
+
pass ``api_key=``) to enable the cloud tier.
|
|
15
|
+
|
|
16
|
+
from firewall_sdk import Firewall
|
|
17
|
+
|
|
18
|
+
fw = Firewall( # api_key defaults to AGENTHACKER_API_KEY
|
|
19
|
+
agent="support_bot",
|
|
20
|
+
agent_intents=["Answer account questions", "Help reset passwords"],
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
check = fw.check(user_msg, user_id="user@acme.com", session_id=sid)
|
|
24
|
+
if check.blocked:
|
|
25
|
+
return check.reason # e.g. "prompt_injection"
|
|
26
|
+
|
|
27
|
+
answer = my_llm(user_msg) # your own model — unchanged
|
|
28
|
+
|
|
29
|
+
fw.log(user_msg, answer, user_id="user@acme.com", session_id=sid,
|
|
30
|
+
tokens=512, latency_ms=210)
|
|
31
|
+
|
|
32
|
+
Every layer is a switch (see :mod:`firewall_sdk.features`). The constructor
|
|
33
|
+
exposes the cloud-relevant ones; pass ``None`` to leave a flag at its
|
|
34
|
+
default/env value, or ``True``/``False`` to force it.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import hashlib
|
|
40
|
+
import logging
|
|
41
|
+
import os
|
|
42
|
+
from collections import OrderedDict
|
|
43
|
+
from dataclasses import dataclass, field
|
|
44
|
+
from typing import TYPE_CHECKING, Any
|
|
45
|
+
|
|
46
|
+
from firewall_sdk import features
|
|
47
|
+
from firewall_sdk.cloud_client import CloudClient
|
|
48
|
+
from firewall_sdk.context_summarizer import ConversationState, local_summary
|
|
49
|
+
from firewall_sdk.schemas import CLEAN, ScanResult
|
|
50
|
+
|
|
51
|
+
if TYPE_CHECKING:
|
|
52
|
+
import re
|
|
53
|
+
from collections.abc import Callable
|
|
54
|
+
|
|
55
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Number of hex characters kept from the SHA-256 digest in ``derive_hash``.
_USER_HASH_LEN = 16

# Accepted values for the ``risk_enforcement`` constructor argument.
_RISK_ENFORCEMENT_MODES = ("off", "restrict", "block")

# Upper bound on the per-session entries kept in memory. Once exceeded, the
# oldest sessions are dropped first so a long-running process stays bounded.
_MAX_SESSIONS = 1000
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class CheckResult:
    """Outcome of screening one message.

    ``blocked`` and ``allowed`` are stored separately and are expected to be
    each other's inverse. Truthiness follows ``allowed``, so callers can write
    ``if fw.check(msg): ...`` to mean "safe to proceed".
    """

    # Verdict flags (callers set both; ``allowed`` should equal ``not blocked``).
    blocked: bool
    allowed: bool
    # Threat type when the message was blocked, otherwise None.
    reason: str | None = None
    confidence: float = 0.0
    # Intent entries as returned by the judge (list of dicts).
    intents: list[dict] = field(default_factory=list)
    # True when this message continued an in-progress task.
    continuation: bool = False
    # Populated only when risk enforcement is switched on.
    risk_level: str | None = None
    # True when a HIGH-risk user should be limited rather than blocked.
    restricted: bool = False
    # Unprocessed verdict payload, kept for debugging/inspection.
    raw: dict = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Return ``allowed`` so the result is truthy when it is safe to go on."""
        return self.allowed
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def derive_hash(user_id: str | None) -> str | None:
    """Derive a stable, unsalted identity hash from ``user_id``.

    The result is the first ``_USER_HASH_LEN`` hex characters of the SHA-256
    digest of the UTF-8 encoded id. No salt or configuration is involved, so
    the same id always yields the same hash. Callers who want a salted or
    keyed hash can compute their own and pass it as ``user_hash=`` instead.

    Returns:
        The truncated hex digest, or None when ``user_id`` is None or empty.
    """
    if user_id:
        digest = hashlib.sha256(user_id.encode("utf-8")).hexdigest()
        return digest[:_USER_HASH_LEN]
    return None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class Firewall:
|
|
100
|
+
"""One object that fronts the AgentHacker backend for an agent."""
|
|
101
|
+
|
|
102
|
+
def __init__(
    self,
    api_key: str | None = None,
    *,
    api_url: str | None = None,
    http_timeout: float | None = None,
    agent: str | None = None,
    agent_intents: list[str] | None = None,
    risk_enforcement: str = "off",
    bedrock_system_prompt: str | None = None,
    # Defaults for the local-checkpoint methods (scan_input/scan_data/
    # scan_tool_call/scan_output). Each is overridable per call.
    max_input_length: int = 8000,
    allowed_tools: set[str] | None = None,
    id_resolver: "Callable[[str, dict], set[str]] | None" = None,
    allow_data_urls: bool = False,
    entity_pattern: "re.Pattern | None" = None,
    leakage_label: str = "Cross-Entity Data Leakage",
    # Feature switches — None = leave at default/env, True/False = force.
    intent_check: bool | None = None,
    risk_tracking: bool | None = None,
    multilingual: bool | None = None,
    input_scan: bool | None = None,
    data_field_scan: bool | None = None,
    output_guard: bool | None = None,
    output_judge: bool | None = None,
    tool_auth: bool | None = None,
    audit_log: bool | None = None,
) -> None:
    """Configure the firewall in cloud mode (with a key) or local mode.

    Args:
        api_key: Backend key; falls back to ``AGENTHACKER_API_KEY``. When
            neither is set, no client is created and the instance runs in
            local mode.
        api_url: Backend URL; falls back to ``AGENTHACKER_API_URL``. Only
            forwarded to ``CloudClient`` when a value is available.
        http_timeout: Forwarded to ``CloudClient`` as ``timeout``.
        agent: Default agent name used when a call does not pass one.
        agent_intents: Default intent list; an empty list is stored as None.
        risk_enforcement: One of ``"off"``, ``"restrict"``, ``"block"``.
        bedrock_system_prompt: Passed to the intent judge as
            ``system_prompt_suffix``.
        max_input_length, allowed_tools, id_resolver, allow_data_urls,
        entity_pattern, leakage_label: Defaults for the local checkpoint
            methods; each can be overridden per call.
        intent_check ... audit_log: Feature switches. ``None`` leaves the
            flag untouched; ``True``/``False`` are applied to the shared
            feature registry via ``features.configure_features``.

    Raises:
        ValueError: If ``risk_enforcement`` is not a recognised mode.
    """
    # Validate first: a bad argument should fail before a client is built
    # or the local-mode notice is logged.
    if risk_enforcement not in _RISK_ENFORCEMENT_MODES:
        raise ValueError(
            f"risk_enforcement must be one of {_RISK_ENFORCEMENT_MODES}, "
            f"got {risk_enforcement!r}"
        )
    self._risk_enforcement = risk_enforcement

    key = api_key or os.environ.get("AGENTHACKER_API_KEY")
    url = api_url or os.environ.get("AGENTHACKER_API_URL")
    if key:
        # CloudClient defaults api_url to the production gateway; only
        # override when we actually have a value (passing None would break
        # .rstrip()).
        self._client: CloudClient | None = (
            CloudClient(key, url, timeout=http_timeout)
            if url
            else CloudClient(key, timeout=http_timeout)
        )
    else:
        # Keyless LOCAL mode. The four local checkpoints
        # (scan_input/scan_data/scan_tool_call/scan_output) run fully
        # offline; cloud-only features (the Bedrock intent/output judges,
        # centralized risk scoring, event logging, reports) are disabled and
        # degrade gracefully. Set AGENTHACKER_API_KEY to enable the cloud tier.
        self._client = None
        logger.info(
            "Firewall running in LOCAL mode (no API key): local checkpoints "
            "active; cloud judges / risk / events / reports disabled. Set "
            "AGENTHACKER_API_KEY to enable the cloud tier."
        )

    self._agent = agent
    # Copy so later mutation of the caller's list does not leak in.
    self._agent_intents = list(agent_intents) if agent_intents else None
    self._bedrock_system_prompt = bedrock_system_prompt

    # Defaults for the local-checkpoint convenience methods.
    self._max_input_length = max_input_length
    self._allowed_tools = allowed_tools
    self._id_resolver = id_resolver
    self._allow_data_urls = allow_data_urls
    self._entity_pattern = entity_pattern
    self._leakage_label = leakage_label

    # Per-session conversation state for multi-turn continuation. Both maps
    # are insertion-ordered so the oldest entry can be evicted first.
    self._sessions: "OrderedDict[str, ConversationState]" = OrderedDict()
    self._active_task: "OrderedDict[str, str]" = OrderedDict()

    # Apply only the flags the caller set explicitly to the shared registry.
    explicit = {
        "intent_check": intent_check,
        "risk_tracking": risk_tracking,
        "multilingual": multilingual,
        "input_scan": input_scan,
        "data_field_scan": data_field_scan,
        "output_guard": output_guard,
        "output_judge": output_judge,
        "tool_auth": tool_auth,
        "audit_log": audit_log,
    }
    overrides = {k: v for k, v in explicit.items() if v is not None}
    if overrides:
        features.configure_features(**overrides)
|
|
196
|
+
|
|
197
|
+
# ── Core flow ─────────────────────────────────────────────────────
|
|
198
|
+
|
|
199
|
+
def check(
    self,
    message: str,
    *,
    user_id: str | None = None,
    user_hash: str | None = None,
    session_id: str = "",
    agent: str | None = None,
    agent_intents: list[str] | None = None,
) -> CheckResult:
    """Screen ``message`` before it reaches your LLM.

    Flow:
      1. If the ``intent_check`` feature is off, allow immediately.
      2. In local mode (no client), defer to :meth:`_local_check`.
      3. Otherwise call the backend intent judge, passing the session's
         active task and local summary when ``session_id`` is known.
      4. If risk enforcement is on and the message was not blocked, look up
         the user's risk level: ``"block"`` mode blocks CRITICAL users,
         ``"restrict"`` mode marks HIGH/CRITICAL users as restricted.
      5. For an allowed message with a ``session_id``, record the accepted
         intent so the next turn has continuation context.

    Args:
        message: The text to screen.
        user_id: Hashed with :func:`derive_hash` unless ``user_hash`` is given.
        user_hash: Caller-supplied identity hash; takes precedence.
        session_id: Stable id enabling multi-turn context; empty disables it.
        agent: Overrides the constructor's agent name for this call.
        agent_intents: Overrides the constructor's intent list; ``None``
            falls back to the constructor value.

    Returns:
        A :class:`CheckResult`. An empty/missing backend verdict yields an
        allowed result.
    """
    if not features.is_enabled("intent_check"):
        return CheckResult(blocked=False, allowed=True, confidence=1.0)

    client = self._client
    if client is None:
        # Local mode: deterministic input scan stands in for the judge.
        return self._local_check(message)

    identity = user_hash or derive_hash(user_id) or ""
    scoped_intents = self._agent_intents if agent_intents is None else agent_intents

    # Multi-turn context is only looked up for a non-empty session id.
    task_in_progress = None
    context_summary = ""
    if session_id:
        task_in_progress = self._active_task.get(session_id)
        if session_id in self._sessions:
            context_summary = local_summary(self._sessions[session_id])

    verdict = (
        client.classify_intent(
            message,
            session_id=session_id,
            user_hash=identity,
            agent=agent or self._agent,
            agent_intents=scoped_intents,
            system_prompt_suffix=self._bedrock_system_prompt,
            active_task=task_in_progress,
            conversation_summary=context_summary or None,
        )
        or {}
    )

    is_block = verdict.get("decision") == "block"
    threat = None
    if is_block:
        threat = verdict.get("threat_type") or "policy_violation"

    outcome = CheckResult(
        blocked=is_block,
        allowed=not is_block,
        reason=threat,
        confidence=float(verdict.get("confidence", 0.0) or 0.0),
        intents=verdict.get("intents") or [],
        continuation=bool(verdict.get("continuation", False)),
        raw=verdict,
    )

    # Opt-in: weigh the user's accumulated risk for messages the judge allowed.
    enforcement = self._risk_enforcement
    if identity and enforcement != "off" and not is_block:
        risk_info = client.get_risk_score(identity) or {}
        level = risk_info.get("level")
        outcome.risk_level = level
        if enforcement == "block" and level == "CRITICAL":
            outcome.blocked = True
            outcome.allowed = False
            outcome.reason = "high_risk_user"
        elif enforcement == "restrict" and level in ("HIGH", "CRITICAL"):
            outcome.restricted = True

    # Remember what was accepted so the NEXT turn can be judged in context.
    if session_id and not outcome.blocked:
        self._record_accepted(session_id, outcome.intents)
    return outcome
|
|
286
|
+
|
|
287
|
+
def _local_check(self, message: str) -> CheckResult:
    """Local-mode stand-in for :meth:`check` when no backend is configured.

    Delegates to :meth:`scan_input` and converts its result into a
    :class:`CheckResult`. A clean scan is allowed; anything else is blocked
    with the matching rule id as the reason (``"prompt_injection"`` when the
    scan supplies none). No risk lookup happens on this path.
    """
    scan = self.scan_input(message)
    if not scan.clean:
        details = {
            "source": "local",
            "rule_id": scan.rule_id,
            "rule_name": scan.rule_name,
            "matched_text": scan.matched_text,
        }
        return CheckResult(
            blocked=True,
            allowed=False,
            reason=scan.rule_id or "prompt_injection",
            confidence=1.0,
            raw=details,
        )
    return CheckResult(blocked=False, allowed=True, confidence=1.0)
|
|
310
|
+
|
|
311
|
+
# ── Multi-turn session state ──────────────────────────────────────
|
|
312
|
+
|
|
313
|
+
def _session_state(self, session_id: str) -> ConversationState:
    """Return the session's ConversationState, creating it on first use.

    An existing entry is moved to the most-recent end of the ordered map.
    A new entry is appended, after which the oldest entries are dropped
    until at most ``_MAX_SESSIONS`` remain.
    """
    sessions = self._sessions
    existing = sessions.get(session_id)
    if existing is not None:
        # Touch: mark this session as most recently used.
        sessions.move_to_end(session_id)
        return existing

    fresh = ConversationState()
    sessions[session_id] = fresh
    while len(sessions) > _MAX_SESSIONS:
        sessions.popitem(last=False)
    return fresh
|
|
324
|
+
|
|
325
|
+
def _record_accepted(self, session_id: str, intents: list[dict]) -> None:
    """Store the first in-scope intent text as the session's active task.

    Entries that are not dicts, are marked ``in_scope`` false, or have no
    ``text`` are ignored (a missing ``in_scope`` key counts as in scope).
    When nothing qualifies the session is left untouched. Otherwise the text
    is recorded on the session state and in ``_active_task``, which is then
    trimmed (oldest first) to ``_MAX_SESSIONS`` entries.
    """
    candidates = []
    for entry in intents:
        if not isinstance(entry, dict):
            continue
        if not entry.get("in_scope", True):
            continue
        text = entry.get("text")
        if text:
            candidates.append(text)

    if not candidates:
        return

    primary = candidates[0]
    self._session_state(session_id).record_accepted_intent(primary)

    tasks = self._active_task
    tasks[session_id] = primary
    tasks.move_to_end(session_id)
    while len(tasks) > _MAX_SESSIONS:
        tasks.popitem(last=False)
|
|
340
|
+
|
|
341
|
+
def note_response(
    self,
    session_id: str,
    response_text: str,
    *,
    intent_name: str | None = None,
) -> None:
    """Feed the agent's reply back into the session context (no network call).

    Call this after your agent answers an allowed message so the next
    :meth:`check` for the same session is judged with that context.

    Args:
        session_id: Session to update. Empty means "do nothing".
        response_text: The agent's reply. Empty means "do nothing".
        intent_name: When given, also becomes the session's active task.
    """
    if not session_id or not response_text:
        return

    self._session_state(session_id).record_agent_response(response_text)

    if intent_name:
        self._active_task[session_id] = intent_name
        self._active_task.move_to_end(session_id)
        # Apply the same cap used when recording accepted intents; without
        # it, sessions that only ever set the task here would never be
        # evicted and the map could grow without bound.
        while len(self._active_task) > _MAX_SESSIONS:
            self._active_task.popitem(last=False)
|
|
362
|
+
|
|
363
|
+
def clear_session(self, session_id: str) -> None:
    """Drop everything remembered for ``session_id``.

    Removes the session's conversation state and its active task. Unknown
    ids are ignored, so this is safe to call on logout or task completion
    without checking first.
    """
    for store in (self._sessions, self._active_task):
        store.pop(session_id, None)
|
|
367
|
+
|
|
368
|
+
def log(
    self,
    message: str | None = None,
    response: str | None = None,
    *,
    blocked: bool = False,
    user_id: str | None = None,
    user_hash: str | None = None,
    session_id: str | None = None,
    agent: str | None = None,
    checkpoint: str | None = None,
    rule_id: str | None = None,
    rule_name: str | None = None,
    tokens: int = 0,
    tool_calls: int = 0,
    latency_ms: float = 0.0,
) -> None:
    """Record an invocation with the backend (best effort).

    Does nothing when the ``audit_log`` feature is off or in local mode
    (there is nowhere to submit to). A failure while submitting is logged
    and swallowed so that audit logging can never break the calling agent.

    Args:
        message: The user message. Up to 500 characters go into the
            invocation preview and up to 100 into a block event excerpt;
            the full text is sent as the chat line when an agent name is
            known.
        response: Accepted for call-site symmetry; this method does not
            include it in the submitted batch.
        blocked: Whether the message was blocked; adds a firewall event.
        user_id / user_hash: Identity, sent only when the ``risk_tracking``
            feature is on. ``user_hash`` takes precedence over hashing
            ``user_id``.
        session_id: Included in the batch when non-empty.
        agent: Overrides the constructor's agent name for this call.
        checkpoint, rule_id, rule_name: Details of the rule that fired.
        tokens, tool_calls, latency_ms: Usage metrics for the invocation.
    """
    if not features.is_enabled("audit_log"):
        return
    if self._client is None:
        # Local mode: skip building a payload that would be thrown away.
        return

    agent_name = agent or self._agent
    # Honor risk_tracking by omitting the identity when it's off — the
    # backend then won't build a per-user risk profile.
    uhash = None
    if features.is_enabled("risk_tracking"):
        uhash = user_hash or derive_hash(user_id)

    invocation: dict[str, Any] = {
        "blocked": blocked,
        "tokens": tokens,
        "tool_calls": tool_calls,
        "latency_ms": latency_ms,
        "agent": agent_name,
    }
    if checkpoint:
        invocation["checkpoint"] = checkpoint
    if rule_id:
        invocation["rule_id"] = rule_id
    if message:
        invocation["question_preview"] = message[:500]

    batch: dict[str, Any] = {
        "user_hash": uhash,
        "agent": agent_name,
        "invocation": invocation,
        "firewall_events": [],
    }
    if session_id:
        batch["session_id"] = session_id
    if blocked:
        batch["firewall_events"] = [
            {
                "checkpoint": checkpoint,
                "rule_id": rule_id,
                "rule_name": rule_name,
                "excerpt": (message or "")[:100],
                "agent": agent_name,
            }
        ]
    if message and agent_name:
        batch["s3_chat_line"] = {
            "role": "user",
            "content": message,
            "agent": agent_name,
        }

    try:
        self._client.submit_events(batch)
    except Exception:
        # Boundary of a fire-and-forget call: report and carry on rather
        # than let an audit-log failure propagate into the agent.
        logger.warning(
            "Firewall.log: submitting the event batch failed; continuing",
            exc_info=True,
        )
|
|
439
|
+
|
|
440
|
+
# ── Local checkpoints for the agent loop ──────────────────────────
|
|
441
|
+
#
|
|
442
|
+
# An agent isn't a request/response chatbot. Wire these at the loop's
|
|
443
|
+
# chokepoints — no need to import the low-level scan_* functions:
|
|
444
|
+
#
|
|
445
|
+
# think → fw.check(goal) / fw.scan_input(text)
|
|
446
|
+
# act → fw.scan_tool_call(name, args)
|
|
447
|
+
# ingest → fw.scan_data(tool_output, allow_urls=...)
|
|
448
|
+
# respond → fw.scan_output(answer)
|
|
449
|
+
#
|
|
450
|
+
# All four run locally (sub-millisecond, no network) and return a
|
|
451
|
+
# ``ScanResult`` whose ``.clean`` is True when it's safe to proceed.
|
|
452
|
+
|
|
453
|
+
def scan_input(
    self, text: str, *, max_input_length: int | None = None
) -> ScanResult:
    """CP-1: scan a user message / agent input (the *think* step).

    Thin wrapper over ``firewall_sdk.scan_engine.scan_input``.

    Args:
        text: The input to scan.
        max_input_length: Per-call length limit. ``None`` (the default)
            uses the constructor's ``max_input_length``; any other value,
            including ``0``, is passed through unchanged.

    Returns:
        The ``ScanResult`` produced by the scan engine.
    """
    from firewall_sdk.scan_engine import scan_input as _scan_input

    # Test against None (not truthiness) so an explicit 0 is not silently
    # replaced by the default, matching the other checkpoint wrappers.
    limit = self._max_input_length if max_input_length is None else max_input_length
    return _scan_input(text, max_input_length=limit)
|
|
462
|
+
|
|
463
|
+
def scan_data(self, text: str, *, allow_urls: bool | None = None) -> ScanResult:
    """CP-2: scan tool output / retrieved data before it re-enters the loop.

    Thin wrapper over ``firewall_sdk.scan_engine.scan_data_field``.

    Args:
        text: The tool output or retrieved data to scan.
        allow_urls: Per-call override. ``None`` uses the constructor's
            ``allow_data_urls``; pass ``True`` for agents whose results
            legitimately contain links (e.g. web browsing).

    Returns:
        The ``ScanResult`` produced by the scan engine.
    """
    if allow_urls is None:
        allow_urls = self._allow_data_urls
    from firewall_sdk.scan_engine import scan_data_field as _scan_data_field

    return _scan_data_field(text, allow_urls=allow_urls)


# Alias: the *ingest* step reads naturally as "guard this tool output".
guard_tool_output = scan_data
|
|
477
|
+
|
|
478
|
+
def scan_tool_call(
    self,
    name: str,
    args: dict,
    *,
    allowed_ids: set[str] | None = None,
    allowed_tools: set[str] | None = None,
    id_resolver: "Callable[[str, dict], set[str]] | None" = None,
) -> ScanResult:
    """CP-3: authorize a tool call before it runs (the *act* step).

    Per-call ``allowed_tools`` / ``id_resolver`` win over the constructor
    defaults. When no tool allowlist is configured at either level the call
    is not checked at all and ``CLEAN`` is returned; set ``allowed_tools``
    to enforce authorization.

    Args:
        name: Tool name being invoked.
        args: Arguments the tool would receive.
        allowed_ids: Forwarded positionally to the tool guard.
        allowed_tools: Per-call allowlist override.
        id_resolver: Per-call resolver override.

    Returns:
        ``CLEAN`` when nothing is enforced, otherwise the tool guard's
        ``ScanResult``.
    """
    effective_tools = self._allowed_tools if allowed_tools is None else allowed_tools
    if effective_tools is None:
        return CLEAN

    from firewall_sdk.tool_guard import scan_tool_call as _scan_tool_call

    resolver = self._id_resolver if id_resolver is None else id_resolver
    return _scan_tool_call(
        name,
        args,
        allowed_ids,
        allowed_tools=effective_tools,
        id_resolver=resolver,
    )
|
|
505
|
+
|
|
506
|
+
def scan_output(
    self,
    text: str,
    *,
    system_prompt: str = "",
    requester_email: str = "",
    allowed_ids: set[str] | None = None,
    all_user_emails: set[str] | None = None,
    entity_pattern: "re.Pattern | None" = None,
    leakage_label: str | None = None,
) -> ScanResult:
    """CP-4: scan the agent's final answer before returning it (*respond*).

    Thin wrapper over ``firewall_sdk.output_guard.scan_output``; the checks
    themselves live there. This method only fills in the constructor's
    ``entity_pattern`` and ``leakage_label`` when the caller passes ``None``
    and forwards every other argument unchanged.

    Returns:
        The ``ScanResult`` produced by the output guard.
    """
    from firewall_sdk.output_guard import scan_output as _scan_output

    if entity_pattern is None:
        entity_pattern = self._entity_pattern
    if leakage_label is None:
        leakage_label = self._leakage_label

    return _scan_output(
        text,
        system_prompt=system_prompt,
        requester_email=requester_email,
        allowed_ids=allowed_ids,
        all_user_emails=all_user_emails,
        entity_pattern=entity_pattern,
        leakage_label=leakage_label,
    )
|
|
538
|
+
|
|
539
|
+
def check_output(
    self,
    text: str,
    *,
    user_request: str = "",
    persona: str = "",
    user_id: str | None = None,
    user_hash: str | None = None,
    session_id: str = "",
    agent: str | None = None,
    agent_intents: list[str] | None = None,
    # Inputs forwarded to the deterministic regex scan_output (CP-4):
    system_prompt: str = "",
    requester_email: str = "",
    allowed_ids: set[str] | None = None,
    all_user_emails: set[str] | None = None,
    entity_pattern: "re.Pattern | None" = None,
    leakage_label: str | None = None,
) -> ScanResult:
    """CP-4 (full): local output scan first, then the backend output judge.

    The local :meth:`scan_output` verdict is returned as-is when it flags
    something, when the ``output_judge`` feature is off, or in local mode
    (no client) — none of those paths makes a network call. Otherwise the
    backend judge is consulted: a ``"block"`` decision becomes a
    ``BEDROCK_OUTPUT`` result, and anything else (including an empty or
    missing verdict) yields ``CLEAN``.

    ``system_prompt`` goes only to the local scan. The judge receives
    ``persona`` in its place, so the verbatim prompt is never sent to the
    backend. ``user_request`` is forwarded to the judge when non-empty.

    Returns:
        The local ``ScanResult``, a ``BEDROCK_OUTPUT`` block result, or
        ``CLEAN``.
    """
    local = self.scan_output(
        text,
        system_prompt=system_prompt,
        requester_email=requester_email,
        allowed_ids=allowed_ids,
        all_user_emails=all_user_emails,
        entity_pattern=entity_pattern,
        leakage_label=leakage_label,
    )
    # Stop at the local verdict unless the judge is both wanted and reachable.
    if (
        not local.clean
        or not features.is_enabled("output_judge")
        or self._client is None
    ):
        return local

    identity = user_hash or derive_hash(user_id) or ""
    scoped_intents = self._agent_intents if agent_intents is None else agent_intents
    verdict = (
        self._client.classify_output(
            text,
            session_id=session_id,
            user_hash=identity,
            agent=agent or self._agent,
            user_request=user_request or None,
            # Redacted persona, never the verbatim prompt.
            system_prompt=persona or None,
            agent_intents=scoped_intents,
        )
        or {}
    )

    if verdict.get("decision") != "block":
        return CLEAN

    signal = verdict.get("signal") or "injection"
    reasoning = verdict.get("reasoning") or ""
    return ScanResult(
        clean=False,
        rule_id="BEDROCK_OUTPUT",
        rule_name=f"Bedrock Output Judge: {signal}",
        matched_text=(reasoning or signal)[:100],
    )
|
|
615
|
+
|
|
616
|
+
# ── Startup check ─────────────────────────────────────────────────
|
|
617
|
+
|
|
618
|
+
def preflight(self, *, raise_on_error: bool = False) -> dict:
    """Check the backend configuration once at startup.

    In local mode (no client) there is nothing to verify, so a fixed
    ``{"ok": True, "status": "local", ...}`` dict is returned. Otherwise the
    client's own preflight result is returned unchanged. When that result is
    not ``ok``, the problem is either raised or logged as a warning.

    Args:
        raise_on_error: Raise instead of logging when the check fails.

    Returns:
        A dict with ``ok``, ``status``, ``url`` and ``detail`` keys in local
        mode; in cloud mode, whatever the client's preflight returned.

    Raises:
        RuntimeError: If the check fails and ``raise_on_error`` is True.
    """
    client = self._client
    if client is None:
        return {
            "ok": True,
            "status": "local",
            "url": None,
            "detail": "local mode (no API key): cloud checks disabled, "
            "local checkpoints active",
        }

    outcome = client.preflight()
    if outcome.get("ok"):
        return outcome

    problem = (
        f"AgentHacker preflight failed (status={outcome.get('status')}, "
        f"url={outcome.get('url')}): {outcome.get('detail')}. The firewall will "
        f"FAIL OPEN — verify AGENTHACKER_API_KEY / AGENTHACKER_API_URL."
    )
    if raise_on_error:
        raise RuntimeError(problem)
    logger.warning(problem)
    return outcome
|
|
650
|
+
|
|
651
|
+
# ── Convenience reads ─────────────────────────────────────────────
|
|
652
|
+
|
|
653
|
+
def risk(
    self, user_id: str | None = None, *, user_hash: str | None = None
) -> dict | None:
    """Look up a user's current risk score.

    Args:
        user_id: Hashed with :func:`derive_hash` unless ``user_hash`` is given.
        user_hash: Caller-supplied identity hash; takes precedence.

    Returns:
        Whatever the client's ``get_risk_score`` returns, or None in local
        mode or when no identity could be determined.
    """
    client = self._client
    if client is not None:
        identity = user_hash or derive_hash(user_id)
        if identity:
            return client.get_risk_score(identity)
    return None
|
|
663
|
+
|
|
664
|
+
def report(self, date_range: str = "30d", agent: str | None = None) -> dict | None:
    """Request a security audit report from the backend.

    Args:
        date_range: Reporting window, forwarded unchanged (default ``"30d"``).
        agent: Agent to report on; falls back to the constructor's agent.

    Returns:
        Whatever the client's ``generate_report`` returns, or None in local
        mode (no backend).
    """
    client = self._client
    if client is None:
        return None
    target_agent = agent or self._agent
    return client.generate_report(date_range=date_range, agent=target_agent)
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
# Names exported by ``from <this module> import *``; everything else defined
# here (underscore-prefixed constants, ``logger``) is internal.
__all__ = ["Firewall", "CheckResult", "derive_hash"]
|