agenthacker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
firewall_sdk/client.py ADDED
@@ -0,0 +1,676 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2026 AgentHacker
3
+
4
+ """High-level ``Firewall`` client — the simplest way to protect an agent.
5
+
6
+ This is the *cloud tier* (Tier 1): a thin, friendly wrapper over the backend
7
+ API. You instantiate it once, call :meth:`Firewall.check` before your LLM, and
8
+ :meth:`Firewall.log` after. No local ML, no boilerplate.
9
+
10
+ Constructed **without an API key** it runs in **local mode**: the four local
11
+ checkpoints (``scan_input``/``scan_data``/``scan_tool_call``/``scan_output``)
12
+ work fully offline, ``check()`` falls back to the deterministic CP-1 input scan,
13
+ and the cloud-only features degrade gracefully. Set ``AGENTHACKER_API_KEY`` (or
14
+ pass ``api_key=``) to enable the cloud tier.
15
+
16
+ from firewall_sdk import Firewall
17
+
18
+ fw = Firewall( # api_key defaults to AGENTHACKER_API_KEY
19
+ agent="support_bot",
20
+ agent_intents=["Answer account questions", "Help reset passwords"],
21
+ )
22
+
23
+ check = fw.check(user_msg, user_id="user@acme.com", session_id=sid)
24
+ if check.blocked:
25
+ return check.reason # e.g. "prompt_injection"
26
+
27
+ answer = my_llm(user_msg) # your own model — unchanged
28
+
29
+ fw.log(user_msg, answer, user_id="user@acme.com", session_id=sid,
30
+ tokens=512, latency_ms=210)
31
+
32
+ Every layer is a switch (see :mod:`firewall_sdk.features`). The constructor
33
+ exposes the cloud-relevant ones; pass ``None`` to leave a flag at its
34
+ default/env value, or ``True``/``False`` to force it.
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import hashlib
40
+ import logging
41
+ import os
42
+ from collections import OrderedDict
43
+ from dataclasses import dataclass, field
44
+ from typing import TYPE_CHECKING, Any
45
+
46
+ from firewall_sdk import features
47
+ from firewall_sdk.cloud_client import CloudClient
48
+ from firewall_sdk.context_summarizer import ConversationState, local_summary
49
+ from firewall_sdk.schemas import CLEAN, ScanResult
50
+
51
+ if TYPE_CHECKING:
52
+ import re
53
+ from collections.abc import Callable
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+ _USER_HASH_LEN = 16
58
+ _RISK_ENFORCEMENT_MODES = ("off", "restrict", "block")
59
+ # Cap on per-session conversation state held in memory. Oldest sessions are
60
+ # evicted first so a long-running process can't grow this without bound.
61
+ _MAX_SESSIONS = 1000
62
+
63
+
64
@dataclass
class CheckResult:
    """Verdict returned for one checked message.

    ``blocked`` and ``allowed`` are stored separately and are expected to be
    each other's inverse. Truth-testing the object yields ``allowed``, which
    lets callers write ``if fw.check(msg):`` to mean "safe to proceed".
    """

    blocked: bool
    allowed: bool
    # Threat type when the message was blocked, otherwise None.
    reason: str | None = None
    confidence: float = 0.0
    intents: list[dict] = field(default_factory=list)
    # True when this message continued an in-progress task.
    continuation: bool = False
    # Populated only when risk enforcement is switched on.
    risk_level: str | None = None
    # True when a HIGH-risk user should be limited.
    restricted: bool = False
    raw: dict = field(default_factory=dict)

    def __bool__(self) -> bool:
        """Return ``allowed`` so the result is truthy when it is safe to proceed."""
        return self.allowed
84
+
85
+
86
def derive_hash(user_id: str | None) -> str | None:
    """Derive the stable, unsalted identity hash used for risk tracking.

    The hash is the first ``_USER_HASH_LEN`` hex characters of
    ``sha256(user_id)`` over the UTF-8 encoding of ``user_id``. No salt or
    configuration is involved, so the same ``user_id`` always yields the same
    hash. Callers who want an irreversible identifier (e.g. a salted HMAC) can
    pass their own ``user_hash=`` to :meth:`Firewall.check` instead.

    Returns ``None`` when ``user_id`` is empty or ``None``.
    """
    if user_id:
        digest = hashlib.sha256(user_id.encode("utf-8")).hexdigest()
        return digest[:_USER_HASH_LEN]
    return None
97
+
98
+
99
class Firewall:
    """One object that fronts the AgentHacker backend for an agent.

    With an API key (argument or ``AGENTHACKER_API_KEY``) the instance owns a
    ``CloudClient`` and uses it for the intent judge, output judge, risk
    scores, event logging and reports. Without a key ``self._client`` is
    ``None`` ("local mode"): the four ``scan_*`` checkpoints still work,
    :meth:`check` falls back to :meth:`scan_input`, and the cloud-only methods
    return ``None`` / do nothing.

    Per-session multi-turn state is held in two insertion-ordered maps
    (``_sessions`` and ``_active_task``), each capped at ``max_sessions``
    entries with the oldest entry evicted first.
    """

    def __init__(
        self,
        api_key: str | None = None,
        *,
        api_url: str | None = None,
        http_timeout: float | None = None,
        agent: str | None = None,
        agent_intents: list[str] | None = None,
        risk_enforcement: str = "off",
        bedrock_system_prompt: str | None = None,
        # Defaults for the local-checkpoint methods (scan_input/scan_data/
        # scan_tool_call/scan_output). Each is overridable per call.
        max_input_length: int = 8000,
        allowed_tools: set[str] | None = None,
        id_resolver: "Callable[[str, dict], set[str]] | None" = None,
        allow_data_urls: bool = False,
        entity_pattern: "re.Pattern | None" = None,
        leakage_label: str = "Cross-Entity Data Leakage",
        # Feature switches — None = leave at default/env, True/False = force.
        intent_check: bool | None = None,
        risk_tracking: bool | None = None,
        multilingual: bool | None = None,
        input_scan: bool | None = None,
        data_field_scan: bool | None = None,
        output_guard: bool | None = None,
        output_judge: bool | None = None,
        tool_auth: bool | None = None,
        audit_log: bool | None = None,
        max_sessions: int | None = None,
    ) -> None:
        """Configure the firewall and, when a key is available, its cloud client.

        Args:
            api_key: Backend API key; falls back to ``AGENTHACKER_API_KEY``.
                When neither is set the instance runs in local mode.
            api_url: Backend URL; falls back to ``AGENTHACKER_API_URL``. When
                neither is set ``CloudClient`` is built without a URL argument.
            http_timeout: Forwarded to ``CloudClient`` as ``timeout``.
            agent: Default agent name sent with cloud calls and log batches.
            agent_intents: Default intent list for the judges. An empty list
                is stored as ``None``.
            risk_enforcement: One of ``"off"``, ``"restrict"``, ``"block"``.
            bedrock_system_prompt: Passed to the intent judge as
                ``system_prompt_suffix``.
            max_input_length, allowed_tools, id_resolver, allow_data_urls,
            entity_pattern, leakage_label: Defaults for the ``scan_*``
                methods; each can be overridden per call.
            intent_check ... audit_log: Feature switches. ``None`` leaves the
                flag untouched; ``True``/``False`` is forwarded to
                ``features.configure_features``.
            max_sessions: Cap on remembered sessions; ``None`` uses the module
                default ``_MAX_SESSIONS``.

        Raises:
            ValueError: If ``risk_enforcement`` is not a supported mode or
                ``max_sessions`` is smaller than 1.
        """
        # Validate arguments first so a bad configuration fails before any
        # client is constructed.
        if risk_enforcement not in _RISK_ENFORCEMENT_MODES:
            raise ValueError(
                f"risk_enforcement must be one of {_RISK_ENFORCEMENT_MODES}, "
                f"got {risk_enforcement!r}"
            )
        if max_sessions is not None and max_sessions < 1:
            raise ValueError(
                f"max_sessions must be a positive integer, got {max_sessions!r}"
            )

        key = api_key or os.environ.get("AGENTHACKER_API_KEY")
        url = api_url or os.environ.get("AGENTHACKER_API_URL")
        if key:
            # CloudClient defaults api_url to the production gateway; only
            # override when we actually have a value (passing None would break
            # .rstrip()).
            self._client: CloudClient | None = (
                CloudClient(key, url, timeout=http_timeout)
                if url
                else CloudClient(key, timeout=http_timeout)
            )
        else:
            # Keyless LOCAL mode. The four local checkpoints
            # (scan_input/scan_data/scan_tool_call/scan_output) run fully
            # offline; cloud-only features (the Bedrock intent/output judges,
            # centralized risk scoring, event logging, reports) are disabled and
            # degrade gracefully. Set AGENTHACKER_API_KEY to enable the cloud tier.
            self._client = None
            logger.info(
                "Firewall running in LOCAL mode (no API key): local checkpoints "
                "active; cloud judges / risk / events / reports disabled. Set "
                "AGENTHACKER_API_KEY to enable the cloud tier."
            )

        self._agent = agent
        self._agent_intents = list(agent_intents) if agent_intents else None
        self._bedrock_system_prompt = bedrock_system_prompt

        # Defaults for the local-checkpoint convenience methods.
        self._max_input_length = max_input_length
        self._allowed_tools = allowed_tools
        self._id_resolver = id_resolver
        self._allow_data_urls = allow_data_urls
        self._entity_pattern = entity_pattern
        self._leakage_label = leakage_label

        self._risk_enforcement = risk_enforcement

        # Per-session conversation state for multi-turn continuation. Bounded
        # (oldest-evicted) so a long-lived process can't leak memory across the
        # many session_ids it serves.
        self._max_sessions = _MAX_SESSIONS if max_sessions is None else max_sessions
        self._sessions: "OrderedDict[str, ConversationState]" = OrderedDict()
        self._active_task: "OrderedDict[str, str]" = OrderedDict()

        # Apply only the flags the caller set explicitly to the shared registry.
        explicit = {
            "intent_check": intent_check,
            "risk_tracking": risk_tracking,
            "multilingual": multilingual,
            "input_scan": input_scan,
            "data_field_scan": data_field_scan,
            "output_guard": output_guard,
            "output_judge": output_judge,
            "tool_auth": tool_auth,
            "audit_log": audit_log,
        }
        overrides = {k: v for k, v in explicit.items() if v is not None}
        if overrides:
            features.configure_features(**overrides)

    # ── Core flow ─────────────────────────────────────────────────────

    def check(
        self,
        message: str,
        *,
        user_id: str | None = None,
        user_hash: str | None = None,
        session_id: str = "",
        agent: str | None = None,
        agent_intents: list[str] | None = None,
    ) -> CheckResult:
        """Run a message through the Bedrock intent judge before your LLM.

        Returns a :class:`CheckResult`. ``check.blocked`` is True for an attack
        or out-of-scope request; ``check.reason`` is the threat type.

        Short-circuits:
            * ``intent_check`` feature off → an allowed result, no scan at all.
            * local mode (no client) → the result of :meth:`_local_check`.

        **Open-ended agents:** leave ``agent_intents`` unset (None). The judge
        still runs full attack/jailbreak detection; it just doesn't scope the
        message against a fixed intent list, which would otherwise block
        legitimate open-ended steps. Declare ``agent_intents`` only for bounded
        agents with a known menu of actions (e.g. a support bot).

        **Multi-turn:** pass a stable ``session_id`` and the firewall threads the
        in-progress task + a secure local summary into the judge, so a bare
        follow-up detail ("June 3rd" while booking) is recognised as a
        continuation instead of being blocked as out-of-scope. Call
        :meth:`note_response` after the agent replies to enrich that context.

        **Risk enforcement:** when the judge allows the message, the mode is
        not ``"off"`` and a user hash is available, the user's risk score is
        fetched. ``"block"`` blocks a ``CRITICAL`` user with reason
        ``"high_risk_user"``; ``"restrict"`` sets ``restricted`` for ``HIGH``
        or ``CRITICAL`` users.
        """
        if not features.is_enabled("intent_check"):
            return CheckResult(blocked=False, allowed=True, confidence=1.0)

        if self._client is None:
            # Local mode: no Bedrock judge. Fall back to the deterministic CP-1
            # input scan (a strict subset, but real protection with no network).
            return self._local_check(message)

        uhash = user_hash or derive_hash(user_id) or ""
        # A per-call list (even an empty one) wins over the constructor default.
        intents = agent_intents if agent_intents is not None else self._agent_intents

        # Multi-turn context: what task is in progress and what has been confirmed.
        active_task = self._active_task.get(session_id) if session_id else None
        summary = ""
        if session_id and session_id in self._sessions:
            summary = local_summary(self._sessions[session_id])

        # A falsy response from the client is treated as an empty verdict,
        # which is read as "not blocked" below.
        result = (
            self._client.classify_intent(
                message,
                session_id=session_id,
                user_hash=uhash,
                agent=agent or self._agent,
                agent_intents=intents,
                system_prompt_suffix=self._bedrock_system_prompt,
                active_task=active_task,
                conversation_summary=summary or None,
            )
            or {}
        )

        blocked = result.get("decision") == "block"
        out = CheckResult(
            blocked=blocked,
            allowed=not blocked,
            reason=(result.get("threat_type") or "policy_violation")
            if blocked
            else None,
            confidence=float(result.get("confidence", 0.0) or 0.0),
            intents=result.get("intents") or [],
            continuation=bool(result.get("continuation", False)),
            raw=result,
        )

        # Opt-in risk enforcement: also weigh the user's accumulated risk.
        if not blocked and self._risk_enforcement != "off" and uhash:
            risk = self._client.get_risk_score(uhash) or {}
            level = risk.get("level")
            out.risk_level = level
            if self._risk_enforcement == "block" and level == "CRITICAL":
                out.blocked = True
                out.allowed = False
                out.reason = "high_risk_user"
            elif self._risk_enforcement == "restrict" and level in ("HIGH", "CRITICAL"):
                out.restricted = True

        # Record the accepted action so the NEXT turn has continuation context.
        if session_id and not out.blocked:
            self._record_accepted(session_id, out.intents)
        return out

    def _local_check(self, message: str) -> CheckResult:
        """Local-mode fallback for :meth:`check` (no backend configured).

        Runs the deterministic CP-1 input scan (:meth:`scan_input`) in place of
        the Bedrock intent judge. A clean scan yields an allowed result; a hit
        yields a blocked result whose ``reason`` is the rule id (or
        ``"prompt_injection"`` when the rule id is empty) and whose ``raw``
        carries the rule details. Risk enforcement is skipped (no centralized
        risk data locally).
        """
        result = self.scan_input(message)
        if result.clean:
            return CheckResult(blocked=False, allowed=True, confidence=1.0)
        return CheckResult(
            blocked=True,
            allowed=False,
            reason=result.rule_id or "prompt_injection",
            confidence=1.0,
            raw={
                "source": "local",
                "rule_id": result.rule_id,
                "rule_name": result.rule_name,
                "matched_text": result.matched_text,
            },
        )

    # ── Multi-turn session state ──────────────────────────────────────

    def _session_state(self, session_id: str) -> ConversationState:
        """Get-or-create the bounded ConversationState for a session (LRU).

        A new session is appended and the oldest sessions are evicted until
        the map is back within ``self._max_sessions``; an existing session is
        moved to the most-recent end.
        """
        state = self._sessions.get(session_id)
        if state is None:
            state = ConversationState()
            self._sessions[session_id] = state
            while len(self._sessions) > self._max_sessions:
                self._sessions.popitem(last=False)
        else:
            self._sessions.move_to_end(session_id)
        return state

    def _set_active_task(self, session_id: str, task: str) -> None:
        """Store ``task`` as the session's active task, keeping the map bounded.

        Every write to ``_active_task`` goes through here so the size cap is
        enforced no matter which public method triggered the write.
        """
        self._active_task[session_id] = task
        self._active_task.move_to_end(session_id)
        while len(self._active_task) > self._max_sessions:
            self._active_task.popitem(last=False)

    def _record_accepted(self, session_id: str, intents: list[dict]) -> None:
        """Record the primary in-scope intent as the session's active task.

        Only dict entries with a non-empty ``"text"`` and a truthy
        ``"in_scope"`` (missing counts as in scope) are considered; the first
        such entry wins. Nothing is recorded when there is none.
        """
        in_scope = [
            i.get("text", "")
            for i in intents
            if isinstance(i, dict) and i.get("in_scope", True) and i.get("text")
        ]
        if not in_scope:
            return
        primary = in_scope[0]
        self._session_state(session_id).record_accepted_intent(primary)
        self._set_active_task(session_id, primary)

    def note_response(
        self,
        session_id: str,
        response_text: str,
        *,
        intent_name: str | None = None,
    ) -> None:
        """Feed the agent's reply back into session context (no network call).

        Call this after your agent answers an allowed message. The reply is
        recorded on the session's ``ConversationState`` so the local summary
        passed to the next :meth:`check` has more context. Optionally pass
        ``intent_name`` to also set the active task explicitly.

        Does nothing when ``session_id`` or ``response_text`` is empty.
        """
        if not session_id or not response_text:
            return
        state = self._session_state(session_id)
        state.record_agent_response(response_text)
        if intent_name:
            # Goes through the bounded setter: in local mode this is the only
            # writer of _active_task, so it must enforce the cap itself.
            self._set_active_task(session_id, intent_name)

    def clear_session(self, session_id: str) -> None:
        """Forget a session's conversation state (e.g. on logout / task complete).

        Unknown session ids are ignored.
        """
        self._sessions.pop(session_id, None)
        self._active_task.pop(session_id, None)

    def log(
        self,
        message: str | None = None,
        response: str | None = None,
        *,
        blocked: bool = False,
        user_id: str | None = None,
        user_hash: str | None = None,
        session_id: str | None = None,
        agent: str | None = None,
        checkpoint: str | None = None,
        rule_id: str | None = None,
        rule_name: str | None = None,
        tokens: int = 0,
        tool_calls: int = 0,
        latency_ms: float = 0.0,
    ) -> None:
        """Record an invocation (fire-and-forget). Powers stats, risk, reports.

        Builds one event batch and hands it to ``CloudClient.submit_events``.
        Returns immediately when the ``audit_log`` feature is off; in local
        mode the batch is built but not submitted.

        Notes on what ends up in the batch:
            * ``user_hash`` is ``None`` unless the ``risk_tracking`` feature is
              on.
            * ``message`` is truncated to 500 characters for
              ``question_preview`` and to 100 for a blocked event's
              ``excerpt``; the full text goes into ``s3_chat_line`` when an
              agent name is known.
            * ``rule_name`` is only used when ``blocked`` is True.
            * ``response`` is accepted but is not read by this method.
        """
        if not features.is_enabled("audit_log"):
            return

        agent_name = agent or self._agent
        # Honor risk_tracking by omitting the identity when it's off — the
        # backend then won't build a per-user risk profile.
        uhash = None
        if features.is_enabled("risk_tracking"):
            uhash = user_hash or derive_hash(user_id)

        invocation: dict[str, Any] = {
            "blocked": blocked,
            "tokens": tokens,
            "tool_calls": tool_calls,
            "latency_ms": latency_ms,
            "agent": agent_name,
        }
        if checkpoint:
            invocation["checkpoint"] = checkpoint
        if rule_id:
            invocation["rule_id"] = rule_id
        if message:
            invocation["question_preview"] = message[:500]

        batch: dict[str, Any] = {
            "user_hash": uhash,
            "agent": agent_name,
            "invocation": invocation,
            "firewall_events": [],
        }
        if session_id:
            batch["session_id"] = session_id
        if blocked:
            batch["firewall_events"] = [
                {
                    "checkpoint": checkpoint,
                    "rule_id": rule_id,
                    "rule_name": rule_name,
                    "excerpt": (message or "")[:100],
                    "agent": agent_name,
                }
            ]
        if message and agent_name:
            batch["s3_chat_line"] = {
                "role": "user",
                "content": message,
                "agent": agent_name,
            }
        if self._client is not None:  # local mode: nowhere to submit
            self._client.submit_events(batch)

    # ── Local checkpoints for the agent loop ──────────────────────────
    #
    # An agent isn't a request/response chatbot. Wire these at the loop's
    # chokepoints — no need to import the low-level scan_* functions:
    #
    #     think   → fw.check(goal) / fw.scan_input(text)
    #     act     → fw.scan_tool_call(name, args)
    #     ingest  → fw.scan_data(tool_output, allow_urls=...)
    #     respond → fw.scan_output(answer)
    #
    # All four delegate to the local scan modules and return a ``ScanResult``
    # whose ``.clean`` is True when it's safe to proceed.

    def scan_input(
        self, text: str, *, max_input_length: int | None = None
    ) -> ScanResult:
        """CP-1: scan a user message / agent input for injection (the *think* step).

        ``max_input_length`` overrides the constructor default for this call;
        a falsy value (``None`` or ``0``) falls back to that default.
        """
        # Imported lazily, as in the other scan_* methods.
        from firewall_sdk.scan_engine import scan_input as _scan_input

        return _scan_input(
            text, max_input_length=max_input_length or self._max_input_length
        )

    def scan_data(self, text: str, *, allow_urls: bool | None = None) -> ScanResult:
        """CP-2: scan tool output / retrieved data before it re-enters the loop.

        Catches indirect (data-borne) injection and leaked secrets. Set
        ``allow_urls=True`` (or construct ``Firewall(allow_data_urls=True)``) for
        a web-browsing agent whose results legitimately contain links.
        ``allow_urls=None`` means "use the constructor default".
        """
        au = self._allow_data_urls if allow_urls is None else allow_urls
        from firewall_sdk.scan_engine import scan_data_field as _scan_data_field

        return _scan_data_field(text, allow_urls=au)

    # The *ingest* step reads naturally as "guard this tool output".
    guard_tool_output = scan_data

    def scan_tool_call(
        self,
        name: str,
        args: dict,
        *,
        allowed_ids: set[str] | None = None,
        allowed_tools: set[str] | None = None,
        id_resolver: "Callable[[str, dict], set[str]] | None" = None,
    ) -> ScanResult:
        """CP-3: authorize a tool call before it runs (the *act* step).

        Uses the constructor's ``allowed_tools`` / ``id_resolver`` unless
        overridden here. With no tool allowlist configured anywhere this is a
        documented no-op (returns CLEAN) — set ``allowed_tools`` to enforce it.
        Note that in that case ``allowed_ids`` and ``id_resolver`` are not
        consulted either.
        """
        tools = allowed_tools if allowed_tools is not None else self._allowed_tools
        if tools is None:
            return CLEAN
        from firewall_sdk.tool_guard import scan_tool_call as _scan_tool_call

        return _scan_tool_call(
            name,
            args,
            allowed_ids,
            allowed_tools=tools,
            id_resolver=id_resolver if id_resolver is not None else self._id_resolver,
        )

    def scan_output(
        self,
        text: str,
        *,
        system_prompt: str = "",
        requester_email: str = "",
        allowed_ids: set[str] | None = None,
        all_user_emails: set[str] | None = None,
        entity_pattern: "re.Pattern | None" = None,
        leakage_label: str | None = None,
    ) -> ScanResult:
        """CP-4: scan the agent's final answer before returning it (the *respond* step).

        Called bare (``fw.scan_output(answer)``) it checks for leaked secrets and
        offensive content. Supply ``system_prompt`` / entity inputs to also catch
        system-prompt and cross-entity data leakage. ``entity_pattern`` and
        ``leakage_label`` fall back to the constructor values when ``None``.
        """
        from firewall_sdk.output_guard import scan_output as _scan_output

        return _scan_output(
            text,
            system_prompt=system_prompt,
            requester_email=requester_email,
            allowed_ids=allowed_ids,
            all_user_emails=all_user_emails,
            entity_pattern=entity_pattern
            if entity_pattern is not None
            else self._entity_pattern,
            leakage_label=leakage_label
            if leakage_label is not None
            else self._leakage_label,
        )

    def check_output(
        self,
        text: str,
        *,
        user_request: str = "",
        persona: str = "",
        user_id: str | None = None,
        user_hash: str | None = None,
        session_id: str = "",
        agent: str | None = None,
        agent_intents: list[str] | None = None,
        # Inputs forwarded to the deterministic regex scan_output (CP-4):
        system_prompt: str = "",
        requester_email: str = "",
        allowed_ids: set[str] | None = None,
        all_user_emails: set[str] | None = None,
        entity_pattern: "re.Pattern | None" = None,
        leakage_label: str | None = None,
    ) -> ScanResult:
        """CP-4 (full): deterministic regex scan FIRST, then the Bedrock output judge.

        The cheap, deterministic ``scan_output`` runs first (leaked secrets,
        system-prompt shingle overlap, cross-entity leakage, slurs). If it flags
        anything, that verdict is returned immediately — no model call. Otherwise,
        when the ``output_judge`` feature is on and a cloud client exists, the
        Bedrock judge inspects the output and a ``"block"`` decision is turned
        into a ``ScanResult`` with rule id ``BEDROCK_OUTPUT``. A falsy judge
        response is treated as "no block".

        Pass ``system_prompt`` (the verbatim prompt) for the regex S-01 leak check,
        and ``persona`` (a short non-secret description) for the judge's
        persona-break detection — the verbatim prompt is deliberately NOT sent to
        the judge. Pass ``user_request`` so the judge can spot intent-deviation.
        """
        local = self.scan_output(
            text,
            system_prompt=system_prompt,
            requester_email=requester_email,
            allowed_ids=allowed_ids,
            all_user_emails=all_user_emails,
            entity_pattern=entity_pattern,
            leakage_label=leakage_label,
        )
        if not local.clean:
            return local
        if not features.is_enabled("output_judge"):
            return local
        if self._client is None:
            return local  # local mode: regex verdict only, no judge

        uhash = user_hash or derive_hash(user_id) or ""
        intents = agent_intents if agent_intents is not None else self._agent_intents
        verdict = (
            self._client.classify_output(
                text,
                session_id=session_id,
                user_hash=uhash,
                agent=agent or self._agent,
                user_request=user_request or None,
                system_prompt=persona
                or None,  # redacted persona, never the verbatim prompt
                agent_intents=intents,
            )
            or {}
        )
        if verdict.get("decision") == "block":
            signal = verdict.get("signal") or "injection"
            reasoning = verdict.get("reasoning") or ""
            return ScanResult(
                clean=False,
                rule_id="BEDROCK_OUTPUT",
                rule_name=f"Bedrock Output Judge: {signal}",
                matched_text=(reasoning or signal)[:100],
            )
        return CLEAN

    # ── Startup check ─────────────────────────────────────────────────

    def preflight(self, *, raise_on_error: bool = False) -> dict:
        """Verify the API key + URL at startup so misconfiguration is caught early.

        Cloud calls fail open, so a bad key/URL is otherwise invisible: the agent
        looks protected while nothing is checked. Call this once at startup.

        Returns ``{"ok", "status", "url", "detail"}``. By default it logs a loud
        warning on failure and returns (so a transient blip doesn't crash boot);
        pass ``raise_on_error=True`` to hard-fail startup on an auth/config error.

        In local mode (no API key) there is nothing to verify, so this reports
        ``{"ok": True, "status": "local"}``.

        Raises:
            RuntimeError: If the check fails and ``raise_on_error`` is True.
        """
        if self._client is None:
            return {
                "ok": True,
                "status": "local",
                "url": None,
                "detail": "local mode (no API key): cloud checks disabled, "
                "local checkpoints active",
            }
        result = self._client.preflight()
        if not result.get("ok"):
            msg = (
                f"AgentHacker preflight failed (status={result.get('status')}, "
                f"url={result.get('url')}): {result.get('detail')}. The firewall will "
                f"FAIL OPEN — verify AGENTHACKER_API_KEY / AGENTHACKER_API_URL."
            )
            if raise_on_error:
                raise RuntimeError(msg)
            logger.warning(msg)
        return result

    # ── Convenience reads ─────────────────────────────────────────────

    def risk(
        self, user_id: str | None = None, *, user_hash: str | None = None
    ) -> dict | None:
        """Return a user's current risk score dict, or None.

        ``None`` is returned in local mode and when neither ``user_hash`` nor
        a non-empty ``user_id`` is supplied.
        """
        if self._client is None:
            return None
        uhash = user_hash or derive_hash(user_id)
        if not uhash:
            return None
        return self._client.get_risk_score(uhash)

    def report(self, date_range: str = "30d", agent: str | None = None) -> dict | None:
        """Generate a security audit report (HTML + JSON). See CloudClient.generate_report.

        ``agent`` defaults to the constructor's agent name.
        Returns None in local mode (no backend).
        """
        if self._client is None:
            return None
        return self._client.generate_report(
            date_range=date_range, agent=agent or self._agent
        )
674
+
675
+
676
# Names exported by ``from firewall_sdk.client import *``.
__all__ = [
    "Firewall",
    "CheckResult",
    "derive_hash",
]