agenthacker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2026 AgentHacker
3
+
4
+ """Unified feature-flag registry — one switch per firewall layer.
5
+
6
+ Every protection layer in the SDK can be turned on or off through this single
7
+ registry, so customers never hit a feature they can't control. The governing
8
+ rule is *secure by default, never silently expensive*:
9
+
10
+ - Cheap + effective layers default **ON** (recommended): input/data/output
11
+ scanning, tool authorization, multilingual normalization, intent check,
12
+ risk tracking, audit logging.
13
+ - Expensive or behavior-changing layers default **OFF** (opt-in) and keep their
14
+ own startup switches: ``llm_guard`` (ENABLE_LLM_GUARD), ``invariant``
15
+ (ENABLE_INVARIANT), ``translate_guard`` (warmup), risk *enforcement*.
16
+
17
+ A flag can be set three ways, highest priority first:
18
+
19
+ 1. Programmatically: ``configure_features(input_scan=False)`` or the matching
20
+ keyword on :class:`firewall_sdk.Firewall`.
21
+ 2. Environment variable: ``AGENTHACKER_INPUT_SCAN=0`` (accepts
22
+ 0/1, true/false, yes/no, on/off — case-insensitive).
23
+ 3. Built-in default (see ``DEFAULTS`` below).
24
+
25
+ Granularity is the *layer*, not the individual rule — there is intentionally no
26
+ switch for "R-01 but not R-08", which would be a security footgun.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import os
32
+
33
# Default state for every known flag. ON = recommended, cheap, effective.
# The keys of this dict double as the complete set of valid flag names:
# is_enabled() and configure_features() reject any name not listed here.
DEFAULTS: dict[str, bool] = {
    # Local checkpoints (run inside the agent process) — all cheap (<5 ms).
    "input_scan": True,  # CP-1 regex injection scan of user input
    "data_field_scan": True,  # CP-2 regex scan of tool-result fields
    "output_guard": True,  # CP-4 leakage/secret/offensive scan of responses
    "tool_auth": True,  # CP-3 tool allowlist + entity-ownership check
    "multilingual": True,  # invisible-char / homoglyph / tag-block hardening
    # Cloud layers (used by the Firewall client) — recommended.
    "intent_check": True,  # Bedrock injection/jailbreak/off-scope judge
    "output_judge": True,  # Bedrock LLM judge for clear injection in outputs (CP-4 LLM layer)
    "risk_tracking": True,  # build per-user risk profiles from events
    "audit_log": True,  # ship events to the backend (powers stats/reports)
}

# Legacy env vars that already gated a feature before this registry existed.
# Honored for backward compatibility when AGENTHACKER_<FLAG> is not set.
# NOTE: is_enabled() also falls back to the legacy variable when
# AGENTHACKER_<FLAG> is set to a value _parse_bool() cannot interpret.
_LEGACY_ENV: dict[str, str] = {
    "risk_tracking": "FIREWALL_ANOMALY_DETECTION",
}

# Accepted spellings for boolean environment values. _parse_bool() strips and
# lowercases the raw value before matching against these sets.
_TRUE = {"1", "true", "yes", "on"}
_FALSE = {"0", "false", "no", "off"}

# Programmatic overrides set via configure_features(). Highest priority.
# Mutated in place by configure_features() and emptied by reset_features().
_overrides: dict[str, bool] = {}
59
+
60
+
61
def _parse_bool(value: str) -> bool | None:
    """Interpret an environment-style boolean string.

    Surrounding whitespace and letter case are ignored. Returns ``True`` or
    ``False`` for a recognised spelling (see ``_TRUE`` / ``_FALSE``) and
    ``None`` for anything else, so the caller can fall through to the next
    configuration source instead of guessing.
    """
    token = value.strip().lower()
    if token in _FALSE:
        return False
    return True if token in _TRUE else None
68
+
69
+
70
def is_enabled(flag: str) -> bool:
    """Resolve the effective state of *flag*.

    Sources are consulted in priority order and the first one that yields a
    usable answer wins: programmatic override, ``AGENTHACKER_<FLAG>``
    environment variable, legacy environment variable (if one is registered
    for the flag), then the built-in default. An environment value that is
    not a recognised boolean spelling is skipped rather than treated as an
    error.

    Raises:
        KeyError: If *flag* is not a key of ``DEFAULTS``.
    """
    if flag not in DEFAULTS:
        raise KeyError(
            f"Unknown feature flag {flag!r}. Known flags: {sorted(DEFAULTS)}"
        )

    # 1. Programmatic override always wins.
    try:
        return _overrides[flag]
    except KeyError:
        pass

    # 2./3. Environment variables, primary name first, then the legacy alias.
    candidates = ["AGENTHACKER_" + flag.upper()]
    legacy_name = _LEGACY_ENV.get(flag)
    if legacy_name is not None:
        candidates.append(legacy_name)

    for var_name in candidates:
        raw = os.environ.get(var_name)
        if raw is None:
            continue
        decision = _parse_bool(raw)
        if decision is not None:
            return decision

    # 4. Nothing usable was configured — use the built-in default.
    return DEFAULTS[flag]
91
+
92
+
93
def configure_features(**flags: bool) -> None:
    """Install programmatic overrides for one or more feature flags.

    Every name is validated before any override is stored, so a call that
    contains a typo changes nothing. Values are coerced with ``bool()``.

    Raises:
        ValueError: If any keyword is not a known flag name.

    Example::

        from firewall_sdk import configure_features
        configure_features(output_judge=False, output_guard=True)
    """
    bad_names = sorted(name for name in flags if name not in DEFAULTS)
    if bad_names:
        raise ValueError(
            f"Unknown feature flag(s): {bad_names}. Known: {sorted(DEFAULTS)}"
        )
    _overrides.update((name, bool(value)) for name, value in flags.items())
110
+
111
+
112
def reset_features() -> None:
    """Drop every programmatic override.

    Afterwards each flag resolves from the environment or its built-in
    default again. Intended for test isolation. The override dict is emptied
    in place rather than rebound.
    """
    _overrides.clear()
115
+
116
+
117
def all_features() -> dict[str, bool]:
    """Snapshot the effective state of every known flag.

    The result maps each key of ``DEFAULTS`` to what ``is_enabled`` returns
    for it at call time. Handy for debugging or logging the active
    configuration.
    """
    snapshot: dict[str, bool] = {}
    for name in DEFAULTS:
        snapshot[name] = is_enabled(name)
    return snapshot
120
+
121
+
122
# Public API of this module. The underscore-prefixed names above
# (_LEGACY_ENV, _TRUE, _FALSE, _overrides, _parse_bool) are deliberately
# left out.
__all__: list[str] = [
    "DEFAULTS",
    "is_enabled",
    "configure_features",
    "reset_features",
    "all_features",
]
@@ -0,0 +1,325 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2026 AgentHacker
3
+
4
+ """IntentGate — the simple one-call SDK interface for Stage 2A intent filtering.
5
+
6
+ Two-phase pipeline:
7
+ Phase 1 (local, fast): cosine similarity on the full message against the
8
+ agent's declared intents. Passes cleanly? Done — no network call.
9
+ EXCEPTION: compound messages (2+ question marks) always skip to Phase 2
10
+ so sub-intents can be individually scored rather than the whole blob.
11
+ Phase 2 (cloud): cosine failed OR message is compound. Calls the
12
+ AgentHacker backend which uses Bedrock to split the message into
13
+ sub-intents and scope each one against the declared intent list.
14
+ Sub-intents marked in_scope=True reach Claude; others are dropped.
15
+ Fallback: if Bedrock returns no intents, local cosine splits by "?" and
16
+ scores each piece independently.
17
+
18
+ Usage — in firewall.py:
19
+ from firewall_sdk import IntentGate, Intent
20
+
21
+ INTENT_GATE = IntentGate(
22
+ intents=[
23
+ Intent("book_appointment", "Book or schedule a medical appointment",
24
+ ["book appointment", "schedule a visit", "I need to see a doctor"]),
25
+ Intent("check_schedule", "View upcoming or past appointments",
26
+ ["what are my appointments", "show my schedule"]),
27
+ ],
28
+ threshold=0.42,
29
+ )
30
+
31
+ Usage — in agent.py:
32
+ result = INTENT_GATE.scan_sync(question, user_hash=user_hash, agent="my_agent")
33
+ if result.blocked:
34
+ return refusal(result.scan)
35
+ filtered_question = "\\n".join(result.passed_intents)
36
+ intent_name = result.intent_name # for tool locking
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import logging
42
+ import re
43
+ from dataclasses import dataclass, field
44
+
45
+ from firewall_sdk.intent_guard import Intent, IntentGuard
46
+ from firewall_sdk.schemas import CLEAN, ScanResult
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
@dataclass
class IntentGateResult:
    """Outcome of one ``IntentGate.scan_sync`` call.

    Only ``blocked`` is required; every other field has a default so the
    fast paths can return a minimal result.
    """

    # True when the gate refused the message; False when at least part of it
    # may proceed.
    blocked: bool
    # Texts allowed through: the whole message on the fast paths, otherwise
    # the individual sub-intents that passed.
    passed_intents: list[str] = field(default_factory=list)
    # Texts that were rejected (sub-intents, or the whole message on a block).
    failed_intents: list[str] = field(default_factory=list)
    # Name returned by the local classifier for the message / first passed
    # sub-intent; the module docstring describes it as used for tool locking.
    intent_name: str | None = None
    # "decision" and "confidence" fields of the backend response; None when
    # no backend response was obtained.
    bedrock_decision: str | None = None
    bedrock_confidence: float | None = None
    # Truthiness of the backend response's "continuation" field.
    continuation: bool = False
    # Scan record describing why the message was blocked; CLEAN by default.
    scan: ScanResult = field(default_factory=lambda: CLEAN)
    # Per-text trace entries of the form {"text": ..., "passed": bool}.
    debug: list[dict] = field(default_factory=list)
62
+
63
+
64
+ def _is_compound(message: str) -> bool:
65
+ """True when a message contains multiple distinct questions."""
66
+ return message.count("?") >= 2
67
+
68
+
69
+ def _local_split(message: str) -> list[str]:
70
+ """Split a compound message by '?' into individual question strings.
71
+
72
+ Each returned string ends with '?' so the sub-intent reads naturally.
73
+ """
74
+ parts = [p.strip() for p in re.split(r"\?", message) if p.strip()]
75
+ if len(parts) <= 1:
76
+ return [message]
77
+ return [p + "?" for p in parts]
78
+
79
+
80
class IntentGate:
    """Single entry point for the two-phase intent gate.

    Phase 1 scores the whole message locally through ``IntentGuard``. Phase 2
    asks the cloud judge (when a client is available) to split the message
    into sub-intents and scope each one against the declared intents.

    Raises ValueError at construction if intents is empty — catching
    misconfiguration at startup rather than silently passing everything.
    """

    def __init__(self, intents: list[Intent], threshold: float = 0.35) -> None:
        """Build the local guard and cache the intent descriptions.

        Args:
            intents: Declared intents the agent is allowed to serve.
            threshold: Passed through to ``IntentGuard`` unchanged.

        Raises:
            ValueError: If ``intents`` is empty.
        """
        if not intents:
            raise ValueError(
                "IntentGate requires at least one Intent. "
                "Declare your agent's allowed intents so the scope gate has something to enforce."
            )
        self._guard = IntentGuard(intents, threshold)
        # Plain descriptions passed to Bedrock so it can scope sub-intents without local embeddings
        self._intent_descriptions: list[str] = [
            f"{i.name}: {i.description}" for i in intents
        ]

    def scan_sync(
        self,
        message: str,
        *,
        user_hash: str = "",
        agent: str = "",
        actor_role: str | None = None,
        min_words: int = 2,
        llm_guard_injection: bool = False,
        active_task: str | None = None,
        conversation_summary: str | None = None,
        session_id: str = "",
    ) -> IntentGateResult:
        """Synchronous version for agents that don't use async/await.

        llm_guard_injection=True means a dedicated prompt-injection classifier
        (LLM Guard) flagged this message upstream. Rather than letting that
        classifier hard-refuse on its own, the caller routes the hit here: it
        forces the cloud-judge path (even if the local cosine would pass) and is
        forwarded to the judge as a strong advisory. Fail-safe: if no judge is
        reachable to corroborate, the message is blocked rather than allowed.

        active_task / conversation_summary carry multi-turn context. When an
        action is already in progress (active_task set), a bare follow-up detail
        (a date, a name) would score near-zero against every declared intent and
        be wrongly blocked. So an active task forces the cloud-judge path — the
        judge sees the context and decides continuation-vs-deviation — instead of
        trusting (or failing) the context-free local cosine. result.continuation
        is True when the judge ruled the message a legitimate continuation.

        Compound messages (two or more question marks) never take the Phase-1
        fast pass: a blended cosine score over several questions can hide an
        out-of-scope one, so they are always split and scoped per sub-intent
        (by the cloud judge, or by the local '?' split when the judge returns
        no intents).

        Args:
            message: Raw user message to gate.
            user_hash: Forwarded to the backend classifier.
            agent: Forwarded to the backend classifier.
            actor_role: Forwarded to the backend classifier.
            min_words: Messages with at most this many whitespace-separated
                words bypass the gate entirely, unless ``active_task`` is set.
            llm_guard_injection: Upstream injection-classifier hit (see above).
            active_task: Description of the in-progress task, if any.
            conversation_summary: Forwarded to the backend classifier.
            session_id: Forwarded to the backend classifier.

        Returns:
            An ``IntentGateResult``; ``blocked`` is True when nothing in the
            message may proceed.
        """
        # Short messages (confirmations, single words) bypass the gate entirely —
        # UNLESS a task is in progress, where even a short follow-up ("June 3rd",
        # "yes") needs continuation-aware judgment and a short deviation/attack
        # must not get a free pass.
        if not active_task and len(message.split()) <= min_words:
            return IntentGateResult(blocked=False, passed_intents=[message])

        compound = _is_compound(message)

        # Non-Latin / mixed-script (code-switched, homoglyph) input is exactly
        # where the local cosine is least trustworthy — force it to the cloud
        # split+scope path rather than trusting a clean Phase-1 pass.
        try:
            from firewall_sdk import lang

            non_latin = (
                lang.is_mostly_non_latin(message)
                or lang.is_mixed_script(message)
                or lang.has_foreign_segment(
                    message
                )  # code-switching / sandwich attacks
            )
        except Exception:
            non_latin = True  # fail toward the safer (cloud) path

        # Phase 1: fast local cosine on the full message.
        # If it passes cleanly, no backend call needed — one intent chip returned.
        quick = self._guard.scan(message)

        # A genuine semantic-similarity MISS — the guard is enabled (model
        # installed/built) AND the full-message cosine fell below threshold — is
        # a weak prompt-injection signal worth flagging to the cloud judge so it
        # scrutinizes the message harder. Crucially this is distinct from "the
        # semantic layer isn't installed": when the guard is disabled, scan()
        # returns CLEAN, so `ready` is False here and we send NO suspicion hint.
        semantic_miss = self._guard.ready and not quick.clean

        # An LLM-Guard injection hit OR an active task forces the cloud-judge path
        # even when the local cosine would otherwise fast-pass — the dedicated
        # injection classifier (or the conversational context) sees something the
        # scope embedding did not. Compound messages are excluded from the fast
        # pass as well, so each question is scoped on its own instead of being
        # waved through on the whole-blob score.
        if (
            quick.clean
            and not compound
            and not non_latin
            and not llm_guard_injection
            and not active_task
        ):
            intent_name, conf = self._guard.classify_local(message)
            logger.debug(
                "IntentGate phase-1 pass: intent=%r conf=%.3f", intent_name, conf
            )
            return IntentGateResult(
                blocked=False,
                passed_intents=[message],
                intent_name=intent_name,
                debug=[{"text": message, "passed": True}],
            )

        # Phase 2: cosine failed, or the message was forced here (compound,
        # non-Latin, LLM-Guard hit, active task) — call backend for split +
        # scope classification.
        bedrock_info: dict | None = None
        try:
            from firewall_sdk.cloud_client import get_client

            client = get_client()
            if client is not None:
                bedrock_info = client.classify_intent(
                    message,
                    session_id=session_id,
                    user_hash=user_hash,
                    agent=agent,
                    actor_role=actor_role,
                    agent_intents=self._intent_descriptions,
                    semantic_miss=semantic_miss,
                    llm_guard_injection=llm_guard_injection,
                    active_task=active_task,
                    conversation_summary=conversation_summary,
                )
        except Exception as exc:
            logger.debug("IntentGate: backend call failed (non-fatal): %s", exc)

        # Fail-safe: an upstream LLM-Guard injection hit that we could NOT get a
        # cloud judge to corroborate (no client configured, or the call failed)
        # must not be silently allowed — fall back to the original hard block.
        if llm_guard_injection and bedrock_info is None:
            logger.debug(
                "IntentGate: LLM-Guard hit with no reachable judge — fail-safe block"
            )
            return IntentGateResult(
                blocked=True,
                failed_intents=[message],
                scan=ScanResult(
                    clean=False,
                    rule_id="LG_INPUT",
                    rule_name="LLM Guard PromptInjection (uncorroborated)",
                    matched_text=message[:100],
                ),
                debug=[{"text": message, "passed": False}],
            )

        bedrock_decision = (bedrock_info or {}).get("decision")
        bedrock_confidence = (bedrock_info or {}).get("confidence")
        continuation = bool((bedrock_info or {}).get("continuation", False))

        # Security block
        if (
            bedrock_info
            and bedrock_decision == "block"
            and (bedrock_confidence or 0) > 0.7
        ):
            logger.debug(
                "IntentGate: Bedrock security block threat=%s",
                bedrock_info.get("threat_type"),
            )
            return IntentGateResult(
                blocked=True,
                failed_intents=[message],
                bedrock_decision=bedrock_decision,
                bedrock_confidence=bedrock_confidence,
                scan=ScanResult(
                    clean=False,
                    rule_id="BEDROCK_INTENT",
                    rule_name="Bedrock Intent Classification",
                    matched_text=(bedrock_info.get("threat_type") or "threat")[:100],
                ),
                debug=[{"text": message, "passed": False}],
            )

        # Use Bedrock's split intents with in_scope flags.
        raw_intents: list[dict] = (bedrock_info or {}).get("intents") or []
        passed: list[str] = []
        failed: list[str] = []
        debug: list[dict] = []

        if raw_intents:
            for item in raw_intents:
                # Items that are not dicts are taken as plain text and treated
                # as in scope, as are dicts that omit "in_scope".
                text = item.get("text", "") if isinstance(item, dict) else str(item)
                in_scope = (
                    item.get("in_scope", True) if isinstance(item, dict) else True
                )
                if in_scope:
                    passed.append(text)
                    debug.append({"text": text, "passed": True})
                else:
                    failed.append(text)
                    debug.append({"text": text, "passed": False})
        elif compound and not llm_guard_injection:
            # Bedrock returned nothing (old backend or unavailable) but we know this
            # is a compound message — split locally by "?" and score each sub-intent.
            # NOT used when LLM Guard flagged the message: the local cosine split is
            # the untrusted "no judge" fallback, and letting a flagged message pass on
            # it would defeat the fail-safe. A flagged message with no usable judge
            # verdict falls through to the block below.
            for sub in _local_split(message):
                sub_scan = self._guard.scan(sub)
                if sub_scan.clean:
                    passed.append(sub)
                    debug.append({"text": sub, "passed": True})
                else:
                    failed.append(sub)
                    debug.append({"text": sub, "passed": False})
        else:
            # No sub-intents from the judge and the local-split fallback does not
            # apply (simple message, or LLM Guard flagged it) — fail closed.
            debug.append({"text": message, "passed": False})

        if not passed:
            return IntentGateResult(
                blocked=True,
                failed_intents=failed or [message],
                bedrock_decision=bedrock_decision,
                bedrock_confidence=bedrock_confidence,
                scan=ScanResult(
                    clean=False,
                    rule_id="R-16",
                    rule_name="Out-of-Scope Intent",
                    matched_text=message[:100],
                ),
                debug=debug,
            )

        # The first passed sub-intent decides the reported intent name.
        intent_name, _ = self._guard.classify_local(passed[0])
        logger.debug(
            "IntentGate phase-2 pass: %d/%d intents, locked=%r continuation=%s",
            len(passed),
            # Count what was actually scored: raw_intents is empty on the
            # local-split fallback, which used to log e.g. "2/0 intents".
            len(passed) + len(failed),
            intent_name,
            continuation,
        )
        return IntentGateResult(
            blocked=False,
            passed_intents=passed,
            failed_intents=failed,
            intent_name=intent_name,
            bedrock_decision=bedrock_decision,
            bedrock_confidence=bedrock_confidence,
            continuation=continuation,
            debug=debug,
        )