PyPI - agenthacker - Versions diffs - 0.1.0__py3-none-any.whl - Mend

agenthacker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

agenthacker-0.1.0.dist-info/METADATA +403 -0
agenthacker-0.1.0.dist-info/RECORD +30 -0
agenthacker-0.1.0.dist-info/WHEEL +4 -0
agenthacker-0.1.0.dist-info/licenses/LICENSE +201 -0
agenthacker-0.1.0.dist-info/licenses/NOTICE +6 -0
firewall_sdk/__init__.py +100 -0
firewall_sdk/agent_helpers.py +128 -0
firewall_sdk/alignment_check.py +113 -0
firewall_sdk/anomaly.py +462 -0
firewall_sdk/client.py +676 -0
firewall_sdk/cloud_client.py +753 -0
firewall_sdk/constants.py +21 -0
firewall_sdk/context_summarizer.py +164 -0
firewall_sdk/event_store.py +660 -0
firewall_sdk/features.py +128 -0
firewall_sdk/intent_gate.py +325 -0
firewall_sdk/intent_guard.py +373 -0
firewall_sdk/intent_splitter.py +114 -0
firewall_sdk/invariant.py +113 -0
firewall_sdk/lang.py +311 -0
firewall_sdk/llm_guard.py +318 -0
firewall_sdk/llm_judge.py +92 -0
firewall_sdk/logger.py +273 -0
firewall_sdk/output_guard.py +150 -0
firewall_sdk/py.typed +0 -0
firewall_sdk/scan_engine.py +569 -0
firewall_sdk/schemas.py +25 -0
firewall_sdk/tool_guard.py +67 -0
firewall_sdk/trace.py +68 -0
firewall_sdk/translate_guard.py +188 -0

firewall_sdk/features.py ADDED Viewed

@@ -0,0 +1,128 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2026 AgentHacker
+"""Unified feature-flag registry — one switch per firewall layer.
+Every protection layer in the SDK can be turned on or off through this single
+registry, so customers never hit a feature they can't control. The governing
+rule is *secure by default, never silently expensive*:
+- Cheap + effective layers default **ON** (recommended): input/data/output
+  scanning, tool authorization, multilingual normalization, intent check,
+  risk tracking, audit logging.
+- Expensive or behavior-changing layers default **OFF** (opt-in) and keep their
+  own startup switches: ``llm_guard`` (ENABLE_LLM_GUARD), ``invariant``
+  (ENABLE_INVARIANT), ``translate_guard`` (warmup), risk *enforcement*.
+A flag can be set three ways, highest priority first:
+1. Programmatically: ``configure_features(input_scan=False)`` or the matching
+   keyword on :class:`firewall_sdk.Firewall`.
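+2. Environment variable: ``AGENTHACKER_INPUT_SCAN=0`` (accepts
+   0/1, true/false, yes/no, on/off — case-insensitive). A value that is not
+   one of these spellings is ignored. For a flag listed in ``_LEGACY_ENV``,
+   the legacy variable is consulted next when the ``AGENTHACKER_`` variable is
+   unset or unrecognized.
+3. Built-in default (see ``DEFAULTS`` below).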
+Granularity is the *layer*, not the individual rule — there is intentionally no
+switch for "R-01 but not R-08", which would be a security footgun.
+"""
+from __future__ import annotations
+import os
+# Default state for every known flag. ON = recommended, cheap, effective.
+# The keys of this dict are also the complete set of valid flag names:
+# is_enabled() and configure_features() both reject any name not listed here.
+DEFAULTS: dict[str, bool] = {
+    # Local checkpoints (run inside the agent process) — all cheap (<5 ms).
+    "input_scan": True,  # CP-1 regex injection scan of user input
+    "data_field_scan": True,  # CP-2 regex scan of tool-result fields
+    "output_guard": True,  # CP-4 leakage/secret/offensive scan of responses
+    "tool_auth": True,  # CP-3 tool allowlist + entity-ownership check
+    "multilingual": True,  # invisible-char / homoglyph / tag-block hardening
+    # Cloud layers (used by the Firewall client) — recommended.
+    "intent_check": True,  # Bedrock injection/jailbreak/off-scope judge
+    "output_judge": True,  # Bedrock LLM judge for clear injection in outputs (CP-4 LLM layer)
+    "risk_tracking": True,  # build per-user risk profiles from events
+    "audit_log": True,  # ship events to the backend (powers stats/reports)
+}
+# Legacy env vars that already gated a feature before this registry existed.
+# Honored for backward compatibility when AGENTHACKER_<FLAG> is not set.
+# Maps flag name -> legacy environment variable name.
+_LEGACY_ENV: dict[str, str] = {
+    "risk_tracking": "FIREWALL_ANOMALY_DETECTION",
+}
+# Accepted boolean spellings for environment values. _parse_bool() strips and
+# lower-cases the raw value before testing membership in these sets.
+_TRUE = {"1", "true", "yes", "on"}
+_FALSE = {"0", "false", "no", "off"}
+# Programmatic overrides set via configure_features(). Highest priority.
+# Mutated in place by configure_features() and emptied by reset_features().
+_overrides: dict[str, bool] = {}
+def _parse_bool(value: str) -> bool | None:
+    v = value.strip().lower()
+    if v in _TRUE:
+        return True
+    if v in _FALSE:
+        return False
+    return None
+def is_enabled(flag: str) -> bool:
+    """Return the effective state of *flag* (override > env > legacy > default)."""
+    if flag not in DEFAULTS:
+        raise KeyError(
+            f"Unknown feature flag {flag!r}. Known flags: {sorted(DEFAULTS)}"
+        )
+    if flag in _overrides:
+        return _overrides[flag]
+    env = os.environ.get("AGENTHACKER_" + flag.upper())
+    if env is not None:
+        parsed = _parse_bool(env)
+        if parsed is not None:
+            return parsed
+    legacy = _LEGACY_ENV.get(flag)
+    if legacy is not None:
+        env = os.environ.get(legacy)
+        if env is not None:
+            parsed = _parse_bool(env)
+            if parsed is not None:
+                return parsed
+    return DEFAULTS[flag]
+def configure_features(**flags: bool) -> None:
+    """Set programmatic overrides for one or more flags.
+    Raises ValueError on an unknown flag name so typos fail loudly.
+    Example::
+        from firewall_sdk import configure_features
+        configure_features(output_judge=False, output_guard=True)
+    """
+    unknown = set(flags) - set(DEFAULTS)
+    if unknown:
+        raise ValueError(
+            f"Unknown feature flag(s): {sorted(unknown)}. Known: {sorted(DEFAULTS)}"
+        )
+    for name, value in flags.items():
+        _overrides[name] = bool(value)
+def reset_features() -> None:
+    """Clear all programmatic overrides. For test isolation.
+    Only the override dict is emptied (in place); environment variables and
+    ``DEFAULTS`` are untouched, so flags fall back to env/legacy/default.
+    """
+    _overrides.clear()
+def all_features() -> dict[str, bool]:
+    """Return the current effective state of every flag (handy for debugging)."""
+    return {flag: is_enabled(flag) for flag in DEFAULTS}
+# Public API of this module. The parsing helper, the legacy-variable map and
+# the override dict are underscore-prefixed and deliberately not exported.
+__all__: list[str] = [
+    "DEFAULTS",
+    "is_enabled",
+    "configure_features",
+    "reset_features",
+    "all_features",
+]

firewall_sdk/intent_gate.py ADDED Viewed

@@ -0,0 +1,325 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2026 AgentHacker
+"""IntentGate — the simple one-call SDK interface for Stage 2A intent filtering.
+Two-phase pipeline:
+  Phase 1 (local, fast): cosine similarity on the full message against the
+      agent's declared intents. Passes cleanly? Done — no network call.
+      EXCEPTION: compound messages (2+ question marks) always skip to Phase 2
+      so sub-intents can be individually scored rather than the whole blob.
+  Phase 2 (cloud): cosine failed OR message is compound. Calls the
+      AgentHacker backend which uses Bedrock to split the message into
+      sub-intents and scope each one against the declared intent list.
+      Sub-intents marked in_scope=True reach Claude; others are dropped.
+      Fallback: if Bedrock returns no intents, local cosine splits by "?" and
+      scores each piece independently.
+Usage — in firewall.py:
+    from firewall_sdk import IntentGate, Intent
+    INTENT_GATE = IntentGate(
+        intents=[
+            Intent("book_appointment", "Book or schedule a medical appointment",
+                   ["book appointment", "schedule a visit", "I need to see a doctor"]),
+            Intent("check_schedule",   "View upcoming or past appointments",
+                   ["what are my appointments", "show my schedule"]),
+        ],
+        threshold=0.42,
+    )
+Usage — in agent.py:
+    result = INTENT_GATE.scan_sync(question, user_hash=user_hash, agent="my_agent")
+    if result.blocked:
+        return refusal(result.scan)
+    filtered_question = "\\n".join(result.passed_intents)
+    intent_name = result.intent_name  # for tool locking
+"""
+from __future__ import annotations
+import logging
+import re
+from dataclasses import dataclass, field
+from firewall_sdk.intent_guard import Intent, IntentGuard
+from firewall_sdk.schemas import CLEAN, ScanResult
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+@dataclass
+class IntentGateResult:
+    """Outcome of one ``IntentGate.scan_sync`` call."""
+    # True when the gate refused the message; the only required field.
+    blocked: bool
+    # Texts judged in scope: the whole message, or individual sub-intents.
+    passed_intents: list[str] = field(default_factory=list)
+    # Texts judged out of scope (the whole message when everything failed).
+    failed_intents: list[str] = field(default_factory=list)
+    # Intent name from IntentGuard.classify_local(); None when not classified.
+    intent_name: str | None = None
+    # "decision" / "confidence" values copied from the backend response, if any.
+    bedrock_decision: str | None = None
+    bedrock_confidence: float | None = None
+    # Truthiness of the backend response's "continuation" value.
+    continuation: bool = False
+    # Defaults to the shared CLEAN object (the factory returns the same instance
+    # every time); blocking paths replace it with a non-clean ScanResult.
+    scan: ScanResult = field(default_factory=lambda: CLEAN)
+    # Per-text trace entries of the form {"text": ..., "passed": bool}.
+    debug: list[dict] = field(default_factory=list)
+def _is_compound(message: str) -> bool:
+    """True when a message contains multiple distinct questions."""
+    return message.count("?") >= 2
+def _local_split(message: str) -> list[str]:
+    """Split a compound message by '?' into individual question strings.
+    Each returned string ends with '?' so the sub-intent reads naturally.
+    """
+    parts = [p.strip() for p in re.split(r"\?", message) if p.strip()]
+    if len(parts) <= 1:
+        return [message]
+    return [p + "?" for p in parts]
+class IntentGate:
+    """Single entry point for the two-phase intent gate.
+    Raises ValueError at construction if intents is empty — catching
+    misconfiguration at startup rather than silently passing everything.
+    """
+    def __init__(self, intents: list[Intent], threshold: float = 0.35) -> None:
+        if not intents:
+            raise ValueError(
+                "IntentGate requires at least one Intent. "
+                "Declare your agent's allowed intents so the scope gate has something to enforce."
+            )
+        self._guard = IntentGuard(intents, threshold)
+        # Plain descriptions passed to Bedrock so it can scope sub-intents without local embeddings
+        self._intent_descriptions: list[str] = [
+            f"{i.name}: {i.description}" for i in intents
+        ]
+    def scan_sync(
+        self,
+        message: str,
+        *,
+        user_hash: str = "",
+        agent: str = "",
+        actor_role: str | None = None,
+        min_words: int = 2,
+        llm_guard_injection: bool = False,
+        active_task: str | None = None,
+        conversation_summary: str | None = None,
+        session_id: str = "",
+    ) -> IntentGateResult:
+        """Synchronous version for agents that don't use async/await.
+        llm_guard_injection=True means a dedicated prompt-injection classifier
+        (LLM Guard) flagged this message upstream. Rather than letting that
+        classifier hard-refuse on its own, the caller routes the hit here: it
+        forces the cloud-judge path (even if the local cosine would pass) and is
+        forwarded to the judge as a strong advisory. Fail-safe: if no judge is
+        reachable to corroborate, the message is blocked rather than allowed.
+        active_task / conversation_summary carry multi-turn context. When an
+        action is already in progress (active_task set), a bare follow-up detail
+        (a date, a name) would score near-zero against every declared intent and
+        be wrongly blocked. So an active task forces the cloud-judge path — the
+        judge sees the context and decides continuation-vs-deviation — instead of
+        trusting (or failing) the context-free local cosine. result.continuation
+        is True when the judge ruled the message a legitimate continuation.
+        """
+        # Short messages (confirmations, single words) bypass the gate entirely —
+        # UNLESS a task is in progress, where even a short follow-up ("June 3rd",
+        # "yes") needs continuation-aware judgment and a short deviation/attack
+        # must not get a free pass.
+        if not active_task and len(message.split()) <= min_words:
+            return IntentGateResult(blocked=False, passed_intents=[message])
+        compound = _is_compound(message)
+        # Non-Latin / mixed-script (code-switched, homoglyph) input is exactly
+        # where the local cosine is least trustworthy — force it to the cloud
+        # split+scope path rather than trusting a clean Phase-1 pass.
+        try:
+            from firewall_sdk import lang
+            non_latin = (
+                lang.is_mostly_non_latin(message)
+                or lang.is_mixed_script(message)
+                or lang.has_foreign_segment(
+                    message
+                )  # code-switching / sandwich attacks
+            )
+        except Exception:
+            non_latin = True  # fail toward the safer (cloud) path
+        # Phase 1: fast local cosine on the full message.
+        # If it passes cleanly, no backend call needed — one intent chip returned.
+        quick = self._guard.scan(message)
+        # A genuine semantic-similarity MISS — the guard is enabled (model
+        # installed/built) AND the full-message cosine fell below threshold — is
+        # a weak prompt-injection signal worth flagging to the cloud judge so it
+        # scrutinizes the message harder. Crucially this is distinct from "the
+        # semantic layer isn't installed": when the guard is disabled, scan()
+        # returns CLEAN, so `ready` is False here and we send NO suspicion hint.
+        semantic_miss = self._guard.ready and not quick.clean
+        # An LLM-Guard injection hit OR an active task forces the cloud-judge path
+        # even when the local cosine would otherwise fast-pass — the dedicated
+        # injection classifier (or the conversational context) sees something the
+        # scope embedding did not.
+        if (
+            quick.clean
+            and not non_latin
+            and not llm_guard_injection
+            and not active_task
+        ):
+            intent_name, conf = self._guard.classify_local(message)
+            logger.debug(
+                "IntentGate phase-1 pass: intent=%r conf=%.3f", intent_name, conf
+            )
+            return IntentGateResult(
+                blocked=False,
+                passed_intents=[message],
+                intent_name=intent_name,
+                debug=[{"text": message, "passed": True}],
+            )
+        # Phase 2: cosine failed — call backend for split + scope classification
+        bedrock_info: dict | None = None
+        try:
+            from firewall_sdk.cloud_client import get_client
+            client = get_client()
+            if client is not None:
+                bedrock_info = client.classify_intent(
+                    message,
+                    session_id=session_id,
+                    user_hash=user_hash,
+                    agent=agent,
+                    actor_role=actor_role,
+                    agent_intents=self._intent_descriptions,
+                    semantic_miss=semantic_miss,
+                    llm_guard_injection=llm_guard_injection,
+                    active_task=active_task,
+                    conversation_summary=conversation_summary,
+                )
+        except Exception as exc:
+            logger.debug("IntentGate: backend call failed (non-fatal): %s", exc)
+        # Fail-safe: an upstream LLM-Guard injection hit that we could NOT get a
+        # cloud judge to corroborate (no client configured, or the call failed)
+        # must not be silently allowed — fall back to the original hard block.
+        if llm_guard_injection and bedrock_info is None:
+            logger.debug(
+                "IntentGate: LLM-Guard hit with no reachable judge — fail-safe block"
+            )
+            return IntentGateResult(
+                blocked=True,
+                failed_intents=[message],
+                scan=ScanResult(
+                    clean=False,
+                    rule_id="LG_INPUT",
+                    rule_name="LLM Guard PromptInjection (uncorroborated)",
+                    matched_text=message[:100],
+                ),
+                debug=[{"text": message, "passed": False}],
+            )
+        bedrock_decision = (bedrock_info or {}).get("decision")
+        bedrock_confidence = (bedrock_info or {}).get("confidence")
+        continuation = bool((bedrock_info or {}).get("continuation", False))
+        # Security block
+        if (
+            bedrock_info
+            and bedrock_decision == "block"
+            and (bedrock_confidence or 0) > 0.7
+        ):
+            logger.debug(
+                "IntentGate: Bedrock security block threat=%s",
+                bedrock_info.get("threat_type"),
+            )
+            return IntentGateResult(
+                blocked=True,
+                failed_intents=[message],
+                bedrock_decision=bedrock_decision,
+                bedrock_confidence=bedrock_confidence,
+                scan=ScanResult(
+                    clean=False,
+                    rule_id="BEDROCK_INTENT",
+                    rule_name="Bedrock Intent Classification",
+                    matched_text=(bedrock_info.get("threat_type") or "threat")[:100],
+                ),
+                debug=[{"text": message, "passed": False}],
+            )
+        # Use Bedrock's split intents with in_scope flags.
+        raw_intents: list[dict] = (bedrock_info or {}).get("intents") or []
+        passed: list[str] = []
+        failed: list[str] = []
+        debug: list[dict] = []
+        if raw_intents:
+            for item in raw_intents:
+                text = item.get("text", "") if isinstance(item, dict) else str(item)
+                in_scope = (
+                    item.get("in_scope", True) if isinstance(item, dict) else True
+                )
+                if in_scope:
+                    passed.append(text)
+                    debug.append({"text": text, "passed": True})
+                else:
+                    failed.append(text)
+                    debug.append({"text": text, "passed": False})
+        elif compound and not llm_guard_injection:
+            # Bedrock returned nothing (old backend or unavailable) but we know this
+            # is a compound message — split locally by "?" and score each sub-intent.
+            # NOT used when LLM Guard flagged the message: the local cosine split is
+            # the untrusted "no judge" fallback, and letting a flagged message pass on
+            # it would defeat the fail-safe. A flagged message with no usable judge
+            # verdict falls through to the block below.
+            for sub in _local_split(message):
+                sub_scan = self._guard.scan(sub)
+                if sub_scan.clean:
+                    passed.append(sub)
+                    debug.append({"text": sub, "passed": True})
+                else:
+                    failed.append(sub)
+                    debug.append({"text": sub, "passed": False})
+        else:
+            # Simple message, cosine already failed, no backend — block it.
+            debug.append({"text": message, "passed": False})
+        if not passed:
+            return IntentGateResult(
+                blocked=True,
+                failed_intents=failed or [message],
+                bedrock_decision=bedrock_decision,
+                bedrock_confidence=bedrock_confidence,
+                scan=ScanResult(
+                    clean=False,
+                    rule_id="R-16",
+                    rule_name="Out-of-Scope Intent",
+                    matched_text=message[:100],
+                ),
+                debug=debug,
+            )
+        intent_name, _ = self._guard.classify_local(passed[0])
+        logger.debug(
+            "IntentGate phase-2 pass: %d/%d intents, locked=%r continuation=%s",
+            len(passed),
+            len(raw_intents),
+            intent_name,
+            continuation,
+        )
+        return IntentGateResult(
+            blocked=False,
+            passed_intents=passed,
+            failed_intents=failed,
+            intent_name=intent_name,
+            bedrock_decision=bedrock_decision,
+            bedrock_confidence=bedrock_confidence,
+            continuation=continuation,
+            debug=debug,
+        )