PyPI - agenthacker - Versions diffs - 0.1.0__py3-none-any.whl - Mend

agenthacker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

agenthacker-0.1.0.dist-info/METADATA +403 -0
agenthacker-0.1.0.dist-info/RECORD +30 -0
agenthacker-0.1.0.dist-info/WHEEL +4 -0
agenthacker-0.1.0.dist-info/licenses/LICENSE +201 -0
agenthacker-0.1.0.dist-info/licenses/NOTICE +6 -0
firewall_sdk/__init__.py +100 -0
firewall_sdk/agent_helpers.py +128 -0
firewall_sdk/alignment_check.py +113 -0
firewall_sdk/anomaly.py +462 -0
firewall_sdk/client.py +676 -0
firewall_sdk/cloud_client.py +753 -0
firewall_sdk/constants.py +21 -0
firewall_sdk/context_summarizer.py +164 -0
firewall_sdk/event_store.py +660 -0
firewall_sdk/features.py +128 -0
firewall_sdk/intent_gate.py +325 -0
firewall_sdk/intent_guard.py +373 -0
firewall_sdk/intent_splitter.py +114 -0
firewall_sdk/invariant.py +113 -0
firewall_sdk/lang.py +311 -0
firewall_sdk/llm_guard.py +318 -0
firewall_sdk/llm_judge.py +92 -0
firewall_sdk/logger.py +273 -0
firewall_sdk/output_guard.py +150 -0
firewall_sdk/py.typed +0 -0
firewall_sdk/scan_engine.py +569 -0
firewall_sdk/schemas.py +25 -0
firewall_sdk/tool_guard.py +67 -0
firewall_sdk/trace.py +68 -0
firewall_sdk/translate_guard.py +188 -0

firewall_sdk/anomaly.py ADDED Viewed

@@ -0,0 +1,462 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2026 AgentHacker
+"""Anomaly detection and risk scoring for firewall-sdk agents.
+Score formula (0–100):
+  session_component  = session_rate × damping × 70   (max 70 pts)
+  weighted_component = weighted_rate × 30             (max 30 pts, only if total >= 8)
+  risk_score         = min(100, session + weighted)
+Session = log-decay weighted blocked rate over the last 2 hours: Σ(w×blocked)/Σ(w), w=1/ln(rank+1).
+Damping = min(1.0, session_count / 3) — ramps up over the first 3 messages so
+          a single rejection never instantly blocks anyone.
+Weighted = log-decay all-time rate: Σ(w×blocked)/Σ(w) where w=1/ln(rank+1).
+           Starts at 0 and stays 0 until the user has 8+ total invocations.
+Usage (inside an agent endpoint):
+    from firewall_sdk import check_user_risk, RiskLevel
+    risk = check_user_risk(user_hash, store._conn)
+    if risk.level == RiskLevel.CRITICAL:
+        raise HTTPException(423, "Locked", headers={"Retry-After": "3600"})
+"""
+from __future__ import annotations
+import logging
+import os
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any
+logger = logging.getLogger(__name__)
+# Length, in hours, of the look-back window that counts as the "current session".
+_SESSION_HOURS = 2
+# The weighted component stays at 0 until the user has this many total messages.
+_WEIGHTED_MIN_MSGS = 8
+# The session rate ramps up to full weight after this many session messages.
+_SESSION_DAMPING_MSGS = 3
+# ── Feature flag ──────────────────────────────────────────────────────
+def is_anomaly_enabled() -> bool:
+    """Return True if anomaly detection is active (default: True).
+    Reads the env var FIREWALL_ANOMALY_DETECTION on every call. The values
+    "0", "false", "no" and "off" (case-insensitive, surrounding whitespace
+    ignored) disable detection; anything else, including an unset or empty
+    variable, leaves it enabled. When disabled, check_user_risk() always
+    returns LOW so no enforcement happens and the dashboard can hide the
+    risk panel entirely.
+    """
+    raw = os.environ.get("FIREWALL_ANOMALY_DETECTION", "1")
+    # Strip before comparing so values such as "0 " or " off" (easy to produce
+    # in .env files and shell exports) are still recognised as "disabled".
+    return raw.strip().lower() not in ("0", "false", "no", "off")
+# ── Public types ──────────────────────────────────────────────────────
+class RiskLevel(str, Enum):
+    """Risk tier that a numerical risk score maps onto."""
+    LOW = "LOW"  # 0–30:  proceed normally
+    MEDIUM = "MEDIUM"  # 31–60: read-only tools
+    HIGH = "HIGH"  # 61–80: read-only + shorter context + system-prompt flag
+    CRITICAL = "CRITICAL"  # 81–100: block (HTTP 423)
+    @classmethod
+    def from_score(cls, score: float) -> "RiskLevel":
+        """Return the tier for score, checking the highest threshold first.
+        A score at or above 81 is CRITICAL, at or above 61 HIGH, at or above
+        31 MEDIUM; everything else (including values between tiers such as
+        30.5) is LOW.
+        """
+        thresholds = (
+            (81, cls.CRITICAL),
+            (61, cls.HIGH),
+            (31, cls.MEDIUM),
+        )
+        matching = (tier for floor, tier in thresholds if score >= floor)
+        return next(matching, cls.LOW)
+@dataclass
+class RiskFactor:
+    """One component of a risk score together with the points it contributes.
+    Attributes:
+        name: short identifier of the factor.
+        description: human-readable explanation of the factor.
+        contribution: points this factor adds to the final score.
+        signal_value: raw underlying value, kept for display in the dashboard.
+    """
+    name: str
+    description: str
+    contribution: float
+    signal_value: Any
+@dataclass
+class RiskScore:
+    """Full risk assessment computed for one user.
+    Attributes:
+        user_hash: identifier of the user the assessment belongs to.
+        score: numerical risk, 0.0–100.0 (clamped by the code that builds it).
+        level: tier associated with the score.
+        factors: contributing factors; a fresh empty list by default.
+        computed_at: creation time, timezone-aware UTC by default.
+    """
+    user_hash: str
+    score: float
+    level: RiskLevel
+    factors: list[RiskFactor] = field(default_factory=list)
+    computed_at: datetime = field(
+        default_factory=lambda: datetime.now(tz=timezone.utc)
+    )
+# ── Scorer ────────────────────────────────────────────────────────────
+class RiskScorer:
+    """Stateless risk scorer — accepts a psycopg2 connection and queries Postgres.
+    Combines two signals read from the ``invocations`` table (columns
+    ``user_hash``, ``timestamp``, ``blocked``): a session threat rate over the
+    last ``window_hours`` (max 70 pts) and a log-decay weighted all-time rate
+    (max 30 pts). The sum is clamped to 100.
+    Degrades gracefully when conn is None (NullStore paths) or when scoring
+    raises: returns RiskScore(score=0.0, level=LOW, factors=[]). A failure of
+    a single signal only drops that signal from the result.
+    """
+    def compute(
+        self,
+        user_hash: str,
+        conn: Any,
+        window_hours: int = _SESSION_HOURS,
+    ) -> RiskScore:
+        """Compute a RiskScore for user_hash.
+        Args:
+            user_hash: value matched against ``invocations.user_hash``.
+            conn: connection object exposing ``cursor()``; None yields a zero score.
+            window_hours: length of the session window in hours; it is now
+                forwarded to the session signal instead of being ignored.
+        Returns:
+            The computed RiskScore, or a zero/LOW score when conn is None or
+            an unexpected error occurs (the error is logged at WARNING).
+        """
+        if conn is None:
+            return self._zero(user_hash)
+        try:
+            return self._compute_internal(user_hash, conn, window_hours)
+        except Exception:
+            logger.warning(
+                "RiskScorer.compute failed for %s — returning LOW",
+                user_hash,
+                exc_info=True,
+            )
+            return self._zero(user_hash)
+    # ── Internal ─────────────────────────────────────────────────────
+    @staticmethod
+    def _zero(user_hash: str) -> RiskScore:
+        """Return the neutral result: score 0.0, level LOW, no factors."""
+        return RiskScore(user_hash=user_hash, score=0.0, level=RiskLevel.LOW)
+    def _compute_internal(
+        self, user_hash: str, conn: Any, window_hours: int = _SESSION_HOURS
+    ) -> RiskScore:
+        """Run every signal, sum the contributions and clamp the total to 100."""
+        signals = (
+            (self._signal_session, (user_hash, conn, window_hours)),
+            (self._signal_weighted, (user_hash, conn)),
+        )
+        factors: list[RiskFactor] = []
+        for signal_fn, args in signals:
+            try:
+                factors.append(signal_fn(*args))
+            except Exception:
+                logger.debug(
+                    "Signal %s failed — skipping", signal_fn.__name__, exc_info=True
+                )
+        score = round(min(100.0, sum(f.contribution for f in factors)), 2)
+        return RiskScore(
+            user_hash=user_hash,
+            score=score,
+            level=RiskLevel.from_score(score),
+            # Zero-contribution factors are kept for the dashboard; a signal
+            # that raised above is absent, so fewer than two may be present.
+            factors=factors,
+        )
+    # ── Signal 1: session threat rate (max 70 pts) ────────────────────
+    #
+    # Uses log-decay recency weighting within the session window so the most
+    # recent blocked messages count more than older ones in the same session.
+    # weight for rank r = 1 / ln(r + 1), r=1 is most recent message.
+    #
+    # weighted_rate = Σ(w × blocked) / Σ(w)   within the session window
+    # damping       = min(1.0, session_count / 3)  — ramps over first 3 msgs
+    # contribution  = weighted_rate × damping × 70
+    def _signal_session(
+        self, user_hash: str, conn: Any, window_hours: int = _SESSION_HOURS
+    ) -> RiskFactor:
+        """Return the session threat-rate factor (0–70 points).
+        Args:
+            user_hash: value matched against ``invocations.user_hash``.
+            conn: connection object exposing ``cursor()``.
+            window_hours: how many hours back from NOW() the session reaches.
+        """
+        sql = """
+            SELECT
+                COUNT(*) AS session_total,
+                COUNT(*) FILTER (WHERE blocked) AS session_blocked,
+                SUM(CASE WHEN blocked THEN 1.0/LN(rn::float + 1) ELSE 0 END) AS w_flagged,
+                SUM(1.0/LN(rn::float + 1)) AS w_total
+            FROM (
+                SELECT blocked,
+                       ROW_NUMBER() OVER (ORDER BY timestamp DESC) AS rn
+                FROM invocations
+                WHERE user_hash = %s
+                  AND timestamp > NOW() - INTERVAL %s
+            ) t
+        """
+        # The window is passed as a bound parameter, never formatted into the
+        # SQL text. NOTE(review): this relies on the driver rendering the
+        # parameter as a quoted literal after INTERVAL (psycopg2 behaviour).
+        row = _row4(conn, sql, (user_hash, f"{window_hours} hours"))
+        session_total = int(row[0] or 0)
+        session_blocked = int(row[1] or 0)
+        w_flagged = float(row[2] or 0)
+        w_total = float(row[3] or 0)
+        if session_total == 0:
+            return RiskFactor(
+                name="session_threat_rate",
+                description="No activity in current session",
+                contribution=0.0,
+                signal_value={"blocked": 0, "total": 0, "rate": 0.0},
+            )
+        weighted_rate = w_flagged / w_total if w_total > 0 else 0.0
+        # Damping keeps a single rejection in a brand-new session from
+        # carrying the full 70 points.
+        damping = min(1.0, session_total / _SESSION_DAMPING_MSGS)
+        contribution = round(weighted_rate * damping * 70.0, 2)
+        return RiskFactor(
+            name="session_threat_rate",
+            description=(
+                f"{session_blocked}/{session_total} requests blocked in current session "
+                f"(log-decay weighted rate {weighted_rate * 100:.1f}%, damping {damping:.2f})"
+            ),
+            contribution=contribution,
+            signal_value={
+                "blocked": session_blocked,
+                "total": session_total,
+                "rate": round(weighted_rate, 3),
+                "damping": round(damping, 3),
+            },
+        )
+    # ── Signal 2: weighted all-time threat rate (max 30 pts) ──────────
+    #
+    # weight for rank r = 1 / ln(r + 1)   (r=1 is most recent)
+    # weighted_rate = Σ(w × blocked) / Σ(w)
+    # Only activates once the user has >= 8 total invocations.
+    # Before that, contribution is always 0 — one bad message in a fresh
+    # account cannot accumulate history-based risk.
+    def _signal_weighted(self, user_hash: str, conn: Any) -> RiskFactor:
+        """Return the all-time weighted threat-rate factor (0–30 points).
+        The contribution is 0 while the user has fewer than
+        ``_WEIGHTED_MIN_MSGS`` rows in ``invocations``.
+        """
+        sql = """
+            SELECT
+                COUNT(*) AS total_count,
+                SUM(CASE WHEN blocked THEN 1.0/LN(rn::float + 1) ELSE 0 END) AS w_flagged,
+                SUM(1.0/LN(rn::float + 1)) AS w_total
+            FROM (
+                SELECT blocked,
+                       ROW_NUMBER() OVER (ORDER BY timestamp DESC) AS rn
+                FROM invocations
+                WHERE user_hash = %s
+            ) t
+        """
+        row = _row3(conn, sql, (user_hash,))
+        total_count = int(row[0] or 0)
+        w_flagged = float(row[1] or 0)
+        w_total = float(row[2] or 0)
+        if total_count < _WEIGHTED_MIN_MSGS:
+            return RiskFactor(
+                name="weighted_all_time_rate",
+                description=(
+                    f"Weighted history not yet active "
+                    f"({total_count}/{_WEIGHTED_MIN_MSGS} messages minimum)"
+                ),
+                contribution=0.0,
+                signal_value={
+                    "total_count": total_count,
+                    "min_required": _WEIGHTED_MIN_MSGS,
+                },
+            )
+        weighted_rate = w_flagged / w_total if w_total > 0 else 0.0
+        contribution = round(weighted_rate * 30.0, 2)
+        return RiskFactor(
+            name="weighted_all_time_rate",
+            description=(
+                f"Weighted all-time block rate: {weighted_rate * 100:.1f}% "
+                f"across {total_count} total requests (log-decay)"
+            ),
+            contribution=contribution,
+            signal_value={
+                "total_count": total_count,
+                "weighted_rate": round(weighted_rate, 3),
+            },
+        )
+# ── Content signal: script anomaly (soft) ────────────────────────────
+def script_risk_factor(text: str) -> RiskFactor | None:
+    """Build a soft RiskFactor for non-Latin / mixed-script / homoglyph input.
+    The contribution is whatever ``lang.script_risk(text)`` returns (noted in
+    this module as capped at 15 points) — a soft signal that nudges
+    behavioural risk for a burst of homoglyph/mixed-script probing, never a
+    hard block.
+    Args:
+        text: the input to inspect; its first 80 characters are stored in
+            the factor's ``signal_value`` as ``text_preview``.
+    Returns:
+        A ``script_anomaly`` RiskFactor, or None when the contribution is not
+        positive or when importing/calling ``lang.script_risk`` raises, so
+        legitimate non-English traffic with no anomaly stays unaffected.
+    """
+    try:
+        from firewall_sdk import lang
+        contribution = lang.script_risk(text)
+    except Exception:
+        # Best-effort signal: never break the request, but leave a trace at
+        # DEBUG like the other fallback paths in this module do.
+        logger.debug(
+            "script_risk_factor failed — no script factor added", exc_info=True
+        )
+        return None
+    if contribution <= 0.0:
+        return None
+    return RiskFactor(
+        name="script_anomaly",
+        description="Non-Latin / mixed-script / homoglyph-confusable input",
+        contribution=contribution,
+        signal_value={"text_preview": text[:80]},
+    )
+def _apply_extra_factors(
+    risk: RiskScore, extra_factors: list[RiskFactor] | None
+) -> RiskScore:
+    """Fold opt-in extra factors into a RiskScore, re-clamping score and level.
+    The extra contributions are added on top of ``risk.score`` instead of
+    re-summing ``risk.factors``: a score supplied by the cloud backend need
+    not equal the sum of the factors shipped with it (the list may even be
+    empty), so re-summing could lower an existing score.
+    Args:
+        risk: the base assessment; returned unchanged when there is nothing
+            to add or when it carries a ``manual_pardon`` factor (a manual
+            pardon means the user is trusted).
+        extra_factors: additional factors to append, or None/empty.
+    Returns:
+        ``risk`` itself, or a new RiskScore with the combined factors, the
+        score clamped to at most 100.0, the level re-derived from that score
+        and the original ``computed_at``.
+    """
+    if not extra_factors:
+        return risk
+    if any(f.name == "manual_pardon" for f in risk.factors):
+        return risk
+    extras = list(extra_factors)
+    added = sum(f.contribution for f in extras)
+    score = round(min(100.0, risk.score + added), 2)
+    return RiskScore(
+        user_hash=risk.user_hash,
+        score=score,
+        level=RiskLevel.from_score(score),
+        factors=risk.factors + extras,
+        computed_at=risk.computed_at,
+    )
+# ── Query helpers ─────────────────────────────────────────────────────
+def _row(conn: Any, sql: str, params: tuple) -> tuple:
+    """Execute sql with params on a fresh cursor and return its first row.
+    A missing (or empty) row is replaced by ``(None, None)``.
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(sql, params)
+        first = cursor.fetchone()
+    if not first:
+        return (None, None)
+    return first
+def _row3(conn: Any, sql: str, params: tuple) -> tuple:
+    """Execute sql with params on a fresh cursor and return its first row.
+    A missing (or empty) row is replaced by ``(None, None, None)``.
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(sql, params)
+        first = cursor.fetchone()
+    if not first:
+        return (None, None, None)
+    return first
+def _row4(conn: Any, sql: str, params: tuple) -> tuple:
+    """Execute sql with params on a fresh cursor and return its first row.
+    A missing (or empty) row is replaced by ``(None, None, None, None)``.
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(sql, params)
+        first = cursor.fetchone()
+    if not first:
+        return (None, None, None, None)
+    return first
+# ── Public convenience function ───────────────────────────────────────
+def _pardon_deadline(value: Any) -> datetime | None:
+    """Coerce a pardon timestamp to a timezone-aware datetime, or None if unusable.
+    Accepts a datetime or an ISO-8601 string. Anything else, or a string that
+    cannot be parsed, yields None so the caller treats the user as not pardoned.
+    """
+    if isinstance(value, str):
+        text = value.strip()
+        # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11
+        # on; rewrite it as an explicit UTC offset so older runtimes parse it.
+        if text.endswith(("Z", "z")):
+            text = text[:-1] + "+00:00"
+        try:
+            value = datetime.fromisoformat(text)
+        except ValueError:
+            return None
+    if not isinstance(value, datetime):
+        return None
+    if value.tzinfo is None:
+        # Comparing a naive value with the aware "now" would raise TypeError.
+        # NOTE(review): naive timestamps are assumed to be UTC — confirm
+        # against the backend / the user_risk_scores column type.
+        value = value.replace(tzinfo=timezone.utc)
+    return value
+def _pardoned_score(user_hash: str, description: str, pardoned_until: Any) -> RiskScore:
+    """Build the zero/LOW RiskScore returned for a user with an active pardon."""
+    return RiskScore(
+        user_hash=user_hash,
+        score=0.0,
+        level=RiskLevel.LOW,
+        factors=[
+            RiskFactor(
+                name="manual_pardon",
+                description=description,
+                contribution=0.0,
+                signal_value={"pardoned_until": pardoned_until},
+            )
+        ],
+    )
+def check_user_risk(
+    user_hash: str,
+    conn: Any,
+    window_hours: int = _SESSION_HOURS,
+    *,
+    extra_factors: list[RiskFactor] | None = None,
+) -> RiskScore:
+    """Compute and return a RiskScore for user_hash.
+    Returns a zero/LOW score immediately when anomaly detection is disabled.
+    When a cloud client is configured, delegates to the centralized Aurora
+    backend (``client.get_risk_score``) for cross-session, cross-instance
+    risk scoring. Falls back to the local path if the cloud client is
+    unavailable, returns None, or raises.
+    The local path checks ``user_risk_scores.pardoned_until`` for an active
+    manual pardon first, then runs the local RiskScorer against conn.
+    Args:
+        user_hash: identifier of the user to score.
+        conn: local DB connection exposing ``cursor()``, or None.
+        window_hours: forwarded to ``RiskScorer.compute`` on the local path;
+            not used by the cloud path.
+        extra_factors: optional opt-in content factors (e.g.
+            ``script_risk_factor``) folded into the final score and level.
+            Ignored for pardoned users.
+    Returns:
+        The RiskScore from the cloud backend, a ``manual_pardon`` score, or
+        the locally computed score — in that order of precedence.
+    """
+    if not is_anomaly_enabled():
+        return RiskScore(user_hash=user_hash, score=0.0, level=RiskLevel.LOW)
+    # ── Cloud path: delegate to centralized Aurora ─────────────────────
+    try:
+        from firewall_sdk.cloud_client import get_client
+        client = get_client()
+        data = client.get_risk_score(user_hash) if client is not None else None
+        if data is not None:
+            raw_until = data.get("pardoned_until")
+            deadline = _pardon_deadline(raw_until) if raw_until else None
+            if deadline is not None and deadline > datetime.now(timezone.utc):
+                return _pardoned_score(
+                    user_hash, "Pardoned via admin dashboard", raw_until
+                )
+            factors = [
+                RiskFactor(
+                    name=f["name"],
+                    description=f["description"],
+                    contribution=f["contribution"],
+                    signal_value=f.get("signal_value"),
+                )
+                for f in data.get("factors", [])
+            ]
+            return _apply_extra_factors(
+                RiskScore(
+                    user_hash=user_hash,
+                    score=float(data.get("score", 0.0)),
+                    level=RiskLevel(data.get("level", "LOW")),
+                    factors=factors,
+                ),
+                extra_factors,
+            )
+    except Exception:
+        logger.debug(
+            "check_user_risk cloud lookup failed — falling back to local", exc_info=True
+        )
+    # ── Local path: psycopg2 + pardon check ───────────────────────────
+    if conn is not None:
+        try:
+            with conn.cursor() as cur:
+                cur.execute(
+                    "SELECT pardoned_until FROM user_risk_scores WHERE user_hash = %s",
+                    (user_hash,),
+                )
+                row = cur.fetchone()
+            deadline = _pardon_deadline(row[0]) if row and row[0] else None
+            if deadline is not None and deadline > datetime.now(timezone.utc):
+                until_iso = deadline.isoformat()
+                return _pardoned_score(
+                    user_hash,
+                    f"Manually reset by admin — active until {until_iso}",
+                    until_iso,
+                )
+        except Exception:
+            # NOTE(review): if this was a database error, the connection's
+            # transaction is presumably left aborted until a rollback, so the
+            # scorer's queries below may fail too and yield LOW — verify and
+            # consider rolling back here.
+            logger.debug(
+                "check_user_risk pardon lookup failed — proceeding with compute",
+                exc_info=True,
+            )
+    return _apply_extra_factors(
+        RiskScorer().compute(user_hash, conn, window_hours), extra_factors
+    )