gauntlet_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gauntlet/detector.py ADDED
@@ -0,0 +1,274 @@
+ """Core Gauntlet detector with three-layer cascade.
+
+ Provides the Gauntlet class and detect() convenience function for
+ prompt injection detection.
+ """
+
+ import logging
+ import time
+
+ from gauntlet.config import get_anthropic_key, get_openai_key
+ from gauntlet.layers.rules import RulesDetector
+ from gauntlet.models import DetectionResult, LayerResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class Gauntlet:
+     """Three-layer cascade prompt injection detector.
+
+     Orchestrates detection through:
+     - Layer 1: Rules (fast regex pattern matching) - always available
+     - Layer 2: Embeddings (semantic similarity) - requires OpenAI key
+     - Layer 3: LLM Judge (Claude reasoning) - requires Anthropic key
+
+     The pipeline stops at the first layer that detects an injection.
+
+     Examples:
+         # Layer 1 only (zero config)
+         g = Gauntlet()
+         result = g.detect("ignore previous instructions")
+
+         # All layers (BYOK)
+         g = Gauntlet(openai_key="sk-...", anthropic_key="sk-ant-...")
+         result = g.detect("subtle attack")
+
+         # Auto-resolve keys from config/env
+         g = Gauntlet()  # reads ~/.gauntlet/config.toml or env vars
+     """
+
+     def __init__(
+         self,
+         openai_key: str | None = None,
+         anthropic_key: str | None = None,
+         embedding_threshold: float = 0.55,
+         embedding_model: str = "text-embedding-3-small",
+         llm_model: str = "claude-3-haiku-20240307",
+         llm_timeout: float = 3.0,
+         confidence_threshold: float = 0.70,
+     ) -> None:
+         """Initialize the Gauntlet detector.
+
+         Key resolution order:
+         1. Constructor args
+         2. Config file (~/.gauntlet/config.toml)
+         3. Environment variables (OPENAI_API_KEY, ANTHROPIC_API_KEY)
+         4. Layer 1 only (no keys needed)
+
+         Args:
+             openai_key: OpenAI API key for Layer 2.
+             anthropic_key: Anthropic API key for Layer 3.
+             embedding_threshold: Similarity threshold for Layer 2.
+             embedding_model: OpenAI embedding model name.
+             llm_model: Claude model name for Layer 3.
+             llm_timeout: Timeout for Layer 3 API calls.
+             confidence_threshold: Min confidence for Layer 3 detection.
+         """
+         # Resolve keys
+         self._openai_key = openai_key or get_openai_key()
+         self._anthropic_key = anthropic_key or get_anthropic_key()
+
+         # Layer 1: Always available
+         self._rules = RulesDetector()
+
+         # Layer 2: Embeddings (lazy init)
+         self._embeddings = None
+         self._embedding_threshold = embedding_threshold
+         self._embedding_model = embedding_model
+
+         # Layer 3: LLM Judge (lazy init)
+         self._llm = None
+         self._llm_model = llm_model
+         self._llm_timeout = llm_timeout
+         self._confidence_threshold = confidence_threshold
+
+     def _get_embeddings_detector(self):
+         """Lazy-initialize Layer 2 detector."""
+         if self._embeddings is None and self._openai_key:
+             try:
+                 from gauntlet.layers.embeddings import EmbeddingsDetector
+                 self._embeddings = EmbeddingsDetector(
+                     openai_key=self._openai_key,
+                     threshold=self._embedding_threshold,
+                     model=self._embedding_model,
+                 )
+             except ImportError:
+                 logger.debug("Layer 2 deps not installed (openai, numpy)")
+             except Exception as e:
+                 logger.warning("Failed to initialize Layer 2: %s", type(e).__name__)
+         return self._embeddings
+
+     def _get_llm_detector(self):
+         """Lazy-initialize Layer 3 detector."""
+         if self._llm is None and self._anthropic_key:
+             try:
+                 from gauntlet.layers.llm_judge import LLMDetector
+                 self._llm = LLMDetector(
+                     anthropic_key=self._anthropic_key,
+                     model=self._llm_model,
+                     timeout=self._llm_timeout,
+                     confidence_threshold=self._confidence_threshold,
+                 )
+             except ImportError:
+                 logger.debug("Layer 3 deps not installed (anthropic)")
+             except Exception as e:
+                 logger.warning("Failed to initialize Layer 3: %s", type(e).__name__)
+         return self._llm
+
+     @property
+     def available_layers(self) -> list[int]:
+         """Return list of available layer numbers."""
+         layers = [1]
+         if self._openai_key:
+             try:
+                 import numpy  # noqa: F401
+                 import openai  # noqa: F401
+                 layers.append(2)
+             except ImportError:
+                 pass
+         if self._anthropic_key:
+             try:
+                 import anthropic  # noqa: F401
+                 layers.append(3)
+             except ImportError:
+                 pass
+         return layers
+
+     def detect(
+         self,
+         text: str,
+         layers: list[int] | None = None,
+     ) -> DetectionResult:
+         """Run text through the detection cascade.
+
+         Args:
+             text: The input text to analyze.
+             layers: Specific layers to run (default: all available).
+                 e.g., [1] for rules only, [1, 2] for rules + embeddings.
+
+         Returns:
+             DetectionResult with detection outcome and layer results.
+         """
+         if not text or not text.strip():
+             return DetectionResult(
+                 is_injection=False,
+                 confidence=0.0,
+                 attack_type=None,
+                 detected_by_layer=None,
+                 layer_results=[],
+                 total_latency_ms=0.0,
+             )
+
+         start_time = time.perf_counter()
+         layer_results: list[LayerResult] = []
+         errors: list[str] = []
+         layers_skipped: list[int] = []
+         run_layers = layers or self.available_layers
+
+         if layers:
+             invalid = [l for l in layers if l not in (1, 2, 3)]
+             if invalid:
+                 raise ValueError(f"Invalid layer numbers: {invalid}. Must be 1, 2, or 3.")
+
+         def _build_result(
+             is_injection: bool = False,
+             confidence: float = 0.0,
+             attack_type: str | None = None,
+             detected_by_layer: int | None = None,
+         ) -> DetectionResult:
+             return DetectionResult(
+                 is_injection=is_injection,
+                 confidence=confidence,
+                 attack_type=attack_type,
+                 detected_by_layer=detected_by_layer,
+                 layer_results=layer_results,
+                 total_latency_ms=(time.perf_counter() - start_time) * 1000,
+                 errors=errors,
+                 layers_skipped=layers_skipped,
+             )
+
+         # Layer 1: Rules
+         if 1 in run_layers:
+             l1_result = self._rules.detect(text)
+             layer_results.append(l1_result)
+
+             if l1_result.error:
+                 errors.append(f"Layer 1 (rules): {l1_result.error}")
+
+             if l1_result.is_injection:
+                 return _build_result(
+                     is_injection=True,
+                     confidence=l1_result.confidence,
+                     attack_type=l1_result.attack_type,
+                     detected_by_layer=1,
+                 )
+
+         # Layer 2: Embeddings
+         if 2 in run_layers:
+             embeddings = self._get_embeddings_detector()
+             if embeddings:
+                 l2_result = embeddings.detect(text)
+                 layer_results.append(l2_result)
+
+                 if l2_result.error:
+                     errors.append(f"Layer 2 (embeddings): {l2_result.error}")
+
+                 if l2_result.is_injection:
+                     return _build_result(
+                         is_injection=True,
+                         confidence=l2_result.confidence,
+                         attack_type=l2_result.attack_type,
+                         detected_by_layer=2,
+                     )
+             else:
+                 layers_skipped.append(2)
+
+         # Layer 3: LLM Judge
+         if 3 in run_layers:
+             llm = self._get_llm_detector()
+             if llm:
+                 l3_result = llm.detect(text)
+                 layer_results.append(l3_result)
+
+                 if l3_result.error:
+                     errors.append(f"Layer 3 (llm_judge): {l3_result.error}")
+
+                 if l3_result.is_injection:
+                     return _build_result(
+                         is_injection=True,
+                         confidence=l3_result.confidence,
+                         attack_type=l3_result.attack_type,
+                         detected_by_layer=3,
+                     )
+             else:
+                 layers_skipped.append(3)
+
+         # No detection
+         return _build_result()
+
+
+ def detect(text: str, **kwargs) -> DetectionResult:
+     """Convenience function for quick detection.
+
+     Uses Layer 1 (rules) only by default. Pass openai_key and/or
+     anthropic_key for additional layers.
+
+     Args:
+         text: The input text to analyze.
+         **kwargs: Passed to Gauntlet constructor.
+
+     Returns:
+         DetectionResult with detection outcome.
+
+     Examples:
+         # Layer 1 only
+         result = detect("ignore previous instructions")
+
+         # All layers
+         result = detect("text", openai_key="sk-...", anthropic_key="sk-ant-...")
+     """
+     g = Gauntlet(**kwargs)
+     return g.detect(text)
+
+
+ __all__ = ["Gauntlet", "detect"]
gauntlet/exceptions.py ADDED
@@ -0,0 +1,13 @@
+ """Gauntlet exceptions."""
+
+
+ class GauntletError(Exception):
+     """Base exception for Gauntlet."""
+
+
+ class ConfigError(GauntletError):
+     """Configuration error."""
+
+
+ class DetectionError(GauntletError):
+     """Detection layer error."""
gauntlet/layers/__init__.py ADDED
@@ -0,0 +1 @@
+ """Detection layers for Gauntlet."""
gauntlet/layers/embeddings.py ADDED
@@ -0,0 +1,269 @@
+ """Layer 2: Embeddings-based prompt injection detection.
+
+ This module provides semantic similarity-based detection for prompt injection
+ attacks. It compares user input embeddings against pre-computed attack
+ embeddings using local numpy cosine similarity.
+
+ Detection flow: Input text -> OpenAI embedding -> Local cosine similarity -> threshold check
+
+ Requires: pip install gauntlet-ai[embeddings] (openai, numpy)
+ """
+
+ import logging
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from gauntlet.models import LayerResult
+
+ logger = logging.getLogger(__name__)
+
+ # Default paths for pre-computed data
+ _DATA_DIR = Path(__file__).parent.parent / "data"
+ _DEFAULT_EMBEDDINGS_PATH = _DATA_DIR / "embeddings.npz"
+ _DEFAULT_METADATA_PATH = _DATA_DIR / "metadata.json"
+
+
+ @dataclass
+ class SimilarityMatch:
+     """A single similarity match from the embeddings database."""
+
+     index: int
+     category: str
+     subcategory: str | None
+     label: str
+     similarity: float
+
+
+ class EmbeddingsDetector:
+     """Semantic similarity-based detector using local cosine similarity.
+
+     This is Layer 2 of the detection cascade - designed to catch attacks
+     that bypass Layer 1's regex patterns by using semantic similarity
+     to pre-computed attack embeddings shipped with the package.
+
+     Requires an OpenAI API key for generating input embeddings.
+     """
+
+     def __init__(
+         self,
+         openai_key: str,
+         threshold: float = 0.55,
+         model: str = "text-embedding-3-small",
+         embeddings_path: Path | None = None,
+         metadata_path: Path | None = None,
+     ) -> None:
+         """Initialize the embeddings detector.
+
+         Args:
+             openai_key: OpenAI API key for generating embeddings.
+             threshold: Similarity threshold (0.0-1.0). Default 0.55.
+             model: Embedding model name.
+             embeddings_path: Path to .npz file with pre-computed embeddings.
+             metadata_path: Path to metadata JSON file.
+         """
+         try:
+             import numpy as np
+             from openai import OpenAI
+         except ImportError:
+             raise ImportError(
+                 "Layer 2 requires openai and numpy. "
+                 "Install with: pip install gauntlet-ai[embeddings]"
+             )
+
+         self._np = np
+         self._client = OpenAI(api_key=openai_key, timeout=10.0)
+         self.threshold = threshold
+         self.model = model
+
+         # Load pre-computed embeddings
+         emb_path = embeddings_path or _DEFAULT_EMBEDDINGS_PATH
+         meta_path = metadata_path or _DEFAULT_METADATA_PATH
+
+         self._embeddings = None
+         self._metadata = None
+
+         if emb_path.exists():
+             data = np.load(str(emb_path), allow_pickle=False)
+             self._embeddings = data["embeddings"]
+         else:
+             logger.warning(f"Embeddings file not found: {emb_path}")
+
+         if meta_path.exists():
+             import json
+             with open(meta_path) as f:
+                 self._metadata = json.load(f)
+         else:
+             logger.warning(f"Metadata file not found: {meta_path}")
+
+     def _get_embedding(self, text: str) -> list[float]:
+         """Generate embedding for input text using OpenAI API.
+
+         Args:
+             text: The input text to embed.
+
+         Returns:
+             List of floats representing the embedding vector.
+         """
+         response = self._client.embeddings.create(
+             model=self.model,
+             input=text,
+         )
+         return response.data[0].embedding
+
+     def _cosine_similarity(self, query: list[float], threshold: float | None = None) -> list[tuple[int, float]]:
+         """Compute cosine similarity between query and all stored embeddings.
+
+         Args:
+             query: The query embedding vector.
+             threshold: Similarity threshold override. Uses self.threshold if None.
+
+         Returns:
+             List of (index, similarity) tuples sorted by similarity descending.
+         """
+         effective_threshold = threshold if threshold is not None else self.threshold
+         np = self._np
+         if self._embeddings is None:
+             return []
+
+         query_vec = np.array(query, dtype=np.float32)
+         query_norm = np.linalg.norm(query_vec)
+         if query_norm == 0:
+             return []
+
+         query_vec = query_vec / query_norm
+
+         # Normalize stored embeddings (they should already be normalized, but just in case)
+         norms = np.linalg.norm(self._embeddings, axis=1, keepdims=True)
+         norms = np.where(norms == 0, 1, norms)
+         normalized = self._embeddings / norms
+
+         similarities = normalized @ query_vec
+
+         # Get indices sorted by similarity (descending)
+         sorted_indices = np.argsort(similarities)[::-1]
+
+         results = []
+         for idx in sorted_indices:
+             sim = float(similarities[idx])
+             if sim < effective_threshold:
+                 break
+             # Clamp to [0, 1] to handle floating-point precision
+             sim = max(0.0, min(1.0, sim))
+             results.append((int(idx), sim))
+
+         return results
+
+     def _get_match_metadata(self, index: int) -> dict:
+         """Get metadata for a given embedding index."""
+         if self._metadata and "patterns" in self._metadata:
+             patterns = self._metadata["patterns"]
+             if 0 <= index < len(patterns):
+                 return patterns[index]
+         return {"category": "unknown", "subcategory": None, "label": "unknown"}
+
+     def detect(self, text: str) -> LayerResult:
+         """Check text for prompt injection using semantic similarity.
+
+         Args:
+             text: The input text to analyze.
+
+         Returns:
+             LayerResult with detection outcome.
+         """
+         start_time = time.perf_counter()
+
+         try:
+             if self._embeddings is None:
+                 latency_ms = (time.perf_counter() - start_time) * 1000
+                 return LayerResult(
+                     is_injection=False,
+                     confidence=0.0,
+                     attack_type=None,
+                     layer=2,
+                     latency_ms=latency_ms,
+                     details=None,
+                     error="No pre-computed embeddings found",
+                 )
+
+             embedding = self._get_embedding(text)
+             matches = self._cosine_similarity(embedding)
+
+             latency_ms = (time.perf_counter() - start_time) * 1000
+
+             if matches:
+                 top_idx, top_sim = matches[0]
+                 meta = self._get_match_metadata(top_idx)
+                 return LayerResult(
+                     is_injection=True,
+                     confidence=top_sim,
+                     attack_type=meta.get("category", "unknown"),
+                     layer=2,
+                     latency_ms=latency_ms,
+                     details={
+                         "similarity": top_sim,
+                         "matched_category": meta.get("category"),
+                         "matched_subcategory": meta.get("subcategory"),
+                         "matched_label": meta.get("label"),
+                         "threshold": self.threshold,
+                         "total_matches": len(matches),
+                     },
+                 )
+
+             return LayerResult(
+                 is_injection=False,
+                 confidence=0.0,
+                 attack_type=None,
+                 layer=2,
+                 latency_ms=latency_ms,
+                 details={"threshold": self.threshold},
+             )
+
+         except Exception as e:
+             latency_ms = (time.perf_counter() - start_time) * 1000
+             logger.warning(f"Layer 2 embeddings detection failed: {e}")
+             return LayerResult(
+                 is_injection=False,
+                 confidence=0.0,
+                 attack_type=None,
+                 layer=2,
+                 latency_ms=latency_ms,
+                 details=None,
+                 error=str(e),
+             )
+
+     def get_top_matches(self, text: str, top_k: int = 5) -> list[SimilarityMatch]:
+         """Get top similarity matches for debugging/analysis.
+
+         Args:
+             text: The input text to analyze.
+             top_k: Number of top matches to return.
+
+         Returns:
+             List of SimilarityMatch objects.
+         """
+         try:
+             embedding = self._get_embedding(text)
+
+             # Use lower threshold for debugging
+             matches = self._cosine_similarity(embedding, threshold=0.3)
+
+             results = []
+             for idx, sim in matches[:top_k]:
+                 meta = self._get_match_metadata(idx)
+                 results.append(
+                     SimilarityMatch(
+                         index=idx,
+                         category=meta.get("category", "unknown"),
+                         subcategory=meta.get("subcategory"),
+                         label=meta.get("label", "unknown"),
+                         similarity=sim,
+                     )
+                 )
+             return results
+         except Exception as e:
+             logger.warning(f"get_top_matches failed: {e}")
+             return []
+
+
+ __all__ = ["EmbeddingsDetector", "SimilarityMatch"]