gauntlet_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gauntlet/detector.py ADDED
@@ -0,0 +1,274 @@
+ """Core Gauntlet detector with three-layer cascade.
+
+ Provides the Gauntlet class and detect() convenience function for
+ prompt injection detection.
+ """
+
+ import logging
+ import time
+
+ from gauntlet.config import get_anthropic_key, get_openai_key
+ from gauntlet.layers.rules import RulesDetector
+ from gauntlet.models import DetectionResult, LayerResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class Gauntlet:
+     """Three-layer cascade prompt injection detector.
+
+     Orchestrates detection through:
+     - Layer 1: Rules (fast regex pattern matching) - always available
+     - Layer 2: Embeddings (semantic similarity) - requires OpenAI key
+     - Layer 3: LLM Judge (Claude reasoning) - requires Anthropic key
+
+     The pipeline stops at the first layer that detects an injection.
+
+     Examples:
+         # Layer 1 only (zero config)
+         g = Gauntlet()
+         result = g.detect("ignore previous instructions")
+
+         # All layers (BYOK)
+         g = Gauntlet(openai_key="sk-...", anthropic_key="sk-ant-...")
+         result = g.detect("subtle attack")
+
+         # Auto-resolve keys from config/env
+         g = Gauntlet()  # reads ~/.gauntlet/config.toml or env vars
+     """
+
+     def __init__(
+         self,
+         openai_key: str | None = None,
+         anthropic_key: str | None = None,
+         embedding_threshold: float = 0.55,
+         embedding_model: str = "text-embedding-3-small",
+         llm_model: str = "claude-3-haiku-20240307",
+         llm_timeout: float = 3.0,
+         confidence_threshold: float = 0.70,
+     ) -> None:
+         """Initialize the Gauntlet detector.
+
+         Key resolution order:
+         1. Constructor args
+         2. Config file (~/.gauntlet/config.toml)
+         3. Environment variables (OPENAI_API_KEY, ANTHROPIC_API_KEY)
+         4. Layer 1 only (no keys needed)
+
+         Args:
+             openai_key: OpenAI API key for Layer 2.
+             anthropic_key: Anthropic API key for Layer 3.
+             embedding_threshold: Similarity threshold for Layer 2.
+             embedding_model: OpenAI embedding model name.
+             llm_model: Claude model name for Layer 3.
+             llm_timeout: Timeout for Layer 3 API calls.
+             confidence_threshold: Min confidence for Layer 3 detection.
+         """
+         # Resolve keys
+         self._openai_key = openai_key or get_openai_key()
+         self._anthropic_key = anthropic_key or get_anthropic_key()
+
+         # Layer 1: Always available
+         self._rules = RulesDetector()
+
+         # Layer 2: Embeddings (lazy init)
+         self._embeddings = None
+         self._embedding_threshold = embedding_threshold
+         self._embedding_model = embedding_model
+
+         # Layer 3: LLM Judge (lazy init)
+         self._llm = None
+         self._llm_model = llm_model
+         self._llm_timeout = llm_timeout
+         self._confidence_threshold = confidence_threshold
+
+     def _get_embeddings_detector(self):
+         """Lazy-initialize Layer 2 detector."""
+         if self._embeddings is None and self._openai_key:
+             try:
+                 from gauntlet.layers.embeddings import EmbeddingsDetector
+                 self._embeddings = EmbeddingsDetector(
+                     openai_key=self._openai_key,
+                     threshold=self._embedding_threshold,
+                     model=self._embedding_model,
+                 )
+             except ImportError:
+                 logger.debug("Layer 2 deps not installed (openai, numpy)")
+             except Exception as e:
+                 logger.warning("Failed to initialize Layer 2: %s", type(e).__name__)
+         return self._embeddings
+
+     def _get_llm_detector(self):
+         """Lazy-initialize Layer 3 detector."""
+         if self._llm is None and self._anthropic_key:
+             try:
+                 from gauntlet.layers.llm_judge import LLMDetector
+                 self._llm = LLMDetector(
+                     anthropic_key=self._anthropic_key,
+                     model=self._llm_model,
+                     timeout=self._llm_timeout,
+                     confidence_threshold=self._confidence_threshold,
+                 )
+             except ImportError:
+                 logger.debug("Layer 3 deps not installed (anthropic)")
+             except Exception as e:
+                 logger.warning("Failed to initialize Layer 3: %s", type(e).__name__)
+         return self._llm
+
+     @property
+     def available_layers(self) -> list[int]:
+         """Return list of available layer numbers."""
+         layers = [1]
+         if self._openai_key:
+             try:
+                 import numpy  # noqa: F401
+                 import openai  # noqa: F401
+                 layers.append(2)
+             except ImportError:
+                 pass
+         if self._anthropic_key:
+             try:
+                 import anthropic  # noqa: F401
+                 layers.append(3)
+             except ImportError:
+                 pass
+         return layers
+
+     def detect(
+         self,
+         text: str,
+         layers: list[int] | None = None,
+     ) -> DetectionResult:
+         """Run text through the detection cascade.
+
+         Args:
+             text: The input text to analyze.
+             layers: Specific layers to run (default: all available).
+                 e.g., [1] for rules only, [1, 2] for rules + embeddings.
+
+         Returns:
+             DetectionResult with detection outcome and layer results.
+         """
+         if not text or not text.strip():
+             return DetectionResult(
+                 is_injection=False,
+                 confidence=0.0,
+                 attack_type=None,
+                 detected_by_layer=None,
+                 layer_results=[],
+                 total_latency_ms=0.0,
+             )
+
+         start_time = time.perf_counter()
+         layer_results: list[LayerResult] = []
+         errors: list[str] = []
+         layers_skipped: list[int] = []
+         run_layers = layers or self.available_layers
+
+         if layers:
+             invalid = [l for l in layers if l not in (1, 2, 3)]
+             if invalid:
+                 raise ValueError(f"Invalid layer numbers: {invalid}. Must be 1, 2, or 3.")
+
+         def _build_result(
+             is_injection: bool = False,
+             confidence: float = 0.0,
+             attack_type: str | None = None,
+             detected_by_layer: int | None = None,
+         ) -> DetectionResult:
+             return DetectionResult(
+                 is_injection=is_injection,
+                 confidence=confidence,
+                 attack_type=attack_type,
+                 detected_by_layer=detected_by_layer,
+                 layer_results=layer_results,
+                 total_latency_ms=(time.perf_counter() - start_time) * 1000,
+                 errors=errors,
+                 layers_skipped=layers_skipped,
+             )
+
+         # Layer 1: Rules
+         if 1 in run_layers:
+             l1_result = self._rules.detect(text)
+             layer_results.append(l1_result)
+
+             if l1_result.error:
+                 errors.append(f"Layer 1 (rules): {l1_result.error}")
+
+             if l1_result.is_injection:
+                 return _build_result(
+                     is_injection=True,
+                     confidence=l1_result.confidence,
+                     attack_type=l1_result.attack_type,
+                     detected_by_layer=1,
+                 )
+
+         # Layer 2: Embeddings
+         if 2 in run_layers:
+             embeddings = self._get_embeddings_detector()
+             if embeddings:
+                 l2_result = embeddings.detect(text)
+                 layer_results.append(l2_result)
+
+                 if l2_result.error:
+                     errors.append(f"Layer 2 (embeddings): {l2_result.error}")
+
+                 if l2_result.is_injection:
+                     return _build_result(
+                         is_injection=True,
+                         confidence=l2_result.confidence,
+                         attack_type=l2_result.attack_type,
+                         detected_by_layer=2,
+                     )
+             else:
+                 layers_skipped.append(2)
+
+         # Layer 3: LLM Judge
+         if 3 in run_layers:
+             llm = self._get_llm_detector()
+             if llm:
+                 l3_result = llm.detect(text)
+                 layer_results.append(l3_result)
+
+                 if l3_result.error:
+                     errors.append(f"Layer 3 (llm_judge): {l3_result.error}")
+
+                 if l3_result.is_injection:
+                     return _build_result(
+                         is_injection=True,
+                         confidence=l3_result.confidence,
+                         attack_type=l3_result.attack_type,
+                         detected_by_layer=3,
+                     )
+             else:
+                 layers_skipped.append(3)
+
+         # No detection
+         return _build_result()
+
+
+ def detect(text: str, **kwargs) -> DetectionResult:
+     """Convenience function for quick detection.
+
+     Uses Layer 1 (rules) only by default. Pass openai_key and/or
+     anthropic_key for additional layers.
+
+     Args:
+         text: The input text to analyze.
+         **kwargs: Passed to Gauntlet constructor.
+
+     Returns:
+         DetectionResult with detection outcome.
+
+     Examples:
+         # Layer 1 only
+         result = detect("ignore previous instructions")
+
+         # All layers
+         result = detect("text", openai_key="sk-...", anthropic_key="sk-ant-...")
+     """
+     g = Gauntlet(**kwargs)
+     return g.detect(text)
+
+
+ __all__ = ["Gauntlet", "detect"]
gauntlet/exceptions.py ADDED
@@ -0,0 +1,13 @@
+ """Gauntlet exceptions."""
+
+
+ class GauntletError(Exception):
+     """Base exception for Gauntlet."""
+
+
+ class ConfigError(GauntletError):
+     """Configuration error."""
+
+
+ class DetectionError(GauntletError):
+     """Detection layer error."""
gauntlet/layers/__init__.py ADDED
@@ -0,0 +1 @@
+ """Detection layers for Gauntlet."""
gauntlet/layers/embeddings.py ADDED
@@ -0,0 +1,269 @@
+ """Layer 2: Embeddings-based prompt injection detection.
+
+ This module provides semantic similarity-based detection for prompt injection
+ attacks. It compares user input embeddings against pre-computed attack
+ embeddings using local numpy cosine similarity.
+
+ Detection flow: Input text -> OpenAI embedding -> Local cosine similarity -> threshold check
+
+ Requires: pip install gauntlet-ai[embeddings] (openai, numpy)
+ """
+
+ import logging
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from gauntlet.models import LayerResult
+
+ logger = logging.getLogger(__name__)
+
+ # Default paths for pre-computed data
+ _DATA_DIR = Path(__file__).parent.parent / "data"
+ _DEFAULT_EMBEDDINGS_PATH = _DATA_DIR / "embeddings.npz"
+ _DEFAULT_METADATA_PATH = _DATA_DIR / "metadata.json"
+
+
+ @dataclass
+ class SimilarityMatch:
+     """A single similarity match from the embeddings database."""
+
+     index: int
+     category: str
+     subcategory: str | None
+     label: str
+     similarity: float
+
+
+ class EmbeddingsDetector:
+     """Semantic similarity-based detector using local cosine similarity.
+
+     This is Layer 2 of the detection cascade - designed to catch attacks
+     that bypass Layer 1's regex patterns by using semantic similarity
+     to pre-computed attack embeddings shipped with the package.
+
+     Requires an OpenAI API key for generating input embeddings.
+     """
+
+     def __init__(
+         self,
+         openai_key: str,
+         threshold: float = 0.55,
+         model: str = "text-embedding-3-small",
+         embeddings_path: Path | None = None,
+         metadata_path: Path | None = None,
+     ) -> None:
+         """Initialize the embeddings detector.
+
+         Args:
+             openai_key: OpenAI API key for generating embeddings.
+             threshold: Similarity threshold (0.0-1.0). Default 0.55.
+             model: Embedding model name.
+             embeddings_path: Path to .npz file with pre-computed embeddings.
+             metadata_path: Path to metadata JSON file.
+         """
+         try:
+             import numpy as np
+             from openai import OpenAI
+         except ImportError:
+             raise ImportError(
+                 "Layer 2 requires openai and numpy. "
+                 "Install with: pip install gauntlet-ai[embeddings]"
+             )
+
+         self._np = np
+         self._client = OpenAI(api_key=openai_key, timeout=10.0)
+         self.threshold = threshold
+         self.model = model
+
+         # Load pre-computed embeddings
+         emb_path = embeddings_path or _DEFAULT_EMBEDDINGS_PATH
+         meta_path = metadata_path or _DEFAULT_METADATA_PATH
+
+         self._embeddings = None
+         self._metadata = None
+
+         if emb_path.exists():
+             data = np.load(str(emb_path), allow_pickle=False)
+             self._embeddings = data["embeddings"]
+         else:
+             logger.warning(f"Embeddings file not found: {emb_path}")
+
+         if meta_path.exists():
+             import json
+             with open(meta_path) as f:
+                 self._metadata = json.load(f)
+         else:
+             logger.warning(f"Metadata file not found: {meta_path}")
+
+     def _get_embedding(self, text: str) -> list[float]:
+         """Generate embedding for input text using OpenAI API.
+
+         Args:
+             text: The input text to embed.
+
+         Returns:
+             List of floats representing the embedding vector.
+         """
+         response = self._client.embeddings.create(
+             model=self.model,
+             input=text,
+         )
+         return response.data[0].embedding
+
+     def _cosine_similarity(self, query: list[float], threshold: float | None = None) -> list[tuple[int, float]]:
+         """Compute cosine similarity between query and all stored embeddings.
+
+         Args:
+             query: The query embedding vector.
+             threshold: Similarity threshold override. Uses self.threshold if None.
+
+         Returns:
+             List of (index, similarity) tuples sorted by similarity descending.
+         """
+         effective_threshold = threshold if threshold is not None else self.threshold
+         np = self._np
+         if self._embeddings is None:
+             return []
+
+         query_vec = np.array(query, dtype=np.float32)
+         query_norm = np.linalg.norm(query_vec)
+         if query_norm == 0:
+             return []
+
+         query_vec = query_vec / query_norm
+
+         # Normalize stored embeddings (they should already be normalized, but just in case)
+         norms = np.linalg.norm(self._embeddings, axis=1, keepdims=True)
+         norms = np.where(norms == 0, 1, norms)
+         normalized = self._embeddings / norms
+
+         similarities = normalized @ query_vec
+
+         # Get indices sorted by similarity (descending)
+         sorted_indices = np.argsort(similarities)[::-1]
+
+         results = []
+         for idx in sorted_indices:
+             sim = float(similarities[idx])
+             if sim < effective_threshold:
+                 break
+             # Clamp to [0, 1] to handle floating-point precision
+             sim = max(0.0, min(1.0, sim))
+             results.append((int(idx), sim))
+
+         return results
+
+     def _get_match_metadata(self, index: int) -> dict:
+         """Get metadata for a given embedding index."""
+         if self._metadata and "patterns" in self._metadata:
+             patterns = self._metadata["patterns"]
+             if 0 <= index < len(patterns):
+                 return patterns[index]
+         return {"category": "unknown", "subcategory": None, "label": "unknown"}
+
+     def detect(self, text: str) -> LayerResult:
+         """Check text for prompt injection using semantic similarity.
+
+         Args:
+             text: The input text to analyze.
+
+         Returns:
+             LayerResult with detection outcome.
+         """
+         start_time = time.perf_counter()
+
+         try:
+             if self._embeddings is None:
+                 latency_ms = (time.perf_counter() - start_time) * 1000
+                 return LayerResult(
+                     is_injection=False,
+                     confidence=0.0,
+                     attack_type=None,
+                     layer=2,
+                     latency_ms=latency_ms,
+                     details=None,
+                     error="No pre-computed embeddings found",
+                 )
+
+             embedding = self._get_embedding(text)
+             matches = self._cosine_similarity(embedding)
+
+             latency_ms = (time.perf_counter() - start_time) * 1000
+
+             if matches:
+                 top_idx, top_sim = matches[0]
+                 meta = self._get_match_metadata(top_idx)
+                 return LayerResult(
+                     is_injection=True,
+                     confidence=top_sim,
+                     attack_type=meta.get("category", "unknown"),
+                     layer=2,
+                     latency_ms=latency_ms,
+                     details={
+                         "similarity": top_sim,
+                         "matched_category": meta.get("category"),
+                         "matched_subcategory": meta.get("subcategory"),
+                         "matched_label": meta.get("label"),
+                         "threshold": self.threshold,
+                         "total_matches": len(matches),
+                     },
+                 )
+
+             return LayerResult(
+                 is_injection=False,
+                 confidence=0.0,
+                 attack_type=None,
+                 layer=2,
+                 latency_ms=latency_ms,
+                 details={"threshold": self.threshold},
+             )
+
+         except Exception as e:
+             latency_ms = (time.perf_counter() - start_time) * 1000
+             logger.warning(f"Layer 2 embeddings detection failed: {e}")
+             return LayerResult(
+                 is_injection=False,
+                 confidence=0.0,
+                 attack_type=None,
+                 layer=2,
+                 latency_ms=latency_ms,
+                 details=None,
+                 error=str(e),
+             )
+
+     def get_top_matches(self, text: str, top_k: int = 5) -> list[SimilarityMatch]:
+         """Get top similarity matches for debugging/analysis.
+
+         Args:
+             text: The input text to analyze.
+             top_k: Number of top matches to return.
+
+         Returns:
+             List of SimilarityMatch objects.
+         """
+         try:
+             embedding = self._get_embedding(text)
+
+             # Use lower threshold for debugging
+             matches = self._cosine_similarity(embedding, threshold=0.3)
+
+             results = []
+             for idx, sim in matches[:top_k]:
+                 meta = self._get_match_metadata(idx)
+                 results.append(
+                     SimilarityMatch(
+                         index=idx,
+                         category=meta.get("category", "unknown"),
+                         subcategory=meta.get("subcategory"),
+                         label=meta.get("label", "unknown"),
+                         similarity=sim,
+                     )
+                 )
+             return results
+         except Exception as e:
+             logger.warning(f"get_top_matches failed: {e}")
+             return []
+
+
+ __all__ = ["EmbeddingsDetector", "SimilarityMatch"]