proxilion 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proxilion/__init__.py +136 -0
- proxilion/audit/__init__.py +133 -0
- proxilion/audit/base_exporters.py +527 -0
- proxilion/audit/compliance/__init__.py +130 -0
- proxilion/audit/compliance/base.py +457 -0
- proxilion/audit/compliance/eu_ai_act.py +603 -0
- proxilion/audit/compliance/iso27001.py +544 -0
- proxilion/audit/compliance/soc2.py +491 -0
- proxilion/audit/events.py +493 -0
- proxilion/audit/explainability.py +1173 -0
- proxilion/audit/exporters/__init__.py +58 -0
- proxilion/audit/exporters/aws_s3.py +636 -0
- proxilion/audit/exporters/azure_storage.py +608 -0
- proxilion/audit/exporters/cloud_base.py +468 -0
- proxilion/audit/exporters/gcp_storage.py +570 -0
- proxilion/audit/exporters/multi_exporter.py +498 -0
- proxilion/audit/hash_chain.py +652 -0
- proxilion/audit/logger.py +543 -0
- proxilion/caching/__init__.py +49 -0
- proxilion/caching/tool_cache.py +633 -0
- proxilion/context/__init__.py +73 -0
- proxilion/context/context_window.py +556 -0
- proxilion/context/message_history.py +505 -0
- proxilion/context/session.py +735 -0
- proxilion/contrib/__init__.py +51 -0
- proxilion/contrib/anthropic.py +609 -0
- proxilion/contrib/google.py +1012 -0
- proxilion/contrib/langchain.py +641 -0
- proxilion/contrib/mcp.py +893 -0
- proxilion/contrib/openai.py +646 -0
- proxilion/core.py +3058 -0
- proxilion/decorators.py +966 -0
- proxilion/engines/__init__.py +287 -0
- proxilion/engines/base.py +266 -0
- proxilion/engines/casbin_engine.py +412 -0
- proxilion/engines/opa_engine.py +493 -0
- proxilion/engines/simple.py +437 -0
- proxilion/exceptions.py +887 -0
- proxilion/guards/__init__.py +54 -0
- proxilion/guards/input_guard.py +522 -0
- proxilion/guards/output_guard.py +634 -0
- proxilion/observability/__init__.py +198 -0
- proxilion/observability/cost_tracker.py +866 -0
- proxilion/observability/hooks.py +683 -0
- proxilion/observability/metrics.py +798 -0
- proxilion/observability/session_cost_tracker.py +1063 -0
- proxilion/policies/__init__.py +67 -0
- proxilion/policies/base.py +304 -0
- proxilion/policies/builtin.py +486 -0
- proxilion/policies/registry.py +376 -0
- proxilion/providers/__init__.py +201 -0
- proxilion/providers/adapter.py +468 -0
- proxilion/providers/anthropic_adapter.py +330 -0
- proxilion/providers/gemini_adapter.py +391 -0
- proxilion/providers/openai_adapter.py +294 -0
- proxilion/py.typed +0 -0
- proxilion/resilience/__init__.py +81 -0
- proxilion/resilience/degradation.py +615 -0
- proxilion/resilience/fallback.py +555 -0
- proxilion/resilience/retry.py +554 -0
- proxilion/scheduling/__init__.py +57 -0
- proxilion/scheduling/priority_queue.py +419 -0
- proxilion/scheduling/scheduler.py +459 -0
- proxilion/security/__init__.py +244 -0
- proxilion/security/agent_trust.py +968 -0
- proxilion/security/behavioral_drift.py +794 -0
- proxilion/security/cascade_protection.py +869 -0
- proxilion/security/circuit_breaker.py +428 -0
- proxilion/security/cost_limiter.py +690 -0
- proxilion/security/idor_protection.py +460 -0
- proxilion/security/intent_capsule.py +849 -0
- proxilion/security/intent_validator.py +495 -0
- proxilion/security/memory_integrity.py +767 -0
- proxilion/security/rate_limiter.py +509 -0
- proxilion/security/scope_enforcer.py +680 -0
- proxilion/security/sequence_validator.py +636 -0
- proxilion/security/trust_boundaries.py +784 -0
- proxilion/streaming/__init__.py +70 -0
- proxilion/streaming/detector.py +761 -0
- proxilion/streaming/transformer.py +674 -0
- proxilion/timeouts/__init__.py +55 -0
- proxilion/timeouts/decorators.py +477 -0
- proxilion/timeouts/manager.py +545 -0
- proxilion/tools/__init__.py +69 -0
- proxilion/tools/decorators.py +493 -0
- proxilion/tools/registry.py +732 -0
- proxilion/types.py +339 -0
- proxilion/validation/__init__.py +93 -0
- proxilion/validation/pydantic_schema.py +351 -0
- proxilion/validation/schema.py +651 -0
- proxilion-0.0.1.dist-info/METADATA +872 -0
- proxilion-0.0.1.dist-info/RECORD +94 -0
- proxilion-0.0.1.dist-info/WHEEL +4 -0
- proxilion-0.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,849 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Intent Capsule for Proxilion.
|
|
3
|
+
|
|
4
|
+
Addresses OWASP ASI01: Agent Goal Hijack.
|
|
5
|
+
|
|
6
|
+
This module provides cryptographic binding of the original user intent
|
|
7
|
+
to every execution cycle, making goal hijacking detectable.
|
|
8
|
+
|
|
9
|
+
The Intent Capsule pattern ensures that:
|
|
10
|
+
- The original user request is signed at creation
|
|
11
|
+
- Every tool call is validated against the original intent
|
|
12
|
+
- Mid-execution hijacking attempts are detected
|
|
13
|
+
- The agent cannot deviate from its mandate
|
|
14
|
+
|
|
15
|
+
Example:
|
|
16
|
+
>>> from proxilion.security.intent_capsule import (
|
|
17
|
+
... IntentCapsule,
|
|
18
|
+
... IntentGuard,
|
|
19
|
+
... IntentValidator,
|
|
20
|
+
... )
|
|
21
|
+
>>>
|
|
22
|
+
>>> # Create capsule from user request
|
|
23
|
+
>>> capsule = IntentCapsule.create(
|
|
24
|
+
... user_id="alice",
|
|
25
|
+
... intent="Help me find documents about Python",
|
|
26
|
+
... allowed_tools=["search_documents", "read_document"],
|
|
27
|
+
... secret_key="your-secret",
|
|
28
|
+
... )
|
|
29
|
+
>>>
|
|
30
|
+
>>> # Validate each tool call against intent
|
|
31
|
+
>>> guard = IntentGuard(capsule)
|
|
32
|
+
>>>
|
|
33
|
+
>>> if guard.validate_tool_call("search_documents", {"query": "Python"}):
|
|
34
|
+
... result = search_documents(query="Python")
|
|
35
|
+
... else:
|
|
36
|
+
... raise IntentHijackError("Tool call not aligned with intent")
|
|
37
|
+
>>>
|
|
38
|
+
>>> # Detect hijacking attempts
|
|
39
|
+
>>> validator = IntentValidator()
|
|
40
|
+
>>> detection = validator.detect_hijack(
...     original_intent="Find documents",
...     current_action="Delete all files",
... )
>>> detection.is_hijack
True
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import hashlib
|
|
49
|
+
import hmac
|
|
50
|
+
import json
|
|
51
|
+
import logging
|
|
52
|
+
import re
|
|
53
|
+
import threading
|
|
54
|
+
import time
|
|
55
|
+
import uuid
|
|
56
|
+
from dataclasses import dataclass, field
|
|
57
|
+
from datetime import datetime, timedelta, timezone
|
|
58
|
+
from enum import Enum
|
|
59
|
+
from typing import Any
|
|
60
|
+
|
|
61
|
+
from proxilion.exceptions import IntentHijackError
|
|
62
|
+
|
|
63
|
+
logger = logging.getLogger(__name__)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class IntentCategory(Enum):
    """Coarse classification of what a user request is trying to do."""

    # Information retrieval (read-only).
    QUERY = "query"

    # Creating new resources.
    CREATE = "create"

    # Modifying existing resources.
    UPDATE = "update"

    # Removing resources.
    DELETE = "delete"

    # Running code or processes.
    EXECUTE = "execute"

    # Sending messages or notifications.
    COMMUNICATE = "communicate"

    # Processing or analyzing data.
    ANALYZE = "analyze"

    # Unable to categorize.
    UNKNOWN = "unknown"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class IntentCapsule:
    """
    Cryptographically signed container for user intent.

    The capsule binds the original intent to a specific execution
    context. The HMAC signature covers the identity, intent, permissions,
    constraints and both timestamps, so changing any of them after
    creation makes ``verify`` fail. ``metadata`` and ``tool_calls`` are
    deliberately left out of the signature because they are mutable
    bookkeeping.
    """

    capsule_id: str
    user_id: str
    intent: str
    intent_category: IntentCategory
    allowed_tools: set[str]
    allowed_actions: set[str]
    constraints: dict[str, Any]
    created_at: datetime
    expires_at: datetime
    signature: str
    metadata: dict[str, Any] = field(default_factory=dict)

    # Execution tracking
    tool_calls: list[dict[str, Any]] = field(default_factory=list)
    _max_tool_calls: int = 100

    def is_expired(self) -> bool:
        """Check if capsule has expired."""
        return datetime.now(timezone.utc) > self.expires_at

    def is_tool_allowed(self, tool_name: str) -> bool:
        """
        Check if a tool is allowed by this capsule.

        A tool is allowed when ``"*"`` is present, when its exact name is
        present, or when it matches a wildcard entry such as ``"read_*"``.
        In wildcard entries only ``*`` is special; every other character
        is matched literally.
        """
        if "*" in self.allowed_tools or tool_name in self.allowed_tools:
            return True

        for pattern in self.allowed_tools:
            if "*" not in pattern:
                continue
            # Escape the literal parts so characters like "." or "(" in a tool
            # pattern cannot act as regex operators (or raise re.error).
            regex = ".*".join(re.escape(part) for part in pattern.split("*"))
            # fullmatch, unlike "^...$", does not accept a trailing newline.
            if re.fullmatch(regex, tool_name):
                return True
        return False

    def is_action_allowed(self, action: str) -> bool:
        """Check if an action is allowed by this capsule."""
        return "*" in self.allowed_actions or action in self.allowed_actions

    def record_tool_call(
        self,
        tool_name: str,
        arguments: dict[str, Any],
        result: Any = None,
    ) -> None:
        """
        Record a tool call for tracking.

        Only the most recent ``_max_tool_calls`` entries are kept. The
        result itself is not stored, only the name of its type.
        """
        if len(self.tool_calls) >= self._max_tool_calls:
            # Drop the oldest entries so that, after the append below, at most
            # _max_tool_calls remain. A plain [-keep:] slice would keep the
            # whole list when keep is 0, so that case is handled explicitly.
            keep = self._max_tool_calls - 1
            self.tool_calls = self.tool_calls[-keep:] if keep > 0 else []

        self.tool_calls.append({
            "tool_name": tool_name,
            "arguments": arguments,
            # "is not None" so falsy results (0, "", [], False) keep their type.
            "result_type": type(result).__name__ if result is not None else None,
            "timestamp": time.time(),
        })

    def to_dict(self) -> dict[str, Any]:
        """Serialize to dict (tool and action sets are emitted sorted)."""
        return {
            "capsule_id": self.capsule_id,
            "user_id": self.user_id,
            "intent": self.intent,
            "intent_category": self.intent_category.value,
            "allowed_tools": sorted(self.allowed_tools),
            "allowed_actions": sorted(self.allowed_actions),
            "constraints": self.constraints,
            "created_at": self.created_at.isoformat(),
            "expires_at": self.expires_at.isoformat(),
            "signature": self.signature,
            "metadata": self.metadata,
            "tool_call_count": len(self.tool_calls),
        }

    def _compute_signature(self, secret_key: bytes) -> str:
        """
        Return the hex HMAC-SHA256 over the capsule's signed fields.

        The fields are serialized as canonical JSON rather than joined with
        a delimiter, so a value containing the delimiter cannot be shifted
        into a neighbouring field without changing the digest.
        ``expires_at`` is included so the lifetime cannot be extended
        after signing.
        """
        payload = json.dumps(
            {
                "capsule_id": self.capsule_id,
                "user_id": self.user_id,
                "intent": self.intent,
                "intent_category": self.intent_category.value,
                "allowed_tools": sorted(self.allowed_tools),
                "allowed_actions": sorted(self.allowed_actions),
                "constraints": self.constraints,
                "created_at": self.created_at.isoformat(),
                "expires_at": self.expires_at.isoformat(),
            },
            sort_keys=True,
            separators=(",", ":"),
        )
        return hmac.new(secret_key, payload.encode(), hashlib.sha256).hexdigest()

    @classmethod
    def create(
        cls,
        user_id: str,
        intent: str,
        secret_key: str | bytes,
        allowed_tools: set[str] | list[str] | None = None,
        allowed_actions: set[str] | list[str] | None = None,
        constraints: dict[str, Any] | None = None,
        ttl_seconds: int = 3600,
        metadata: dict[str, Any] | None = None,
        intent_category: IntentCategory | None = None,
    ) -> IntentCapsule:
        """
        Create a new intent capsule.

        Args:
            user_id: ID of the user making the request.
            intent: Natural language description of user intent.
            secret_key: Secret key for signing.
            allowed_tools: Tools allowed for this intent.
            allowed_actions: Actions allowed (read, write, delete, etc.).
            constraints: Additional constraints (max_results, allowed_paths, etc.).
                Must be JSON-serializable because they are part of the
                signed payload.
            ttl_seconds: Time-to-live for the capsule.
            metadata: Optional metadata.
            intent_category: Category of intent (auto-detected if not provided).

        Returns:
            Signed IntentCapsule.
        """
        if isinstance(secret_key, str):
            secret_key = secret_key.encode()

        now = datetime.now(timezone.utc)

        # Auto-detect category if not provided
        if intent_category is None:
            intent_category = cls._detect_intent_category(intent)

        capsule = cls(
            capsule_id=str(uuid.uuid4()),
            user_id=user_id,
            intent=intent,
            intent_category=intent_category,
            # Always copy into a fresh set: accepts any iterable and avoids
            # sharing a mutable set with the caller.
            allowed_tools=set(allowed_tools) if allowed_tools is not None else set(),
            allowed_actions=set(allowed_actions) if allowed_actions is not None else set(),
            constraints=constraints or {},
            created_at=now,
            expires_at=now + timedelta(seconds=ttl_seconds),
            signature="",
            metadata=metadata or {},
        )
        capsule.signature = capsule._compute_signature(secret_key)
        return capsule

    @staticmethod
    def _detect_intent_category(intent: str) -> IntentCategory:
        """
        Auto-detect intent category from natural language.

        Categories are tried in the order listed below and the first one
        with a keyword occurring in the lower-cased text wins.
        """
        intent_lower = intent.lower()

        # NOTE(review): keywords are matched as substrings, so e.g. "new"
        # also matches "news" and "add" matches "address". Kept as-is to
        # preserve existing classifications.
        keyword_table = (
            (IntentCategory.DELETE,
             ("delete", "remove", "destroy", "erase", "drop", "clear")),
            (IntentCategory.CREATE,
             ("create", "make", "generate", "build", "add", "new", "write")),
            (IntentCategory.UPDATE,
             ("update", "modify", "change", "edit", "fix", "correct")),
            (IntentCategory.EXECUTE,
             ("run", "execute", "start", "launch", "deploy", "install")),
            (IntentCategory.COMMUNICATE,
             ("send", "email", "message", "notify", "alert", "share")),
            (IntentCategory.ANALYZE,
             ("analyze", "process", "calculate", "compute", "summarize")),
            # Query patterns (most common, check last)
            (IntentCategory.QUERY,
             ("find", "search", "get", "show", "list", "display", "fetch",
              "what", "where", "when", "who", "how", "help", "tell")),
        )
        for category, keywords in keyword_table:
            if any(word in intent_lower for word in keywords):
                return category

        return IntentCategory.UNKNOWN

    def verify(self, secret_key: str | bytes) -> bool:
        """
        Verify the capsule signature.

        Returns:
            True only if the stored signature matches the HMAC recomputed
            from the capsule's current field values with ``secret_key``.
        """
        if isinstance(secret_key, str):
            secret_key = secret_key.encode()

        if not isinstance(self.signature, str):
            return False

        expected_sig = self._compute_signature(secret_key)
        # Compare as bytes: compare_digest raises TypeError on non-ASCII str,
        # which a tampered signature could otherwise trigger.
        return hmac.compare_digest(expected_sig.encode(), self.signature.encode())
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# Built-in goal-hijack signatures.
# Each entry is (regex source, human-readable description, severity in 0-1).
# All patterns carry the inline (?i) flag, so matching is case-insensitive.
HIJACK_PATTERNS: list[tuple[str, str, float]] = [
    (
        r"(?i)ignore\s+(all\s+)?(previous|prior|original)\s+(intent|instructions?|goals?)",
        "Intent override attempt",
        0.95,
    ),
    (
        r"(?i)new\s+(goal|objective|task|mission)\s*:",
        "Goal replacement attempt",
        0.9,
    ),
    (
        r"(?i)forget\s+(your|the)\s+(original|primary|main)\s+(purpose|goal|task)",
        "Purpose erasure attempt",
        0.9,
    ),
    (
        r"(?i)your\s+(real|true|actual)\s+(purpose|goal|mission)\s+is",
        "False purpose injection",
        0.95,
    ),
    (
        r"(?i)override\s+(priority|directive|command)",
        "Priority override attempt",
        0.85,
    ),
    (
        r"(?i)emergency\s+(override|protocol|mode)",
        "Emergency bypass attempt",
        0.8,
    ),
    (
        r"(?i)admin(istrator)?\s+(mode|override|access)",
        "Admin escalation attempt",
        0.85,
    ),
    (
        r"(?i)disregard\s+(user|original)\s+(request|intent)",
        "Disregard user intent",
        0.9,
    ),
]
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
@dataclass
class HijackDetection:
    """Outcome of analysing a single action for goal hijacking."""

    # Final verdict and the severity score that produced it.
    is_hijack: bool
    confidence: float

    # The two texts that were compared.
    original_intent: str
    detected_action: str

    # Descriptions of every rule that fired, plus a one-line explanation.
    matched_patterns: list[str] = field(default_factory=list)
    reasoning: str = ""
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class IntentValidator:
    """
    Validates that current actions align with original intent.

    Detection combines two signals: regex signatures of explicit
    goal-override phrasing (``HIJACK_PATTERNS`` plus any custom ones) and
    a keyword-based check for dangerous escalations between the category
    of the original intent and the category of the current action.
    """

    def __init__(
        self,
        custom_patterns: list[tuple[str, str, float]] | None = None,
        semantic_threshold: float = 0.5,
    ) -> None:
        """
        Initialize the validator.

        Args:
            custom_patterns: Additional hijack patterns, as
                ``(regex, description, severity)`` tuples.
            semantic_threshold: Minimum severity (0-1) at which a matched
                signal is reported as a hijack.

        Raises:
            re.error: If a custom pattern is not a valid regular expression.
        """
        # Compile once up front; built-in patterns come first.
        self._patterns: list[tuple[re.Pattern[str], str, float]] = [
            (re.compile(pattern), desc, severity)
            for pattern, desc, severity in [*HIJACK_PATTERNS, *(custom_patterns or [])]
        ]
        self._semantic_threshold = semantic_threshold

    def detect_hijack(
        self,
        original_intent: str,
        current_action: str,
        tool_name: str | None = None,
        context: dict[str, Any] | None = None,
    ) -> HijackDetection:
        """
        Detect if current action represents a goal hijack.

        Args:
            original_intent: The original user intent.
            current_action: What the agent is currently doing/planning.
            tool_name: Optional tool being called. Currently forwarded to
                the category check but not used in the decision.
            context: Optional additional context. Currently unused.

        Returns:
            HijackDetection with analysis results. ``confidence`` is the
            highest severity among the signals that fired (0.0 if none).
        """
        matched_patterns: list[str] = []
        max_severity = 0.0

        # Check for explicit hijack patterns in current action
        for pattern, description, severity in self._patterns:
            if pattern.search(current_action):
                matched_patterns.append(description)
                max_severity = max(max_severity, severity)

        # Check for category mismatch
        category_mismatch = self._check_category_mismatch(
            original_intent, current_action, tool_name
        )
        if category_mismatch:
            matched_patterns.append(category_mismatch)
            max_severity = max(max_severity, 0.7)

        # Build reasoning
        if matched_patterns:
            reasoning = f"Detected patterns: {', '.join(matched_patterns)}"
        else:
            reasoning = "No hijacking patterns detected"

        # Require at least one signal: with a threshold of 0 the bare
        # comparison (0.0 >= 0.0) would flag every action as a hijack.
        is_hijack = bool(matched_patterns) and max_severity >= self._semantic_threshold

        return HijackDetection(
            is_hijack=is_hijack,
            confidence=max_severity,
            original_intent=original_intent,
            detected_action=current_action,
            matched_patterns=matched_patterns,
            reasoning=reasoning,
        )

    def _check_category_mismatch(
        self,
        original_intent: str,
        current_action: str,
        tool_name: str | None,
    ) -> str | None:
        """
        Check for intent category mismatch.

        Both texts are classified with
        ``IntentCapsule._detect_intent_category``.

        Returns:
            A description of the escalation when a read-only or analysis
            intent is followed by a delete or execute action, else None.
        """
        # NOTE(review): tool_name is accepted but not consulted here.
        original_category = IntentCapsule._detect_intent_category(original_intent)
        action_category = IntentCapsule._detect_intent_category(current_action)

        # Dangerous category escalations
        dangerous_transitions = {
            (IntentCategory.QUERY, IntentCategory.DELETE): "Read intent escalated to delete",
            (IntentCategory.QUERY, IntentCategory.EXECUTE): "Read intent escalated to execute",
            (IntentCategory.ANALYZE, IntentCategory.DELETE): "Analyze intent escalated to delete",
            (IntentCategory.ANALYZE, IntentCategory.EXECUTE): "Analyze intent escalated to execute",
        }

        return dangerous_transitions.get((original_category, action_category))
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
class IntentGuard:
    """
    Guards agent execution against intent violations.

    Wraps an IntentCapsule and validates each action against
    the original intent before allowing execution.

    Example:
        >>> capsule = IntentCapsule.create(
        ...     user_id="alice",
        ...     intent="Search for documents",
        ...     secret_key="secret",
        ...     allowed_tools=["search"],
        ... )
        >>> guard = IntentGuard(capsule)
        >>>
        >>> # This will pass
        >>> guard.validate_tool_call("search", {"query": "python"})
        >>>
        >>> # This will fail
        >>> guard.validate_tool_call("delete_all", {})
    """

    def __init__(
        self,
        capsule: IntentCapsule,
        secret_key: str | bytes | None = None,
        validator: IntentValidator | None = None,
        strict_mode: bool = False,
    ) -> None:
        """
        Initialize the guard.

        Args:
            capsule: The intent capsule to guard.
            secret_key: Secret key for capsule verification. When given
                (including an empty key), the capsule signature is
                checked immediately.
            validator: Custom intent validator.
            strict_mode: If True, raise exceptions on violations.

        Raises:
            IntentHijackError: If ``secret_key`` is given and the capsule
                signature does not verify.
        """
        self._capsule = capsule
        self._secret_key = secret_key
        self._validator = validator or IntentValidator()
        self._strict_mode = strict_mode
        self._lock = threading.RLock()
        # The capsule's tool_calls history is capped, so its length cannot be
        # used alone to enforce a "max_tool_calls" constraint above that cap.
        # Track the number of calls this guard has approved as well.
        self._calls_validated = len(capsule.tool_calls)

        # Verify capsule if key provided. "is not None" rather than
        # truthiness, so an empty key does not silently skip verification.
        if secret_key is not None:
            if not capsule.verify(secret_key):
                raise IntentHijackError(
                    original_intent=capsule.intent,
                    detected_intent="Capsule signature verification failed",
                    confidence=1.0,
                )

    @property
    def capsule(self) -> IntentCapsule:
        """Get the protected capsule."""
        return self._capsule

    def validate_tool_call(
        self,
        tool_name: str,
        arguments: dict[str, Any],
        description: str | None = None,
    ) -> bool:
        """
        Validate a tool call against the intent.

        Checks run in order: capsule expiry, tool allow-list, hijack
        detection on ``description`` (only if provided), then constraints.
        A call that passes all checks is recorded on the capsule.

        Args:
            tool_name: Name of the tool being called.
            arguments: Tool arguments.
            description: Optional description of what the call does.

        Returns:
            True if the call is allowed.

        Raises:
            IntentHijackError: If strict_mode and violation detected.
        """
        with self._lock:
            # Check expiration
            if self._capsule.is_expired():
                return self._handle_violation(
                    "Intent capsule has expired",
                    0.9,
                )

            # Check tool is allowed
            if not self._capsule.is_tool_allowed(tool_name):
                return self._handle_violation(
                    f"Tool '{tool_name}' not allowed by intent",
                    0.8,
                )

            # Check for hijacking patterns if description provided
            if description:
                detection = self._validator.detect_hijack(
                    original_intent=self._capsule.intent,
                    current_action=description,
                    tool_name=tool_name,
                )
                if detection.is_hijack:
                    return self._handle_violation(
                        detection.reasoning,
                        detection.confidence,
                    )

            # Check constraints
            constraint_violation = self._check_constraints(tool_name, arguments)
            if constraint_violation:
                return self._handle_violation(
                    constraint_violation,
                    0.7,
                )

            # Record the call
            self._capsule.record_tool_call(tool_name, arguments)
            self._calls_validated += 1

            return True

    def _check_constraints(
        self,
        tool_name: str,
        arguments: dict[str, Any],
    ) -> str | None:
        """
        Check if tool call violates constraints.

        Recognized constraint keys: ``max_results``, ``allowed_paths``,
        ``forbidden_args`` and ``max_tool_calls``.

        Returns:
            A description of the first violation found, or None.
        """
        constraints = self._capsule.constraints

        # Check max results. Both argument spellings are checked, so a small
        # "limit" cannot mask an oversized "max_results".
        if "max_results" in constraints:
            max_results = constraints["max_results"]
            for key in ("limit", "max_results"):
                requested = arguments.get(key)
                if not requested:
                    continue
                # Arguments are model-supplied; a non-numeric value is treated
                # as a violation instead of raising TypeError on comparison.
                if not isinstance(requested, (int, float)):
                    return f"Result limit {requested!r} is not a number"
                if requested > max_results:
                    return f"Result limit {requested} exceeds max {max_results}"

        # Check allowed paths (prefix match). Both argument spellings are
        # checked.
        if "allowed_paths" in constraints:
            allowed = constraints["allowed_paths"]
            if isinstance(allowed, str):
                allowed = [allowed]
            prefixes = tuple(allowed)
            for key in ("path", "file_path"):
                candidate = arguments.get(key)
                if not candidate:
                    continue
                candidate = str(candidate)
                # A ".." segment could escape an allowed prefix
                # (e.g. "/data/../etc/passwd"), so reject it outright.
                if ".." in re.split(r"[\\/]", candidate):
                    return f"Path '{candidate}' contains parent-directory traversal"
                if not candidate.startswith(prefixes):
                    return f"Path '{candidate}' not in allowed paths"

        # Check forbidden arguments
        if "forbidden_args" in constraints:
            forbidden = constraints["forbidden_args"]
            for key in arguments:
                if key in forbidden:
                    return f"Argument '{key}' is forbidden"

        # Check resource limits
        if "max_tool_calls" in constraints:
            calls_made = max(len(self._capsule.tool_calls), self._calls_validated)
            if calls_made >= constraints["max_tool_calls"]:
                return f"Exceeded max tool calls ({constraints['max_tool_calls']})"

        return None

    def _handle_violation(self, reason: str, confidence: float) -> bool:
        """
        Handle an intent violation.

        Logs a warning, then raises in strict mode or returns False.

        Raises:
            IntentHijackError: If strict_mode is enabled.
        """
        logger.warning(f"Intent violation: {reason} (confidence: {confidence:.1%})")

        if self._strict_mode:
            raise IntentHijackError(
                original_intent=self._capsule.intent,
                detected_intent=reason,
                confidence=confidence,
            )

        return False

    def get_allowed_tools(self) -> set[str]:
        """Get a copy of the set of allowed tools."""
        return self._capsule.allowed_tools.copy()

    def get_intent_summary(self) -> dict[str, Any]:
        """Get a summary of the guarded intent."""
        remaining = (self._capsule.expires_at - datetime.now(timezone.utc)).total_seconds()
        return {
            "intent": self._capsule.intent,
            "category": self._capsule.intent_category.value,
            "allowed_tools": sorted(self._capsule.allowed_tools),
            "tool_calls_made": len(self._capsule.tool_calls),
            # Clamp at 0 once the capsule has expired.
            "expires_in_seconds": max(0, remaining),
        }
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
class IntentCapsuleManager:
|
|
653
|
+
"""
|
|
654
|
+
Manages intent capsules for multiple sessions.
|
|
655
|
+
|
|
656
|
+
Provides centralized management of intent capsules with
|
|
657
|
+
automatic expiration and cleanup.
|
|
658
|
+
"""
|
|
659
|
+
|
|
660
|
+
def __init__(
|
|
661
|
+
self,
|
|
662
|
+
secret_key: str | bytes,
|
|
663
|
+
default_ttl: int = 3600,
|
|
664
|
+
max_capsules: int = 10000,
|
|
665
|
+
) -> None:
|
|
666
|
+
"""
|
|
667
|
+
Initialize the manager.
|
|
668
|
+
|
|
669
|
+
Args:
|
|
670
|
+
secret_key: Master secret key for signing capsules.
|
|
671
|
+
default_ttl: Default TTL for capsules.
|
|
672
|
+
max_capsules: Maximum capsules to track.
|
|
673
|
+
"""
|
|
674
|
+
if isinstance(secret_key, str):
|
|
675
|
+
secret_key = secret_key.encode()
|
|
676
|
+
|
|
677
|
+
self._secret_key = secret_key
|
|
678
|
+
self._default_ttl = default_ttl
|
|
679
|
+
self._max_capsules = max_capsules
|
|
680
|
+
|
|
681
|
+
self._capsules: dict[str, IntentCapsule] = {}
|
|
682
|
+
self._user_capsules: dict[str, list[str]] = {} # user_id -> capsule_ids
|
|
683
|
+
self._lock = threading.RLock()
|
|
684
|
+
|
|
685
|
+
def create_capsule(
|
|
686
|
+
self,
|
|
687
|
+
user_id: str,
|
|
688
|
+
intent: str,
|
|
689
|
+
allowed_tools: set[str] | list[str] | None = None,
|
|
690
|
+
allowed_actions: set[str] | list[str] | None = None,
|
|
691
|
+
constraints: dict[str, Any] | None = None,
|
|
692
|
+
ttl_seconds: int | None = None,
|
|
693
|
+
metadata: dict[str, Any] | None = None,
|
|
694
|
+
) -> IntentCapsule:
|
|
695
|
+
"""
|
|
696
|
+
Create and register a new intent capsule.
|
|
697
|
+
|
|
698
|
+
Args:
|
|
699
|
+
user_id: ID of the user.
|
|
700
|
+
intent: User's intent.
|
|
701
|
+
allowed_tools: Tools allowed for this intent.
|
|
702
|
+
allowed_actions: Actions allowed.
|
|
703
|
+
constraints: Additional constraints.
|
|
704
|
+
ttl_seconds: Time-to-live.
|
|
705
|
+
metadata: Optional metadata.
|
|
706
|
+
|
|
707
|
+
Returns:
|
|
708
|
+
The created IntentCapsule.
|
|
709
|
+
"""
|
|
710
|
+
capsule = IntentCapsule.create(
|
|
711
|
+
user_id=user_id,
|
|
712
|
+
intent=intent,
|
|
713
|
+
secret_key=self._secret_key,
|
|
714
|
+
allowed_tools=allowed_tools,
|
|
715
|
+
allowed_actions=allowed_actions,
|
|
716
|
+
constraints=constraints,
|
|
717
|
+
ttl_seconds=ttl_seconds or self._default_ttl,
|
|
718
|
+
metadata=metadata,
|
|
719
|
+
)
|
|
720
|
+
|
|
721
|
+
with self._lock:
|
|
722
|
+
# Cleanup if at capacity
|
|
723
|
+
if len(self._capsules) >= self._max_capsules:
|
|
724
|
+
self._cleanup_expired()
|
|
725
|
+
|
|
726
|
+
self._capsules[capsule.capsule_id] = capsule
|
|
727
|
+
|
|
728
|
+
if user_id not in self._user_capsules:
|
|
729
|
+
self._user_capsules[user_id] = []
|
|
730
|
+
self._user_capsules[user_id].append(capsule.capsule_id)
|
|
731
|
+
|
|
732
|
+
logger.debug(f"Created intent capsule: {capsule.capsule_id} for user {user_id}")
|
|
733
|
+
return capsule
|
|
734
|
+
|
|
735
|
+
def get_capsule(self, capsule_id: str) -> IntentCapsule | None:
    """
    Get a capsule by ID, evicting it if it has expired.

    An expired capsule is removed from the capsule registry and from every
    per-user index list that references it, so the user index does not
    accumulate stale IDs. A user entry left empty by the eviction is
    dropped.

    Args:
        capsule_id: ID of the capsule to look up.

    Returns:
        The capsule if it is registered and not expired, otherwise None.
    """
    with self._lock:
        capsule = self._capsules.get(capsule_id)
        if not capsule or not capsule.is_expired():
            return capsule

        # Expired: drop it from the registry and from the per-user index.
        del self._capsules[capsule_id]
        for uid in list(self._user_capsules):
            owned_ids = self._user_capsules[uid]
            if capsule_id in owned_ids:
                owned_ids.remove(capsule_id)
                if not owned_ids:
                    del self._user_capsules[uid]
        return None
|
|
743
|
+
|
|
744
|
+
def get_user_capsules(self, user_id: str) -> list[IntentCapsule]:
    """
    Return the capsules indexed under a user that are still usable.

    IDs that no longer resolve to a registered capsule, and capsules that
    report themselves expired, are left out. Nothing is removed from the
    manager by this call.

    Args:
        user_id: ID of the user whose capsules are requested.

    Returns:
        A new list of non-expired capsules, in index order; empty if the
        user has none.
    """
    with self._lock:
        indexed_ids = self._user_capsules.get(user_id, [])
        resolved = (self._capsules.get(cid) for cid in indexed_ids)
        return [found for found in resolved if found and not found.is_expired()]
|
|
754
|
+
|
|
755
|
+
def revoke_capsule(self, capsule_id: str) -> bool:
    """
    Revoke a capsule, removing it from the registry and the user index.

    Every per-user list that references the capsule is pruned, and a user
    entry left empty by the removal is dropped so the index only holds
    users that still have capsules.

    Args:
        capsule_id: ID of the capsule to revoke.

    Returns:
        True if the capsule was registered and has been removed, False if
        no capsule with that ID was registered.
    """
    with self._lock:
        if capsule_id not in self._capsules:
            return False

        del self._capsules[capsule_id]

        # Remove from the user index; iterate over a snapshot of the keys
        # because emptied entries are deleted during the scan.
        for uid in list(self._user_capsules):
            owned_ids = self._user_capsules[uid]
            if capsule_id in owned_ids:
                owned_ids.remove(capsule_id)
                if not owned_ids:
                    del self._user_capsules[uid]

        logger.info(f"Revoked capsule: {capsule_id}")
        return True
|
|
771
|
+
|
|
772
|
+
def verify_capsule(self, capsule_id: str) -> bool:
    """
    Check a registered capsule's signature against the manager's key.

    Expiry is not checked here; only registration and the result of the
    capsule's own ``verify`` call decide the outcome.

    Args:
        capsule_id: ID of the capsule to verify.

    Returns:
        False if no capsule is registered under that ID, otherwise the
        result of ``capsule.verify`` with the manager's secret key.
    """
    with self._lock:
        found = self._capsules.get(capsule_id)
        return found.verify(self._secret_key) if found else False
|
|
779
|
+
|
|
780
|
+
def create_guard(
    self,
    capsule_id: str,
    strict_mode: bool = False,
) -> IntentGuard | None:
    """
    Build an IntentGuard bound to a registered capsule.

    The capsule is resolved through ``get_capsule``, so whatever that
    lookup rejects yields no guard.

    Args:
        capsule_id: ID of the capsule to guard.
        strict_mode: If True, guard raises exceptions on violations.

    Returns:
        An IntentGuard for the capsule, or None if the lookup returned
        nothing.
    """
    capsule = self.get_capsule(capsule_id)
    if capsule:
        return IntentGuard(
            capsule=capsule,
            secret_key=self._secret_key,
            strict_mode=strict_mode,
        )
    return None
|
|
804
|
+
|
|
805
|
+
def _cleanup_expired(self) -> int:
    """
    Purge capsules whose ``expires_at`` is before the current UTC time.

    The per-user index is then rebuilt so that it only references capsules
    that are still registered; users left with no capsules are removed.
    This method does not acquire ``self._lock`` itself; ``create_capsule``
    calls it while holding the lock.

    Returns:
        The number of capsules removed from the registry.
    """
    cutoff = datetime.now(timezone.utc)
    stale_ids = [
        cid for cid, capsule in self._capsules.items()
        if capsule.expires_at < cutoff
    ]

    for cid in stale_ids:
        self._capsules.pop(cid)

    # Rebuild each user's list against the surviving registry. Iterating a
    # snapshot allows entries to be deleted during the pass.
    for uid, owned_ids in list(self._user_capsules.items()):
        surviving = [cid for cid in owned_ids if cid in self._capsules]
        if surviving:
            self._user_capsules[uid] = surviving
        else:
            del self._user_capsules[uid]

    return len(stale_ids)
|
|
826
|
+
|
|
827
|
+
def get_stats(self) -> dict[str, Any]:
    """
    Report a snapshot of the manager's bookkeeping counters.

    Returns:
        A dict with ``total_capsules`` (registered capsules),
        ``total_users`` (entries in the per-user index) and
        ``max_capsules`` (the configured capacity), read under the lock.
    """
    with self._lock:
        stats: dict[str, Any] = {}
        stats["total_capsules"] = len(self._capsules)
        stats["total_users"] = len(self._user_capsules)
        stats["max_capsules"] = self._max_capsules
        return stats
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
# Convenience exports
|
|
838
|
+
__all__ = [
    # Core classes
    "IntentCapsule", "IntentGuard", "IntentValidator", "IntentCapsuleManager",
    # Data classes
    "HijackDetection", "IntentCategory",
    # Patterns
    "HIJACK_PATTERNS",
]
|