cortexhub 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortexhub/__init__.py +143 -0
- cortexhub/adapters/__init__.py +5 -0
- cortexhub/adapters/base.py +131 -0
- cortexhub/adapters/claude_agents.py +322 -0
- cortexhub/adapters/crewai.py +297 -0
- cortexhub/adapters/langgraph.py +386 -0
- cortexhub/adapters/openai_agents.py +192 -0
- cortexhub/audit/__init__.py +25 -0
- cortexhub/audit/events.py +165 -0
- cortexhub/auto_protect.py +128 -0
- cortexhub/backend/__init__.py +5 -0
- cortexhub/backend/client.py +348 -0
- cortexhub/client.py +2149 -0
- cortexhub/config.py +37 -0
- cortexhub/context/__init__.py +5 -0
- cortexhub/context/enricher.py +172 -0
- cortexhub/errors.py +123 -0
- cortexhub/frameworks.py +83 -0
- cortexhub/guardrails/__init__.py +3 -0
- cortexhub/guardrails/injection.py +180 -0
- cortexhub/guardrails/pii.py +378 -0
- cortexhub/guardrails/secrets.py +206 -0
- cortexhub/interceptors/__init__.py +3 -0
- cortexhub/interceptors/llm.py +62 -0
- cortexhub/interceptors/mcp.py +96 -0
- cortexhub/pipeline.py +92 -0
- cortexhub/policy/__init__.py +6 -0
- cortexhub/policy/effects.py +87 -0
- cortexhub/policy/evaluator.py +267 -0
- cortexhub/policy/loader.py +158 -0
- cortexhub/policy/models.py +123 -0
- cortexhub/policy/sync.py +183 -0
- cortexhub/telemetry/__init__.py +40 -0
- cortexhub/telemetry/otel.py +481 -0
- cortexhub/version.py +3 -0
- cortexhub-0.1.0.dist-info/METADATA +275 -0
- cortexhub-0.1.0.dist-info/RECORD +38 -0
- cortexhub-0.1.0.dist-info/WHEEL +4 -0
cortexhub/config.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Configuration management for CortexHub SDK.
|
|
2
|
+
|
|
3
|
+
The SDK configuration is minimal by design:
|
|
4
|
+
- Telemetry is ALWAYS governance mode (not configurable)
|
|
5
|
+
- Guardrail behavior comes from POLICIES (not SDK config)
|
|
6
|
+
- Approval behavior comes from POLICIES (not SDK config)
|
|
7
|
+
|
|
8
|
+
SDK only configures:
|
|
9
|
+
- Where to cache policies locally
|
|
10
|
+
- Connection settings (API key, URL)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Config:
    """Local settings holder for the CortexHub SDK.

    Only the location of the local policy cache is kept here; per the module
    notes, guardrail and approval behavior are driven by policies rather than
    by this object.
    """

    def __init__(
        self,
        policies_dir: str = "./policies",
    ):
        """Store the configuration values.

        Args:
            policies_dir: Directory used for the local policy cache.
        """
        self.policies_dir = policies_dir

    def validate(self) -> None:
        """Validate the configuration (currently nothing is checked)."""
        # Intentionally a no-op: no local settings are verified at present.
        return None
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Context enrichment for policy evaluation.
|
|
2
|
+
|
|
3
|
+
Adds runtime context to authorization requests:
|
|
4
|
+
- Agent roles and metadata
|
|
5
|
+
- Temporal constraints (time of day, business hours)
|
|
6
|
+
- Regulatory flags (HIPAA, SOX, GDPR)
|
|
7
|
+
- User context (when available)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import structlog
|
|
14
|
+
|
|
15
|
+
from cortexhub.policy.models import AuthorizationRequest
|
|
16
|
+
|
|
17
|
+
logger = structlog.get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AgentRegistry:
|
|
21
|
+
"""Registry mapping agent IDs to roles and metadata."""
|
|
22
|
+
|
|
23
|
+
def __init__(self):
|
|
24
|
+
"""Initialize agent registry."""
|
|
25
|
+
self._agents: dict[str, dict[str, Any]] = {}
|
|
26
|
+
|
|
27
|
+
def register(
|
|
28
|
+
self,
|
|
29
|
+
agent_id: str,
|
|
30
|
+
role: str,
|
|
31
|
+
permissions: list[str] | None = None,
|
|
32
|
+
metadata: dict[str, Any] | None = None,
|
|
33
|
+
) -> None:
|
|
34
|
+
"""Register an agent with role and permissions.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
agent_id: Unique agent identifier
|
|
38
|
+
role: Agent role (e.g., "nurse", "refund_processor")
|
|
39
|
+
permissions: List of allowed actions
|
|
40
|
+
metadata: Additional metadata (tenure, department, etc.)
|
|
41
|
+
"""
|
|
42
|
+
self._agents[agent_id] = {
|
|
43
|
+
"role": role,
|
|
44
|
+
"permissions": permissions or [],
|
|
45
|
+
"metadata": metadata or {},
|
|
46
|
+
}
|
|
47
|
+
logger.info("Agent registered", agent_id=agent_id, role=role)
|
|
48
|
+
|
|
49
|
+
def get(self, agent_id: str) -> dict[str, Any]:
|
|
50
|
+
"""Get agent metadata.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
agent_id: Agent identifier
|
|
54
|
+
|
|
55
|
+
Returns:
|
|
56
|
+
Agent metadata or empty dict if not found
|
|
57
|
+
"""
|
|
58
|
+
return self._agents.get(agent_id, {})
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ContextEnricher:
    """Enriches authorization requests with runtime context.

    Keys written into ``request.context``:
    - ``agent_role`` / ``agent_permissions`` / ``agent_metadata`` (only when
      the agent is present in the registry)
    - ``temporal`` (timestamp, hour, weekday, business-hours flag; UTC based)
    - ``regulatory`` (HIPAA / SOX / GDPR flags derived from the agent role,
      only when the agent is present in the registry)
    """

    # Roles for which the HIPAA flags are set.
    _HIPAA_ROLES = ("nurse", "doctor", "healthcare_agent", "patient_communication")
    # Roles for which the SOX flag is set.
    _SOX_ROLES = ("refund_processor", "billing_agent", "finance_agent")

    def __init__(self, agent_registry: "AgentRegistry | None" = None):
        """Initialize context enricher.

        Args:
            agent_registry: Optional agent registry for role lookup. A new
                empty AgentRegistry is created only when this is None.
        """
        # Identity check rather than truthiness, so a registry object that
        # happens to evaluate as falsy is still honored.
        self.agent_registry = AgentRegistry() if agent_registry is None else agent_registry
        logger.info("Context enricher initialized")

    @staticmethod
    def _detached(value: Any) -> Any:
        """Return a shallow copy of a list/dict; any other value is returned as is."""
        if isinstance(value, (list, dict)):
            return value.copy()
        return value

    def enrich(self, request: "AuthorizationRequest") -> "AuthorizationRequest":
        """Enrich authorization request with additional context.

        INVARIANT: Enrichment may only ADD context keys, never MUTATE args or resource.
        This protects policy correctness and audit integrity.

        Args:
            request: Base authorization request; its ``context`` mapping is
                updated in place.

        Returns:
            The same request object, with the context keys added.
        """
        # Local import: the module header only imports ``datetime`` itself.
        from datetime import timezone

        agent_id = request.principal.id
        agent_metadata = self.agent_registry.get(agent_id)

        if agent_metadata:
            request.context["agent_role"] = agent_metadata.get("role", "unknown")
            # Copy mutable values so that later changes to the request context
            # cannot leak back into the registry record (and vice versa).
            request.context["agent_permissions"] = self._detached(
                agent_metadata.get("permissions", [])
            )

            metadata_dict = agent_metadata.get("metadata", {})
            if metadata_dict:
                request.context["agent_metadata"] = self._detached(metadata_dict)

            logger.debug(
                "Agent context enriched",
                agent_id=agent_id,
                role=agent_metadata.get("role"),
                trace_id=request.trace_id,
            )

        # Naive UTC "now". Equivalent to the deprecated datetime.utcnow(), so
        # the ISO timestamp keeps its existing shape (no "+00:00" suffix).
        now = datetime.now(timezone.utc).replace(tzinfo=None)
        request.context["temporal"] = {
            "timestamp": now.isoformat(),
            "hour": now.hour,
            "day_of_week": now.weekday(),  # 0=Monday, 6=Sunday
            "is_business_hours": self._is_business_hours(now),
        }

        # Regulatory flags are derived from the role, so they are only added
        # for agents known to the registry.
        if agent_metadata:
            role = agent_metadata.get("role", "")
            request.context["regulatory"] = self._get_regulatory_context(role)

        return request

    def _is_business_hours(self, dt: datetime) -> bool:
        """Check whether a datetime falls within business hours.

        The check uses ``dt`` exactly as given; no timezone conversion is done.

        Args:
            dt: Datetime to check

        Returns:
            True if Monday-Friday and 09:00 <= hour < 17:00
        """
        is_weekday = dt.weekday() < 5  # 0-4 = Monday-Friday
        is_work_hours = 9 <= dt.hour < 17  # 9 AM inclusive, 5 PM exclusive
        return is_weekday and is_work_hours

    def _get_regulatory_context(self, role: str) -> dict[str, Any]:
        """Get regulatory context based on agent role.

        Args:
            role: Agent role

        Returns:
            Dict with the boolean flags ``hipaa_applicable``,
            ``sox_applicable``, ``gdpr_applicable`` and ``minimum_necessary``.
        """
        is_healthcare = role in self._HIPAA_ROLES
        return {
            "hipaa_applicable": is_healthcare,
            "sox_applicable": role in self._SOX_ROLES,
            # NOTE(review): GDPR is flagged for every role; the original
            # comment says "if EU", but no region information is available
            # here, so the flag is unconditional.
            "gdpr_applicable": True,
            # HIPAA minimum necessary rule applies to healthcare roles.
            "minimum_necessary": is_healthcare,
        }
|
cortexhub/errors.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Exception types for CortexHub SDK.
|
|
2
|
+
|
|
3
|
+
Distinct error types for debugging without ambiguity.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CortexHubError(Exception):
    """Root exception type from which the CortexHub SDK errors derive."""
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ConfigurationError(CortexHubError):
    """Signals invalid SDK configuration (for example, a missing API key)."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PolicyViolationError(CortexHubError):
    """Signals that a policy forbids a tool invocation (effect=DENY).

    Attributes:
        policy_id: Identifier of the policy involved, if known
        reasoning: Explanation text supplied by the raiser
    """

    def __init__(self, message: str, policy_id: str | None = None, reasoning: str = ""):
        # Attach the extra details first; the base initializer only records
        # the message in ``args``.
        self.policy_id = policy_id
        self.reasoning = reasoning
        super().__init__(message)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GuardrailViolationError(CortexHubError):
    """Signals that a guardrail detected a violation (PII, secrets, injection).

    Attributes:
        guardrail_type: Name of the guardrail category that fired
        findings: List of finding dicts; an empty list when none were given
    """

    def __init__(
        self,
        message: str,
        guardrail_type: str,
        findings: list[dict] | None = None,
    ):
        super().__init__(message)
        # A falsy ``findings`` (None or empty) is normalized to a fresh list.
        self.findings = findings if findings else []
        self.guardrail_type = guardrail_type
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ApprovalRequiredError(CortexHubError):
    """Signals that policy requires human approval before execution.

    The SDK creates an approval record in CortexHub cloud. Customer's system
    receives an approval.requested webhook and handles the approval workflow.
    After decision, customer resumes the agent using framework-native mechanisms.

    Attributes:
        approval_id: Cloud approval record ID (apr_xxx)
        run_id: SDK session ID for correlation
        tool_name: Name of the tool requiring approval
        policy_id: Policy that triggered escalation
        policy_name: Human-readable policy name
        reason: Policy explanation (why approval is required)
        expires_at: When approval expires (ISO format)
        decision_endpoint: URL to submit decision
    """

    def __init__(
        self,
        message: str,
        *,
        approval_id: str,
        run_id: str,
        tool_name: str,
        policy_id: str | None = None,
        policy_name: str | None = None,
        reason: str = "",
        expires_at: str | None = None,
        decision_endpoint: str | None = None,
    ):
        super().__init__(message)

        # Correlation identifiers (required).
        self.approval_id = approval_id
        self.run_id = run_id
        self.tool_name = tool_name

        # Policy details (optional).
        self.policy_id = policy_id
        self.policy_name = policy_name
        self.reason = reason

        # Approval lifecycle details (optional).
        self.expires_at = expires_at
        self.decision_endpoint = decision_endpoint

    def to_dict(self) -> dict[str, Any]:
        """Return the error as a plain dict with a fixed set of keys."""
        return dict(
            type="approval_required",
            blocked=True,
            approval_id=self.approval_id,
            run_id=self.run_id,
            tool_name=self.tool_name,
            policy_id=self.policy_id,
            policy_name=self.policy_name,
            reason=self.reason,
            expires_at=self.expires_at,
            decision_endpoint=self.decision_endpoint,
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class ApprovalDeniedError(CortexHubError):
    """Signals that an approval request was denied or expired.

    Attributes:
        approval_id: Identifier of the approval request
        denied_by: Who denied the request, if known
        reason: Explanation text supplied by the raiser
    """

    def __init__(
        self,
        message: str,
        *,
        approval_id: str,
        denied_by: str | None = None,
        reason: str = "",
    ):
        # Everything after ``message`` is keyword-only.
        self.approval_id = approval_id
        self.denied_by = denied_by
        self.reason = reason
        super().__init__(message)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class PolicyLoadError(CortexHubError):
    """Signals that a policy bundle cannot be loaded or is invalid.

    Attributes:
        policies_dir: The policies directory passed in by the raiser
    """

    def __init__(self, message: str, policies_dir: str):
        self.policies_dir = policies_dir
        super().__init__(message)
|
cortexhub/frameworks.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Supported framework definitions.
|
|
2
|
+
|
|
3
|
+
Users specify which framework they're using via the Framework enum.
|
|
4
|
+
This avoids fragile auto-detection and makes dependencies explicit.
|
|
5
|
+
|
|
6
|
+
We support AGENT frameworks only - not LLM proxies or RAG tools.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from enum import Enum
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Framework(str, Enum):
    """Enumeration of the AI agent frameworks the SDK supports.

    Each listed framework provides agent orchestration with a native
    approval / human-in-the-loop mechanism:

    - LangGraph: Checkpointing + interrupt()
    - CrewAI: human_input=True
    - OpenAI Agents SDK: needsApproval + state serialization
    - Claude Agent SDK: Subagents + tool-based

    Usage:
        import cortexhub
        cortex = cortexhub.init("my_agent", cortexhub.Framework.LANGGRAPH)

    Each framework requires its corresponding optional dependency:
        pip install cortexhub[langgraph]
        pip install cortexhub[crewai]
        pip install cortexhub[openai-agents]
        pip install cortexhub[claude-agents]
    """

    # LangGraph - Stateful agent orchestration.
    #   Features: graph-based agent workflows, checkpointing for pause/resume,
    #   interrupt() for human-in-the-loop, cycles and branches.
    #   Install: pip install cortexhub[langgraph]
    LANGGRAPH = "langgraph"

    # CrewAI - Multi-agent crews.
    #   Features: role-based agents, sequential and hierarchical processes,
    #   human_input=True for approval, task delegation.
    #   Install: pip install cortexhub[crewai]
    CREWAI = "crewai"

    # OpenAI Agents SDK.
    #   Features: native tool calling, needsApproval for human-in-the-loop,
    #   state serialization for pause/resume, handoffs between agents.
    #   Install: pip install cortexhub[openai-agents]
    OPENAI_AGENTS = "openai_agents"

    # Claude Agent SDK (formerly Claude Code SDK).
    #   Features: computer use (bash, files, code), subagents for
    #   parallelization, MCP integration, context compaction.
    #   Install: pip install cortexhub[claude-agents]
    CLAUDE_AGENTS = "claude_agents"

    def __str__(self) -> str:
        """Return the member's string value instead of ``Framework.NAME``."""
        member_value = self.value
        return member_value
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Prompt injection detection using pattern-based heuristics.
|
|
2
|
+
|
|
3
|
+
Lightweight, fast detection without ML dependencies.
|
|
4
|
+
Baseline defense against common attack patterns.
|
|
5
|
+
|
|
6
|
+
Premium Features (Backend):
|
|
7
|
+
- LLM Guard: ML-based injection + jailbreak detection
|
|
8
|
+
- Toxicity detection
|
|
9
|
+
- Advanced attack pattern recognition
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import structlog
|
|
17
|
+
|
|
18
|
+
logger = structlog.get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ManipulationDetectionResult:
    """Outcome of one prompt-manipulation scan."""

    # True when at least one pattern matched.
    detected: bool
    # Names of the pattern categories that matched.
    patterns: list[str]
    # One dict per individual match; defaults to a fresh empty list.
    findings: list[dict[str, Any]] = field(default_factory=list)

    @property
    def count(self) -> int:
        """Number of entries in ``findings``."""
        total = len(self.findings)
        return total
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PromptManipulationDetector:
|
|
35
|
+
"""Lightweight prompt injection detection using patterns.
|
|
36
|
+
|
|
37
|
+
Detects common attack patterns:
|
|
38
|
+
- Role switching ("ignore previous instructions")
|
|
39
|
+
- System override ("you are now admin")
|
|
40
|
+
- Delimiter abuse (excessive delimiters)
|
|
41
|
+
- Encoding tricks (base64, decode)
|
|
42
|
+
- Context escape ("end of system")
|
|
43
|
+
- Instruction injection (special tokens)
|
|
44
|
+
|
|
45
|
+
For premium ML-based detection (toxicity, advanced jailbreaks),
|
|
46
|
+
use backend LLM Guard integration.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
def __init__(self, enabled: bool = True, sensitivity: float = 0.7):
|
|
50
|
+
"""Initialize prompt manipulation detector.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
enabled: Whether detection is enabled
|
|
54
|
+
sensitivity: Detection sensitivity (0.0-1.0, higher = more strict)
|
|
55
|
+
"""
|
|
56
|
+
self.enabled = enabled
|
|
57
|
+
self.sensitivity = sensitivity
|
|
58
|
+
self._compile_patterns()
|
|
59
|
+
logger.info(
|
|
60
|
+
"Prompt manipulation detector initialized (pattern-based, lightweight)",
|
|
61
|
+
enabled=enabled,
|
|
62
|
+
sensitivity=sensitivity,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
def _compile_patterns(self) -> None:
|
|
66
|
+
"""Compile patterns for common prompt manipulation techniques."""
|
|
67
|
+
self.patterns = {
|
|
68
|
+
# Role switching / instruction negation
|
|
69
|
+
"role_switch": re.compile(
|
|
70
|
+
r"\b(ignore|disregard|forget|override|bypass)\s+"
|
|
71
|
+
r"(previous|above|prior|all|any)\s+"
|
|
72
|
+
r"(instructions?|rules?|constraints?|context|prompts?)\b",
|
|
73
|
+
re.IGNORECASE,
|
|
74
|
+
),
|
|
75
|
+
# System override / privilege escalation
|
|
76
|
+
"system_override": re.compile(
|
|
77
|
+
r"\b(you are now|act as|pretend to be|system message|"
|
|
78
|
+
r"new instructions?|admin mode|developer mode|god mode)\b",
|
|
79
|
+
re.IGNORECASE,
|
|
80
|
+
),
|
|
81
|
+
# Delimiter abuse
|
|
82
|
+
"delimiter_abuse": re.compile(r"(```|---|===|\*\*\*|###){3,}"),
|
|
83
|
+
# Encoding tricks
|
|
84
|
+
"encoding_tricks": re.compile(
|
|
85
|
+
r"\b(base64|rot13|hex|decode|unescape|eval|exec)\s*[([]",
|
|
86
|
+
re.IGNORECASE,
|
|
87
|
+
),
|
|
88
|
+
# Context escape
|
|
89
|
+
"context_escape": re.compile(
|
|
90
|
+
r"\b(end of|start of|begin|finish)\s+(context|system|prompt|instructions?)\b",
|
|
91
|
+
re.IGNORECASE,
|
|
92
|
+
),
|
|
93
|
+
# Instruction injection (special tokens)
|
|
94
|
+
"instruction_injection": re.compile(
|
|
95
|
+
r"<\|?(im_start|im_end|system|assistant|user)\|?>",
|
|
96
|
+
re.IGNORECASE,
|
|
97
|
+
),
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
def scan(self, text: str | dict[str, Any]) -> list[dict[str, Any]]:
|
|
101
|
+
"""Scan text for prompt manipulation patterns.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
text: Text or dict to scan
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
List of findings with pattern type and position
|
|
108
|
+
"""
|
|
109
|
+
if not self.enabled:
|
|
110
|
+
return []
|
|
111
|
+
|
|
112
|
+
# Convert dict to JSON string for scanning
|
|
113
|
+
if isinstance(text, dict):
|
|
114
|
+
import json
|
|
115
|
+
|
|
116
|
+
text = json.dumps(text)
|
|
117
|
+
|
|
118
|
+
findings = []
|
|
119
|
+
|
|
120
|
+
for pattern_name, pattern in self.patterns.items():
|
|
121
|
+
matches = pattern.finditer(text)
|
|
122
|
+
for match in matches:
|
|
123
|
+
findings.append(
|
|
124
|
+
{
|
|
125
|
+
"type": pattern_name,
|
|
126
|
+
"value": match.group(),
|
|
127
|
+
"start": match.start(),
|
|
128
|
+
"end": match.end(),
|
|
129
|
+
"confidence": 0.7, # Pattern-based has moderate confidence
|
|
130
|
+
}
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# Filter by sensitivity threshold
|
|
134
|
+
findings = [f for f in findings if f["confidence"] >= (1.0 - self.sensitivity)]
|
|
135
|
+
|
|
136
|
+
if findings:
|
|
137
|
+
logger.warning(
|
|
138
|
+
"Prompt manipulation patterns detected (lightweight)",
|
|
139
|
+
count=len(findings),
|
|
140
|
+
patterns=list(set(f["type"] for f in findings)),
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
return findings
|
|
144
|
+
|
|
145
|
+
def detect(self, text: str | dict[str, Any]) -> ManipulationDetectionResult:
|
|
146
|
+
"""Detect prompt manipulation and return structured result.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
text: Text or dict to scan
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
ManipulationDetectionResult with detection details
|
|
153
|
+
"""
|
|
154
|
+
findings = self.scan(text)
|
|
155
|
+
|
|
156
|
+
if not findings:
|
|
157
|
+
return ManipulationDetectionResult(
|
|
158
|
+
detected=False,
|
|
159
|
+
patterns=[],
|
|
160
|
+
findings=[],
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
patterns = list(set(f["type"] for f in findings))
|
|
164
|
+
|
|
165
|
+
return ManipulationDetectionResult(
|
|
166
|
+
detected=True,
|
|
167
|
+
patterns=patterns,
|
|
168
|
+
findings=findings,
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
def has_manipulation(self, text: str | dict[str, Any]) -> bool:
|
|
172
|
+
"""Check if text contains manipulation patterns.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
text: Text or dict to check
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
True if manipulation patterns detected
|
|
179
|
+
"""
|
|
180
|
+
return len(self.scan(text)) > 0
|