proxilion 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proxilion/__init__.py +136 -0
- proxilion/audit/__init__.py +133 -0
- proxilion/audit/base_exporters.py +527 -0
- proxilion/audit/compliance/__init__.py +130 -0
- proxilion/audit/compliance/base.py +457 -0
- proxilion/audit/compliance/eu_ai_act.py +603 -0
- proxilion/audit/compliance/iso27001.py +544 -0
- proxilion/audit/compliance/soc2.py +491 -0
- proxilion/audit/events.py +493 -0
- proxilion/audit/explainability.py +1173 -0
- proxilion/audit/exporters/__init__.py +58 -0
- proxilion/audit/exporters/aws_s3.py +636 -0
- proxilion/audit/exporters/azure_storage.py +608 -0
- proxilion/audit/exporters/cloud_base.py +468 -0
- proxilion/audit/exporters/gcp_storage.py +570 -0
- proxilion/audit/exporters/multi_exporter.py +498 -0
- proxilion/audit/hash_chain.py +652 -0
- proxilion/audit/logger.py +543 -0
- proxilion/caching/__init__.py +49 -0
- proxilion/caching/tool_cache.py +633 -0
- proxilion/context/__init__.py +73 -0
- proxilion/context/context_window.py +556 -0
- proxilion/context/message_history.py +505 -0
- proxilion/context/session.py +735 -0
- proxilion/contrib/__init__.py +51 -0
- proxilion/contrib/anthropic.py +609 -0
- proxilion/contrib/google.py +1012 -0
- proxilion/contrib/langchain.py +641 -0
- proxilion/contrib/mcp.py +893 -0
- proxilion/contrib/openai.py +646 -0
- proxilion/core.py +3058 -0
- proxilion/decorators.py +966 -0
- proxilion/engines/__init__.py +287 -0
- proxilion/engines/base.py +266 -0
- proxilion/engines/casbin_engine.py +412 -0
- proxilion/engines/opa_engine.py +493 -0
- proxilion/engines/simple.py +437 -0
- proxilion/exceptions.py +887 -0
- proxilion/guards/__init__.py +54 -0
- proxilion/guards/input_guard.py +522 -0
- proxilion/guards/output_guard.py +634 -0
- proxilion/observability/__init__.py +198 -0
- proxilion/observability/cost_tracker.py +866 -0
- proxilion/observability/hooks.py +683 -0
- proxilion/observability/metrics.py +798 -0
- proxilion/observability/session_cost_tracker.py +1063 -0
- proxilion/policies/__init__.py +67 -0
- proxilion/policies/base.py +304 -0
- proxilion/policies/builtin.py +486 -0
- proxilion/policies/registry.py +376 -0
- proxilion/providers/__init__.py +201 -0
- proxilion/providers/adapter.py +468 -0
- proxilion/providers/anthropic_adapter.py +330 -0
- proxilion/providers/gemini_adapter.py +391 -0
- proxilion/providers/openai_adapter.py +294 -0
- proxilion/py.typed +0 -0
- proxilion/resilience/__init__.py +81 -0
- proxilion/resilience/degradation.py +615 -0
- proxilion/resilience/fallback.py +555 -0
- proxilion/resilience/retry.py +554 -0
- proxilion/scheduling/__init__.py +57 -0
- proxilion/scheduling/priority_queue.py +419 -0
- proxilion/scheduling/scheduler.py +459 -0
- proxilion/security/__init__.py +244 -0
- proxilion/security/agent_trust.py +968 -0
- proxilion/security/behavioral_drift.py +794 -0
- proxilion/security/cascade_protection.py +869 -0
- proxilion/security/circuit_breaker.py +428 -0
- proxilion/security/cost_limiter.py +690 -0
- proxilion/security/idor_protection.py +460 -0
- proxilion/security/intent_capsule.py +849 -0
- proxilion/security/intent_validator.py +495 -0
- proxilion/security/memory_integrity.py +767 -0
- proxilion/security/rate_limiter.py +509 -0
- proxilion/security/scope_enforcer.py +680 -0
- proxilion/security/sequence_validator.py +636 -0
- proxilion/security/trust_boundaries.py +784 -0
- proxilion/streaming/__init__.py +70 -0
- proxilion/streaming/detector.py +761 -0
- proxilion/streaming/transformer.py +674 -0
- proxilion/timeouts/__init__.py +55 -0
- proxilion/timeouts/decorators.py +477 -0
- proxilion/timeouts/manager.py +545 -0
- proxilion/tools/__init__.py +69 -0
- proxilion/tools/decorators.py +493 -0
- proxilion/tools/registry.py +732 -0
- proxilion/types.py +339 -0
- proxilion/validation/__init__.py +93 -0
- proxilion/validation/pydantic_schema.py +351 -0
- proxilion/validation/schema.py +651 -0
- proxilion-0.0.1.dist-info/METADATA +872 -0
- proxilion-0.0.1.dist-info/RECORD +94 -0
- proxilion-0.0.1.dist-info/WHEEL +4 -0
- proxilion-0.0.1.dist-info/licenses/LICENSE +21 -0
proxilion/guards/output_guard.py
@@ -0,0 +1,634 @@

"""
Output guard for detecting sensitive data leakage.

Provides pattern-based detection of credentials, API keys, private keys,
internal paths, and other sensitive information that may leak through
LLM outputs.

Example:
    >>> from proxilion.guards import OutputGuard
    >>>
    >>> guard = OutputGuard()
    >>>
    >>> # Check for leakage
    >>> result = guard.check("The API key is sk-abc123...")
    >>> if not result.passed:
    ...     print(f"Leakage detected: {result.matched_patterns}")
    >>>
    >>> # Redact sensitive data
    >>> safe_output = guard.redact("Bearer token: eyJhbGc...")
    >>> print(safe_output)  # "Bearer token: [REDACTED]"
"""

from __future__ import annotations

import logging
import re
from collections.abc import Callable
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

from proxilion.guards.input_guard import GuardAction, GuardResult

logger = logging.getLogger(__name__)


class LeakageCategory(Enum):
    """Category of data leakage."""

    CREDENTIAL = "credential"
    """API keys, passwords, tokens."""

    INTERNAL = "internal"
    """Internal paths, URLs, infrastructure details."""

    SYSTEM_PROMPT = "system_prompt"
    """Leakage of system prompt or instructions."""

    PII = "pii"
    """Personally identifiable information."""

    FINANCIAL = "financial"
    """Credit card numbers, bank accounts."""

    INFRASTRUCTURE = "infrastructure"
    """Internal hostnames, IP addresses, database names."""


@dataclass
class LeakagePattern:
    """
    Pattern for detecting sensitive data leakage.

    Attributes:
        name: Unique identifier for the pattern.
        pattern: Regex pattern string.
        category: Category of data this detects.
        severity: Severity score from 0.0 to 1.0.
        description: Human-readable description.
        redaction: Text to replace matches with.
    """

    name: str
    pattern: str
    category: LeakageCategory
    severity: float = 0.8
    description: str = ""
    redaction: str = "[REDACTED]"
    _compiled: re.Pattern[str] | None = field(default=None, repr=False, compare=False)

    def __post_init__(self) -> None:
        """Compile the regex pattern."""
        if self._compiled is None:
            try:
                self._compiled = re.compile(self.pattern, re.IGNORECASE | re.MULTILINE)
            except re.error as e:
                logger.error(f"Invalid regex pattern for {self.name}: {e}")
                raise ValueError(f"Invalid regex pattern for {self.name}: {e}") from e

    @property
    def compiled(self) -> re.Pattern[str]:
        """Get the compiled regex pattern."""
        if self._compiled is None:
            self._compiled = re.compile(self.pattern, re.IGNORECASE | re.MULTILINE)
        return self._compiled

    def match(self, text: str) -> list[re.Match[str]]:
        """Find all matches of this pattern in text."""
        return list(self.compiled.finditer(text))


@dataclass
class OutputFilter:
    """
    Custom filter for output validation.

    Allows for custom validation logic beyond regex patterns.

    Attributes:
        name: Unique identifier for the filter.
        check_func: Function that returns True if output is safe.
        action: Action to take if filter fails.
        description: Human-readable description.
    """

    name: str
    check_func: Callable[[str, dict[str, Any] | None], bool]
    action: GuardAction = GuardAction.WARN
    description: str = ""


# Built-in leakage patterns
DEFAULT_LEAKAGE_PATTERNS: list[LeakagePattern] = [
    # API Keys and Tokens
    LeakagePattern(
        name="api_key_generic",
        pattern=r"(?i)(api[_-]?key|apikey|api_secret|api_token)\s*[:=]\s*['\"]?([a-zA-Z0-9_\-]{20,})['\"]?",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="Generic API key patterns",
        redaction="[API_KEY_REDACTED]",
    ),
    LeakagePattern(
        name="bearer_token",
        pattern=r"(?i)bearer\s+([a-zA-Z0-9_\-\.]+\.[a-zA-Z0-9_\-\.]+\.[a-zA-Z0-9_\-\.]+)",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="Bearer authentication tokens (JWT)",
        redaction="Bearer [TOKEN_REDACTED]",
    ),
    LeakagePattern(
        name="openai_key",
        pattern=r"sk-(?:proj-)?[a-zA-Z0-9\-_]{20,}",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="OpenAI API keys (including project keys)",
        redaction="[OPENAI_KEY_REDACTED]",
    ),
    LeakagePattern(
        name="anthropic_key",
        pattern=r"sk-ant-[a-zA-Z0-9\-]{20,}",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="Anthropic API keys",
        redaction="[ANTHROPIC_KEY_REDACTED]",
    ),
    LeakagePattern(
        name="aws_key",
        pattern=r"(?i)(AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="AWS access key IDs",
        redaction="[AWS_KEY_REDACTED]",
    ),
    LeakagePattern(
        name="aws_secret",
        pattern=r"(?i)(aws_secret_access_key|aws_secret)\s*[:=]\s*['\"]?([a-zA-Z0-9/+=]{40})['\"]?",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="AWS secret access keys",
        redaction="[AWS_SECRET_REDACTED]",
    ),
    LeakagePattern(
        name="gcp_key",
        pattern=r"(?i)(gcp|google)[_-]?(api[_-]?key|key)\s*[:=]\s*['\"]?AIza[a-zA-Z0-9_\-]{35}['\"]?",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="Google Cloud API keys",
        redaction="[GCP_KEY_REDACTED]",
    ),
    LeakagePattern(
        name="azure_key",
        pattern=r"(?i)(azure|az)[_-]?(storage|account)[_-]?key\s*[:=]\s*['\"]?[a-zA-Z0-9/+=]{88}['\"]?",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="Azure storage keys",
        redaction="[AZURE_KEY_REDACTED]",
    ),
    LeakagePattern(
        name="github_token",
        pattern=r"(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="GitHub personal access tokens",
        redaction="[GITHUB_TOKEN_REDACTED]",
    ),
    LeakagePattern(
        name="slack_token",
        pattern=r"xox[baprs]-[0-9a-zA-Z\-]{10,}",
        category=LeakageCategory.CREDENTIAL,
        severity=0.9,
        description="Slack API tokens",
        redaction="[SLACK_TOKEN_REDACTED]",
    ),

    # Private Keys
    LeakagePattern(
        name="private_key",
        pattern=r"-----BEGIN\s+(RSA\s+|EC\s+|DSA\s+|OPENSSH\s+)?PRIVATE\s+KEY-----",
        category=LeakageCategory.CREDENTIAL,
        severity=0.99,
        description="Private key headers",
        redaction="[PRIVATE_KEY_REDACTED]",
    ),

    # Connection Strings
    LeakagePattern(
        name="connection_string_mongodb",
        pattern=r"mongodb(\+srv)?:\/\/[^:]+:[^@]+@[^\s]+",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="MongoDB connection strings with credentials",
        redaction="[MONGODB_CONN_REDACTED]",
    ),
    LeakagePattern(
        name="connection_string_postgres",
        pattern=r"postgres(ql)?:\/\/[^:]+:[^@]+@[^\s]+",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="PostgreSQL connection strings with credentials",
        redaction="[POSTGRES_CONN_REDACTED]",
    ),
    LeakagePattern(
        name="connection_string_mysql",
        pattern=r"mysql:\/\/[^:]+:[^@]+@[^\s]+",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="MySQL connection strings with credentials",
        redaction="[MYSQL_CONN_REDACTED]",
    ),
    LeakagePattern(
        name="connection_string_redis",
        pattern=r"redis(s)?:\/\/[^:]*:[^@]+@[^\s]+",
        category=LeakageCategory.CREDENTIAL,
        severity=0.95,
        description="Redis connection strings with credentials",
        redaction="[REDIS_CONN_REDACTED]",
    ),

    # Internal Paths
    LeakagePattern(
        name="internal_path_unix",
        pattern=r"(?i)(\/home\/[a-zA-Z0-9_\-]+|\/Users\/[a-zA-Z0-9_\-]+|\/var\/[a-zA-Z0-9_\-\/]+|\/etc\/[a-zA-Z0-9_\-\/]+|\/opt\/[a-zA-Z0-9_\-\/]+)\/[^\s]*",
        category=LeakageCategory.INTERNAL,
        severity=0.6,
        description="Unix internal paths",
        redaction="[PATH_REDACTED]",
    ),
    LeakagePattern(
        name="internal_path_windows",
        pattern=r"(?i)C:\\Users\\[a-zA-Z0-9_\-]+\\[^\s]*",
        category=LeakageCategory.INTERNAL,
        severity=0.6,
        description="Windows user paths",
        redaction="[PATH_REDACTED]",
    ),

    # System Prompt Leakage
    LeakagePattern(
        name="system_prompt_leak",
        pattern=r"(?i)(my\s+instructions\s+are|i\s+was\s+told\s+to|my\s+system\s+prompt|my\s+initial\s+instructions|i\s+am\s+programmed\s+to|my\s+guidelines\s+state)",
        category=LeakageCategory.SYSTEM_PROMPT,
        severity=0.85,
        description="Indicators of system prompt disclosure",
        redaction="[SYSTEM_PROMPT_CONTENT_REDACTED]",
    ),
    LeakagePattern(
        name="system_prompt_markers",
        pattern=r"(?i)(<<SYS>>|<\|system\|>|\[SYSTEM\]|###\s*System)",
        category=LeakageCategory.SYSTEM_PROMPT,
        severity=0.9,
        description="System prompt formatting markers",
        redaction="[SYSTEM_MARKER_REDACTED]",
    ),

    # PII Patterns
    LeakagePattern(
        name="email_address",
        pattern=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        category=LeakageCategory.PII,
        severity=0.5,
        description="Email addresses",
        redaction="[EMAIL_REDACTED]",
    ),
    LeakagePattern(
        name="phone_number",
        pattern=r"(?i)(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
        category=LeakageCategory.PII,
        severity=0.5,
        description="Phone numbers (US format)",
        redaction="[PHONE_REDACTED]",
    ),
    LeakagePattern(
        name="ssn",
        pattern=r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b",
        category=LeakageCategory.PII,
        severity=0.9,
        description="Social Security Numbers",
        redaction="[SSN_REDACTED]",
    ),

    # Financial
    LeakagePattern(
        name="credit_card",
        pattern=r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9][0-9])[0-9]{12})\b",
        category=LeakageCategory.FINANCIAL,
        severity=0.95,
        description="Credit card numbers (Visa, MC, Amex, Discover)",
        redaction="[CARD_REDACTED]",
    ),

    # Infrastructure
    LeakagePattern(
        name="internal_ip",
        pattern=r"\b(10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3})\b",
        category=LeakageCategory.INFRASTRUCTURE,
        severity=0.6,
        description="Internal/private IP addresses",
        redaction="[INTERNAL_IP_REDACTED]",
    ),
    LeakagePattern(
        name="password_in_text",
        pattern=r"(?i)(password|passwd|pwd)\s*[:=]\s*['\"]?[^\s'\"]{4,}['\"]?",
        category=LeakageCategory.CREDENTIAL,
        severity=0.9,
        description="Passwords in plaintext",
        redaction="[PASSWORD_REDACTED]",
    ),
]


class OutputGuard:
    """
    Guard for detecting sensitive data leakage in LLM outputs.

    Uses pattern matching to detect credentials, API keys, internal paths,
    and other sensitive information that may leak through model outputs.

    Example:
        >>> guard = OutputGuard()
        >>>
        >>> # Check output
        >>> result = guard.check("The API key is sk-abc123def456...")
        >>> if not result.passed:
        ...     print(f"Leakage: {result.matched_patterns}")
        >>>
        >>> # Redact sensitive data
        >>> safe = guard.redact("Connection: mongodb://user:pass@host")
        >>> print(safe)  # Connection: [MONGODB_CONN_REDACTED]

    Attributes:
        patterns: List of leakage patterns to check.
        filters: List of custom output filters.
        action: Default action on violations.
        threshold: Risk score threshold.
    """

    def __init__(
        self,
        patterns: list[LeakagePattern] | None = None,
        filters: list[OutputFilter] | None = None,
        action: GuardAction = GuardAction.WARN,
        threshold: float = 0.5,
        enable_pii: bool = False,
    ) -> None:
        """
        Initialize the output guard.

        Args:
            patterns: Custom patterns (uses defaults if None).
            filters: Custom output filters.
            action: Action to take when threshold is exceeded.
            threshold: Risk score threshold (0.0 to 1.0).
            enable_pii: Whether to enable PII detection patterns.
        """
        if patterns is not None:
            self.patterns = patterns
        else:
            # Filter out PII patterns if not enabled
            self.patterns = [
                p for p in DEFAULT_LEAKAGE_PATTERNS
                if enable_pii or p.category != LeakageCategory.PII
            ]

        self.filters = filters or []
        self.action = action
        self.threshold = threshold
        self._pattern_index: dict[str, LeakagePattern] = {p.name: p for p in self.patterns}

    def add_pattern(self, pattern: LeakagePattern) -> None:
        """
        Add a custom leakage pattern.

        Args:
            pattern: The pattern to add.
        """
        self.patterns.append(pattern)
        self._pattern_index[pattern.name] = pattern

    def remove_pattern(self, name: str) -> bool:
        """
        Remove a pattern by name.

        Args:
            name: The pattern name to remove.

        Returns:
            True if pattern was removed, False if not found.
        """
        if name in self._pattern_index:
            pattern = self._pattern_index.pop(name)
            self.patterns.remove(pattern)
            return True
        return False

    def add_filter(self, filter_: OutputFilter) -> None:
        """
        Add a custom output filter.

        Args:
            filter_: The filter to add.
        """
        self.filters.append(filter_)

    def get_patterns(self) -> list[LeakagePattern]:
        """Get all registered patterns."""
        return list(self.patterns)

    def check(
        self,
        output_text: str,
        context: dict[str, Any] | None = None,
    ) -> GuardResult:
        """
        Check output text for sensitive data leakage.

        Args:
            output_text: The output to check.
            context: Optional context for evaluation.

        Returns:
            GuardResult with check outcome.
        """
        if not output_text:
            return GuardResult.allow()

        context = context or {}
        matched_patterns: list[str] = []
        all_matches: list[dict[str, Any]] = []
        severities: list[float] = []

        # Check each pattern
        for pattern in self.patterns:
            matches = pattern.match(output_text)
            if matches:
                matched_patterns.append(pattern.name)
                severities.append(pattern.severity)

                for match in matches:
                    all_matches.append({
                        "pattern": pattern.name,
                        "category": pattern.category.value,
                        "severity": pattern.severity,
                        "matched_text": self._truncate_match(match.group()),
                        "start": match.start(),
                        "end": match.end(),
                        "redaction": pattern.redaction,
                    })

        # Run custom filters
        filter_failures: list[str] = []
        for filter_ in self.filters:
            try:
                if not filter_.check_func(output_text, context):
                    filter_failures.append(filter_.name)
                    if filter_.action == GuardAction.BLOCK:
                        severities.append(1.0)
                    else:
                        severities.append(0.7)
            except Exception as e:
                logger.error(f"Output filter {filter_.name} raised exception: {e}")

        # Calculate risk score
        risk_score = self._calculate_risk_score(severities)

        # Determine if check passed
        passed = risk_score < self.threshold

        # Determine action
        action = GuardAction.ALLOW if passed else self.action

        # Log based on action
        if not passed:
            if action == GuardAction.WARN:
                logger.warning(
                    f"Output guard warning: risk_score={risk_score:.2f}, "
                    f"patterns={matched_patterns}, filters={filter_failures}"
                )
            elif action == GuardAction.BLOCK:
                logger.info(
                    f"Output guard blocked: risk_score={risk_score:.2f}, "
                    f"patterns={matched_patterns}, filters={filter_failures}"
                )

        return GuardResult(
            passed=passed,
            action=action,
            matched_patterns=matched_patterns + filter_failures,
            risk_score=risk_score,
            matches=all_matches,
            context={"output_length": len(output_text), **context},
        )

    def redact(
        self,
        output_text: str,
        categories: list[LeakageCategory] | None = None,
    ) -> str:
        """
        Redact sensitive data from output text.

        Args:
            output_text: Text to redact.
            categories: Categories to redact (all if None).

        Returns:
            Text with sensitive data redacted.
        """
        if not output_text:
            return output_text

        result = output_text

        for pattern in self.patterns:
            # Filter by category if specified
            if categories is not None and pattern.category not in categories:
                continue

            # Replace all matches with redaction text
            result = pattern.compiled.sub(pattern.redaction, result)

        return result

    def _calculate_risk_score(self, severities: list[float]) -> float:
        """
        Calculate overall risk score from matched pattern severities.

        Args:
            severities: List of severity scores from matches.

        Returns:
            Risk score between 0.0 and 1.0.
        """
        if not severities:
            return 0.0

        base_score = max(severities)
        bonus = 0.1 * (len(severities) - 1)
        return min(1.0, base_score + bonus)

    def _truncate_match(self, text: str, max_length: int = 20) -> str:
        """Truncate matched text for logging (avoid leaking in logs)."""
        if len(text) <= max_length:
            return text[:4] + "..." + text[-4:] if len(text) > 8 else "[...]"
        return text[:8] + "..." + text[-4:]

    def configure(
        self,
        action: GuardAction | None = None,
        threshold: float | None = None,
    ) -> None:
        """
        Update guard configuration.

        Args:
            action: New default action.
            threshold: New risk threshold.
        """
        if action is not None:
            self.action = action
        if threshold is not None:
            if not 0.0 <= threshold <= 1.0:
                raise ValueError("Threshold must be between 0.0 and 1.0")
            self.threshold = threshold


def create_output_guard(
    include_defaults: bool = True,
    custom_patterns: list[LeakagePattern] | None = None,
    enable_pii: bool = False,
    action: GuardAction = GuardAction.WARN,
    threshold: float = 0.5,
) -> OutputGuard:
    """
    Factory function to create an OutputGuard.

    Args:
        include_defaults: Whether to include default patterns.
        custom_patterns: Additional custom patterns.
        enable_pii: Whether to enable PII detection.
        action: Action to take on violations.
        threshold: Risk score threshold.

    Returns:
        Configured OutputGuard instance.
    """
    patterns: list[LeakagePattern] = []

    if include_defaults:
        default_patterns = [
            p for p in DEFAULT_LEAKAGE_PATTERNS
            if enable_pii or p.category != LeakageCategory.PII
        ]
        patterns.extend(default_patterns)

    if custom_patterns:
        patterns.extend(custom_patterns)

    return OutputGuard(
        patterns=patterns,
        action=action,
        threshold=threshold,
        enable_pii=enable_pii,
    )
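
For reference, a minimal usage sketch of the module above. It only uses names defined in this file (create_output_guard, LeakagePattern, OutputFilter, LeakageCategory) and the GuardAction import path the file itself uses; the custom pattern, filter, and sample strings are illustrative, not part of the package.

from proxilion.guards.input_guard import GuardAction
from proxilion.guards.output_guard import (
    LeakageCategory,
    LeakagePattern,
    OutputFilter,
    create_output_guard,
)

# Guard that blocks when the risk score reaches 0.7; PII patterns stay disabled by default.
guard = create_output_guard(action=GuardAction.BLOCK, threshold=0.7)

# Register a project-specific pattern (hypothetical internal ticket-ID format).
guard.add_pattern(
    LeakagePattern(
        name="internal_ticket_id",
        pattern=r"\bTICKET-\d{6}\b",
        category=LeakageCategory.INTERNAL,
        severity=0.6,
        redaction="[TICKET_REDACTED]",
    )
)

# Custom filter: treat any output echoing the word "confidential" as unsafe.
guard.add_filter(
    OutputFilter(
        name="no_confidential_echo",
        check_func=lambda text, ctx: "confidential" not in text.lower(),
        action=GuardAction.WARN,
    )
)

result = guard.check("password: hunter2-super-secret")
if not result.passed:
    # password_in_text (severity 0.9) exceeds the 0.7 threshold, so this is blocked.
    print(result.risk_score, result.matched_patterns)

# The entire "password: ..." span is replaced by the pattern's redaction text.
print(guard.redact("password: hunter2-super-secret"))  # -> "[PASSWORD_REDACTED]"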