cortexhub-0.1.0-py3-none-any.whl
- cortexhub/__init__.py +143 -0
- cortexhub/adapters/__init__.py +5 -0
- cortexhub/adapters/base.py +131 -0
- cortexhub/adapters/claude_agents.py +322 -0
- cortexhub/adapters/crewai.py +297 -0
- cortexhub/adapters/langgraph.py +386 -0
- cortexhub/adapters/openai_agents.py +192 -0
- cortexhub/audit/__init__.py +25 -0
- cortexhub/audit/events.py +165 -0
- cortexhub/auto_protect.py +128 -0
- cortexhub/backend/__init__.py +5 -0
- cortexhub/backend/client.py +348 -0
- cortexhub/client.py +2149 -0
- cortexhub/config.py +37 -0
- cortexhub/context/__init__.py +5 -0
- cortexhub/context/enricher.py +172 -0
- cortexhub/errors.py +123 -0
- cortexhub/frameworks.py +83 -0
- cortexhub/guardrails/__init__.py +3 -0
- cortexhub/guardrails/injection.py +180 -0
- cortexhub/guardrails/pii.py +378 -0
- cortexhub/guardrails/secrets.py +206 -0
- cortexhub/interceptors/__init__.py +3 -0
- cortexhub/interceptors/llm.py +62 -0
- cortexhub/interceptors/mcp.py +96 -0
- cortexhub/pipeline.py +92 -0
- cortexhub/policy/__init__.py +6 -0
- cortexhub/policy/effects.py +87 -0
- cortexhub/policy/evaluator.py +267 -0
- cortexhub/policy/loader.py +158 -0
- cortexhub/policy/models.py +123 -0
- cortexhub/policy/sync.py +183 -0
- cortexhub/telemetry/__init__.py +40 -0
- cortexhub/telemetry/otel.py +481 -0
- cortexhub/version.py +3 -0
- cortexhub-0.1.0.dist-info/METADATA +275 -0
- cortexhub-0.1.0.dist-info/RECORD +38 -0
- cortexhub-0.1.0.dist-info/WHEEL +4 -0

cortexhub/guardrails/pii.py
@@ -0,0 +1,378 @@
"""PII detection and redaction using Presidio.

Production-grade PII detection with 50+ entity types using Microsoft Presidio.
NO fallbacks - Presidio is required.

Supports:
- User-configured PII types (redact only selected types)
- Custom regex patterns for company-specific sensitive data
"""

import json
import re
from collections import Counter
from dataclasses import dataclass, field
from typing import Any

import structlog
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, Pattern, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

logger = structlog.get_logger(__name__)


@dataclass
class CustomPattern:
    """Custom regex pattern for detection."""
    name: str
    pattern: str
    description: str | None = None
    enabled: bool = True


@dataclass
class PIIDetectionResult:
    """Result of PII detection scan."""
    detected: bool
    count: int  # Total raw matches
    types: list[str]  # Unique PII types found
    counts_per_type: dict[str, int] = field(default_factory=dict)  # Matches per type
    unique_values_per_type: dict[str, int] = field(default_factory=dict)  # Unique values per type
    findings: list[dict[str, Any]] = field(default_factory=list)

    @property
    def summary(self) -> str:
        """Human-readable summary of detections."""
        if not self.detected:
            return "No PII detected"

        parts = []
        for pii_type in sorted(self.types):
            unique_count = self.unique_values_per_type.get(pii_type, 0)
            total_count = self.counts_per_type.get(pii_type, 0)

            # Format: "3 SSN" or "5 SSN (12 occurrences)" if duplicates
            if unique_count == total_count:
                parts.append(f"{unique_count} {pii_type}")
            else:
                parts.append(f"{unique_count} {pii_type} ({total_count} occurrences)")

        return ", ".join(parts)

    @property
    def unique_count(self) -> int:
        """Total unique PII values detected."""
        return sum(self.unique_values_per_type.values())


class PIIDetector:
    """Production-grade PII detection using Microsoft Presidio.

    Detects 50+ PII types:
    - EMAIL_ADDRESS, PHONE_NUMBER
    - US_SSN, US_PASSPORT, US_DRIVER_LICENSE
    - CREDIT_CARD, IBAN_CODE, CRYPTO
    - PERSON, LOCATION, ORGANIZATION
    - MEDICAL_LICENSE, NRP (medical terms for HIPAA)
    - And many more...

    Supports user configuration:
    - allowed_types: Only detect/redact these specific PII types
    - custom_patterns: Custom regex patterns for company-specific data
    """

    def __init__(
        self,
        enabled: bool = True,
        confidence_threshold: float = 0.5,
        allowed_types: list[str] | None = None,
        custom_patterns: list[CustomPattern] | None = None,
    ):
        """Initialize PII detector with Presidio.

        Args:
            enabled: Whether PII detection is enabled
            confidence_threshold: Minimum confidence for detection (0.0-1.0)
            allowed_types: If set, only detect these PII types (None = all types)
            custom_patterns: Custom regex patterns to detect
        """
        self.enabled = enabled
        self.confidence_threshold = confidence_threshold
        # Normalize to lowercase for comparison
        self.allowed_types = [t.lower() for t in allowed_types] if allowed_types else None
        self.custom_patterns = custom_patterns or []

        # Configure NLP engine (use small model for speed)
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }

        provider = NlpEngineProvider(nlp_configuration=configuration)
        nlp_engine = provider.create_engine()

        # Initialize analyzer with custom registry
        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)

        # Add custom pattern recognizers
        for cp in self.custom_patterns:
            if cp.enabled:
                try:
                    pattern = Pattern(
                        name=cp.name,
                        regex=cp.pattern,
                        score=0.8,  # High confidence for custom patterns
                    )
                    recognizer = PatternRecognizer(
                        supported_entity=f"CUSTOM_{cp.name.upper()}",
                        patterns=[pattern],
                    )
                    registry.add_recognizer(recognizer)
                    logger.info(f"Added custom PII pattern: {cp.name}")
                except Exception as e:
                    logger.warning(f"Failed to add custom pattern {cp.name}: {e}")

        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
        self.anonymizer = AnonymizerEngine()

        logger.info(
            "PII detector initialized",
            enabled=enabled,
            threshold=confidence_threshold,
            nlp_model="en_core_web_sm",
            allowed_types=self.allowed_types,
            custom_patterns_count=len(self.custom_patterns),
        )

    def configure(
        self,
        allowed_types: list[str] | None = None,
        custom_patterns: list[CustomPattern] | None = None,
    ) -> None:
        """Update configuration at runtime.

        Args:
            allowed_types: If set, only detect these PII types (None = all types)
            custom_patterns: Custom regex patterns to detect
        """
        if allowed_types is not None:
            self.allowed_types = [t.lower() for t in allowed_types]

        if custom_patterns is not None:
            self.custom_patterns = custom_patterns
            # Re-adding custom recognizers would require re-initializing the analyzer;
            # for now, custom patterns are only applied at initialization.
        logger.info(
            "PII detector config updated",
            allowed_types=self.allowed_types,
        )

    def _filter_by_allowed_types(self, findings: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Filter findings to only include allowed types.

        Args:
            findings: All detected findings

        Returns:
            Filtered findings (only allowed types)
        """
        if self.allowed_types is None:
            return findings  # No filter - return all

        filtered = [f for f in findings if f["type"].lower() in self.allowed_types]

        if len(filtered) != len(findings):
            logger.debug(
                "PII findings filtered by allowed types",
                original_count=len(findings),
                filtered_count=len(filtered),
                allowed_types=self.allowed_types,
            )

        return filtered

    def scan(self, text: str | dict[str, Any], filter_types: bool = True) -> list[dict[str, Any]]:
        """Scan text or dict for PII using Presidio.

        Args:
            text: Text or dict to scan
            filter_types: Whether to filter by allowed_types (default True)

        Returns:
            List of PII findings with type, value, position, confidence
        """
        if not self.enabled:
            return []

        # Convert dict to JSON string
        if isinstance(text, dict):
            text = json.dumps(text)

        # Analyze with Presidio
        results = self.analyzer.analyze(
            text=text,
            language="en",
            score_threshold=self.confidence_threshold,
        )

        findings = []
        for result in results:
            findings.append(
                {
                    "type": result.entity_type.lower(),
                    "value": text[result.start : result.end],
                    "start": result.start,
                    "end": result.end,
                    "confidence": result.score,
                }
            )

        # Filter by allowed types if configured
        if filter_types:
            findings = self._filter_by_allowed_types(findings)

        if findings:
            logger.warning(
                "PII detected",
                count=len(findings),
                types=list(set(f["type"] for f in findings)),
            )

        return findings

    def detect(self, text: str | dict[str, Any]) -> PIIDetectionResult:
        """Detect PII in text and return structured result.

        Args:
            text: Text or dict to scan

        Returns:
            PIIDetectionResult with detection details and counts per type
        """
        findings = self.scan(text)

        if not findings:
            return PIIDetectionResult(
                detected=False,
                count=0,
                types=[],
                counts_per_type={},
                unique_values_per_type={},
                findings=[],
            )

        # Count occurrences per type
        type_counts = Counter(f["type"] for f in findings)

        # Count unique values per type (deduplicate by normalized value)
        unique_per_type: dict[str, set[str]] = {}
        for f in findings:
            pii_type = f["type"]
            # Normalize value for deduplication (lowercase, strip whitespace)
            normalized_value = f["value"].lower().strip()
            if pii_type not in unique_per_type:
                unique_per_type[pii_type] = set()
            unique_per_type[pii_type].add(normalized_value)

        unique_counts = {t: len(values) for t, values in unique_per_type.items()}

        return PIIDetectionResult(
            detected=True,
            count=len(findings),
            types=list(type_counts.keys()),
            counts_per_type=dict(type_counts),
            unique_values_per_type=unique_counts,
            findings=findings,
        )

    def has_pii(self, text: str | dict[str, Any]) -> bool:
        """Check if text contains PII.

        Args:
            text: Text or dict to check

        Returns:
            True if PII detected
        """
        return len(self.scan(text)) > 0

    def redact(self, text: str | dict[str, Any]) -> tuple[str | dict, list[dict[str, Any]]]:
        """Scan and redact PII using Presidio anonymizer.

        Only redacts PII types that are in allowed_types (if configured).

        Args:
            text: Text or dict to redact

        Returns:
            (redacted_content, findings)
        """
        is_dict = isinstance(text, dict)
        original_text = json.dumps(text) if is_dict else text

        # Analyze for PII
        findings_raw = self.analyzer.analyze(
            text=original_text,
            language="en",
            score_threshold=self.confidence_threshold,
        )

        if not findings_raw:
            return text, []

        # Filter by allowed types if configured
        if self.allowed_types is not None:
            findings_raw = [
                f for f in findings_raw
                if f.entity_type.lower() in self.allowed_types
            ]

        if not findings_raw:
            return text, []

        # Anonymize with Presidio - create operator per entity type for proper labeling
        operators = {}
        for result in findings_raw:
            entity_type = result.entity_type
            operators[entity_type] = OperatorConfig(
                "replace", {"new_value": f"[REDACTED-{entity_type}]"}
            )

        anonymized = self.anonymizer.anonymize(
            text=original_text,
            analyzer_results=findings_raw,
            operators=operators,
        )

        # Convert findings
        findings = []
        for result in findings_raw:
            findings.append(
                {
                    "type": result.entity_type.lower(),
                    "value": original_text[result.start : result.end],
                    "start": result.start,
                    "end": result.end,
                    "confidence": result.score,
                }
            )

        logger.info(
            "PII redacted",
            count=len(findings),
            types=list(set(f["type"] for f in findings)),
            allowed_types=self.allowed_types,
        )

        # Convert back to dict if original was dict
        if is_dict:
            try:
                return json.loads(anonymized.text), findings
            except json.JSONDecodeError:
                logger.warning("Redaction broke JSON structure, returning original")
                return text, findings

        return anonymized.text, findings
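
For reference, a minimal usage sketch of the PIIDetector API above. The employee-ID pattern and sample text are hypothetical, the snippet assumes presidio-analyzer, presidio-anonymizer, and spaCy's en_core_web_sm model are installed, and exact matches depend on the NLP model:

# Hypothetical company-specific pattern; the "EMP-" ID format is illustrative only.
employee_ids = CustomPattern(
    name="employee_id",
    pattern=r"EMP-\d{6}",
    description="Internal employee identifiers",
)

detector = PIIDetector(
    confidence_threshold=0.5,
    allowed_types=["email_address", "custom_employee_id"],
    custom_patterns=[employee_ids],
)

text = "Contact jane.doe@example.com about EMP-001234."
result = detector.detect(text)
print(result.summary)  # e.g. "1 custom_employee_id, 1 email_address"

redacted, findings = detector.redact(text)
# e.g. "Contact [REDACTED-EMAIL_ADDRESS] about [REDACTED-CUSTOM_EMPLOYEE_ID]."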

cortexhub/guardrails/secrets.py
@@ -0,0 +1,206 @@
"""Secrets detection using detect-secrets.

Production-grade secret detection using Yelp's detect-secrets.
"""

import json
import re
from collections import Counter
from dataclasses import dataclass, field
from typing import Any

import structlog
from detect_secrets.core.secrets_collection import SecretsCollection
from detect_secrets.plugins.aws import AWSKeyDetector
from detect_secrets.plugins.basic_auth import BasicAuthDetector
from detect_secrets.plugins.github_token import GitHubTokenDetector
from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString
from detect_secrets.plugins.jwt import JwtTokenDetector
from detect_secrets.plugins.private_key import PrivateKeyDetector
from detect_secrets.plugins.slack import SlackDetector

logger = structlog.get_logger(__name__)


@dataclass
class SecretsDetectionResult:
    """Result of secrets detection scan."""
    detected: bool
    count: int
    types: list[str]
    counts_per_type: dict[str, int] = field(default_factory=dict)
    findings: list[dict[str, Any]] = field(default_factory=list)


class SecretsDetector:
    """Production-grade secrets detection using detect-secrets.

    Uses multiple plugins to detect:
    - AWS keys (AKIA...)
    - GitHub tokens (ghp_, gho_, etc.)
    - Slack tokens (xoxb-, xoxp-, etc.)
    - JWT tokens
    - Private keys (RSA, SSH)
    - Basic auth credentials
    - High entropy strings
    """

    def __init__(self, enabled: bool = True):
        """Initialize secrets detector.

        Args:
            enabled: Whether secrets detection is enabled
        """
        self.enabled = enabled

        # Initialize all plugins
        self.plugins = [
            AWSKeyDetector(),
            GitHubTokenDetector(),
            SlackDetector(),
            BasicAuthDetector(),
            PrivateKeyDetector(),
            JwtTokenDetector(),
            HexHighEntropyString(limit=3.0),  # High entropy threshold
        ]

        logger.info(
            "Secrets detector initialized",
            plugins=len(self.plugins),
        )

    def scan(self, text: str | dict[str, Any]) -> list[dict[str, Any]]:
        """Scan text or dict for secrets.

        Args:
            text: Text or dict to scan

        Returns:
            List of secret findings
        """
        if not self.enabled:
            return []

        # Convert dict to JSON string
        if isinstance(text, dict):
            text = json.dumps(text)

        findings = []

        # Scan with each plugin
        for plugin in self.plugins:
            try:
                secrets = plugin.analyze_line(filename="inline", line=text, line_number=0)

                for secret in secrets:
                    # Never log actual secret value
                    findings.append(
                        {
                            "type": secret.type,
                            "value": "***",  # Redacted
                            "start": secret.line_number,
                            "end": secret.line_number,
                            "confidence": 0.9,
                        }
                    )
            except Exception as e:
                logger.debug(f"Plugin {plugin.__class__.__name__} error: {e}")

        if findings:
            logger.warning(
                "Secrets detected",
                count=len(findings),
                types=list(set(f["type"] for f in findings)),
            )

        return findings

    def detect(self, text: str | dict[str, Any]) -> SecretsDetectionResult:
        """Detect secrets in text and return structured result.

        Args:
            text: Text or dict to scan

        Returns:
            SecretsDetectionResult with detection details and counts per type
        """
        findings = self.scan(text)

        if not findings:
            return SecretsDetectionResult(
                detected=False,
                count=0,
                types=[],
                counts_per_type={},
                findings=[],
            )

        # Count occurrences per type
        type_counts = Counter(f["type"] for f in findings)

        return SecretsDetectionResult(
            detected=True,
            count=len(findings),
            types=list(type_counts.keys()),
            counts_per_type=dict(type_counts),
            findings=findings,
        )

    def has_secrets(self, text: str | dict[str, Any]) -> bool:
        """Check if text contains secrets.

        Args:
            text: Text or dict to check

        Returns:
            True if secrets detected
        """
        return len(self.scan(text)) > 0

    def redact(self, text: str | dict[str, Any]) -> tuple[str | dict, list[dict[str, Any]]]:
        """Scan and redact secrets.

        Args:
            text: Text or dict to redact

        Returns:
            (redacted_content, findings)
        """
        is_dict = isinstance(text, dict)
        original_text = json.dumps(text) if is_dict else text

        # Scan for secrets
        findings = self.scan(original_text)

        if not findings:
            return text, []

        # Redact known patterns
        redacted = original_text

        # AWS keys
        redacted = re.sub(r"AKIA[0-9A-Z]{16}", "[REDACTED-AWS_KEY]", redacted)

        # GitHub tokens
        redacted = re.sub(r"gh[ps]_[A-Za-z0-9]{36}", "[REDACTED-GITHUB_TOKEN]", redacted)

        # Slack tokens
        redacted = re.sub(r"xox[baprs]-[A-Za-z0-9-]+", "[REDACTED-SLACK_TOKEN]", redacted)

        # Generic high-entropy strings (be conservative)
        # redacted = re.sub(r"\b[A-Za-z0-9]{40,}\b", "[REDACTED-SECRET]", redacted)

        logger.info(
            "Secrets redacted",
            count=len(findings),
            types=list(set(f["type"] for f in findings)),
        )

        # Convert back to dict if needed
        if is_dict:
            try:
                return json.loads(redacted), findings
            except json.JSONDecodeError:
                return text, findings

        return redacted, findings
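
A short usage sketch for SecretsDetector; the AWS key below is the well-known documentation placeholder, and the exact type labels and hits depend on the installed detect-secrets version:

detector = SecretsDetector()

payload = {"note": "deploy config", "aws_key": "AKIAIOSFODNN7EXAMPLE"}

result = detector.detect(payload)
if result.detected:
    print(result.counts_per_type)  # e.g. {"AWS Access Key": 1}

redacted, findings = detector.redact(payload)
# redacted comes back as a dict when the substitution preserves valid JSON,
# with the key value replaced by "[REDACTED-AWS_KEY]".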

cortexhub/interceptors/llm.py
@@ -0,0 +1,62 @@
"""LLM call interceptor for governance.

Intercepts calls to language model APIs (OpenAI, Anthropic, etc.) and records
them for observability. In enforcement mode, also evaluates policies.

GUARDRAILS APPLY HERE - LLMs should NOT receive sensitive data.
Unlike tools (which NEED the data), LLM prompts should be sanitized.

This allows compliance teams to see:
- What models are being used
- What data is being sent to LLMs
- ⚠️ PII in prompts (risk alert)
- 🚨 Secrets in prompts (critical risk)
- Prompt manipulation attempts
"""

from typing import Any

import structlog

logger = structlog.get_logger(__name__)


class LLMInterceptor:
    """Intercepts and observes LLM API calls.

    Supports:
    - OpenAI (ChatCompletion, Completion)
    - Anthropic (Claude)
    - Azure OpenAI
    - Generic LLM APIs

    GUARDRAILS APPLY HERE because:
    - LLMs should NOT receive PII (data leak to external model)
    - LLMs should NOT receive secrets (credential exposure)
    - Prompts should be checked for manipulation attempts

    This is different from tools, which NEED the sensitive data to work.
    """

    def __init__(self, cortex_hub: Any):  # Type: CortexHub
        """Initialize LLM interceptor.

        Args:
            cortex_hub: CortexHub instance for policy enforcement
        """
        self.cortex_hub = cortex_hub
        self._openai_patched = False
        self._anthropic_patched = False
        logger.info("LLM interceptor initialized")

    def intercept_openai(self) -> None:
        """Provider-specific interception disabled (use framework adapters)."""
        logger.info("OpenAI interception disabled; use framework adapters")

    def intercept_anthropic(self) -> None:
        """Provider-specific interception disabled (use framework adapters)."""
        logger.info("Anthropic interception disabled; use framework adapters")

    def apply_all(self) -> None:
        """Provider-specific interception disabled (use framework adapters)."""
        logger.info("LLM interception is handled by framework adapters")