cortexhub 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,378 @@
1
+ """PII detection and redaction using Presidio.
2
+
3
+ Production-grade PII detection with 50+ entity types using Microsoft Presidio.
4
+ NO fallbacks - Presidio is required.
5
+
6
+ Supports:
7
+ - User-configured PII types (redact only selected types)
8
+ - Custom regex patterns for company-specific sensitive data
9
+ """
10
+
11
+ import json
12
+ import re
13
+ from collections import Counter
14
+ from dataclasses import dataclass, field
15
+ from typing import Any
16
+
17
+ import structlog
18
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, Pattern, PatternRecognizer
19
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
20
+ from presidio_anonymizer import AnonymizerEngine
21
+ from presidio_anonymizer.entities import OperatorConfig
22
+
23
+ logger = structlog.get_logger(__name__)
24
+
25
+
26
@dataclass
class CustomPattern:
    """Custom regex pattern for detection."""
    # Identifier for the pattern; the detector registers it as entity
    # "CUSTOM_<NAME>" (uppercased).
    name: str
    # Regular expression used to match the company-specific sensitive data.
    pattern: str
    # Optional human-readable description; not used for matching.
    description: str | None = None
    # Disabled patterns are skipped when recognizers are registered.
    enabled: bool = True
33
+
34
+
35
@dataclass
class PIIDetectionResult:
    """Structured outcome of a PII scan."""
    detected: bool
    count: int  # Total raw matches
    types: list[str]  # Unique PII types found
    counts_per_type: dict[str, int] = field(default_factory=dict)  # Matches per type
    unique_values_per_type: dict[str, int] = field(default_factory=dict)  # Unique values per type
    findings: list[dict[str, Any]] = field(default_factory=list)

    @property
    def summary(self) -> str:
        """Human-readable summary of detections.

        Each type renders as "3 SSN", or "5 SSN (12 occurrences)" when the
        raw match count exceeds the unique-value count.
        """
        if not self.detected:
            return "No PII detected"

        def describe(pii_type: str) -> str:
            unique_count = self.unique_values_per_type.get(pii_type, 0)
            total_count = self.counts_per_type.get(pii_type, 0)
            if unique_count == total_count:
                return f"{unique_count} {pii_type}"
            return f"{unique_count} {pii_type} ({total_count} occurrences)"

        return ", ".join(describe(t) for t in sorted(self.types))

    @property
    def unique_count(self) -> int:
        """Total unique PII values detected across all types."""
        return sum(self.unique_values_per_type.values())
68
+
69
+
70
class PIIDetector:
    """Production-grade PII detection using Microsoft Presidio.

    Detects 50+ PII types:
    - EMAIL_ADDRESS, PHONE_NUMBER
    - US_SSN, US_PASSPORT, US_DRIVER_LICENSE
    - CREDIT_CARD, IBAN_CODE, CRYPTO
    - PERSON, LOCATION, ORGANIZATION
    - MEDICAL_LICENSE, NRP (medical terms for HIPAA)
    - And many more...

    Supports user configuration:
    - allowed_types: Only detect/redact these specific PII types
    - custom_patterns: Custom regex patterns for company-specific data
    """

    def __init__(
        self,
        enabled: bool = True,
        confidence_threshold: float = 0.5,
        allowed_types: list[str] | None = None,
        custom_patterns: list[CustomPattern] | None = None,
    ):
        """Initialize PII detector with Presidio.

        Args:
            enabled: Whether PII detection is enabled
            confidence_threshold: Minimum confidence for detection (0.0-1.0)
            allowed_types: If set, only detect these PII types (None = all types)
            custom_patterns: Custom regex patterns to detect
        """
        self.enabled = enabled
        self.confidence_threshold = confidence_threshold
        # Normalize to lowercase so allowed-type filtering is case-insensitive.
        self.allowed_types = [t.lower() for t in allowed_types] if allowed_types else None
        self.custom_patterns = custom_patterns or []

        # Configure NLP engine (use small model for speed).
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
        }
        provider = NlpEngineProvider(nlp_configuration=configuration)
        # Keep the engine so the analyzer can be rebuilt when custom
        # patterns change at runtime (see configure()).
        self._nlp_engine = provider.create_engine()

        self.analyzer = self._build_analyzer()
        self.anonymizer = AnonymizerEngine()

        logger.info(
            "PII detector initialized",
            enabled=enabled,
            threshold=confidence_threshold,
            nlp_model="en_core_web_sm",
            allowed_types=self.allowed_types,
            custom_patterns_count=len(self.custom_patterns),
        )

    def _build_analyzer(self) -> AnalyzerEngine:
        """Build an AnalyzerEngine with predefined + custom pattern recognizers.

        Returns:
            AnalyzerEngine backed by the shared NLP engine, with one
            PatternRecognizer per enabled custom pattern.
        """
        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=self._nlp_engine)

        for cp in self.custom_patterns:
            if not cp.enabled:
                continue
            try:
                pattern = Pattern(
                    name=cp.name,
                    regex=cp.pattern,
                    score=0.8,  # High confidence for custom patterns
                )
                recognizer = PatternRecognizer(
                    supported_entity=f"CUSTOM_{cp.name.upper()}",
                    patterns=[pattern],
                )
                registry.add_recognizer(recognizer)
                logger.info(f"Added custom PII pattern: {cp.name}")
            except Exception as e:
                # A malformed user-supplied regex must not break detector startup.
                logger.warning(f"Failed to add custom pattern {cp.name}: {e}")

        return AnalyzerEngine(nlp_engine=self._nlp_engine, registry=registry)

    def configure(
        self,
        allowed_types: list[str] | None = None,
        custom_patterns: list[CustomPattern] | None = None,
    ) -> None:
        """Update configuration at runtime.

        Args:
            allowed_types: If set, only detect these PII types (None = all types)
            custom_patterns: Custom regex patterns to detect
        """
        if allowed_types is not None:
            self.allowed_types = [t.lower() for t in allowed_types]

        if custom_patterns is not None:
            self.custom_patterns = custom_patterns
            # Rebuild the analyzer so new custom patterns take effect
            # immediately (previously they were only applied at init).
            self.analyzer = self._build_analyzer()
        logger.info(
            "PII detector config updated",
            allowed_types=self.allowed_types,
        )

    def _filter_by_allowed_types(self, findings: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Filter findings to only include allowed types.

        Args:
            findings: All detected findings

        Returns:
            Filtered findings (only allowed types)
        """
        if self.allowed_types is None:
            return findings  # No filter - return all

        filtered = [f for f in findings if f["type"].lower() in self.allowed_types]

        if len(filtered) != len(findings):
            logger.debug(
                "PII findings filtered by allowed types",
                original_count=len(findings),
                filtered_count=len(filtered),
                allowed_types=self.allowed_types,
            )

        return filtered

    def scan(self, text: str | dict[str, Any], filter_types: bool = True) -> list[dict[str, Any]]:
        """Scan text or dict for PII using Presidio.

        Args:
            text: Text or dict to scan
            filter_types: Whether to filter by allowed_types (default True)

        Returns:
            List of PII findings with type, value, position, confidence
        """
        if not self.enabled:
            return []

        # Convert dict to JSON string so nested values are scanned too.
        if isinstance(text, dict):
            text = json.dumps(text)

        # Analyze with Presidio
        results = self.analyzer.analyze(
            text=text,
            language="en",
            score_threshold=self.confidence_threshold,
        )

        findings = [
            {
                "type": result.entity_type.lower(),
                "value": text[result.start : result.end],
                "start": result.start,
                "end": result.end,
                "confidence": result.score,
            }
            for result in results
        ]

        # Filter by allowed types if configured
        if filter_types:
            findings = self._filter_by_allowed_types(findings)

        if findings:
            logger.warning(
                "PII detected",
                count=len(findings),
                types=list(set(f["type"] for f in findings)),
            )

        return findings

    def detect(self, text: str | dict[str, Any]) -> PIIDetectionResult:
        """Detect PII in text and return structured result.

        Args:
            text: Text or dict to scan

        Returns:
            PIIDetectionResult with detection details and counts per type
        """
        findings = self.scan(text)

        if not findings:
            return PIIDetectionResult(
                detected=False,
                count=0,
                types=[],
                counts_per_type={},
                unique_values_per_type={},
                findings=[],
            )

        # Count raw occurrences per type.
        type_counts = Counter(f["type"] for f in findings)

        # Count unique values per type (deduplicate by normalized value).
        unique_per_type: dict[str, set[str]] = {}
        for f in findings:
            pii_type = f["type"]
            # Lowercase + strip so trivially different spellings count once.
            normalized_value = f["value"].lower().strip()
            unique_per_type.setdefault(pii_type, set()).add(normalized_value)

        unique_counts = {t: len(values) for t, values in unique_per_type.items()}

        return PIIDetectionResult(
            detected=True,
            count=len(findings),
            types=list(type_counts.keys()),
            counts_per_type=dict(type_counts),
            unique_values_per_type=unique_counts,
            findings=findings,
        )

    def has_pii(self, text: str | dict[str, Any]) -> bool:
        """Check if text contains PII.

        Args:
            text: Text or dict to check

        Returns:
            True if PII detected
        """
        return len(self.scan(text)) > 0

    def redact(self, text: str | dict[str, Any]) -> tuple[str | dict, list[dict[str, Any]]]:
        """Scan and redact PII using Presidio anonymizer.

        Only redacts PII types that are in allowed_types (if configured).

        Args:
            text: Text or dict to redact

        Returns:
            (redacted_content, findings)
        """
        # Honor the enabled flag, consistent with scan(). Previously a
        # disabled detector still analyzed and redacted here.
        if not self.enabled:
            return text, []

        is_dict = isinstance(text, dict)
        original_text = json.dumps(text) if is_dict else text

        # Analyze for PII
        findings_raw = self.analyzer.analyze(
            text=original_text,
            language="en",
            score_threshold=self.confidence_threshold,
        )

        if not findings_raw:
            return text, []

        # Filter by allowed types if configured
        if self.allowed_types is not None:
            findings_raw = [
                f for f in findings_raw
                if f.entity_type.lower() in self.allowed_types
            ]

            if not findings_raw:
                return text, []

        # Anonymize with Presidio - create operator per entity type so each
        # replacement is labeled with its own type.
        operators = {}
        for result in findings_raw:
            entity_type = result.entity_type
            operators[entity_type] = OperatorConfig(
                "replace", {"new_value": f"[REDACTED-{entity_type}]"}
            )

        anonymized = self.anonymizer.anonymize(
            text=original_text,
            analyzer_results=findings_raw,
            operators=operators,
        )

        # Convert Presidio results to plain finding dicts.
        findings = [
            {
                "type": result.entity_type.lower(),
                "value": original_text[result.start : result.end],
                "start": result.start,
                "end": result.end,
                "confidence": result.score,
            }
            for result in findings_raw
        ]

        logger.info(
            "PII redacted",
            count=len(findings),
            types=list(set(f["type"] for f in findings)),
            allowed_types=self.allowed_types,
        )

        # Convert back to dict if original was dict
        if is_dict:
            try:
                return json.loads(anonymized.text), findings
            except json.JSONDecodeError:
                # Redacting inside JSON values can corrupt the structure;
                # fall back to the original rather than return broken JSON.
                logger.warning("Redaction broke JSON structure, returning original")
                return text, findings

        return anonymized.text, findings
@@ -0,0 +1,206 @@
1
+ """Secrets detection using detect-secrets.
2
+
3
+ Production-grade secret detection using Yelp's detect-secrets.
4
+ """
5
+
6
+ import json
7
+ import re
8
+ from collections import Counter
9
+ from dataclasses import dataclass, field
10
+ from typing import Any
11
+
12
+ import structlog
13
+ from detect_secrets.core.secrets_collection import SecretsCollection
14
+ from detect_secrets.plugins.aws import AWSKeyDetector
15
+ from detect_secrets.plugins.basic_auth import BasicAuthDetector
16
+ from detect_secrets.plugins.github_token import GitHubTokenDetector
17
+ from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString
18
+ from detect_secrets.plugins.jwt import JwtTokenDetector
19
+ from detect_secrets.plugins.private_key import PrivateKeyDetector
20
+ from detect_secrets.plugins.slack import SlackDetector
21
+
22
+ logger = structlog.get_logger(__name__)
23
+
24
+
25
@dataclass
class SecretsDetectionResult:
    """Result of secrets detection scan."""
    # True when at least one secret was found.
    detected: bool
    # Total number of raw matches across all plugins.
    count: int
    # Unique secret types found (plugin-reported type names).
    types: list[str]
    # Number of matches per secret type.
    counts_per_type: dict[str, int] = field(default_factory=dict)
    # Per-match details; the "value" key is always masked ("***").
    findings: list[dict[str, Any]] = field(default_factory=list)
33
+
34
+
35
class SecretsDetector:
    """Production-grade secrets detection using detect-secrets.

    Uses multiple plugins to detect:
    - AWS keys (AKIA...)
    - GitHub tokens (ghp_, gho_, etc.)
    - Slack tokens (xoxb-, xoxp-, etc.)
    - JWT tokens
    - Private keys (RSA, SSH)
    - Basic auth credentials
    - High entropy strings
    """

    def __init__(self, enabled: bool = True):
        """Initialize secrets detector.

        Args:
            enabled: Whether secrets detection is enabled
        """
        self.enabled = enabled

        # Initialize all plugins
        self.plugins = [
            AWSKeyDetector(),
            GitHubTokenDetector(),
            SlackDetector(),
            BasicAuthDetector(),
            PrivateKeyDetector(),
            JwtTokenDetector(),
            HexHighEntropyString(limit=3.0),  # High entropy threshold
        ]

        logger.info(
            "Secrets detector initialized",
            plugins=len(self.plugins),
        )

    def scan(self, text: str | dict[str, Any]) -> list[dict[str, Any]]:
        """Scan text or dict for secrets.

        Args:
            text: Text or dict to scan

        Returns:
            List of secret findings (the "value" key is always masked)
        """
        if not self.enabled:
            return []

        # Convert dict to JSON string so nested values are scanned too.
        if isinstance(text, dict):
            text = json.dumps(text)

        findings = []

        # Scan with each plugin; one broken plugin must not abort the scan.
        for plugin in self.plugins:
            try:
                secrets = plugin.analyze_line(filename="inline", line=text, line_number=0)

                for secret in secrets:
                    # Never log or store the actual secret value.
                    findings.append(
                        {
                            "type": secret.type,
                            "value": "***",  # Redacted
                            "start": secret.line_number,
                            "end": secret.line_number,
                            "confidence": 0.9,
                        }
                    )
            except Exception as e:
                logger.debug(f"Plugin {plugin.__class__.__name__} error: {e}")

        if findings:
            logger.warning(
                "Secrets detected",
                count=len(findings),
                types=list(set(f["type"] for f in findings)),
            )

        return findings

    def detect(self, text: str | dict[str, Any]) -> SecretsDetectionResult:
        """Detect secrets in text and return structured result.

        Args:
            text: Text or dict to scan

        Returns:
            SecretsDetectionResult with detection details and counts per type
        """
        findings = self.scan(text)

        if not findings:
            return SecretsDetectionResult(
                detected=False,
                count=0,
                types=[],
                counts_per_type={},
                findings=[],
            )

        # Count occurrences per type
        type_counts = Counter(f["type"] for f in findings)

        return SecretsDetectionResult(
            detected=True,
            count=len(findings),
            types=list(type_counts.keys()),
            counts_per_type=dict(type_counts),
            findings=findings,
        )

    def has_secrets(self, text: str | dict[str, Any]) -> bool:
        """Check if text contains secrets.

        Args:
            text: Text or dict to check

        Returns:
            True if secrets detected
        """
        return len(self.scan(text)) > 0

    def redact(self, text: str | dict[str, Any]) -> tuple[str | dict, list[dict[str, Any]]]:
        """Scan and redact secrets.

        Args:
            text: Text or dict to redact

        Returns:
            (redacted_content, findings)
        """
        is_dict = isinstance(text, dict)
        original_text = json.dumps(text) if is_dict else text

        # Scan for secrets
        findings = self.scan(original_text)

        if not findings:
            return text, []

        # Redact known patterns
        redacted = original_text

        # AWS access key IDs
        redacted = re.sub(r"AKIA[0-9A-Z]{16}", "[REDACTED-AWS_KEY]", redacted)

        # GitHub classic tokens: ghp_ (personal), gho_ (OAuth), ghu_
        # (user-to-server), ghs_ (server-to-server), ghr_ (refresh).
        # The previous pattern gh[ps]_ missed gho_/ghu_/ghr_ even though
        # the GitHubTokenDetector plugin reports them.
        redacted = re.sub(r"gh[pousr]_[A-Za-z0-9]{36}", "[REDACTED-GITHUB_TOKEN]", redacted)

        # Slack tokens
        redacted = re.sub(r"xox[baprs]-[A-Za-z0-9-]+", "[REDACTED-SLACK_TOKEN]", redacted)

        # JWTs: three base64url segments; the header of a JSON JWT always
        # starts with eyJ ('{"' base64-encoded). Previously detected but
        # never redacted, leaking tokens into the "redacted" output.
        redacted = re.sub(
            r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]*",
            "[REDACTED-JWT]",
            redacted,
        )

        # PEM private key blocks (RSA, EC, OpenSSH, generic). Previously
        # detected but never redacted.
        redacted = re.sub(
            r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----",
            "[REDACTED-PRIVATE_KEY]",
            redacted,
        )

        # Generic high-entropy strings are deliberately NOT redacted:
        # a blanket long-alphanumeric regex has too many false positives
        # (hashes, ids, base64 payloads).

        logger.info(
            "Secrets redacted",
            count=len(findings),
            types=list(set(f["type"] for f in findings)),
        )

        # Convert back to dict if needed
        if is_dict:
            try:
                return json.loads(redacted), findings
            except json.JSONDecodeError:
                # Redaction corrupted the JSON; return the original rather
                # than broken structure.
                return text, findings

        return redacted, findings
@@ -0,0 +1,3 @@
1
+ """Interceptors for different agent interaction types (LLM, MCP, etc.)."""
2
+
3
+ __all__ = []
@@ -0,0 +1,62 @@
1
+ """LLM call interceptor for governance.
2
+
3
+ Intercepts calls to language model APIs (OpenAI, Anthropic, etc.) and records
4
+ them for observability. In enforcement mode, also evaluates policies.
5
+
6
+ GUARDRAILS APPLY HERE - LLMs should NOT receive sensitive data.
7
+ Unlike tools (which NEED the data), LLM prompts should be sanitized.
8
+
9
+ This allows compliance teams to see:
10
+ - What models are being used
11
+ - What data is being sent to LLMs
12
+ - ⚠️ PII in prompts (risk alert)
13
+ - 🚨 Secrets in prompts (critical risk)
14
+ - Prompt manipulation attempts
15
+ """
16
+
17
+ from typing import Any
18
+
19
+ import structlog
20
+
21
+ logger = structlog.get_logger(__name__)
22
+
23
+
24
+ class LLMInterceptor:
25
+ """Intercepts and observes LLM API calls.
26
+
27
+ Supports:
28
+ - OpenAI (ChatCompletion, Completion)
29
+ - Anthropic (Claude)
30
+ - Azure OpenAI
31
+ - Generic LLM APIs
32
+
33
+ GUARDRAILS APPLY HERE because:
34
+ - LLMs should NOT receive PII (data leak to external model)
35
+ - LLMs should NOT receive secrets (credential exposure)
36
+ - Prompts should be checked for manipulation attempts
37
+
38
+ This is different from tools, which NEED the sensitive data to work.
39
+ """
40
+
41
+ def __init__(self, cortex_hub: Any): # Type: CortexHub
42
+ """Initialize LLM interceptor.
43
+
44
+ Args:
45
+ cortex_hub: CortexHub instance for policy enforcement
46
+ """
47
+ self.cortex_hub = cortex_hub
48
+ self._openai_patched = False
49
+ self._anthropic_patched = False
50
+ logger.info("LLM interceptor initialized")
51
+
52
+ def intercept_openai(self) -> None:
53
+ """Provider-specific interception disabled (use framework adapters)."""
54
+ logger.info("OpenAI interception disabled; use framework adapters")
55
+
56
+ def intercept_anthropic(self) -> None:
57
+ """Provider-specific interception disabled (use framework adapters)."""
58
+ logger.info("Anthropic interception disabled; use framework adapters")
59
+
60
+ def apply_all(self) -> None:
61
+ """Provider-specific interception disabled (use framework adapters)."""
62
+ logger.info("LLM interception is handled by framework adapters")