glacis 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glacis/__init__.py +62 -1
- glacis/__main__.py +1 -80
- glacis/client.py +60 -31
- glacis/config.py +141 -0
- glacis/controls/__init__.py +232 -0
- glacis/controls/base.py +104 -0
- glacis/controls/jailbreak.py +224 -0
- glacis/controls/pii.py +855 -0
- glacis/crypto.py +70 -1
- glacis/integrations/__init__.py +53 -3
- glacis/integrations/anthropic.py +207 -142
- glacis/integrations/base.py +476 -0
- glacis/integrations/openai.py +156 -121
- glacis/models.py +209 -16
- glacis/storage.py +324 -8
- glacis/verify.py +154 -0
- glacis-0.2.0.dist-info/METADATA +275 -0
- glacis-0.2.0.dist-info/RECORD +21 -0
- glacis/wasm/s3p_core_wasi.wasm +0 -0
- glacis/wasm_runtime.py +0 -533
- glacis-0.1.4.dist-info/METADATA +0 -324
- glacis-0.1.4.dist-info/RECORD +0 -16
- {glacis-0.1.4.dist-info → glacis-0.2.0.dist-info}/WHEEL +0 -0
- {glacis-0.1.4.dist-info → glacis-0.2.0.dist-info}/licenses/LICENSE +0 -0
glacis/controls/pii.py
ADDED
|
@@ -0,0 +1,855 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PII/PHI Redaction Control.
|
|
3
|
+
|
|
4
|
+
HIPAA-compliant detection and redaction of the 18 Safe Harbor identifiers
|
|
5
|
+
using Microsoft Presidio with custom healthcare-specific recognizers.
|
|
6
|
+
|
|
7
|
+
Supported backends:
|
|
8
|
+
- presidio: Microsoft Presidio (default)
|
|
9
|
+
|
|
10
|
+
Two modes:
|
|
11
|
+
- fast: Regex-only detection (<2ms typical)
|
|
12
|
+
- full: Regex + spaCy NER (~15-20ms typical)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import time
|
|
19
|
+
import warnings
|
|
20
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
21
|
+
|
|
22
|
+
from glacis.controls.base import BaseControl, ControlResult
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from presidio_analyzer import AnalyzerEngine, RecognizerResult
|
|
26
|
+
from presidio_anonymizer import AnonymizerEngine
|
|
27
|
+
|
|
28
|
+
from glacis.config import PiiPhiConfig
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger("glacis.controls.pii")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Supported backends for PII detection
|
|
34
|
+
SUPPORTED_BACKENDS = ["presidio"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class PIIControl(BaseControl):
|
|
38
|
+
"""
|
|
39
|
+
PII/PHI detection and redaction control.
|
|
40
|
+
|
|
41
|
+
Uses Microsoft Presidio with custom recognizers for the 18 HIPAA Safe Harbor
|
|
42
|
+
identifiers. Supports two operating modes:
|
|
43
|
+
|
|
44
|
+
- "fast": Regex-only detection, typically <2ms
|
|
45
|
+
- "full": Regex + spaCy NER for improved name/location detection, ~15-20ms
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
config: PiiPhiConfig with enabled, backend, and mode settings
|
|
49
|
+
|
|
50
|
+
Example:
|
|
51
|
+
>>> from glacis.config import PiiPhiConfig
|
|
52
|
+
>>> config = PiiPhiConfig(enabled=True, backend="presidio", mode="fast")
|
|
53
|
+
>>> control = PIIControl(config)
|
|
54
|
+
>>> result = control.check("SSN: 123-45-6789")
|
|
55
|
+
>>> result.detected
|
|
56
|
+
True
|
|
57
|
+
>>> result.modified_text
|
|
58
|
+
"SSN: [US_SSN]"
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
control_type = "pii"
|
|
62
|
+
|
|
63
|
+
# HIPAA Safe Harbor entity types
|
|
64
|
+
HIPAA_ENTITIES: list[str] = [
|
|
65
|
+
# Presidio OOTB
|
|
66
|
+
"PERSON",
|
|
67
|
+
"DATE_TIME",
|
|
68
|
+
"PHONE_NUMBER",
|
|
69
|
+
"EMAIL_ADDRESS",
|
|
70
|
+
"US_SSN",
|
|
71
|
+
"US_DRIVER_LICENSE",
|
|
72
|
+
"URL",
|
|
73
|
+
"IP_ADDRESS",
|
|
74
|
+
"CREDIT_CARD",
|
|
75
|
+
"US_BANK_NUMBER",
|
|
76
|
+
"IBAN_CODE",
|
|
77
|
+
"US_PASSPORT",
|
|
78
|
+
"US_ITIN",
|
|
79
|
+
# Custom HIPAA-specific
|
|
80
|
+
"MEDICAL_RECORD_NUMBER",
|
|
81
|
+
"HEALTH_PLAN_BENEFICIARY",
|
|
82
|
+
"NPI",
|
|
83
|
+
"DEA_NUMBER",
|
|
84
|
+
"MEDICAL_LICENSE",
|
|
85
|
+
"US_ZIP_CODE",
|
|
86
|
+
"STREET_ADDRESS",
|
|
87
|
+
"VIN",
|
|
88
|
+
"LICENSE_PLATE",
|
|
89
|
+
"DEVICE_SERIAL",
|
|
90
|
+
"UDI",
|
|
91
|
+
"IMEI",
|
|
92
|
+
"FAX_NUMBER",
|
|
93
|
+
"BIOMETRIC_ID",
|
|
94
|
+
"UUID",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
def __init__(self, config: "PiiPhiConfig") -> None:
    """
    Record the configuration and set up empty, lazily-filled engine slots.

    No Presidio objects are created here; they are built on first use by
    ``_ensure_initialized``.

    Args:
        config: Settings object; its ``backend`` and ``mode`` attributes are read.

    Raises:
        ValueError: If ``config.backend`` is not in SUPPORTED_BACKENDS.
    """
    self._config = config
    self._mode = config.mode

    # Refuse unknown backends up front
    backend_supported = config.backend in SUPPORTED_BACKENDS
    if not backend_supported:
        raise ValueError(
            f"Unknown PII backend: {config.backend}. "
            f"Available: {SUPPORTED_BACKENDS}"
        )

    # "full" mode uses a stricter default threshold to reduce NER false
    # positives; every other mode value gets 0.5
    self._score_threshold = 0.7 if self._mode == "full" else 0.5

    # Engine state, populated by _ensure_initialized()
    self._analyzer: Optional["AnalyzerEngine"] = None
    self._anonymizer: Optional["AnonymizerEngine"] = None
    self._spacy_available: bool = False
    self._initialized: bool = False
|
|
118
|
+
|
|
119
|
+
def _ensure_initialized(self) -> None:
|
|
120
|
+
"""Lazy initialization of Presidio engines."""
|
|
121
|
+
if self._initialized:
|
|
122
|
+
return
|
|
123
|
+
|
|
124
|
+
# Suppress noisy loggers from Presidio and spaCy
|
|
125
|
+
for logger_name in [
|
|
126
|
+
"presidio-analyzer",
|
|
127
|
+
"presidio-anonymizer",
|
|
128
|
+
"presidio_analyzer",
|
|
129
|
+
"presidio_anonymizer",
|
|
130
|
+
"spacy",
|
|
131
|
+
]:
|
|
132
|
+
logging.getLogger(logger_name).setLevel(logging.WARNING)
|
|
133
|
+
|
|
134
|
+
try:
|
|
135
|
+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
|
|
136
|
+
from presidio_anonymizer import AnonymizerEngine
|
|
137
|
+
except ImportError as e:
|
|
138
|
+
raise ImportError(
|
|
139
|
+
"PII control requires presidio-analyzer and presidio-anonymizer. "
|
|
140
|
+
"Install with: pip install glacis[redaction]"
|
|
141
|
+
) from e
|
|
142
|
+
|
|
143
|
+
registry = RecognizerRegistry()
|
|
144
|
+
|
|
145
|
+
if self._mode == "fast":
|
|
146
|
+
# Fast mode: Only pattern-based recognizers, NO spaCy
|
|
147
|
+
for recognizer in self._build_core_pattern_recognizers():
|
|
148
|
+
registry.add_recognizer(recognizer)
|
|
149
|
+
else:
|
|
150
|
+
# Full mode: Load all predefined recognizers including SpacyRecognizer
|
|
151
|
+
registry.load_predefined_recognizers()
|
|
152
|
+
for recognizer in self._build_core_pattern_recognizers():
|
|
153
|
+
registry.add_recognizer(recognizer)
|
|
154
|
+
|
|
155
|
+
# Add custom HIPAA recognizers
|
|
156
|
+
for recognizer in self._build_healthcare_recognizers():
|
|
157
|
+
registry.add_recognizer(recognizer)
|
|
158
|
+
for recognizer in self._build_geographic_recognizers():
|
|
159
|
+
registry.add_recognizer(recognizer)
|
|
160
|
+
for recognizer in self._build_identifier_recognizers():
|
|
161
|
+
registry.add_recognizer(recognizer)
|
|
162
|
+
|
|
163
|
+
# Configure NLP engine based on mode
|
|
164
|
+
if self._mode == "full":
|
|
165
|
+
nlp_engine = self._try_load_spacy()
|
|
166
|
+
if nlp_engine:
|
|
167
|
+
self._analyzer = AnalyzerEngine(
|
|
168
|
+
registry=registry,
|
|
169
|
+
nlp_engine=nlp_engine,
|
|
170
|
+
supported_languages=["en"],
|
|
171
|
+
)
|
|
172
|
+
self._spacy_available = True
|
|
173
|
+
else:
|
|
174
|
+
warnings.warn(
|
|
175
|
+
"spaCy model 'en_core_web_md' not available. "
|
|
176
|
+
"Falling back to 'fast' mode (regex-only). "
|
|
177
|
+
"Install with: python -m spacy download en_core_web_md",
|
|
178
|
+
UserWarning,
|
|
179
|
+
stacklevel=2,
|
|
180
|
+
)
|
|
181
|
+
self._analyzer = AnalyzerEngine(
|
|
182
|
+
registry=registry,
|
|
183
|
+
supported_languages=["en"],
|
|
184
|
+
)
|
|
185
|
+
self._spacy_available = False
|
|
186
|
+
else:
|
|
187
|
+
self._analyzer = AnalyzerEngine(
|
|
188
|
+
registry=registry,
|
|
189
|
+
supported_languages=["en"],
|
|
190
|
+
)
|
|
191
|
+
self._spacy_available = False
|
|
192
|
+
|
|
193
|
+
self._anonymizer = AnonymizerEngine()
|
|
194
|
+
self._initialized = True
|
|
195
|
+
|
|
196
|
+
def _try_load_spacy(self) -> Optional[Any]:
    """
    Try to create a Presidio NLP engine backed by spaCy's ``en_core_web_md``.

    Returns:
        The engine produced by ``NlpEngineProvider.create_engine()``, or
        ``None`` if anything raised along the way (the failure is logged at
        debug level, not propagated).
    """
    try:
        from presidio_analyzer.nlp_engine import NlpEngineProvider

        english_model = {"lang_code": "en", "model_name": "en_core_web_md"}
        provider = NlpEngineProvider(
            nlp_configuration={
                "nlp_engine_name": "spacy",
                "models": [english_model],
            }
        )
        engine = provider.create_engine()
    except Exception as e:
        # Best effort: missing package or missing model both end up here
        logger.debug(f"Failed to load spaCy: {e}")
        return None
    else:
        return engine
|
|
210
|
+
|
|
211
|
+
def check(self, text: str) -> ControlResult:
    """
    Scan text for PII/PHI and replace each detection with ``[ENTITY_TYPE]``.

    Args:
        text: Input text to scan.

    Returns:
        ControlResult. For empty or whitespace-only text, or when nothing is
        detected, ``action`` is "pass" and ``modified_text`` is the input
        unchanged. Otherwise ``action`` is "redact", ``categories`` holds
        the sorted distinct entity types, ``modified_text`` is the
        anonymized text, and ``metadata["count"]`` is the number of
        detections kept after overlap resolution.
    """
    self._ensure_initialized()

    start_time = time.perf_counter()

    # Nothing to scan: skip the analyzer and report zero latency
    if not (text and text.strip()):
        return ControlResult(
            control_type=self.control_type,
            detected=False,
            action="pass",
            categories=[],
            latency_ms=0,
            modified_text=text,
            metadata={"backend": self._config.backend, "mode": self._mode},
        )

    assert self._analyzer is not None
    assert self._anonymizer is not None

    findings: list["RecognizerResult"] = self._analyzer.analyze(
        text=text,
        language="en",
        score_threshold=self._score_threshold,
    )

    # Captured right after analysis, so the anonymization step below is
    # not part of the reported latency
    latency_ms = int((time.perf_counter() - start_time) * 1000)

    if not findings:
        return ControlResult(
            control_type=self.control_type,
            detected=False,
            action="pass",
            categories=[],
            latency_ms=latency_ms,
            modified_text=text,
            metadata={"backend": self._config.backend, "mode": self._mode},
        )

    # Drop overlapping detections before redacting
    findings = self._resolve_overlaps(findings)

    from presidio_anonymizer.entities import OperatorConfig

    # One "replace" operator per entity type, producing "[ENTITY_TYPE]"
    entity_types = set(r.entity_type for r in findings)
    operators = {
        entity_type: OperatorConfig(
            "replace", {"new_value": f"[{entity_type}]"}
        )
        for entity_type in entity_types
    }

    anonymized = self._anonymizer.anonymize(
        text=text,
        analyzer_results=findings,
        operators=operators,
    )

    return ControlResult(
        control_type=self.control_type,
        detected=True,
        action="redact",
        categories=sorted(entity_types),
        latency_ms=latency_ms,
        modified_text=anonymized.text,
        metadata={
            "backend": self._config.backend,
            "mode": self._mode,
            "count": len(findings),
        },
    )
|
|
291
|
+
|
|
292
|
+
def close(self) -> None:
    """Drop the engine references and mark the control as uninitialized."""
    # Clearing _initialized lets the next _ensure_initialized() rebuild them
    self._analyzer = self._anonymizer = None
    self._initialized = False
|
|
297
|
+
|
|
298
|
+
def _resolve_overlaps(self, results: list[Any]) -> list[Any]:
|
|
299
|
+
"""Resolve overlapping detections by keeping highest confidence."""
|
|
300
|
+
if not results:
|
|
301
|
+
return results
|
|
302
|
+
|
|
303
|
+
sorted_results = sorted(
|
|
304
|
+
results, key=lambda r: (r.start, -r.score, -(r.end - r.start))
|
|
305
|
+
)
|
|
306
|
+
|
|
307
|
+
merged: list[Any] = []
|
|
308
|
+
for result in sorted_results:
|
|
309
|
+
overlaps = False
|
|
310
|
+
for kept in merged:
|
|
311
|
+
if result.start < kept.end and result.end > kept.start:
|
|
312
|
+
overlaps = True
|
|
313
|
+
break
|
|
314
|
+
|
|
315
|
+
if not overlaps:
|
|
316
|
+
merged.append(result)
|
|
317
|
+
|
|
318
|
+
return merged
|
|
319
|
+
|
|
320
|
+
# =========================================================================
|
|
321
|
+
# Pattern Recognizer Builders (from original RedactionEngine)
|
|
322
|
+
# =========================================================================
|
|
323
|
+
|
|
324
|
+
def _build_core_pattern_recognizers(self) -> list[Any]:
    """
    Build the core regex recognizers (SSN, email, phone, card, IP, URL,
    driver license, dates, context-prefixed person names).

    Returns:
        One ``PatternRecognizer`` per entity type, in the order listed in
        the spec table below.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    month = (
        r"(?:January|February|March|April|May|June|July|August"
        r"|September|October|November|December)"
    )
    full_name = r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)"

    # (entity, context words or None, [(pattern name, regex, score), ...])
    specs: list[tuple[str, Optional[list[str]], list[tuple[str, str, float]]]] = [
        (
            "US_SSN",
            ["ssn", "social security", "social security number"],
            [
                ("ssn_dashes", r"\b\d{3}-\d{2}-\d{4}\b", 0.85),
                ("ssn_spaces", r"\b\d{3}\s\d{2}\s\d{4}\b", 0.85),
                # Bare 9 digits are ambiguous, hence the low base score
                ("ssn_no_sep", r"\b\d{9}\b", 0.3),
            ],
        ),
        (
            "EMAIL_ADDRESS",
            None,
            [
                (
                    "email",
                    # TLD class is [A-Za-z]: a "|" inside a character class
                    # is a literal pipe, which let the match run across
                    # pipe-delimited fields.
                    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                    0.85,
                ),
            ],
        ),
        (
            "PHONE_NUMBER",
            ["phone", "telephone", "cell", "mobile", "call", "contact"],
            [
                (
                    "phone_with_parens",
                    r"\b\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
                    0.7,
                ),
                (
                    "phone_with_country",
                    r"\b\+?1?[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
                    0.7,
                ),
            ],
        ),
        (
            "CREDIT_CARD",
            ["credit card", "card number", "cc", "visa", "mastercard", "amex"],
            [
                (
                    "credit_card",
                    r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
                    0.8,
                ),
                ("credit_card_spaced", r"\b(?:\d{4}[\s-]?){3}\d{4}\b", 0.5),
            ],
        ),
        (
            "IP_ADDRESS",
            ["ip", "ip address", "address"],
            [
                (
                    "ipv4",
                    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
                    0.7,
                ),
            ],
        ),
        (
            "URL",
            None,
            [
                ("url", r"\bhttps?://[^\s<>\"{}|\\^`\[\]]+\b", 0.7),
            ],
        ),
        (
            "US_DRIVER_LICENSE",
            ["driver", "license", "dl", "driving"],
            [
                (
                    "driver_license_with_context",
                    r"\b(?:DL|Driver'?s?\s*License|License)[\s:#]*([A-Z0-9]{5,15})\b",
                    0.7,
                ),
            ],
        ),
        (
            "DATE_TIME",
            [
                "dob", "birth", "born", "date of birth",
                "birthday", "admitted", "discharged", "died",
            ],
            [
                (
                    "date_mdy_full",
                    r"\b(?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])[/\-.](?:19|20)\d{2}\b",
                    0.7,
                ),
                (
                    "date_iso",
                    r"\b(?:19|20)\d{2}[/\-.](?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])\b",
                    0.7,
                ),
                (
                    "date_written",
                    r"\b" + month + r"\s+\d{1,2},?\s+(?:19|20)\d{2}\b",
                    0.75,
                ),
                (
                    "date_written_dmy",
                    r"\b\d{1,2}\s+" + month + r"\s+(?:19|20)\d{2}\b",
                    0.75,
                ),
            ],
        ),
        (
            "PERSON",
            ["patient", "name", "client", "member", "doctor", "physician"],
            [
                (
                    "patient_name",
                    r"(?:Patient|Client|Member|Subscriber|Beneficiary)(?:\s+Name)?[\s:]+"
                    + full_name,
                    0.85,
                ),
                ("name_field", r"(?<!\w)Name[\s:]+" + full_name, 0.7),
                (
                    "doctor_name",
                    r"(?:Dr\.?|Doctor|Physician|Provider)[\s:]+" + full_name,
                    0.8,
                ),
            ],
        ),
    ]

    recognizers = []
    for entity, context, pattern_specs in specs:
        kwargs: dict[str, Any] = {
            "supported_entity": entity,
            "patterns": [
                Pattern(name=name, regex=regex, score=score)
                for name, regex, score in pattern_specs
            ],
        }
        # Entities without context words are built without the argument
        if context is not None:
            kwargs["context"] = context
        recognizers.append(PatternRecognizer(**kwargs))

    return recognizers
|
|
501
|
+
|
|
502
|
+
def _build_healthcare_recognizers(self) -> list[Any]:
    """
    Build the healthcare recognizers: medical record numbers, health plan
    beneficiary numbers, NPI, DEA numbers and medical licenses.

    Returns:
        One ``PatternRecognizer`` per entity type, in the order listed in
        the spec table below.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    # (entity, context words, [(pattern name, regex, score), ...])
    specs = [
        (
            "MEDICAL_RECORD_NUMBER",
            ["mrn", "medical record", "patient id", "chart number", "hospital number"],
            [
                (
                    "mrn_numeric",
                    r"\b(?:MRN|MR#?|Medical Record|Patient ID)[\s:#\-]*(\d{6,10})\b",
                    0.85,
                ),
                (
                    "mrn_alphanumeric",
                    r"\b(?:MRN|MR#?)[\s:#\-]*([A-Z]{1,3}[\-]?\d{6,10})\b",
                    0.85,
                ),
                ("mrn_standalone", r"\b[A-Z]{2,3}\d{7,10}\b", 0.4),
            ],
        ),
        (
            "HEALTH_PLAN_BENEFICIARY",
            [
                "medicare", "medicaid", "member", "subscriber",
                "beneficiary", "insurance", "policy", "group",
                "health plan", "coverage",
            ],
            [
                (
                    "medicare_new",
                    r"\b[1-9][A-Z][A-Z0-9]\d-?[A-Z][A-Z0-9]\d-?[A-Z][A-Z0-9]\d{2}\b",
                    0.85,
                ),
                ("medicare_legacy", r"\b\d{3}-?\d{2}-?\d{4}[A-Z]{1,2}\b", 0.75),
                (
                    "member_id_generic",
                    r"\b(?:Member ID|Policy|Subscriber|Beneficiary)[\s:#]*([A-Z0-9]{9,15})\b",
                    0.8,
                ),
                ("group_number", r"\b(?:Group|GRP)[\s:#]*([A-Z0-9]{5,12})\b", 0.7),
            ],
        ),
        (
            "NPI",
            ["npi", "national provider", "provider identifier", "prescriber"],
            [
                (
                    "npi_with_context",
                    r"\b(?:NPI|National Provider)[\s:#]*(\d{10})\b",
                    0.9,
                ),
                ("npi_standalone", r"\b[12]\d{9}\b", 0.5),
            ],
        ),
        (
            "DEA_NUMBER",
            ["dea", "drug enforcement", "controlled substance", "prescriber"],
            [
                (
                    "dea_with_context",
                    r"\b(?:DEA|Drug Enforcement)[\s:#]*([ABCDEFGHJKLMPRSTUX][A-Z9]\d{7})\b",
                    0.9,
                ),
                ("dea_standalone", r"\b[ABCDEFGHJKLMPRSTUX][A-Z9]\d{7}\b", 0.6),
            ],
        ),
        (
            "MEDICAL_LICENSE",
            [
                "license", "medical license", "state license",
                "board certified", "credentials", "physician",
                "practitioner",
            ],
            [
                (
                    "medical_license_with_context",
                    r"\b(?:License|Lic|Medical License)[\s:#]*([A-Z]{1,2}\d{5,8})\b",
                    0.8,
                ),
                (
                    "state_license",
                    r"\b(?:MD|DO|RN|NP|PA|DDS|DMD|DPM|DC|OD)[\s-]*(?:License|Lic)[\s:#]*(\d{4,8})\b",
                    0.85,
                ),
            ],
        ),
    ]

    built = []
    for entity, context_words, pattern_specs in specs:
        entity_patterns = [
            Pattern(name=label, regex=expr, score=weight)
            for label, expr, weight in pattern_specs
        ]
        built.append(
            PatternRecognizer(
                supported_entity=entity,
                patterns=entity_patterns,
                context=context_words,
            )
        )
    return built
|
|
637
|
+
|
|
638
|
+
def _build_geographic_recognizers(self) -> list[Any]:
    """
    Build the geographic recognizers: US ZIP codes and street addresses
    (street lines, PO boxes, apartment/suite designators).

    Returns:
        Two ``PatternRecognizer`` objects: US_ZIP_CODE, then STREET_ADDRESS.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    # (entity, context words, [(pattern name, regex, score), ...])
    specs = [
        (
            "US_ZIP_CODE",
            ["zip", "zipcode", "zip code", "postal", "mailing"],
            [
                ("zip_plus_4", r"\b\d{5}-\d{4}\b", 0.7),
                # Bare 5-digit ZIPs are only matched behind a keyword
                (
                    "zip_5_with_context",
                    r"\b(?:zip|zip code|postal)[\s:#]*(\d{5})\b",
                    0.7,
                ),
            ],
        ),
        (
            "STREET_ADDRESS",
            ["address", "street", "mail", "ship", "deliver", "residence", "home"],
            [
                (
                    "street_address_full",
                    r"\b\d{1,5}\s+(?:[A-Z][a-z]+\s+){1,3}(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Court|Ct|Circle|Cir|Place|Pl|Terrace|Ter|Highway|Hwy)\.?\b",
                    0.7,
                ),
                (
                    "po_box",
                    r"\b(?:P\.?O\.?\s*Box|Post Office Box)\s*\d+\b",
                    0.85,
                ),
                (
                    "apt_suite",
                    r"\b(?:Apt|Apartment|Suite|Ste|Unit|#)\s*[A-Z0-9]+\b",
                    0.5,
                ),
            ],
        ),
    ]

    built = []
    for entity, context_words, pattern_specs in specs:
        entity_patterns = [
            Pattern(name=label, regex=expr, score=weight)
            for label, expr, weight in pattern_specs
        ]
        built.append(
            PatternRecognizer(
                supported_entity=entity,
                patterns=entity_patterns,
                context=context_words,
            )
        )
    return built
|
|
688
|
+
|
|
689
|
+
def _build_identifier_recognizers(self) -> list[Any]:
    """
    Build vehicle, device, and other identifier recognizers (VIN, license
    plate, device serial, UDI, IMEI, fax, biometric reference, UUID).

    Returns:
        One ``PatternRecognizer`` per entity type, in the order listed in
        the spec table below.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    # (entity, context words or None, [(pattern name, regex, score), ...])
    specs: list[tuple[str, Optional[list[str]], list[tuple[str, str, float]]]] = [
        (
            "VIN",
            ["vin", "vehicle", "car", "truck", "automobile", "registration"],
            [
                (
                    "vin_with_context",
                    r"\b(?:VIN|Vehicle ID)[\s:#]*([A-HJ-NPR-Z0-9]{17})\b",
                    0.9,
                ),
                ("vin_standalone", r"\b[A-HJ-NPR-Z0-9]{17}\b", 0.5),
            ],
        ),
        (
            "LICENSE_PLATE",
            ["license plate", "plate number", "tag", "vehicle registration", "dmv"],
            [
                (
                    "plate_with_context",
                    r"\b(?:License Plate|Plate|Tag|Plate #)[\s:#]*([A-Z0-9]{2,8})\b",
                    0.85,
                ),
            ],
        ),
        (
            "DEVICE_SERIAL",
            ["serial", "device", "equipment", "model", "asset"],
            [
                (
                    "serial_with_context",
                    r"\b(?:Serial|SN|S/N)[\s:#]*([A-Z0-9]{8,20})\b",
                    0.85,
                ),
                ("serial_common", r"\b[A-Z]{2,4}\d{6,12}[A-Z0-9]*\b", 0.4),
            ],
        ),
        (
            "UDI",
            ["udi", "unique device", "medical device", "implant", "fda"],
            [
                # No leading \b on the next two patterns: they begin with a
                # non-word character ("(" or "+"), and \b there would only
                # match when a word character immediately precedes it, so
                # an identifier at the start of the text or after
                # whitespace would never be detected.
                (
                    "udi_gs1",
                    r"\(01\)\d{14}(?:\(\d{2}\)[A-Z0-9]+)*\b",
                    0.9,
                ),
                ("udi_hibcc", r"\+[A-Z0-9]{4,}\/[A-Z0-9]+\b", 0.85),
                (
                    "udi_with_context",
                    r"\b(?:UDI|Unique Device)[\s:#]*([A-Z0-9\(\)\/\+]{10,})\b",
                    0.9,
                ),
            ],
        ),
        (
            "IMEI",
            ["imei", "mobile", "phone", "device", "cellular"],
            [
                (
                    "imei_with_context",
                    r"\b(?:IMEI|International Mobile)[\s:#]*(\d{15})\b",
                    0.9,
                ),
                ("imei_standalone", r"\b\d{2}-?\d{6}-?\d{6}-?\d\b", 0.6),
            ],
        ),
        (
            "FAX_NUMBER",
            ["fax", "facsimile"],
            [
                (
                    "fax_with_context",
                    r"\b(?:Fax|Facsimile|F)[\s:#]*(?:\+?1[\s.-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
                    0.9,
                ),
            ],
        ),
        (
            "BIOMETRIC_ID",
            ["biometric", "fingerprint", "retina", "iris", "voiceprint", "facial"],
            [
                (
                    "biometric_reference",
                    r"\b(?:fingerprint|retina|iris|voice\s*print|face\s*id|biometric)[\s:#]*(?:id|scan|data|template)[\s:#]*([A-Z0-9\-]{8,})\b",
                    0.85,
                ),
            ],
        ),
        (
            "UUID",
            None,
            [
                (
                    "uuid",
                    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
                    0.8,
                ),
            ],
        ),
    ]

    recognizers = []
    for entity, context, pattern_specs in specs:
        kwargs: dict[str, Any] = {
            "supported_entity": entity,
            "patterns": [
                Pattern(name=name, regex=regex, score=score)
                for name, regex, score in pattern_specs
            ],
        }
        # Entities without context words are built without the argument
        if context is not None:
            kwargs["context"] = context
        recognizers.append(PatternRecognizer(**kwargs))

    return recognizers
|
|
845
|
+
|
|
846
|
+
@property
|
|
847
|
+
def mode(self) -> str:
|
|
848
|
+
"""Current operating mode."""
|
|
849
|
+
return self._mode
|
|
850
|
+
|
|
851
|
+
@property
|
|
852
|
+
def is_spacy_available(self) -> bool:
|
|
853
|
+
"""Whether spaCy NLP is available."""
|
|
854
|
+
self._ensure_initialized()
|
|
855
|
+
return self._spacy_available
|