glacis 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
glacis/controls/pii.py ADDED
@@ -0,0 +1,855 @@
1
+ """
2
+ PII/PHI Redaction Control.
3
+
4
+ HIPAA-compliant detection and redaction of the 18 Safe Harbor identifiers
5
+ using Microsoft Presidio with custom healthcare-specific recognizers.
6
+
7
+ Supported backends:
8
+ - presidio: Microsoft Presidio (default)
9
+
10
+ Two modes:
11
+ - fast: Regex-only detection (<2ms typical)
12
+ - full: Regex + spaCy NER (~15-20ms typical)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import time
19
+ import warnings
20
+ from typing import TYPE_CHECKING, Any, Optional
21
+
22
+ from glacis.controls.base import BaseControl, ControlResult
23
+
24
+ if TYPE_CHECKING:
25
+ from presidio_analyzer import AnalyzerEngine, RecognizerResult
26
+ from presidio_anonymizer import AnonymizerEngine
27
+
28
+ from glacis.config import PiiPhiConfig
29
+
30
# Module-level logger; used below to report spaCy loading failures at DEBUG level.
logger = logging.getLogger("glacis.controls.pii")


# Supported backends for PII detection.
# PIIControl.__init__ raises ValueError for any config.backend not listed here.
SUPPORTED_BACKENDS = ["presidio"]
35
+
36
+
37
+ class PIIControl(BaseControl):
38
+ """
39
+ PII/PHI detection and redaction control.
40
+
41
+ Uses Microsoft Presidio with custom recognizers for the 18 HIPAA Safe Harbor
42
+ identifiers. Supports two operating modes:
43
+
44
+ - "fast": Regex-only detection, typically <2ms
45
+ - "full": Regex + spaCy NER for improved name/location detection, ~15-20ms
46
+
47
+ Args:
48
+ config: PiiPhiConfig with enabled, backend, and mode settings
49
+
50
+ Example:
51
+ >>> from glacis.config import PiiPhiConfig
52
+ >>> config = PiiPhiConfig(enabled=True, backend="presidio", mode="fast")
53
+ >>> control = PIIControl(config)
54
+ >>> result = control.check("SSN: 123-45-6789")
55
+ >>> result.detected
56
+ True
57
+ >>> result.modified_text
58
+ "SSN: [US_SSN]"
59
+ """
60
+
61
+ control_type = "pii"
62
+
63
+ # HIPAA Safe Harbor entity types
64
+ HIPAA_ENTITIES: list[str] = [
65
+ # Presidio OOTB
66
+ "PERSON",
67
+ "DATE_TIME",
68
+ "PHONE_NUMBER",
69
+ "EMAIL_ADDRESS",
70
+ "US_SSN",
71
+ "US_DRIVER_LICENSE",
72
+ "URL",
73
+ "IP_ADDRESS",
74
+ "CREDIT_CARD",
75
+ "US_BANK_NUMBER",
76
+ "IBAN_CODE",
77
+ "US_PASSPORT",
78
+ "US_ITIN",
79
+ # Custom HIPAA-specific
80
+ "MEDICAL_RECORD_NUMBER",
81
+ "HEALTH_PLAN_BENEFICIARY",
82
+ "NPI",
83
+ "DEA_NUMBER",
84
+ "MEDICAL_LICENSE",
85
+ "US_ZIP_CODE",
86
+ "STREET_ADDRESS",
87
+ "VIN",
88
+ "LICENSE_PLATE",
89
+ "DEVICE_SERIAL",
90
+ "UDI",
91
+ "IMEI",
92
+ "FAX_NUMBER",
93
+ "BIOMETRIC_ID",
94
+ "UUID",
95
+ ]
96
+
97
def __init__(self, config: "PiiPhiConfig") -> None:
    """
    Store the configuration and prepare (but do not build) the engines.

    The Presidio engines are created lazily on first use, so constructing
    the control does not import Presidio.

    Args:
        config: PiiPhiConfig providing ``backend`` and ``mode``

    Raises:
        ValueError: If ``config.backend`` is not in SUPPORTED_BACKENDS
    """
    self._config = config
    self._mode = config.mode

    # Reject backends we have no implementation for
    backend_is_known = config.backend in SUPPORTED_BACKENDS
    if not backend_is_known:
        raise ValueError(
            f"Unknown PII backend: {config.backend}. "
            f"Available: {SUPPORTED_BACKENDS}"
        )

    # "full" mode uses a stricter threshold to cut down NER false positives;
    # every other mode value gets the regex-oriented default.
    self._score_threshold = 0.7 if self._mode == "full" else 0.5

    # Lazily populated by _ensure_initialized()
    self._analyzer: Optional["AnalyzerEngine"] = None
    self._anonymizer: Optional["AnonymizerEngine"] = None
    self._spacy_available: bool = False
    self._initialized: bool = False
118
+
119
+ def _ensure_initialized(self) -> None:
120
+ """Lazy initialization of Presidio engines."""
121
+ if self._initialized:
122
+ return
123
+
124
+ # Suppress noisy loggers from Presidio and spaCy
125
+ for logger_name in [
126
+ "presidio-analyzer",
127
+ "presidio-anonymizer",
128
+ "presidio_analyzer",
129
+ "presidio_anonymizer",
130
+ "spacy",
131
+ ]:
132
+ logging.getLogger(logger_name).setLevel(logging.WARNING)
133
+
134
+ try:
135
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
136
+ from presidio_anonymizer import AnonymizerEngine
137
+ except ImportError as e:
138
+ raise ImportError(
139
+ "PII control requires presidio-analyzer and presidio-anonymizer. "
140
+ "Install with: pip install glacis[redaction]"
141
+ ) from e
142
+
143
+ registry = RecognizerRegistry()
144
+
145
+ if self._mode == "fast":
146
+ # Fast mode: Only pattern-based recognizers, NO spaCy
147
+ for recognizer in self._build_core_pattern_recognizers():
148
+ registry.add_recognizer(recognizer)
149
+ else:
150
+ # Full mode: Load all predefined recognizers including SpacyRecognizer
151
+ registry.load_predefined_recognizers()
152
+ for recognizer in self._build_core_pattern_recognizers():
153
+ registry.add_recognizer(recognizer)
154
+
155
+ # Add custom HIPAA recognizers
156
+ for recognizer in self._build_healthcare_recognizers():
157
+ registry.add_recognizer(recognizer)
158
+ for recognizer in self._build_geographic_recognizers():
159
+ registry.add_recognizer(recognizer)
160
+ for recognizer in self._build_identifier_recognizers():
161
+ registry.add_recognizer(recognizer)
162
+
163
+ # Configure NLP engine based on mode
164
+ if self._mode == "full":
165
+ nlp_engine = self._try_load_spacy()
166
+ if nlp_engine:
167
+ self._analyzer = AnalyzerEngine(
168
+ registry=registry,
169
+ nlp_engine=nlp_engine,
170
+ supported_languages=["en"],
171
+ )
172
+ self._spacy_available = True
173
+ else:
174
+ warnings.warn(
175
+ "spaCy model 'en_core_web_md' not available. "
176
+ "Falling back to 'fast' mode (regex-only). "
177
+ "Install with: python -m spacy download en_core_web_md",
178
+ UserWarning,
179
+ stacklevel=2,
180
+ )
181
+ self._analyzer = AnalyzerEngine(
182
+ registry=registry,
183
+ supported_languages=["en"],
184
+ )
185
+ self._spacy_available = False
186
+ else:
187
+ self._analyzer = AnalyzerEngine(
188
+ registry=registry,
189
+ supported_languages=["en"],
190
+ )
191
+ self._spacy_available = False
192
+
193
+ self._anonymizer = AnonymizerEngine()
194
+ self._initialized = True
195
+
196
def _try_load_spacy(self) -> Optional[Any]:
    """
    Try to create a spaCy-backed Presidio NLP engine.

    Returns:
        The engine built for the English ``en_core_web_md`` model, or None
        if anything goes wrong while importing or creating it (the failure
        is logged at DEBUG level).
    """
    try:
        from presidio_analyzer.nlp_engine import NlpEngineProvider

        spacy_settings = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_md"}],
        }
        return NlpEngineProvider(nlp_configuration=spacy_settings).create_engine()
    except Exception as e:
        # Best effort: the caller falls back to regex-only detection
        logger.debug(f"Failed to load spaCy: {e}")
    return None
210
+
211
def check(self, text: str) -> ControlResult:
    """
    Check text for PII/PHI and redact if found.

    Detected spans are replaced with ``[ENTITY_TYPE]`` placeholders.

    Args:
        text: Input text to check

    Returns:
        ControlResult with detection info and redacted text. For non-empty
        input ``latency_ms`` covers analysis, overlap resolution and
        anonymization; empty or whitespace-only input is passed through
        unchanged with a latency of 0.

    Raises:
        ImportError: If the Presidio packages are not installed (raised on
            the first non-empty input, when the engines are built)
    """
    base_metadata = {"backend": self._config.backend, "mode": self._mode}

    # Empty / whitespace-only input cannot contain PII: answer right away
    # instead of paying for (or requiring) the lazy Presidio initialization.
    if not text or not text.strip():
        return ControlResult(
            control_type=self.control_type,
            detected=False,
            action="pass",
            categories=[],
            latency_ms=0,
            modified_text=text,
            metadata=base_metadata,
        )

    self._ensure_initialized()

    # Type narrowing only: _ensure_initialized() has just set both engines
    assert self._analyzer is not None
    assert self._anonymizer is not None

    start_time = time.perf_counter()

    results: list["RecognizerResult"] = self._analyzer.analyze(
        text=text,
        language="en",
        score_threshold=self._score_threshold,
    )

    if not results:
        return ControlResult(
            control_type=self.control_type,
            detected=False,
            action="pass",
            categories=[],
            latency_ms=int((time.perf_counter() - start_time) * 1000),
            modified_text=text,
            metadata=base_metadata,
        )

    # De-duplicate overlapping detections
    results = self._resolve_overlaps(results)

    # Build operators for replacement format [ENTITY_TYPE]
    from presidio_anonymizer.entities import OperatorConfig

    entity_types = {r.entity_type for r in results}
    operators = {
        entity_type: OperatorConfig("replace", {"new_value": f"[{entity_type}]"})
        for entity_type in entity_types
    }

    anonymized = self._anonymizer.anonymize(
        text=text,
        analyzer_results=results,
        operators=operators,
    )

    # Measured after anonymization so the reported latency reflects the
    # full cost of the control, not just the analysis step.
    latency_ms = int((time.perf_counter() - start_time) * 1000)

    return ControlResult(
        control_type=self.control_type,
        detected=True,
        action="redact",
        categories=sorted(entity_types),
        latency_ms=latency_ms,
        modified_text=anonymized.text,
        metadata={**base_metadata, "count": len(results)},
    )
291
+
292
+ def close(self) -> None:
293
+ """Release resources."""
294
+ self._analyzer = None
295
+ self._anonymizer = None
296
+ self._initialized = False
297
+
298
+ def _resolve_overlaps(self, results: list[Any]) -> list[Any]:
299
+ """Resolve overlapping detections by keeping highest confidence."""
300
+ if not results:
301
+ return results
302
+
303
+ sorted_results = sorted(
304
+ results, key=lambda r: (r.start, -r.score, -(r.end - r.start))
305
+ )
306
+
307
+ merged: list[Any] = []
308
+ for result in sorted_results:
309
+ overlaps = False
310
+ for kept in merged:
311
+ if result.start < kept.end and result.end > kept.start:
312
+ overlaps = True
313
+ break
314
+
315
+ if not overlaps:
316
+ merged.append(result)
317
+
318
+ return merged
319
+
320
+ # =========================================================================
321
+ # Pattern Recognizer Builders (from original RedactionEngine)
322
+ # =========================================================================
323
+
324
def _build_core_pattern_recognizers(self) -> list[Any]:
    """
    Build core pattern-based recognizers.

    Covers SSN, email, phone, credit card, IPv4, URL, driver license,
    dates and context-labelled person names. Each entry of the table below
    is ``(entity, [(pattern_name, regex, score), ...], context_words)``;
    ``context_words`` is None for recognizers created without context.

    Returns:
        One PatternRecognizer per entity, in table order.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    months = (
        r"(?:January|February|March|April|May|June|July|August"
        r"|September|October|November|December)"
    )
    # Two or three capitalised words following a label such as "Patient:"
    # NOTE(review): Presidio appears to compile patterns case-insensitively
    # by default; if so this also matches lowercase words - confirm.
    full_name = r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)"

    specs = [
        (
            "US_SSN",
            [
                ("ssn_dashes", r"\b\d{3}-\d{2}-\d{4}\b", 0.85),
                ("ssn_spaces", r"\b\d{3}\s\d{2}\s\d{4}\b", 0.85),
                ("ssn_no_sep", r"\b\d{9}\b", 0.3),
            ],
            ["ssn", "social security", "social security number"],
        ),
        (
            "EMAIL_ADDRESS",
            [
                # TLD class is [A-Za-z]; a "|" inside a character class is
                # a literal pipe, not alternation.
                (
                    "email",
                    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                    0.85,
                ),
            ],
            None,
        ),
        (
            "PHONE_NUMBER",
            [
                # (?<!\w) instead of a leading \b: \b cannot match between
                # whitespace and "(" or "+", which left the opening paren or
                # plus sign outside the redacted span.
                (
                    "phone_with_parens",
                    r"(?<!\w)\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
                    0.7,
                ),
                (
                    "phone_with_country",
                    r"(?<!\w)(?:\+?1[\s.-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
                    0.7,
                ),
            ],
            ["phone", "telephone", "cell", "mobile", "call", "contact"],
        ),
        (
            "CREDIT_CARD",
            [
                (
                    "credit_card",
                    r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
                    0.8,
                ),
                ("credit_card_spaced", r"\b(?:\d{4}[\s-]?){3}\d{4}\b", 0.5),
            ],
            ["credit card", "card number", "cc", "visa", "mastercard", "amex"],
        ),
        (
            "IP_ADDRESS",
            [
                (
                    "ipv4",
                    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
                    0.7,
                ),
            ],
            ["ip", "ip address", "address"],
        ),
        (
            "URL",
            [
                ("url", r"\bhttps?://[^\s<>\"{}|\\^`\[\]]+\b", 0.7),
            ],
            None,
        ),
        (
            "US_DRIVER_LICENSE",
            [
                (
                    "driver_license_with_context",
                    r"\b(?:DL|Driver'?s?\s*License|License)[\s:#]*([A-Z0-9]{5,15})\b",
                    0.7,
                ),
            ],
            ["driver", "license", "dl", "driving"],
        ),
        (
            "DATE_TIME",
            [
                (
                    "date_mdy_full",
                    r"\b(?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])[/\-.](?:19|20)\d{2}\b",
                    0.7,
                ),
                (
                    "date_iso",
                    r"\b(?:19|20)\d{2}[/\-.](?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])\b",
                    0.7,
                ),
                (
                    "date_written",
                    r"\b" + months + r"\s+\d{1,2},?\s+(?:19|20)\d{2}\b",
                    0.75,
                ),
                (
                    "date_written_dmy",
                    r"\b\d{1,2}\s+" + months + r"\s+(?:19|20)\d{2}\b",
                    0.75,
                ),
            ],
            [
                "dob", "birth", "born", "date of birth",
                "birthday", "admitted", "discharged", "died",
            ],
        ),
        (
            "PERSON",
            [
                (
                    "patient_name",
                    r"(?:Patient|Client|Member|Subscriber|Beneficiary)(?:\s+Name)?[\s:]+"
                    + full_name,
                    0.85,
                ),
                ("name_field", r"(?<!\w)Name[\s:]+" + full_name, 0.7),
                (
                    "doctor_name",
                    r"(?:Dr\.?|Doctor|Physician|Provider)[\s:]+" + full_name,
                    0.8,
                ),
            ],
            ["patient", "name", "client", "member", "doctor", "physician"],
        ),
    ]

    recognizers = []
    for entity, pattern_specs, context_words in specs:
        kwargs: dict[str, Any] = {
            "supported_entity": entity,
            "patterns": [
                Pattern(name=name, regex=regex, score=score)
                for name, regex, score in pattern_specs
            ],
        }
        # Only pass context when the recognizer defines some
        if context_words is not None:
            kwargs["context"] = context_words
        recognizers.append(PatternRecognizer(**kwargs))

    return recognizers
501
+
502
def _build_healthcare_recognizers(self) -> list[Any]:
    """
    Build healthcare-specific recognizers.

    Covers medical record numbers, health plan beneficiary numbers, NPI,
    DEA numbers and medical licenses. Each entry of the table below is
    ``(entity, [(pattern_name, regex, score), ...], context_words)``.

    Returns:
        One PatternRecognizer per entity, in table order.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    specs = [
        (
            "MEDICAL_RECORD_NUMBER",
            [
                (
                    "mrn_numeric",
                    r"\b(?:MRN|MR#?|Medical Record|Patient ID)[\s:#\-]*(\d{6,10})\b",
                    0.85,
                ),
                (
                    "mrn_alphanumeric",
                    r"\b(?:MRN|MR#?)[\s:#\-]*([A-Z]{1,3}[\-]?\d{6,10})\b",
                    0.85,
                ),
                ("mrn_standalone", r"\b[A-Z]{2,3}\d{7,10}\b", 0.4),
            ],
            ["mrn", "medical record", "patient id", "chart number", "hospital number"],
        ),
        (
            "HEALTH_PLAN_BENEFICIARY",
            [
                (
                    "medicare_new",
                    r"\b[1-9][A-Z][A-Z0-9]\d-?[A-Z][A-Z0-9]\d-?[A-Z][A-Z0-9]\d{2}\b",
                    0.85,
                ),
                ("medicare_legacy", r"\b\d{3}-?\d{2}-?\d{4}[A-Z]{1,2}\b", 0.75),
                (
                    "member_id_generic",
                    r"\b(?:Member ID|Policy|Subscriber|Beneficiary)[\s:#]*([A-Z0-9]{9,15})\b",
                    0.8,
                ),
                ("group_number", r"\b(?:Group|GRP)[\s:#]*([A-Z0-9]{5,12})\b", 0.7),
            ],
            [
                "medicare", "medicaid", "member", "subscriber",
                "beneficiary", "insurance", "policy", "group",
                "health plan", "coverage",
            ],
        ),
        (
            "NPI",
            [
                (
                    "npi_with_context",
                    r"\b(?:NPI|National Provider)[\s:#]*(\d{10})\b",
                    0.9,
                ),
                ("npi_standalone", r"\b[12]\d{9}\b", 0.5),
            ],
            ["npi", "national provider", "provider identifier", "prescriber"],
        ),
        (
            "DEA_NUMBER",
            [
                (
                    "dea_with_context",
                    r"\b(?:DEA|Drug Enforcement)[\s:#]*([ABCDEFGHJKLMPRSTUX][A-Z9]\d{7})\b",
                    0.9,
                ),
                ("dea_standalone", r"\b[ABCDEFGHJKLMPRSTUX][A-Z9]\d{7}\b", 0.6),
            ],
            ["dea", "drug enforcement", "controlled substance", "prescriber"],
        ),
        (
            "MEDICAL_LICENSE",
            [
                (
                    "medical_license_with_context",
                    r"\b(?:License|Lic|Medical License)[\s:#]*([A-Z]{1,2}\d{5,8})\b",
                    0.8,
                ),
                (
                    "state_license",
                    r"\b(?:MD|DO|RN|NP|PA|DDS|DMD|DPM|DC|OD)[\s-]*(?:License|Lic)[\s:#]*(\d{4,8})\b",
                    0.85,
                ),
            ],
            [
                "license", "medical license", "state license",
                "board certified", "credentials", "physician",
                "practitioner",
            ],
        ),
    ]

    return [
        PatternRecognizer(
            supported_entity=entity,
            patterns=[
                Pattern(name=name, regex=regex, score=score)
                for name, regex, score in pattern_specs
            ],
            context=context_words,
        )
        for entity, pattern_specs, context_words in specs
    ]
637
+
638
def _build_geographic_recognizers(self) -> list[Any]:
    """
    Build geographic identifier recognizers.

    Covers US ZIP codes and street addresses (street lines, PO boxes and
    apartment/suite/unit designators). Each entry of the table below is
    ``(entity, [(pattern_name, regex, score), ...], context_words)``.

    Returns:
        One PatternRecognizer per entity, in table order.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    specs = [
        (
            "US_ZIP_CODE",
            [
                ("zip_plus_4", r"\b\d{5}-\d{4}\b", 0.7),
                (
                    "zip_5_with_context",
                    r"\b(?:zip|zip code|postal)[\s:#]*(\d{5})\b",
                    0.7,
                ),
            ],
            ["zip", "zipcode", "zip code", "postal", "mailing"],
        ),
        (
            "STREET_ADDRESS",
            [
                (
                    "street_address_full",
                    r"\b\d{1,5}\s+(?:[A-Z][a-z]+\s+){1,3}(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Way|Court|Ct|Circle|Cir|Place|Pl|Terrace|Ter|Highway|Hwy)\.?\b",
                    0.7,
                ),
                ("po_box", r"\b(?:P\.?O\.?\s*Box|Post Office Box)\s*\d+\b", 0.85),
                (
                    "apt_suite",
                    r"\b(?:Apt|Apartment|Suite|Ste|Unit|#)\s*[A-Z0-9]+\b",
                    0.5,
                ),
            ],
            ["address", "street", "mail", "ship", "deliver", "residence", "home"],
        ),
    ]

    return [
        PatternRecognizer(
            supported_entity=entity,
            patterns=[
                Pattern(name=name, regex=regex, score=score)
                for name, regex, score in pattern_specs
            ],
            context=context_words,
        )
        for entity, pattern_specs, context_words in specs
    ]
688
+
689
def _build_identifier_recognizers(self) -> list[Any]:
    """
    Build vehicle, device, and other identifier recognizers.

    Covers VIN, license plate, device serial, UDI, IMEI, fax number,
    biometric identifier and UUID. Each entry of the table below is
    ``(entity, [(pattern_name, regex, score), ...], context_words)``;
    ``context_words`` is None for recognizers created without context.

    Returns:
        One PatternRecognizer per entity, in table order.
    """
    from presidio_analyzer import Pattern, PatternRecognizer

    specs = [
        (
            "VIN",
            [
                (
                    "vin_with_context",
                    r"\b(?:VIN|Vehicle ID)[\s:#]*([A-HJ-NPR-Z0-9]{17})\b",
                    0.9,
                ),
                ("vin_standalone", r"\b[A-HJ-NPR-Z0-9]{17}\b", 0.5),
            ],
            ["vin", "vehicle", "car", "truck", "automobile", "registration"],
        ),
        (
            "LICENSE_PLATE",
            [
                (
                    "plate_with_context",
                    r"\b(?:License Plate|Plate|Tag|Plate #)[\s:#]*([A-Z0-9]{2,8})\b",
                    0.85,
                ),
            ],
            ["license plate", "plate number", "tag", "vehicle registration", "dmv"],
        ),
        (
            "DEVICE_SERIAL",
            [
                (
                    "serial_with_context",
                    r"\b(?:Serial|SN|S/N)[\s:#]*([A-Z0-9]{8,20})\b",
                    0.85,
                ),
                ("serial_common", r"\b[A-Z]{2,4}\d{6,12}[A-Z0-9]*\b", 0.4),
            ],
            ["serial", "device", "equipment", "model", "asset"],
        ),
        (
            "UDI",
            [
                # No leading \b on the next two patterns: they start with a
                # non-word character ("(" or "+"), so \b would only match
                # when a word character came directly before it and a
                # standalone UDI would never be detected.
                (
                    "udi_gs1",
                    r"\(01\)\d{14}(?:\(\d{2}\)[A-Z0-9]+)*\b",
                    0.9,
                ),
                ("udi_hibcc", r"\+[A-Z0-9]{4,}\/[A-Z0-9]+\b", 0.85),
                (
                    "udi_with_context",
                    r"\b(?:UDI|Unique Device)[\s:#]*([A-Z0-9\(\)\/\+]{10,})\b",
                    0.9,
                ),
            ],
            ["udi", "unique device", "medical device", "implant", "fda"],
        ),
        (
            "IMEI",
            [
                (
                    "imei_with_context",
                    r"\b(?:IMEI|International Mobile)[\s:#]*(\d{15})\b",
                    0.9,
                ),
                ("imei_standalone", r"\b\d{2}-?\d{6}-?\d{6}-?\d\b", 0.6),
            ],
            ["imei", "mobile", "phone", "device", "cellular"],
        ),
        (
            "FAX_NUMBER",
            [
                (
                    "fax_with_context",
                    r"\b(?:Fax|Facsimile|F)[\s:#]*(?:\+?1[\s.-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
                    0.9,
                ),
            ],
            ["fax", "facsimile"],
        ),
        (
            "BIOMETRIC_ID",
            [
                (
                    "biometric_reference",
                    r"\b(?:fingerprint|retina|iris|voice\s*print|face\s*id|biometric)[\s:#]*(?:id|scan|data|template)[\s:#]*([A-Z0-9\-]{8,})\b",
                    0.85,
                ),
            ],
            ["biometric", "fingerprint", "retina", "iris", "voiceprint", "facial"],
        ),
        (
            "UUID",
            [
                (
                    "uuid",
                    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
                    0.8,
                ),
            ],
            None,
        ),
    ]

    recognizers = []
    for entity, pattern_specs, context_words in specs:
        kwargs: dict[str, Any] = {
            "supported_entity": entity,
            "patterns": [
                Pattern(name=name, regex=regex, score=score)
                for name, regex, score in pattern_specs
            ],
        }
        # Only pass context when the recognizer defines some
        if context_words is not None:
            kwargs["context"] = context_words
        recognizers.append(PatternRecognizer(**kwargs))

    return recognizers
845
+
846
+ @property
847
+ def mode(self) -> str:
848
+ """Current operating mode."""
849
+ return self._mode
850
+
851
+ @property
852
+ def is_spacy_available(self) -> bool:
853
+ """Whether spaCy NLP is available."""
854
+ self._ensure_initialized()
855
+ return self._spacy_available