pii-protector 2.2.0__tar.gz → 2.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pii_protector-2.2.0 → pii_protector-2.2.2}/PKG-INFO +10 -10
- {pii_protector-2.2.0 → pii_protector-2.2.2}/README.md +6 -6
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/__init__.py +1 -1
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/config/settings.py +8 -8
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/detector.py +85 -84
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/core_pii.py +1 -1
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/utils/helpers.py +2 -2
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/PKG-INFO +10 -10
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/requires.txt +2 -2
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pyproject.toml +5 -5
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/__main__.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/cli.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/config/__init__.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/__init__.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/api_secrets.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/cloud_providers.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/communication.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/confidential_markers.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/dates_urls.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/file_control.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/india_pii.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/monitoring.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/north_america_pii.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/payment.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/version_control.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/utils/__init__.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/validators/__init__.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/validators/api_validators.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/validators/pii_validators.py +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/SOURCES.txt +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/dependency_links.txt +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/entry_points.txt +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/top_level.txt +0 -0
- {pii_protector-2.2.0 → pii_protector-2.2.2}/setup.cfg +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pii-protector
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.2.2
|
|
4
4
|
Summary: Production-grade PII detection with multi-model ensemble
|
|
5
5
|
Author-email: tensoryug <jainsatyam26@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/sjain26/pii-
|
|
7
|
+
Project-URL: Homepage, https://github.com/sjain26/pii-detector
|
|
8
8
|
Keywords: pii,privacy,nlp,detection,presidio,data-protection
|
|
9
9
|
Classifier: Development Status :: 5 - Production/Stable
|
|
10
10
|
Classifier: Intended Audience :: Developers
|
|
@@ -22,12 +22,12 @@ Requires-Dist: presidio-analyzer>=2.2; extra == "presidio"
|
|
|
22
22
|
Requires-Dist: spacy>=3.7; extra == "presidio"
|
|
23
23
|
Provides-Extra: transformers
|
|
24
24
|
Requires-Dist: transformers>=4.40; extra == "transformers"
|
|
25
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: optimum[onnxruntime]>=1.18; extra == "transformers"
|
|
26
26
|
Provides-Extra: full
|
|
27
27
|
Requires-Dist: presidio-analyzer>=2.2; extra == "full"
|
|
28
28
|
Requires-Dist: spacy>=3.7; extra == "full"
|
|
29
29
|
Requires-Dist: transformers>=4.40; extra == "full"
|
|
30
|
-
Requires-Dist:
|
|
30
|
+
Requires-Dist: optimum[onnxruntime]>=1.18; extra == "full"
|
|
31
31
|
|
|
32
32
|
# pii-protector
|
|
33
33
|
|
|
@@ -40,9 +40,9 @@ Layer 1 — Regex always runs ~0.3–3ms
|
|
|
40
40
|
↓ escalation score >= 3?
|
|
41
41
|
Layer 2 — Presidio + spaCy NER names / orgs / loc ~5ms
|
|
42
42
|
↓ confidence low or conflict?
|
|
43
|
-
Layer 3 —
|
|
43
|
+
Layer 3 — NER Transformer high-accuracy names ~20ms
|
|
44
44
|
+
|
|
45
|
-
Layer 4 —
|
|
45
|
+
Layer 4 — PII Model structured PII ~15ms
|
|
46
46
|
```
|
|
47
47
|
|
|
48
48
|
Each layer decides whether the next one is needed. On clean structured text, only Layer 1 runs.
|
|
@@ -144,8 +144,8 @@ pii-detect --no-presidio --no-transformer --no-pii-model "card: 4111111111111111
|
|
|
144
144
|
```python
|
|
145
145
|
detector = AdvancedPIIDetector(
|
|
146
146
|
use_presidio=True, # Layer 2: spaCy NER via Presidio
|
|
147
|
-
use_transformers=True, # Layer 3:
|
|
148
|
-
use_pii_model=True, # Layer 4:
|
|
147
|
+
use_transformers=True, # Layer 3: NER transformer
|
|
148
|
+
use_pii_model=True, # Layer 4: PII model
|
|
149
149
|
confidence_threshold=0.5, # Early-stop threshold
|
|
150
150
|
spacy_model="en_core_web_lg",
|
|
151
151
|
transformer_model="jainsatyam26/pii-ner-onnx",
|
|
@@ -171,8 +171,8 @@ SPACY_DATA_PATH=/path/to/models
|
|
|
171
171
|
| 4 | [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) | PHONE, EMAIL, SSN, DOB, ADDRESS |
|
|
172
172
|
|
|
173
173
|
ONNX-optimized versions:
|
|
174
|
-
- [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) —
|
|
175
|
-
- [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) —
|
|
174
|
+
- [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — NER model
|
|
175
|
+
- [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — PII structured model
|
|
176
176
|
|
|
177
177
|
## License
|
|
178
178
|
|
|
@@ -9,9 +9,9 @@ Layer 1 — Regex always runs ~0.3–3ms
|
|
|
9
9
|
↓ escalation score >= 3?
|
|
10
10
|
Layer 2 — Presidio + spaCy NER names / orgs / loc ~5ms
|
|
11
11
|
↓ confidence low or conflict?
|
|
12
|
-
Layer 3 —
|
|
12
|
+
Layer 3 — NER Transformer high-accuracy names ~20ms
|
|
13
13
|
+
|
|
14
|
-
Layer 4 —
|
|
14
|
+
Layer 4 — PII Model structured PII ~15ms
|
|
15
15
|
```
|
|
16
16
|
|
|
17
17
|
Each layer decides whether the next one is needed. On clean structured text, only Layer 1 runs.
|
|
@@ -113,8 +113,8 @@ pii-detect --no-presidio --no-transformer --no-pii-model "card: 4111111111111111
|
|
|
113
113
|
```python
|
|
114
114
|
detector = AdvancedPIIDetector(
|
|
115
115
|
use_presidio=True, # Layer 2: spaCy NER via Presidio
|
|
116
|
-
use_transformers=True, # Layer 3:
|
|
117
|
-
use_pii_model=True, # Layer 4:
|
|
116
|
+
use_transformers=True, # Layer 3: NER transformer
|
|
117
|
+
use_pii_model=True, # Layer 4: PII model
|
|
118
118
|
confidence_threshold=0.5, # Early-stop threshold
|
|
119
119
|
spacy_model="en_core_web_lg",
|
|
120
120
|
transformer_model="jainsatyam26/pii-ner-onnx",
|
|
@@ -140,8 +140,8 @@ SPACY_DATA_PATH=/path/to/models
|
|
|
140
140
|
| 4 | [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) | PHONE, EMAIL, SSN, DOB, ADDRESS |
|
|
141
141
|
|
|
142
142
|
ONNX-optimized versions:
|
|
143
|
-
- [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) —
|
|
144
|
-
- [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) —
|
|
143
|
+
- [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — NER model
|
|
144
|
+
- [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — PII structured model
|
|
145
145
|
|
|
146
146
|
## License
|
|
147
147
|
|
|
@@ -7,5 +7,5 @@ Multi-model ensemble with pre-compiled patterns and type-aware voting
|
|
|
7
7
|
from .detector import AdvancedPIIDetector, PIIEntity, PresidioCustomDetector
|
|
8
8
|
from .config.settings import PII_CONFIG
|
|
9
9
|
|
|
10
|
-
__version__ = "2.2.0"
|
|
10
|
+
__version__ = "2.2.2"
|
|
11
11
|
__all__ = ["AdvancedPIIDetector", "PIIEntity", "PresidioCustomDetector", "PII_CONFIG"]
|
|
@@ -4,8 +4,8 @@ Configuration settings for PII detection
|
|
|
4
4
|
Latency profile (per-call on typical hardware):
|
|
5
5
|
Regex only : ~0.3ms (Layer 1 — structured PII, India PII, API secrets)
|
|
6
6
|
Regex + Presidio : ~5ms (Layer 2 — adds NAME/ORG/LOC via spaCy, CPU)
|
|
7
|
-
Regex + Presidio +
|
|
8
|
-
+
|
|
7
|
+
Regex + Presidio + NER Transformer : ~20ms (Layer 3 — high-accuracy English NER, GPU)
|
|
8
|
+
+ PII Model : +15ms (Layer 4 — structured PII: PHONE, EMAIL, DOB, SSN)
|
|
9
9
|
|
|
10
10
|
Early stop fires for ~80% of texts → average well under 5ms.
|
|
11
11
|
|
|
@@ -14,7 +14,7 @@ Model selection rationale:
|
|
|
14
14
|
- Higher NER accuracy than sm for NAME/ORG/LOC — worth the extra size
|
|
15
15
|
- Used inside Presidio (Layer 2)
|
|
16
16
|
✅ jainsatyam26/pii-ner-onnx:
|
|
17
|
-
- Best-in-class English NER accuracy
|
|
17
|
+
- Best-in-class English NER accuracy
|
|
18
18
|
- Significantly better NAME detection vs multilingual models
|
|
19
19
|
- Labels: PER → NAME, ORG → ORGANIZATION, LOC → LOCATION, MISC → _SKIP
|
|
20
20
|
✅ jainsatyam26/pii-detector-onnx:
|
|
@@ -27,11 +27,11 @@ Model selection rationale:
|
|
|
27
27
|
PII_CONFIG = {
|
|
28
28
|
# ── What to run ──────────────────────────────────────────────────
|
|
29
29
|
# Presidio (spaCy NER): ON → for NAME, ORG, LOCATION — CPU, fast
|
|
30
|
-
# Transformer
|
|
31
|
-
# PII model
|
|
30
|
+
# NER Transformer: ON → high-accuracy English NER — GPU, conditional
|
|
31
|
+
# PII model: ON → structured PII specialist — always runs
|
|
32
32
|
'use_presidio': True,
|
|
33
|
-
'use_transformers': True, #
|
|
34
|
-
'use_pii_model': True, #
|
|
33
|
+
'use_transformers': True, # NER transformer enabled
|
|
34
|
+
'use_pii_model': True, # PII model enabled
|
|
35
35
|
|
|
36
36
|
# Model names
|
|
37
37
|
'spacy_model': "en_core_web_lg", # Large model — better NER accuracy than sm
|
|
@@ -98,7 +98,7 @@ TYPE_MAPPINGS = {
|
|
|
98
98
|
'LAW': '_SKIP',
|
|
99
99
|
'LANGUAGE': '_SKIP',
|
|
100
100
|
|
|
101
|
-
# Transformer types (
|
|
101
|
+
# Transformer types (multilingual NER)
|
|
102
102
|
# NOTE: ORG -> ORGANIZATION and LOC -> LOCATION are already mapped in the spaCy section above
|
|
103
103
|
'PER': 'NAME', # Person names -> NAME
|
|
104
104
|
'MISC': '_SKIP', # Miscellaneous -> skip
|
|
@@ -82,7 +82,7 @@ def _empty_timing() -> Dict:
|
|
|
82
82
|
"""Return a zeroed timing dict for empty/skipped calls."""
|
|
83
83
|
return {
|
|
84
84
|
'regex_ms': 0.0, 'presidio_ms': 0.0, 'transformer_ms': 0.0,
|
|
85
|
-
'
|
|
85
|
+
'pii_model_ms': 0.0,
|
|
86
86
|
'escalation_check_ms': 0.0, 'postprocess_ms': 0.0, 'total_ms': 0.0,
|
|
87
87
|
'layers': 'none', 'ner_escalation_score': 0,
|
|
88
88
|
'entities_found': 0, 'text_length': 0,
|
|
@@ -148,7 +148,7 @@ _COMMON_NON_NAME_WORDS = frozenset({
|
|
|
148
148
|
|
|
149
149
|
# Escalation score thresholds
|
|
150
150
|
_NER_ESCALATION_THRESHOLD = 3
|
|
151
|
-
#
|
|
151
|
+
# NER transformer runs only when Presidio NER is uncertain.
|
|
152
152
|
# Threshold=3 means: run transformer only if name confidence is low or conflicting.
|
|
153
153
|
# 0 = always run (wasteful), 3 = smart conditional (saves ~9ms on clear cases).
|
|
154
154
|
_TRANSFORMER_ESCALATION_THRESHOLD = 3
|
|
@@ -169,7 +169,7 @@ class PIIEntity:
|
|
|
169
169
|
return hash((self.text, self.entity_type, self.start, self.end))
|
|
170
170
|
|
|
171
171
|
|
|
172
|
-
# Person-context keywords — when present, a
|
|
172
|
+
# Person-context keywords — when present, a pii_model USERNAME is likely a NAME.
|
|
173
173
|
_PERSON_CONTEXT_KW = frozenset({
|
|
174
174
|
'appointment', 'scheduling', 'schedule', 'scheduled',
|
|
175
175
|
'patient', 'customer', 'client', 'employee', 'user',
|
|
@@ -178,7 +178,7 @@ _PERSON_CONTEXT_KW = frozenset({
|
|
|
178
178
|
'meeting', 'visit', 'booking', 'reservation',
|
|
179
179
|
})
|
|
180
180
|
|
|
181
|
-
#
|
|
181
|
+
# PII model actual label → our standard type (labels verified from model output).
|
|
182
182
|
_PIIRANHA_MAP = {
|
|
183
183
|
# Name variants
|
|
184
184
|
'GIVENNAME': 'NAME',
|
|
@@ -215,7 +215,7 @@ _PIIRANHA_MAP = {
|
|
|
215
215
|
'ID_CARD': '_SKIP',
|
|
216
216
|
# Auth
|
|
217
217
|
'USERNAME': 'USERNAME',
|
|
218
|
-
'PASSWORD': '_SKIP', # NOSONAR -
|
|
218
|
+
'PASSWORD': '_SKIP', # NOSONAR - pii_model label key, not a credential
|
|
219
219
|
# Location (skip — too noisy)
|
|
220
220
|
'CITY': '_SKIP',
|
|
221
221
|
'ZIPCODE': '_SKIP',
|
|
@@ -305,23 +305,42 @@ class AdvancedPIIDetector:
|
|
|
305
305
|
self.type_mappings = TYPE_MAPPINGS
|
|
306
306
|
|
|
307
307
|
self.presidio = None
|
|
308
|
-
self.ner_pipeline = None #
|
|
309
|
-
self.pii_pipeline = None #
|
|
308
|
+
self.ner_pipeline = None # NER transformer: NAME / ORG / LOC
|
|
309
|
+
self.pii_pipeline = None # PII model: structured PII (PHONE, EMAIL, SSN, DOB etc.)
|
|
310
310
|
self._ner_sources_count = 0
|
|
311
311
|
self.last_timing: Dict = _empty_timing()
|
|
312
312
|
|
|
313
313
|
if self.use_presidio:
|
|
314
314
|
self._init_presidio()
|
|
315
315
|
if self.use_transformers:
|
|
316
|
-
self._init_transformers() #
|
|
316
|
+
self._init_transformers() # NER transformer (NAME/ORG/LOC)
|
|
317
317
|
if self.use_pii_model:
|
|
318
|
-
self._init_pii_transformer() #
|
|
318
|
+
self._init_pii_transformer() # PII model structured PII
|
|
319
|
+
|
|
320
|
+
if self.use_presidio and not self.presidio:
|
|
321
|
+
warnings.warn(
|
|
322
|
+
"Presidio (Layer 2) unavailable — NAME/ORG/LOCATION detection disabled. "
|
|
323
|
+
"Install: pip install 'pii-protector[presidio]' then: python -m spacy download en_core_web_lg",
|
|
324
|
+
UserWarning, stacklevel=2,
|
|
325
|
+
)
|
|
326
|
+
if self.use_transformers and not self.ner_pipeline:
|
|
327
|
+
warnings.warn(
|
|
328
|
+
"NER transformer (Layer 3) unavailable — high-accuracy name detection disabled. "
|
|
329
|
+
"Install: pip install 'optimum[onnxruntime]>=1.18' 'transformers>=4.40'",
|
|
330
|
+
UserWarning, stacklevel=2,
|
|
331
|
+
)
|
|
332
|
+
if self.use_pii_model and not self.pii_pipeline:
|
|
333
|
+
warnings.warn(
|
|
334
|
+
"PII model (Layer 4) unavailable — PHONE/EMAIL/DOB/SSN transformer detection disabled. "
|
|
335
|
+
"Install: pip install 'optimum[onnxruntime]>=1.18' 'transformers>=4.40'",
|
|
336
|
+
UserWarning, stacklevel=2,
|
|
337
|
+
)
|
|
319
338
|
|
|
320
339
|
logger.info(
|
|
321
340
|
f"PII Detector ready | mode=SEQUENTIAL_LAYERED "
|
|
322
341
|
f"| presidio={'ON' if self.presidio else 'OFF'} "
|
|
323
342
|
f"| ner={'ON' if self.ner_pipeline else 'OFF'} "
|
|
324
|
-
f"|
|
|
343
|
+
f"| pii_model={'ON' if self.pii_pipeline else 'OFF'} "
|
|
325
344
|
f"| ner_sources={self._ner_sources_count}"
|
|
326
345
|
)
|
|
327
346
|
|
|
@@ -363,61 +382,43 @@ class AdvancedPIIDetector:
|
|
|
363
382
|
logger.warning(f"Presidio unavailable: {e}")
|
|
364
383
|
|
|
365
384
|
def _init_transformers(self):
|
|
366
|
-
"""
|
|
385
|
+
"""NER transformer — high-accuracy NAME/ORG/LOC (ONNX Runtime)."""
|
|
367
386
|
try:
|
|
368
|
-
from
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
except Exception:
|
|
381
|
-
with _suppress_stdout():
|
|
382
|
-
self.ner_pipeline = pipeline(
|
|
383
|
-
"ner",
|
|
384
|
-
model=self.transformer_model_name,
|
|
385
|
-
aggregation_strategy="first",
|
|
386
|
-
device=-1,
|
|
387
|
-
)
|
|
388
|
-
self.ner_pipeline("warmup")
|
|
389
|
-
logger.info(f"NER ready on CPU ({self.transformer_model_name})")
|
|
387
|
+
from optimum.onnxruntime import ORTModelForTokenClassification
|
|
388
|
+
from transformers import AutoTokenizer, pipeline
|
|
389
|
+
with _suppress_stdout():
|
|
390
|
+
model = ORTModelForTokenClassification.from_pretrained(self.transformer_model_name)
|
|
391
|
+
tokenizer = AutoTokenizer.from_pretrained(self.transformer_model_name)
|
|
392
|
+
self.ner_pipeline = pipeline(
|
|
393
|
+
"ner",
|
|
394
|
+
model=model,
|
|
395
|
+
tokenizer=tokenizer,
|
|
396
|
+
aggregation_strategy="first",
|
|
397
|
+
)
|
|
398
|
+
self.ner_pipeline("warmup")
|
|
390
399
|
self._ner_sources_count += 1
|
|
400
|
+
logger.info(f"NER ready ({self.transformer_model_name})")
|
|
391
401
|
except Exception as e:
|
|
392
402
|
logger.warning(f"NER transformer unavailable: {e}")
|
|
393
403
|
|
|
394
404
|
def _init_pii_transformer(self):
|
|
395
|
-
"""
|
|
405
|
+
"""PII model structured PII detection — PHONE, EMAIL, SSN, DOB, ADDRESS etc. (ONNX Runtime)."""
|
|
396
406
|
try:
|
|
397
|
-
from
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
with _suppress_stdout():
|
|
411
|
-
self.pii_pipeline = pipeline(
|
|
412
|
-
"token-classification",
|
|
413
|
-
model=self.pii_model_name,
|
|
414
|
-
aggregation_strategy="first",
|
|
415
|
-
device=-1,
|
|
416
|
-
)
|
|
417
|
-
self.pii_pipeline("warmup")
|
|
418
|
-
logger.info(f"piiranha ready on CPU ({self.pii_model_name})")
|
|
407
|
+
from optimum.onnxruntime import ORTModelForTokenClassification
|
|
408
|
+
from transformers import AutoTokenizer, pipeline
|
|
409
|
+
with _suppress_stdout():
|
|
410
|
+
model = ORTModelForTokenClassification.from_pretrained(self.pii_model_name)
|
|
411
|
+
tokenizer = AutoTokenizer.from_pretrained(self.pii_model_name)
|
|
412
|
+
self.pii_pipeline = pipeline(
|
|
413
|
+
"token-classification",
|
|
414
|
+
model=model,
|
|
415
|
+
tokenizer=tokenizer,
|
|
416
|
+
aggregation_strategy="first",
|
|
417
|
+
)
|
|
418
|
+
self.pii_pipeline("warmup")
|
|
419
|
+
logger.info(f"PII model ready ({self.pii_model_name})")
|
|
419
420
|
except Exception as e:
|
|
420
|
-
logger.warning(f"
|
|
421
|
+
logger.warning(f"PII model unavailable: {e}")
|
|
421
422
|
|
|
422
423
|
# ------------------------------------------------------------------
|
|
423
424
|
# Layer detection methods
|
|
@@ -540,8 +541,8 @@ class AdvancedPIIDetector:
|
|
|
540
541
|
clean = word.strip()
|
|
541
542
|
return clean.isalpha() and len(clean) >= 3
|
|
542
543
|
|
|
543
|
-
def
|
|
544
|
-
"""Map
|
|
544
|
+
def _pii_model_results_to_entities(self, results, offset, overlap, person_ctx):
|
|
545
|
+
"""Map pii_model pipeline results (one chunk) into PIIEntity objects."""
|
|
545
546
|
out = []
|
|
546
547
|
for r in results:
|
|
547
548
|
if offset > 0 and r['start'] < overlap:
|
|
@@ -551,23 +552,23 @@ class AdvancedPIIDetector:
|
|
|
551
552
|
if entity_type == '_SKIP':
|
|
552
553
|
continue
|
|
553
554
|
word = r['word']
|
|
554
|
-
#
|
|
555
|
+
# PII model often tags standalone person names as USERNAME in free text;
|
|
555
556
|
# promote to NAME when person context is present and the word looks like a name.
|
|
556
557
|
if entity_type == 'USERNAME' and person_ctx and self._looks_like_name(word):
|
|
557
558
|
entity_type = 'NAME'
|
|
558
|
-
logger.debug(f"[
|
|
559
|
+
logger.debug(f"[pii_model] Promoted USERNAME→NAME for '{word}' (person context)")
|
|
559
560
|
out.append(PIIEntity(
|
|
560
561
|
text=word,
|
|
561
562
|
entity_type=entity_type,
|
|
562
563
|
start=int(r['start']) + offset,
|
|
563
564
|
end=int(r['end']) + offset,
|
|
564
565
|
score=float(r['score']),
|
|
565
|
-
source="
|
|
566
|
+
source="pii_model",
|
|
566
567
|
))
|
|
567
568
|
return out
|
|
568
569
|
|
|
569
570
|
def _detect_pii_transformer(self, text: str) -> List[PIIEntity]:
|
|
570
|
-
"""
|
|
571
|
+
"""PII model specialized PII detection with chunking (see _PIIRANHA_MAP)."""
|
|
571
572
|
if not self.pii_pipeline:
|
|
572
573
|
return []
|
|
573
574
|
|
|
@@ -584,11 +585,11 @@ class AdvancedPIIDetector:
|
|
|
584
585
|
for chunk_results, chunk_start in zip(
|
|
585
586
|
self.pii_pipeline(chunks, batch_size=batch_size), offsets
|
|
586
587
|
):
|
|
587
|
-
entities.extend(self.
|
|
588
|
+
entities.extend(self._pii_model_results_to_entities(chunk_results, chunk_start, overlap, person_ctx))
|
|
588
589
|
else:
|
|
589
|
-
entities.extend(self.
|
|
590
|
+
entities.extend(self._pii_model_results_to_entities(self.pii_pipeline(text), 0, overlap, person_ctx))
|
|
590
591
|
except Exception as e:
|
|
591
|
-
logger.warning(f"PII transformer
|
|
592
|
+
logger.warning(f"PII transformer detection failed: {e}")
|
|
592
593
|
|
|
593
594
|
return entities
|
|
594
595
|
|
|
@@ -640,7 +641,7 @@ class AdvancedPIIDetector:
|
|
|
640
641
|
|
|
641
642
|
When non-Latin Unicode is present the text is in a language that static
|
|
642
643
|
English/Latin keyword patterns cannot cover. We must escalate to the
|
|
643
|
-
|
|
644
|
+
multilingual NER transformer so it can detect names/orgs/locations.
|
|
644
645
|
"""
|
|
645
646
|
return bool(re.search(
|
|
646
647
|
r'[\u0600-\u06FF' # Arabic
|
|
@@ -709,7 +710,7 @@ class AdvancedPIIDetector:
|
|
|
709
710
|
score += 3
|
|
710
711
|
if has_ner_kw:
|
|
711
712
|
score += 3
|
|
712
|
-
if has_non_latin: # multilingual text needs
|
|
713
|
+
if has_non_latin: # multilingual text needs multilingual NER
|
|
713
714
|
score += 3
|
|
714
715
|
|
|
715
716
|
score += self._regex_confidence_score(
|
|
@@ -759,7 +760,7 @@ class AdvancedPIIDetector:
|
|
|
759
760
|
Signals:
|
|
760
761
|
+3 NER entity with confidence < 0.55 (very uncertain)
|
|
761
762
|
+3 Non-Latin Unicode present — Presidio/spaCy is English-only,
|
|
762
|
-
|
|
763
|
+
multilingual NER transformer is the only model that handles
|
|
763
764
|
Arabic / CJK / Devanagari / Cyrillic NER correctly.
|
|
764
765
|
+2 NER entity with confidence 0.55 – 0.70 (uncertain)
|
|
765
766
|
+2 Overlapping entities of different types (conflict)
|
|
@@ -774,11 +775,11 @@ class AdvancedPIIDetector:
|
|
|
774
775
|
ner_types = {'NAME', 'PERSON', 'ORGANIZATION', 'LOCATION'}
|
|
775
776
|
ner_ents = [e for e in current_entities if e.entity_type in ner_types]
|
|
776
777
|
|
|
777
|
-
# Non-Latin Unicode
|
|
778
|
+
# Non-Latin Unicode: escalate to multilingual NER.
|
|
778
779
|
if self._has_non_latin_unicode(text):
|
|
779
780
|
score += 3
|
|
780
781
|
|
|
781
|
-
#
|
|
782
|
+
# NER transformer is markedly more accurate than spaCy NER for person names.
|
|
782
783
|
if any(e.entity_type in {'NAME', 'PERSON'} for e in ner_ents):
|
|
783
784
|
score += 3
|
|
784
785
|
|
|
@@ -831,7 +832,7 @@ class AdvancedPIIDetector:
|
|
|
831
832
|
return False
|
|
832
833
|
|
|
833
834
|
# Never early-stop for non-Latin Unicode (Arabic, CJK, Devanagari etc.)
|
|
834
|
-
# Static keyword patterns are Latin-only;
|
|
835
|
+
# Static keyword patterns are Latin-only; multilingual NER must run.
|
|
835
836
|
if self._has_non_latin_unicode(text):
|
|
836
837
|
return False
|
|
837
838
|
|
|
@@ -1037,28 +1038,28 @@ class AdvancedPIIDetector:
|
|
|
1037
1038
|
# ------------------------------------------------------------------
|
|
1038
1039
|
|
|
1039
1040
|
def _run_ner_layers(self, text, all_entities, layers_used, timing):
|
|
1040
|
-
"""Run Presidio +
|
|
1041
|
+
"""Run Presidio + NER + PII model layers, mutating all_entities/layers_used/timing."""
|
|
1041
1042
|
if self.presidio:
|
|
1042
1043
|
t0 = time.perf_counter()
|
|
1043
1044
|
all_entities.extend(self._detect_presidio(text))
|
|
1044
1045
|
timing['presidio_ms'] = _elapsed_ms(t0)
|
|
1045
1046
|
layers_used.append('presidio')
|
|
1046
1047
|
|
|
1047
|
-
# Layer 3 (
|
|
1048
|
+
# Layer 3 (NER transformer) runs only when Presidio NER is uncertain.
|
|
1048
1049
|
if self.ner_pipeline:
|
|
1049
1050
|
transformer_score = self._calc_transformer_escalation_score(text, list(all_entities))
|
|
1050
1051
|
if transformer_score >= _TRANSFORMER_ESCALATION_THRESHOLD:
|
|
1051
1052
|
t0 = time.perf_counter()
|
|
1052
1053
|
all_entities.extend(self._detect_transformers(text))
|
|
1053
1054
|
timing['transformer_ms'] = _elapsed_ms(t0)
|
|
1054
|
-
layers_used.append('
|
|
1055
|
+
layers_used.append('ner')
|
|
1055
1056
|
|
|
1056
|
-
# Layer 4 (
|
|
1057
|
+
# Layer 4 (PII model) catches informal PHONE/EMAIL/SSN/DOB that regex misses.
|
|
1057
1058
|
if self.pii_pipeline:
|
|
1058
1059
|
t0 = time.perf_counter()
|
|
1059
1060
|
all_entities.extend(self._detect_pii_transformer(text))
|
|
1060
|
-
timing['
|
|
1061
|
-
layers_used.append('
|
|
1061
|
+
timing['pii_model_ms'] = _elapsed_ms(t0)
|
|
1062
|
+
layers_used.append('pii_model')
|
|
1062
1063
|
|
|
1063
1064
|
@staticmethod
|
|
1064
1065
|
def _entities_to_dicts(entities):
|
|
@@ -1083,8 +1084,8 @@ class AdvancedPIIDetector:
|
|
|
1083
1084
|
2. Early stop check -> skip NER if high confidence + no name signals
|
|
1084
1085
|
3. Escalation score -> if >= 3, run NER layers
|
|
1085
1086
|
4. Layer 2 (Presidio NER) -> spaCy NAME, ORG, LOCATION
|
|
1086
|
-
5. Layer 3 (
|
|
1087
|
-
6. Layer 4 (
|
|
1087
|
+
5. Layer 3 (NER transformer) -> high-accuracy NAME detection
|
|
1088
|
+
6. Layer 4 (PII model) -> structured PII: PHONE, EMAIL, SSN, DOB etc.
|
|
1088
1089
|
7. Ensemble vote -> conflict resolve -> filter -> deduplicate
|
|
1089
1090
|
|
|
1090
1091
|
Returns list of entity dicts.
|
|
@@ -1114,7 +1115,7 @@ class AdvancedPIIDetector:
|
|
|
1114
1115
|
layers_used = ['regex']
|
|
1115
1116
|
timing['presidio_ms'] = 0.0
|
|
1116
1117
|
timing['transformer_ms'] = 0.0
|
|
1117
|
-
timing['
|
|
1118
|
+
timing['pii_model_ms'] = 0.0
|
|
1118
1119
|
timing['early_stop'] = False
|
|
1119
1120
|
|
|
1120
1121
|
# ── Early stopping check (if high confidence) ─────────────────
|
|
@@ -1160,8 +1161,8 @@ class AdvancedPIIDetector:
|
|
|
1160
1161
|
f"[PII TIMER] total={timing['total_ms']:.1f}ms | "
|
|
1161
1162
|
f"regex={timing['regex_ms']:.1f}ms | "
|
|
1162
1163
|
f"presidio={timing['presidio_ms']:.1f}ms | "
|
|
1163
|
-
f"
|
|
1164
|
-
f"
|
|
1164
|
+
f"ner={timing['transformer_ms']:.1f}ms | "
|
|
1165
|
+
f"pii_model={timing['pii_model_ms']:.1f}ms | "
|
|
1165
1166
|
f"post={timing['postprocess_ms']:.1f}ms"
|
|
1166
1167
|
)
|
|
1167
1168
|
|
|
@@ -1242,6 +1243,6 @@ class PresidioCustomDetector(AdvancedPIIDetector):
|
|
|
1242
1243
|
use_presidio=PII_CONFIG.get('use_presidio', True),
|
|
1243
1244
|
use_transformers=PII_CONFIG.get('use_transformers', True),
|
|
1244
1245
|
transformer_model=PII_CONFIG.get('transformer_model',
|
|
1245
|
-
'
|
|
1246
|
+
'jainsatyam26/pii-ner-onnx'),
|
|
1246
1247
|
)
|
|
1247
1248
|
self.name = "Advanced_PII_Detector"
|
|
@@ -169,7 +169,7 @@ CORE_PII_PATTERNS = {
|
|
|
169
169
|
# ── NAME ───────────────────────────────────────────────────────────
|
|
170
170
|
# Regex only covers STRUCTURED label patterns (exact field names).
|
|
171
171
|
# Free-text names (threat verbs, sentences) are handled by
|
|
172
|
-
# Presidio/spaCy NER (Layer 2) +
|
|
172
|
+
# Presidio/spaCy NER (Layer 2) + NER transformer (Layer 3).
|
|
173
173
|
'NAME': [
|
|
174
174
|
# Explicit form/document field labels — unambiguous
|
|
175
175
|
# "Full Name: John Smith", "Employee Name: Sneha Patel", "Claimant Name: Ahmed Khan"
|
|
@@ -111,8 +111,8 @@ def deduplicate_entities(entities: List[Dict], overlap_threshold: float = 0.5) -
|
|
|
111
111
|
1. Score ke basis pe descending sort karo (highest score pehle process ho)
|
|
112
112
|
2. Har entity ke liye check karo — kya kisi already-kept entity se overlap hai?
|
|
113
113
|
- Same span + SAME type → duplicate (same detector found it twice)
|
|
114
|
-
- Same span + DIFF type → cross-category conflict (e.g.
|
|
115
|
-
regex EMAIL, or
|
|
114
|
+
- Same span + DIFF type → cross-category conflict (e.g. pii_model EMAIL vs
|
|
115
|
+
regex EMAIL, or pii_model PHONE vs regex PHONE) —
|
|
116
116
|
keep only highest-score, drop the rest
|
|
117
117
|
3. Agar overlap hai → discard (already-kept wala higher score ka hai)
|
|
118
118
|
4. Final result position ke basis pe sort karo
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pii-protector
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.2.2
|
|
4
4
|
Summary: Production-grade PII detection with multi-model ensemble
|
|
5
5
|
Author-email: tensoryug <jainsatyam26@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/sjain26/pii-
|
|
7
|
+
Project-URL: Homepage, https://github.com/sjain26/pii-detector
|
|
8
8
|
Keywords: pii,privacy,nlp,detection,presidio,data-protection
|
|
9
9
|
Classifier: Development Status :: 5 - Production/Stable
|
|
10
10
|
Classifier: Intended Audience :: Developers
|
|
@@ -22,12 +22,12 @@ Requires-Dist: presidio-analyzer>=2.2; extra == "presidio"
|
|
|
22
22
|
Requires-Dist: spacy>=3.7; extra == "presidio"
|
|
23
23
|
Provides-Extra: transformers
|
|
24
24
|
Requires-Dist: transformers>=4.40; extra == "transformers"
|
|
25
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: optimum[onnxruntime]>=1.18; extra == "transformers"
|
|
26
26
|
Provides-Extra: full
|
|
27
27
|
Requires-Dist: presidio-analyzer>=2.2; extra == "full"
|
|
28
28
|
Requires-Dist: spacy>=3.7; extra == "full"
|
|
29
29
|
Requires-Dist: transformers>=4.40; extra == "full"
|
|
30
|
-
Requires-Dist:
|
|
30
|
+
Requires-Dist: optimum[onnxruntime]>=1.18; extra == "full"
|
|
31
31
|
|
|
32
32
|
# pii-protector
|
|
33
33
|
|
|
@@ -40,9 +40,9 @@ Layer 1 — Regex always runs ~0.3–3ms
|
|
|
40
40
|
↓ escalation score >= 3?
|
|
41
41
|
Layer 2 — Presidio + spaCy NER names / orgs / loc ~5ms
|
|
42
42
|
↓ confidence low or conflict?
|
|
43
|
-
Layer 3 —
|
|
43
|
+
Layer 3 — NER Transformer high-accuracy names ~20ms
|
|
44
44
|
+
|
|
45
|
-
Layer 4 —
|
|
45
|
+
Layer 4 — PII Model structured PII ~15ms
|
|
46
46
|
```
|
|
47
47
|
|
|
48
48
|
Each layer decides whether the next one is needed. On clean structured text, only Layer 1 runs.
|
|
@@ -144,8 +144,8 @@ pii-detect --no-presidio --no-transformer --no-pii-model "card: 4111111111111111
|
|
|
144
144
|
```python
|
|
145
145
|
detector = AdvancedPIIDetector(
|
|
146
146
|
use_presidio=True, # Layer 2: spaCy NER via Presidio
|
|
147
|
-
use_transformers=True, # Layer 3:
|
|
148
|
-
use_pii_model=True, # Layer 4:
|
|
147
|
+
use_transformers=True, # Layer 3: NER transformer
|
|
148
|
+
use_pii_model=True, # Layer 4: PII model
|
|
149
149
|
confidence_threshold=0.5, # Early-stop threshold
|
|
150
150
|
spacy_model="en_core_web_lg",
|
|
151
151
|
transformer_model="jainsatyam26/pii-ner-onnx",
|
|
@@ -171,8 +171,8 @@ SPACY_DATA_PATH=/path/to/models
|
|
|
171
171
|
| 4 | [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) | PHONE, EMAIL, SSN, DOB, ADDRESS |
|
|
172
172
|
|
|
173
173
|
ONNX-optimized versions:
|
|
174
|
-
- [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) —
|
|
175
|
-
- [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) —
|
|
174
|
+
- [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — NER model
|
|
175
|
+
- [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — PII structured model
|
|
176
176
|
|
|
177
177
|
## License
|
|
178
178
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
presidio-analyzer>=2.2
|
|
4
4
|
spacy>=3.7
|
|
5
5
|
transformers>=4.40
|
|
6
|
-
|
|
6
|
+
optimum[onnxruntime]>=1.18
|
|
7
7
|
|
|
8
8
|
[presidio]
|
|
9
9
|
presidio-analyzer>=2.2
|
|
@@ -11,4 +11,4 @@ spacy>=3.7
|
|
|
11
11
|
|
|
12
12
|
[transformers]
|
|
13
13
|
transformers>=4.40
|
|
14
|
-
|
|
14
|
+
optimum[onnxruntime]>=1.18
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pii-protector"
|
|
7
|
-
version = "2.2.0"
|
|
7
|
+
version = "2.2.2"
|
|
8
8
|
description = "Production-grade PII detection with multi-model ensemble"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -35,24 +35,24 @@ presidio = [
|
|
|
35
35
|
"presidio-analyzer>=2.2",
|
|
36
36
|
"spacy>=3.7",
|
|
37
37
|
]
|
|
38
|
-
# Layer 3 & 4:
|
|
38
|
+
# Layer 3 & 4: NER transformer + PII model (ONNX Runtime)
|
|
39
39
|
transformers = [
|
|
40
40
|
"transformers>=4.40",
|
|
41
|
-
"
|
|
41
|
+
"optimum[onnxruntime]>=1.18",
|
|
42
42
|
]
|
|
43
43
|
# Install everything
|
|
44
44
|
full = [
|
|
45
45
|
"presidio-analyzer>=2.2",
|
|
46
46
|
"spacy>=3.7",
|
|
47
47
|
"transformers>=4.40",
|
|
48
|
-
"
|
|
48
|
+
"optimum[onnxruntime]>=1.18",
|
|
49
49
|
]
|
|
50
50
|
|
|
51
51
|
[project.scripts]
|
|
52
52
|
pii-detect = "pii_detector.cli:main"
|
|
53
53
|
|
|
54
54
|
[project.urls]
|
|
55
|
-
Homepage = "https://github.com/sjain26/pii-
|
|
55
|
+
Homepage = "https://github.com/sjain26/pii-detector"
|
|
56
56
|
|
|
57
57
|
[tool.setuptools.packages.find]
|
|
58
58
|
include = ["pii_detector*"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|