pii-protector 2.2.0__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {pii_protector-2.2.0 → pii_protector-2.2.2}/PKG-INFO +10 -10
  2. {pii_protector-2.2.0 → pii_protector-2.2.2}/README.md +6 -6
  3. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/__init__.py +1 -1
  4. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/config/settings.py +8 -8
  5. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/detector.py +85 -84
  6. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/core_pii.py +1 -1
  7. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/utils/helpers.py +2 -2
  8. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/PKG-INFO +10 -10
  9. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/requires.txt +2 -2
  10. {pii_protector-2.2.0 → pii_protector-2.2.2}/pyproject.toml +5 -5
  11. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/__main__.py +0 -0
  12. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/cli.py +0 -0
  13. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/config/__init__.py +0 -0
  14. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/__init__.py +0 -0
  15. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/api_secrets.py +0 -0
  16. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/cloud_providers.py +0 -0
  17. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/communication.py +0 -0
  18. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/confidential_markers.py +0 -0
  19. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/dates_urls.py +0 -0
  20. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/file_control.py +0 -0
  21. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/india_pii.py +0 -0
  22. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/monitoring.py +0 -0
  23. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/north_america_pii.py +0 -0
  24. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/payment.py +0 -0
  25. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/patterns/version_control.py +0 -0
  26. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/utils/__init__.py +0 -0
  27. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/validators/__init__.py +0 -0
  28. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/validators/api_validators.py +0 -0
  29. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_detector/validators/pii_validators.py +0 -0
  30. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/SOURCES.txt +0 -0
  31. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/dependency_links.txt +0 -0
  32. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/entry_points.txt +0 -0
  33. {pii_protector-2.2.0 → pii_protector-2.2.2}/pii_protector.egg-info/top_level.txt +0 -0
  34. {pii_protector-2.2.0 → pii_protector-2.2.2}/setup.cfg +0 -0
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pii-protector
3
- Version: 2.2.0
3
+ Version: 2.2.2
4
4
  Summary: Production-grade PII detection with multi-model ensemble
5
5
  Author-email: tensoryug <jainsatyam26@gmail.com>
6
6
  License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/sjain26/pii-guard
7
+ Project-URL: Homepage, https://github.com/sjain26/pii-detector
8
8
  Keywords: pii,privacy,nlp,detection,presidio,data-protection
9
9
  Classifier: Development Status :: 5 - Production/Stable
10
10
  Classifier: Intended Audience :: Developers
@@ -22,12 +22,12 @@ Requires-Dist: presidio-analyzer>=2.2; extra == "presidio"
22
22
  Requires-Dist: spacy>=3.7; extra == "presidio"
23
23
  Provides-Extra: transformers
24
24
  Requires-Dist: transformers>=4.40; extra == "transformers"
25
- Requires-Dist: torch>=2.0; extra == "transformers"
25
+ Requires-Dist: optimum[onnxruntime]>=1.18; extra == "transformers"
26
26
  Provides-Extra: full
27
27
  Requires-Dist: presidio-analyzer>=2.2; extra == "full"
28
28
  Requires-Dist: spacy>=3.7; extra == "full"
29
29
  Requires-Dist: transformers>=4.40; extra == "full"
30
- Requires-Dist: torch>=2.0; extra == "full"
30
+ Requires-Dist: optimum[onnxruntime]>=1.18; extra == "full"
31
31
 
32
32
  # pii-protector
33
33
 
@@ -40,9 +40,9 @@ Layer 1 — Regex always runs ~0.3–3ms
40
40
  ↓ escalation score >= 3?
41
41
  Layer 2 — Presidio + spaCy NER names / orgs / loc ~5ms
42
42
  ↓ confidence low or conflict?
43
- Layer 3 — RoBERTa-large NER high-accuracy names ~20ms
43
+ Layer 3 — NER Transformer high-accuracy names ~20ms
44
44
  +
45
- Layer 4 — piiranha-v1 structured PII ~15ms
45
+ Layer 4 — PII Model structured PII ~15ms
46
46
  ```
47
47
 
48
48
  Each layer decides whether the next one is needed. On clean structured text, only Layer 1 runs.
@@ -144,8 +144,8 @@ pii-detect --no-presidio --no-transformer --no-pii-model "card: 4111111111111111
144
144
  ```python
145
145
  detector = AdvancedPIIDetector(
146
146
  use_presidio=True, # Layer 2: spaCy NER via Presidio
147
- use_transformers=True, # Layer 3: RoBERTa-large NER
148
- use_pii_model=True, # Layer 4: piiranha-v1
147
+ use_transformers=True, # Layer 3: NER transformer
148
+ use_pii_model=True, # Layer 4: PII model
149
149
  confidence_threshold=0.5, # Early-stop threshold
150
150
  spacy_model="en_core_web_lg",
151
151
  transformer_model="jainsatyam26/pii-ner-onnx",
@@ -171,8 +171,8 @@ SPACY_DATA_PATH=/path/to/models
171
171
  | 4 | [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) | PHONE, EMAIL, SSN, DOB, ADDRESS |
172
172
 
173
173
  ONNX-optimized versions:
174
- - [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — RoBERTa NER
175
- - [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — piiranha-v1
174
+ - [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — NER model
175
+ - [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — PII structured model
176
176
 
177
177
  ## License
178
178
 
@@ -9,9 +9,9 @@ Layer 1 — Regex always runs ~0.3–3ms
9
9
  ↓ escalation score >= 3?
10
10
  Layer 2 — Presidio + spaCy NER names / orgs / loc ~5ms
11
11
  ↓ confidence low or conflict?
12
- Layer 3 — RoBERTa-large NER high-accuracy names ~20ms
12
+ Layer 3 — NER Transformer high-accuracy names ~20ms
13
13
  +
14
- Layer 4 — piiranha-v1 structured PII ~15ms
14
+ Layer 4 — PII Model structured PII ~15ms
15
15
  ```
16
16
 
17
17
  Each layer decides whether the next one is needed. On clean structured text, only Layer 1 runs.
@@ -113,8 +113,8 @@ pii-detect --no-presidio --no-transformer --no-pii-model "card: 4111111111111111
113
113
  ```python
114
114
  detector = AdvancedPIIDetector(
115
115
  use_presidio=True, # Layer 2: spaCy NER via Presidio
116
- use_transformers=True, # Layer 3: RoBERTa-large NER
117
- use_pii_model=True, # Layer 4: piiranha-v1
116
+ use_transformers=True, # Layer 3: NER transformer
117
+ use_pii_model=True, # Layer 4: PII model
118
118
  confidence_threshold=0.5, # Early-stop threshold
119
119
  spacy_model="en_core_web_lg",
120
120
  transformer_model="jainsatyam26/pii-ner-onnx",
@@ -140,8 +140,8 @@ SPACY_DATA_PATH=/path/to/models
140
140
  | 4 | [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) | PHONE, EMAIL, SSN, DOB, ADDRESS |
141
141
 
142
142
  ONNX-optimized versions:
143
- - [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — RoBERTa NER
144
- - [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — piiranha-v1
143
+ - [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — NER model
144
+ - [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — PII structured model
145
145
 
146
146
  ## License
147
147
 
@@ -7,5 +7,5 @@ Multi-model ensemble with pre-compiled patterns and type-aware voting
7
7
  from .detector import AdvancedPIIDetector, PIIEntity, PresidioCustomDetector
8
8
  from .config.settings import PII_CONFIG
9
9
 
10
- __version__ = "2.2.0"
10
+ __version__ = "2.2.2"
11
11
  __all__ = ["AdvancedPIIDetector", "PIIEntity", "PresidioCustomDetector", "PII_CONFIG"]
@@ -4,8 +4,8 @@ Configuration settings for PII detection
4
4
  Latency profile (per-call on typical hardware):
5
5
  Regex only : ~0.3ms (Layer 1 — structured PII, India PII, API secrets)
6
6
  Regex + Presidio : ~5ms (Layer 2 — adds NAME/ORG/LOC via spaCy, CPU)
7
- Regex + Presidio + RoBERTa-NER : ~20ms (Layer 3 — high-accuracy English NER, GPU)
8
- + piiranha-v1 : +15ms (Layer 4 — structured PII: PHONE, EMAIL, DOB, SSN)
7
+ Regex + Presidio + NER Transformer : ~20ms (Layer 3 — high-accuracy English NER, GPU)
8
+ + PII Model : +15ms (Layer 4 — structured PII: PHONE, EMAIL, DOB, SSN)
9
9
 
10
10
  Early stop fires for ~80% of texts → average well under 5ms.
11
11
 
@@ -14,7 +14,7 @@ Model selection rationale:
14
14
  - Higher NER accuracy than sm for NAME/ORG/LOC — worth the extra size
15
15
  - Used inside Presidio (Layer 2)
16
16
  ✅ jainsatyam26/pii-ner-onnx:
17
- - Best-in-class English NER accuracy (RoBERTa-large fine-tuned on CoNLL-2003)
17
+ - Best-in-class English NER accuracy
18
18
  - Significantly better NAME detection vs multilingual models
19
19
  - Labels: PER → NAME, ORG → ORGANIZATION, LOC → LOCATION, MISC → _SKIP
20
20
  ✅ jainsatyam26/pii-detector-onnx:
@@ -27,11 +27,11 @@ Model selection rationale:
27
27
  PII_CONFIG = {
28
28
  # ── What to run ──────────────────────────────────────────────────
29
29
  # Presidio (spaCy NER): ON → for NAME, ORG, LOCATION — CPU, fast
30
- # Transformer (RoBERTa): ON → high-accuracy English NER — GPU, conditional
31
- # PII model (piiranha): ON → structured PII specialist — always runs
30
+ # NER Transformer: ON → high-accuracy English NER — GPU, conditional
31
+ # PII model: ON → structured PII specialist — always runs
32
32
  'use_presidio': True,
33
- 'use_transformers': True, # roberta-large NER enabled
34
- 'use_pii_model': True, # piiranha enabled
33
+ 'use_transformers': True, # NER transformer enabled
34
+ 'use_pii_model': True, # PII model enabled
35
35
 
36
36
  # Model names
37
37
  'spacy_model': "en_core_web_lg", # Large model — better NER accuracy than sm
@@ -98,7 +98,7 @@ TYPE_MAPPINGS = {
98
98
  'LAW': '_SKIP',
99
99
  'LANGUAGE': '_SKIP',
100
100
 
101
- # Transformer types (Davlan/xlm-roberta-large-ner-hrl)
101
+ # Transformer types (multilingual NER)
102
102
  # NOTE: ORG -> ORGANIZATION and LOC -> LOCATION are already mapped in the spaCy section above
103
103
  'PER': 'NAME', # Person names -> NAME
104
104
  'MISC': '_SKIP', # Miscellaneous -> skip
@@ -82,7 +82,7 @@ def _empty_timing() -> Dict:
82
82
  """Return a zeroed timing dict for empty/skipped calls."""
83
83
  return {
84
84
  'regex_ms': 0.0, 'presidio_ms': 0.0, 'transformer_ms': 0.0,
85
- 'piiranha_ms': 0.0,
85
+ 'pii_model_ms': 0.0,
86
86
  'escalation_check_ms': 0.0, 'postprocess_ms': 0.0, 'total_ms': 0.0,
87
87
  'layers': 'none', 'ner_escalation_score': 0,
88
88
  'entities_found': 0, 'text_length': 0,
@@ -148,7 +148,7 @@ _COMMON_NON_NAME_WORDS = frozenset({
148
148
 
149
149
  # Escalation score thresholds
150
150
  _NER_ESCALATION_THRESHOLD = 3
151
- # Transformer (Davlan multilingual) runs only when Presidio NER is uncertain.
151
+ # NER transformer runs only when Presidio NER is uncertain.
152
152
  # Threshold=3 means: run transformer only if name confidence is low or conflicting.
153
153
  # 0 = always run (wasteful), 3 = smart conditional (saves ~9ms on clear cases).
154
154
  _TRANSFORMER_ESCALATION_THRESHOLD = 3
@@ -169,7 +169,7 @@ class PIIEntity:
169
169
  return hash((self.text, self.entity_type, self.start, self.end))
170
170
 
171
171
 
172
- # Person-context keywords — when present, a piiranha USERNAME is likely a NAME.
172
+ # Person-context keywords — when present, a pii_model USERNAME is likely a NAME.
173
173
  _PERSON_CONTEXT_KW = frozenset({
174
174
  'appointment', 'scheduling', 'schedule', 'scheduled',
175
175
  'patient', 'customer', 'client', 'employee', 'user',
@@ -178,7 +178,7 @@ _PERSON_CONTEXT_KW = frozenset({
178
178
  'meeting', 'visit', 'booking', 'reservation',
179
179
  })
180
180
 
181
- # piiranha-v1 actual label → our standard type (labels verified from model output).
181
+ # PII model actual label → our standard type (labels verified from model output).
182
182
  _PIIRANHA_MAP = {
183
183
  # Name variants
184
184
  'GIVENNAME': 'NAME',
@@ -215,7 +215,7 @@ _PIIRANHA_MAP = {
215
215
  'ID_CARD': '_SKIP',
216
216
  # Auth
217
217
  'USERNAME': 'USERNAME',
218
- 'PASSWORD': '_SKIP', # NOSONAR - piiranha label key, not a credential
218
+ 'PASSWORD': '_SKIP', # NOSONAR - pii_model label key, not a credential
219
219
  # Location (skip — too noisy)
220
220
  'CITY': '_SKIP',
221
221
  'ZIPCODE': '_SKIP',
@@ -305,23 +305,42 @@ class AdvancedPIIDetector:
305
305
  self.type_mappings = TYPE_MAPPINGS
306
306
 
307
307
  self.presidio = None
308
- self.ner_pipeline = None # RoBERTa-large English NER: NAME / ORG / LOC
309
- self.pii_pipeline = None # piiranha-v1: structured PII (PHONE, EMAIL, SSN, DOB etc.)
308
+ self.ner_pipeline = None # NER transformer: NAME / ORG / LOC
309
+ self.pii_pipeline = None # PII model: structured PII (PHONE, EMAIL, SSN, DOB etc.)
310
310
  self._ner_sources_count = 0
311
311
  self.last_timing: Dict = _empty_timing()
312
312
 
313
313
  if self.use_presidio:
314
314
  self._init_presidio()
315
315
  if self.use_transformers:
316
- self._init_transformers() # RoBERTa-large English NER (NAME/ORG/LOC)
316
+ self._init_transformers() # NER transformer (NAME/ORG/LOC)
317
317
  if self.use_pii_model:
318
- self._init_pii_transformer() # piiranha-v1 structured PII
318
+ self._init_pii_transformer() # PII model structured PII
319
+
320
+ if self.use_presidio and not self.presidio:
321
+ warnings.warn(
322
+ "Presidio (Layer 2) unavailable — NAME/ORG/LOCATION detection disabled. "
323
+ "Install: pip install 'pii-protector[presidio]' then: python -m spacy download en_core_web_lg",
324
+ UserWarning, stacklevel=2,
325
+ )
326
+ if self.use_transformers and not self.ner_pipeline:
327
+ warnings.warn(
328
+ "NER transformer (Layer 3) unavailable — high-accuracy name detection disabled. "
329
+ "Install: pip install 'optimum[onnxruntime]>=1.18' 'transformers>=4.40'",
330
+ UserWarning, stacklevel=2,
331
+ )
332
+ if self.use_pii_model and not self.pii_pipeline:
333
+ warnings.warn(
334
+ "PII model (Layer 4) unavailable — PHONE/EMAIL/DOB/SSN transformer detection disabled. "
335
+ "Install: pip install 'optimum[onnxruntime]>=1.18' 'transformers>=4.40'",
336
+ UserWarning, stacklevel=2,
337
+ )
319
338
 
320
339
  logger.info(
321
340
  f"PII Detector ready | mode=SEQUENTIAL_LAYERED "
322
341
  f"| presidio={'ON' if self.presidio else 'OFF'} "
323
342
  f"| ner={'ON' if self.ner_pipeline else 'OFF'} "
324
- f"| piiranha={'ON' if self.pii_pipeline else 'OFF'} "
343
+ f"| pii_model={'ON' if self.pii_pipeline else 'OFF'} "
325
344
  f"| ner_sources={self._ner_sources_count}"
326
345
  )
327
346
 
@@ -363,61 +382,43 @@ class AdvancedPIIDetector:
363
382
  logger.warning(f"Presidio unavailable: {e}")
364
383
 
365
384
  def _init_transformers(self):
366
- """RoBERTa-large English NER — high-accuracy NAME/ORG/LOC."""
385
+ """NER transformer — high-accuracy NAME/ORG/LOC (ONNX Runtime)."""
367
386
  try:
368
- from transformers import pipeline
369
- try:
370
- with _suppress_stdout():
371
- self.ner_pipeline = pipeline(
372
- "ner",
373
- model=self.transformer_model_name,
374
- aggregation_strategy="first",
375
- device=0,
376
- model_kwargs={"dtype": "float16"},
377
- )
378
- self.ner_pipeline("warmup") # GPU kernel compile
379
- logger.info(f"NER ready on GPU ({self.transformer_model_name})")
380
- except Exception:
381
- with _suppress_stdout():
382
- self.ner_pipeline = pipeline(
383
- "ner",
384
- model=self.transformer_model_name,
385
- aggregation_strategy="first",
386
- device=-1,
387
- )
388
- self.ner_pipeline("warmup")
389
- logger.info(f"NER ready on CPU ({self.transformer_model_name})")
387
+ from optimum.onnxruntime import ORTModelForTokenClassification
388
+ from transformers import AutoTokenizer, pipeline
389
+ with _suppress_stdout():
390
+ model = ORTModelForTokenClassification.from_pretrained(self.transformer_model_name)
391
+ tokenizer = AutoTokenizer.from_pretrained(self.transformer_model_name)
392
+ self.ner_pipeline = pipeline(
393
+ "ner",
394
+ model=model,
395
+ tokenizer=tokenizer,
396
+ aggregation_strategy="first",
397
+ )
398
+ self.ner_pipeline("warmup")
390
399
  self._ner_sources_count += 1
400
+ logger.info(f"NER ready ({self.transformer_model_name})")
391
401
  except Exception as e:
392
402
  logger.warning(f"NER transformer unavailable: {e}")
393
403
 
394
404
  def _init_pii_transformer(self):
395
- """piiranha-v1 structured PII detection — PHONE, EMAIL, SSN, DOB, ADDRESS etc."""
405
+ """PII model structured PII detection — PHONE, EMAIL, SSN, DOB, ADDRESS etc. (ONNX Runtime)."""
396
406
  try:
397
- from transformers import pipeline
398
- try:
399
- with _suppress_stdout():
400
- self.pii_pipeline = pipeline(
401
- "token-classification",
402
- model=self.pii_model_name,
403
- aggregation_strategy="first",
404
- device=0,
405
- model_kwargs={"dtype": "float16"},
406
- )
407
- self.pii_pipeline("warmup") # GPU kernel compile
408
- logger.info(f"piiranha ready on GPU ({self.pii_model_name})")
409
- except Exception:
410
- with _suppress_stdout():
411
- self.pii_pipeline = pipeline(
412
- "token-classification",
413
- model=self.pii_model_name,
414
- aggregation_strategy="first",
415
- device=-1,
416
- )
417
- self.pii_pipeline("warmup")
418
- logger.info(f"piiranha ready on CPU ({self.pii_model_name})")
407
+ from optimum.onnxruntime import ORTModelForTokenClassification
408
+ from transformers import AutoTokenizer, pipeline
409
+ with _suppress_stdout():
410
+ model = ORTModelForTokenClassification.from_pretrained(self.pii_model_name)
411
+ tokenizer = AutoTokenizer.from_pretrained(self.pii_model_name)
412
+ self.pii_pipeline = pipeline(
413
+ "token-classification",
414
+ model=model,
415
+ tokenizer=tokenizer,
416
+ aggregation_strategy="first",
417
+ )
418
+ self.pii_pipeline("warmup")
419
+ logger.info(f"PII model ready ({self.pii_model_name})")
419
420
  except Exception as e:
420
- logger.warning(f"piiranha unavailable: {e}")
421
+ logger.warning(f"PII model unavailable: {e}")
421
422
 
422
423
  # ------------------------------------------------------------------
423
424
  # Layer detection methods
@@ -540,8 +541,8 @@ class AdvancedPIIDetector:
540
541
  clean = word.strip()
541
542
  return clean.isalpha() and len(clean) >= 3
542
543
 
543
- def _piiranha_results_to_entities(self, results, offset, overlap, person_ctx):
544
- """Map piiranha pipeline results (one chunk) into PIIEntity objects."""
544
+ def _pii_model_results_to_entities(self, results, offset, overlap, person_ctx):
545
+ """Map pii_model pipeline results (one chunk) into PIIEntity objects."""
545
546
  out = []
546
547
  for r in results:
547
548
  if offset > 0 and r['start'] < overlap:
@@ -551,23 +552,23 @@ class AdvancedPIIDetector:
551
552
  if entity_type == '_SKIP':
552
553
  continue
553
554
  word = r['word']
554
- # piiranha often tags standalone person names as USERNAME in free text;
555
+ # PII model often tags standalone person names as USERNAME in free text;
555
556
  # promote to NAME when person context is present and the word looks like a name.
556
557
  if entity_type == 'USERNAME' and person_ctx and self._looks_like_name(word):
557
558
  entity_type = 'NAME'
558
- logger.debug(f"[piiranha] Promoted USERNAME→NAME for '{word}' (person context)")
559
+ logger.debug(f"[pii_model] Promoted USERNAME→NAME for '{word}' (person context)")
559
560
  out.append(PIIEntity(
560
561
  text=word,
561
562
  entity_type=entity_type,
562
563
  start=int(r['start']) + offset,
563
564
  end=int(r['end']) + offset,
564
565
  score=float(r['score']),
565
- source="piiranha",
566
+ source="pii_model",
566
567
  ))
567
568
  return out
568
569
 
569
570
  def _detect_pii_transformer(self, text: str) -> List[PIIEntity]:
570
- """piiranha-v1 specialized PII detection with chunking (see _PIIRANHA_MAP)."""
571
+ """PII model specialized PII detection with chunking (see _PIIRANHA_MAP)."""
571
572
  if not self.pii_pipeline:
572
573
  return []
573
574
 
@@ -584,11 +585,11 @@ class AdvancedPIIDetector:
584
585
  for chunk_results, chunk_start in zip(
585
586
  self.pii_pipeline(chunks, batch_size=batch_size), offsets
586
587
  ):
587
- entities.extend(self._piiranha_results_to_entities(chunk_results, chunk_start, overlap, person_ctx))
588
+ entities.extend(self._pii_model_results_to_entities(chunk_results, chunk_start, overlap, person_ctx))
588
589
  else:
589
- entities.extend(self._piiranha_results_to_entities(self.pii_pipeline(text), 0, overlap, person_ctx))
590
+ entities.extend(self._pii_model_results_to_entities(self.pii_pipeline(text), 0, overlap, person_ctx))
590
591
  except Exception as e:
591
- logger.warning(f"PII transformer (piiranha) detection failed: {e}")
592
+ logger.warning(f"PII transformer detection failed: {e}")
592
593
 
593
594
  return entities
594
595
 
@@ -640,7 +641,7 @@ class AdvancedPIIDetector:
640
641
 
641
642
  When non-Latin Unicode is present the text is in a language that static
642
643
  English/Latin keyword patterns cannot cover. We must escalate to the
643
- XLM-RoBERTa multilingual transformer so it can detect names/orgs/locations.
644
+ multilingual NER transformer so it can detect names/orgs/locations.
644
645
  """
645
646
  return bool(re.search(
646
647
  r'[\u0600-\u06FF' # Arabic
@@ -709,7 +710,7 @@ class AdvancedPIIDetector:
709
710
  score += 3
710
711
  if has_ner_kw:
711
712
  score += 3
712
- if has_non_latin: # multilingual text needs XLM-RoBERTa NER
713
+ if has_non_latin: # multilingual text needs multilingual NER
713
714
  score += 3
714
715
 
715
716
  score += self._regex_confidence_score(
@@ -759,7 +760,7 @@ class AdvancedPIIDetector:
759
760
  Signals:
760
761
  +3 NER entity with confidence < 0.55 (very uncertain)
761
762
  +3 Non-Latin Unicode present — Presidio/spaCy is English-only,
762
- XLM-RoBERTa transformer is the only model that handles
763
+ multilingual NER transformer is the only model that handles
763
764
  Arabic / CJK / Devanagari / Cyrillic NER correctly.
764
765
  +2 NER entity with confidence 0.55 – 0.70 (uncertain)
765
766
  +2 Overlapping entities of different types (conflict)
@@ -774,11 +775,11 @@ class AdvancedPIIDetector:
774
775
  ner_types = {'NAME', 'PERSON', 'ORGANIZATION', 'LOCATION'}
775
776
  ner_ents = [e for e in current_entities if e.entity_type in ner_types]
776
777
 
777
- # Non-Latin Unicode → spaCy is English-only; escalate to XLM-RoBERTa.
778
+ # Non-Latin Unicode: escalate to multilingual NER.
778
779
  if self._has_non_latin_unicode(text):
779
780
  score += 3
780
781
 
781
- # RoBERTa-large is markedly more accurate than spaCy NER for person names.
782
+ # NER transformer is markedly more accurate than spaCy NER for person names.
782
783
  if any(e.entity_type in {'NAME', 'PERSON'} for e in ner_ents):
783
784
  score += 3
784
785
 
@@ -831,7 +832,7 @@ class AdvancedPIIDetector:
831
832
  return False
832
833
 
833
834
  # Never early-stop for non-Latin Unicode (Arabic, CJK, Devanagari etc.)
834
- # Static keyword patterns are Latin-only; XLM-RoBERTa must run.
835
+ # Static keyword patterns are Latin-only; multilingual NER must run.
835
836
  if self._has_non_latin_unicode(text):
836
837
  return False
837
838
 
@@ -1037,28 +1038,28 @@ class AdvancedPIIDetector:
1037
1038
  # ------------------------------------------------------------------
1038
1039
 
1039
1040
  def _run_ner_layers(self, text, all_entities, layers_used, timing):
1040
- """Run Presidio + RoBERTa + piiranha layers, mutating all_entities/layers_used/timing."""
1041
+ """Run Presidio + NER + PII model layers, mutating all_entities/layers_used/timing."""
1041
1042
  if self.presidio:
1042
1043
  t0 = time.perf_counter()
1043
1044
  all_entities.extend(self._detect_presidio(text))
1044
1045
  timing['presidio_ms'] = _elapsed_ms(t0)
1045
1046
  layers_used.append('presidio')
1046
1047
 
1047
- # Layer 3 (RoBERTa-large) runs only when Presidio NER is uncertain.
1048
+ # Layer 3 (NER transformer) runs only when Presidio NER is uncertain.
1048
1049
  if self.ner_pipeline:
1049
1050
  transformer_score = self._calc_transformer_escalation_score(text, list(all_entities))
1050
1051
  if transformer_score >= _TRANSFORMER_ESCALATION_THRESHOLD:
1051
1052
  t0 = time.perf_counter()
1052
1053
  all_entities.extend(self._detect_transformers(text))
1053
1054
  timing['transformer_ms'] = _elapsed_ms(t0)
1054
- layers_used.append('roberta')
1055
+ layers_used.append('ner')
1055
1056
 
1056
- # Layer 4 (piiranha-v1) catches informal PHONE/EMAIL/SSN/DOB that regex misses.
1057
+ # Layer 4 (PII model) catches informal PHONE/EMAIL/SSN/DOB that regex misses.
1057
1058
  if self.pii_pipeline:
1058
1059
  t0 = time.perf_counter()
1059
1060
  all_entities.extend(self._detect_pii_transformer(text))
1060
- timing['piiranha_ms'] = _elapsed_ms(t0)
1061
- layers_used.append('piiranha')
1061
+ timing['pii_model_ms'] = _elapsed_ms(t0)
1062
+ layers_used.append('pii_model')
1062
1063
 
1063
1064
  @staticmethod
1064
1065
  def _entities_to_dicts(entities):
@@ -1083,8 +1084,8 @@ class AdvancedPIIDetector:
1083
1084
  2. Early stop check -> skip NER if high confidence + no name signals
1084
1085
  3. Escalation score -> if >= 3, run NER layers
1085
1086
  4. Layer 2 (Presidio NER) -> spaCy NAME, ORG, LOCATION
1086
- 5. Layer 3 (roberta-large) -> high-accuracy NAME detection
1087
- 6. Layer 4 (piiranha-v1) -> structured PII: PHONE, EMAIL, SSN, DOB etc.
1087
+ 5. Layer 3 (NER transformer) -> high-accuracy NAME detection
1088
+ 6. Layer 4 (PII model) -> structured PII: PHONE, EMAIL, SSN, DOB etc.
1088
1089
  7. Ensemble vote -> conflict resolve -> filter -> deduplicate
1089
1090
 
1090
1091
  Returns list of entity dicts.
@@ -1114,7 +1115,7 @@ class AdvancedPIIDetector:
1114
1115
  layers_used = ['regex']
1115
1116
  timing['presidio_ms'] = 0.0
1116
1117
  timing['transformer_ms'] = 0.0
1117
- timing['piiranha_ms'] = 0.0
1118
+ timing['pii_model_ms'] = 0.0
1118
1119
  timing['early_stop'] = False
1119
1120
 
1120
1121
  # ── Early stopping check (if high confidence) ─────────────────
@@ -1160,8 +1161,8 @@ class AdvancedPIIDetector:
1160
1161
  f"[PII TIMER] total={timing['total_ms']:.1f}ms | "
1161
1162
  f"regex={timing['regex_ms']:.1f}ms | "
1162
1163
  f"presidio={timing['presidio_ms']:.1f}ms | "
1163
- f"roberta={timing['transformer_ms']:.1f}ms | "
1164
- f"piiranha={timing['piiranha_ms']:.1f}ms | "
1164
+ f"ner={timing['transformer_ms']:.1f}ms | "
1165
+ f"pii_model={timing['pii_model_ms']:.1f}ms | "
1165
1166
  f"post={timing['postprocess_ms']:.1f}ms"
1166
1167
  )
1167
1168
 
@@ -1242,6 +1243,6 @@ class PresidioCustomDetector(AdvancedPIIDetector):
1242
1243
  use_presidio=PII_CONFIG.get('use_presidio', True),
1243
1244
  use_transformers=PII_CONFIG.get('use_transformers', True),
1244
1245
  transformer_model=PII_CONFIG.get('transformer_model',
1245
- 'Davlan/bert-base-multilingual-cased-ner-hrl'),
1246
+ 'jainsatyam26/pii-ner-onnx'),
1246
1247
  )
1247
1248
  self.name = "Advanced_PII_Detector"
@@ -169,7 +169,7 @@ CORE_PII_PATTERNS = {
169
169
  # ── NAME ───────────────────────────────────────────────────────────
170
170
  # Regex only covers STRUCTURED label patterns (exact field names).
171
171
  # Free-text names (threat verbs, sentences) are handled by
172
- # Presidio/spaCy NER (Layer 2) + RoBERTa transformer (Layer 3).
172
+ # Presidio/spaCy NER (Layer 2) + NER transformer (Layer 3).
173
173
  'NAME': [
174
174
  # Explicit form/document field labels — unambiguous
175
175
  # "Full Name: John Smith", "Employee Name: Sneha Patel", "Claimant Name: Ahmed Khan"
@@ -111,8 +111,8 @@ def deduplicate_entities(entities: List[Dict], overlap_threshold: float = 0.5) -
111
111
  1. Score ke basis pe descending sort karo (highest score pehle process ho)
112
112
  2. Har entity ke liye check karo — kya kisi already-kept entity se overlap hai?
113
113
  - Same span + SAME type → duplicate (same detector found it twice)
114
- - Same span + DIFF type → cross-category conflict (e.g. piiranha EMAIL vs
115
- regex EMAIL, or piiranha PHONE vs regex PHONE) —
114
+ - Same span + DIFF type → cross-category conflict (e.g. pii_model EMAIL vs
115
+ regex EMAIL, or pii_model PHONE vs regex PHONE) —
116
116
  keep only highest-score, drop the rest
117
117
  3. Agar overlap hai → discard (already-kept wala higher score ka hai)
118
118
  4. Final result position ke basis pe sort karo
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pii-protector
3
- Version: 2.2.0
3
+ Version: 2.2.2
4
4
  Summary: Production-grade PII detection with multi-model ensemble
5
5
  Author-email: tensoryug <jainsatyam26@gmail.com>
6
6
  License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/sjain26/pii-guard
7
+ Project-URL: Homepage, https://github.com/sjain26/pii-detector
8
8
  Keywords: pii,privacy,nlp,detection,presidio,data-protection
9
9
  Classifier: Development Status :: 5 - Production/Stable
10
10
  Classifier: Intended Audience :: Developers
@@ -22,12 +22,12 @@ Requires-Dist: presidio-analyzer>=2.2; extra == "presidio"
22
22
  Requires-Dist: spacy>=3.7; extra == "presidio"
23
23
  Provides-Extra: transformers
24
24
  Requires-Dist: transformers>=4.40; extra == "transformers"
25
- Requires-Dist: torch>=2.0; extra == "transformers"
25
+ Requires-Dist: optimum[onnxruntime]>=1.18; extra == "transformers"
26
26
  Provides-Extra: full
27
27
  Requires-Dist: presidio-analyzer>=2.2; extra == "full"
28
28
  Requires-Dist: spacy>=3.7; extra == "full"
29
29
  Requires-Dist: transformers>=4.40; extra == "full"
30
- Requires-Dist: torch>=2.0; extra == "full"
30
+ Requires-Dist: optimum[onnxruntime]>=1.18; extra == "full"
31
31
 
32
32
  # pii-protector
33
33
 
@@ -40,9 +40,9 @@ Layer 1 — Regex always runs ~0.3–3ms
40
40
  ↓ escalation score >= 3?
41
41
  Layer 2 — Presidio + spaCy NER names / orgs / loc ~5ms
42
42
  ↓ confidence low or conflict?
43
- Layer 3 — RoBERTa-large NER high-accuracy names ~20ms
43
+ Layer 3 — NER Transformer high-accuracy names ~20ms
44
44
  +
45
- Layer 4 — piiranha-v1 structured PII ~15ms
45
+ Layer 4 — PII Model structured PII ~15ms
46
46
  ```
47
47
 
48
48
  Each layer decides whether the next one is needed. On clean structured text, only Layer 1 runs.
@@ -144,8 +144,8 @@ pii-detect --no-presidio --no-transformer --no-pii-model "card: 4111111111111111
144
144
  ```python
145
145
  detector = AdvancedPIIDetector(
146
146
  use_presidio=True, # Layer 2: spaCy NER via Presidio
147
- use_transformers=True, # Layer 3: RoBERTa-large NER
148
- use_pii_model=True, # Layer 4: piiranha-v1
147
+ use_transformers=True, # Layer 3: NER transformer
148
+ use_pii_model=True, # Layer 4: PII model
149
149
  confidence_threshold=0.5, # Early-stop threshold
150
150
  spacy_model="en_core_web_lg",
151
151
  transformer_model="jainsatyam26/pii-ner-onnx",
@@ -171,8 +171,8 @@ SPACY_DATA_PATH=/path/to/models
171
171
  | 4 | [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) | PHONE, EMAIL, SSN, DOB, ADDRESS |
172
172
 
173
173
  ONNX-optimized versions:
174
- - [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — RoBERTa NER
175
- - [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — piiranha-v1
174
+ - [`jainsatyam26/pii-ner-onnx`](https://huggingface.co/jainsatyam26/pii-ner-onnx) — NER model
175
+ - [`jainsatyam26/pii-detector-onnx`](https://huggingface.co/jainsatyam26/pii-detector-onnx) — PII structured model
176
176
 
177
177
  ## License
178
178
 
@@ -3,7 +3,7 @@
3
3
  presidio-analyzer>=2.2
4
4
  spacy>=3.7
5
5
  transformers>=4.40
6
- torch>=2.0
6
+ optimum[onnxruntime]>=1.18
7
7
 
8
8
  [presidio]
9
9
  presidio-analyzer>=2.2
@@ -11,4 +11,4 @@ spacy>=3.7
11
11
 
12
12
  [transformers]
13
13
  transformers>=4.40
14
- torch>=2.0
14
+ optimum[onnxruntime]>=1.18
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "pii-protector"
7
- version = "2.2.0"
7
+ version = "2.2.2"
8
8
  description = "Production-grade PII detection with multi-model ensemble"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -35,24 +35,24 @@ presidio = [
35
35
  "presidio-analyzer>=2.2",
36
36
  "spacy>=3.7",
37
37
  ]
38
- # Layer 3 & 4: Transformer-based NER (RoBERTa) + piiranha-v1
38
+ # Layer 3 & 4: NER transformer + PII model (ONNX Runtime)
39
39
  transformers = [
40
40
  "transformers>=4.40",
41
- "torch>=2.0",
41
+ "optimum[onnxruntime]>=1.18",
42
42
  ]
43
43
  # Install everything
44
44
  full = [
45
45
  "presidio-analyzer>=2.2",
46
46
  "spacy>=3.7",
47
47
  "transformers>=4.40",
48
- "torch>=2.0",
48
+ "optimum[onnxruntime]>=1.18",
49
49
  ]
50
50
 
51
51
  [project.scripts]
52
52
  pii-detect = "pii_detector.cli:main"
53
53
 
54
54
  [project.urls]
55
- Homepage = "https://github.com/sjain26/pii-guard"
55
+ Homepage = "https://github.com/sjain26/pii-detector"
56
56
 
57
57
  [tool.setuptools.packages.find]
58
58
  include = ["pii_detector*"]
File without changes