classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,883 @@
1
+ """PII detector powered by Microsoft Presidio."""
2
+
3
+ import asyncio
4
+ import importlib
5
+ import logging
6
+ import re
7
+ import subprocess
8
+ import sys
9
+ import warnings
10
+ from dataclasses import dataclass
11
+ from typing import Any, ClassVar
12
+
13
+ from ...models.generated_detectors import DetectorConfig, PIIDetectorConfig, Severity
14
+ from ...models.generated_single_asset_scan_results import DetectionResult, DetectorType, Location
15
+ from ..base import BaseDetector
16
+ from ..dependencies import MissingDependencyError, require_module
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ _PRESIDIO_LOG_FILTER_INSTALLED = False
21
+
22
+
23
+ class _PresidioNoiseFilter(logging.Filter):
24
+ """Suppresses noisy but harmless Presidio initialization warnings."""
25
+
26
+ _SUPPRESSED = (
27
+ "Recognizer not added to registry because language is not supported by registry",
28
+ "model_to_presidio_entity_mapping is missing from configuration",
29
+ "low_score_entity_names is missing from configuration",
30
+ "labels_to_ignore is missing from configuration",
31
+ "Fetching all recognizers for language",
32
+ )
33
+
34
+ def filter(self, record: logging.LogRecord) -> bool:
35
+ msg = record.getMessage()
36
+ if any(s in msg for s in self._SUPPRESSED):
37
+ return False
38
+ if "Entity " in msg and (
39
+ "is not mapped to a Presidio entity" in msg
40
+ or "doesn't have the corresponding recognizer in language" in msg
41
+ ):
42
+ return False
43
+ return True
44
+
45
+
46
+ @dataclass(frozen=True)
47
+ class _TabularCell:
48
+ row_index: int
49
+ column_name: str
50
+ value: str
51
+
52
+
53
+ class PIIDetector(BaseDetector):
54
+ """
55
+ PII detector powered by Microsoft Presidio.
56
+
57
+ Detects personally identifiable information across global and regional entity types,
58
+ covering all built-in Presidio recognizers plus optional ad-hoc custom recognizers
59
+ defined in configuration.
60
+
61
+ Supported regions: Global, USA, UK, Spain, Italy, Singapore, Australia, India,
62
+ Finland, Poland, DACH (Germany / Austria / Switzerland).
63
+ """
64
+
65
+ detector_type = "pii"
66
+ detector_name = "pii"
67
+
68
+ # All entity types supported by built-in Presidio recognizers.
69
+ _ALL_SUPPORTED_ENTITIES: ClassVar[set[str]] = {
70
+ # Global
71
+ "CREDIT_CARD",
72
+ "CRYPTO",
73
+ "DATE_TIME",
74
+ "EMAIL_ADDRESS",
75
+ "IBAN_CODE",
76
+ "IP_ADDRESS",
77
+ "NRP",
78
+ "LOCATION",
79
+ "PERSON",
80
+ "PHONE_NUMBER",
81
+ "MEDICAL_LICENSE",
82
+ "URL",
83
+ # USA
84
+ "US_BANK_NUMBER",
85
+ "US_DRIVER_LICENSE",
86
+ "US_ITIN",
87
+ "US_PASSPORT",
88
+ "US_SSN",
89
+ # UK
90
+ "UK_NHS",
91
+ # Spain
92
+ "ES_NIF",
93
+ "ES_NIE",
94
+ # Italy
95
+ "IT_FISCAL_CODE",
96
+ "IT_DRIVER_LICENSE",
97
+ "IT_VAR_CODE",
98
+ "IT_PASSPORT",
99
+ "IT_IDENTITY_CARD",
100
+ # Singapore
101
+ "SG_NRIC_FIN",
102
+ "SG_UEN",
103
+ # Australia
104
+ "AU_ABN",
105
+ "AU_ACN",
106
+ "AU_TFN",
107
+ "AU_MEDICARE",
108
+ # India
109
+ "IN_PAN",
110
+ "IN_AADHAAR",
111
+ "IN_VEHICLE_REGISTRATION",
112
+ "IN_VOTER",
113
+ # Finland
114
+ "FI_PERSONAL_IDENTITY_CODE",
115
+ # Poland
116
+ "PL_PESEL",
117
+ # DACH
118
+ "AT_SVNR",
119
+ "CH_AHV",
120
+ "DE_TAX_ID",
121
+ "EU_NATIONAL_ID",
122
+ }
123
+
124
+ # Entity types that carry low signal in structured column values unless the column
125
+ # explicitly indicates free text (e.g. description, notes, body).
126
+ _NON_TEXT_ENTITY_TYPES: ClassVar[set[str]] = {"PERSON", "LOCATION", "DATE_TIME", "NRP"}
127
+
128
+ # Maps individual column-name tokens to the entity types relevant for that column.
129
+ _COLUMN_ENTITY_HINTS: ClassVar[dict[str, set[str]]] = {
130
+ "email": {"EMAIL_ADDRESS"},
131
+ "mail": {"EMAIL_ADDRESS"},
132
+ "phone": {"PHONE_NUMBER"},
133
+ "mobile": {"PHONE_NUMBER"},
134
+ "tel": {"PHONE_NUMBER"},
135
+ "telephone": {"PHONE_NUMBER"},
136
+ "fax": {"PHONE_NUMBER"},
137
+ "name": {"PERSON"},
138
+ "person": {"PERSON"},
139
+ "address": {"LOCATION"},
140
+ "location": {"LOCATION"},
141
+ "city": {"LOCATION"},
142
+ "state": {"LOCATION"},
143
+ "country": {"LOCATION"},
144
+ "postal": {"LOCATION"},
145
+ "postcode": {"LOCATION"},
146
+ "zipcode": {"LOCATION"},
147
+ "zip": {"LOCATION"},
148
+ "ip": {"IP_ADDRESS"},
149
+ "ssn": {"US_SSN"},
150
+ "passport": {"US_PASSPORT"},
151
+ "driver": {"US_DRIVER_LICENSE"},
152
+ "license": {"US_DRIVER_LICENSE"},
153
+ "iban": {"IBAN_CODE"},
154
+ "svnr": {"AT_SVNR"},
155
+ "ahv": {"CH_AHV"},
156
+ "tax": {"DE_TAX_ID"},
157
+ "national": {"EU_NATIONAL_ID"},
158
+ "card": {"CREDIT_CARD"},
159
+ "credit": {"CREDIT_CARD"},
160
+ "crypto": {"CRYPTO"},
161
+ "wallet": {"CRYPTO"},
162
+ "url": {"URL"},
163
+ "uri": {"URL"},
164
+ "website": {"URL"},
165
+ "nhs": {"UK_NHS"},
166
+ "medicare": {"AU_MEDICARE"},
167
+ "tfn": {"AU_TFN"},
168
+ "abn": {"AU_ABN"},
169
+ "acn": {"AU_ACN"},
170
+ "pan": {"IN_PAN"},
171
+ "aadhaar": {"IN_AADHAAR"},
172
+ "nric": {"SG_NRIC_FIN"},
173
+ "fin": {"SG_NRIC_FIN"},
174
+ "uen": {"SG_UEN"},
175
+ "pesel": {"PL_PESEL"},
176
+ "nif": {"ES_NIF"},
177
+ "nie": {"ES_NIE"},
178
+ "medical": {"MEDICAL_LICENSE"},
179
+ }
180
+
181
+ _FREE_TEXT_COLUMN_TOKENS: ClassVar[set[str]] = {
182
+ "text",
183
+ "body",
184
+ "content",
185
+ "description",
186
+ "message",
187
+ "comment",
188
+ "comments",
189
+ "note",
190
+ "notes",
191
+ "summary",
192
+ "details",
193
+ "bio",
194
+ }
195
+ _NAME_COLUMN_TOKENS: ClassVar[set[str]] = {
196
+ "name",
197
+ "first",
198
+ "last",
199
+ "middle",
200
+ "full",
201
+ "person",
202
+ "contact",
203
+ }
204
+ _EMAIL_COLUMN_TOKENS: ClassVar[set[str]] = {"email", "mail"}
205
+ _PHONE_COLUMN_TOKENS: ClassVar[set[str]] = {"phone", "mobile", "tel", "telephone", "fax"}
206
+ _ADDRESS_COLUMN_TOKENS: ClassVar[set[str]] = {
207
+ "address",
208
+ "street",
209
+ "city",
210
+ "state",
211
+ "country",
212
+ "postal",
213
+ "postcode",
214
+ "zipcode",
215
+ "zip",
216
+ "location",
217
+ }
218
+ _URL_COLUMN_TOKENS: ClassVar[set[str]] = {"url", "uri", "website", "web", "link", "domain"}
219
+ _ID_COLUMN_TOKENS: ClassVar[set[str]] = {"id", "uuid", "guid", "key", "source", "row"}
220
+
221
+ _TABULAR_ROW_RE: ClassVar[re.Pattern[str]] = re.compile(r"^row_(\d+):$")
222
+ _TABULAR_CELL_RE: ClassVar[re.Pattern[str]] = re.compile(r"^ ([^:]+):(?: ?(.*))?$")
223
+ _TABULAR_CONTINUATION_RE: ClassVar[re.Pattern[str]] = re.compile(r"^ (.*)$")
224
+
225
+ # Fall back to full-text analysis when a page has more than this many cells.
226
+ # Per-cell analysis at scale causes O(rows × columns) Presidio calls per page.
227
+ _TABULAR_CELL_LIMIT: ClassVar[int] = 200
228
+
229
def __init__(self, config: DetectorConfig | None = None) -> None:
    """Create the detector and eagerly try to build the Presidio analyzer.

    A missing dependency or missing data files do not abort construction:
    the failure is stored in ``_init_error`` and a warning is logged, so the
    error surfaces later when analysis is first attempted.

    Args:
        config: Detector configuration. When it is not a ``PIIDetectorConfig``
            a default ``PIIDetectorConfig`` is used for the typed settings.
    """
    super().__init__(config)

    if isinstance(config, PIIDetectorConfig):
        typed_config = config
    else:
        typed_config = PIIDetectorConfig()
    self._cfg: PIIDetectorConfig = typed_config

    self._init_error: MissingDependencyError | None = None
    self.analyzer: Any = None
    self._supported_entities_cache: frozenset[str] | None = None

    try:
        self._initialize_analyzer()
    except MissingDependencyError as missing:
        self._init_error = missing
        logger.warning("Presidio unavailable — PII detector will raise on first use: %s", missing)
    except (FileNotFoundError, OSError) as io_error:
        # Re-wrap so later callers see one uniform dependency error type.
        self._init_error = MissingDependencyError(
            "pii",
            ["privacy", "detectors"],
            f"Presidio installation is incomplete (missing data files): {io_error}",
        )
        logger.warning(
            "Presidio data files missing — PII detector will raise on first use: %s", io_error
        )
251
+
252
+ # ------------------------------------------------------------------
253
+ # Initialization
254
+ # ------------------------------------------------------------------
255
+
256
@staticmethod
def _patch_tldextract_offline() -> None:
    """Replace ``tldextract.extract`` with an instance that stays offline.

    tldextract ignores the TLDEXTRACT_CACHE env var; without explicit config it
    downloads the public suffix list on first use, hanging pods with no egress.
    Swapping in an extractor built on the bundled snapshot before Presidio's
    UrlRecognizer is loaded means it never makes a network call. Any failure
    (including tldextract not being installed) is logged at debug level and
    otherwise ignored.
    """
    try:
        import tldextract as _tl  # type: ignore[import-not-found, import-untyped]

        bundled_only = _tl.TLDExtract(suffix_list_urls=(), fallback_to_snapshot=True)
        # Force the PSL to load from the bundled snapshot right now.
        bundled_only("example.com")
        _tl.extract = bundled_only
    except Exception as err:
        logger.debug("tldextract offline patch skipped: %s", err)
273
+
274
def _initialize_analyzer(self) -> None:
    """Build the Presidio AnalyzerEngine with NLP engine and custom recognizers.

    Steps, in order: silence the ``torch.jit.script`` deprecation warning for
    the duration of setup, patch tldextract to stay offline, install the
    Presidio log-noise filter once per process, construct the engine, register
    configured custom recognizers, probe the phone recognizer, and cache the
    analyzer's supported entity names.
    """
    global _PRESIDIO_LOG_FILTER_INSTALLED  # noqa: PLW0603

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message=r"`torch\.jit\.script` is deprecated\..*",
            category=DeprecationWarning,
        )

        # Must run before presidio_analyzer is imported below.
        self._patch_tldextract_offline()

        if not _PRESIDIO_LOG_FILTER_INSTALLED:
            presidio_logger = logging.getLogger("presidio-analyzer")
            presidio_logger.addFilter(_PresidioNoiseFilter())
            _PRESIDIO_LOG_FILTER_INSTALLED = True

        presidio = require_module("presidio_analyzer", "pii", ["privacy", "detectors"])
        engine_cls = presidio.AnalyzerEngine

        spacy_engine = self._build_nlp_engine(presidio)
        if spacy_engine is None:
            self.analyzer = engine_cls()
        else:
            self.analyzer = engine_cls(nlp_engine=spacy_engine, supported_languages=["en"])

        self._register_custom_recognizers(presidio)
        self._probe_phone_recognizer()

        supported = frozenset(self.analyzer.get_supported_entities())
        self._supported_entities_cache = supported

        custom_count = len(getattr(self.config, "custom_recognizers", None) or [])
        logger.debug(
            "PII detector initialized — %d built-in entity types, %d custom recognizers",
            len(supported),
            custom_count,
        )
313
+
314
def _build_nlp_engine(self, presidio_module: Any) -> Any | None:
    """Return a SpacyNlpEngine for the configured model, or None to use the default.

    The model name comes from ``config.spacy_model`` (default ``en_core_web_sm``).
    When the model is not installed and ``config.spacy_model_url`` is set, the
    model is installed with pip and loading is retried once. A failed install
    is logged and treated like a missing model instead of propagating
    ``CalledProcessError`` out of detector construction.

    Args:
        presidio_module: The imported ``presidio_analyzer`` module (unused here
            beyond the signature; sub-modules are resolved via ``require_module``).

    Returns:
        A configured ``SpacyNlpEngine`` with the loaded pipeline pre-attached,
        or ``None`` when spaCy or the model is unavailable.
    """
    try:
        spacy = importlib.import_module("spacy")
    except ImportError:
        logger.warning("spaCy not available; using default Presidio NLP engine")
        return None

    cfg_model: str = getattr(self.config, "spacy_model", None) or "en_core_web_sm"
    cfg_model_url: str | None = getattr(self.config, "spacy_model_url", None)

    # Load once; only fall back to installing when the model is really missing.
    # (Previously an already-installed model was loaded twice when a URL was set.)
    nlp: Any = None
    try:
        nlp = spacy.load(cfg_model)
    except OSError:
        if cfg_model_url:
            logger.info(
                "spaCy model '%s' not found; installing from %s", cfg_model, cfg_model_url
            )
            # NOTE(review): the URL is taken from configuration and handed to pip
            # unvalidated — confirm the config source is trusted.
            try:
                subprocess.run(
                    [sys.executable, "-m", "pip", "install", cfg_model_url],
                    check=True,
                    capture_output=True,
                )
            except subprocess.CalledProcessError as exc:
                stderr_text = (exc.stderr or b"").decode("utf-8", errors="replace").strip()
                logger.warning(
                    "Installing spaCy model '%s' from %s failed (exit code %s): %s",
                    cfg_model,
                    cfg_model_url,
                    exc.returncode,
                    stderr_text,
                )
            else:
                importlib.invalidate_caches()
                try:
                    nlp = spacy.load(cfg_model)
                except OSError:
                    nlp = None

    if nlp is None:
        logger.warning("spaCy model '%s' not found; using default NLP engine", cfg_model)
        return None

    spacy_max_length: int | None = getattr(self._cfg, "max_length", None)
    if spacy_max_length is not None:
        nlp.max_length = spacy_max_length
        logger.debug("Set spaCy nlp.max_length = %d", spacy_max_length)

    nlp_engine_module = require_module(
        "presidio_analyzer.nlp_engine",
        "pii",
        ["privacy", "detectors"],
    )
    ner_config_module = require_module(
        "presidio_analyzer.nlp_engine.ner_model_configuration",
        "pii",
        ["privacy", "detectors"],
    )

    ner_config = nlp_engine_module.NerModelConfiguration(
        labels_to_ignore=[
            "CARDINAL",
            "ORDINAL",
            "QUANTITY",
            "FAC",
            "WORK_OF_ART",
            "PRODUCT",
            "EVENT",
            "LAW",
            "LANGUAGE",
            "PERCENT",
            "MONEY",
        ],
        model_to_presidio_entity_mapping=ner_config_module.MODEL_TO_PRESIDIO_ENTITY_MAPPING,
        low_score_entity_names=ner_config_module.LOW_SCORE_ENTITY_NAMES,
    )
    nlp_engine = nlp_engine_module.SpacyNlpEngine(
        models=[{"lang_code": "en", "model_name": cfg_model}],
        ner_model_configuration=ner_config,
    )
    # Attach the pipeline loaded above (with max_length applied) to the engine.
    nlp_engine.nlp = {"en": nlp}
    logger.debug("Loaded spaCy model '%s'", cfg_model)
    return nlp_engine
385
+
386
def _register_custom_recognizers(self, presidio_module: Any) -> None:
    """Add ad-hoc recognizers from config to the analyzer registry.

    Each configured entry becomes a Presidio ``PatternRecognizer``. Patterns
    and deny lists may be wrapped objects exposing ``.root``; the wrapped
    value is used when present. Empty pattern, deny-list and context values
    are passed as ``None``, and the language defaults to ``"en"``.

    Args:
        presidio_module: The imported ``presidio_analyzer`` module providing
            ``PatternRecognizer`` and ``Pattern``.
    """
    configured = getattr(self.config, "custom_recognizers", None) or []
    if not configured:
        return

    recognizer_cls = presidio_module.PatternRecognizer
    pattern_cls = presidio_module.Pattern

    for spec in configured:
        pattern_specs = getattr(spec.patterns, "root", spec.patterns) or []
        built_patterns = []
        for item in pattern_specs:
            built_patterns.append(pattern_cls(name=item.name, regex=item.regex, score=item.score))

        deny_source = getattr(spec.deny_list, "root", spec.deny_list)
        deny_terms = None
        if deny_source:
            deny_terms = list(deny_source)

        context_words = None
        if spec.context:
            context_words = list(spec.context)

        custom = recognizer_cls(
            supported_entity=spec.supported_entity,
            name=spec.name,
            supported_language=spec.supported_language or "en",
            patterns=built_patterns or None,
            deny_list=deny_terms,
            context=context_words,
        )
        self.analyzer.registry.add_recognizer(custom)
        logger.debug(
            "Registered custom recognizer '%s' → entity '%s'",
            spec.name,
            spec.supported_entity,
        )
416
+
417
def _probe_phone_recognizer(self) -> None:
    """Verify phonenumbers regional data loads correctly; remove PhoneRecognizer if broken.

    phonenumbers >=9 uses __import__ with level=1 for lazy region loading, which can fail
    in certain execution contexts (e.g. frozen environments, some uv/venv setups).
    Probing once at init avoids per-call ModuleNotFoundError spam.

    On ``ModuleNotFoundError`` every registry recognizer whose name contains
    "phone" is dropped and ``PHONE_NUMBER`` is removed from this instance's
    supported-entity sets. Any other probe failure leaves the recognizer in
    place and is logged at debug level rather than being silently discarded.
    """
    if self.analyzer is None:
        return
    try:
        import phonenumbers

        phonenumbers.parse("+12025551234", None)
    except ModuleNotFoundError as exc:
        logger.warning(
            "phonenumbers regional data unavailable (%s) — PHONE_NUMBER entity disabled", exc
        )
        self.analyzer.registry.recognizers = [
            r for r in self.analyzer.registry.recognizers if "phone" not in r.name.lower()
        ]
        # Rebinds on the instance only; the class-level set is left untouched.
        self._ALL_SUPPORTED_ENTITIES = self._ALL_SUPPORTED_ENTITIES - {"PHONE_NUMBER"}
        if self._supported_entities_cache is not None:
            self._supported_entities_cache = self._supported_entities_cache - frozenset(
                {"PHONE_NUMBER"}
            )
    except Exception as exc:
        # Best-effort probe: keep PHONE_NUMBER enabled, but leave a trace for debugging.
        logger.debug("phonenumbers probe failed unexpectedly; PHONE_NUMBER left enabled: %s", exc)
444
+
445
+ # ------------------------------------------------------------------
446
+ # Entity filtering
447
+ # ------------------------------------------------------------------
448
+
449
+ def _enabled_entities(self) -> set[str] | None:
450
+ """Return the set of enabled Presidio entity types, or None for all."""
451
+ configured = self._cfg.enabled_patterns
452
+ if not configured:
453
+ return None
454
+ normalized = {str(p).strip().upper() for p in configured if str(p).strip()}
455
+ return normalized or None
456
+
457
+ def _is_entity_enabled(self, entity_type: str) -> bool:
458
+ enabled = self._enabled_entities()
459
+ return True if enabled is None else entity_type.upper() in enabled
460
+
461
+ # ------------------------------------------------------------------
462
+ # Tabular column heuristics
463
+ # ------------------------------------------------------------------
464
+
465
+ def _normalize_column_name(self, column_name: str) -> str:
466
+ return re.sub(r"[^a-z0-9]+", " ", column_name.lower()).strip()
467
+
468
+ def _column_tokens(self, column_name: str) -> set[str]:
469
+ normalized = self._normalize_column_name(column_name)
470
+ return {t for t in normalized.split() if t}
471
+
472
+ def _is_free_text_column(self, column_name: str) -> bool:
473
+ return bool(self._column_tokens(column_name) & self._FREE_TEXT_COLUMN_TOKENS)
474
+
475
+ def _allowed_entities_for_column(self, column_name: str) -> set[str]:
476
+ """Return the Presidio entity types that are relevant for this column name."""
477
+ enabled = self._enabled_entities() or self._ALL_SUPPORTED_ENTITIES
478
+ tokens = self._column_tokens(column_name)
479
+
480
+ if not tokens:
481
+ return enabled - self._NON_TEXT_ENTITY_TYPES
482
+
483
+ if tokens & self._FREE_TEXT_COLUMN_TOKENS:
484
+ return enabled
485
+
486
+ allowed: set[str] = set()
487
+ for token in tokens:
488
+ allowed.update(self._COLUMN_ENTITY_HINTS.get(token, set()))
489
+
490
+ if tokens & self._NAME_COLUMN_TOKENS and "company" not in tokens:
491
+ allowed.add("PERSON")
492
+ if tokens & self._EMAIL_COLUMN_TOKENS:
493
+ allowed.add("EMAIL_ADDRESS")
494
+ if tokens & self._PHONE_COLUMN_TOKENS:
495
+ allowed.add("PHONE_NUMBER")
496
+ if tokens & self._ADDRESS_COLUMN_TOKENS:
497
+ allowed.add("LOCATION")
498
+
499
+ if allowed:
500
+ return allowed & enabled
501
+
502
+ if tokens & self._URL_COLUMN_TOKENS:
503
+ return {"IP_ADDRESS", "URL"} & enabled
504
+
505
+ if tokens & self._ID_COLUMN_TOKENS:
506
+ return set()
507
+
508
+ return enabled - self._NON_TEXT_ENTITY_TYPES
509
+
510
+ def _is_entity_allowed_for_column(self, entity_type: str, column_name: str) -> bool:
511
+ allowed = self._allowed_entities_for_column(column_name)
512
+ if not allowed:
513
+ return False
514
+ entity_upper = entity_type.upper()
515
+ if entity_upper not in self._ALL_SUPPORTED_ENTITIES:
516
+ # Custom entity: allow in free-text columns when no pattern filter is active.
517
+ return self._enabled_entities() is None and self._is_free_text_column(column_name)
518
+ return entity_upper in allowed
519
+
520
+ # ------------------------------------------------------------------
521
+ # Presidio analysis
522
+ # ------------------------------------------------------------------
523
+
524
+ def _analyze_content(self, content: str, *, entities: list[str] | None = None) -> list[Any]:
525
+ if self.analyzer is None:
526
+ if self._init_error is not None:
527
+ raise self._init_error
528
+ return []
529
+ try:
530
+ return self.analyzer.analyze(text=content, language="en", entities=entities)
531
+ except ModuleNotFoundError as exc:
532
+ if "phonenumbers" in str(exc):
533
+ logger.warning("phonenumbers data missing mid-run; disabling PHONE_NUMBER entity")
534
+ self._probe_phone_recognizer()
535
+ # Retry without the now-removed phone recognizer
536
+ retry_entities = (
537
+ [e for e in entities if e != "PHONE_NUMBER"] if entities is not None else None
538
+ )
539
+ try:
540
+ return self.analyzer.analyze(
541
+ text=content, language="en", entities=retry_entities
542
+ )
543
+ except Exception:
544
+ return []
545
+ logger.error("PII analysis failed: %s", exc)
546
+ logger.exception(exc)
547
+ return []
548
+ except Exception as e:
549
+ logger.error("PII analysis failed: %s", e)
550
+ logger.exception(e)
551
+ return []
552
+
553
+ def _analyzer_supported_entities(self) -> frozenset[str]:
554
+ if self._supported_entities_cache is not None:
555
+ return self._supported_entities_cache
556
+ if self.analyzer is None:
557
+ return frozenset()
558
+ self._supported_entities_cache = frozenset(self.analyzer.get_supported_entities())
559
+ return self._supported_entities_cache
560
+
561
+ def _analyze_structured_cell(
562
+ self, content: str, *, allowed_entity_types: set[str]
563
+ ) -> list[Any]:
564
+ if not allowed_entity_types or self.analyzer is None:
565
+ if self.analyzer is None and self._init_error is not None:
566
+ raise self._init_error
567
+ return []
568
+
569
+ entities = sorted(allowed_entity_types & self._analyzer_supported_entities())
570
+ if not entities:
571
+ return []
572
+
573
+ return self._analyze_content(content, entities=entities)
574
+
575
+ # ------------------------------------------------------------------
576
+ # Tabular content detection
577
+ # ------------------------------------------------------------------
578
+
579
def _extract_tabular_cells(self, content: str) -> list[_TabularCell]:
    """Parse ``row_N:`` / ``column: value`` formatted text into cells.

    The text is scanned line by line. A line matching ``_TABULAR_ROW_RE``
    starts a new row; a line matching ``_TABULAR_CELL_RE`` (once a row is
    open) starts a new cell; other non-empty lines extend the current cell's
    value; an empty line closes the current cell. Cells whose final value is
    empty are dropped.

    Returns an empty list when *content* does not contain ``"row_1:"`` at all.
    """
    # Cheap pre-check: content without a first row marker is not treated as tabular.
    if "row_1:" not in content:
        return []

    cells: list[_TabularCell] = []
    current_row_index: int | None = None
    current_column_name: str | None = None
    current_value_lines: list[str] = []

    def flush_cell() -> None:
        # Emit the cell being accumulated (if any) and reset the per-cell state.
        nonlocal current_column_name, current_value_lines
        if current_row_index is None or current_column_name is None:
            current_column_name = None
            current_value_lines = []
            return
        cells.append(
            _TabularCell(
                row_index=current_row_index,
                column_name=current_column_name,
                value="\n".join(current_value_lines).strip(),
            )
        )
        current_column_name = None
        current_value_lines = []

    for line in content.splitlines():
        # Row header: close any open cell and switch to the new row number.
        row_match = self._TABULAR_ROW_RE.match(line)
        if row_match:
            flush_cell()
            current_row_index = int(row_match.group(1))
            continue

        # Cell header: only honoured once a row has been opened.
        cell_match = self._TABULAR_CELL_RE.match(line)
        if cell_match and current_row_index is not None:
            flush_cell()
            current_column_name = cell_match.group(1).strip()
            current_value_lines = [cell_match.group(2) or ""]
            continue

        # Continuation line of a multi-line value (prefix is stripped by the regex group).
        continuation_match = self._TABULAR_CONTINUATION_RE.match(line)
        if continuation_match and current_column_name is not None:
            current_value_lines.append(continuation_match.group(1))
            continue

        # Any other non-empty line while a cell is open is kept verbatim.
        if current_column_name is not None and line and current_row_index is not None:
            current_value_lines.append(line)
            continue

        # A blank line terminates the current cell.
        if not line:
            flush_cell()

    flush_cell()
    return [cell for cell in cells if cell.value]
632
+
633
+ def _should_keep_tabular_result(
634
+ self, *, cell: _TabularCell, entity_type: str, matched_content: str
635
+ ) -> bool:
636
+ if entity_type != "PERSON":
637
+ return True
638
+ if not self._is_free_text_column(cell.column_name):
639
+ return True
640
+ token_count = len(re.findall(r"[A-Za-z][A-Za-z'-]*", matched_content))
641
+ return token_count >= 2
642
+
643
+ def _dedupe_tabular_findings(self, findings: list[DetectionResult]) -> list[DetectionResult]:
644
+ deduped: dict[tuple[str, str, int | None, str | None], DetectionResult] = {}
645
+ ordered_keys: list[tuple[str, str, int | None, str | None]] = []
646
+
647
+ for finding in findings:
648
+ metadata = finding.metadata or {}
649
+ key = (
650
+ finding.finding_type,
651
+ finding.matched_content.strip(),
652
+ metadata.get("tabular_row_index"),
653
+ metadata.get("tabular_column_name"),
654
+ )
655
+ existing = deduped.get(key)
656
+ if existing is None:
657
+ deduped[key] = finding
658
+ ordered_keys.append(key)
659
+ elif finding.confidence > existing.confidence:
660
+ deduped[key] = finding
661
+
662
+ return [deduped[k] for k in ordered_keys]
663
+
664
+ def _detect_tabular_content(self, content: str) -> list[DetectionResult] | None:
665
+ cells = self._extract_tabular_cells(content)
666
+ if not cells:
667
+ return None
668
+
669
+ # For very wide/long pages fall back to full-text analysis to avoid O(N) Presidio calls.
670
+ if len(cells) > self._TABULAR_CELL_LIMIT:
671
+ logger.debug(
672
+ "Page has %d cells (> %d limit); using full-text analysis instead of per-cell",
673
+ len(cells),
674
+ self._TABULAR_CELL_LIMIT,
675
+ )
676
+ return None
677
+
678
+ threshold = self._cfg.confidence_threshold or 0.7
679
+ results: list[DetectionResult] = []
680
+
681
+ for cell in cells:
682
+ allowed = self._allowed_entities_for_column(cell.column_name)
683
+ if not allowed:
684
+ continue
685
+
686
+ for result in self._analyze_structured_cell(cell.value, allowed_entity_types=allowed):
687
+ if not self._is_entity_enabled(result.entity_type):
688
+ continue
689
+ if not self._is_entity_allowed_for_column(result.entity_type, cell.column_name):
690
+ continue
691
+
692
+ matched_content = cell.value[result.start : result.end]
693
+ if not self._should_keep_tabular_result(
694
+ cell=cell,
695
+ entity_type=result.entity_type,
696
+ matched_content=matched_content,
697
+ ):
698
+ continue
699
+
700
+ detection = self._build_detection_result(
701
+ matched_content=matched_content,
702
+ entity_type=result.entity_type,
703
+ confidence=result.score,
704
+ recognition_metadata=result.recognition_metadata,
705
+ line_number=cell.row_index,
706
+ start=result.start,
707
+ end=result.end,
708
+ metadata={
709
+ "tabular_row_index": cell.row_index,
710
+ "tabular_column_name": cell.column_name,
711
+ },
712
+ )
713
+ if detection.confidence >= threshold:
714
+ results.append(detection)
715
+
716
+ return self._dedupe_tabular_findings(results)
717
+
718
+ # ------------------------------------------------------------------
719
+ # Result construction
720
+ # ------------------------------------------------------------------
721
+
722
def _build_detection_result(
    self,
    *,
    matched_content: str,
    entity_type: str,
    confidence: float,
    recognition_metadata: dict[str, Any] | None,
    line_number: int,
    start: int,
    end: int,
    metadata: dict[str, Any] | None = None,
) -> DetectionResult:
    """Assemble a ``DetectionResult`` for one analyzer hit.

    The result's metadata always contains ``recognizer`` (taken from
    ``recognition_metadata["recognizer_name"]``, or ``"unknown"``) and
    ``entity_type``; entries in *metadata* are merged on top and may override
    them. Severity is derived from *entity_type*, and the location records
    the line number and character range as text.
    """
    recognizer_name = "unknown"
    if recognition_metadata:
        recognizer_name = recognition_metadata.get("recognizer_name", "unknown")

    combined: dict[str, Any] = {"recognizer": recognizer_name, "entity_type": entity_type}
    if metadata:
        combined.update(metadata)

    where = Location(
        path=f"line {line_number}",
        description=f"character range {start}-{end}",
    )
    return DetectionResult(
        detector_type=DetectorType.PII,
        finding_type=entity_type,
        category="PII",
        severity=self._get_severity_for_entity(entity_type),
        confidence=confidence,
        matched_content=matched_content,
        location=where,
        metadata=combined,
    )
756
+
757
def _get_severity_for_entity(self, entity_type: str) -> Severity:
    """Map an entity type (case-insensitive) to a severity tier.

    Tiers are checked from most to least severe. Entity types not listed in
    any tier — including custom entities — are treated as high severity.
    """
    label = entity_type.upper()

    tiers = (
        # Critical — government IDs, financial account numbers, biometric IDs
        (
            Severity.critical,
            {
                "CREDIT_CARD",
                "CRYPTO",
                "IBAN_CODE",
                "US_SSN",
                "US_PASSPORT",
                "US_DRIVER_LICENSE",
                "US_BANK_NUMBER",
                "US_ITIN",
                "UK_NHS",
                "AT_SVNR",
                "CH_AHV",
                "DE_TAX_ID",
                "EU_NATIONAL_ID",
                "ES_NIF",
                "ES_NIE",
                "IT_FISCAL_CODE",
                "IT_PASSPORT",
                "IT_DRIVER_LICENSE",
                "IT_IDENTITY_CARD",
                "SG_NRIC_FIN",
                "AU_TFN",
                "AU_MEDICARE",
                "IN_PAN",
                "IN_AADHAAR",
                "FI_PERSONAL_IDENTITY_CODE",
                "PL_PESEL",
            },
        ),
        # High — contact identifiers, business numbers, less-direct personal IDs
        (
            Severity.high,
            {
                "EMAIL_ADDRESS",
                "PHONE_NUMBER",
                "IP_ADDRESS",
                "MEDICAL_LICENSE",
                "AU_ABN",
                "AU_ACN",
                "SG_UEN",
                "IT_VAR_CODE",
                "IN_VOTER",
                "IN_VEHICLE_REGISTRATION",
            },
        ),
        # Medium — contextual personal information
        (Severity.medium, {"PERSON", "LOCATION", "DATE_TIME", "NRP", "URL"}),
    )

    for severity, members in tiers:
        if label in members:
            return severity

    # Unknown entity types default to high.
    return Severity.high
811
+
812
+ # ------------------------------------------------------------------
813
+ # Public API
814
+ # ------------------------------------------------------------------
815
+
816
+ async def detect(
817
+ self, content: str | bytes, content_type: str = "text/plain"
818
+ ) -> list[DetectionResult]:
819
+ """Detect PII in *content* and return a list of :class:`DetectionResult` objects."""
820
+ if isinstance(content, bytes):
821
+ return []
822
+ # Presidio + spaCy NER are CPU-bound synchronous operations. Running them
823
+ # directly in the async coroutine blocks the event loop for the duration of
824
+ # each page (seconds on CPU-limited pods), making the job appear frozen.
825
+ # Offloading to a thread keeps the loop alive and allows I/O to proceed.
826
+ return await asyncio.to_thread(self._detect_sync, content)
827
+
828
+ def _chunk_text(self, text: str) -> list[tuple[str, int]]:
829
+ """Return (chunk, offset) pairs. When chunk_size is null returns the full text at offset 0."""
830
+ chunk_size: int | None = getattr(self._cfg, "chunk_size", None)
831
+ if not chunk_size:
832
+ return [(text, 0)]
833
+ overlap: int = getattr(self._cfg, "chunk_overlap", None) or 0
834
+ step = max(1, chunk_size - overlap)
835
+ return [(text[i : i + chunk_size], i) for i in range(0, len(text), step)]
836
+
837
+ def _detect_sync(self, content: str) -> list[DetectionResult]:
838
+ tabular_results = self._detect_tabular_content(content)
839
+ if tabular_results is not None:
840
+ if self._cfg.max_findings and len(tabular_results) > self._cfg.max_findings:
841
+ return tabular_results[: self._cfg.max_findings]
842
+ return tabular_results
843
+
844
+ enabled = self._enabled_entities()
845
+ entities = sorted(enabled) if enabled else None
846
+ threshold = self._cfg.confidence_threshold or 0.7
847
+ results: list[DetectionResult] = []
848
+ seen: set[tuple[str, int, int]] = set()
849
+
850
+ for chunk, offset in self._chunk_text(content):
851
+ for result in self._analyze_content(chunk, entities=entities):
852
+ if not self._is_entity_enabled(result.entity_type):
853
+ continue
854
+
855
+ abs_start = result.start + offset
856
+ abs_end = result.end + offset
857
+ dedup_key = (result.entity_type, abs_start, abs_end)
858
+ if dedup_key in seen:
859
+ continue
860
+ seen.add(dedup_key)
861
+
862
+ line_number = content[:abs_start].count("\n") + 1
863
+ matched_content = content[abs_start:abs_end]
864
+
865
+ detection = self._build_detection_result(
866
+ matched_content=matched_content,
867
+ entity_type=result.entity_type,
868
+ confidence=result.score,
869
+ recognition_metadata=result.recognition_metadata,
870
+ line_number=line_number,
871
+ start=abs_start,
872
+ end=abs_end,
873
+ )
874
+ if detection.confidence >= threshold:
875
+ results.append(detection)
876
+
877
+ if self._cfg.max_findings and len(results) > self._cfg.max_findings:
878
+ results = results[: self._cfg.max_findings]
879
+
880
+ return results
881
+
882
+ def get_supported_content_types(self) -> list[str]:
883
+ return ["text/plain", "text/html", "application/json", "text/xml"]