classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,883 @@
|
|
|
1
|
+
"""PII detector powered by Microsoft Presidio."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import importlib
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import warnings
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Any, ClassVar
|
|
12
|
+
|
|
13
|
+
from ...models.generated_detectors import DetectorConfig, PIIDetectorConfig, Severity
|
|
14
|
+
from ...models.generated_single_asset_scan_results import DetectionResult, DetectorType, Location
|
|
15
|
+
from ..base import BaseDetector
|
|
16
|
+
from ..dependencies import MissingDependencyError, require_module
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
_PRESIDIO_LOG_FILTER_INSTALLED = False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class _PresidioNoiseFilter(logging.Filter):
|
|
24
|
+
"""Suppresses noisy but harmless Presidio initialization warnings."""
|
|
25
|
+
|
|
26
|
+
_SUPPRESSED = (
|
|
27
|
+
"Recognizer not added to registry because language is not supported by registry",
|
|
28
|
+
"model_to_presidio_entity_mapping is missing from configuration",
|
|
29
|
+
"low_score_entity_names is missing from configuration",
|
|
30
|
+
"labels_to_ignore is missing from configuration",
|
|
31
|
+
"Fetching all recognizers for language",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
def filter(self, record: logging.LogRecord) -> bool:
|
|
35
|
+
msg = record.getMessage()
|
|
36
|
+
if any(s in msg for s in self._SUPPRESSED):
|
|
37
|
+
return False
|
|
38
|
+
if "Entity " in msg and (
|
|
39
|
+
"is not mapped to a Presidio entity" in msg
|
|
40
|
+
or "doesn't have the corresponding recognizer in language" in msg
|
|
41
|
+
):
|
|
42
|
+
return False
|
|
43
|
+
return True
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
|
|
47
|
+
class _TabularCell:
|
|
48
|
+
row_index: int
|
|
49
|
+
column_name: str
|
|
50
|
+
value: str
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class PIIDetector(BaseDetector):
|
|
54
|
+
"""
|
|
55
|
+
PII detector powered by Microsoft Presidio.
|
|
56
|
+
|
|
57
|
+
Detects personally identifiable information across global and regional entity types,
|
|
58
|
+
covering all built-in Presidio recognizers plus optional ad-hoc custom recognizers
|
|
59
|
+
defined in configuration.
|
|
60
|
+
|
|
61
|
+
Supported regions: Global, USA, UK, Spain, Italy, Singapore, Australia, India,
|
|
62
|
+
Finland, Poland, DACH (Germany / Austria / Switzerland).
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
detector_type = "pii"
|
|
66
|
+
detector_name = "pii"
|
|
67
|
+
|
|
68
|
+
# All entity types supported by built-in Presidio recognizers.
|
|
69
|
+
_ALL_SUPPORTED_ENTITIES: ClassVar[set[str]] = {
|
|
70
|
+
# Global
|
|
71
|
+
"CREDIT_CARD",
|
|
72
|
+
"CRYPTO",
|
|
73
|
+
"DATE_TIME",
|
|
74
|
+
"EMAIL_ADDRESS",
|
|
75
|
+
"IBAN_CODE",
|
|
76
|
+
"IP_ADDRESS",
|
|
77
|
+
"NRP",
|
|
78
|
+
"LOCATION",
|
|
79
|
+
"PERSON",
|
|
80
|
+
"PHONE_NUMBER",
|
|
81
|
+
"MEDICAL_LICENSE",
|
|
82
|
+
"URL",
|
|
83
|
+
# USA
|
|
84
|
+
"US_BANK_NUMBER",
|
|
85
|
+
"US_DRIVER_LICENSE",
|
|
86
|
+
"US_ITIN",
|
|
87
|
+
"US_PASSPORT",
|
|
88
|
+
"US_SSN",
|
|
89
|
+
# UK
|
|
90
|
+
"UK_NHS",
|
|
91
|
+
# Spain
|
|
92
|
+
"ES_NIF",
|
|
93
|
+
"ES_NIE",
|
|
94
|
+
# Italy
|
|
95
|
+
"IT_FISCAL_CODE",
|
|
96
|
+
"IT_DRIVER_LICENSE",
|
|
97
|
+
"IT_VAR_CODE",
|
|
98
|
+
"IT_PASSPORT",
|
|
99
|
+
"IT_IDENTITY_CARD",
|
|
100
|
+
# Singapore
|
|
101
|
+
"SG_NRIC_FIN",
|
|
102
|
+
"SG_UEN",
|
|
103
|
+
# Australia
|
|
104
|
+
"AU_ABN",
|
|
105
|
+
"AU_ACN",
|
|
106
|
+
"AU_TFN",
|
|
107
|
+
"AU_MEDICARE",
|
|
108
|
+
# India
|
|
109
|
+
"IN_PAN",
|
|
110
|
+
"IN_AADHAAR",
|
|
111
|
+
"IN_VEHICLE_REGISTRATION",
|
|
112
|
+
"IN_VOTER",
|
|
113
|
+
# Finland
|
|
114
|
+
"FI_PERSONAL_IDENTITY_CODE",
|
|
115
|
+
# Poland
|
|
116
|
+
"PL_PESEL",
|
|
117
|
+
# DACH
|
|
118
|
+
"AT_SVNR",
|
|
119
|
+
"CH_AHV",
|
|
120
|
+
"DE_TAX_ID",
|
|
121
|
+
"EU_NATIONAL_ID",
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
# Entity types that carry low signal in structured column values unless the column
|
|
125
|
+
# explicitly indicates free text (e.g. description, notes, body).
|
|
126
|
+
_NON_TEXT_ENTITY_TYPES: ClassVar[set[str]] = {"PERSON", "LOCATION", "DATE_TIME", "NRP"}
|
|
127
|
+
|
|
128
|
+
# Maps individual column-name tokens to the entity types relevant for that column.
|
|
129
|
+
_COLUMN_ENTITY_HINTS: ClassVar[dict[str, set[str]]] = {
|
|
130
|
+
"email": {"EMAIL_ADDRESS"},
|
|
131
|
+
"mail": {"EMAIL_ADDRESS"},
|
|
132
|
+
"phone": {"PHONE_NUMBER"},
|
|
133
|
+
"mobile": {"PHONE_NUMBER"},
|
|
134
|
+
"tel": {"PHONE_NUMBER"},
|
|
135
|
+
"telephone": {"PHONE_NUMBER"},
|
|
136
|
+
"fax": {"PHONE_NUMBER"},
|
|
137
|
+
"name": {"PERSON"},
|
|
138
|
+
"person": {"PERSON"},
|
|
139
|
+
"address": {"LOCATION"},
|
|
140
|
+
"location": {"LOCATION"},
|
|
141
|
+
"city": {"LOCATION"},
|
|
142
|
+
"state": {"LOCATION"},
|
|
143
|
+
"country": {"LOCATION"},
|
|
144
|
+
"postal": {"LOCATION"},
|
|
145
|
+
"postcode": {"LOCATION"},
|
|
146
|
+
"zipcode": {"LOCATION"},
|
|
147
|
+
"zip": {"LOCATION"},
|
|
148
|
+
"ip": {"IP_ADDRESS"},
|
|
149
|
+
"ssn": {"US_SSN"},
|
|
150
|
+
"passport": {"US_PASSPORT"},
|
|
151
|
+
"driver": {"US_DRIVER_LICENSE"},
|
|
152
|
+
"license": {"US_DRIVER_LICENSE"},
|
|
153
|
+
"iban": {"IBAN_CODE"},
|
|
154
|
+
"svnr": {"AT_SVNR"},
|
|
155
|
+
"ahv": {"CH_AHV"},
|
|
156
|
+
"tax": {"DE_TAX_ID"},
|
|
157
|
+
"national": {"EU_NATIONAL_ID"},
|
|
158
|
+
"card": {"CREDIT_CARD"},
|
|
159
|
+
"credit": {"CREDIT_CARD"},
|
|
160
|
+
"crypto": {"CRYPTO"},
|
|
161
|
+
"wallet": {"CRYPTO"},
|
|
162
|
+
"url": {"URL"},
|
|
163
|
+
"uri": {"URL"},
|
|
164
|
+
"website": {"URL"},
|
|
165
|
+
"nhs": {"UK_NHS"},
|
|
166
|
+
"medicare": {"AU_MEDICARE"},
|
|
167
|
+
"tfn": {"AU_TFN"},
|
|
168
|
+
"abn": {"AU_ABN"},
|
|
169
|
+
"acn": {"AU_ACN"},
|
|
170
|
+
"pan": {"IN_PAN"},
|
|
171
|
+
"aadhaar": {"IN_AADHAAR"},
|
|
172
|
+
"nric": {"SG_NRIC_FIN"},
|
|
173
|
+
"fin": {"SG_NRIC_FIN"},
|
|
174
|
+
"uen": {"SG_UEN"},
|
|
175
|
+
"pesel": {"PL_PESEL"},
|
|
176
|
+
"nif": {"ES_NIF"},
|
|
177
|
+
"nie": {"ES_NIE"},
|
|
178
|
+
"medical": {"MEDICAL_LICENSE"},
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
_FREE_TEXT_COLUMN_TOKENS: ClassVar[set[str]] = {
|
|
182
|
+
"text",
|
|
183
|
+
"body",
|
|
184
|
+
"content",
|
|
185
|
+
"description",
|
|
186
|
+
"message",
|
|
187
|
+
"comment",
|
|
188
|
+
"comments",
|
|
189
|
+
"note",
|
|
190
|
+
"notes",
|
|
191
|
+
"summary",
|
|
192
|
+
"details",
|
|
193
|
+
"bio",
|
|
194
|
+
}
|
|
195
|
+
_NAME_COLUMN_TOKENS: ClassVar[set[str]] = {
|
|
196
|
+
"name",
|
|
197
|
+
"first",
|
|
198
|
+
"last",
|
|
199
|
+
"middle",
|
|
200
|
+
"full",
|
|
201
|
+
"person",
|
|
202
|
+
"contact",
|
|
203
|
+
}
|
|
204
|
+
_EMAIL_COLUMN_TOKENS: ClassVar[set[str]] = {"email", "mail"}
|
|
205
|
+
_PHONE_COLUMN_TOKENS: ClassVar[set[str]] = {"phone", "mobile", "tel", "telephone", "fax"}
|
|
206
|
+
_ADDRESS_COLUMN_TOKENS: ClassVar[set[str]] = {
|
|
207
|
+
"address",
|
|
208
|
+
"street",
|
|
209
|
+
"city",
|
|
210
|
+
"state",
|
|
211
|
+
"country",
|
|
212
|
+
"postal",
|
|
213
|
+
"postcode",
|
|
214
|
+
"zipcode",
|
|
215
|
+
"zip",
|
|
216
|
+
"location",
|
|
217
|
+
}
|
|
218
|
+
_URL_COLUMN_TOKENS: ClassVar[set[str]] = {"url", "uri", "website", "web", "link", "domain"}
|
|
219
|
+
_ID_COLUMN_TOKENS: ClassVar[set[str]] = {"id", "uuid", "guid", "key", "source", "row"}
|
|
220
|
+
|
|
221
|
+
_TABULAR_ROW_RE: ClassVar[re.Pattern[str]] = re.compile(r"^row_(\d+):$")
|
|
222
|
+
_TABULAR_CELL_RE: ClassVar[re.Pattern[str]] = re.compile(r"^ ([^:]+):(?: ?(.*))?$")
|
|
223
|
+
_TABULAR_CONTINUATION_RE: ClassVar[re.Pattern[str]] = re.compile(r"^ (.*)$")
|
|
224
|
+
|
|
225
|
+
# Fall back to full-text analysis when a page has more than this many cells.
|
|
226
|
+
# Per-cell analysis at scale causes O(rows x columns) Presidio calls per page.
|
|
227
|
+
_TABULAR_CELL_LIMIT: ClassVar[int] = 200
|
|
228
|
+
|
|
229
|
+
def __init__(self, config: DetectorConfig | None = None) -> None:
    """Create the detector and eagerly try to build the Presidio analyzer.

    Initialization failures caused by missing dependencies or missing data files
    are not raised here: they are stored in ``self._init_error`` and a warning is
    logged, leaving ``self.analyzer`` as ``None``.

    Args:
        config: Detector configuration. Anything that is not a
            ``PIIDetectorConfig`` is replaced by a default ``PIIDetectorConfig()``
            for ``self._cfg`` (the base class still receives the original value).
    """
    super().__init__(config)

    if isinstance(config, PIIDetectorConfig):
        self._cfg: PIIDetectorConfig = config
    else:
        self._cfg = PIIDetectorConfig()

    self._init_error: MissingDependencyError | None = None
    self.analyzer: Any = None
    self._supported_entities_cache: frozenset[str] | None = None

    try:
        self._initialize_analyzer()
    except MissingDependencyError as missing:
        self._init_error = missing
        logger.warning("Presidio unavailable — PII detector will raise on first use: %s", missing)
    except OSError as io_error:
        # FileNotFoundError is a subclass of OSError, so one clause covers both.
        self._init_error = MissingDependencyError(
            "pii",
            ["privacy", "detectors"],
            f"Presidio installation is incomplete (missing data files): {io_error}",
        )
        logger.warning(
            "Presidio data files missing — PII detector will raise on first use: %s", io_error
        )
|
|
251
|
+
|
|
252
|
+
# ------------------------------------------------------------------
|
|
253
|
+
# Initialization
|
|
254
|
+
# ------------------------------------------------------------------
|
|
255
|
+
|
|
256
|
+
@staticmethod
def _patch_tldextract_offline() -> None:
    """Swap ``tldextract.extract`` for an instance that never fetches the suffix list.

    tldextract ignores the TLDEXTRACT_CACHE env var; without explicit config it
    downloads the public suffix list on first use, hanging pods with no egress.
    The module-level ``extract`` is therefore replaced with an offline instance
    (bundled PSL) before Presidio's UrlRecognizer is loaded.

    Best effort: any failure (including tldextract not being installed) is only
    logged at debug level.
    """
    try:
        import tldextract as _tl  # type: ignore[import-not-found, import-untyped]

        bundled_only = _tl.TLDExtract(suffix_list_urls=(), fallback_to_snapshot=True)
        # Call it once so the suffix list is loaded from the bundled snapshot now.
        bundled_only("example.com")
        _tl.extract = bundled_only
    except Exception as exc:
        logger.debug("tldextract offline patch skipped: %s", exc)
|
|
273
|
+
|
|
274
|
+
def _initialize_analyzer(self) -> None:
    """Build the Presidio AnalyzerEngine with NLP engine and custom recognizers.

    Steps: patch tldextract for offline use, install the Presidio log-noise filter
    once per process, import ``presidio_analyzer``, construct the engine (with the
    configured spaCy engine when one could be built), register custom recognizers,
    probe the phone recognizer, and cache the supported entity names.

    Raises:
        Whatever ``require_module`` raises when ``presidio_analyzer`` is missing.
    """
    global _PRESIDIO_LOG_FILTER_INSTALLED  # noqa: PLW0603

    # NOTE(review): the extracted source lost its indentation; the whole setup is
    # assumed to run inside this warnings context — confirm against the original.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message=r"`torch\.jit\.script` is deprecated\..*",
            category=DeprecationWarning,
        )

        self._patch_tldextract_offline()

        if not _PRESIDIO_LOG_FILTER_INSTALLED:
            presidio_logger = logging.getLogger("presidio-analyzer")
            presidio_logger.addFilter(_PresidioNoiseFilter())
            _PRESIDIO_LOG_FILTER_INSTALLED = True

        presidio_module = require_module("presidio_analyzer", "pii", ["privacy", "detectors"])
        engine_cls = presidio_module.AnalyzerEngine

        nlp_engine = self._build_nlp_engine(presidio_module)
        if nlp_engine is None:
            # No usable spaCy engine: let Presidio build its default one.
            self.analyzer = engine_cls()
        else:
            self.analyzer = engine_cls(nlp_engine=nlp_engine, supported_languages=["en"])

        self._register_custom_recognizers(presidio_module)
        self._probe_phone_recognizer()

        supported = frozenset(self.analyzer.get_supported_entities())
        self._supported_entities_cache = supported
        custom_count = len(getattr(self.config, "custom_recognizers", None) or [])
        logger.debug(
            "PII detector initialized — %d built-in entity types, %d custom recognizers",
            len(supported),
            custom_count,
        )
|
|
313
|
+
|
|
314
|
+
def _build_nlp_engine(self, presidio_module: Any) -> Any | None:
    """Return a SpacyNlpEngine for the configured model, or None to use the default.

    The model name comes from ``config.spacy_model`` (default ``en_core_web_sm``).
    If the model cannot be loaded and ``config.spacy_model_url`` is set, the model
    is pip-installed from that URL and loaded again. Any failure along the way
    (spaCy missing, install failing, model still not loadable) results in a
    warning and ``None`` so the caller can fall back to Presidio's default engine.

    Args:
        presidio_module: The imported ``presidio_analyzer`` module (currently unused
            here; the NLP-engine submodules are imported via ``require_module``).

    Returns:
        A configured ``SpacyNlpEngine`` with the loaded model pre-attached, or
        ``None`` when no spaCy model is available.
    """
    try:
        spacy = importlib.import_module("spacy")
    except ImportError:
        logger.warning("spaCy not available; using default Presidio NLP engine")
        return None

    cfg_model: str = getattr(self.config, "spacy_model", None) or "en_core_web_sm"
    cfg_model_url: str | None = getattr(self.config, "spacy_model_url", None)

    # Load once and keep the result; the pipeline is reused below instead of being
    # loaded a second time.
    nlp = None
    try:
        nlp = spacy.load(cfg_model)
    except OSError:
        if cfg_model_url:
            logger.info(
                "spaCy model '%s' not found; installing from %s", cfg_model, cfg_model_url
            )
            # NOTE(review): this pip-installs a URL taken from configuration, which
            # executes third-party code at install time — the recipe must be trusted.
            try:
                subprocess.run(
                    [sys.executable, "-m", "pip", "install", cfg_model_url],
                    check=True,
                    capture_output=True,
                )
            except (subprocess.CalledProcessError, OSError) as exc:
                # A failed install must not crash detector construction; fall
                # through to the "model not found" path below.
                raw_stderr = getattr(exc, "stderr", None)
                detail = (
                    raw_stderr.decode(errors="replace").strip()
                    if isinstance(raw_stderr, bytes)
                    else (raw_stderr or "")
                )
                logger.warning(
                    "Installing spaCy model '%s' from %s failed: %s %s",
                    cfg_model,
                    cfg_model_url,
                    exc,
                    detail,
                )
            else:
                importlib.invalidate_caches()
                try:
                    nlp = spacy.load(cfg_model)
                except OSError:
                    nlp = None

    if nlp is None:
        logger.warning("spaCy model '%s' not found; using default NLP engine", cfg_model)
        return None

    spacy_max_length: int | None = getattr(self._cfg, "max_length", None)
    if spacy_max_length is not None:
        nlp.max_length = spacy_max_length
        logger.debug("Set spaCy nlp.max_length = %d", spacy_max_length)

    nlp_engine_module = require_module(
        "presidio_analyzer.nlp_engine",
        "pii",
        ["privacy", "detectors"],
    )
    ner_config_module = require_module(
        "presidio_analyzer.nlp_engine.ner_model_configuration",
        "pii",
        ["privacy", "detectors"],
    )

    ner_config = nlp_engine_module.NerModelConfiguration(
        labels_to_ignore=[
            "CARDINAL",
            "ORDINAL",
            "QUANTITY",
            "FAC",
            "WORK_OF_ART",
            "PRODUCT",
            "EVENT",
            "LAW",
            "LANGUAGE",
            "PERCENT",
            "MONEY",
        ],
        model_to_presidio_entity_mapping=ner_config_module.MODEL_TO_PRESIDIO_ENTITY_MAPPING,
        low_score_entity_names=ner_config_module.LOW_SCORE_ENTITY_NAMES,
    )
    nlp_engine = nlp_engine_module.SpacyNlpEngine(
        models=[{"lang_code": "en", "model_name": cfg_model}],
        ner_model_configuration=ner_config,
    )
    # Hand the already-loaded pipeline to the engine.
    nlp_engine.nlp = {"en": nlp}
    logger.debug("Loaded spaCy model '%s'", cfg_model)
    return nlp_engine
|
|
385
|
+
|
|
386
|
+
def _register_custom_recognizers(self, presidio_module: Any) -> None:
|
|
387
|
+
"""Add ad-hoc recognizers from config to the analyzer registry."""
|
|
388
|
+
custom_recognizers = getattr(self.config, "custom_recognizers", None) or []
|
|
389
|
+
if not custom_recognizers:
|
|
390
|
+
return
|
|
391
|
+
|
|
392
|
+
PatternRecognizer = presidio_module.PatternRecognizer # noqa: N806
|
|
393
|
+
Pattern = presidio_module.Pattern # noqa: N806
|
|
394
|
+
|
|
395
|
+
for rec in custom_recognizers:
|
|
396
|
+
raw_patterns = getattr(rec.patterns, "root", rec.patterns) or []
|
|
397
|
+
patterns = [Pattern(name=p.name, regex=p.regex, score=p.score) for p in raw_patterns]
|
|
398
|
+
raw_deny_list = getattr(rec.deny_list, "root", rec.deny_list)
|
|
399
|
+
deny_list = list(raw_deny_list) if raw_deny_list else None
|
|
400
|
+
context = list(rec.context) if rec.context else None
|
|
401
|
+
|
|
402
|
+
recognizer = PatternRecognizer(
|
|
403
|
+
supported_entity=rec.supported_entity,
|
|
404
|
+
name=rec.name,
|
|
405
|
+
supported_language=rec.supported_language or "en",
|
|
406
|
+
patterns=patterns or None,
|
|
407
|
+
deny_list=deny_list,
|
|
408
|
+
context=context,
|
|
409
|
+
)
|
|
410
|
+
self.analyzer.registry.add_recognizer(recognizer)
|
|
411
|
+
logger.debug(
|
|
412
|
+
"Registered custom recognizer '%s' → entity '%s'",
|
|
413
|
+
rec.name,
|
|
414
|
+
rec.supported_entity,
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
def _probe_phone_recognizer(self) -> None:
|
|
418
|
+
"""Verify phonenumbers regional data loads correctly; remove PhoneRecognizer if broken.
|
|
419
|
+
|
|
420
|
+
phonenumbers >=9 uses __import__ with level=1 for lazy region loading, which can fail
|
|
421
|
+
in certain execution contexts (e.g. frozen environments, some uv/venv setups).
|
|
422
|
+
Probing once at init avoids per-call ModuleNotFoundError spam.
|
|
423
|
+
"""
|
|
424
|
+
if self.analyzer is None:
|
|
425
|
+
return
|
|
426
|
+
try:
|
|
427
|
+
import phonenumbers
|
|
428
|
+
|
|
429
|
+
phonenumbers.parse("+12025551234", None)
|
|
430
|
+
except ModuleNotFoundError as exc:
|
|
431
|
+
logger.warning(
|
|
432
|
+
"phonenumbers regional data unavailable (%s) — PHONE_NUMBER entity disabled", exc
|
|
433
|
+
)
|
|
434
|
+
self.analyzer.registry.recognizers = [
|
|
435
|
+
r for r in self.analyzer.registry.recognizers if "phone" not in r.name.lower()
|
|
436
|
+
]
|
|
437
|
+
self._ALL_SUPPORTED_ENTITIES = self._ALL_SUPPORTED_ENTITIES - {"PHONE_NUMBER"}
|
|
438
|
+
if self._supported_entities_cache is not None:
|
|
439
|
+
self._supported_entities_cache = self._supported_entities_cache - frozenset(
|
|
440
|
+
{"PHONE_NUMBER"}
|
|
441
|
+
)
|
|
442
|
+
except Exception:
|
|
443
|
+
pass
|
|
444
|
+
|
|
445
|
+
# ------------------------------------------------------------------
|
|
446
|
+
# Entity filtering
|
|
447
|
+
# ------------------------------------------------------------------
|
|
448
|
+
|
|
449
|
+
def _enabled_entities(self) -> set[str] | None:
|
|
450
|
+
"""Return the set of enabled Presidio entity types, or None for all."""
|
|
451
|
+
configured = self._cfg.enabled_patterns
|
|
452
|
+
if not configured:
|
|
453
|
+
return None
|
|
454
|
+
normalized = {str(p).strip().upper() for p in configured if str(p).strip()}
|
|
455
|
+
return normalized or None
|
|
456
|
+
|
|
457
|
+
def _is_entity_enabled(self, entity_type: str) -> bool:
|
|
458
|
+
enabled = self._enabled_entities()
|
|
459
|
+
return True if enabled is None else entity_type.upper() in enabled
|
|
460
|
+
|
|
461
|
+
# ------------------------------------------------------------------
|
|
462
|
+
# Tabular column heuristics
|
|
463
|
+
# ------------------------------------------------------------------
|
|
464
|
+
|
|
465
|
+
def _normalize_column_name(self, column_name: str) -> str:
|
|
466
|
+
return re.sub(r"[^a-z0-9]+", " ", column_name.lower()).strip()
|
|
467
|
+
|
|
468
|
+
def _column_tokens(self, column_name: str) -> set[str]:
|
|
469
|
+
normalized = self._normalize_column_name(column_name)
|
|
470
|
+
return {t for t in normalized.split() if t}
|
|
471
|
+
|
|
472
|
+
def _is_free_text_column(self, column_name: str) -> bool:
|
|
473
|
+
return bool(self._column_tokens(column_name) & self._FREE_TEXT_COLUMN_TOKENS)
|
|
474
|
+
|
|
475
|
+
def _allowed_entities_for_column(self, column_name: str) -> set[str]:
|
|
476
|
+
"""Return the Presidio entity types that are relevant for this column name."""
|
|
477
|
+
enabled = self._enabled_entities() or self._ALL_SUPPORTED_ENTITIES
|
|
478
|
+
tokens = self._column_tokens(column_name)
|
|
479
|
+
|
|
480
|
+
if not tokens:
|
|
481
|
+
return enabled - self._NON_TEXT_ENTITY_TYPES
|
|
482
|
+
|
|
483
|
+
if tokens & self._FREE_TEXT_COLUMN_TOKENS:
|
|
484
|
+
return enabled
|
|
485
|
+
|
|
486
|
+
allowed: set[str] = set()
|
|
487
|
+
for token in tokens:
|
|
488
|
+
allowed.update(self._COLUMN_ENTITY_HINTS.get(token, set()))
|
|
489
|
+
|
|
490
|
+
if tokens & self._NAME_COLUMN_TOKENS and "company" not in tokens:
|
|
491
|
+
allowed.add("PERSON")
|
|
492
|
+
if tokens & self._EMAIL_COLUMN_TOKENS:
|
|
493
|
+
allowed.add("EMAIL_ADDRESS")
|
|
494
|
+
if tokens & self._PHONE_COLUMN_TOKENS:
|
|
495
|
+
allowed.add("PHONE_NUMBER")
|
|
496
|
+
if tokens & self._ADDRESS_COLUMN_TOKENS:
|
|
497
|
+
allowed.add("LOCATION")
|
|
498
|
+
|
|
499
|
+
if allowed:
|
|
500
|
+
return allowed & enabled
|
|
501
|
+
|
|
502
|
+
if tokens & self._URL_COLUMN_TOKENS:
|
|
503
|
+
return {"IP_ADDRESS", "URL"} & enabled
|
|
504
|
+
|
|
505
|
+
if tokens & self._ID_COLUMN_TOKENS:
|
|
506
|
+
return set()
|
|
507
|
+
|
|
508
|
+
return enabled - self._NON_TEXT_ENTITY_TYPES
|
|
509
|
+
|
|
510
|
+
def _is_entity_allowed_for_column(self, entity_type: str, column_name: str) -> bool:
|
|
511
|
+
allowed = self._allowed_entities_for_column(column_name)
|
|
512
|
+
if not allowed:
|
|
513
|
+
return False
|
|
514
|
+
entity_upper = entity_type.upper()
|
|
515
|
+
if entity_upper not in self._ALL_SUPPORTED_ENTITIES:
|
|
516
|
+
# Custom entity: allow in free-text columns when no pattern filter is active.
|
|
517
|
+
return self._enabled_entities() is None and self._is_free_text_column(column_name)
|
|
518
|
+
return entity_upper in allowed
|
|
519
|
+
|
|
520
|
+
# ------------------------------------------------------------------
|
|
521
|
+
# Presidio analysis
|
|
522
|
+
# ------------------------------------------------------------------
|
|
523
|
+
|
|
524
|
+
def _analyze_content(self, content: str, *, entities: list[str] | None = None) -> list[Any]:
|
|
525
|
+
if self.analyzer is None:
|
|
526
|
+
if self._init_error is not None:
|
|
527
|
+
raise self._init_error
|
|
528
|
+
return []
|
|
529
|
+
try:
|
|
530
|
+
return self.analyzer.analyze(text=content, language="en", entities=entities)
|
|
531
|
+
except ModuleNotFoundError as exc:
|
|
532
|
+
if "phonenumbers" in str(exc):
|
|
533
|
+
logger.warning("phonenumbers data missing mid-run; disabling PHONE_NUMBER entity")
|
|
534
|
+
self._probe_phone_recognizer()
|
|
535
|
+
# Retry without the now-removed phone recognizer
|
|
536
|
+
retry_entities = (
|
|
537
|
+
[e for e in entities if e != "PHONE_NUMBER"] if entities is not None else None
|
|
538
|
+
)
|
|
539
|
+
try:
|
|
540
|
+
return self.analyzer.analyze(
|
|
541
|
+
text=content, language="en", entities=retry_entities
|
|
542
|
+
)
|
|
543
|
+
except Exception:
|
|
544
|
+
return []
|
|
545
|
+
logger.error("PII analysis failed: %s", exc)
|
|
546
|
+
logger.exception(exc)
|
|
547
|
+
return []
|
|
548
|
+
except Exception as e:
|
|
549
|
+
logger.error("PII analysis failed: %s", e)
|
|
550
|
+
logger.exception(e)
|
|
551
|
+
return []
|
|
552
|
+
|
|
553
|
+
def _analyzer_supported_entities(self) -> frozenset[str]:
|
|
554
|
+
if self._supported_entities_cache is not None:
|
|
555
|
+
return self._supported_entities_cache
|
|
556
|
+
if self.analyzer is None:
|
|
557
|
+
return frozenset()
|
|
558
|
+
self._supported_entities_cache = frozenset(self.analyzer.get_supported_entities())
|
|
559
|
+
return self._supported_entities_cache
|
|
560
|
+
|
|
561
|
+
def _analyze_structured_cell(
|
|
562
|
+
self, content: str, *, allowed_entity_types: set[str]
|
|
563
|
+
) -> list[Any]:
|
|
564
|
+
if not allowed_entity_types or self.analyzer is None:
|
|
565
|
+
if self.analyzer is None and self._init_error is not None:
|
|
566
|
+
raise self._init_error
|
|
567
|
+
return []
|
|
568
|
+
|
|
569
|
+
entities = sorted(allowed_entity_types & self._analyzer_supported_entities())
|
|
570
|
+
if not entities:
|
|
571
|
+
return []
|
|
572
|
+
|
|
573
|
+
return self._analyze_content(content, entities=entities)
|
|
574
|
+
|
|
575
|
+
# ------------------------------------------------------------------
|
|
576
|
+
# Tabular content detection
|
|
577
|
+
# ------------------------------------------------------------------
|
|
578
|
+
|
|
579
|
+
def _extract_tabular_cells(self, content: str) -> list[_TabularCell]:
    """Parse the textual row/column layout in *content* into non-empty cells.

    The parser is line based: a row-header line starts a new row, a cell line
    (only honoured once a row has started) starts a new cell, and continuation or
    other non-empty lines are appended to the cell currently being collected. An
    empty line ends the current cell. Content that does not contain ``row_1:`` is
    not treated as tabular and yields ``[]``.

    Returns:
        Cells in document order, with empty-valued cells dropped.
    """
    if "row_1:" not in content:
        return []

    collected: list[_TabularCell] = []
    row_no: int | None = None
    column: str | None = None
    buffer: list[str] = []

    def emit() -> None:
        """Store the cell being collected (if any) and reset the collector."""
        nonlocal column, buffer
        if row_no is not None and column is not None:
            collected.append(
                _TabularCell(
                    row_index=row_no,
                    column_name=column,
                    value="\n".join(buffer).strip(),
                )
            )
        column = None
        buffer = []

    for raw in content.splitlines():
        if (found := self._TABULAR_ROW_RE.match(raw)) is not None:
            emit()
            row_no = int(found.group(1))
        elif row_no is not None and (found := self._TABULAR_CELL_RE.match(raw)) is not None:
            emit()
            column = found.group(1).strip()
            buffer = [found.group(2) or ""]
        elif column is not None and (found := self._TABULAR_CONTINUATION_RE.match(raw)) is not None:
            buffer.append(found.group(1))
        elif column is not None and raw:
            # Unindented text while a cell is open still belongs to that cell.
            buffer.append(raw)
        elif not raw:
            emit()

    emit()
    return [cell for cell in collected if cell.value]
|
|
632
|
+
|
|
633
|
+
def _should_keep_tabular_result(
    self, *, cell: _TabularCell, entity_type: str, matched_content: str
) -> bool:
    """Decide whether a per-cell finding is worth reporting.

    Everything is kept except ``PERSON`` matches in free-text columns that consist
    of fewer than two alphabetic words.
    """
    if entity_type == "PERSON" and self._is_free_text_column(cell.column_name):
        words = re.findall(r"[A-Za-z][A-Za-z'-]*", matched_content)
        return len(words) >= 2
    return True
|
|
642
|
+
|
|
643
|
+
def _dedupe_tabular_findings(self, findings: list[DetectionResult]) -> list[DetectionResult]:
    """Collapse duplicate findings, keeping the highest-confidence one per key.

    Two findings are duplicates when they share finding type, stripped matched
    content, and the ``tabular_row_index`` / ``tabular_column_name`` metadata
    values. The result keeps the order in which each key was first seen; on equal
    confidence the earlier finding wins.
    """
    # A dict keeps first-insertion order, and overwriting a value does not move
    # its key, so the dict alone carries the first-seen ordering.
    best: dict[tuple[str, str, int | None, str | None], DetectionResult] = {}

    for item in findings:
        meta = item.metadata or {}
        identity = (
            item.finding_type,
            item.matched_content.strip(),
            meta.get("tabular_row_index"),
            meta.get("tabular_column_name"),
        )
        current = best.get(identity)
        if current is None or item.confidence > current.confidence:
            best[identity] = item

    return list(best.values())
|
|
663
|
+
|
|
664
|
+
def _detect_tabular_content(self, content: str) -> list[DetectionResult] | None:
    """Run per-cell detection when *content* parses as tabular data.

    Args:
        content: Text handed to ``_extract_tabular_cells``.

    Returns:
        ``None`` when no cells were extracted or when there are more than
        ``_TABULAR_CELL_LIMIT`` cells (the caller then uses full-text
        analysis). Otherwise the de-duplicated findings whose confidence
        meets the threshold, which may be an empty list.
    """
    cells = self._extract_tabular_cells(content)
    if not cells:
        return None

    # For very wide/long pages fall back to full-text analysis to avoid
    # one analyzer call per cell.
    if len(cells) > self._TABULAR_CELL_LIMIT:
        logger.debug(
            "Page has %d cells (> %d limit); using full-text analysis instead of per-cell",
            len(cells),
            self._TABULAR_CELL_LIMIT,
        )
        return None

    # Only an unset threshold falls back to the default; an explicit 0.0
    # is a valid setting and must not be replaced by 0.7.
    configured_threshold = self._cfg.confidence_threshold
    threshold = 0.7 if configured_threshold is None else configured_threshold
    results: list[DetectionResult] = []

    for cell in cells:
        allowed = self._allowed_entities_for_column(cell.column_name)
        if not allowed:
            continue

        for result in self._analyze_structured_cell(cell.value, allowed_entity_types=allowed):
            if not self._is_entity_enabled(result.entity_type):
                continue
            if not self._is_entity_allowed_for_column(result.entity_type, cell.column_name):
                continue

            # Offsets reported by the analyzer are relative to the cell value.
            matched_content = cell.value[result.start : result.end]
            if not self._should_keep_tabular_result(
                cell=cell,
                entity_type=result.entity_type,
                matched_content=matched_content,
            ):
                continue

            detection = self._build_detection_result(
                matched_content=matched_content,
                entity_type=result.entity_type,
                confidence=result.score,
                recognition_metadata=result.recognition_metadata,
                line_number=cell.row_index,
                start=result.start,
                end=result.end,
                metadata={
                    "tabular_row_index": cell.row_index,
                    "tabular_column_name": cell.column_name,
                },
            )
            if detection.confidence >= threshold:
                results.append(detection)

    return self._dedupe_tabular_findings(results)
|
|
717
|
+
|
|
718
|
+
# ------------------------------------------------------------------
|
|
719
|
+
# Result construction
|
|
720
|
+
# ------------------------------------------------------------------
|
|
721
|
+
|
|
722
|
+
def _build_detection_result(
    self,
    *,
    matched_content: str,
    entity_type: str,
    confidence: float,
    recognition_metadata: dict[str, Any] | None,
    line_number: int,
    start: int,
    end: int,
    metadata: dict[str, Any] | None = None,
) -> DetectionResult:
    """Assemble a PII ``DetectionResult`` for a single match.

    Args:
        matched_content: Text of the match.
        entity_type: Entity label; used as the finding type and to look up
            the severity.
        confidence: Score stored on the result.
        recognition_metadata: Optional mapping; its ``recognizer_name`` entry
            is recorded, with ``"unknown"`` used when the mapping is empty,
            missing, or lacks that key.
        line_number: Number rendered into the location path.
        start: Start offset rendered into the location description.
        end: End offset rendered into the location description.
        metadata: Extra entries merged over the base metadata; on a key
            clash these values win.

    Returns:
        The populated ``DetectionResult``.
    """
    recognizer_name = "unknown"
    if recognition_metadata:
        recognizer_name = recognition_metadata.get("recognizer_name", "unknown")

    combined_metadata: dict[str, Any] = {
        "recognizer": recognizer_name,
        "entity_type": entity_type,
    }
    if metadata:
        combined_metadata.update(metadata)

    severity = self._get_severity_for_entity(entity_type)
    where = Location(
        path=f"line {line_number}",
        description=f"character range {start}-{end}",
    )
    return DetectionResult(
        detector_type=DetectorType.PII,
        finding_type=entity_type,
        category="PII",
        severity=severity,
        confidence=confidence,
        matched_content=matched_content,
        location=where,
        metadata=combined_metadata,
    )
|
|
756
|
+
|
|
757
|
+
def _get_severity_for_entity(self, entity_type: str) -> Severity:
    """Map an entity type to the severity reported for it.

    The lookup is case-insensitive. Entity types that appear in none of the
    sets below are reported as ``Severity.high``.

    Args:
        entity_type: Entity label, e.g. ``"US_SSN"``.

    Returns:
        ``Severity.critical``, ``Severity.high`` or ``Severity.medium``.
    """
    e = entity_type.upper()

    # Critical — government IDs, financial account numbers, biometric IDs
    if e in {
        "CREDIT_CARD",
        "CRYPTO",
        "IBAN_CODE",
        "US_SSN",
        "US_PASSPORT",
        "US_DRIVER_LICENSE",
        "US_BANK_NUMBER",
        "US_ITIN",
        "UK_NHS",
        "AT_SVNR",
        "CH_AHV",
        "DE_TAX_ID",
        "EU_NATIONAL_ID",
        "ES_NIF",
        "ES_NIE",
        "IT_FISCAL_CODE",
        "IT_PASSPORT",
        "IT_DRIVER_LICENSE",
        "IT_IDENTITY_CARD",
        "SG_NRIC_FIN",
        "AU_TFN",
        "AU_MEDICARE",
        "IN_PAN",
        "IN_AADHAAR",
        "FI_PERSONAL_IDENTITY_CODE",
        "PL_PESEL",
    }:
        return Severity.critical

    # High — contact identifiers, business numbers, less-direct personal IDs
    if e in {
        "EMAIL_ADDRESS",
        "PHONE_NUMBER",
        "IP_ADDRESS",
        "MEDICAL_LICENSE",
        "AU_ABN",
        "AU_ACN",
        "SG_UEN",
        # Was misspelled "IT_VAR_CODE"; the Italian VAT entity is IT_VAT_CODE.
        # The outcome is unchanged because the fallback below is also high.
        "IT_VAT_CODE",
        "IN_VOTER",
        "IN_VEHICLE_REGISTRATION",
    }:
        return Severity.high

    # Medium — contextual personal information
    if e in {"PERSON", "LOCATION", "DATE_TIME", "NRP", "URL"}:
        return Severity.medium

    # Anything not classified above is reported as high.
    return Severity.high
|
|
811
|
+
|
|
812
|
+
# ------------------------------------------------------------------
|
|
813
|
+
# Public API
|
|
814
|
+
# ------------------------------------------------------------------
|
|
815
|
+
|
|
816
|
+
async def detect(
    self, content: str | bytes, content_type: str = "text/plain"
) -> list[DetectionResult]:
    """Scan textual *content* for PII and return the findings.

    A ``bytes`` payload is not analysed and yields an empty list.
    *content_type* is accepted but not consulted here.
    """
    if not isinstance(content, bytes):
        # The analysis in ``_detect_sync`` is synchronous and CPU-bound; run
        # inline it would stall the event loop for the whole page. Handing it
        # to a worker thread keeps the loop free to service I/O meanwhile.
        findings = await asyncio.to_thread(self._detect_sync, content)
        return findings
    return []
|
|
827
|
+
|
|
828
|
+
def _chunk_text(self, text: str) -> list[tuple[str, int]]:
    """Split *text* into ``(chunk, offset)`` windows for analysis.

    Reads ``chunk_size`` and ``chunk_overlap`` from the config, treating a
    missing attribute like ``None``. When ``chunk_size`` is unset or zero the
    whole text is returned as a single chunk at offset 0. Otherwise windows
    of ``chunk_size`` characters are taken every ``chunk_size - overlap``
    characters (at least 1), stopping at the first window that reaches the
    end of the text.

    Args:
        text: The text to split.

    Returns:
        The chunks paired with their start offset in *text*. Empty when
        chunking is enabled and *text* is empty.
    """
    chunk_size: int | None = getattr(self._cfg, "chunk_size", None)
    if not chunk_size:
        return [(text, 0)]
    overlap: int = getattr(self._cfg, "chunk_overlap", None) or 0
    step = max(1, chunk_size - overlap)

    text_length = len(text)
    chunks: list[tuple[str, int]] = []
    for offset in range(0, text_length, step):
        chunks.append((text[offset : offset + chunk_size], offset))
        if offset + chunk_size >= text_length:
            # This window already covers the tail. With overlap, any later
            # window would be a suffix of it: redundant analyzer work on
            # text with less leading context.
            break
    return chunks
|
|
836
|
+
|
|
837
|
+
def _detect_sync(self, content: str) -> list[DetectionResult]:
    """Synchronously detect PII in *content*.

    The tabular path is tried first; when ``_detect_tabular_content`` returns
    a list (even an empty one) it is the final answer. Otherwise the text is
    analysed chunk by chunk, findings are de-duplicated on
    ``(entity type, absolute start, absolute end)`` and those below the
    confidence threshold are dropped. In both paths the result is cut to
    ``max_findings`` when that setting is truthy.

    Args:
        content: The text to analyse.

    Returns:
        The findings, in the order they were produced.
    """
    tabular_results = self._detect_tabular_content(content)
    if tabular_results is not None:
        if self._cfg.max_findings and len(tabular_results) > self._cfg.max_findings:
            return tabular_results[: self._cfg.max_findings]
        return tabular_results

    enabled = self._enabled_entities()
    entities = sorted(enabled) if enabled else None
    # Only an unset threshold falls back to the default; an explicit 0.0
    # is a valid setting and must not be replaced by 0.7.
    configured_threshold = self._cfg.confidence_threshold
    threshold = 0.7 if configured_threshold is None else configured_threshold
    results: list[DetectionResult] = []
    seen: set[tuple[str, int, int]] = set()

    for chunk, offset in self._chunk_text(content):
        for result in self._analyze_content(chunk, entities=entities):
            if not self._is_entity_enabled(result.entity_type):
                continue

            # Analyzer offsets are chunk-relative; shift them into the
            # coordinates of the full text so that a span reported from two
            # overlapping chunks collapses to one key.
            abs_start = result.start + offset
            abs_end = result.end + offset
            dedup_key = (result.entity_type, abs_start, abs_end)
            if dedup_key in seen:
                continue
            seen.add(dedup_key)

            # 1-based line of the match start.
            line_number = content[:abs_start].count("\n") + 1
            matched_content = content[abs_start:abs_end]

            detection = self._build_detection_result(
                matched_content=matched_content,
                entity_type=result.entity_type,
                confidence=result.score,
                recognition_metadata=result.recognition_metadata,
                line_number=line_number,
                start=abs_start,
                end=abs_end,
            )
            if detection.confidence >= threshold:
                results.append(detection)

    if self._cfg.max_findings and len(results) > self._cfg.max_findings:
        results = results[: self._cfg.max_findings]

    return results
|
|
881
|
+
|
|
882
|
+
def get_supported_content_types(self) -> list[str]:
    """Return the MIME types this detector declares support for."""
    supported = ("text/plain", "text/html", "application/json", "text/xml")
    # A fresh list per call, so callers may mutate what they receive.
    return list(supported)
|