euredact 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euredact/__init__.py +196 -0
- euredact/cache.py +50 -0
- euredact/cloud/__init__.py +1 -0
- euredact/cloud/client.py +19 -0
- euredact/cloud/hasher.py +1 -0
- euredact/cloud/shuffler.py +1 -0
- euredact/coref/__init__.py +1 -0
- euredact/coref/models.py +1 -0
- euredact/coref/resolver.py +1 -0
- euredact/normalizer.py +97 -0
- euredact/py.typed +0 -0
- euredact/rules/__init__.py +5 -0
- euredact/rules/countries/__init__.py +1 -0
- euredact/rules/countries/_base.py +31 -0
- euredact/rules/countries/_shared.py +171 -0
- euredact/rules/countries/at.py +82 -0
- euredact/rules/countries/be.py +135 -0
- euredact/rules/countries/bg.py +31 -0
- euredact/rules/countries/ch.py +97 -0
- euredact/rules/countries/cy.py +48 -0
- euredact/rules/countries/cz.py +31 -0
- euredact/rules/countries/de.py +167 -0
- euredact/rules/countries/dk.py +97 -0
- euredact/rules/countries/ee.py +40 -0
- euredact/rules/countries/el.py +49 -0
- euredact/rules/countries/es.py +91 -0
- euredact/rules/countries/fi.py +83 -0
- euredact/rules/countries/fr.py +142 -0
- euredact/rules/countries/hr.py +33 -0
- euredact/rules/countries/hu.py +38 -0
- euredact/rules/countries/ie.py +46 -0
- euredact/rules/countries/is_.py +80 -0
- euredact/rules/countries/it.py +92 -0
- euredact/rules/countries/lt.py +43 -0
- euredact/rules/countries/lu.py +105 -0
- euredact/rules/countries/lv.py +45 -0
- euredact/rules/countries/mt.py +47 -0
- euredact/rules/countries/nl.py +141 -0
- euredact/rules/countries/no.py +85 -0
- euredact/rules/countries/pl.py +32 -0
- euredact/rules/countries/pt.py +79 -0
- euredact/rules/countries/ro.py +29 -0
- euredact/rules/countries/se.py +123 -0
- euredact/rules/countries/si.py +33 -0
- euredact/rules/countries/sk.py +29 -0
- euredact/rules/countries/uk.py +61 -0
- euredact/rules/engine.py +119 -0
- euredact/rules/matchers.py +183 -0
- euredact/rules/registry.py +68 -0
- euredact/rules/structural.py +154 -0
- euredact/rules/suppressors.py +455 -0
- euredact/rules/validators.py +800 -0
- euredact/sdk.py +263 -0
- euredact/types.py +71 -0
- euredact-0.1.0.dist-info/METADATA +316 -0
- euredact-0.1.0.dist-info/RECORD +58 -0
- euredact-0.1.0.dist-info/WHEEL +4 -0
- euredact-0.1.0.dist-info/licenses/LICENSE +190 -0
euredact/__init__.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""EuRedact -- European PII redaction SDK.
|
|
2
|
+
|
|
3
|
+
Quick start::
|
|
4
|
+
|
|
5
|
+
import euredact
|
|
6
|
+
|
|
7
|
+
result = euredact.redact("Jan Janssens, BSN 123456782, woont in Gent.")
|
|
8
|
+
print(result.redacted_text)
|
|
9
|
+
print(result.detections)
|
|
10
|
+
|
|
11
|
+
# Batch processing:
|
|
12
|
+
results = euredact.redact_batch(["text one", "text two"], countries=["NL"])
|
|
13
|
+
|
|
14
|
+
# Async:
|
|
15
|
+
result = await euredact.aredact("some text")
|
|
16
|
+
|
|
17
|
+
# Available countries:
|
|
18
|
+
print(euredact.available_countries()) # ['AT', 'BE', 'DE', 'NL', ...]
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import Iterator
|
|
24
|
+
|
|
25
|
+
__version__ = "0.1.0"
|
|
26
|
+
|
|
27
|
+
from euredact.sdk import EuRedact
|
|
28
|
+
from euredact.types import Detection, DetectionSource, EntityType, RedactResult
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"__version__",
|
|
32
|
+
"aredact",
|
|
33
|
+
"aredact_batch",
|
|
34
|
+
"available_countries",
|
|
35
|
+
"Detection",
|
|
36
|
+
"DetectionSource",
|
|
37
|
+
"EntityType",
|
|
38
|
+
"EuRedact",
|
|
39
|
+
"redact",
|
|
40
|
+
"redact_batch",
|
|
41
|
+
"redact_iter",
|
|
42
|
+
"RedactResult",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Module-level singleton
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
_instance: EuRedact | None = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _get_instance() -> EuRedact:
    """Return the module-level ``EuRedact`` singleton, creating it on first use.

    The instance is stored in the module global ``_instance`` so that all
    module-level helpers share one object.

    NOTE(review): no lock guards the first construction; whether concurrent
    first calls from several threads matter depends on ``EuRedact`` -- confirm.
    """
    global _instance
    shared = _instance
    if shared is None:
        # First call: build the instance and remember it for later calls.
        shared = _instance = EuRedact()
    return shared
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# Public helpers
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def available_countries() -> list[str]:
    """Return the supported country codes (e.g. ``['AT', 'BE', ...]``).

    A new ``CountryRegistry`` is constructed on each call and its
    ``available_countries`` attribute is returned unchanged; the ordering
    (documented as sorted) is whatever the registry provides.
    """
    # Function-local import: the registry module is only loaded when this
    # helper is actually called.
    from euredact.rules.registry import CountryRegistry

    registry = CountryRegistry()
    return registry.available_countries
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def redact(
    text: str,
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    coref: bool = False,
    coref_model: str = "default",
    cache: bool = True,
) -> RedactResult:
    """Redact PII from ``text`` -- the package's main entry point.

    Delegates to ``EuRedact.redact`` on the shared module-level instance;
    every keyword option is forwarded unchanged.

    Args:
        text: The input text to redact.
        countries: Forwarded as-is (``None`` by default).
        mode: Forwarded as-is (``"rules"`` by default).
        pseudonymize: Forwarded as-is.
        detect_dates: Include date-of-birth / date-of-death detections.
            Off by default because bare dates are better handled by the
            cloud LLM tier. When True, the rule engine applies keyword and
            structural (JSON/CSV header) checks before emitting a date.
        coref: Forwarded as-is.
        coref_model: Forwarded as-is (``"default"`` by default).
        cache: Forwarded as-is (``True`` by default).

    Returns:
        The ``RedactResult`` produced by the shared instance.
    """
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "coref": coref,
        "coref_model": coref_model,
        "cache": cache,
    }
    engine = _get_instance()
    return engine.redact(text, **options)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def aredact(
    text: str,
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
) -> RedactResult:
    """Awaitable counterpart of ``redact()``.

    Awaits ``EuRedact.aredact`` on the shared module-level instance with all
    options forwarded unchanged. Per the SDK's contract, the CPU-bound work
    is offloaded to a thread pool so the event loop is not blocked, and the
    coroutine is safe to ``await`` from multiple concurrent tasks.

    Returns:
        The ``RedactResult`` produced by the shared instance.
    """
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "cache": cache,
    }
    engine = _get_instance()
    return await engine.aredact(text, **options)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def redact_batch(
    texts: list[str],
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
) -> list[RedactResult]:
    """Redact PII from several texts in one call.

    Delegates to ``EuRedact.redact_batch`` on the shared module-level
    instance. Per the SDK's contract this is more efficient than calling
    ``redact()`` in a loop because country configs are loaded once, and the
    results come back in the same order as ``texts``.

    Returns:
        One ``RedactResult`` per input text.
    """
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "cache": cache,
    }
    engine = _get_instance()
    return engine.redact_batch(texts, **options)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
async def aredact_batch(
    texts: list[str],
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
    max_concurrency: int = 4,
) -> list[RedactResult]:
    """Awaitable batch redaction with bounded concurrency.

    Awaits ``EuRedact.aredact_batch`` on the shared module-level instance.
    Per the SDK's contract the texts are processed concurrently in a thread
    pool, ``max_concurrency`` caps the number of parallel threads
    (default 4), and results are returned in input order.

    Returns:
        One ``RedactResult`` per input text.
    """
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "cache": cache,
        "max_concurrency": max_concurrency,
    }
    engine = _get_instance()
    return await engine.aredact_batch(texts, **options)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def redact_iter(
    texts: Iterator[str],
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
) -> Iterator[RedactResult]:
    """Yield redaction results lazily, one per input text.

    Returns whatever ``EuRedact.redact_iter`` returns for the shared
    module-level instance. Intended for large datasets where holding every
    result in memory is undesirable; per the SDK's contract the country
    configs are loaded once, on the first item.
    """
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "cache": cache,
    }
    engine = _get_instance()
    return engine.redact_iter(texts, **options)
|
euredact/cache.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""LRU result cache with SHA-256 content hashing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import threading
|
|
7
|
+
from collections import OrderedDict
|
|
8
|
+
|
|
9
|
+
from euredact.types import RedactResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ResultCache:
    """LRU cache for redaction results, keyed by a SHA-256 hex digest.

    All reads and writes of the underlying ``OrderedDict`` happen while
    holding a ``threading.Lock``. When constructed with ``enabled=False``,
    ``get`` always misses and ``put`` is a no-op.

    NOTE(review): ``key`` covers only the text, the country codes and the
    mode. If callers vary other options for the same text they must account
    for that themselves -- verify against the SDK.
    """

    def __init__(self, maxsize: int = 1024, enabled: bool = True) -> None:
        """Create an empty cache.

        Args:
            maxsize: Maximum number of entries kept; the least recently
                used entries are evicted once this is exceeded.
            enabled: When False the cache never stores or returns anything.
        """
        self._maxsize = maxsize
        self._enabled = enabled
        self._store: OrderedDict[str, RedactResult] = OrderedDict()
        self._lock = threading.Lock()

    def key(self, text: str, countries: tuple[str, ...], mode: str) -> str:
        """Compute the cache key for an input text and its configuration.

        Country codes are sorted first, so their order does not affect the
        key.

        Args:
            text: The input text.
            countries: Country codes used for the redaction.
            mode: The redaction mode.

        Returns:
            The SHA-256 hex digest (64 characters) of the combined string.
        """
        raw = f"{text}|{'|'.join(sorted(countries))}|{mode}"
        # "surrogatepass" yields the same bytes as strict UTF-8 for
        # well-formed text, but does not raise UnicodeEncodeError when the
        # input contains a lone surrogate (e.g. text decoded with
        # surrogateescape or taken from a JSON "\ud800" escape).
        encoded = raw.encode("utf-8", "surrogatepass")
        return hashlib.sha256(encoded).hexdigest()

    def get(self, key: str) -> RedactResult | None:
        """Return the cached result for ``key``, or None on a miss.

        A hit marks the entry as most recently used.
        """
        if not self._enabled:
            return None
        with self._lock:
            if key not in self._store:
                return None
            self._store.move_to_end(key)
            return self._store[key]

    def put(self, key: str, result: RedactResult) -> None:
        """Store ``result`` under ``key`` as the most recently used entry.

        Evicts least recently used entries while the cache holds more than
        ``maxsize`` items.
        """
        if not self._enabled:
            return
        with self._lock:
            self._store[key] = result
            # Assignment keeps an existing key's old position, so move it
            # to the end explicitly to refresh its recency.
            self._store.move_to_end(key)
            while len(self._store) > self._maxsize:
                self._store.popitem(last=False)

    def clear(self) -> None:
        """Remove every entry from the cache."""
        with self._lock:
            self._store.clear()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""[CLOUD EXTENSION] Cloud tier — stubbed, not implemented."""
|
euredact/cloud/client.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""[CLOUD EXTENSION] Placeholder cloud client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NotConfiguredError(Exception):
    """Raised when cloud tier is used without configuration.

    The constructor accepts an optional message so the exception can be
    rebuilt from its ``args`` (which is how ``pickle`` and ``copy``
    reconstruct exceptions). Calling it with no arguments still produces
    the default message.
    """

    # NOTE(review): the message points at ``euredact.configure``; confirm
    # that function is actually exported by the package.
    _DEFAULT_MESSAGE = (
        "Cloud tier not configured. Call euredact.configure(api_key=...) first."
    )

    def __init__(self, message: str | None = None) -> None:
        """Initialise the error.

        Args:
            message: Text for the exception; the default configuration hint
                is used when omitted or None.
        """
        super().__init__(self._DEFAULT_MESSAGE if message is None else message)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CloudClient:
    """Stub standing in for the cloud annotation client.

    It cannot be instantiated: construction always raises
    ``NotConfiguredError``.
    """

    def __init__(self) -> None:
        # Raising the class instantiates it with no arguments.
        raise NotConfiguredError
|
euredact/cloud/hasher.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""[CLOUD EXTENSION] Segment hashing + hash-to-position mapping."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""[CLOUD EXTENSION] Cross-client segment shuffling."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""[COREF EXTENSION] Coreference resolution — Pro/Enterprise, not implemented."""
|
euredact/coref/models.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""[COREF EXTENSION] Model registry: manages DistilBERT + TinyBERT variants."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""[COREF EXTENSION] Local coreference model: merges partial entity refs."""
|
euredact/normalizer.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Unicode NFC normalization for input text."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import unicodedata
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def normalize(text: str) -> tuple[str, list[int] | None]:
    """Normalize text to NFC form before pattern matching.

    Args:
        text: The original input text.

    Returns:
        A ``(normalized, mapping)`` pair. ``mapping`` is None when NFC did
        not change the length of the text; otherwise it is a list, built by
        ``_build_offset_mapping``, that translates a character index in the
        normalized text back to an index in the original text.
    """
    normalized = unicodedata.normalize("NFC", text)
    if len(normalized) == len(text):
        # Equal length is treated as "offsets line up one-to-one".
        # NOTE(review): a string in which one composition and one
        # decomposition cancel out would also land here -- confirm that is
        # acceptable.
        return normalized, None

    # The length changed, so detection offsets found in the normalized text
    # need translating back to original positions.
    return normalized, _build_offset_mapping(text, normalized)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _build_offset_mapping(original: str, normalized: str) -> list[int]:
|
|
36
|
+
"""Build mapping from normalized character positions to original positions.
|
|
37
|
+
|
|
38
|
+
Uses NFD as intermediate to align characters between original and NFC.
|
|
39
|
+
"""
|
|
40
|
+
nfd_original = unicodedata.normalize("NFD", original)
|
|
41
|
+
nfd_normalized = unicodedata.normalize("NFD", normalized)
|
|
42
|
+
|
|
43
|
+
# Both NFD forms should be identical
|
|
44
|
+
# Build: original char index -> NFD index range
|
|
45
|
+
orig_to_nfd: list[int] = []
|
|
46
|
+
nfd_idx = 0
|
|
47
|
+
for char in original:
|
|
48
|
+
nfd_char = unicodedata.normalize("NFD", char)
|
|
49
|
+
orig_to_nfd.append(nfd_idx)
|
|
50
|
+
nfd_idx += len(nfd_char)
|
|
51
|
+
|
|
52
|
+
# Build: NFD index -> normalized char index
|
|
53
|
+
nfd_to_norm: list[int] = []
|
|
54
|
+
nfd_idx = 0
|
|
55
|
+
for norm_idx, char in enumerate(normalized):
|
|
56
|
+
nfd_char = unicodedata.normalize("NFD", char)
|
|
57
|
+
for _ in nfd_char:
|
|
58
|
+
nfd_to_norm.append(norm_idx)
|
|
59
|
+
nfd_idx += len(nfd_char)
|
|
60
|
+
|
|
61
|
+
# Build: normalized index -> original index
|
|
62
|
+
# For each normalized position, find which original character maps to it
|
|
63
|
+
norm_to_orig: list[int] = []
|
|
64
|
+
orig_char_idx = 0
|
|
65
|
+
orig_nfd_pos = 0
|
|
66
|
+
for norm_idx in range(len(normalized)):
|
|
67
|
+
nfd_of_norm_char = unicodedata.normalize("NFD", normalized[norm_idx])
|
|
68
|
+
nfd_start = sum(
|
|
69
|
+
len(unicodedata.normalize("NFD", normalized[i]))
|
|
70
|
+
for i in range(norm_idx)
|
|
71
|
+
)
|
|
72
|
+
# Find which original character this NFD position belongs to
|
|
73
|
+
target_orig = 0
|
|
74
|
+
running = 0
|
|
75
|
+
for oi, oc in enumerate(original):
|
|
76
|
+
nfd_oc = unicodedata.normalize("NFD", oc)
|
|
77
|
+
if running + len(nfd_oc) > nfd_start:
|
|
78
|
+
target_orig = oi
|
|
79
|
+
break
|
|
80
|
+
running += len(nfd_oc)
|
|
81
|
+
norm_to_orig.append(target_orig)
|
|
82
|
+
|
|
83
|
+
return norm_to_orig
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def map_offset_to_original(
    offset: int, mapping: list[int] | None
) -> int:
    """Translate an offset in the normalized text into the original text.

    Args:
        offset: Character offset in the normalized text.
        mapping: Normalized-index -> original-index list, or None when the
            two texts line up one-to-one.

    Returns:
        ``offset`` itself when there is no mapping (or the mapping is empty
        and the offset is past it); ``mapping[offset]`` when the offset lies
        inside the mapping; otherwise a value extrapolated from the last
        mapped position.
    """
    if mapping is None:
        return offset

    size = len(mapping)
    if offset < size:
        return mapping[offset]

    if not mapping:
        # Nothing to extrapolate from.
        return offset

    # Past the end of the mapping: continue one-for-one from the last entry.
    overshoot = offset - size + 1
    return mapping[-1] + overshoot
|
euredact/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Country-specific PII pattern configurations."""
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Base country configuration class."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
from euredact.types import EntityType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class PatternDef:
    """A single pattern definition.

    Country configs build lists of these; each one pairs a regular
    expression with the entity type it detects plus optional validation and
    context hints.
    """

    # Kind of entity a match represents (e.g. EntityType.EMAIL).
    entity_type: EntityType
    # Regular-expression source; the country configs pass raw strings.
    pattern: str
    # Name of a validator to run on matches (configs use e.g. "iban",
    # "luhn"), or None for no validation. How the name is resolved is not
    # visible in this module.
    validator: str | None = None
    # Human-readable description of the pattern.
    description: str = ""
    # Keywords associated with the pattern; a fresh list per instance.
    context_keywords: list[str] = field(default_factory=list)
    # Presumably: only report a match when a context keyword is found
    # nearby -- TODO confirm against the rule engine.
    requires_context: bool = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class CountryConfig:
    """Base class for country-specific PII patterns.

    Subclasses in this package (e.g. ``SharedConfig``, ``ATConfig``)
    override ``__post_init__`` to fill in ``code``, ``name`` and
    ``patterns``.
    """

    # Identifier for the config (subclasses set e.g. "AT" or "SHARED").
    code: str = ""
    # Human-readable name (e.g. "Austria").
    name: str = ""
    # Pattern definitions belonging to this config; a fresh list per instance.
    patterns: list[PatternDef] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Override in subclasses to populate patterns.

        The dataclass-generated ``__init__`` calls this hook after assigning
        the fields; the base implementation does nothing.
        """
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Shared (EU-wide) PII patterns: email, generic IBAN, international phone, dates, digital IDs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from euredact.rules.countries._base import CountryConfig, PatternDef
|
|
6
|
+
from euredact.types import EntityType
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SharedConfig(CountryConfig):
    """Patterns shared across all EU countries.

    Registered under the code ``"SHARED"``. Covers email, BIC, credit card,
    VIN, IPv4/IPv6, MAC, IMEI, GPS coordinates, UUID, social handles and
    context-gated dates (birth / death).
    """

    def __post_init__(self) -> None:
        """Populate ``code``, ``name`` and the shared pattern list."""
        self.code = "SHARED"
        self.name = "Shared EU Patterns"
        self.patterns = [
            # --- Email ---
            PatternDef(
                entity_type=EntityType.EMAIL,
                pattern=r"\b[\w._%+\-]+@[\w.\-]+\.[a-zA-Z]{2,}\b",
                validator=None,
                description="Email address (RFC 5322 simplified)",
            ),
            # --- BIC/SWIFT ---
            # 4 letters (bank) + 2 letters (country) + 2 alphanumerics,
            # optionally followed by a 3-character branch code.
            PatternDef(
                entity_type=EntityType.BIC,
                pattern=r"\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\b",
                validator="bic",
                description="BIC/SWIFT code (8 or 11 characters)",
            ),
            # --- Credit Card (Visa, Mastercard, Amex) ---
            # Digit groups may be separated by a single whitespace or dash.
            PatternDef(
                entity_type=EntityType.CREDIT_CARD,
                pattern=(
                    r"\b(?:"
                    r"4[0-9]{3}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}"  # Visa
                    r"|5[1-5][0-9]{2}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}"  # MC
                    r"|3[47][0-9]{2}[\s\-]?[0-9]{6}[\s\-]?[0-9]{5}"  # Amex
                    r")\b"
                ),
                validator="luhn",
                description="Credit card number (Visa, Mastercard, Amex) with Luhn",
            ),
            # --- VIN ---
            # 17 characters from digits and upper-case letters except I, O, Q.
            PatternDef(
                entity_type=EntityType.VIN,
                pattern=r"\b[A-HJ-NPR-Z0-9]{17}\b",
                validator="vin",
                description="Vehicle Identification Number (ISO 3779)",
            ),
            # --- IPv4 ---
            # Four dot-separated octets, each limited to 0-255.
            PatternDef(
                entity_type=EntityType.IP_ADDRESS,
                pattern=(
                    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
                    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
                ),
                validator=None,
                description="IPv4 address",
            ),
            # --- IPv6 ---
            # Alternatives: full 8-group form, trailing "::", a single "::"
            # before the last group, and leading "::".
            # NOTE(review): the "\b" next to "::" needs a word character on
            # the other side, so a bare trailing "fe80::" or a "::1" at the
            # start of the text may not match -- confirm intended.
            PatternDef(
                entity_type=EntityType.IPV6_ADDRESS,
                pattern=(
                    r"\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b"
                    r"|\b(?:[0-9a-fA-F]{1,4}:){1,7}:\b"
                    r"|\b(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}\b"
                    r"|\b::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\b"
                ),
                validator=None,
                description="IPv6 address",
            ),
            # --- MAC Address (colon/dash: 00:1A:2B:3C:4D:5E) ---
            PatternDef(
                entity_type=EntityType.MAC_ADDRESS,
                pattern=r"\b(?:[0-9a-fA-F]{2}[:\-]){5}[0-9a-fA-F]{2}\b",
                validator=None,
                description="MAC address (colon or dash separated)",
            ),
            # MAC Address Cisco format (dot: 0670.3A83.C107)
            PatternDef(
                entity_type=EntityType.MAC_ADDRESS,
                pattern=r"\b[0-9a-fA-F]{4}\.[0-9a-fA-F]{4}\.[0-9a-fA-F]{4}\b",
                validator=None,
                description="MAC address (Cisco dot notation)",
            ),
            # --- IMEI (15 digits, Luhn check) ---
            # Luhn + TAC validation is strong enough to not require context
            # (the checks themselves live in the "imei" validator, which is
            # not visible in this module).
            PatternDef(
                entity_type=EntityType.IMEI,
                pattern=r"\b\d{15}\b",
                validator="imei",
                description="IMEI — 15 digits with Luhn check",
            ),
            # IMEI with separators: XX-XXXXXX-XXXXXX-X (dash or whitespace)
            PatternDef(
                entity_type=EntityType.IMEI,
                pattern=r"\b\d{2}[\-\s]\d{6}[\-\s]\d{6}[\-\s]\d\b",
                validator="imei",
                description="IMEI — formatted with separators",
            ),
            # --- GPS Coordinates (decimal degrees) ---
            # Latitude: -90 to 90, Longitude: -180 to 180, at least 4 decimal places.
            # The two values are separated by ",", ";" or "/" with optional
            # surrounding whitespace.
            PatternDef(
                entity_type=EntityType.GPS_COORDINATES,
                pattern=(
                    r"-?(?:[1-8]?\d(?:\.\d{4,})|90(?:\.0{4,}))"
                    r"\s*[,;/]\s*"
                    r"-?(?:1[0-7]\d(?:\.\d{4,})|180(?:\.0{4,})|\d{1,2}(?:\.\d{4,}))"
                ),
                validator=None,
                description="GPS coordinates — decimal degrees (lat, lon) with 4+ decimals",
            ),
            # --- UUID (version 1–5, standard 8-4-4-4-12 hex format) ---
            PatternDef(
                entity_type=EntityType.UUID,
                pattern=r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b",
                validator=None,
                description="UUID (RFC 4122, versions 1-5)",
            ),
            # --- Social Media Handles (@username) ---
            # The "@" must not be preceded by a word character (so start of
            # text, whitespace or punctuation are all fine). The username
            # starts with an ASCII letter followed by 1-29 letters, digits,
            # underscores or dots, i.e. 2-30 characters in total.
            PatternDef(
                entity_type=EntityType.SOCIAL_HANDLE,
                pattern=r"(?<!\w)@[a-zA-Z][a-zA-Z0-9_.]{1,29}\b",
                validator=None,
                description="Social media handle (@username)",
            ),
            # --- Date of Birth (EU formats, requires context) ---
            # DD/MM/YYYY with "/", "." or "-" as separators (the two
            # separators are matched independently); years 1900-2099.
            PatternDef(
                entity_type=EntityType.DOB,
                pattern=r"\b(?:0[1-9]|[12][0-9]|3[01])[/.\-](?:0[1-9]|1[0-2])[/.\-](?:19|20)\d{2}\b",
                validator=None,
                description="Date in DD/MM/YYYY format (EU standard) — requires context",
                context_keywords=[
                    "geboren", "geboortedatum", "date de naissance", "né le", "née le",
                    "né(e) le", "nee le", "nee(e) le",
                    "date of birth", "DOB", "Geburtsdatum", "geboren am", "geboren op",
                    "nascido", "nacido", "data di nascita", "nato il", "nata il",
                    "geb.", "geb.datum", "geb ", "birth date", "birthday",
                    "naissance", "geboorte", "geburtstag",
                ],
                requires_context=True,
            ),
            # --- Date of Death (requires context) ---
            # Same regex as the DD/MM/YYYY birth date above; only the
            # entity type and the context keywords differ.
            PatternDef(
                entity_type=EntityType.DATE_OF_DEATH,
                pattern=r"\b(?:0[1-9]|[12][0-9]|3[01])[/.\-](?:0[1-9]|1[0-2])[/.\-](?:19|20)\d{2}\b",
                validator=None,
                description="Date of death — requires context",
                context_keywords=[
                    "overleden", "overlijdensdatum", "date de décès", "décédé le",
                    "date of death", "Sterbedatum", "verstorben am", "gestorven",
                    "death date", "died on", "mort le", "décès",
                ],
                requires_context=True,
            ),
            # --- ISO date format YYYY-MM-DD (DOB, requires context) ---
            # Year-first order; besides "-" the regex also accepts "/" and
            # "." as separators.
            # NOTE(review): this keyword list is shorter than the one on the
            # DD/MM/YYYY pattern (no "nee(e) le", "nascido", "nacido",
            # "data di nascita", "nato il", "nata il", "geb.datum", "geb ",
            # "geburtstag") -- confirm whether that is intentional.
            PatternDef(
                entity_type=EntityType.DOB,
                pattern=r"\b(?:19|20)\d{2}[/.\-](?:0[1-9]|1[0-2])[/.\-](?:0[1-9]|[12][0-9]|3[01])\b",
                validator=None,
                description="Date in YYYY-MM-DD format (ISO) — requires context",
                context_keywords=[
                    "geboren", "geboortedatum", "date de naissance", "né le", "née le",
                    "né(e) le", "nee le",
                    "date of birth", "DOB", "Geburtsdatum", "geboren am", "geboren op",
                    "geb.", "birth date", "birthday", "naissance", "geboorte",
                ],
                requires_context=True,
            ),
        ]
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Austria (AT) PII patterns."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from euredact.rules.countries._base import CountryConfig, PatternDef
|
|
6
|
+
from euredact.types import EntityType
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ATConfig(CountryConfig):
    """Austrian PII patterns: SVNR, IBAN, VAT, phone, license plate, postal code."""

    def __post_init__(self) -> None:
        """Populate ``code``, ``name`` and the Austrian pattern list."""
        self.code = "AT"
        self.name = "Austria"
        self.patterns = [
            # --- SVNR (Sozialversicherungsnummer) — 10 digits ---
            # Written as 4 + 6 digits with an optional single whitespace
            # between the groups; the check digit is left to the
            # "austrian_svnr" validator (not visible in this module).
            PatternDef(
                entity_type=EntityType.NATIONAL_ID,
                pattern=r"\b\d{4}\s?\d{6}\b",
                validator="austrian_svnr",
                description="Austrian SVNR — 10 digits with check digit",
            ),
            # --- IBAN ---
            # "AT" + 2 check digits + 16 digits, optionally grouped in
            # blocks of four separated by a single whitespace.
            PatternDef(
                entity_type=EntityType.IBAN,
                pattern=r"\bAT\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\b",
                validator="iban",
                description="Austrian IBAN — AT + 18 digits",
            ),
            # --- VAT (UID) ---
            PatternDef(
                entity_type=EntityType.VAT,
                pattern=r"\bATU\d{8}\b",
                validator=None,
                description="Austrian VAT (UID) — ATU + 8 digits",
            ),
            # --- Phone (national) ---
            # Leading 0, 3-4 further prefix digits, optional whitespace or
            # dash, then 5-8 digits.
            PatternDef(
                entity_type=EntityType.PHONE,
                pattern=r"\b0\d{3,4}[\s\-]?\d{5,8}\b",
                validator=None,
                description="Austrian national phone",
            ),
            # --- Phone (international) ---
            # Same shape as the national form with "+43" replacing the
            # leading 0; no word boundaries are used here.
            PatternDef(
                entity_type=EntityType.PHONE,
                pattern=r"\+43\s?\d{3,4}[\s\-]?\d{5,8}",
                validator=None,
                description="Austrian international phone — +43",
            ),
            # --- License Plate ---
            # 1-2 letters (including Ä/Ö/Ü), 1-5 digits, 1-2 ASCII letters,
            # each group optionally separated by a single whitespace.
            # Flagged as requiring context.
            PatternDef(
                entity_type=EntityType.LICENSE_PLATE,
                pattern=r"\b[A-ZÄÖÜ]{1,2}\s?\d{1,5}\s?[A-Z]{1,2}\b",
                validator=None,
                description="Austrian license plate — district + number + letters",
                context_keywords=[
                    "Kennzeichen", "Nummernschild", "Kfz-Kennzeichen",
                ],
                requires_context=True,
            ),
            # --- Postal Code ---
            # Any 4-digit number not starting with 0; flagged as requiring
            # context because the shape alone is very generic.
            PatternDef(
                entity_type=EntityType.POSTAL_CODE,
                pattern=r"\b[1-9]\d{3}\b",
                validator=None,
                description="Austrian postal code — 4 digits",
                context_keywords=[
                    "PLZ", "Postleitzahl", "Adresse", "Anschrift",
                    "Straße", "Str.", "Gasse", "Weg", "Platz",
                    "Postal:", "Wohnort",
                ],
                requires_context=True,
            ),
            # Structural: ", XXXX City" — the code must be preceded by
            # ", " and followed by a space plus an upper-case ASCII letter.
            # (The "City, XXXX" order is not matched by this regex.)
            PatternDef(
                entity_type=EntityType.POSTAL_CODE,
                pattern=r"(?<=, )[1-9]\d{3}(?= [A-Z])",
                validator=None,
                description="Austrian postal code — in address structure",
            ),
        ]
|