euredact 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. euredact/__init__.py +196 -0
  2. euredact/cache.py +50 -0
  3. euredact/cloud/__init__.py +1 -0
  4. euredact/cloud/client.py +19 -0
  5. euredact/cloud/hasher.py +1 -0
  6. euredact/cloud/shuffler.py +1 -0
  7. euredact/coref/__init__.py +1 -0
  8. euredact/coref/models.py +1 -0
  9. euredact/coref/resolver.py +1 -0
  10. euredact/normalizer.py +97 -0
  11. euredact/py.typed +0 -0
  12. euredact/rules/__init__.py +5 -0
  13. euredact/rules/countries/__init__.py +1 -0
  14. euredact/rules/countries/_base.py +31 -0
  15. euredact/rules/countries/_shared.py +171 -0
  16. euredact/rules/countries/at.py +82 -0
  17. euredact/rules/countries/be.py +135 -0
  18. euredact/rules/countries/bg.py +31 -0
  19. euredact/rules/countries/ch.py +97 -0
  20. euredact/rules/countries/cy.py +48 -0
  21. euredact/rules/countries/cz.py +31 -0
  22. euredact/rules/countries/de.py +167 -0
  23. euredact/rules/countries/dk.py +97 -0
  24. euredact/rules/countries/ee.py +40 -0
  25. euredact/rules/countries/el.py +49 -0
  26. euredact/rules/countries/es.py +91 -0
  27. euredact/rules/countries/fi.py +83 -0
  28. euredact/rules/countries/fr.py +142 -0
  29. euredact/rules/countries/hr.py +33 -0
  30. euredact/rules/countries/hu.py +38 -0
  31. euredact/rules/countries/ie.py +46 -0
  32. euredact/rules/countries/is_.py +80 -0
  33. euredact/rules/countries/it.py +92 -0
  34. euredact/rules/countries/lt.py +43 -0
  35. euredact/rules/countries/lu.py +105 -0
  36. euredact/rules/countries/lv.py +45 -0
  37. euredact/rules/countries/mt.py +47 -0
  38. euredact/rules/countries/nl.py +141 -0
  39. euredact/rules/countries/no.py +85 -0
  40. euredact/rules/countries/pl.py +32 -0
  41. euredact/rules/countries/pt.py +79 -0
  42. euredact/rules/countries/ro.py +29 -0
  43. euredact/rules/countries/se.py +123 -0
  44. euredact/rules/countries/si.py +33 -0
  45. euredact/rules/countries/sk.py +29 -0
  46. euredact/rules/countries/uk.py +61 -0
  47. euredact/rules/engine.py +119 -0
  48. euredact/rules/matchers.py +183 -0
  49. euredact/rules/registry.py +68 -0
  50. euredact/rules/structural.py +154 -0
  51. euredact/rules/suppressors.py +455 -0
  52. euredact/rules/validators.py +800 -0
  53. euredact/sdk.py +263 -0
  54. euredact/types.py +71 -0
  55. euredact-0.1.0.dist-info/METADATA +316 -0
  56. euredact-0.1.0.dist-info/RECORD +58 -0
  57. euredact-0.1.0.dist-info/WHEEL +4 -0
  58. euredact-0.1.0.dist-info/licenses/LICENSE +190 -0
euredact/__init__.py ADDED
@@ -0,0 +1,196 @@
1
+ """EuRedact -- European PII redaction SDK.
2
+
3
+ Quick start::
4
+
5
+ import euredact
6
+
7
+ result = euredact.redact("Jan Janssens, BSN 123456782, woont in Gent.")
8
+ print(result.redacted_text)
9
+ print(result.detections)
10
+
11
+ # Batch processing:
12
+ results = euredact.redact_batch(["text one", "text two"], countries=["NL"])
13
+
14
+ # Async:
15
+ result = await euredact.aredact("some text")
16
+
17
+ # Available countries:
18
+ print(euredact.available_countries()) # ['AT', 'BE', 'DE', 'NL', ...]
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import Iterator
24
+
25
+ __version__ = "0.1.0"
26
+
27
+ from euredact.sdk import EuRedact
28
+ from euredact.types import Detection, DetectionSource, EntityType, RedactResult
29
+
30
+ __all__ = [
31
+ "__version__",
32
+ "aredact",
33
+ "aredact_batch",
34
+ "available_countries",
35
+ "Detection",
36
+ "DetectionSource",
37
+ "EntityType",
38
+ "EuRedact",
39
+ "redact",
40
+ "redact_batch",
41
+ "redact_iter",
42
+ "RedactResult",
43
+ ]
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Module-level singleton
47
+ # ---------------------------------------------------------------------------
48
+
49
+ _instance: EuRedact | None = None
50
+
51
+
52
def _get_instance() -> EuRedact:
    """Return the shared module-level ``EuRedact`` instance.

    The instance is constructed on the first call and the same object is
    handed back on every later call.
    """
    global _instance
    if _instance is not None:
        return _instance
    _instance = EuRedact()
    return _instance
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Public helpers
61
+ # ---------------------------------------------------------------------------
62
+
63
+
64
def available_countries() -> list[str]:
    """Return the supported country codes (e.g. ``['AT', 'BE', ...]``).

    The value is whatever a freshly created ``CountryRegistry`` reports via
    its ``available_countries`` attribute; presumably that list is already
    sorted -- confirm against the registry implementation.
    """
    # Imported inside the function, so the registry module is only loaded
    # when this helper is actually called.
    from euredact.rules.registry import CountryRegistry

    registry = CountryRegistry()
    return registry.available_countries
68
+
69
+
70
def redact(
    text: str,
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    coref: bool = False,
    coref_model: str = "default",
    cache: bool = True,
) -> RedactResult:
    """Redact PII from ``text``. Main entry point.

    Delegates to the shared module-level ``EuRedact`` instance; every
    keyword argument is forwarded unchanged to ``EuRedact.redact``.

    Args:
        detect_dates: Include date-of-birth / date-of-death detections.
            Off by default -- bare dates are better handled by the cloud
            LLM tier. When True, the rule engine applies keyword and
            structural (JSON/CSV header) checks before emitting a date.
    """
    forwarded = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "coref": coref,
        "coref_model": coref_model,
        "cache": cache,
    }
    return _get_instance().redact(text, **forwarded)
99
+
100
+
101
async def aredact(
    text: str,
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
) -> RedactResult:
    """Async counterpart of ``redact()``.

    Awaits ``EuRedact.aredact`` on the shared instance, which offloads the
    CPU-bound work to a thread pool so the event loop is not blocked. Safe
    to ``await`` from multiple concurrent tasks.

    Unlike ``redact()``, this wrapper does not expose ``coref`` or
    ``coref_model``.
    """
    engine = _get_instance()
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "cache": cache,
    }
    return await engine.aredact(text, **options)
123
+
124
+
125
def redact_batch(
    texts: list[str],
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
) -> list[RedactResult]:
    """Redact PII from several texts in one call.

    Delegates to ``EuRedact.redact_batch`` on the shared instance. More
    efficient than calling ``redact()`` in a loop because country configs
    are loaded once. Results come back in the same order as ``texts``.
    """
    engine = _get_instance()
    return engine.redact_batch(
        texts,
        countries=countries,
        mode=mode,
        pseudonymize=pseudonymize,
        detect_dates=detect_dates,
        cache=cache,
    )
147
+
148
+
149
async def aredact_batch(
    texts: list[str],
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
    max_concurrency: int = 4,
) -> list[RedactResult]:
    """Async batch redaction with bounded concurrency.

    Awaits ``EuRedact.aredact_batch`` on the shared instance, which
    processes the texts concurrently in a thread pool. ``max_concurrency``
    caps the number of parallel threads (default 4). Results are returned
    in input order.
    """
    options = {
        "countries": countries,
        "mode": mode,
        "pseudonymize": pseudonymize,
        "detect_dates": detect_dates,
        "cache": cache,
        "max_concurrency": max_concurrency,
    }
    return await _get_instance().aredact_batch(texts, **options)
173
+
174
+
175
def redact_iter(
    texts: Iterator[str],
    *,
    countries: list[str] | None = None,
    mode: str = "rules",
    pseudonymize: bool = False,
    detect_dates: bool = False,
    cache: bool = True,
) -> Iterator[RedactResult]:
    """Return a lazy iterator yielding one result per input text.

    Useful for large datasets where holding every result in memory is
    undesirable. Country configs are loaded once, on the first item.
    The iterator itself is produced by ``EuRedact.redact_iter`` on the
    shared instance.
    """
    # Deliberately a plain ``return`` (not ``yield from``): the shared
    # instance is resolved at call time rather than on first iteration.
    engine = _get_instance()
    return engine.redact_iter(
        texts,
        countries=countries,
        mode=mode,
        pseudonymize=pseudonymize,
        detect_dates=detect_dates,
        cache=cache,
    )
euredact/cache.py ADDED
@@ -0,0 +1,50 @@
1
+ """LRU result cache with SHA-256 content hashing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import threading
7
+ from collections import OrderedDict
8
+
9
+ from euredact.types import RedactResult
10
+
11
+
12
class ResultCache:
    """LRU cache of redaction results, keyed on a SHA-256 digest.

    All access to the underlying ``OrderedDict`` is guarded by a lock.
    When ``enabled`` is False, ``get`` always misses and ``put`` is a no-op.
    """

    def __init__(self, maxsize: int = 1024, enabled: bool = True) -> None:
        """Create an empty cache holding at most ``maxsize`` entries."""
        self._maxsize = maxsize
        self._enabled = enabled
        # Oldest (least recently used) entries sit at the front.
        self._store: OrderedDict[str, RedactResult] = OrderedDict()
        self._lock = threading.Lock()

    def key(self, text: str, countries: tuple[str, ...], mode: str) -> str:
        """Compute the cache key for an input text and its configuration.

        Country order does not matter (codes are sorted first). Each
        component is hashed with a length prefix, so the boundaries between
        text, country codes and mode are unambiguous: a text containing the
        former ``|`` separator can no longer collide with a different
        text/country combination.

        NOTE(review): only ``text``, ``countries`` and ``mode`` take part in
        the key; other redaction options (e.g. ``pseudonymize``,
        ``detect_dates``) must be handled by the caller -- confirm.

        Returns:
            The hexadecimal SHA-256 digest (64 characters).
        """
        digest = hashlib.sha256()
        for part in (text, *sorted(countries), mode):
            # ``surrogatepass`` keeps malformed input (lone surrogates)
            # hashable instead of raising UnicodeEncodeError.
            data = part.encode("utf-8", "surrogatepass")
            digest.update(len(data).to_bytes(8, "big"))
            digest.update(data)
        return digest.hexdigest()

    def get(self, key: str) -> RedactResult | None:
        """Return the cached result for ``key``, or None on a miss.

        A hit marks the entry as most recently used.
        """
        if not self._enabled:
            return None
        with self._lock:
            if key not in self._store:
                return None
            self._store.move_to_end(key)
            return self._store[key]

    def put(self, key: str, result: RedactResult) -> None:
        """Store ``result`` under ``key`` and evict the oldest overflow entries."""
        if not self._enabled:
            return
        with self._lock:
            self._store[key] = result
            # Re-inserting an existing key keeps its old position, so move
            # it to the most-recently-used end explicitly.
            self._store.move_to_end(key)
            while len(self._store) > self._maxsize:
                self._store.popitem(last=False)

    def clear(self) -> None:
        """Remove every entry from the cache."""
        with self._lock:
            self._store.clear()
@@ -0,0 +1 @@
1
+ """[CLOUD EXTENSION] Cloud tier — stubbed, not implemented."""
@@ -0,0 +1,19 @@
1
+ """[CLOUD EXTENSION] Placeholder cloud client."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class NotConfiguredError(Exception):
    """Raised when the cloud tier is used without being configured.

    The exception takes no arguments; its message is fixed.
    """

    def __init__(self) -> None:
        message = (
            "Cloud tier not configured. Call euredact.configure(api_key=...) first."
        )
        super().__init__(message)
13
+
14
+
15
class CloudClient:
    """Placeholder for the cloud annotation client.

    Instantiation always fails with ``NotConfiguredError``.
    """

    def __init__(self) -> None:
        not_configured = NotConfiguredError()
        raise not_configured
@@ -0,0 +1 @@
1
+ """[CLOUD EXTENSION] Segment hashing + hash-to-position mapping."""
@@ -0,0 +1 @@
1
+ """[CLOUD EXTENSION] Cross-client segment shuffling."""
@@ -0,0 +1 @@
1
+ """[COREF EXTENSION] Coreference resolution — Pro/Enterprise, not implemented."""
@@ -0,0 +1 @@
1
+ """[COREF EXTENSION] Model registry: manages DistilBERT + TinyBERT variants."""
@@ -0,0 +1 @@
1
+ """[COREF EXTENSION] Local coreference model: merges partial entity refs."""
euredact/normalizer.py ADDED
@@ -0,0 +1,97 @@
1
+ """Unicode NFC normalization for input text."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import unicodedata
6
+
7
+
8
def normalize(text: str) -> tuple[str, list[int] | None]:
    """Normalize ``text`` to Unicode NFC form before pattern matching.

    Returns:
        A ``(normalized, mapping)`` pair. ``mapping`` is None when
        normalization did not change the length of the text; offsets are
        then treated as identical in both strings. Otherwise ``mapping[i]``
        is the index in ``text`` of the character that normalized position
        ``i`` originates from, so detection offsets can be translated back
        to the original text.
    """
    normalized = unicodedata.normalize("NFC", text)
    if len(normalized) == len(text):
        # For most European text NFC does not change the length.
        return normalized, None
    return normalized, _build_offset_mapping(text, normalized)
33
+
34
+
35
+ def _build_offset_mapping(original: str, normalized: str) -> list[int]:
36
+ """Build mapping from normalized character positions to original positions.
37
+
38
+ Uses NFD as intermediate to align characters between original and NFC.
39
+ """
40
+ nfd_original = unicodedata.normalize("NFD", original)
41
+ nfd_normalized = unicodedata.normalize("NFD", normalized)
42
+
43
+ # Both NFD forms should be identical
44
+ # Build: original char index -> NFD index range
45
+ orig_to_nfd: list[int] = []
46
+ nfd_idx = 0
47
+ for char in original:
48
+ nfd_char = unicodedata.normalize("NFD", char)
49
+ orig_to_nfd.append(nfd_idx)
50
+ nfd_idx += len(nfd_char)
51
+
52
+ # Build: NFD index -> normalized char index
53
+ nfd_to_norm: list[int] = []
54
+ nfd_idx = 0
55
+ for norm_idx, char in enumerate(normalized):
56
+ nfd_char = unicodedata.normalize("NFD", char)
57
+ for _ in nfd_char:
58
+ nfd_to_norm.append(norm_idx)
59
+ nfd_idx += len(nfd_char)
60
+
61
+ # Build: normalized index -> original index
62
+ # For each normalized position, find which original character maps to it
63
+ norm_to_orig: list[int] = []
64
+ orig_char_idx = 0
65
+ orig_nfd_pos = 0
66
+ for norm_idx in range(len(normalized)):
67
+ nfd_of_norm_char = unicodedata.normalize("NFD", normalized[norm_idx])
68
+ nfd_start = sum(
69
+ len(unicodedata.normalize("NFD", normalized[i]))
70
+ for i in range(norm_idx)
71
+ )
72
+ # Find which original character this NFD position belongs to
73
+ target_orig = 0
74
+ running = 0
75
+ for oi, oc in enumerate(original):
76
+ nfd_oc = unicodedata.normalize("NFD", oc)
77
+ if running + len(nfd_oc) > nfd_start:
78
+ target_orig = oi
79
+ break
80
+ running += len(nfd_oc)
81
+ norm_to_orig.append(target_orig)
82
+
83
+ return norm_to_orig
84
+
85
+
86
def map_offset_to_original(
    offset: int, mapping: list[int] | None
) -> int:
    """Translate an offset in normalized text to an offset in the original.

    Args:
        offset: Position in the normalized text.
        mapping: Normalized-to-original index list, or None when both
            texts share the same offsets.

    Returns:
        ``offset`` itself when there is no mapping (None or empty list past
        its end), ``mapping[offset]`` when the offset lies inside the
        mapping, and otherwise a linear extrapolation from the last entry.
    """
    if mapping is None:
        return offset
    size = len(mapping)
    if offset < size:
        return mapping[offset]
    if not mapping:
        return offset
    # Past the end of the mapping: continue one original position per
    # normalized position beyond the last mapped character.
    overshoot = offset - size + 1
    return mapping[-1] + overshoot
euredact/py.typed ADDED
File without changes
@@ -0,0 +1,5 @@
1
+ """EuRedact rule engine."""
2
+
3
+ from euredact.rules.engine import RuleEngine
4
+
5
+ __all__ = ["RuleEngine"]
@@ -0,0 +1 @@
1
+ """Country-specific PII pattern configurations."""
@@ -0,0 +1,31 @@
1
+ """Base country configuration class."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+ from euredact.types import EntityType
8
+
9
+
10
@dataclass
class PatternDef:
    """One detection pattern: an entity type plus the regex that finds it."""

    # Kind of PII this pattern detects.
    entity_type: EntityType
    # Regular-expression source text.
    pattern: str
    # Name of the validator to apply to matches (e.g. "luhn", "iban"),
    # or None when matches are not validated.
    validator: str | None = field(default=None)
    # Human-readable summary of what the pattern matches.
    description: str = field(default="")
    # Keywords associated with this pattern; presumably searched near a
    # match by the rule engine -- confirm against the engine.
    context_keywords: list[str] = field(default_factory=list)
    # Whether a match is only meaningful together with context.
    requires_context: bool = field(default=False)
20
+
21
+
22
@dataclass
class CountryConfig:
    """Base class holding the PII patterns of one country."""

    # Identifier of the configuration (e.g. "AT").
    code: str = field(default="")
    # Human-readable name (e.g. "Austria").
    name: str = field(default="")
    # Pattern definitions belonging to this configuration.
    patterns: list[PatternDef] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Hook for subclasses: override to populate ``code``, ``name`` and ``patterns``."""
        return None
@@ -0,0 +1,171 @@
1
+ """Shared (EU-wide) PII patterns: email, generic IBAN, international phone, dates, digital IDs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from euredact.rules.countries._base import CountryConfig, PatternDef
6
+ from euredact.types import EntityType
7
+
8
+
9
class SharedConfig(CountryConfig):
    """Patterns shared across all EU countries.

    Covers email, BIC, credit cards, VIN, IP/MAC addresses, IMEI, GPS
    coordinates, UUIDs, social handles and context-dependent dates.
    """

    def __post_init__(self) -> None:
        """Populate ``code``, ``name`` and the shared pattern list."""
        self.code = "SHARED"
        self.name = "Shared EU Patterns"
        self.patterns = [
            # --- Email ---
            PatternDef(
                entity_type=EntityType.EMAIL,
                pattern=r"\b[\w._%+\-]+@[\w.\-]+\.[a-zA-Z]{2,}\b",
                validator=None,
                description="Email address (RFC 5322 simplified)",
            ),
            # --- BIC/SWIFT ---
            PatternDef(
                entity_type=EntityType.BIC,
                pattern=r"\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\b",
                validator="bic",
                description="BIC/SWIFT code (8 or 11 characters)",
            ),
            # --- Credit Card (Visa, Mastercard, Amex) ---
            # Digit groups may be separated by a single space or dash.
            PatternDef(
                entity_type=EntityType.CREDIT_CARD,
                pattern=(
                    r"\b(?:"
                    r"4[0-9]{3}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}"  # Visa
                    r"|5[1-5][0-9]{2}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}[\s\-]?[0-9]{4}"  # MC
                    r"|3[47][0-9]{2}[\s\-]?[0-9]{6}[\s\-]?[0-9]{5}"  # Amex
                    r")\b"
                ),
                validator="luhn",
                description="Credit card number (Visa, Mastercard, Amex) with Luhn",
            ),
            # --- VIN ---
            # 17 characters from A-Z/0-9, excluding the letters I, O and Q.
            PatternDef(
                entity_type=EntityType.VIN,
                pattern=r"\b[A-HJ-NPR-Z0-9]{17}\b",
                validator="vin",
                description="Vehicle Identification Number (ISO 3779)",
            ),
            # --- IPv4 ---
            PatternDef(
                entity_type=EntityType.IP_ADDRESS,
                pattern=(
                    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
                    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
                ),
                validator=None,
                description="IPv4 address",
            ),
            # --- IPv6 ---
            # Alternatives: full 8-group form, trailing "::", a single "::"
            # before the last group, and leading "::".
            # NOTE(review): "\b" next to a ":" only matches when the other
            # side is a word character, so a leading "::" (e.g. "::1" after
            # whitespace) or a trailing "::" followed by a non-word character
            # presumably does not match -- confirm whether this is intended.
            PatternDef(
                entity_type=EntityType.IPV6_ADDRESS,
                pattern=(
                    r"\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b"
                    r"|\b(?:[0-9a-fA-F]{1,4}:){1,7}:\b"
                    r"|\b(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}\b"
                    r"|\b::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\b"
                ),
                validator=None,
                description="IPv6 address",
            ),
            # --- MAC Address (colon/dash: 00:1A:2B:3C:4D:5E) ---
            PatternDef(
                entity_type=EntityType.MAC_ADDRESS,
                pattern=r"\b(?:[0-9a-fA-F]{2}[:\-]){5}[0-9a-fA-F]{2}\b",
                validator=None,
                description="MAC address (colon or dash separated)",
            ),
            # MAC Address Cisco format (dot: 0670.3A83.C107)
            PatternDef(
                entity_type=EntityType.MAC_ADDRESS,
                pattern=r"\b[0-9a-fA-F]{4}\.[0-9a-fA-F]{4}\.[0-9a-fA-F]{4}\b",
                validator=None,
                description="MAC address (Cisco dot notation)",
            ),
            # --- IMEI (15 digits, Luhn check) ---
            # Luhn + TAC validation is strong enough to not require context
            PatternDef(
                entity_type=EntityType.IMEI,
                pattern=r"\b\d{15}\b",
                validator="imei",
                description="IMEI — 15 digits with Luhn check",
            ),
            # IMEI with separators: XX-XXXXXX-XXXXXX-X
            # (each separator is a dash or a single whitespace character)
            PatternDef(
                entity_type=EntityType.IMEI,
                pattern=r"\b\d{2}[\-\s]\d{6}[\-\s]\d{6}[\-\s]\d\b",
                validator="imei",
                description="IMEI — formatted with separators",
            ),
            # --- GPS Coordinates (decimal degrees) ---
            # Latitude: -90 to 90, Longitude: -180 to 180, at least 4 decimal places.
            # The two values are separated by ",", ";" or "/" with optional whitespace.
            PatternDef(
                entity_type=EntityType.GPS_COORDINATES,
                pattern=(
                    r"-?(?:[1-8]?\d(?:\.\d{4,})|90(?:\.0{4,}))"
                    r"\s*[,;/]\s*"
                    r"-?(?:1[0-7]\d(?:\.\d{4,})|180(?:\.0{4,})|\d{1,2}(?:\.\d{4,}))"
                ),
                validator=None,
                description="GPS coordinates — decimal degrees (lat, lon) with 4+ decimals",
            ),
            # --- UUID (version 1–5, standard 8-4-4-4-12 hex format) ---
            PatternDef(
                entity_type=EntityType.UUID,
                pattern=r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b",
                validator=None,
                description="UUID (RFC 4122, versions 1-5)",
            ),
            # --- Social Media Handles (@username) ---
            # The "@" must not be preceded by a word character (this keeps
            # email addresses out). The name is a letter followed by 1-29
            # letters, digits, underscores or dots, i.e. 2-30 characters.
            PatternDef(
                entity_type=EntityType.SOCIAL_HANDLE,
                pattern=r"(?<!\w)@[a-zA-Z][a-zA-Z0-9_.]{1,29}\b",
                validator=None,
                description="Social media handle (@username)",
            ),
            # --- Date of Birth (EU formats, requires context) ---
            # DD, MM and a 19xx/20xx year separated by "/", "." or "-".
            PatternDef(
                entity_type=EntityType.DOB,
                pattern=r"\b(?:0[1-9]|[12][0-9]|3[01])[/.\-](?:0[1-9]|1[0-2])[/.\-](?:19|20)\d{2}\b",
                validator=None,
                description="Date in DD/MM/YYYY format (EU standard) — requires context",
                context_keywords=[
                    "geboren", "geboortedatum", "date de naissance", "né le", "née le",
                    "né(e) le", "nee le", "nee(e) le",
                    "date of birth", "DOB", "Geburtsdatum", "geboren am", "geboren op",
                    "nascido", "nacido", "data di nascita", "nato il", "nata il",
                    "geb.", "geb.datum", "geb ", "birth date", "birthday",
                    "naissance", "geboorte", "geburtstag",
                ],
                requires_context=True,
            ),
            # --- Date of Death (requires context) ---
            # Same regex as the DD/MM/YYYY date of birth above; only the
            # entity type and the context keywords differ.
            PatternDef(
                entity_type=EntityType.DATE_OF_DEATH,
                pattern=r"\b(?:0[1-9]|[12][0-9]|3[01])[/.\-](?:0[1-9]|1[0-2])[/.\-](?:19|20)\d{2}\b",
                validator=None,
                description="Date of death — requires context",
                context_keywords=[
                    "overleden", "overlijdensdatum", "date de décès", "décédé le",
                    "date of death", "Sterbedatum", "verstorben am", "gestorven",
                    "death date", "died on", "mort le", "décès",
                ],
                requires_context=True,
            ),
            # --- ISO date format YYYY-MM-DD (DOB, requires context) ---
            # Also accepts "/" and "." as separators. The keyword list is a
            # subset of the DD/MM/YYYY one above.
            PatternDef(
                entity_type=EntityType.DOB,
                pattern=r"\b(?:19|20)\d{2}[/.\-](?:0[1-9]|1[0-2])[/.\-](?:0[1-9]|[12][0-9]|3[01])\b",
                validator=None,
                description="Date in YYYY-MM-DD format (ISO) — requires context",
                context_keywords=[
                    "geboren", "geboortedatum", "date de naissance", "né le", "née le",
                    "né(e) le", "nee le",
                    "date of birth", "DOB", "Geburtsdatum", "geboren am", "geboren op",
                    "geb.", "birth date", "birthday", "naissance", "geboorte",
                ],
                requires_context=True,
            ),
        ]
@@ -0,0 +1,82 @@
1
+ """Austria (AT) PII patterns."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from euredact.rules.countries._base import CountryConfig, PatternDef
6
+ from euredact.types import EntityType
7
+
8
+
9
class ATConfig(CountryConfig):
    """Austrian PII patterns: SVNR, IBAN, phone, etc."""

    def __post_init__(self) -> None:
        """Populate ``code``, ``name`` and the Austrian pattern list."""
        self.code = "AT"
        self.name = "Austria"
        self.patterns = [
            # --- SVNR (Sozialversicherungsnummer) — 10 digits ---
            # Written as 4 + 6 digits with an optional whitespace in between.
            PatternDef(
                entity_type=EntityType.NATIONAL_ID,
                pattern=r"\b\d{4}\s?\d{6}\b",
                validator="austrian_svnr",
                description="Austrian SVNR — 10 digits with check digit",
            ),
            # --- IBAN ---
            PatternDef(
                entity_type=EntityType.IBAN,
                pattern=r"\bAT\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\b",
                validator="iban",
                description="Austrian IBAN — AT + 18 digits",
            ),
            # --- VAT (UID) ---
            PatternDef(
                entity_type=EntityType.VAT,
                pattern=r"\bATU\d{8}\b",
                validator=None,
                description="Austrian VAT (UID) — ATU + 8 digits",
            ),
            # --- Phone (national) ---
            # Leading 0, 3-4 further digits, optional space/dash, 5-8 digits.
            PatternDef(
                entity_type=EntityType.PHONE,
                pattern=r"\b0\d{3,4}[\s\-]?\d{5,8}\b",
                validator=None,
                description="Austrian national phone",
            ),
            # --- Phone (international) ---
            PatternDef(
                entity_type=EntityType.PHONE,
                pattern=r"\+43\s?\d{3,4}[\s\-]?\d{5,8}",
                validator=None,
                description="Austrian international phone — +43",
            ),
            # --- License Plate ---
            # Flagged as requiring context (keywords below).
            PatternDef(
                entity_type=EntityType.LICENSE_PLATE,
                pattern=r"\b[A-ZÄÖÜ]{1,2}\s?\d{1,5}\s?[A-Z]{1,2}\b",
                validator=None,
                description="Austrian license plate — district + number + letters",
                context_keywords=[
                    "Kennzeichen", "Nummernschild", "Kfz-Kennzeichen",
                ],
                requires_context=True,
            ),
            # --- Postal Code ---
            # Any 4-digit number not starting with 0; flagged as requiring
            # context because the pattern alone is very broad.
            PatternDef(
                entity_type=EntityType.POSTAL_CODE,
                pattern=r"\b[1-9]\d{3}\b",
                validator=None,
                description="Austrian postal code — 4 digits",
                context_keywords=[
                    "PLZ", "Postleitzahl", "Adresse", "Anschrift",
                    "Straße", "Str.", "Gasse", "Weg", "Platz",
                    "Postal:", "Wohnort",
                ],
                requires_context=True,
            ),
            # Structural: ", XXXX City" — four digits preceded by ", " and
            # followed by a space and an ASCII capital letter. No context
            # keywords are attached to this variant.
            # NOTE(review): the lookahead uses [A-Z], so a place name that
            # starts with Ä/Ö/Ü is not matched — confirm whether intended.
            PatternDef(
                entity_type=EntityType.POSTAL_CODE,
                pattern=r"(?<=, )[1-9]\d{3}(?= [A-Z])",
                validator=None,
                description="Austrian postal code — in address structure",
            ),
        ]