inconnu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ from enum import StrEnum
2
+
3
+ from phonenumbers import Leniency, PhoneNumberMatcher
4
+ from spacy import load
5
+ from spacy.tokens import Doc, Span
6
+
7
+ from .interfaces import NERComponent, ProcessedData
8
+ from .patterns import EMAIL_ADDRESS_PATTERN_RE, IBAN_PATTERN_RE
9
+ from .utils import (
10
+ DefaultEntityLabel,
11
+ create_ner_component,
12
+ filter_overlapping_spans,
13
+ singleton,
14
+ )
15
+
16
+
17
class SpacyModels(StrEnum):
    """Names of the spaCy models this package can load, one per language."""

    # 'en_core_web_trf' is the most accurate model for name entity recognition;
    # the *_sm models below are the small/fast alternatives actually selected
    # in EntityRedactor.
    EN_CORE_WEB_TRF = "en_core_web_trf"
    DE_CORE_NEWS_SM = "de_core_news_sm"
    IT_CORE_NEWS_SM = "it_core_news_sm"
    EN_CORE_WEB_SM = "en_core_web_sm"


# Regions tried, in order, when scanning text for phone numbers.
SUPPORTED_REGIONS = ["DE", "CH", "GB", "IT", "US"]
26
+
27
+
28
def process_phone_number(doc: Doc) -> Doc:
    """Add PHONE_NUMBER entity spans to *doc* for every region in SUPPORTED_REGIONS.

    Matches found by the `phonenumbers` library are mapped back onto token
    boundaries via `doc.char_span`; duplicates across regions are skipped and
    overlaps with existing entities are resolved by `filter_overlapping_spans`.
    """
    already_seen: set = set()
    phone_spans: list[Span] = []

    for region in SUPPORTED_REGIONS:
        # Use stricter validation (Leniency.VALID) for most regions to avoid false
        # positives like German ZIP codes being detected as phone numbers. For US
        # numbers we relax the check to Leniency.POSSIBLE so that test numbers
        # (e.g. +1-555-123-4567) that are not allocated in real numbering plans
        # are still captured.
        strictness = Leniency.POSSIBLE if region == "US" else Leniency.VALID

        for hit in PhoneNumberMatcher(doc.text, region, leniency=strictness):
            candidate = doc.char_span(hit.start, hit.end)
            # char_span returns None when the match does not align with token
            # boundaries; equal spans found via another region are skipped.
            if not candidate or candidate in already_seen:
                continue
            already_seen.add(candidate)
            phone_spans.append(
                Span(
                    doc,
                    candidate.start,
                    candidate.end,
                    label=DefaultEntityLabel.PHONE_NUMBER,
                )
            )

    doc.ents = filter_overlapping_spans(list(doc.ents) + phone_spans)
    return doc
51
+
52
+
53
def person_with_title(doc: Doc) -> Doc:
    """Post-process PERSON/PER entities: drop pronoun/implausible spans and
    absorb a preceding courtesy or academic title (Dr., Mr., Ms.) into the span.

    Non-person entities pass through unchanged.
    """
    german_pronouns = {
        "ich",
        "du",
        "er",
        "sie",
        "wir",
        "ihr",
        "ihnen",
        "ihre",
        "mich",
        "dich",
        "ihm",
        "sein",
        "uns",
    }
    titles = (
        "Dr",
        "Dr.",
        "Mr",
        "Mr.",
        "Ms",
        "Ms.",
    )

    kept = []
    for ent in doc.ents:
        if not ent.label_.startswith("PER"):
            kept.append(ent)
            continue

        # Discard spans that contain any pronoun tokens – they are very
        # unlikely to be real names and pollute the PERSON index expected
        # by the unit-tests.
        if any(token.lower_ in german_pronouns for token in ent):
            continue

        stripped = ent.text.strip()
        # Heuristic: keep entity only if it looks like a real name:
        # * contains at least one whitespace (e.g. first + last name)
        # * or length >= 5 characters (e.g. 'Emma', 'Schmidt', 'Mustermann')
        # * or is explicitly whitelisted (e.g. 'Re')
        name_like = " " in stripped or len(stripped) >= 5 or stripped in {"Re"}
        if not name_like:
            continue

        # Handle optional titles (Dr., Mr., Ms.) that precede a PERSON.
        if ent.start != 0 and doc[ent.start - 1].text in titles:
            ent = Span(doc, ent.start - 1, ent.end, label=DefaultEntityLabel.PERSON)

        kept.append(ent)

    doc.ents = kept
    return doc
102
+
103
+
104
# NER components that should be added BEFORE the default NER component
# This is to ensure that the custom NER components are not overridden by the default NER component
# DE: The default NER component is 'de_core_news_md' which has a rule for 'PER' but it's not very good
# DE: Has a rule for 'MISC' which maps IBANs to 'MISC'
DEFAULT_CUSTOM_NER_COMPONENTS_BEFORE = [
    # Phone numbers use a processing function (region-aware validation via the
    # phonenumbers library) instead of a plain regex.
    NERComponent(
        processing_func=process_phone_number,
        label=DefaultEntityLabel.PHONE_NUMBER,
    ),
    NERComponent(
        pattern=EMAIL_ADDRESS_PATTERN_RE,
        label=DefaultEntityLabel.EMAIL,
    ),
    NERComponent(
        pattern=IBAN_PATTERN_RE,
        label=DefaultEntityLabel.IBAN,
    ),
]

# NER components that should be added AFTER the default NER component.
# Person titles should be added after the default NER component to avoid being overridden.
# We leverage the default NER component for the 'PER' label to get better results.
DEFAULT_CUSTOM_NER_COMPONENTS_AFTER = [
    NERComponent(
        before_ner=False,  # defaults to True
        processing_func=person_with_title,
        label=DefaultEntityLabel.PERSON,
    ),
]
133
+
134
+
135
# Spacy pipeline for entity redacting
@singleton
class EntityRedactor:
    """Redacts named entities from text using a spaCy pipeline.

    Decorated with ``@singleton``: one instance is cached per ``language``
    keyword, so the (expensive) spaCy model load happens at most once per
    language for the process lifetime.
    """

    __slots__ = ["nlp"]

    def __init__(
        self,
        *,
        custom_components: list[NERComponent] | None = None,
        language: str = "en",
    ):
        """Load the per-language spaCy model and register custom components.

        Args:
            custom_components: extra NER components appended after the defaults.
            language: ISO language code; unsupported codes fall back to English.
        """
        # Performance optimization: Load spaCy model only once per language
        # Loading spaCy models is an expensive operation in terms of time and memory
        # By using the singleton pattern, we ensure that we only load the model once per language
        # This significantly reduces initialization time for subsequent calls
        # Select appropriate model based on language
        match language:
            case "de":
                model_name = SpacyModels.DE_CORE_NEWS_SM
            case "en":
                model_name = SpacyModels.EN_CORE_WEB_SM
            case "it":
                model_name = SpacyModels.IT_CORE_NEWS_SM
            case _:
                # Default to English small model for unsupported languages
                model_name = SpacyModels.EN_CORE_WEB_SM

        self.nlp = load(
            model_name,
            disable=[
                "attribute_ruler",
                "lemmatizer",
                "tok2vec",
                "tagger",
                "parser",
            ],  # Disable everything except the NER component
        )
        self.add_custom_components(
            [
                *DEFAULT_CUSTOM_NER_COMPONENTS_BEFORE,
                *DEFAULT_CUSTOM_NER_COMPONENTS_AFTER,
            ]
        )

        if custom_components:
            self.add_custom_components(custom_components)

    def add_custom_components(self, components: list[NERComponent]) -> None:
        """Register each component with spaCy and insert it into the pipeline.

        Components with ``before_ner=True`` go to the very front; the rest are
        placed immediately after the built-in ``ner`` component.
        """
        for component in components:
            custom_ner_component_name = create_ner_component(**component._asdict())
            if component.before_ner:
                # Insert at the very beginning of the pipeline so that
                # user-supplied components take precedence over any built-in
                # rules that are also placed before the NER component (e.g.
                # phone, email). Using `first=True` guarantees execution order
                # regardless of what other components already exist.
                self.nlp.add_pipe(custom_ner_component_name, first=True)
            else:
                self.nlp.add_pipe(custom_ner_component_name, after="ner")

    def redact(
        self, *, text: str, deanonymize: bool = True
    ) -> tuple[str, dict[str, str]]:
        """Replace recognized entities in *text* with placeholders.

        When ``deanonymize`` is True, placeholders are numbered per label
        (e.g. ``[PERSON_0]``) so the returned mapping can be inverted later;
        otherwise a bare ``[LABEL]`` placeholder is used for every entity of
        that label (duplicate keys then collapse in the returned mapping).

        Returns:
            (redacted_text, mapping of placeholder -> original entity text)
        """
        redacted_text = text
        doc = self.nlp(text)
        entity_map = {}

        # Process in reverse to avoid index issues
        for ent in reversed(doc.ents):
            label = ent.label_
            # Normalize language-specific person tags (e.g. German 'PER') to PERSON.
            if label.startswith("PER"):
                label = DefaultEntityLabel.PERSON

            if label not in entity_map:
                entity_map[label] = []

            placeholder = f"[{label}]"
            if deanonymize:
                # NOTE(review): because iteration is reversed, index 0 is the
                # entity closest to the end of the text.
                placeholder = f"[{label}_{len(entity_map[label])}]"
            entity_map[label].append((ent.text, placeholder))

            # Splice the placeholder over the entity's character range.
            redacted_text = (
                redacted_text[: ent.start_char]
                + placeholder
                + redacted_text[ent.end_char :]
            )
        return redacted_text, {
            v[1]: v[0] for values in entity_map.values() for v in values
        }

    def deanonymize(self, *, processed_data: ProcessedData) -> str:
        """Restore original entity text by substituting each placeholder back in."""
        text = processed_data.redacted_text
        for placeholder, original in processed_data.entity_map.items():
            text = text.replace(placeholder, original)
        return text
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from re import Pattern
3
+ from typing import Callable, NamedTuple
4
+
5
+ from spacy.tokens import Doc
6
+
7
+
8
@dataclass
class ProcessedData:
    """Result bundle for one anonymization run."""

    entity_map: dict[str, str]  # placeholder -> original entity text
    processing_time_ms: float  # processing duration (milliseconds, per the name)
    redacted_text: str  # text with entities replaced by placeholders
    original_text: str  # input text, unmodified
    text_length: int  # length of the original text
    timestamp: str  # when the run happened; format set by the caller
    hashed_id: str  # opaque identifier for the processed document
17
+
18
+
19
class NERComponent(NamedTuple):
    """Spec for one custom NER pipeline component.

    Exactly one of ``processing_func`` or ``pattern`` should be provided:
    a processing function gets full control over the Doc, while a regex
    pattern is matched against the text and labeled with ``label``.
    """

    label: str  # entity label assigned to matches
    processing_func: Callable[[Doc], Doc] | None = None  # custom Doc processor
    pattern: Pattern | None = None  # regex fallback when no processing_func
    before_ner: bool = True  # insert before spaCy's built-in NER component
@@ -0,0 +1,144 @@
1
+ # https://github.com/Unstructured-IO/unstructured/blob/c27e0d0062a662ca377f4df9db3a9d9de26bfa55/unstructured/nlp/patterns.py
2
+ import re
3
+ from typing import Final
4
+
5
# US-style phone numbers, e.g. "(555) 123-4567 x99". The trailing "\s*$"
# anchors matches to the end of the string.
US_PHONE_NUMBERS_PATTERN = (
    r"(?:\+?(\d{1,3}))?[-. (]*(\d{3})?[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*$"
)
US_PHONE_NUMBERS_RE = re.compile(US_PHONE_NUMBERS_PATTERN)

# International space-separated form: "+<country> <group> <group...>".
PHONE_NUMBER_PATTERN = r"\+\d{1,3} \d{1,4} \d{1,4}(\d{1,4})*"
PHONE_NUMBER_PATTERN_RE = re.compile(PHONE_NUMBER_PATTERN)

# NOTE(robinson) - Based on this regex from regex101. Regex was updated to run fast
# and avoid catastrophic backtracking
# ref: https://regex101.com/library/oR3jU1?page=673
# NOTE(review): the "{" / "}" characters wrapping the state-name and the
# state-abbreviation alternations are *literal* braces in the regex, so a
# match requires a literal "{" in the input. This looks like a transcription
# artifact from the regex101 source -- verify against expected inputs.
US_CITY_STATE_ZIP_PATTERN = (
    r"(?i)\b(?:[A-Z][a-z.-]{1,15}[ ]?){1,5},\s?"
    r"(?:{Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida"
    r"|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland"
    r"|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|"
    r"New[ ]Hampshire|New[ ]Jersey|New[ ]Mexico|New[ ]York|North[ ]Carolina|North[ ]Dakota"
    r"|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode[ ]Island|South[ ]Carolina|South[ ]Dakota"
    r"|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West[ ]Virginia|Wisconsin|Wyoming}"
    r"|{AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN"
    r"|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|UT|VT|VI|VA|"
    r"WA|WV|WI|WY})(, |\s)?(?:\b\d{5}(?:-\d{4})?\b)"
)
US_CITY_STATE_ZIP_RE = re.compile(US_CITY_STATE_ZIP_PATTERN)
29
+
30
# Characters (plus two regex snippets) treated as list bullets.
UNICODE_BULLETS: Final[list[str]] = [
    "\u0095",
    "\u2022",
    "\u2023",
    "\u2043",
    "\u3164",
    "\u204c",
    "\u204d",
    "\u2219",
    "\u25cb",
    "\u25cf",
    "\u25d8",
    "\u25e6",
    "\u2619",
    "\u2765",
    "\u2767",
    "\u29be",
    "\u29bf",
    "\u002d",
    # NOTE(review): this empty-string entry creates an empty alternation
    # branch ("||") in BULLETS_PATTERN below, which matches at every position
    # -- confirm it is intentional before relying on these regexes.
    "",
    r"\*",
    "\x95",
    "·",
]
BULLETS_PATTERN = "|".join(UNICODE_BULLETS)
# A bullet that is not immediately followed by another bullet.
UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")
# zero-width positive lookahead so bullet characters will not be removed when using .split()
UNICODE_BULLETS_RE_0W = re.compile(f"(?={BULLETS_PATTERN})(?<!{BULLETS_PATTERN})")
# A lone "e" at the start of a line -- presumably an OCR artifact standing in
# for a bullet character; TODO confirm.
E_BULLET_PATTERN = re.compile(r"^e(?=\s)", re.MULTILINE)
59
+
60
# NOTE(klaijan) - Captures reference of format [1] or [i] or [a] at any point in the line.
REFERENCE_PATTERN = r"\[(?:[\d]+|[a-z]|[ivxlcdm])\]"
REFERENCE_PATTERN_RE = re.compile(REFERENCE_PATTERN)

# Up to three chained enumeration markers such as "1." or "12.3.".
# NOTE(review): "[a-z][A-Z]" matches a lowercase letter followed by an
# uppercase one (e.g. "aB"), not a single letter of either case; possibly
# "[a-zA-Z]" was intended -- confirm before changing.
ENUMERATED_BULLETS_RE = re.compile(r"(?:(?:\d{1,3}|[a-z][A-Z])\.?){1,3}")

# Leading header block of an email (Date/Message-ID/Subject/From/To lines).
# NOTE(review): the dots in "MIME-Version: 1.0" are unescaped and so match
# any character; harmless in practice but worth knowing.
EMAIL_HEAD_PATTERN = (
    r"(MIME-Version: 1.0(.*)?\n)?Date:.*\nMessage-ID:.*\nSubject:.*\nFrom:.*\nTo:.*"
)
EMAIL_HEAD_RE = re.compile(EMAIL_HEAD_PATTERN)

# IBAN: two-letter country code, two check digits, then 1-7 groups of up to
# four alphanumerics optionally separated by spaces or hyphens.
IBAN_PATTERN = r"\b[A-Z]{2}\d{2}(?:[ -]?[A-Z0-9]{1,4}){1,7}\b"
IBAN_PATTERN_RE = re.compile(IBAN_PATTERN)

# Helps split text by paragraphs. There must be one newline, with potential whitespace
# (including \r and \n chars) on either side
PARAGRAPH_PATTERN = r"\s*\n\s*"

PARAGRAPH_PATTERN_RE = re.compile(
    f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
)
DOUBLE_PARAGRAPH_PATTERN_RE = re.compile("(" + PARAGRAPH_PATTERN + "){2}")

# Captures all new line \n and keeps the \n as its own element,
# considers \n\n as two separate elements
LINE_BREAK = r"(?<=\n)"
LINE_BREAK_RE = re.compile(LINE_BREAK)

# NOTE(klaijan) - captures a line that does not end with a period (.)
ONE_LINE_BREAK_PARAGRAPH_PATTERN = r"^(?:(?!\.\s*$).)*$"
ONE_LINE_BREAK_PARAGRAPH_PATTERN_RE = re.compile(ONE_LINE_BREAK_PARAGRAPH_PATTERN)

# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
# A tuple of two alternatives (rough IPv4 form, partial IPv6-like form);
# joined into a single alternation below.
IP_ADDRESS_PATTERN = (
    r"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",
    "[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
)
IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")

# Dotted host name with exactly three labels, e.g. "host.example.com".
IP_ADDRESS_NAME_PATTERN = r"[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"

# Mapi ID example: 32.88.5467.123
MAPI_ID_PATTERN = r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"

# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
EMAIL_DATETIMETZ_PATTERN = (
    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"
)
EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
110
# Email addresses. The character classes list only lowercase letters, so the
# pattern is compiled case-insensitively to also catch mixed-case addresses
# such as "John.Doe@Example.COM" (previously silently missed).
EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN, re.IGNORECASE)
112
+
113
# A single non-word, non-space character at the very end of the string.
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)

# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
# format for document elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"

# (?s) dot all (including newline characters)
# \{(?=.*:) opening brace and at least one colon
# .*? any characters (non-greedy)
# (?:\}|$) non-capturing group that matches either the closing brace } or the end of
# the string to handle cases where the JSON is cut off
# | or
# \[(?s:.*?)\] matches the opening bracket [ in a JSON array and any characters inside the array
# (?:$|,|\]) non-capturing group that matches either the end of the string, a comma,
# or the closing bracket to handle cases where the JSON array is cut off
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"

# taken from https://stackoverflow.com/a/3845829/12406158
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"

# http(s) URL ending in a common raster-image extension (case-insensitive).
IMAGE_URL_PATTERN = (
    r"(?i)https?://"
    r"(?:[a-z0-9$_@.&+!*\\(\\),%-])+"
    r"(?:/[a-z0-9$_@.&+!*\\(\\),%-]*)*"
    r"\.(?:jpg|jpeg|png|gif|bmp|heic)"
)

# NOTE(klaijan) - only supports one level numbered list for now
# e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3
NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)"
NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN)
inconnu/nlp/utils.py ADDED
@@ -0,0 +1,97 @@
1
+ from enum import StrEnum
2
+ from functools import wraps
3
+ from re import Pattern
4
+ from threading import Lock
5
+ from typing import Callable
6
+
7
+ from spacy.language import Language
8
+ from spacy.tokens import Doc, Span
9
+
10
# Module-level registry shared by all singleton-decorated classes:
# one lock guarding creation, and a cache keyed by (class, language).
global_lock = Lock()
instances = {}


def singleton(cls):
    """Class decorator caching one instance per (class, ``language`` kwarg) pair.

    The ``language`` keyword argument (``None`` when absent) selects the cache
    slot, so e.g. one EntityRedactor exists per language. Creation is guarded
    with double-checked locking so concurrent first calls build the instance
    only once; subsequent calls hit the lock-free fast path.

    Note: the previous return annotation ``-> "cls"`` was a string literal,
    not a type, and has been removed.
    """

    @wraps(cls)
    def get_instance_by_language(*args, **kwargs):
        language: str | None = kwargs.get("language")
        key = (cls, language)

        # Fast path: no lock needed once the instance already exists.
        if key in instances:
            return instances[key]

        with global_lock:
            # Re-check under the lock: another thread may have just created it.
            if key not in instances:
                instances[key] = cls(*args, **kwargs)
            return instances[key]

    return get_instance_by_language
33
+
34
+
35
# https://github.com/explosion/spaCy/discussions/9147
# NER labels to identify entities
class DefaultEntityLabel(StrEnum):
    """Entity labels handled by the pipeline: spaCy defaults plus custom ones."""

    PHONE_NUMBER = "PHONE_NUMBER"  # custom ner component
    WORK_OF_ART = "WORK_OF_ART"
    LANGUAGE = "LANGUAGE"
    PRODUCT = "PRODUCT"
    PERSON = "PERSON"
    EMAIL = "EMAIL"  # custom ner component
    EVENT = "EVENT"
    TIME = "TIME"
    DATE = "DATE"
    NORP = "NORP"  # nationality, religious or political groups
    MISC = "MISC"  # misc for DE language
    IBAN = "IBAN"  # custom ner component
    LAW = "LAW"
    LOC = "LOC"
    ORG = "ORG"
    GPE = "GPE"
    FAC = "FAC"
    PER = "PER"  # person for DE language
56
+
57
+
58
def filter_overlapping_spans(spans):
    """Return a non-overlapping subset of *spans*, greedily by start index.

    Spans are visited in ascending start order; a span is kept only if it
    starts at or after the end of the last kept span.
    """
    kept = []
    boundary = -1

    for candidate in sorted(spans, key=lambda s: s.start):
        if candidate.start < boundary:
            continue  # overlaps the previously kept span
        kept.append(candidate)
        boundary = candidate.end

    return kept
69
+
70
+
71
def create_ner_component(
    *,
    processing_func: Callable[[Doc], Doc] | None = None,
    pattern: Pattern | None = None,
    label: DefaultEntityLabel,
    **kwargs,
) -> str:
    """Register a spaCy pipeline component for *label* and return its name.

    When ``processing_func`` is given it handles the Doc entirely; otherwise
    ``pattern`` is matched against the text and each token-aligned match is
    added as an entity span labeled *label*. Extra kwargs are ignored.
    """
    component_name = f"{label.lower()}_ner_component"

    @Language.component(component_name)
    def custom_ner_component(doc: Doc) -> Doc:
        if processing_func:
            return processing_func(doc)
        if not pattern:
            raise ValueError("Pattern is required if processing_func is not provided.")

        # char_span yields None for matches that cross token boundaries;
        # those are dropped by the walrus-guarded comprehension.
        matched = [
            Span(doc, aligned.start, aligned.end, label=label)
            for hit in pattern.finditer(doc.text)
            if (aligned := doc.char_span(*hit.span()))
        ]
        doc.ents = filter_overlapping_spans(list(doc.ents) + matched)
        return doc

    return component_name