inconnu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ from enum import StrEnum
2
+
3
+ from phonenumbers import Leniency, PhoneNumberMatcher
4
+ from spacy import load
5
+ from spacy.tokens import Doc, Span
6
+
7
+ from .interfaces import NERComponent, ProcessedData
8
+ from .patterns import EMAIL_ADDRESS_PATTERN_RE, IBAN_PATTERN_RE
9
+ from .utils import (
10
+ DefaultEntityLabel,
11
+ create_ner_component,
12
+ filter_overlapping_spans,
13
+ singleton,
14
+ )
15
+
16
+
17
class SpacyModels(StrEnum):
    """Names of the spaCy models this package can load, one per language."""

    # 'en_core_web_trf' is the most accurate model for name entity recognition;
    # the *_sm models below are the small/fast alternatives actually selected
    # in EntityRedactor.
    EN_CORE_WEB_TRF = "en_core_web_trf"
    DE_CORE_NEWS_SM = "de_core_news_sm"
    IT_CORE_NEWS_SM = "it_core_news_sm"
    EN_CORE_WEB_SM = "en_core_web_sm"


# Regions tried, in order, when scanning text for phone numbers.
SUPPORTED_REGIONS = ["DE", "CH", "GB", "IT", "US"]
26
+
27
+
28
def process_phone_number(doc: Doc) -> Doc:
    """Add PHONE_NUMBER entity spans to *doc* for every region in SUPPORTED_REGIONS.

    Matches found by the `phonenumbers` library are mapped back onto token
    boundaries via `doc.char_span`; duplicates across regions are skipped and
    overlaps with existing entities are resolved by `filter_overlapping_spans`.
    """
    already_seen: set = set()
    phone_spans: list[Span] = []

    for region in SUPPORTED_REGIONS:
        # Use stricter validation (Leniency.VALID) for most regions to avoid false
        # positives like German ZIP codes being detected as phone numbers. For US
        # numbers we relax the check to Leniency.POSSIBLE so that test numbers
        # (e.g. +1-555-123-4567) that are not allocated in real numbering plans
        # are still captured.
        strictness = Leniency.POSSIBLE if region == "US" else Leniency.VALID

        for hit in PhoneNumberMatcher(doc.text, region, leniency=strictness):
            candidate = doc.char_span(hit.start, hit.end)
            # char_span returns None when the match does not align with token
            # boundaries; equal spans found via another region are skipped.
            if not candidate or candidate in already_seen:
                continue
            already_seen.add(candidate)
            phone_spans.append(
                Span(
                    doc,
                    candidate.start,
                    candidate.end,
                    label=DefaultEntityLabel.PHONE_NUMBER,
                )
            )

    doc.ents = filter_overlapping_spans(list(doc.ents) + phone_spans)
    return doc
51
+
52
+
53
def person_with_title(doc: Doc) -> Doc:
    """Post-process PERSON/PER entities: drop pronoun/implausible spans and
    absorb a preceding courtesy or academic title (Dr., Mr., Ms.) into the span.

    Non-person entities pass through unchanged.
    """
    german_pronouns = {
        "ich",
        "du",
        "er",
        "sie",
        "wir",
        "ihr",
        "ihnen",
        "ihre",
        "mich",
        "dich",
        "ihm",
        "sein",
        "uns",
    }
    titles = (
        "Dr",
        "Dr.",
        "Mr",
        "Mr.",
        "Ms",
        "Ms.",
    )

    kept = []
    for ent in doc.ents:
        if not ent.label_.startswith("PER"):
            kept.append(ent)
            continue

        # Discard spans that contain any pronoun tokens – they are very
        # unlikely to be real names and pollute the PERSON index expected
        # by the unit-tests.
        if any(token.lower_ in german_pronouns for token in ent):
            continue

        stripped = ent.text.strip()
        # Heuristic: keep entity only if it looks like a real name:
        # * contains at least one whitespace (e.g. first + last name)
        # * or length >= 5 characters (e.g. 'Emma', 'Schmidt', 'Mustermann')
        # * or is explicitly whitelisted (e.g. 'Re')
        name_like = " " in stripped or len(stripped) >= 5 or stripped in {"Re"}
        if not name_like:
            continue

        # Handle optional titles (Dr., Mr., Ms.) that precede a PERSON.
        if ent.start != 0 and doc[ent.start - 1].text in titles:
            ent = Span(doc, ent.start - 1, ent.end, label=DefaultEntityLabel.PERSON)

        kept.append(ent)

    doc.ents = kept
    return doc
102
+
103
+
104
# NER components that should be added BEFORE the default NER component
# This is to ensure that the custom NER components are not overridden by the default NER component
# DE: The default NER component is 'de_core_news_md' which has a rule for 'PER' but it's not very good
# DE: Has a rule for 'MISC' which maps IBANs to 'MISC'
DEFAULT_CUSTOM_NER_COMPONENTS_BEFORE = [
    # Phone numbers use a processing function (region-aware validation via the
    # phonenumbers library) instead of a plain regex.
    NERComponent(
        processing_func=process_phone_number,
        label=DefaultEntityLabel.PHONE_NUMBER,
    ),
    NERComponent(
        pattern=EMAIL_ADDRESS_PATTERN_RE,
        label=DefaultEntityLabel.EMAIL,
    ),
    NERComponent(
        pattern=IBAN_PATTERN_RE,
        label=DefaultEntityLabel.IBAN,
    ),
]

# NER components that should be added AFTER the default NER component.
# Person titles should be added after the default NER component to avoid being overridden.
# We leverage the default NER component for the 'PER' label to get better results.
DEFAULT_CUSTOM_NER_COMPONENTS_AFTER = [
    NERComponent(
        before_ner=False,  # defaults to True
        processing_func=person_with_title,
        label=DefaultEntityLabel.PERSON,
    ),
]
133
+
134
+
135
# Spacy pipeline for entity redacting
@singleton
class EntityRedactor:
    """Redacts named entities from text using a spaCy pipeline.

    Decorated with ``@singleton``: one instance is cached per ``language``
    keyword, so the (expensive) spaCy model load happens at most once per
    language for the process lifetime.
    """

    __slots__ = ["nlp"]

    def __init__(
        self,
        *,
        custom_components: list[NERComponent] | None = None,
        language: str = "en",
    ):
        """Load the per-language spaCy model and register custom components.

        Args:
            custom_components: extra NER components appended after the defaults.
            language: ISO language code; unsupported codes fall back to English.
        """
        # Performance optimization: Load spaCy model only once per language
        # Loading spaCy models is an expensive operation in terms of time and memory
        # By using the singleton pattern, we ensure that we only load the model once per language
        # This significantly reduces initialization time for subsequent calls
        # Select appropriate model based on language
        match language:
            case "de":
                model_name = SpacyModels.DE_CORE_NEWS_SM
            case "en":
                model_name = SpacyModels.EN_CORE_WEB_SM
            case "it":
                model_name = SpacyModels.IT_CORE_NEWS_SM
            case _:
                # Default to English small model for unsupported languages
                model_name = SpacyModels.EN_CORE_WEB_SM

        self.nlp = load(
            model_name,
            disable=[
                "attribute_ruler",
                "lemmatizer",
                "tok2vec",
                "tagger",
                "parser",
            ],  # Disable everything except the NER component
        )
        self.add_custom_components(
            [
                *DEFAULT_CUSTOM_NER_COMPONENTS_BEFORE,
                *DEFAULT_CUSTOM_NER_COMPONENTS_AFTER,
            ]
        )

        if custom_components:
            self.add_custom_components(custom_components)

    def add_custom_components(self, components: list[NERComponent]) -> None:
        """Register each component with spaCy and insert it into the pipeline.

        Components with ``before_ner=True`` go to the very front; the rest are
        placed immediately after the built-in ``ner`` component.
        """
        for component in components:
            custom_ner_component_name = create_ner_component(**component._asdict())
            if component.before_ner:
                # Insert at the very beginning of the pipeline so that
                # user-supplied components take precedence over any built-in
                # rules that are also placed before the NER component (e.g.
                # phone, email). Using `first=True` guarantees execution order
                # regardless of what other components already exist.
                self.nlp.add_pipe(custom_ner_component_name, first=True)
            else:
                self.nlp.add_pipe(custom_ner_component_name, after="ner")

    def redact(
        self, *, text: str, deanonymize: bool = True
    ) -> tuple[str, dict[str, str]]:
        """Replace recognized entities in *text* with placeholders.

        When ``deanonymize`` is True, placeholders are numbered per label
        (e.g. ``[PERSON_0]``) so the returned mapping can be inverted later;
        otherwise a bare ``[LABEL]`` placeholder is used for every entity of
        that label (duplicate keys then collapse in the returned mapping).

        Returns:
            (redacted_text, mapping of placeholder -> original entity text)
        """
        redacted_text = text
        doc = self.nlp(text)
        entity_map = {}

        # Process in reverse to avoid index issues
        for ent in reversed(doc.ents):
            label = ent.label_
            # Normalize language-specific person tags (e.g. German 'PER') to PERSON.
            if label.startswith("PER"):
                label = DefaultEntityLabel.PERSON

            if label not in entity_map:
                entity_map[label] = []

            placeholder = f"[{label}]"
            if deanonymize:
                # NOTE(review): because iteration is reversed, index 0 is the
                # entity closest to the end of the text.
                placeholder = f"[{label}_{len(entity_map[label])}]"
            entity_map[label].append((ent.text, placeholder))

            # Splice the placeholder over the entity's character range.
            redacted_text = (
                redacted_text[: ent.start_char]
                + placeholder
                + redacted_text[ent.end_char :]
            )
        return redacted_text, {
            v[1]: v[0] for values in entity_map.values() for v in values
        }

    def deanonymize(self, *, processed_data: ProcessedData) -> str:
        """Restore original entity text by substituting each placeholder back in."""
        text = processed_data.redacted_text
        for placeholder, original in processed_data.entity_map.items():
            text = text.replace(placeholder, original)
        return text
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from re import Pattern
3
+ from typing import Callable, NamedTuple
4
+
5
+ from spacy.tokens import Doc
6
+
7
+
8
@dataclass
class ProcessedData:
    """Result bundle for one anonymization run."""

    entity_map: dict[str, str]  # placeholder -> original entity text
    processing_time_ms: float  # processing duration (milliseconds, per the name)
    redacted_text: str  # text with entities replaced by placeholders
    original_text: str  # input text, unmodified
    text_length: int  # length of the original text
    timestamp: str  # when the run happened; format set by the caller
    hashed_id: str  # opaque identifier for the processed document
17
+
18
+
19
class NERComponent(NamedTuple):
    """Spec for one custom NER pipeline component.

    Exactly one of ``processing_func`` or ``pattern`` should be provided:
    a processing function gets full control over the Doc, while a regex
    pattern is matched against the text and labeled with ``label``.
    """

    label: str  # entity label assigned to matches
    processing_func: Callable[[Doc], Doc] | None = None  # custom Doc processor
    pattern: Pattern | None = None  # regex fallback when no processing_func
    before_ner: bool = True  # insert before spaCy's built-in NER component
@@ -0,0 +1,144 @@
1
+ # https://github.com/Unstructured-IO/unstructured/blob/c27e0d0062a662ca377f4df9db3a9d9de26bfa55/unstructured/nlp/patterns.py
2
+ import re
3
+ from typing import Final
4
+
5
# US-style phone numbers, e.g. "(555) 123-4567 x99". The trailing "\s*$"
# anchors matches to the end of the string.
US_PHONE_NUMBERS_PATTERN = (
    r"(?:\+?(\d{1,3}))?[-. (]*(\d{3})?[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*$"
)
US_PHONE_NUMBERS_RE = re.compile(US_PHONE_NUMBERS_PATTERN)

# International space-separated form: "+<country> <group> <group...>".
PHONE_NUMBER_PATTERN = r"\+\d{1,3} \d{1,4} \d{1,4}(\d{1,4})*"
PHONE_NUMBER_PATTERN_RE = re.compile(PHONE_NUMBER_PATTERN)

# NOTE(robinson) - Based on this regex from regex101. Regex was updated to run fast
# and avoid catastrophic backtracking
# ref: https://regex101.com/library/oR3jU1?page=673
# NOTE(review): the "{" / "}" characters wrapping the state-name and the
# state-abbreviation alternations are *literal* braces in the regex, so a
# match requires a literal "{" in the input. This looks like a transcription
# artifact from the regex101 source -- verify against expected inputs.
US_CITY_STATE_ZIP_PATTERN = (
    r"(?i)\b(?:[A-Z][a-z.-]{1,15}[ ]?){1,5},\s?"
    r"(?:{Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida"
    r"|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland"
    r"|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|"
    r"New[ ]Hampshire|New[ ]Jersey|New[ ]Mexico|New[ ]York|North[ ]Carolina|North[ ]Dakota"
    r"|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode[ ]Island|South[ ]Carolina|South[ ]Dakota"
    r"|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West[ ]Virginia|Wisconsin|Wyoming}"
    r"|{AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FM|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|ME|MH|MD|MA|MI|MN"
    r"|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PW|PA|PR|RI|SC|SD|TN|TX|UT|VT|VI|VA|"
    r"WA|WV|WI|WY})(, |\s)?(?:\b\d{5}(?:-\d{4})?\b)"
)
US_CITY_STATE_ZIP_RE = re.compile(US_CITY_STATE_ZIP_PATTERN)
29
+
30
# Characters (plus two regex snippets) treated as list bullets.
UNICODE_BULLETS: Final[list[str]] = [
    "\u0095",
    "\u2022",
    "\u2023",
    "\u2043",
    "\u3164",
    "\u204c",
    "\u204d",
    "\u2219",
    "\u25cb",
    "\u25cf",
    "\u25d8",
    "\u25e6",
    "\u2619",
    "\u2765",
    "\u2767",
    "\u29be",
    "\u29bf",
    "\u002d",
    # NOTE(review): this empty-string entry creates an empty alternation
    # branch ("||") in BULLETS_PATTERN below, which matches at every position
    # -- confirm it is intentional before relying on these regexes.
    "",
    r"\*",
    "\x95",
    "·",
]
BULLETS_PATTERN = "|".join(UNICODE_BULLETS)
# A bullet that is not immediately followed by another bullet.
UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")
# zero-width positive lookahead so bullet characters will not be removed when using .split()
UNICODE_BULLETS_RE_0W = re.compile(f"(?={BULLETS_PATTERN})(?<!{BULLETS_PATTERN})")
# A lone "e" at the start of a line -- presumably an OCR artifact standing in
# for a bullet character; TODO confirm.
E_BULLET_PATTERN = re.compile(r"^e(?=\s)", re.MULTILINE)
59
+
60
# NOTE(klaijan) - Captures reference of format [1] or [i] or [a] at any point in the line.
REFERENCE_PATTERN = r"\[(?:[\d]+|[a-z]|[ivxlcdm])\]"
REFERENCE_PATTERN_RE = re.compile(REFERENCE_PATTERN)

# Up to three chained enumeration markers such as "1." or "12.3.".
# NOTE(review): "[a-z][A-Z]" matches a lowercase letter followed by an
# uppercase one (e.g. "aB"), not a single letter of either case; possibly
# "[a-zA-Z]" was intended -- confirm before changing.
ENUMERATED_BULLETS_RE = re.compile(r"(?:(?:\d{1,3}|[a-z][A-Z])\.?){1,3}")

# Leading header block of an email (Date/Message-ID/Subject/From/To lines).
# NOTE(review): the dots in "MIME-Version: 1.0" are unescaped and so match
# any character; harmless in practice but worth knowing.
EMAIL_HEAD_PATTERN = (
    r"(MIME-Version: 1.0(.*)?\n)?Date:.*\nMessage-ID:.*\nSubject:.*\nFrom:.*\nTo:.*"
)
EMAIL_HEAD_RE = re.compile(EMAIL_HEAD_PATTERN)

# IBAN: two-letter country code, two check digits, then 1-7 groups of up to
# four alphanumerics optionally separated by spaces or hyphens.
IBAN_PATTERN = r"\b[A-Z]{2}\d{2}(?:[ -]?[A-Z0-9]{1,4}){1,7}\b"
IBAN_PATTERN_RE = re.compile(IBAN_PATTERN)

# Helps split text by paragraphs. There must be one newline, with potential whitespace
# (including \r and \n chars) on either side
PARAGRAPH_PATTERN = r"\s*\n\s*"

PARAGRAPH_PATTERN_RE = re.compile(
    f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
)
DOUBLE_PARAGRAPH_PATTERN_RE = re.compile("(" + PARAGRAPH_PATTERN + "){2}")

# Captures all new line \n and keeps the \n as its own element,
# considers \n\n as two separate elements
LINE_BREAK = r"(?<=\n)"
LINE_BREAK_RE = re.compile(LINE_BREAK)

# NOTE(klaijan) - captures a line that does not end with a period (.)
ONE_LINE_BREAK_PARAGRAPH_PATTERN = r"^(?:(?!\.\s*$).)*$"
ONE_LINE_BREAK_PARAGRAPH_PATTERN_RE = re.compile(ONE_LINE_BREAK_PARAGRAPH_PATTERN)

# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
# A tuple of two alternatives (rough IPv4 form, partial IPv6-like form);
# joined into a single alternation below.
IP_ADDRESS_PATTERN = (
    r"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",
    "[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
)
IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")

# Dotted host name with exactly three labels, e.g. "host.example.com".
IP_ADDRESS_NAME_PATTERN = r"[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"

# Mapi ID example: 32.88.5467.123
MAPI_ID_PATTERN = r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"

# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
EMAIL_DATETIMETZ_PATTERN = (
    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"
)
EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
110
# Email addresses. The character classes list only lowercase letters, so the
# pattern is compiled case-insensitively to also catch mixed-case addresses
# such as "John.Doe@Example.COM" (previously silently missed).
EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN, re.IGNORECASE)
112
+
113
# A single non-word, non-space character at the very end of the string.
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)

# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
# format for document elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"

# (?s) dot all (including newline characters)
# \{(?=.*:) opening brace and at least one colon
# .*? any characters (non-greedy)
# (?:\}|$) non-capturing group that matches either the closing brace } or the end of
# the string to handle cases where the JSON is cut off
# | or
# \[(?s:.*?)\] matches the opening bracket [ in a JSON array and any characters inside the array
# (?:$|,|\]) non-capturing group that matches either the end of the string, a comma,
# or the closing bracket to handle cases where the JSON array is cut off
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"

# taken from https://stackoverflow.com/a/3845829/12406158
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"

# http(s) URL ending in a common raster-image extension (case-insensitive).
IMAGE_URL_PATTERN = (
    r"(?i)https?://"
    r"(?:[a-z0-9$_@.&+!*\\(\\),%-])+"
    r"(?:/[a-z0-9$_@.&+!*\\(\\),%-]*)*"
    r"\.(?:jpg|jpeg|png|gif|bmp|heic)"
)

# NOTE(klaijan) - only supports one level numbered list for now
# e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3
NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)"
NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN)
inconnu/nlp/utils.py ADDED
@@ -0,0 +1,97 @@
1
+ from enum import StrEnum
2
+ from functools import wraps
3
+ from re import Pattern
4
+ from threading import Lock
5
+ from typing import Callable
6
+
7
+ from spacy.language import Language
8
+ from spacy.tokens import Doc, Span
9
+
10
# Module-level registry shared by all singleton-decorated classes:
# one lock guarding creation, and a cache keyed by (class, language).
global_lock = Lock()
instances = {}


def singleton(cls):
    """Class decorator caching one instance per (class, ``language`` kwarg) pair.

    The ``language`` keyword argument (``None`` when absent) selects the cache
    slot, so e.g. one EntityRedactor exists per language. Creation is guarded
    with double-checked locking so concurrent first calls build the instance
    only once; subsequent calls hit the lock-free fast path.

    Note: the previous return annotation ``-> "cls"`` was a string literal,
    not a type, and has been removed.
    """

    @wraps(cls)
    def get_instance_by_language(*args, **kwargs):
        language: str | None = kwargs.get("language")
        key = (cls, language)

        # Fast path: no lock needed once the instance already exists.
        if key in instances:
            return instances[key]

        with global_lock:
            # Re-check under the lock: another thread may have just created it.
            if key not in instances:
                instances[key] = cls(*args, **kwargs)
            return instances[key]

    return get_instance_by_language
33
+
34
+
35
# https://github.com/explosion/spaCy/discussions/9147
# NER labels to identify entities
class DefaultEntityLabel(StrEnum):
    """Entity labels handled by the pipeline: spaCy defaults plus custom ones."""

    PHONE_NUMBER = "PHONE_NUMBER"  # custom ner component
    WORK_OF_ART = "WORK_OF_ART"
    LANGUAGE = "LANGUAGE"
    PRODUCT = "PRODUCT"
    PERSON = "PERSON"
    EMAIL = "EMAIL"  # custom ner component
    EVENT = "EVENT"
    TIME = "TIME"
    DATE = "DATE"
    NORP = "NORP"  # nationality, religious or political groups
    MISC = "MISC"  # misc for DE language
    IBAN = "IBAN"  # custom ner component
    LAW = "LAW"
    LOC = "LOC"
    ORG = "ORG"
    GPE = "GPE"
    FAC = "FAC"
    PER = "PER"  # person for DE language
56
+
57
+
58
def filter_overlapping_spans(spans):
    """Return a non-overlapping subset of *spans*, greedily by start index.

    Spans are visited in ascending start order; a span is kept only if it
    starts at or after the end of the last kept span.
    """
    kept = []
    boundary = -1

    for candidate in sorted(spans, key=lambda s: s.start):
        if candidate.start < boundary:
            continue  # overlaps the previously kept span
        kept.append(candidate)
        boundary = candidate.end

    return kept
69
+
70
+
71
def create_ner_component(
    *,
    processing_func: Callable[[Doc], Doc] | None = None,
    pattern: Pattern | None = None,
    label: DefaultEntityLabel,
    **kwargs,
) -> str:
    """Register a spaCy pipeline component for *label* and return its name.

    When ``processing_func`` is given it handles the Doc entirely; otherwise
    ``pattern`` is matched against the text and each token-aligned match is
    added as an entity span labeled *label*. Extra kwargs are ignored.
    """
    component_name = f"{label.lower()}_ner_component"

    @Language.component(component_name)
    def custom_ner_component(doc: Doc) -> Doc:
        if processing_func:
            return processing_func(doc)
        if not pattern:
            raise ValueError("Pattern is required if processing_func is not provided.")

        # char_span yields None for matches that cross token boundaries;
        # those are dropped by the walrus-guarded comprehension.
        matched = [
            Span(doc, aligned.start, aligned.end, label=label)
            for hit in pattern.finditer(doc.text)
            if (aligned := doc.char_span(*hit.span()))
        ]
        doc.ents = filter_overlapping_spans(list(doc.ents) + matched)
        return doc

    return component_name