dataknobs_xization-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,448 @@
+ import math
+ import re
+ from itertools import product
+ from typing import Sequence, Set
+
+ # squash whitespace: to collapse consecutive whitespace to a single space by
+ # x.sub(' ', text)
+ SQUASH_WS_RE = re.compile(r"\s+")
+
+
+ # to identify strings containing any symbols by
+ # x.search(text)
+ ALL_SYMBOLS_RE = re.compile(r"[^\w\s]+")
+
+
+ # camelcase LU: to split between consecutive lower and upper chars by
+ # x.sub(r'\1 \2', text)
+ CAMELCASE_LU_RE = re.compile(r"([a-z]+)([A-Z])")
+
+
+ # camelcase UL: to split between a run of upper chars and an upper+lower pair by
+ # x.sub(r'\1 \2', text)
+ CAMELCASE_UL_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")
+
+
+ # non-embedded symbols: those without a word char on both sides, dropped by
+ # x.sub('', text)
+ NON_EMBEDDED_WORD_SYMS_RE = re.compile(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))")
+
+
+ # embedded symbols: to drop embedded symbols by
+ # x.sub('', text)
+ EMBEDDED_SYMS_RE = re.compile(r"(?<=\w)[^\w\s]+(?=\w)")
+
+
+ # hyphen-slash: to split on an embedded hyphen, slash, or space by
+ # x.split(text)
+ HYPHEN_SLASH_RE = re.compile(r"(?<=\w)[\-\/ ](?=\w)")
+
+
+ # hyphen-only: to split on an embedded hyphen or space by
+ # x.split(text)
+ HYPHEN_ONLY_RE = re.compile(r"(?<=\w)[\- ](?=\w)")
+
+
+ # slash-only: to split on an embedded slash by
+ # x.split(text)
+ SLASH_ONLY_RE = re.compile(r"(?<=\w)\/(?=\w)")
+
+
+ # parenthetical expressions: to drop parenthetical expressions by
+ # x.sub('', text)
+ PARENTHETICAL_RE = re.compile(r"\(.*\)")
+
+
+ # ampersand: to replace an ampersand (and surrounding space) with " and " by
+ # x.sub(' and ', text)
+ AMPERSAND_RE = re.compile(r"\s*\&\s*")
+
+
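As a quick sanity check, the intended use of these patterns looks like this (an illustrative sketch, not part of the wheel; it imports the `normalize` module listed in the RECORD below):

```python
from dataknobs_xization import normalize

# collapse runs of whitespace to a single space
assert normalize.SQUASH_WS_RE.sub(" ", "a \t b\n  c") == "a b c"

# detect whether a string contains any symbol characters
assert normalize.ALL_SYMBOLS_RE.search("plain words") is None
assert normalize.ALL_SYMBOLS_RE.search("a+b") is not None

# zero-width split points between word chars around '-', '/', or ' '
assert normalize.HYPHEN_SLASH_RE.split("mother-in-law") == ["mother", "in", "law"]

# drop a parenthetical, leaving the surrounding text (note the trailing space)
assert normalize.PARENTHETICAL_RE.sub("", "acme (inc.)") == "acme "

# normalize '&' plus any surrounding space to ' and '
assert normalize.AMPERSAND_RE.sub(" and ", "R&D") == "R and D"
```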
+ def expand_camelcase_fn(text: str) -> str:
+     """Expand both "lU" and "UUl" camelcasing to "l U" and "U Ul"."""
+     text = CAMELCASE_LU_RE.sub(r"\1 \2", text)
+     return CAMELCASE_UL_RE.sub(r"\1 \2", text)
+
+
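For example (an illustrative sketch using the function as defined above):

```python
from dataknobs_xization import normalize

assert normalize.expand_camelcase_fn("dataKnobs") == "data Knobs"
# the UL pattern keeps acronym runs together, splitting before the final word
assert normalize.expand_camelcase_fn("XMLParser") == "XML Parser"
```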
+ def drop_non_embedded_symbols_fn(text: str, repl: str = "") -> str:
+     """Drop symbols not embedded within word characters."""
+     return NON_EMBEDDED_WORD_SYMS_RE.sub(repl, text)
+
+
+ def drop_embedded_symbols_fn(text: str, repl: str = "") -> str:
+     """Drop symbols embedded within word characters."""
+     return EMBEDDED_SYMS_RE.sub(repl, text)
+
+
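The embedded/non-embedded distinction in practice (illustrative sketch):

```python
from dataknobs_xization import normalize

# the '!' has no word char on its right, so it is non-embedded and dropped;
# the apostrophe sits between word chars, so it survives
assert normalize.drop_non_embedded_symbols_fn("don't stop!") == "don't stop"

# the embedded apostrophe is dropped, or replaced via repl
assert normalize.drop_embedded_symbols_fn("don't") == "dont"
assert normalize.drop_embedded_symbols_fn("don't", repl=" ") == "don t"
```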
+ def get_hyphen_slash_expansions_fn(
+     text: str,
+     subs: Sequence[str] = ("-", " ", ""),
+     add_self: bool = True,
+     do_split: bool = True,
+     min_split_token_len: int = 2,
+     hyphen_slash_re=HYPHEN_SLASH_RE,
+ ) -> Set[str]:
+     """Given text with words that may or may not appear as hyphenated or with a
+     slash, return the set of potential variations:
+     - the text as-is (add_self)
+     - with a hyphen between all words (if '-' in subs)
+     - with a space between all words (if ' ' in subs)
+     - with all words squashed together (empty string between if '' in subs)
+     - with each word separately (do_split, as long as min_split_token_len is
+       met for all tokens)
+
+     :param text: The hyphen-worthy snippet of text, either already
+         hyphenated, with a slash, or space delimited.
+     :param subs: A string of characters or list of strings to insert between
+         tokens
+     :param add_self: True to include the text itself in the result
+     :param do_split: True to also add each token separately
+     :param min_split_token_len: If any of the split tokens fail
+         to meet the min token length, don't add any of the splits.
+     :param hyphen_slash_re: The regex to identify the hyphen/slash to expand.
+
+     Notes:
+       * To add a variation with a slash, add '/' to subs.
+       * To not add any variations with symbols, leave them out of subs
+         and don't add self.
+     """
+     variations = {text} if add_self else set()
+     if subs is not None and len(subs) > 0:
+         # create a variant with each <s> between all tokens
+         for s in subs:
+             variations.add(hyphen_slash_re.sub(s, text))
+     if do_split:
+         # add each token separately, but only if all tokens are long enough
+         tokens = set(hyphen_slash_re.split(text))
+         if not any(len(t) < min_split_token_len for t in tokens):
+             variations.update(tokens)
+     return variations
+
+
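A usage sketch with the default subs (the expected values follow from HYPHEN_SLASH_RE above):

```python
from dataknobs_xization import normalize

assert normalize.get_hyphen_slash_expansions_fn("mother-in-law") == {
    "mother-in-law",        # add_self (also the '-' sub)
    "mother in law",        # ' ' sub
    "motherinlaw",          # '' sub
    "mother", "in", "law",  # do_split (all tokens >= 2 chars)
}

# "t-shirt" splits into "t" and "shirt"; "t" fails min_split_token_len,
# so no split tokens are added
assert normalize.get_hyphen_slash_expansions_fn("t-shirt") == {
    "t-shirt", "t shirt", "tshirt"
}
```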
+ def drop_parentheticals_fn(text: str) -> str:
+     """Drop parenthetical expressions from the text."""
+     return PARENTHETICAL_RE.sub("", text)
+
+
+ def expand_ampersand_fn(text: str) -> str:
+     """Replace '&' with ' and '."""
+     return AMPERSAND_RE.sub(" and ", text)
+
+
+ def get_lexical_variations(
+     text: str,
+     include_self: bool = True,
+     expand_camelcase: bool = True,
+     drop_non_embedded_symbols: bool = True,
+     drop_embedded_symbols: bool = True,
+     spacify_embedded_symbols: bool = False,
+     do_hyphen_expansion: bool = True,
+     hyphen_subs: Sequence[str] = (" ", ""),
+     do_hyphen_split: bool = True,
+     min_hyphen_split_token_len: int = 2,
+     do_slash_expansion: bool = True,
+     slash_subs: Sequence[str] = (" ", " or "),
+     do_slash_split: bool = True,
+     min_slash_split_token_len: int = 1,
+     drop_parentheticals: bool = True,
+     expand_ampersands: bool = True,
+     add_eng_plurals: bool = True,
+ ) -> Set[str]:
+     """Get all variations for the text (including the text itself)."""
+     variations = {text} if include_self else set()
+     if expand_camelcase:
+         variations.add(expand_camelcase_fn(text))
+     if drop_non_embedded_symbols:
+         variations.add(drop_non_embedded_symbols_fn(text))
+     if drop_embedded_symbols:
+         variations.add(drop_embedded_symbols_fn(text))
+     if spacify_embedded_symbols:
+         variations.add(drop_embedded_symbols_fn(text, " "))
+     if (
+         do_hyphen_expansion and hyphen_subs is not None and len(hyphen_subs) > 0
+     ) or do_hyphen_split:
+         variations.update(
+             get_hyphen_slash_expansions_fn(
+                 text,
+                 subs=hyphen_subs,
+                 add_self=False,
+                 do_split=do_hyphen_split,
+                 min_split_token_len=min_hyphen_split_token_len,
+                 hyphen_slash_re=HYPHEN_ONLY_RE,  # hyphen (or space) only
+             )
+         )
+     if (do_slash_expansion and slash_subs is not None and len(slash_subs) > 0) or do_slash_split:
+         variations.update(
+             get_hyphen_slash_expansions_fn(
+                 text,
+                 subs=slash_subs,
+                 add_self=False,
+                 do_split=do_slash_split,
+                 min_split_token_len=min_slash_split_token_len,
+                 hyphen_slash_re=SLASH_ONLY_RE,  # slash only
+             )
+         )
+     if drop_parentheticals:
+         variations.add(drop_parentheticals_fn(text))
+     if expand_ampersands:
+         variations.add(expand_ampersand_fn(text))
+     if add_eng_plurals:
+         # TODO: Use a better pluralizer
+         plurals = {f"{v}s" for v in variations}
+         variations.update(plurals)
+     return variations
+
+
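A usage sketch with the defaults (a subset check, since the full result also includes naive plurals of every variation):

```python
from dataknobs_xization import normalize

variations = normalize.get_lexical_variations("ice-cream")
# hyphen expansion, squashing, splitting, and naive plurals all contribute
assert {"ice-cream", "ice cream", "icecream", "ice", "cream",
        "ice creams", "icecreams"} <= variations
```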
+ def int_to_en(num: int) -> str:
+     """Convert an integer to its english-word spelling."""
+     d = {
+         0: "zero",
+         1: "one",
+         2: "two",
+         3: "three",
+         4: "four",
+         5: "five",
+         6: "six",
+         7: "seven",
+         8: "eight",
+         9: "nine",
+         10: "ten",
+         11: "eleven",
+         12: "twelve",
+         13: "thirteen",
+         14: "fourteen",
+         15: "fifteen",
+         16: "sixteen",
+         17: "seventeen",
+         18: "eighteen",
+         19: "nineteen",
+         20: "twenty",
+         30: "thirty",
+         40: "forty",
+         50: "fifty",
+         60: "sixty",
+         70: "seventy",
+         80: "eighty",
+         90: "ninety",
+     }
+     k = 1000
+     m = k * 1000
+     b = m * 1000
+     t = b * 1000
+
+     if not isinstance(num, int):
+         return str(num)
+
+     if num < 0:
+         return "negative " + int_to_en(abs(num))
+
+     if num < 20:
+         return d[num]
+
+     if num < 100:
+         if num % 10 == 0:
+             return d[num]
+         else:
+             return d[num // 10 * 10] + " " + d[num % 10]
+
+     if num < k:
+         if num % 100 == 0:
+             return d[num // 100] + " hundred"
+         else:
+             return d[num // 100] + " hundred and " + int_to_en(num % 100)
+
+     if num < m:
+         if num % k == 0:
+             return int_to_en(num // k) + " thousand"
+         else:
+             return int_to_en(num // k) + " thousand " + int_to_en(num % k)
+
+     if num < b:
+         if num % m == 0:
+             return int_to_en(num // m) + " million"
+         else:
+             return int_to_en(num // m) + " million " + int_to_en(num % m)
+
+     if num < t:
+         if num % b == 0:
+             return int_to_en(num // b) + " billion"
+         else:
+             return int_to_en(num // b) + " billion " + int_to_en(num % b)
+
+     if num < k * t:
+         if num % t == 0:
+             return int_to_en(num // t) + " trillion"
+         else:
+             return int_to_en(num // t) + " trillion " + int_to_en(num % t)
+
+     # num is too large
+     return str(num)
+
+
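Sample outputs (illustrative sketch):

```python
from dataknobs_xization import normalize

assert normalize.int_to_en(42) == "forty two"
assert normalize.int_to_en(305) == "three hundred and five"
assert normalize.int_to_en(1999) == "one thousand nine hundred and ninety nine"
assert normalize.int_to_en(-7) == "negative seven"
```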
+ def zero_pad_variations(
+     val: int,
+     min_zpad_len: int,
+     max_zpad_len: int,
+ ) -> Set[str]:
+     """Get (only) zero-padded variations of the given value from min (inclusive)
+     to max (exclusive) zero-pad lengths.
+
+     Examples:
+       * zero_pad_variations(9, 2, 4) == {'09', '009'}
+       * zero_pad_variations(90, 2, 4) == {'090'}
+       * zero_pad_variations(90, 2, 3) == set()
+       * zero_pad_variations(3, 0, 5) == {'03', '003', '0003'}
+
+     :param val: The value to zero-pad
+     :param min_zpad_len: The minimum zero-padded string length (inclusive)
+     :param max_zpad_len: The maximum zero-padded string length (exclusive)
+     :return: The set of all requested zero-padded number strings
+     """
+     # start one past the unpadded width so every result is truly zero-padded
+     min_padded_len = len(str(val)) + 1
+     return {
+         f"{val:0{zpad}d}"
+         for zpad in range(max(min_zpad_len, min_padded_len), max_zpad_len)
+     }
+
+
+ def month_day_variations_fn(
+     month_or_day: int,
+     do_int_to_en: bool = False,
+ ) -> Set[str]:
+     """Get the variations for a month or day number, including the number
+     itself as a string, a 2-digit zero-padded form of the number, and
+     (optionally) the english word for the number.
+
+     :param month_or_day: The month or day for which to get variations
+     :param do_int_to_en: Optionally include the english word for the number
+     :return: The set of variations for the value
+     """
+     result = zero_pad_variations(month_or_day, 2, 3)
+     result.add(str(month_or_day))
+     if do_int_to_en:
+         result.add(int_to_en(month_or_day))
+     return result
+
+
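For example (illustrative sketch):

```python
from dataknobs_xization import normalize

assert normalize.month_day_variations_fn(7) == {"7", "07"}
assert normalize.month_day_variations_fn(7, do_int_to_en=True) == {"7", "07", "seven"}
```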
+ def year_variations_fn(
+     year: int,
+     min_year: int = 0,
+     max_year: int = 9999,
+     do_int_to_en_below_100: bool = False,
+     numeric_only: bool = False,
+ ) -> Set[str]:
+     """Get the variations for a year, e.g., 1999:
+     * the year as digits: "1999"
+     * the long english form: "one thousand nine hundred and ninety nine"
+     * short english forms: "nineteen ninety nine",
+       "nineteen hundred and ninety nine"
+     """
+     variations = {str(year)}
+
+     if year < min_year or year > max_year:
+         return variations
+
+     # one thousand nine hundred and ninety nine
+     if not numeric_only and (do_int_to_en_below_100 or year >= 100):
+         variations.add(int_to_en(year))
+
+     # nineteen ninety nine
+     century = year // 100
+     remainder = year % 100
+     remainder_text = int_to_en(remainder)
+
+     variations.update(zero_pad_variations(remainder, 2, 3))
+
+     if century > 0:
+         remainder_texts = []
+         if remainder > 0:
+             if remainder < 10:
+                 if not numeric_only:
+                     remainder_texts.append(f" oh {remainder_text}")
+                 remainder_texts.append(f" 0{remainder}")
+             else:
+                 if not numeric_only:
+                     remainder_texts.append(f" {remainder_text}")
+                 remainder_texts.append(f" {remainder}")
+                 if not numeric_only:
+                     remainder_texts.append(f" and {remainder_text}")
+
+         century_text = int_to_en(century)
+         scales = ["", century_text]
+         if century % 10 == 0:
+             mil_text = int_to_en(century // 10)
+             scales.append(f"{mil_text} thousand")
+         else:
+             scales.append(f"{century_text} hundred")
+
+         def clean_up(s):
+             s = s.strip()
+             if s.startswith("and "):
+                 s = s[4:]
+             return s
+
+         variations.update({clean_up("".join(v)) for v in product(scales, remainder_texts)})
+
+     return variations
+
+
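A usage sketch (a subset check, since the scale/remainder product also yields less idiomatic combinations):

```python
from dataknobs_xization import normalize

variations = normalize.year_variations_fn(1999)
assert {"1999",
        "nineteen ninety nine",
        "nineteen hundred and ninety nine",
        "one thousand nine hundred and ninety nine"} <= variations
```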
+ def replace_smart_quotes_fn(text: str) -> str:
+     """Replace "smart" quotes with their ascii versions."""
+     return (
+         text.replace("\u201c", '"')  # left double quote U+201C
+         .replace("\u201d", '"')      # right double quote U+201D
+         .replace("\u2018", "'")      # left single quote U+2018
+         .replace("\u2019", "'")      # right single quote U+2019
+     )
+
+
+ def basic_normalization_fn(
+     text: str,
+     lowercase: bool = True,
+     expand_camelcase: bool = True,
+     simplify_quote_chars: bool = True,
+     drop_non_embedded_symbols: bool = False,
+     spacify_embedded_symbols: bool = False,
+     drop_embedded_symbols: bool = False,
+     squash_whitespace: bool = False,
+     do_all: bool = False,
+ ) -> str:
+     """Basic normalization functions include:
+     * lowercasing [default]
+     * expanding camelcase [default]
+     * replacing "smart" quotes and apostrophes with ascii versions [default]
+     * dropping non-embedded symbols [optional]
+     * replacing embedded symbols with a space [takes precedence over dropping
+       unless do_all] or dropping embedded symbols [optional]
+     * collapsing multiple spaces and stripping spaces from the ends [optional]
+     """
+     # NOTE: do this before changing case
+     if expand_camelcase or do_all:
+         text = expand_camelcase_fn(text)
+
+     if lowercase or do_all:
+         text = text.lower()
+     if (drop_non_embedded_symbols and drop_embedded_symbols) or do_all:
+         text = ALL_SYMBOLS_RE.sub("", text)
+     elif drop_non_embedded_symbols:
+         text = drop_non_embedded_symbols_fn(text)
+     elif spacify_embedded_symbols:
+         text = drop_embedded_symbols_fn(text, " ")
+     elif drop_embedded_symbols:
+         text = drop_embedded_symbols_fn(text)
+
+     # NOTE: do this after dropping (only some) symbols
+     if simplify_quote_chars and (not drop_non_embedded_symbols or not drop_embedded_symbols):
+         # NOTE: It only makes sense to do this if we're keeping symbols
+         text = replace_smart_quotes_fn(text)
+
+     # NOTE: do this last
+     if squash_whitespace or do_all:
+         text = SQUASH_WS_RE.sub(" ", text).strip()
+     return text
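End-to-end behavior of the defaults versus `do_all` (illustrative sketch):

```python
from dataknobs_xization import normalize

# defaults: camelcase expansion, lowercasing, smart-quote simplification
assert normalize.basic_normalization_fn("It\u2019s DataKnobs") == "it's data knobs"

# do_all additionally strips all symbols and squashes whitespace
assert normalize.basic_normalization_fn("It\u2019s  DataKnobs!", do_all=True) == "its data knobs"
```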
@@ -0,0 +1,58 @@
+ Metadata-Version: 2.4
+ Name: dataknobs-xization
+ Version: 1.0.0
+ Summary: Text normalization and tokenization tools
+ Author-email: Spence Koehler <KoehlerSB747@gmail.com>
+ Requires-Python: >=3.10
+ Requires-Dist: dataknobs-common>=1.0.0
+ Requires-Dist: dataknobs-structures>=1.0.0
+ Requires-Dist: dataknobs-utils>=1.0.0
+ Requires-Dist: nltk>=3.9.1
+ Description-Content-Type: text/markdown
+
+ # dataknobs-xization
+
+ Text normalization and tokenization tools.
+
+ ## Installation
+
+ ```bash
+ pip install dataknobs-xization
+ ```
+
+ ## Features
+
+ - **Text Normalization**: Standardize text for consistent processing
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
+ - **Annotations**: Text annotation system
+ - **Authorities**: Authority management for text processing
+ - **Lexicon**: Lexicon-based text analysis
+
+ ## Usage
+
+ ```python
+ from dataknobs_xization import normalize, MaskingTokenizer
+
+ # Text normalization
+ normalized = normalize.normalize_text("Hello, World!")
+
+ # Tokenization with masking
+ tokenizer = MaskingTokenizer()
+ tokens = tokenizer.tokenize("This is a sample text.")
+
+ # Working with annotations
+ from dataknobs_xization import annotations
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
+ ```
+
+ ## Dependencies
+
+ This package depends on:
+ - `dataknobs-common`
+ - `dataknobs-structures`
+ - `dataknobs-utils`
+ - `nltk`
+
+ ## License
+
+ See LICENSE file in the root repository.
@@ -0,0 +1,10 @@
+ dataknobs_xization/0.readme.txt,sha256=Q46suHOARkjQLY580eOfSCeUyIgQx-e6DLmtEhcuODE,2878
+ dataknobs_xization/__init__.py,sha256=ixsRSYr86q1T4LqQTRzP9Z_ihcOVN6r8SQNurhmHWmY,404
+ dataknobs_xization/annotations.py,sha256=qiH_QzzIs5mjvO2Yr4jiLBMIxIiPbzzfd_iublS8HTI,45143
+ dataknobs_xization/authorities.py,sha256=69nAlExbh_U7NKav1q3IujXb8lBq14QJhHHy5IZ0PZE,30745
+ dataknobs_xization/lexicon.py,sha256=NMo3lAXUVzFVRy246Y90TZtm-27qR5g0z8Ef9u2E2LA,23722
+ dataknobs_xization/masking_tokenizer.py,sha256=65RkHdU83l1Tf0f9bXwNrLDuFsN-xegMQNJGON7Z8WY,26036
+ dataknobs_xization/normalize.py,sha256=kpT8y1jEmeiKiNC8pruurFjasmREhr4rAQ3W_yB2v4U,14024
+ dataknobs_xization-1.0.0.dist-info/METADATA,sha256=xa_zmCme9PWkU1NJO6w8G2nQvubCrpSKvTIBwHWAv-4,1393
+ dataknobs_xization-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dataknobs_xization-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any