gngram-lookup 0.2.1-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gngram_counter/lookup.py +127 -21
- gngram_counter/normalize.py +48 -0
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/METADATA +8 -8
- gngram_lookup-0.2.2.dist-info/RECORD +11 -0
- gngram_lookup-0.2.1.dist-info/RECORD +0 -10
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/LICENSE +0 -0
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/WHEEL +0 -0
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/entry_points.txt +0 -0
gngram_counter/lookup.py
CHANGED
@@ -2,6 +2,11 @@
 High-level lookup API for gngram-counter.
 
 Provides simple functions for word frequency lookups similar to bnc-lookup.
+
+Includes contraction fallback: if a contraction like "don't" is not found
+directly, the stem ("do") is looked up instead. The ngram corpus only
+contains pure alphabetic words, so contractions and their suffix parts
+(n't, 'll, etc.) are absent — but the stems are present.
 """
 
 import hashlib
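
In caller terms, the docstring above describes the headline change in 0.2.2: contractions no longer miss outright. A hypothetical session illustrating the intent (assumes the data files are downloaded; actual counts depend on the installed data):

from gngram_counter.lookup import exists, frequency

# "don't" is absent from the corpus itself, so the stem "do" is used instead.
exists("don't")      # True via the stem "do"
frequency("don't")   # the FrequencyData of "do" (peak_tf, peak_df, sum_tf, sum_df)

# The fallback still returns None when the stem itself is unknown.
frequency("xqzv'll")  # None unless "xqzv" happens to exist
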
@@ -11,6 +16,7 @@ from typing import TypedDict
 import polars as pl
 
 from gngram_counter.data import get_hash_file, is_data_installed
+from gngram_counter.normalize import normalize
 
 
 class FrequencyData(TypedDict):
@@ -22,6 +28,26 @@ class FrequencyData(TypedDict):
     sum_df: int  # Total document frequency across all decades
 
 
+# Contraction suffixes stored as separate tokens in the ngram corpus
+# Order matters: longer suffixes must be checked before shorter ones
+CONTRACTION_SUFFIXES = ("n't", "'ll", "'re", "'ve", "'m", "'d")
+
+# Specific stems that form 's contractions (where 's = "is" or "has").
+# NOT generalized — 's is ambiguous with possessive, so only known
+# contraction stems are listed here. Ported from bnc-lookup.
+S_CONTRACTION_STEMS = frozenset({
+    # Pronouns (unambiguously 's = "is" or "has", never possessive)
+    'it', 'he', 'she', 'that', 'what', 'who',
+    # Adverbs / demonstratives
+    'where', 'how', 'here', 'there',
+    # "let's" = "let us"
+    'let',
+    # Indefinite pronouns
+    'somebody', 'everybody', 'everyone', 'nobody',
+    'anywhere', 'nowhere',
+})
+
+
 @lru_cache(maxsize=256)
 def _load_bucket(prefix: str) -> pl.DataFrame:
     """Load and cache a parquet bucket file."""
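
To make the bucket scheme concrete: `_hash_word` (updated in the next hunk) splits a word's MD5 hex digest into a two-character bucket prefix and a 30-character suffix, so `_load_bucket` only ever touches one of 256 parquet files per lookup. A minimal standalone sketch of the same arithmetic, without the `normalize` step:

import hashlib

def hash_word(word: str) -> tuple[str, str]:
    # Same split as _hash_word: two hex chars select the bucket file,
    # the remaining 30 are matched against the bucket's "hash" column.
    h = hashlib.md5(word.encode("utf-8")).hexdigest()
    return h[:2], h[2:]

prefix, suffix = hash_word("do")
print(prefix, len(suffix))  # two-char bucket id, 30-char row key
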
@@ -30,13 +56,63 @@ def _load_bucket(prefix: str) -> pl.DataFrame:
 
 def _hash_word(word: str) -> tuple[str, str]:
     """Hash a word and return (prefix, suffix)."""
-    h = hashlib.md5(word
+    h = hashlib.md5(normalize(word).encode("utf-8")).hexdigest()
     return h[:2], h[2:]
 
 
+def _lookup_frequency(word: str) -> FrequencyData | None:
+    """Look up frequency data for a single word form (no fallbacks)."""
+    if not word:
+        return None
+    prefix, suffix = _hash_word(word)
+    try:
+        df = _load_bucket(prefix)
+    except FileNotFoundError:
+        return None
+    row = df.filter(pl.col("hash") == suffix)
+    if len(row) == 0:
+        return None
+    return FrequencyData(
+        peak_tf=row["peak_tf"][0],
+        peak_df=row["peak_df"][0],
+        sum_tf=row["sum_tf"][0],
+        sum_df=row["sum_df"][0],
+    )
+
+
+def _split_contraction(word: str) -> tuple[str, str] | None:
+    """Split a contraction into its component parts if possible.
+
+    The ngram corpus tokenizes contractions separately (e.g., "we'll" -> "we" + "'ll").
+    This function reverses that split for fallback lookup.
+
+    Returns:
+        Tuple of (stem, suffix) if the word matches a contraction pattern,
+        or None if no contraction pattern matches.
+    """
+    for suffix in CONTRACTION_SUFFIXES:
+        if word.endswith(suffix):
+            stem = word[:-len(suffix)]
+            if stem:
+                return (stem, suffix)
+
+    # Specific 's contractions from curated allowlist (not possessives)
+    if word.endswith("'s"):
+        stem = word[:-2]
+        if stem in S_CONTRACTION_STEMS:
+            return (stem, "'s")
+
+    return None
+
+
 def exists(word: str) -> bool:
     """Check if a word exists in the ngram data.
 
+    Performs case-insensitive lookup with automatic fallbacks:
+    1. Direct lookup of the normalized word
+    2. Contraction fallback: if word is a contraction, check if both
+       components exist (e.g., "don't" -> "do" + "n't")
+
     Args:
         word: The word to check (case-insensitive)
 
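
The `_split_contraction` helper above is deterministic, so its behavior can be read straight off the diff; an illustrative session (imports a private helper, so for exploration only):

from gngram_counter.lookup import _split_contraction

_split_contraction("we'll")     # ('we', "'ll")
_split_contraction("don't")     # ('do', "n't")
_split_contraction("let's")     # ('let', "'s")  -- stem is on the allowlist
_split_contraction("john's")    # None           -- possessive 's, not allowlisted
_split_contraction("computer")  # None           -- no contraction suffix

Note that inputs are expected to be normalized already (lowercase, ASCII apostrophe); `exists` and `frequency` call `normalize` first.
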
@@ -51,14 +127,27 @@ def exists(word: str) -> bool:
         "Data files not installed. Run: python -m gngram_counter.download_data"
     )
 
-
-
-
+    word = normalize(word)
+
+    if _lookup_frequency(word) is not None:
+        return True
+
+    # Contraction fallback: check if the stem exists
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        if _lookup_frequency(stem) is not None:
+            return True
+
+    return False
 
 
 def frequency(word: str) -> FrequencyData | None:
     """Get frequency data for a word.
 
+    Performs case-insensitive lookup with contraction fallback.
+    For contractions, returns the stem's frequency data.
+
     Args:
         word: The word to look up (case-insensitive)
 
@@ -73,19 +162,21 @@
         "Data files not installed. Run: python -m gngram_counter.download_data"
     )
 
-
-    df = _load_bucket(prefix)
-    row = df.filter(pl.col("hash") == suffix)
+    word = normalize(word)
 
-
-
+    result = _lookup_frequency(word)
+    if result is not None:
+        return result
 
-    return
-
-
-
-
-
+    # Contraction fallback: return the stem's frequency
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        stem_freq = _lookup_frequency(stem)
+        if stem_freq is not None:
+            return stem_freq
+
+    return None
 
 
 def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
@@ -106,24 +197,27 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
     )
 
     # Group words by bucket prefix for efficient batch lookups
-    by_prefix: dict[str, list[tuple[str, str]]] = {}
+    by_prefix: dict[str, list[tuple[str, str, str]]] = {}
+    contraction_words: list[str] = []
+
     for word in words:
-
+        normalized = normalize(word)
+        prefix, suffix = _hash_word(normalized)
         if prefix not in by_prefix:
             by_prefix[prefix] = []
-        by_prefix[prefix].append((word, suffix))
+        by_prefix[prefix].append((word, normalized, suffix))
 
     results: dict[str, FrequencyData | None] = {}
 
-    for prefix,
+    for prefix, entries in by_prefix.items():
         df = _load_bucket(prefix)
-        suffixes = [s for _, s in
+        suffixes = [s for _, _, s in entries]
 
         # Filter to all matching suffixes at once
         matches = df.filter(pl.col("hash").is_in(suffixes))
         match_dict = {row["hash"]: row for row in matches.iter_rows(named=True)}
 
-        for word, suffix in
+        for word, normalized, suffix in entries:
             if suffix in match_dict:
                 row = match_dict[suffix]
                 results[word] = FrequencyData(
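
A hedged usage sketch of the batched path (function and fields as defined in this diff; results depend on the installed data). Words are grouped so each two-character bucket is loaded and filtered once, and the per-word contraction fallback from the next hunk runs only for the misses:

from gngram_counter.lookup import batch_frequency

results = batch_frequency(["computer", "don't", "the"])
for word, data in results.items():
    # data is a FrequencyData dict or None; "don't" resolves via its stem "do"
    print(word, data["sum_tf"] if data else None)
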
@@ -133,6 +227,18 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
                     sum_df=row["sum_df"],
                 )
             else:
+                # Mark for contraction fallback
                 results[word] = None
+                contraction_words.append(word)
+
+    # Contraction fallback for words not found directly
+    for word in contraction_words:
+        normalized = normalize(word)
+        parts = _split_contraction(normalized)
+        if parts:
+            stem, _ = parts
+            stem_freq = _lookup_frequency(stem)
+            if stem_freq is not None:
+                results[word] = stem_freq
 
     return results
gngram_counter/normalize.py
ADDED
@@ -0,0 +1,48 @@
+"""Text normalization utilities for gngram-counter.
+
+Handles normalization of Unicode apostrophe variants and other text
+transformations to ensure consistent matching against the ngram corpus.
+
+Ported from bnc-lookup normalize.py.
+"""
+
+# Unicode characters that should normalize to ASCII apostrophe (U+0027)
+# Ordered by likelihood of occurrence in English text
+APOSTROPHE_VARIANTS = (
+    '\u2019'  # RIGHT SINGLE QUOTATION MARK (most common smart quote)
+    '\u2018'  # LEFT SINGLE QUOTATION MARK
+    '\u0060'  # GRAVE ACCENT
+    '\u00B4'  # ACUTE ACCENT
+    '\u201B'  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    '\u2032'  # PRIME
+    '\u2035'  # REVERSED PRIME
+    '\u02B9'  # MODIFIER LETTER PRIME
+    '\u02BC'  # MODIFIER LETTER APOSTROPHE
+    '\u02C8'  # MODIFIER LETTER VERTICAL LINE
+    '\u0313'  # COMBINING COMMA ABOVE
+    '\u0315'  # COMBINING COMMA ABOVE RIGHT
+    '\u055A'  # ARMENIAN APOSTROPHE
+    '\u05F3'  # HEBREW PUNCTUATION GERESH
+    '\u07F4'  # NKO HIGH TONE APOSTROPHE
+    '\u07F5'  # NKO LOW TONE APOSTROPHE
+    '\uFF07'  # FULLWIDTH APOSTROPHE
+    '\u1FBF'  # GREEK PSILI
+    '\u1FBD'  # GREEK KORONIS
+    '\uA78C'  # LATIN SMALL LETTER SALTILLO
+)
+
+# Pre-compiled translation table for fast apostrophe normalization
+_APOSTROPHE_TABLE = str.maketrans({char: "'" for char in APOSTROPHE_VARIANTS})
+
+
+def normalize_apostrophes(text: str) -> str:
+    """Normalize Unicode apostrophe variants to ASCII apostrophe."""
+    return text.translate(_APOSTROPHE_TABLE)
+
+
+def normalize(text: str) -> str:
+    """Normalize text for ngram lookup.
+
+    Applies: apostrophe variant conversion, lowercase, strip whitespace.
+    """
+    return normalize_apostrophes(text).lower().strip()
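
Since `normalize` above is pure string manipulation, its effect is easy to pin down; a few deterministic examples (U+2019 is the curly quote, U+02BC the modifier apostrophe):

from gngram_counter.normalize import normalize, normalize_apostrophes

print(normalize("Don\u2019t"))        # don't
print(normalize("  WE\u02BCLL  "))    # we'll
print(normalize_apostrophes("\u2018quoted\u2019"))  # 'quoted'

This step is what makes the contraction fallback in lookup.py reliable for text pasted from word processors, which typically emit U+2019 rather than an ASCII apostrophe.
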
{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gngram-lookup
-Version: 0.2.1
+Version: 0.2.2
 Summary: Static Hash-Based Lookup for Google Ngram Frequencies
 Home-page: https://github.com/craigtrim/gngram-lookup
 License: Proprietary
@@ -32,7 +32,7 @@ Description-Content-Type: text/markdown
 [](https://badge.fury.io/py/gngram-lookup)
 [](https://pepy.tech/project/gngram-lookup)
 [](https://pepy.tech/project/gngram-lookup)
-[](https://github.com/craigtrim/gngram-lookup/tree/main/tests)
 [](https://www.python.org/downloads/)
 
 Word frequency from 500 years of books. O(1) lookup. 5 million words.
@@ -74,11 +74,11 @@ gngram-freq computer
 
 ## Docs
 
-- [API Reference](docs/api.md)
-- [CLI Reference](docs/cli.md)
-- [Data Format](docs/data-format.md)
-- [Use Cases](docs/use-cases.md)
-- [Development](docs/development.md)
+- [API Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/api.md)
+- [CLI Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/cli.md)
+- [Data Format](https://github.com/craigtrim/gngram-lookup/blob/main/docs/data-format.md)
+- [Use Cases](https://github.com/craigtrim/gngram-lookup/blob/main/docs/use-cases.md)
+- [Development](https://github.com/craigtrim/gngram-lookup/blob/main/docs/development.md)
 
 ## See Also
 
@@ -91,5 +91,5 @@ Data derived from the [Google Books Ngram](https://books.google.com/ngrams) data
 
 ## License
 
-Proprietary. See [LICENSE](LICENSE).
+Proprietary. See [LICENSE](https://github.com/craigtrim/gngram-lookup/blob/main/LICENSE).
 
gngram_lookup-0.2.2.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
+gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
+gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
+gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
+gngram_counter/lookup.py,sha256=r67ulgLPM0zkIWyulQsmsRVbIZt9J1APQ1974DWgX1k,7564
+gngram_counter/normalize.py,sha256=UDUPk4Mb-fcdIy-4WAivFnk33H6gwxxD3oKFHq2tNNg,1693
+gngram_lookup-0.2.2.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
+gngram_lookup-0.2.2.dist-info/METADATA,sha256=DSF-z85Un8wSJQs55r0Q6_O5K9c9fyDuUi9rW1NMFIU,3323
+gngram_lookup-0.2.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+gngram_lookup-0.2.2.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
+gngram_lookup-0.2.2.dist-info/RECORD,,
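
As a side note on reading these lines: each RECORD entry has the form path,sha256=<digest>,<size>, where the digest is the file's SHA-256 in URL-safe base64 with padding stripped (the wheel convention). A sketch for verifying one locally, with an illustrative path:

import base64
import hashlib

# Recompute a RECORD-style digest for a local copy of a packaged file.
with open("gngram_counter/normalize.py", "rb") as f:
    digest = hashlib.sha256(f.read()).digest()
# Should print the "sha256=..." token from the matching RECORD line.
print("sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode())
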
gngram_lookup-0.2.1.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
-gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
-gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
-gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
-gngram_counter/lookup.py,sha256=8WThcRWmIYPBgHTwfOYNSN1wTgddnBXCx6moNwulKXU,3992
-gngram_lookup-0.2.1.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
-gngram_lookup-0.2.1.dist-info/METADATA,sha256=okH1jbNz8k9EsgS8aFnOdeslJwY7wUhDSFwpWptFeD4,2952
-gngram_lookup-0.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-gngram_lookup-0.2.1.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
-gngram_lookup-0.2.1.dist-info/RECORD,,
{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/LICENSE
File without changes

{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/WHEEL
File without changes

{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/entry_points.txt
File without changes