gngram-lookup 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gngram_counter/lookup.py CHANGED
@@ -2,8 +2,15 @@
 High-level lookup API for gngram-counter.
 
 Provides simple functions for word frequency lookups similar to bnc-lookup.
+
+Includes contraction fallback: if a contraction like "don't" is not found
+directly, the stem ("do") is looked up instead. The ngram corpus only
+contains pure alphabetic words, so contractions and their suffix parts
+(n't, 'll, etc.) are absent — but the stems are present.
 """
 
+from __future__ import annotations
+
 import hashlib
 from functools import lru_cache
 from typing import TypedDict
@@ -11,6 +18,7 @@ from typing import TypedDict
 import polars as pl
 
 from gngram_counter.data import get_hash_file, is_data_installed
+from gngram_counter.normalize import normalize
 
 
 class FrequencyData(TypedDict):
@@ -22,6 +30,26 @@ class FrequencyData(TypedDict):
     sum_df: int  # Total document frequency across all decades
 
 
+# Contraction suffixes stored as separate tokens in the ngram corpus
+# Order matters: longer suffixes must be checked before shorter ones
+CONTRACTION_SUFFIXES = ("n't", "'ll", "'re", "'ve", "'m", "'d")
+
+# Specific stems that form 's contractions (where 's = "is" or "has").
+# NOT generalized — 's is ambiguous with possessive, so only known
+# contraction stems are listed here. Ported from bnc-lookup.
+S_CONTRACTION_STEMS = frozenset({
+    # Pronouns (unambiguously 's = "is" or "has", never possessive)
+    'it', 'he', 'she', 'that', 'what', 'who',
+    # Adverbs / demonstratives
+    'where', 'how', 'here', 'there',
+    # "let's" = "let us"
+    'let',
+    # Indefinite pronouns
+    'somebody', 'everybody', 'everyone', 'nobody',
+    'anywhere', 'nowhere',
+})
+
+
 @lru_cache(maxsize=256)
 def _load_bucket(prefix: str) -> pl.DataFrame:
     """Load and cache a parquet bucket file."""
@@ -30,13 +58,63 @@ def _load_bucket(prefix: str) -> pl.DataFrame:
 
 def _hash_word(word: str) -> tuple[str, str]:
     """Hash a word and return (prefix, suffix)."""
-    h = hashlib.md5(word.lower().encode("utf-8")).hexdigest()
+    h = hashlib.md5(normalize(word).encode("utf-8")).hexdigest()
     return h[:2], h[2:]
 
 
+def _lookup_frequency(word: str) -> FrequencyData | None:
+    """Look up frequency data for a single word form (no fallbacks)."""
+    if not word:
+        return None
+    prefix, suffix = _hash_word(word)
+    try:
+        df = _load_bucket(prefix)
+    except FileNotFoundError:
+        return None
+    row = df.filter(pl.col("hash") == suffix)
+    if len(row) == 0:
+        return None
+    return FrequencyData(
+        peak_tf=row["peak_tf"][0],
+        peak_df=row["peak_df"][0],
+        sum_tf=row["sum_tf"][0],
+        sum_df=row["sum_df"][0],
+    )
+
+
+def _split_contraction(word: str) -> tuple[str, str] | None:
+    """Split a contraction into its component parts if possible.
+
+    The ngram corpus tokenizes contractions separately (e.g., "we'll" -> "we" + "'ll").
+    This function reverses that split for fallback lookup.
+
+    Returns:
+        Tuple of (stem, suffix) if the word matches a contraction pattern,
+        or None if no contraction pattern matches.
+    """
+    for suffix in CONTRACTION_SUFFIXES:
+        if word.endswith(suffix):
+            stem = word[:-len(suffix)]
+            if stem:
+                return (stem, suffix)
+
+    # Specific 's contractions from curated allowlist (not possessives)
+    if word.endswith("'s"):
+        stem = word[:-2]
+        if stem in S_CONTRACTION_STEMS:
+            return (stem, "'s")
+
+    return None
+
+
 def exists(word: str) -> bool:
     """Check if a word exists in the ngram data.
 
+    Performs case-insensitive lookup with automatic fallbacks:
+    1. Direct lookup of the normalized word
+    2. Contraction fallback: if word is a contraction, check if both
+       components exist (e.g., "don't" -> "do" + "n't")
+
     Args:
         word: The word to check (case-insensitive)
 
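The helpers added above define the whole lookup path: normalize the word, take its MD5 hash, use the first two hex characters as the parquet bucket prefix and the rest as the row key, then fall back to the contraction stem when a direct lookup misses. A minimal sketch of how those pieces behave (this mirrors the code in the hunk; nothing here touches the data files):

```python
import hashlib

# Mirror of _hash_word: md5 of the normalized word; the first two hex
# characters select the parquet bucket, the remaining 30 are the row key.
h = hashlib.md5("don't".encode("utf-8")).hexdigest()
prefix, suffix = h[:2], h[2:]
print(prefix, suffix)  # two-char bucket id, 30-char key

# What _split_contraction returns for a few inputs:
#   "don't"  -> ("do", "n't")    suffix match on n't
#   "we'll"  -> ("we", "'ll")    suffix match on 'll
#   "let's"  -> ("let", "'s")    stem is on the S_CONTRACTION_STEMS allowlist
#   "dog's"  -> None             possessive; "dog" is not allowlisted
```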
@@ -51,14 +129,27 @@ def exists(word: str) -> bool:
             "Data files not installed. Run: python -m gngram_counter.download_data"
         )
 
-    prefix, suffix = _hash_word(word)
-    df = _load_bucket(prefix)
-    return len(df.filter(pl.col("hash") == suffix)) > 0
+    word = normalize(word)
+
+    if _lookup_frequency(word) is not None:
+        return True
+
+    # Contraction fallback: check if the stem exists
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        if _lookup_frequency(stem) is not None:
+            return True
+
+    return False
 
 
 def frequency(word: str) -> FrequencyData | None:
     """Get frequency data for a word.
 
+    Performs case-insensitive lookup with contraction fallback.
+    For contractions, returns the stem's frequency data.
+
     Args:
         word: The word to look up (case-insensitive)
 
@@ -73,19 +164,21 @@ def frequency(word: str) -> FrequencyData | None:
             "Data files not installed. Run: python -m gngram_counter.download_data"
         )
 
-    prefix, suffix = _hash_word(word)
-    df = _load_bucket(prefix)
-    row = df.filter(pl.col("hash") == suffix)
+    word = normalize(word)
 
-    if len(row) == 0:
-        return None
+    result = _lookup_frequency(word)
+    if result is not None:
+        return result
 
-    return FrequencyData(
-        peak_tf=row["peak_tf"][0],
-        peak_df=row["peak_df"][0],
-        sum_tf=row["sum_tf"][0],
-        sum_df=row["sum_df"][0],
-    )
+    # Contraction fallback: return the stem's frequency
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        stem_freq = _lookup_frequency(stem)
+        if stem_freq is not None:
+            return stem_freq
+
+    return None
 
 
 def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
@@ -106,24 +199,27 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
         )
 
     # Group words by bucket prefix for efficient batch lookups
-    by_prefix: dict[str, list[tuple[str, str]]] = {}
+    by_prefix: dict[str, list[tuple[str, str, str]]] = {}
+    contraction_words: list[str] = []
+
     for word in words:
-        prefix, suffix = _hash_word(word)
+        normalized = normalize(word)
+        prefix, suffix = _hash_word(normalized)
         if prefix not in by_prefix:
             by_prefix[prefix] = []
-        by_prefix[prefix].append((word, suffix))
+        by_prefix[prefix].append((word, normalized, suffix))
 
     results: dict[str, FrequencyData | None] = {}
 
-    for prefix, word_suffix_pairs in by_prefix.items():
+    for prefix, entries in by_prefix.items():
         df = _load_bucket(prefix)
-        suffixes = [s for _, s in word_suffix_pairs]
+        suffixes = [s for _, _, s in entries]
 
         # Filter to all matching suffixes at once
         matches = df.filter(pl.col("hash").is_in(suffixes))
         match_dict = {row["hash"]: row for row in matches.iter_rows(named=True)}
 
-        for word, suffix in word_suffix_pairs:
+        for word, normalized, suffix in entries:
             if suffix in match_dict:
                 row = match_dict[suffix]
                 results[word] = FrequencyData(
@@ -133,6 +229,18 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
                     sum_df=row["sum_df"],
                 )
             else:
+                # Mark for contraction fallback
                 results[word] = None
+                contraction_words.append(word)
+
+    # Contraction fallback for words not found directly
+    for word in contraction_words:
+        normalized = normalize(word)
+        parts = _split_contraction(normalized)
+        if parts:
+            stem, _ = parts
+            stem_freq = _lookup_frequency(stem)
+            if stem_freq is not None:
+                results[word] = stem_freq
 
     return results
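For callers, the practical effect of these changes is that `exists`, `frequency`, and `batch_frequency` now normalize their input and quietly fall back to the contraction stem. A short usage sketch (return values are illustrative placeholders, and the lookups assume the data files have already been downloaded):

```python
from gngram_counter.lookup import batch_frequency, exists, frequency

# Plain alphabetic word: direct bucket lookup.
frequency("computer")   # -> dict with peak_tf / peak_df / sum_tf / sum_df

# Contraction: "don't" is absent from the corpus, so the stem "do"
# is looked up instead and its frequency data is returned.
exists("don't")         # -> True (via the stem "do")
frequency("don't")      # -> frequency data for "do"

# Smart-quote apostrophes are normalized before hashing (see normalize.py below).
exists("don\u2019t")    # -> same result as exists("don't")

# Batch lookups group words by bucket prefix, then apply the same fallback
# to any word that missed; results are keyed by the original input strings.
batch_frequency(["computer", "we'll", "qwzrt"])
# -> {"computer": {...}, "we'll": <data for "we">, "qwzrt": None}
```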
gngram_counter/normalize.py ADDED
@@ -0,0 +1,50 @@
+"""Text normalization utilities for gngram-counter.
+
+Handles normalization of Unicode apostrophe variants and other text
+transformations to ensure consistent matching against the ngram corpus.
+
+Ported from bnc-lookup normalize.py.
+"""
+
+from __future__ import annotations
+
+# Unicode characters that should normalize to ASCII apostrophe (U+0027)
+# Ordered by likelihood of occurrence in English text
+APOSTROPHE_VARIANTS = (
+    '\u2019'  # RIGHT SINGLE QUOTATION MARK (most common smart quote)
+    '\u2018'  # LEFT SINGLE QUOTATION MARK
+    '\u0060'  # GRAVE ACCENT
+    '\u00B4'  # ACUTE ACCENT
+    '\u201B'  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    '\u2032'  # PRIME
+    '\u2035'  # REVERSED PRIME
+    '\u02B9'  # MODIFIER LETTER PRIME
+    '\u02BC'  # MODIFIER LETTER APOSTROPHE
+    '\u02C8'  # MODIFIER LETTER VERTICAL LINE
+    '\u0313'  # COMBINING COMMA ABOVE
+    '\u0315'  # COMBINING COMMA ABOVE RIGHT
+    '\u055A'  # ARMENIAN APOSTROPHE
+    '\u05F3'  # HEBREW PUNCTUATION GERESH
+    '\u07F4'  # NKO HIGH TONE APOSTROPHE
+    '\u07F5'  # NKO LOW TONE APOSTROPHE
+    '\uFF07'  # FULLWIDTH APOSTROPHE
+    '\u1FBF'  # GREEK PSILI
+    '\u1FBD'  # GREEK KORONIS
+    '\uA78C'  # LATIN SMALL LETTER SALTILLO
+)
+
+# Pre-compiled translation table for fast apostrophe normalization
+_APOSTROPHE_TABLE = str.maketrans({char: "'" for char in APOSTROPHE_VARIANTS})
+
+
+def normalize_apostrophes(text: str) -> str:
+    """Normalize Unicode apostrophe variants to ASCII apostrophe."""
+    return text.translate(_APOSTROPHE_TABLE)
+
+
+def normalize(text: str) -> str:
+    """Normalize text for ngram lookup.
+
+    Applies: apostrophe variant conversion, lowercase, strip whitespace.
+    """
+    return normalize_apostrophes(text).lower().strip()
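The new module imports nothing beyond `__future__` and needs no data files, so its behavior is easy to sanity-check. A quick sketch of the normalization applied before every hash lookup:

```python
from gngram_counter.normalize import normalize, normalize_apostrophes

normalize_apostrophes("don\u2019t")  # -> "don't"  (U+2019 mapped to ASCII apostrophe)
normalize("  Don\u2019t ")           # -> "don't"  (apostrophe fix, lowercase, strip)
normalize("CAFÉ")                    # -> "café"   (lowercasing only; accents are kept)
```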
{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.3.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gngram-lookup
-Version: 0.2.1
+Version: 0.2.3
 Summary: Static Hash-Based Lookup for Google Ngram Frequencies
 Home-page: https://github.com/craigtrim/gngram-lookup
 License: Proprietary
@@ -9,7 +9,7 @@ Author: Craig Trim
 Author-email: craigtrim@gmail.com
 Maintainer: Craig Trim
 Maintainer-email: craigtrim@gmail.com
-Requires-Python: >=3.11,<4.0
+Requires-Python: >=3.9,<4.0
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -17,6 +17,8 @@ Classifier: License :: Other/Proprietary License
 Classifier: Natural Language :: English
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
@@ -32,8 +34,8 @@ Description-Content-Type: text/markdown
 [![PyPI version](https://badge.fury.io/py/gngram-lookup.svg)](https://badge.fury.io/py/gngram-lookup)
 [![Downloads](https://pepy.tech/badge/gngram-lookup)](https://pepy.tech/project/gngram-lookup)
 [![Downloads/Month](https://pepy.tech/badge/gngram-lookup/month)](https://pepy.tech/project/gngram-lookup)
-[![Tests](https://img.shields.io/badge/tests-58-brightgreen)](tests/)
-[![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/)
+[![Tests](https://img.shields.io/badge/tests-131-brightgreen)](https://github.com/craigtrim/gngram-lookup/tree/main/tests)
+[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/downloads/)
 
 Word frequency from 500 years of books. O(1) lookup. 5 million words.
 
@@ -74,11 +76,11 @@ gngram-freq computer
 
 ## Docs
 
-- [API Reference](docs/api.md)
-- [CLI Reference](docs/cli.md)
-- [Data Format](docs/data-format.md)
-- [Use Cases](docs/use-cases.md)
-- [Development](docs/development.md)
+- [API Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/api.md)
+- [CLI Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/cli.md)
+- [Data Format](https://github.com/craigtrim/gngram-lookup/blob/main/docs/data-format.md)
+- [Use Cases](https://github.com/craigtrim/gngram-lookup/blob/main/docs/use-cases.md)
+- [Development](https://github.com/craigtrim/gngram-lookup/blob/main/docs/development.md)
 
 ## See Also
 
@@ -91,5 +93,5 @@ Data derived from the [Google Books Ngram](https://books.google.com/ngrams) data
 
 
 ## License
-Proprietary. See [LICENSE](LICENSE).
+Proprietary. See [LICENSE](https://github.com/craigtrim/gngram-lookup/blob/main/LICENSE).
 
gngram_lookup-0.2.3.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
+gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
+gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
+gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
+gngram_counter/lookup.py,sha256=L2O7iwzMy4CqLDAhgvzzilJXX4ncIJkIJy5bgIrpiVg,7600
+gngram_counter/normalize.py,sha256=Fwb32rhzr-6p25Shcbfkr2Ttdj1aLt9wZypPQWY9dRQ,1729
+gngram_lookup-0.2.3.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
+gngram_lookup-0.2.3.dist-info/METADATA,sha256=_d4vLD8nToY-5bmAZjH8Y6CiN7NAkEp7bEPUgs7qKww,3421
+gngram_lookup-0.2.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+gngram_lookup-0.2.3.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
+gngram_lookup-0.2.3.dist-info/RECORD,,
gngram_lookup-0.2.1.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
-gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
-gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
-gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
-gngram_counter/lookup.py,sha256=8WThcRWmIYPBgHTwfOYNSN1wTgddnBXCx6moNwulKXU,3992
-gngram_lookup-0.2.1.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
-gngram_lookup-0.2.1.dist-info/METADATA,sha256=okH1jbNz8k9EsgS8aFnOdeslJwY7wUhDSFwpWptFeD4,2952
-gngram_lookup-0.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-gngram_lookup-0.2.1.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
-gngram_lookup-0.2.1.dist-info/RECORD,,