gngram-lookup 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
gngram_counter/lookup.py CHANGED
@@ -2,6 +2,11 @@
 High-level lookup API for gngram-counter.
 
 Provides simple functions for word frequency lookups similar to bnc-lookup.
+
+Includes contraction fallback: if a contraction like "don't" is not found
+directly, the stem ("do") is looked up instead. The ngram corpus only
+contains pure alphabetic words, so contractions and their suffix parts
+(n't, 'll, etc.) are absent — but the stems are present.
 """
 
 import hashlib
@@ -11,6 +16,7 @@ from typing import TypedDict
 import polars as pl
 
 from gngram_counter.data import get_hash_file, is_data_installed
+from gngram_counter.normalize import normalize
 
 
 class FrequencyData(TypedDict):
@@ -22,6 +28,26 @@ class FrequencyData(TypedDict):
     sum_df: int  # Total document frequency across all decades
 
 
+# Contraction suffixes stored as separate tokens in the ngram corpus
+# Order matters: longer suffixes must be checked before shorter ones
+CONTRACTION_SUFFIXES = ("n't", "'ll", "'re", "'ve", "'m", "'d")
+
+# Specific stems that form 's contractions (where 's = "is" or "has").
+# NOT generalized — 's is ambiguous with possessive, so only known
+# contraction stems are listed here. Ported from bnc-lookup.
+S_CONTRACTION_STEMS = frozenset({
+    # Pronouns (unambiguously 's = "is" or "has", never possessive)
+    'it', 'he', 'she', 'that', 'what', 'who',
+    # Adverbs / demonstratives
+    'where', 'how', 'here', 'there',
+    # "let's" = "let us"
+    'let',
+    # Indefinite pronouns
+    'somebody', 'everybody', 'everyone', 'nobody',
+    'anywhere', 'nowhere',
+})
+
+
 @lru_cache(maxsize=256)
 def _load_bucket(prefix: str) -> pl.DataFrame:
     """Load and cache a parquet bucket file."""
@@ -30,13 +56,63 @@ def _load_bucket(prefix: str) -> pl.DataFrame:
 
 def _hash_word(word: str) -> tuple[str, str]:
     """Hash a word and return (prefix, suffix)."""
-    h = hashlib.md5(word.lower().encode("utf-8")).hexdigest()
+    h = hashlib.md5(normalize(word).encode("utf-8")).hexdigest()
     return h[:2], h[2:]
 
 
+def _lookup_frequency(word: str) -> FrequencyData | None:
+    """Look up frequency data for a single word form (no fallbacks)."""
+    if not word:
+        return None
+    prefix, suffix = _hash_word(word)
+    try:
+        df = _load_bucket(prefix)
+    except FileNotFoundError:
+        return None
+    row = df.filter(pl.col("hash") == suffix)
+    if len(row) == 0:
+        return None
+    return FrequencyData(
+        peak_tf=row["peak_tf"][0],
+        peak_df=row["peak_df"][0],
+        sum_tf=row["sum_tf"][0],
+        sum_df=row["sum_df"][0],
+    )
+
+
+def _split_contraction(word: str) -> tuple[str, str] | None:
+    """Split a contraction into its component parts if possible.
+
+    The ngram corpus tokenizes contractions separately (e.g., "we'll" -> "we" + "'ll").
+    This function reverses that split for fallback lookup.
+
+    Returns:
+        Tuple of (stem, suffix) if the word matches a contraction pattern,
+        or None if no contraction pattern matches.
+    """
+    for suffix in CONTRACTION_SUFFIXES:
+        if word.endswith(suffix):
+            stem = word[:-len(suffix)]
+            if stem:
+                return (stem, suffix)
+
+    # Specific 's contractions from curated allowlist (not possessives)
+    if word.endswith("'s"):
+        stem = word[:-2]
+        if stem in S_CONTRACTION_STEMS:
+            return (stem, "'s")
+
+    return None
+
+
 def exists(word: str) -> bool:
     """Check if a word exists in the ngram data.
 
+    Performs case-insensitive lookup with automatic fallbacks:
+    1. Direct lookup of the normalized word
+    2. Contraction fallback: if word is a contraction, check if both
+       components exist (e.g., "don't" -> "do" + "n't")
+
     Args:
         word: The word to check (case-insensitive)
 
@@ -51,14 +127,27 @@ def exists(word: str) -> bool:
            "Data files not installed. Run: python -m gngram_counter.download_data"
        )
 
-    prefix, suffix = _hash_word(word)
-    df = _load_bucket(prefix)
-    return len(df.filter(pl.col("hash") == suffix)) > 0
+    word = normalize(word)
+
+    if _lookup_frequency(word) is not None:
+        return True
+
+    # Contraction fallback: check if the stem exists
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        if _lookup_frequency(stem) is not None:
+            return True
+
+    return False
 
 
 def frequency(word: str) -> FrequencyData | None:
     """Get frequency data for a word.
 
+    Performs case-insensitive lookup with contraction fallback.
+    For contractions, returns the stem's frequency data.
+
     Args:
         word: The word to look up (case-insensitive)
 
@@ -73,19 +162,21 @@ def frequency(word: str) -> FrequencyData | None:
            "Data files not installed. Run: python -m gngram_counter.download_data"
        )
 
-    prefix, suffix = _hash_word(word)
-    df = _load_bucket(prefix)
-    row = df.filter(pl.col("hash") == suffix)
+    word = normalize(word)
 
-    if len(row) == 0:
-        return None
+    result = _lookup_frequency(word)
+    if result is not None:
+        return result
 
-    return FrequencyData(
-        peak_tf=row["peak_tf"][0],
-        peak_df=row["peak_df"][0],
-        sum_tf=row["sum_tf"][0],
-        sum_df=row["sum_df"][0],
-    )
+    # Contraction fallback: return the stem's frequency
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        stem_freq = _lookup_frequency(stem)
+        if stem_freq is not None:
+            return stem_freq
+
+    return None
 
 
 def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
@@ -106,24 +197,27 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
        )
 
     # Group words by bucket prefix for efficient batch lookups
-    by_prefix: dict[str, list[tuple[str, str]]] = {}
+    by_prefix: dict[str, list[tuple[str, str, str]]] = {}
+    contraction_words: list[str] = []
+
     for word in words:
-        prefix, suffix = _hash_word(word)
+        normalized = normalize(word)
+        prefix, suffix = _hash_word(normalized)
         if prefix not in by_prefix:
            by_prefix[prefix] = []
-        by_prefix[prefix].append((word, suffix))
+        by_prefix[prefix].append((word, normalized, suffix))
 
    results: dict[str, FrequencyData | None] = {}
 
-    for prefix, word_suffix_pairs in by_prefix.items():
+    for prefix, entries in by_prefix.items():
        df = _load_bucket(prefix)
-        suffixes = [s for _, s in word_suffix_pairs]
+        suffixes = [s for _, _, s in entries]
 
        # Filter to all matching suffixes at once
        matches = df.filter(pl.col("hash").is_in(suffixes))
        match_dict = {row["hash"]: row for row in matches.iter_rows(named=True)}
 
-        for word, suffix in word_suffix_pairs:
+        for word, normalized, suffix in entries:
            if suffix in match_dict:
                row = match_dict[suffix]
                results[word] = FrequencyData(
@@ -133,6 +227,18 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
                    sum_df=row["sum_df"],
                )
            else:
+                # Mark for contraction fallback
                results[word] = None
+                contraction_words.append(word)
+
+    # Contraction fallback for words not found directly
+    for word in contraction_words:
+        normalized = normalize(word)
+        parts = _split_contraction(normalized)
+        if parts:
+            stem, _ = parts
+            stem_freq = _lookup_frequency(stem)
+            if stem_freq is not None:
+                results[word] = stem_freq
 
    return results
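Taken together, the new lookup flow in lookup.py is: normalize the input, try a direct hash-bucket lookup, then fall back to the contraction stem. A minimal usage sketch, assuming the data files are installed and that common stems such as "do" and "we" are present in the corpus:

    from gngram_counter.lookup import batch_frequency, exists, frequency

    exists("don't")        # True via fallback: "don't" itself is not in the
                           # corpus, but the stem "do" is found
    frequency("we’ll")     # smart apostrophe normalized to "we'll", then the
                           # stem "we" supplies the FrequencyData
    frequency("player's")  # None: "player" is not an allowlisted 's stem, so
                           # the 's is treated as a possessive
    batch_frequency(["don't", "computer"])
                           # dict keyed by the original words; "don't" receives
                           # the stem's FrequencyData in the fallback pass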
gngram_counter/normalize.py ADDED
@@ -0,0 +1,48 @@
+"""Text normalization utilities for gngram-counter.
+
+Handles normalization of Unicode apostrophe variants and other text
+transformations to ensure consistent matching against the ngram corpus.
+
+Ported from bnc-lookup normalize.py.
+"""
+
+# Unicode characters that should normalize to ASCII apostrophe (U+0027)
+# Ordered by likelihood of occurrence in English text
+APOSTROPHE_VARIANTS = (
+    '\u2019'  # RIGHT SINGLE QUOTATION MARK (most common smart quote)
+    '\u2018'  # LEFT SINGLE QUOTATION MARK
+    '\u0060'  # GRAVE ACCENT
+    '\u00B4'  # ACUTE ACCENT
+    '\u201B'  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    '\u2032'  # PRIME
+    '\u2035'  # REVERSED PRIME
+    '\u02B9'  # MODIFIER LETTER PRIME
+    '\u02BC'  # MODIFIER LETTER APOSTROPHE
+    '\u02C8'  # MODIFIER LETTER VERTICAL LINE
+    '\u0313'  # COMBINING COMMA ABOVE
+    '\u0315'  # COMBINING COMMA ABOVE RIGHT
+    '\u055A'  # ARMENIAN APOSTROPHE
+    '\u05F3'  # HEBREW PUNCTUATION GERESH
+    '\u07F4'  # NKO HIGH TONE APOSTROPHE
+    '\u07F5'  # NKO LOW TONE APOSTROPHE
+    '\uFF07'  # FULLWIDTH APOSTROPHE
+    '\u1FBF'  # GREEK PSILI
+    '\u1FBD'  # GREEK KORONIS
+    '\uA78C'  # LATIN SMALL LETTER SALTILLO
+)
+
+# Pre-compiled translation table for fast apostrophe normalization
+_APOSTROPHE_TABLE = str.maketrans({char: "'" for char in APOSTROPHE_VARIANTS})
+
+
+def normalize_apostrophes(text: str) -> str:
+    """Normalize Unicode apostrophe variants to ASCII apostrophe."""
+    return text.translate(_APOSTROPHE_TABLE)
+
+
+def normalize(text: str) -> str:
+    """Normalize text for ngram lookup.
+
+    Applies: apostrophe variant conversion, lowercase, strip whitespace.
+    """
+    return normalize_apostrophes(text).lower().strip()
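The pipeline is translate, then lowercase, then strip, so any of the twenty listed apostrophe variants folds to ASCII before a word is hashed. A quick behavioral sketch:

    from gngram_counter.normalize import normalize

    normalize("Don’t ")  # -> "don't"  (smart quote folded, lowercased, stripped)
    normalize("WEʼLL")   # -> "we'll"  (U+02BC modifier letter apostrophe)
    normalize("hello")   # -> "hello"  (plain ASCII passes through unchanged)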
gngram_lookup-0.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gngram-lookup
-Version: 0.2.0
+Version: 0.2.2
 Summary: Static Hash-Based Lookup for Google Ngram Frequencies
 Home-page: https://github.com/craigtrim/gngram-lookup
 License: Proprietary
@@ -30,6 +30,9 @@ Description-Content-Type: text/markdown
 # gngram-lookup
 
 [![PyPI version](https://badge.fury.io/py/gngram-lookup.svg)](https://badge.fury.io/py/gngram-lookup)
+[![Downloads](https://pepy.tech/badge/gngram-lookup)](https://pepy.tech/project/gngram-lookup)
+[![Downloads/Month](https://pepy.tech/badge/gngram-lookup/month)](https://pepy.tech/project/gngram-lookup)
+[![Tests](https://img.shields.io/badge/tests-131-brightgreen)](https://github.com/craigtrim/gngram-lookup/tree/main/tests)
 [![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/)
 
 Word frequency from 500 years of books. O(1) lookup. 5 million words.
@@ -71,11 +74,16 @@ gngram-freq computer
 
 ## Docs
 
-- [API Reference](docs/api.md)
-- [CLI Reference](docs/cli.md)
-- [Data Format](docs/data-format.md)
-- [Use Cases](docs/use-cases.md)
-- [Development](docs/development.md)
+- [API Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/api.md)
+- [CLI Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/cli.md)
+- [Data Format](https://github.com/craigtrim/gngram-lookup/blob/main/docs/data-format.md)
+- [Use Cases](https://github.com/craigtrim/gngram-lookup/blob/main/docs/use-cases.md)
+- [Development](https://github.com/craigtrim/gngram-lookup/blob/main/docs/development.md)
+
+## See Also
+
+- [bnc-lookup](https://pypi.org/project/bnc-lookup/) - O(1) lookup for British National Corpus
+- [wordnet-lookup](https://pypi.org/project/wordnet-lookup/) - O(1) lookup for WordNet
 
 ## Attribution
 
@@ -83,5 +91,5 @@ Data derived from the [Google Books Ngram](https://books.google.com/ngrams) data
 
 ## License
 
-Proprietary. See [LICENSE](LICENSE).
+Proprietary. See [LICENSE](https://github.com/craigtrim/gngram-lookup/blob/main/LICENSE).
 
gngram_lookup-0.2.2.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
+gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
+gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
+gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
+gngram_counter/lookup.py,sha256=r67ulgLPM0zkIWyulQsmsRVbIZt9J1APQ1974DWgX1k,7564
+gngram_counter/normalize.py,sha256=UDUPk4Mb-fcdIy-4WAivFnk33H6gwxxD3oKFHq2tNNg,1693
+gngram_lookup-0.2.2.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
+gngram_lookup-0.2.2.dist-info/METADATA,sha256=DSF-z85Un8wSJQs55r0Q6_O5K9c9fyDuUi9rW1NMFIU,3323
+gngram_lookup-0.2.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+gngram_lookup-0.2.2.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
+gngram_lookup-0.2.2.dist-info/RECORD,,
gngram_lookup-0.2.0.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
-gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
-gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
-gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
-gngram_counter/lookup.py,sha256=8WThcRWmIYPBgHTwfOYNSN1wTgddnBXCx6moNwulKXU,3992
-gngram_lookup-0.2.0.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
-gngram_lookup-0.2.0.dist-info/METADATA,sha256=Pep89swHnnJUtSa9iES_GE_Ywr8uwOmqhyHqv13GmQM,2484
-gngram_lookup-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-gngram_lookup-0.2.0.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
-gngram_lookup-0.2.0.dist-info/RECORD,,