gngram-lookup 0.2.1-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gngram_counter/lookup.py +127 -21
- gngram_counter/normalize.py +48 -0
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/METADATA +8 -8
- gngram_lookup-0.2.2.dist-info/RECORD +11 -0
- gngram_lookup-0.2.1.dist-info/RECORD +0 -10
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/LICENSE +0 -0
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/WHEEL +0 -0
- {gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/entry_points.txt +0 -0
gngram_counter/lookup.py
CHANGED
@@ -2,6 +2,11 @@
 High-level lookup API for gngram-counter.
 
 Provides simple functions for word frequency lookups similar to bnc-lookup.
+
+Includes contraction fallback: if a contraction like "don't" is not found
+directly, the stem ("do") is looked up instead. The ngram corpus only
+contains pure alphabetic words, so contractions and their suffix parts
+(n't, 'll, etc.) are absent — but the stems are present.
 """
 
 import hashlib
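
In caller terms, the docstring above describes the headline change in 0.2.2: contractions no longer miss outright. A hypothetical session illustrating the intent (assumes the data files are downloaded; actual counts depend on the installed data):

from gngram_counter.lookup import exists, frequency

# "don't" is absent from the corpus itself, so the stem "do" is used instead.
exists("don't")      # True via the stem "do"
frequency("don't")   # the FrequencyData of "do" (peak_tf, peak_df, sum_tf, sum_df)

# The fallback still returns None when the stem itself is unknown.
frequency("xqzv'll")  # None unless "xqzv" happens to exist
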
@@ -11,6 +16,7 @@ from typing import TypedDict
 import polars as pl
 
 from gngram_counter.data import get_hash_file, is_data_installed
+from gngram_counter.normalize import normalize
 
 
 class FrequencyData(TypedDict):
@@ -22,6 +28,26 @@ class FrequencyData(TypedDict):
     sum_df: int  # Total document frequency across all decades
 
 
+# Contraction suffixes stored as separate tokens in the ngram corpus
+# Order matters: longer suffixes must be checked before shorter ones
+CONTRACTION_SUFFIXES = ("n't", "'ll", "'re", "'ve", "'m", "'d")
+
+# Specific stems that form 's contractions (where 's = "is" or "has").
+# NOT generalized — 's is ambiguous with possessive, so only known
+# contraction stems are listed here. Ported from bnc-lookup.
+S_CONTRACTION_STEMS = frozenset({
+    # Pronouns (unambiguously 's = "is" or "has", never possessive)
+    'it', 'he', 'she', 'that', 'what', 'who',
+    # Adverbs / demonstratives
+    'where', 'how', 'here', 'there',
+    # "let's" = "let us"
+    'let',
+    # Indefinite pronouns
+    'somebody', 'everybody', 'everyone', 'nobody',
+    'anywhere', 'nowhere',
+})
+
+
 @lru_cache(maxsize=256)
 def _load_bucket(prefix: str) -> pl.DataFrame:
     """Load and cache a parquet bucket file."""
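
To make the bucket scheme concrete: `_hash_word` (updated in the next hunk) splits a word's MD5 hex digest into a two-character bucket prefix and a 30-character suffix, so `_load_bucket` only ever touches one of 256 parquet files per lookup. A minimal standalone sketch of the same arithmetic, without the `normalize` step:

import hashlib

def hash_word(word: str) -> tuple[str, str]:
    # Same split as _hash_word: two hex chars select the bucket file,
    # the remaining 30 are matched against the bucket's "hash" column.
    h = hashlib.md5(word.encode("utf-8")).hexdigest()
    return h[:2], h[2:]

prefix, suffix = hash_word("do")
print(prefix, len(suffix))  # two-char bucket id, 30-char row key
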
@@ -30,13 +56,63 @@ def _load_bucket(prefix: str) -> pl.DataFrame:
 
 def _hash_word(word: str) -> tuple[str, str]:
     """Hash a word and return (prefix, suffix)."""
-    h = hashlib.md5(word
+    h = hashlib.md5(normalize(word).encode("utf-8")).hexdigest()
     return h[:2], h[2:]
 
 
+def _lookup_frequency(word: str) -> FrequencyData | None:
+    """Look up frequency data for a single word form (no fallbacks)."""
+    if not word:
+        return None
+    prefix, suffix = _hash_word(word)
+    try:
+        df = _load_bucket(prefix)
+    except FileNotFoundError:
+        return None
+    row = df.filter(pl.col("hash") == suffix)
+    if len(row) == 0:
+        return None
+    return FrequencyData(
+        peak_tf=row["peak_tf"][0],
+        peak_df=row["peak_df"][0],
+        sum_tf=row["sum_tf"][0],
+        sum_df=row["sum_df"][0],
+    )
+
+
+def _split_contraction(word: str) -> tuple[str, str] | None:
+    """Split a contraction into its component parts if possible.
+
+    The ngram corpus tokenizes contractions separately (e.g., "we'll" -> "we" + "'ll").
+    This function reverses that split for fallback lookup.
+
+    Returns:
+        Tuple of (stem, suffix) if the word matches a contraction pattern,
+        or None if no contraction pattern matches.
+    """
+    for suffix in CONTRACTION_SUFFIXES:
+        if word.endswith(suffix):
+            stem = word[:-len(suffix)]
+            if stem:
+                return (stem, suffix)
+
+    # Specific 's contractions from curated allowlist (not possessives)
+    if word.endswith("'s"):
+        stem = word[:-2]
+        if stem in S_CONTRACTION_STEMS:
+            return (stem, "'s")
+
+    return None
+
+
 def exists(word: str) -> bool:
     """Check if a word exists in the ngram data.
 
+    Performs case-insensitive lookup with automatic fallbacks:
+    1. Direct lookup of the normalized word
+    2. Contraction fallback: if word is a contraction, check if both
+       components exist (e.g., "don't" -> "do" + "n't")
+
     Args:
         word: The word to check (case-insensitive)
 
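
The `_split_contraction` helper above is deterministic, so its behavior can be read straight off the diff; an illustrative session (imports a private helper, so for exploration only):

from gngram_counter.lookup import _split_contraction

_split_contraction("we'll")     # ('we', "'ll")
_split_contraction("don't")     # ('do', "n't")
_split_contraction("let's")     # ('let', "'s")  -- stem is on the allowlist
_split_contraction("john's")    # None           -- possessive 's, not allowlisted
_split_contraction("computer")  # None           -- no contraction suffix

Note that inputs are expected to be normalized already (lowercase, ASCII apostrophe); `exists` and `frequency` call `normalize` first.
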
@@ -51,14 +127,27 @@ def exists(word: str) -> bool:
         "Data files not installed. Run: python -m gngram_counter.download_data"
     )
 
-
-
-
+    word = normalize(word)
+
+    if _lookup_frequency(word) is not None:
+        return True
+
+    # Contraction fallback: check if the stem exists
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        if _lookup_frequency(stem) is not None:
+            return True
+
+    return False
 
 
 def frequency(word: str) -> FrequencyData | None:
     """Get frequency data for a word.
 
+    Performs case-insensitive lookup with contraction fallback.
+    For contractions, returns the stem's frequency data.
+
     Args:
         word: The word to look up (case-insensitive)
 
@@ -73,19 +162,21 @@
         "Data files not installed. Run: python -m gngram_counter.download_data"
     )
 
-
-    df = _load_bucket(prefix)
-    row = df.filter(pl.col("hash") == suffix)
+    word = normalize(word)
 
-
-
+    result = _lookup_frequency(word)
+    if result is not None:
+        return result
 
-    return
-
-
-
-
-
+    # Contraction fallback: return the stem's frequency
+    parts = _split_contraction(word)
+    if parts:
+        stem, _ = parts
+        stem_freq = _lookup_frequency(stem)
+        if stem_freq is not None:
+            return stem_freq
+
+    return None
 
 
 def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
@@ -106,24 +197,27 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
     )
 
     # Group words by bucket prefix for efficient batch lookups
-    by_prefix: dict[str, list[tuple[str, str]]] = {}
+    by_prefix: dict[str, list[tuple[str, str, str]]] = {}
+    contraction_words: list[str] = []
+
     for word in words:
-
+        normalized = normalize(word)
+        prefix, suffix = _hash_word(normalized)
         if prefix not in by_prefix:
             by_prefix[prefix] = []
-        by_prefix[prefix].append((word, suffix))
+        by_prefix[prefix].append((word, normalized, suffix))
 
     results: dict[str, FrequencyData | None] = {}
 
-    for prefix,
+    for prefix, entries in by_prefix.items():
         df = _load_bucket(prefix)
-        suffixes = [s for _, s in
+        suffixes = [s for _, _, s in entries]
 
         # Filter to all matching suffixes at once
         matches = df.filter(pl.col("hash").is_in(suffixes))
         match_dict = {row["hash"]: row for row in matches.iter_rows(named=True)}
 
-        for word, suffix in
+        for word, normalized, suffix in entries:
             if suffix in match_dict:
                 row = match_dict[suffix]
                 results[word] = FrequencyData(
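
A hedged usage sketch of the batched path (function and fields as defined in this diff; results depend on the installed data). Words are grouped so each two-character bucket is loaded and filtered once, and the per-word contraction fallback from the next hunk runs only for the misses:

from gngram_counter.lookup import batch_frequency

results = batch_frequency(["computer", "don't", "the"])
for word, data in results.items():
    # data is a FrequencyData dict or None; "don't" resolves via its stem "do"
    print(word, data["sum_tf"] if data else None)
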
@@ -133,6 +227,18 @@ def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
                     sum_df=row["sum_df"],
                 )
             else:
+                # Mark for contraction fallback
                 results[word] = None
+                contraction_words.append(word)
+
+    # Contraction fallback for words not found directly
+    for word in contraction_words:
+        normalized = normalize(word)
+        parts = _split_contraction(normalized)
+        if parts:
+            stem, _ = parts
+            stem_freq = _lookup_frequency(stem)
+            if stem_freq is not None:
+                results[word] = stem_freq
 
     return results
gngram_counter/normalize.py
ADDED
@@ -0,0 +1,48 @@
+"""Text normalization utilities for gngram-counter.
+
+Handles normalization of Unicode apostrophe variants and other text
+transformations to ensure consistent matching against the ngram corpus.
+
+Ported from bnc-lookup normalize.py.
+"""
+
+# Unicode characters that should normalize to ASCII apostrophe (U+0027)
+# Ordered by likelihood of occurrence in English text
+APOSTROPHE_VARIANTS = (
+    '\u2019'  # RIGHT SINGLE QUOTATION MARK (most common smart quote)
+    '\u2018'  # LEFT SINGLE QUOTATION MARK
+    '\u0060'  # GRAVE ACCENT
+    '\u00B4'  # ACUTE ACCENT
+    '\u201B'  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    '\u2032'  # PRIME
+    '\u2035'  # REVERSED PRIME
+    '\u02B9'  # MODIFIER LETTER PRIME
+    '\u02BC'  # MODIFIER LETTER APOSTROPHE
+    '\u02C8'  # MODIFIER LETTER VERTICAL LINE
+    '\u0313'  # COMBINING COMMA ABOVE
+    '\u0315'  # COMBINING COMMA ABOVE RIGHT
+    '\u055A'  # ARMENIAN APOSTROPHE
+    '\u05F3'  # HEBREW PUNCTUATION GERESH
+    '\u07F4'  # NKO HIGH TONE APOSTROPHE
+    '\u07F5'  # NKO LOW TONE APOSTROPHE
+    '\uFF07'  # FULLWIDTH APOSTROPHE
+    '\u1FBF'  # GREEK PSILI
+    '\u1FBD'  # GREEK KORONIS
+    '\uA78C'  # LATIN SMALL LETTER SALTILLO
+)
+
+# Pre-compiled translation table for fast apostrophe normalization
+_APOSTROPHE_TABLE = str.maketrans({char: "'" for char in APOSTROPHE_VARIANTS})
+
+
+def normalize_apostrophes(text: str) -> str:
+    """Normalize Unicode apostrophe variants to ASCII apostrophe."""
+    return text.translate(_APOSTROPHE_TABLE)
+
+
+def normalize(text: str) -> str:
+    """Normalize text for ngram lookup.
+
+    Applies: apostrophe variant conversion, lowercase, strip whitespace.
+    """
+    return normalize_apostrophes(text).lower().strip()
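
Since `normalize` above is pure string manipulation, its effect is easy to pin down; a few deterministic examples (U+2019 is the curly quote, U+02BC the modifier apostrophe):

from gngram_counter.normalize import normalize, normalize_apostrophes

print(normalize("Don\u2019t"))        # don't
print(normalize("  WE\u02BCLL  "))    # we'll
print(normalize_apostrophes("\u2018quoted\u2019"))  # 'quoted'

This step is what makes the contraction fallback in lookup.py reliable for text pasted from word processors, which typically emit U+2019 rather than an ASCII apostrophe.
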
{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gngram-lookup
-Version: 0.2.1
+Version: 0.2.2
 Summary: Static Hash-Based Lookup for Google Ngram Frequencies
 Home-page: https://github.com/craigtrim/gngram-lookup
 License: Proprietary
@@ -32,7 +32,7 @@ Description-Content-Type: text/markdown
 [](https://badge.fury.io/py/gngram-lookup)
 [](https://pepy.tech/project/gngram-lookup)
 [](https://pepy.tech/project/gngram-lookup)
-[](https://github.com/craigtrim/gngram-lookup/tree/main/tests)
 [](https://www.python.org/downloads/)
 
 Word frequency from 500 years of books. O(1) lookup. 5 million words.
@@ -74,11 +74,11 @@ gngram-freq computer
 
 ## Docs
 
-- [API Reference](docs/api.md)
-- [CLI Reference](docs/cli.md)
-- [Data Format](docs/data-format.md)
-- [Use Cases](docs/use-cases.md)
-- [Development](docs/development.md)
+- [API Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/api.md)
+- [CLI Reference](https://github.com/craigtrim/gngram-lookup/blob/main/docs/cli.md)
+- [Data Format](https://github.com/craigtrim/gngram-lookup/blob/main/docs/data-format.md)
+- [Use Cases](https://github.com/craigtrim/gngram-lookup/blob/main/docs/use-cases.md)
+- [Development](https://github.com/craigtrim/gngram-lookup/blob/main/docs/development.md)
 
 ## See Also
 
@@ -91,5 +91,5 @@ Data derived from the [Google Books Ngram](https://books.google.com/ngrams) data
 
 ## License
 
-Proprietary. See [LICENSE](LICENSE).
+Proprietary. See [LICENSE](https://github.com/craigtrim/gngram-lookup/blob/main/LICENSE).
 
gngram_lookup-0.2.2.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
+gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
+gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
+gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
+gngram_counter/lookup.py,sha256=r67ulgLPM0zkIWyulQsmsRVbIZt9J1APQ1974DWgX1k,7564
+gngram_counter/normalize.py,sha256=UDUPk4Mb-fcdIy-4WAivFnk33H6gwxxD3oKFHq2tNNg,1693
+gngram_lookup-0.2.2.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
+gngram_lookup-0.2.2.dist-info/METADATA,sha256=DSF-z85Un8wSJQs55r0Q6_O5K9c9fyDuUi9rW1NMFIU,3323
+gngram_lookup-0.2.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+gngram_lookup-0.2.2.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
+gngram_lookup-0.2.2.dist-info/RECORD,,
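
As a side note on reading these lines: each RECORD entry has the form path,sha256=<digest>,<size>, where the digest is the file's SHA-256 in URL-safe base64 with padding stripped (the wheel convention). A sketch for verifying one locally, with an illustrative path:

import base64
import hashlib

# Recompute a RECORD-style digest for a local copy of a packaged file.
with open("gngram_counter/normalize.py", "rb") as f:
    digest = hashlib.sha256(f.read()).digest()
# Should print the "sha256=..." token from the matching RECORD line.
print("sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode())
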
gngram_lookup-0.2.1.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-gngram_counter/__init__.py,sha256=JsgQYIF5LcYqdhWuDuVhrlt5eVebk36CsXQK9Q3H5ZA,374
-gngram_counter/cli.py,sha256=7PScHhnjNoi0so0IGGZ7ipi0bgILtfQmZ0PPCxJCO_k,861
-gngram_counter/data.py,sha256=HvESF1lc9v7HPbemksnvzvV460ts9gBjvACMZZao9qs,1089
-gngram_counter/download_data.py,sha256=vlggDDszmI29UJA9H17AK-_BTNOcjq9oWoKju4DDCTU,2663
-gngram_counter/lookup.py,sha256=8WThcRWmIYPBgHTwfOYNSN1wTgddnBXCx6moNwulKXU,3992
-gngram_lookup-0.2.1.dist-info/LICENSE,sha256=9r2EF9XQjpHEtltPlomXEmegbVVhZsVHzygSPfiid_E,1497
-gngram_lookup-0.2.1.dist-info/METADATA,sha256=okH1jbNz8k9EsgS8aFnOdeslJwY7wUhDSFwpWptFeD4,2952
-gngram_lookup-0.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-gngram_lookup-0.2.1.dist-info/entry_points.txt,sha256=bzFME4Um0_lWLTo2JcvFseBUSD7Gk7r-156Cr_wssnM,109
-gngram_lookup-0.2.1.dist-info/RECORD,,
{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/LICENSE
File without changes

{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/WHEEL
File without changes

{gngram_lookup-0.2.1.dist-info → gngram_lookup-0.2.2.dist-info}/entry_points.txt
File without changes