gngram_lookup-0.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gngram_lookup-0.2.0/LICENSE +33 -0
- gngram_lookup-0.2.0/PKG-INFO +87 -0
- gngram_lookup-0.2.0/README.md +57 -0
- gngram_lookup-0.2.0/gngram_counter/__init__.py +14 -0
- gngram_lookup-0.2.0/gngram_counter/cli.py +36 -0
- gngram_lookup-0.2.0/gngram_counter/data.py +43 -0
- gngram_lookup-0.2.0/gngram_counter/download_data.py +89 -0
- gngram_lookup-0.2.0/gngram_counter/lookup.py +138 -0
- gngram_lookup-0.2.0/pyproject.toml +60 -0
- gngram_lookup-0.2.0/setup.py +35 -0

--- /dev/null
+++ gngram_lookup-0.2.0/LICENSE
@@ -0,0 +1,33 @@
+Proprietary License
+
+Copyright (c) 2024 Craig Trim. All rights reserved.
+
+This software and associated documentation files (the "Software") are proprietary
+and confidential. Unauthorized copying, modification, distribution, or use of this
+Software, via any medium, is strictly prohibited.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+---
+
+Data Attribution:
+
+The frequency data distributed with this package is derived from the Google
+Books Ngram Dataset, which is made available by Google Inc. under the Creative
+Commons Attribution 3.0 Unported License (CC BY 3.0).
+
+Google Books Ngram Viewer: https://books.google.com/ngrams
+Dataset: https://storage.googleapis.com/books/ngrams/books/datasetsv3.html
+
+Citation:
+Jean-Baptiste Michel*, Yuan Kui Shen, Aviva Presser Aiden, Adrian Veres,
+Matthew K. Gray, The Google Books Team, Joseph P. Pickett, Dale Hoiberg,
+Dan Clancy, Peter Norvig, Jon Orwant, Steven Pinker, Martin A. Nowak,
+and Erez Lieberman Aiden*. Quantitative Analysis of Culture Using Millions
+of Digitized Books. Science (Published online ahead of print: 12/16/2010).

--- /dev/null
+++ gngram_lookup-0.2.0/PKG-INFO
@@ -0,0 +1,87 @@
+Metadata-Version: 2.1
+Name: gngram-lookup
+Version: 0.2.0
+Summary: Static Hash-Based Lookup for Google Ngram Frequencies
+Home-page: https://github.com/craigtrim/gngram-lookup
+License: Proprietary
+Keywords: ngram,google-ngram,nlp,natural-language-processing,frequency,linguistics
+Author: Craig Trim
+Author-email: craigtrim@gmail.com
+Maintainer: Craig Trim
+Maintainer-email: craigtrim@gmail.com
+Requires-Python: >=3.11,<4.0
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: Other/Proprietary License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Dist: polars (>=1.0,<2.0)
+Requires-Dist: pyarrow (>=18.0,<19.0)
+Project-URL: Repository, https://github.com/craigtrim/gngram-lookup
+Description-Content-Type: text/markdown
+
+# gngram-lookup
+
+[](https://badge.fury.io/py/gngram-lookup)
+[](https://www.python.org/downloads/)
+
+Word frequency from 500 years of books. O(1) lookup. 5 million words.
+
+## Install
+
+```bash
+pip install gngram-lookup
+python -m gngram_counter.download_data
+```
+
+## Python
+
+```python
+import gngram_counter as ng
+
+ng.exists('computer')     # True
+ng.exists('xyznotaword')  # False
+
+ng.frequency('computer')
+# {'peak_tf': 2000, 'peak_df': 2000, 'sum_tf': 892451, 'sum_df': 312876}
+
+ng.batch_frequency(['the', 'algorithm', 'xyznotaword'])
+# {'the': {...}, 'algorithm': {...}, 'xyznotaword': None}
+```
+
+## CLI
+
+```bash
+gngram-exists computer     # True, exit 0
+gngram-exists xyznotaword  # False, exit 1
+
+gngram-freq computer
+# peak_tf_decade: 2000
+# peak_df_decade: 2000
+# sum_tf: 892451
+# sum_df: 312876
+```
+
+## Docs
+
+- [API Reference](docs/api.md)
+- [CLI Reference](docs/cli.md)
+- [Data Format](docs/data-format.md)
+- [Use Cases](docs/use-cases.md)
+- [Development](docs/development.md)
+
+## Attribution
+
+Data derived from the [Google Books Ngram](https://books.google.com/ngrams) dataset.
+
+## License
+
+Proprietary. See [LICENSE](LICENSE).
+

--- /dev/null
+++ gngram_lookup-0.2.0/README.md
@@ -0,0 +1,57 @@
+# gngram-lookup
+
+[](https://badge.fury.io/py/gngram-lookup)
+[](https://www.python.org/downloads/)
+
+Word frequency from 500 years of books. O(1) lookup. 5 million words.
+
+## Install
+
+```bash
+pip install gngram-lookup
+python -m gngram_counter.download_data
+```
+
+## Python
+
+```python
+import gngram_counter as ng
+
+ng.exists('computer')     # True
+ng.exists('xyznotaword')  # False
+
+ng.frequency('computer')
+# {'peak_tf': 2000, 'peak_df': 2000, 'sum_tf': 892451, 'sum_df': 312876}
+
+ng.batch_frequency(['the', 'algorithm', 'xyznotaword'])
+# {'the': {...}, 'algorithm': {...}, 'xyznotaword': None}
+```
+
+## CLI
+
+```bash
+gngram-exists computer     # True, exit 0
+gngram-exists xyznotaword  # False, exit 1
+
+gngram-freq computer
+# peak_tf_decade: 2000
+# peak_df_decade: 2000
+# sum_tf: 892451
+# sum_df: 312876
+```
+
+## Docs
+
+- [API Reference](docs/api.md)
+- [CLI Reference](docs/cli.md)
+- [Data Format](docs/data-format.md)
+- [Use Cases](docs/use-cases.md)
+- [Development](docs/development.md)
+
+## Attribution
+
+Data derived from the [Google Books Ngram](https://books.google.com/ngrams) dataset.
+
+## License
+
+Proprietary. See [LICENSE](LICENSE).

--- /dev/null
+++ gngram_lookup-0.2.0/gngram_counter/__init__.py
@@ -0,0 +1,14 @@
+"""gngram-counter: Google Ngram frequency counter."""
+
+from gngram_counter.data import get_data_dir, get_hash_file, is_data_installed
+from gngram_counter.lookup import FrequencyData, batch_frequency, exists, frequency
+
+__all__ = [
+    "get_data_dir",
+    "get_hash_file",
+    "is_data_installed",
+    "exists",
+    "frequency",
+    "batch_frequency",
+    "FrequencyData",
+]

--- /dev/null
+++ gngram_lookup-0.2.0/gngram_counter/cli.py
@@ -0,0 +1,36 @@
+"""CLI entry points for gngram-counter."""
+
+import sys
+
+from gngram_counter.lookup import exists, frequency
+
+
+def gngram_exists() -> None:
+    """Check if a word exists in the ngram data."""
+    if len(sys.argv) != 2:
+        print("Usage: gngram-exists <word>")
+        sys.exit(1)
+
+    word = sys.argv[1]
+    result = exists(word)
+    print(result)
+    sys.exit(0 if result else 1)
+
+
+def gngram_freq() -> None:
+    """Get frequency data for a word."""
+    if len(sys.argv) != 2:
+        print("Usage: gngram-freq <word>")
+        sys.exit(1)
+
+    word = sys.argv[1]
+    result = frequency(word)
+    if result is None:
+        print("None")
+        sys.exit(1)
+
+    print(f"peak_tf_decade: {result['peak_tf']}")
+    print(f"peak_df_decade: {result['peak_df']}")
+    print(f"sum_tf: {result['sum_tf']}")
+    print(f"sum_df: {result['sum_df']}")
+    sys.exit(0)

--- /dev/null
+++ gngram_lookup-0.2.0/gngram_counter/data.py
@@ -0,0 +1,43 @@
+"""
+Data path utilities for gngram-counter.
+"""
+
+from pathlib import Path
+
+DATA_DIR = Path.home() / ".gngram-counter" / "data"
+
+
+def get_data_dir() -> Path:
+    """Return the data directory path."""
+    return DATA_DIR
+
+
+def get_hash_file(prefix: str) -> Path:
+    """Return path to a specific hash bucket parquet file.
+
+    Args:
+        prefix: Two hex characters (00-ff)
+
+    Returns:
+        Path to the parquet file (may be in subdirectory from tar extraction)
+    """
+    # Handle both flat structure and nested (from tar extraction)
+    direct = DATA_DIR / f"{prefix}.parquet"
+    if direct.exists():
+        return direct
+
+    nested = DATA_DIR / "parquet-hash" / f"{prefix}.parquet"
+    if nested.exists():
+        return nested
+
+    raise FileNotFoundError(
+        f"Data file not found for prefix '{prefix}'. "
+        "Run 'python -m gngram_counter.download_data' to download the data files."
+    )
+
+
+def is_data_installed() -> bool:
+    """Check if data files are installed."""
+    if not DATA_DIR.exists():
+        return False
+    return any(DATA_DIR.glob("**/*.parquet"))

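The bucket addressing here pairs with the MD5 scheme in `lookup.py` further below: the first two hex characters of `md5(word.lower())` select one of 256 parquet buckets. A minimal sketch of the path resolution, using only the standard library (`"banana"` is an arbitrary example word):

```python
# Sketch: how a word maps to its bucket file under this layout.
# Mirrors _hash_word() in gngram_counter/lookup.py; the word is arbitrary.
import hashlib
from pathlib import Path

word = "banana"
digest = hashlib.md5(word.lower().encode("utf-8")).hexdigest()
prefix = digest[:2]  # two hex chars, so 256 possible buckets (00-ff)

# get_hash_file(prefix) checks these two candidate locations in order:
data_dir = Path.home() / ".gngram-counter" / "data"
print(data_dir / f"{prefix}.parquet")                   # flat layout
print(data_dir / "parquet-hash" / f"{prefix}.parquet")  # nested (tar) layout
```
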
--- /dev/null
+++ gngram_lookup-0.2.0/gngram_counter/download_data.py
@@ -0,0 +1,89 @@
+"""
+Download gngram data files from GitHub.
+
+Usage:
+    python -m gngram_counter.download_data
+
+Downloads parquet hash files to ~/.gngram-counter/data/
+"""
+
+import sys
+import tarfile
+import urllib.request  # also binds urllib.error, used in the handlers below
+from io import BytesIO
+from pathlib import Path
+
+GITHUB_REPO = "craigtrim/gngram-counter"
+DATA_VERSION = "v1.0.0"
+DATA_FILENAME = "parquet-hash.tar.gz"
+
+DATA_DIR = Path.home() / ".gngram-counter" / "data"
+
+
+def get_download_url() -> str:
+    return f"https://github.com/{GITHUB_REPO}/releases/download/{DATA_VERSION}/{DATA_FILENAME}"
+
+
+def download_and_extract() -> None:
+    url = get_download_url()
+    print(f"Downloading gngram data from {url}")
+    print(f"Destination: {DATA_DIR}")
+
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            total_size = response.headers.get("Content-Length")
+            if total_size:
+                total_size = int(total_size)
+                print(f"Size: {total_size / 1024 / 1024:.1f} MB")
+
+            data = BytesIO()
+            downloaded = 0
+            chunk_size = 1024 * 1024  # 1MB chunks
+
+            while True:
+                chunk = response.read(chunk_size)
+                if not chunk:
+                    break
+                data.write(chunk)
+                downloaded += len(chunk)
+                if total_size:
+                    pct = downloaded / total_size * 100
+                    print(f"\r  {downloaded / 1024 / 1024:.1f} MB ({pct:.0f}%)", end="", flush=True)
+                else:
+                    print(f"\r  {downloaded / 1024 / 1024:.1f} MB", end="", flush=True)
+
+        print()
+        data.seek(0)
+
+        print("Extracting...")
+        with tarfile.open(fileobj=data, mode="r:gz") as tar:
+            tar.extractall(DATA_DIR, filter="data")  # PEP 706 extraction filter (3.11.4+)
+
+    except urllib.error.HTTPError as e:
+        print(f"Error: Failed to download ({e.code} {e.reason})")
+        print(f"URL: {url}")
+        print("Make sure the release exists and the file is attached.")
+        sys.exit(1)
+    except urllib.error.URLError as e:
+        print(f"Error: Network error ({e.reason})")
+        sys.exit(1)
+
+    parquet_files = list(DATA_DIR.glob("**/*.parquet"))
+    print(f"Done: {len(parquet_files)} parquet files installed to {DATA_DIR}")
+
+
+def main() -> None:
+    if DATA_DIR.exists() and any(DATA_DIR.glob("**/*.parquet")):
+        print(f"Data already exists at {DATA_DIR}")
+        response = input("Re-download and overwrite? [y/N]: ").strip().lower()
+        if response != "y":
+            print("Cancelled.")
+            return
+
+    download_and_extract()
+
+
+if __name__ == "__main__":
+    main()

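A quick post-install check using only helpers that ship in this package; the 256-file expectation is an inference from the two-hex-character prefixes in `data.py`, not a documented count:

```python
# Verify the download using the package's own helpers.
from gngram_counter.data import get_data_dir, is_data_installed

if is_data_installed():
    buckets = list(get_data_dir().glob("**/*.parquet"))
    print(f"{len(buckets)} bucket files under {get_data_dir()}")  # up to 256 (00-ff)
else:
    print("Missing data. Run: python -m gngram_counter.download_data")
```
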
--- /dev/null
+++ gngram_lookup-0.2.0/gngram_counter/lookup.py
@@ -0,0 +1,138 @@
+"""
+High-level lookup API for gngram-counter.
+
+Provides simple functions for word frequency lookups similar to bnc-lookup.
+"""
+
+import hashlib
+from functools import lru_cache
+from typing import TypedDict
+
+import polars as pl
+
+from gngram_counter.data import get_hash_file, is_data_installed
+
+
+class FrequencyData(TypedDict):
+    """Frequency data for a word."""
+
+    peak_tf: int  # Decade with highest term frequency
+    peak_df: int  # Decade with highest document frequency
+    sum_tf: int   # Total term frequency across all decades
+    sum_df: int   # Total document frequency across all decades
+
+
+@lru_cache(maxsize=256)
+def _load_bucket(prefix: str) -> pl.DataFrame:
+    """Load and cache a parquet bucket file."""
+    return pl.read_parquet(get_hash_file(prefix))
+
+
+def _hash_word(word: str) -> tuple[str, str]:
+    """Hash a word and return (prefix, suffix)."""
+    h = hashlib.md5(word.lower().encode("utf-8")).hexdigest()
+    return h[:2], h[2:]
+
+
+def exists(word: str) -> bool:
+    """Check if a word exists in the ngram data.
+
+    Args:
+        word: The word to check (case-insensitive)
+
+    Returns:
+        True if the word exists, False otherwise
+
+    Raises:
+        FileNotFoundError: If data files are not installed
+    """
+    if not is_data_installed():
+        raise FileNotFoundError(
+            "Data files not installed. Run: python -m gngram_counter.download_data"
+        )
+
+    prefix, suffix = _hash_word(word)
+    df = _load_bucket(prefix)
+    return len(df.filter(pl.col("hash") == suffix)) > 0
+
+
+def frequency(word: str) -> FrequencyData | None:
+    """Get frequency data for a word.
+
+    Args:
+        word: The word to look up (case-insensitive)
+
+    Returns:
+        FrequencyData dict with peak_tf, peak_df, sum_tf, sum_df, or None if not found
+
+    Raises:
+        FileNotFoundError: If data files are not installed
+    """
+    if not is_data_installed():
+        raise FileNotFoundError(
+            "Data files not installed. Run: python -m gngram_counter.download_data"
+        )
+
+    prefix, suffix = _hash_word(word)
+    df = _load_bucket(prefix)
+    row = df.filter(pl.col("hash") == suffix)
+
+    if len(row) == 0:
+        return None
+
+    return FrequencyData(
+        peak_tf=row["peak_tf"][0],
+        peak_df=row["peak_df"][0],
+        sum_tf=row["sum_tf"][0],
+        sum_df=row["sum_df"][0],
+    )
+
+
+def batch_frequency(words: list[str]) -> dict[str, FrequencyData | None]:
+    """Get frequency data for multiple words.
+
+    Args:
+        words: List of words to look up (case-insensitive)
+
+    Returns:
+        Dict mapping each word to its FrequencyData or None if not found
+
+    Raises:
+        FileNotFoundError: If data files are not installed
+    """
+    if not is_data_installed():
+        raise FileNotFoundError(
+            "Data files not installed. Run: python -m gngram_counter.download_data"
+        )
+
+    # Group words by bucket prefix for efficient batch lookups
+    by_prefix: dict[str, list[tuple[str, str]]] = {}
+    for word in words:
+        prefix, suffix = _hash_word(word)
+        if prefix not in by_prefix:
+            by_prefix[prefix] = []
+        by_prefix[prefix].append((word, suffix))
+
+    results: dict[str, FrequencyData | None] = {}
+
+    for prefix, word_suffix_pairs in by_prefix.items():
+        df = _load_bucket(prefix)
+        suffixes = [s for _, s in word_suffix_pairs]
+
+        # Filter to all matching suffixes at once
+        matches = df.filter(pl.col("hash").is_in(suffixes))
+        match_dict = {row["hash"]: row for row in matches.iter_rows(named=True)}
+
+        for word, suffix in word_suffix_pairs:
+            if suffix in match_dict:
+                row = match_dict[suffix]
+                results[word] = FrequencyData(
+                    peak_tf=row["peak_tf"],
+                    peak_df=row["peak_df"],
+                    sum_tf=row["sum_tf"],
+                    sum_df=row["sum_df"],
+                )
+            else:
+                results[word] = None
+
+    return results

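The bucket schema this module assumes is a `hash` column holding the 30-character MD5 suffix plus four count columns. A self-contained sketch that builds a one-row synthetic bucket and runs the same Polars filter as `exists()` and `frequency()`; the counts are the README's example values, not real data:

```python
# Synthetic one-row bucket exercising the filter logic from lookup.py.
import hashlib

import polars as pl

h = hashlib.md5("computer".encode("utf-8")).hexdigest()
suffix = h[2:]  # lookup.py stores the 30-char suffix in the "hash" column

bucket = pl.DataFrame({
    "hash": [suffix],
    "peak_tf": [2000],
    "peak_df": [2000],
    "sum_tf": [892451],
    "sum_df": [312876],
})

row = bucket.filter(pl.col("hash") == suffix)
print(len(row) > 0)      # True   -- what exists() computes
print(row["sum_tf"][0])  # 892451 -- what frequency() extracts
```
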
--- /dev/null
+++ gngram_lookup-0.2.0/pyproject.toml
@@ -0,0 +1,60 @@
+[tool.poetry]
+name = "gngram-lookup"
+packages = [{include = "gngram_counter"}]
+version = "0.2.0"
+description = "Static Hash-Based Lookup for Google Ngram Frequencies"
+authors = ["Craig Trim <craigtrim@gmail.com>"]
+maintainers = ["Craig Trim <craigtrim@gmail.com>"]
+readme = "README.md"
+license = "Proprietary"
+exclude = ["parquet-hash/", "builder/"]
+
+keywords = [
+    "ngram",
+    "google-ngram",
+    "nlp",
+    "natural-language-processing",
+    "frequency",
+    "linguistics",
+]
+
+repository = "https://github.com/craigtrim/gngram-lookup"
+
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Text Processing :: Linguistic",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+]
+
+[tool.poetry.scripts]
+gngram-exists = "gngram_counter.cli:gngram_exists"
+gngram-freq = "gngram_counter.cli:gngram_freq"
+
+[tool.poetry.build]
+generate-setup-file = true
+
+[tool.poetry.dependencies]
+python = "^3.11"
+polars = "^1.0"
+pyarrow = "^18.0"
+
+[tool.poetry.dev-dependencies]
+autopep8 = "*"
+ruff = "*"
+pre-commit = "^2.20.0"
+pytest = "*"
+
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core>=1.0.0"]
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+ignore = ["F403", "F405"]

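The `setup.py` that follows is generated, not hand-written: `generate-setup-file = true` above instructs poetry-core to emit it into the sdist, which is why its metadata mirrors the `[tool.poetry]` table.
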
--- /dev/null
+++ gngram_lookup-0.2.0/setup.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+from setuptools import setup
+
+packages = \
+['gngram_counter']
+
+package_data = \
+{'': ['*']}
+
+install_requires = \
+['polars>=1.0,<2.0', 'pyarrow>=18.0,<19.0']
+
+entry_points = \
+{'console_scripts': ['gngram-exists = gngram_counter.cli:gngram_exists',
+                     'gngram-freq = gngram_counter.cli:gngram_freq']}
+
+setup_kwargs = {
+    'name': 'gngram-lookup',
+    'version': '0.2.0',
+    'description': 'Static Hash-Based Lookup for Google Ngram Frequencies',
+    'long_description': "# gngram-lookup\n\n[](https://badge.fury.io/py/gngram-lookup)\n[](https://www.python.org/downloads/)\n\nWord frequency from 500 years of books. O(1) lookup. 5 million words.\n\n## Install\n\n```bash\npip install gngram-lookup\npython -m gngram_counter.download_data\n```\n\n## Python\n\n```python\nimport gngram_counter as ng\n\nng.exists('computer') # True\nng.exists('xyznotaword') # False\n\nng.frequency('computer')\n# {'peak_tf': 2000, 'peak_df': 2000, 'sum_tf': 892451, 'sum_df': 312876}\n\nng.batch_frequency(['the', 'algorithm', 'xyznotaword'])\n# {'the': {...}, 'algorithm': {...}, 'xyznotaword': None}\n```\n\n## CLI\n\n```bash\ngngram-exists computer # True, exit 0\ngngram-exists xyznotaword # False, exit 1\n\ngngram-freq computer\n# peak_tf_decade: 2000\n# peak_df_decade: 2000\n# sum_tf: 892451\n# sum_df: 312876\n```\n\n## Docs\n\n- [API Reference](docs/api.md)\n- [CLI Reference](docs/cli.md)\n- [Data Format](docs/data-format.md)\n- [Use Cases](docs/use-cases.md)\n- [Development](docs/development.md)\n\n## Attribution\n\nData derived from the [Google Books Ngram](https://books.google.com/ngrams) dataset.\n\n## License\n\nProprietary. See [LICENSE](LICENSE).\n",
+    'author': 'Craig Trim',
+    'author_email': 'craigtrim@gmail.com',
+    'maintainer': 'Craig Trim',
+    'maintainer_email': 'craigtrim@gmail.com',
+    'url': 'https://github.com/craigtrim/gngram-lookup',
+    'packages': packages,
+    'package_data': package_data,
+    'install_requires': install_requires,
+    'entry_points': entry_points,
+    'python_requires': '>=3.11,<4.0',
+}
+
+
+setup(**setup_kwargs)