google-ngrams 0.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google_ngrams/__init__.py +16 -0
- google_ngrams/data/__init__.py +14 -0
- google_ngrams/data/googlebooks_eng_all_totalcounts_20120701.parquet +0 -0
- google_ngrams/data/googlebooks_eng_gb_all_totalcounts_20120701.parquet +0 -0
- google_ngrams/data/googlebooks_eng_us_all_totalcounts_20120701.parquet +0 -0
- google_ngrams/ngrams.py +209 -0
- google_ngrams/vnc.py +1123 -0
- google_ngrams-0.1.0.dist-info/LICENSE +201 -0
- google_ngrams-0.1.0.dist-info/METADATA +126 -0
- google_ngrams-0.1.0.dist-info/RECORD +12 -0
- google_ngrams-0.1.0.dist-info/WHEEL +6 -0
- google_ngrams-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
# flake8: noqa
"""Public package interface for google_ngrams.

Exposes the installed package version plus the two public entry points:
the `google_ngram` fetch function and the `TimeSeries` analysis class.
"""

# Set version ----
from importlib.metadata import version as _v

# Resolve the version from installed package metadata so it is defined
# in exactly one place (the package build configuration).
__version__ = _v("google_ngrams")

del _v  # keep the helper alias out of the public namespace

# Imports ----

from .ngrams import google_ngram
from .vnc import TimeSeries

__all__ = ['google_ngram', 'TimeSeries']
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# flake8: noqa
"""Bundled Google Books total-count data files, keyed by English variety."""

from importlib_resources import files as _files


def _resource(name):
    # Resolve a parquet file shipped inside the google_ngrams/data package.
    return _files("google_ngrams") / f"data/{name}"


# Variety key -> packaged per-year total-counts parquet file.
sources = {
    "eng_all": _resource("googlebooks_eng_all_totalcounts_20120701.parquet"),
    "gb_all": _resource("googlebooks_eng_gb_all_totalcounts_20120701.parquet"),
    "us_all": _resource("googlebooks_eng_us_all_totalcounts_20120701.parquet"),
}


def __dir__():
    # Advertise only the variety keys when the module is inspected.
    return list(sources)
|
14
|
+
|
Binary file
|
Binary file
|
Binary file
|
google_ngrams/ngrams.py
ADDED
@@ -0,0 +1,209 @@
|
|
1
|
+
import re
|
2
|
+
import polars as pl
|
3
|
+
from textwrap import dedent
|
4
|
+
from typing import List
|
5
|
+
from .data import sources
|
6
|
+
|
7
|
+
|
8
|
+
def google_ngram(word_forms: List[str],
                 variety="eng",
                 by="decade") -> pl.DataFrame:
    """
    Fetches Google Ngram data for specified word forms.

    Downloads the relevant ngram shard from the Google Books repository,
    filters it (case-insensitively) to the given word forms, joins the
    counts against the bundled per-year totals, and returns absolute (AF)
    and relative (RF, per million tokens) frequencies.

    Parameters
    ----------
    word_forms : List[str]
        Word forms to search for. They should all be inflections of the
        same lemma (e.g. 'walk', 'walks', 'walked').
    variety : str
        Variety of English ('eng', 'gb', 'us', 'fiction'). Note that
        per-year totals for 'fiction' are not bundled with this package,
        so only 'eng', 'gb' and 'us' can currently be fully resolved.
    by : str
        Aggregation level ('year' or 'decade').

    Returns
    -------
    pl.DataFrame
        DataFrame with columns 'Year' (or 'Decade'), 'Token', 'AF', 'RF'.

    Raises
    ------
    ValueError
        If `variety` or `by` is invalid, `word_forms` is empty or mixes
        different lemmas, the ngram exceeds 5 tokens, or totals for the
        requested variety are not bundled.
    """
    variety_types = ["eng", "gb", "us", "fiction"]
    if variety not in variety_types:
        raise ValueError(
            "Invalid variety type. Expected one of: %s" % variety_types
        )
    by_types = ["year", "decade"]
    if by not in by_types:
        # BUG FIX: this message previously began with "variety_types"
        # even though it reports an invalid `by` argument.
        raise ValueError(
            "Invalid by type. Expected one of: %s" % by_types
        )
    # BUG FIX: an empty list previously crashed with IndexError on n[0].
    if not word_forms:
        raise ValueError("word_forms must contain at least one word form.")

    # Hyphenated words are tokenized with the hyphen as a separate token,
    # matching the convention used in the Google Books source files.
    word_forms = [re.sub(r'([a-zA-Z0-9])-([a-zA-Z0-9])',
                         r'\1 - \2', wf) for wf in word_forms]
    word_forms = [wf.strip() for wf in word_forms]
    n = list({len(re.findall(r'\S+', wf)) for wf in word_forms})

    if len(n) > 1:
        raise ValueError("""Check spelling.
        Word forms should be lemmas of the same word
        (e.g. 'teenager' and 'teenagers'
        or 'walk', 'walks' and 'walked'
        """)
    if n[0] > 5:
        raise ValueError("""Ngrams can be a maximum of 5 tokens.
        Hyphenated words are split and include the hyphen,
        so 'x-ray' would count as 3 tokens.
        """)

    # Shard prefix: 1-gram files are sharded by the first character,
    # higher-order ngram files by the first two characters.
    gram = [wf[:2] if n[0] > 1 else wf[:1] for wf in word_forms]
    gram = list(set(g.lower() for g in gram))

    if len(gram) > 1:
        raise ValueError("""Check spelling.
        Word forms should be lemmas of the same word
        (e.g. 'teenager' and 'teenagers'
        or 'walk', 'walks' and 'walked'
        """)

    # Normalize the prefix into a valid shard name used by Google's files.
    if re.match(r'^[a-z][^a-z]', gram[0]):
        gram[0] = re.sub(r'[^a-z]', '_', gram[0])
    if re.match(r'^[0-9]', gram[0]):
        gram[0] = gram[0][:1]
    if re.match(r'^[\W]', gram[0]):
        gram[0] = "punctuation"

    if any(re.match(r'^[ßæðøłœıƒþȥəħŋªºɣđijɔȝⅰʊʌʔɛȡɋⅱʃɇɑⅲ]', g) for g in gram):
        gram[0] = "other"

    gram[0] = gram[0].encode('latin-1', 'replace').decode('latin-1')

    if variety == "eng":
        repo = f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
    else:
        repo = f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-{variety}-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501

    print(dedent(
        """
        Accessing repository. For larger ones
        (e.g., ngrams containing 2 or more words),
        this may take a few minutes...
        """
    ))

    # BUG FIX: metacharacters were previously escaped in `word_forms`
    # itself, so the escaped (backslashed) strings leaked into the output
    # Token column. Escape only in the filter pattern, via re.escape,
    # which also covers characters the old hand-rolled list missed
    # (e.g. '+' and '|').
    grep_words = "|".join(f"^{re.escape(wf)}$" for wf in word_forms)

    # Read the data from the google repository and format.
    schema = {"column_1": pl.String,
              "column_2": pl.Int64,
              "column_3": pl.Int64,
              "column_4": pl.Int64}
    df = pl.scan_csv(repo, separator='\t', has_header=False, schema=schema)
    filtered_df = df.filter(
        pl.col("column_1").str.contains(r"(?i)" + grep_words)
    )
    all_grams = filtered_df.collect(streaming=True)

    all_grams = (
        all_grams
        .rename(
            {"column_1": "Token",
             "column_2": "Year",
             "column_3": "AF"}
        )
    ).drop("column_4")

    # Read the bundled per-year totals for the chosen variety.
    totals_keys = {"eng": "eng_all", "gb": "gb_all", "us": "us_all"}
    if variety not in totals_keys:
        # BUG FIX: 'fiction' previously fell through the if/elif chain and
        # crashed with UnboundLocalError at pl.read_parquet(f_path).
        raise ValueError(
            "Total counts for variety 'fiction' are not bundled with this "
            "package. Use one of: ['eng', 'gb', 'us']."
        )
    f_path = sources.get(totals_keys[variety])

    total_counts = pl.read_parquet(f_path)
    # Format totals (cast once; this was previously duplicated).
    total_counts = total_counts.cast({"Year": pl.UInt32,
                                      "Total": pl.UInt64,
                                      "Pages": pl.UInt64,
                                      "Volumes": pl.UInt64})

    # Convert Year to datetime, fill missing years with zero, and sum.
    total_counts = (
        total_counts
        .with_columns(
            pl.col("Year")
            .cast(pl.String).str.to_datetime("%Y")
        )
        .sort("Year")
        .upsample(time_column="Year", every="1y")
        .with_columns(
            pl.col(["Total", "Pages", "Volumes"])
            .fill_null(strategy="zero")
        )
    )
    total_counts = (
        total_counts
        .group_by_dynamic(
            "Year", every="1y"
        ).agg(pl.col("Total").sum())
    )

    # Sum token totals, convert to datetime and fill in missing years.
    sum_tokens = all_grams.group_by("Year", maintain_order=True).sum()
    sum_tokens = (
        sum_tokens
        .with_columns(
            pl.col("Year")
            .cast(pl.String).str.to_datetime("%Y")
        )
        .sort("Year")
        .upsample(time_column="Year", every="1y")
        .with_columns(
            pl.col("AF")
            .fill_null(strategy="zero")
        )
    ).drop("Token")
    # Join with totals.
    sum_tokens = sum_tokens.join(total_counts, on="Year")

    if by == "decade":
        sum_tokens = (
            sum_tokens
            .group_by_dynamic("Year", every="10y")
            .agg(pl.col(["AF", "Total"]).sum())
        )
    # Normalize RF per million tokens.
    sum_tokens = (
        sum_tokens
        .with_columns(
            RF=pl.col("AF").truediv("Total").mul(1000000)
        )
        .with_columns(
            pl.col("RF").fill_nan(0)
        )
    )
    # Label rows with the (unescaped) word forms that were queried.
    sum_tokens.insert_column(1, (pl.lit(word_forms)).alias("Token"))
    sum_tokens = (
        sum_tokens
        .with_columns(
            pl.col("Year").dt.year().alias("Year")
        )
        .drop("Total")
    )

    if by == "decade":
        sum_tokens = sum_tokens.rename({"Year": "Decade"})

    return sum_tokens
|