google-ngrams 0.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
# flake8: noqa

# Resolve the installed distribution's version string ----
from importlib.metadata import version as _pkg_version

__version__ = _pkg_version("google_ngrams")

# Drop the helper so it does not leak into the package namespace.
del _pkg_version

# Public API re-exports ----
from .ngrams import google_ngram
from .vnc import TimeSeries

__all__ = ['google_ngram', 'TimeSeries']
@@ -0,0 +1,14 @@
1
# flake8: noqa

from importlib_resources import files as _files

# Root of the parquet tables bundled with the package.
_data_dir = _files("google_ngrams") / "data"

# Google Books total-count tables, keyed by corpus variety.
sources = {
    "eng_all": _data_dir / "googlebooks_eng_all_totalcounts_20120701.parquet",
    "gb_all": _data_dir / "googlebooks_eng_gb_all_totalcounts_20120701.parquet",
    "us_all": _data_dir / "googlebooks_eng_us_all_totalcounts_20120701.parquet",
}


def __dir__():
    """Advertise the available source keys via dir()."""
    return [*sources]
@@ -0,0 +1,209 @@
1
+ import re
2
+ import polars as pl
3
+ from textwrap import dedent
4
+ from typing import List
5
+ from .data import sources
6
+
7
+
8
def google_ngram(word_forms: List[str],
                 variety="eng",
                 by="decade") -> pl.DataFrame:
    """
    Fetch Google Ngram data for specified word forms.

    Downloads the matching shard of the Google Books Ngram corpus
    (20120701 release), filters it to the requested word forms, joins
    the counts against the bundled total-count tables, and returns
    frequencies aggregated by year or by decade.

    Parameters
    ----------
    word_forms : List[str]
        Word forms to search for; they should be lemmas of the same
        word (e.g. 'walk', 'walks', 'walked').
    variety : str
        Variety of English ('eng', 'gb', 'us', 'fiction'). Note that
        total-count tables are bundled only for 'eng', 'gb' and 'us';
        'fiction' is rejected up front with a ValueError.
    by : str
        Aggregation level ('year' or 'decade').

    Returns
    -------
    pl.DataFrame
        DataFrame with Token, Year (or Decade), AF (absolute frequency)
        and RF (relative frequency per million tokens) columns.

    Raises
    ------
    ValueError
        On an unknown `variety` or `by`, empty input, mixed-length or
        over-long ngrams, or word forms that do not share a common
        shard prefix.
    """
    variety_types = ["eng", "gb", "us", "fiction"]
    if variety not in variety_types:
        # BUG FIX: the message previously began with stray debug text
        # ("variety_types") left in the literal.
        raise ValueError(
            "Invalid variety type. Expected one of: %s" % variety_types)
    by_types = ["year", "decade"]
    if by not in by_types:
        # BUG FIX: this message previously mislabeled itself as
        # "variety_types".
        raise ValueError(
            "Invalid by type. Expected one of: %s" % by_types)
    if not word_forms:
        # Guard: an empty list previously crashed later with IndexError.
        raise ValueError("word_forms must contain at least one word form.")

    # Total-count tables shipped with the package, keyed by variety.
    # BUG FIX: 'fiction' passed validation but had no totals entry, so
    # f_path was left unbound and the function crashed with a NameError
    # *after* the (potentially very large) download. Fail fast here,
    # before any network I/O.
    totals_keys = {"eng": "eng_all", "gb": "gb_all", "us": "us_all"}
    if variety not in totals_keys:
        raise ValueError(
            "Total counts are not bundled for variety '%s'. "
            "Expected one of: %s" % (variety, list(totals_keys)))

    # The corpus tokenizes hyphenated words with spaces around the
    # hyphen ('x-ray' -> 'x - ray'); mirror that before counting tokens.
    word_forms = [re.sub(r'([a-zA-Z0-9])-([a-zA-Z0-9])',
                         r'\1 - \2', wf) for wf in word_forms]
    word_forms = [wf.strip() for wf in word_forms]
    n = list(set(len(re.findall(r'\S+', wf)) for wf in word_forms))

    if len(n) > 1:
        raise ValueError("""Check spelling.
                         Word forms should be lemmas of the same word
                         (e.g. 'teenager' and 'teenagers'
                         or 'walk', 'walks' and 'walked'
                         """)
    if n[0] > 5:
        raise ValueError("""Ngrams can be a maximum of 5 tokens.
                         Hyphenated words are split and include the hyphen,
                         so 'x-ray' would count as 3 tokens.
                         """)

    # The repository shards files by the first 1-2 characters of the
    # ngram; all word forms must map onto the same shard.
    gram = [wf[:2] if n[0] > 1 else wf[:1] for wf in word_forms]
    gram = list(set(g.lower() for g in gram))

    if len(gram) > 1:
        raise ValueError("""Check spelling.
                         Word forms should be lemmas of the same word
                         (e.g. 'teenager' and 'teenagers'
                         or 'walk', 'walks' and 'walked'
                         """)

    # Map the prefix onto Google's shard-naming scheme: letter followed
    # by a non-letter -> underscore shard; leading digit -> single-digit
    # shard; leading punctuation -> the 'punctuation' shard; assorted
    # non-Latin-1 letters -> the 'other' shard.
    if re.match(r'^[a-z][^a-z]', gram[0]):
        gram[0] = re.sub(r'[^a-z]', '_', gram[0])
    if re.match(r'^[0-9]', gram[0]):
        gram[0] = gram[0][:1]
    if re.match(r'^[\W]', gram[0]):
        gram[0] = "punctuation"

    if any(re.match(r'^[ßæðøłœıƒþȥəħŋªºɣđijɔȝⅰʊʌʔɛȡɋⅱʃɇɑⅲ]', g) for g in gram):
        gram[0] = "other"

    # Shard names are Latin-1; replace anything unrepresentable.
    gram[0] = gram[0].encode('latin-1', 'replace').decode('latin-1')

    if variety == "eng":
        repo = f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
    else:
        repo = f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-{variety}-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501

    print(dedent(
        """
        Accessing repository. For larger ones
        (e.g., ngrams containing 2 or more words).
        This may take a few minutes...
        """
    ))

    # Escape regex metacharacters in the word forms, then build an
    # anchored, case-insensitive alternation for the token filter.
    word_forms = [re.sub(
        r'(\.|\?|\$|\^|\)|\(|\}|\{|\]|\[|\*)',
        r'\\\1', wf
    ) for wf in word_forms]

    grep_words = "|".join([f"^{wf}$" for wf in word_forms])

    # Read the data from the google repository lazily and filter so
    # only matching rows are materialized.
    schema = {"column_1": pl.String,
              "column_2": pl.Int64,
              "column_3": pl.Int64,
              "column_4": pl.Int64}
    df = pl.scan_csv(repo, separator='\t', has_header=False, schema=schema)
    filtered_df = df.filter(
        pl.col("column_1").str.contains(r"(?i)" + grep_words)
    )
    all_grams = filtered_df.collect(streaming=True)

    all_grams = (
        all_grams
        .rename(
            {"column_1": "Token",
             "column_2": "Year",
             "column_3": "AF"}
        )
    ).drop("column_4")

    # Read the bundled totals for this variety.
    f_path = sources.get(totals_keys[variety])
    total_counts = pl.read_parquet(f_path)
    # BUG FIX: this cast was previously performed twice back to back.
    total_counts = total_counts.cast({"Year": pl.UInt32,
                                      "Total": pl.UInt64,
                                      "Pages": pl.UInt64,
                                      "Volumes": pl.UInt64})

    # Convert Year to datetime, fill gap years with zeros, and sum the
    # totals per year.
    total_counts = (
        total_counts
        .with_columns(
            pl.col("Year")
            .cast(pl.String).str.to_datetime("%Y")
        )
        .sort("Year")
        .upsample(time_column="Year", every="1y")
        .with_columns(
            pl.col(["Total", "Pages", "Volumes"])
            .fill_null(strategy="zero")
        )
    )
    total_counts = (
        total_counts
        .group_by_dynamic(
            "Year", every="1y"
        ).agg(pl.col("Total").sum())
    )

    # Sum token totals, convert to datetime, and fill in missing years
    # with zeros.
    sum_tokens = all_grams.group_by("Year", maintain_order=True).sum()
    sum_tokens = (
        sum_tokens
        .with_columns(
            pl.col("Year")
            .cast(pl.String).str.to_datetime("%Y")
        )
        .sort("Year")
        .upsample(time_column="Year", every="1y")
        .with_columns(
            pl.col("AF")
            .fill_null(strategy="zero")
        )
    ).drop("Token")
    # Join the per-year token counts with the corpus totals.
    sum_tokens = sum_tokens.join(total_counts, on="Year")

    if by == "decade":
        sum_tokens = (
            sum_tokens
            .group_by_dynamic("Year", every="10y")
            .agg(pl.col(["AF", "Total"]).sum())
        )
    # Normalize to relative frequency per million tokens.
    sum_tokens = (
        sum_tokens
        .with_columns(
            RF=pl.col("AF").truediv("Total").mul(1000000)
        )
        .with_columns(
            pl.col("RF").fill_nan(0)
        )
    )
    sum_tokens.insert_column(1, (pl.lit(word_forms)).alias("Token"))
    sum_tokens = (
        sum_tokens
        .with_columns(
            pl.col("Year").dt.year().alias("Year")
        )
        .drop("Total")
    )

    if by == "decade":
        sum_tokens = sum_tokens.rename({"Year": "Decade"})

    return sum_tokens