google-ngrams 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/LICENSE +21 -60
- {google_ngrams-0.1.0/google_ngrams.egg-info → google_ngrams-0.2.0}/PKG-INFO +38 -20
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/README.rst +18 -10
- google_ngrams-0.2.0/google_ngrams/__init__.py +19 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/__init__.py +1 -1
- google_ngrams-0.2.0/google_ngrams/ngrams.py +341 -0
- google_ngrams-0.2.0/google_ngrams/scatter_helpers.py +187 -0
- google_ngrams-0.2.0/google_ngrams/vnc.py +518 -0
- google_ngrams-0.2.0/google_ngrams/vnc_helpers.py +809 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0/google_ngrams.egg-info}/PKG-INFO +38 -20
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams.egg-info/SOURCES.txt +7 -6
- google_ngrams-0.2.0/google_ngrams.egg-info/requires.txt +13 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/pyproject.toml +25 -14
- google_ngrams-0.2.0/tests/test_ngrams.py +56 -0
- google_ngrams-0.2.0/tests/test_ngrams_realdata.py +62 -0
- google_ngrams-0.2.0/tests/test_vnc.py +85 -0
- google_ngrams-0.2.0/tests/test_vnc_compat.py +178 -0
- google_ngrams-0.1.0/.github/workflows/ci.yml +0 -98
- google_ngrams-0.1.0/.gitignore +0 -174
- google_ngrams-0.1.0/_quarto/_quarto.yml +0 -104
- google_ngrams-0.1.0/docs/.gitkeep +0 -1
- google_ngrams-0.1.0/docs/google_ngrams.ipynb +0 -913
- google_ngrams-0.1.0/google_ngrams/__init__.py +0 -16
- google_ngrams-0.1.0/google_ngrams/ngrams.py +0 -209
- google_ngrams-0.1.0/google_ngrams/vnc.py +0 -1123
- google_ngrams-0.1.0/google_ngrams.egg-info/requires.txt +0 -4
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/googlebooks_eng_all_totalcounts_20120701.parquet +0 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/googlebooks_eng_gb_all_totalcounts_20120701.parquet +0 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/googlebooks_eng_us_all_totalcounts_20120701.parquet +0 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams.egg-info/dependency_links.txt +0 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams.egg-info/top_level.txt +0 -0
- {google_ngrams-0.1.0 → google_ngrams-0.2.0}/setup.cfg +0 -0
{google_ngrams-0.1.0 → google_ngrams-0.2.0}/LICENSE
@@ -1,63 +1,24 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
+MIT License
+
+Copyright (c) 2025 David Brown
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
 
       "Contributor" shall mean Licensor and any individual or Legal Entity
       on behalf of whom a Contribution has been received by Licensor and
{google_ngrams-0.1.0/google_ngrams.egg-info → google_ngrams-0.2.0}/PKG-INFO
@@ -1,30 +1,40 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: google_ngrams
-Version: 0.
+Version: 0.2.0
 Summary: Fetch and analyze Google Ngram data for specified word forms.
 Author-email: David Brown <dwb2@andrew.cmu.edu>
 Maintainer-email: David Brown <dwb2@andrew.cmu.edu>
+License-Expression: MIT
 Project-URL: Documentation, https://browndw.github.io/google_ngrams
-Project-URL: Homepage, https://github.com/browndw/
+Project-URL: Homepage, https://github.com/browndw/google_ngrams
 Keywords: nlp,language
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier:
-
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
-Requires-Dist:
+Requires-Dist: numpy>=1.22
 Requires-Dist: matplotlib>=3.5
 Requires-Dist: polars>=1.17
-
-
-
-
+Provides-Extra: test
+Requires-Dist: pytest>=7.0; extra == "test"
+Requires-Dist: pytest-cov>=4.0; extra == "test"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0; extra == "dev"
+Requires-Dist: build>=1.0.0; extra == "dev"
+Requires-Dist: twine>=5.0.0; extra == "dev"
+Dynamic: license-file
+
+
+google_ngrams
 =======================================================================================================
-|pypi| |pypi_downloads|
+|pypi| |pypi_downloads| |tests|
 
 This package has functions for processing `Google’s Ngram repositories <http://storage.googleapis.com/books/ngrams/books/datasetsv2.html>`_ without having to download them locally. These repositories vary in their size, but the larger ones (like th one for the letter *s* or common bigrams) contain multiple gigabytes.
 
@@ -33,15 +43,20 @@ The main function uses `scan_csv from the polars <https://docs.pola.rs/api/pytho
 vnc
 ---
 
-To analyze the returned data, the package
+To analyze the returned data, the package also contains functions based on the work of Gries and Hilpert (2012) for `Variability-Based Neighbor Clustering <https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199922765.001.0001/oxfordhb-9780199922765-e-14>`_.
 
-The idea is to use hierarchical clustering to aid "bottom up
+The idea is to use hierarchical clustering to aid "bottom up" periodization of language change. The python functions are built on `their original R code <http://global.oup.com/us/companion.websites/fdscontent/uscompanion/us/static/companion.websites/nevalainen/Gries-Hilpert_web_final/vnc.individual.html>`_.
 
 Distances, therefore, are calculated in sums of standard deviations and coefficients of variation, according to their stated method.
 
-Dendrograms are plotted using matplotlib,
+Dendrograms are plotted using matplotlib, with custom implementations for hierarchical clustering that maintain the plotting order of the leaves according to the requirements of the method.
+
+The package also has a custom implementation of dendrogram truncation that consolidates leaves under a specified number of time periods (or clusters) while also maintaining the leaf order to facilitate the reading and interpretation of large dendrograms.
+
+Lightweight Implementation
+--------------------------
 
-
+Starting with version 0.2.0, google_ngrams uses lightweight, custom implementations for statistical computations instead of heavy dependencies like scipy and statsmodels. This design choice reduces installation overhead while maintaining full functionality for the core VNC methodology and smoothing operations.
 
 
 Installation
@@ -51,7 +66,7 @@ You can install the released version of google_ngrams from `PyPI <https://pypi.o
 
 .. code-block:: install-google_ngrams
 
-    pip install
+    pip install google-ngrams
 
 
 Usage
@@ -113,14 +128,17 @@ For additional information, consult the `documentation <https://browndw.github.i
 License
 -------
 
-Code licensed under `
-See `LICENSE <https://github.com/browndw/
+Code licensed under `MIT License <https://opensource.org/licenses/MIT>`_.
+See `LICENSE <https://github.com/browndw/google_ngrams/blob/main/LICENSE>`_ file.
 
 .. |pypi| image:: https://badge.fury.io/py/google_ngrams.svg
-   :target: https://badge.fury.io/py/
+   :target: https://badge.fury.io/py/google_ngrams
    :alt: PyPI Version
 
 .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/google_ngrams
    :target: https://pypi.org/project/google_ngrams/
   :alt: Downloads from PyPI
 
+.. |tests| image:: https://github.com/browndw/google_ngrams/actions/workflows/test.yml/badge.svg
+   :target: https://github.com/browndw/google_ngrams/actions/workflows/test.yml
+   :alt: Test Status
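
The metadata changes above (the new License-Expression, the Requires-Python floor of 3.10, the explicit numpy dependency, and the test/dev extras) can be checked after installation with only the standard library. A minimal sketch, assuming the 0.2.0 release is installed:

    from importlib.metadata import metadata, version

    meta = metadata("google_ngrams")
    print(version("google_ngrams"))       # expected: 0.2.0
    print(meta["License-Expression"])     # expected: MIT
    print(meta["Requires-Python"])        # expected: >=3.10
    # Core deps plus the new test/dev extras markers
    print(meta.get_all("Requires-Dist"))

The extras themselves would be pulled in with `pip install "google-ngrams[test]"` or `"google-ngrams[dev]"`.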
{google_ngrams-0.1.0 → google_ngrams-0.2.0}/README.rst
@@ -1,7 +1,7 @@
 
-google_ngrams
+google_ngrams
 =======================================================================================================
-|pypi| |pypi_downloads|
+|pypi| |pypi_downloads| |tests|
 
 This package has functions for processing `Google’s Ngram repositories <http://storage.googleapis.com/books/ngrams/books/datasetsv2.html>`_ without having to download them locally. These repositories vary in their size, but the larger ones (like th one for the letter *s* or common bigrams) contain multiple gigabytes.
 
@@ -10,15 +10,20 @@ The main function uses `scan_csv from the polars <https://docs.pola.rs/api/pytho
 vnc
 ---
 
-To analyze the returned data, the package
+To analyze the returned data, the package also contains functions based on the work of Gries and Hilpert (2012) for `Variability-Based Neighbor Clustering <https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199922765.001.0001/oxfordhb-9780199922765-e-14>`_.
 
-The idea is to use hierarchical clustering to aid "bottom up
+The idea is to use hierarchical clustering to aid "bottom up" periodization of language change. The python functions are built on `their original R code <http://global.oup.com/us/companion.websites/fdscontent/uscompanion/us/static/companion.websites/nevalainen/Gries-Hilpert_web_final/vnc.individual.html>`_.
 
 Distances, therefore, are calculated in sums of standard deviations and coefficients of variation, according to their stated method.
 
-Dendrograms are plotted using matplotlib,
+Dendrograms are plotted using matplotlib, with custom implementations for hierarchical clustering that maintain the plotting order of the leaves according to the requirements of the method.
 
-The package also has
+The package also has a custom implementation of dendrogram truncation that consolidates leaves under a specified number of time periods (or clusters) while also maintaining the leaf order to facilitate the reading and interpretation of large dendrograms.
+
+Lightweight Implementation
+--------------------------
+
+Starting with version 0.2.0, google_ngrams uses lightweight, custom implementations for statistical computations instead of heavy dependencies like scipy and statsmodels. This design choice reduces installation overhead while maintaining full functionality for the core VNC methodology and smoothing operations.
 
 
 Installation
@@ -28,7 +33,7 @@ You can install the released version of google_ngrams from `PyPI <https://pypi.o
 
 .. code-block:: install-google_ngrams
 
-    pip install
+    pip install google-ngrams
 
 
 Usage
@@ -90,14 +95,17 @@ For additional information, consult the `documentation <https://browndw.github.i
 License
 -------
 
-Code licensed under `
-See `LICENSE <https://github.com/browndw/
+Code licensed under `MIT License <https://opensource.org/licenses/MIT>`_.
+See `LICENSE <https://github.com/browndw/google_ngrams/blob/main/LICENSE>`_ file.
 
 .. |pypi| image:: https://badge.fury.io/py/google_ngrams.svg
-   :target: https://badge.fury.io/py/
+   :target: https://badge.fury.io/py/google_ngrams
   :alt: PyPI Version
 
 .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/google_ngrams
    :target: https://pypi.org/project/google_ngrams/
   :alt: Downloads from PyPI
 
+.. |tests| image:: https://github.com/browndw/google_ngrams/actions/workflows/test.yml/badge.svg
+   :target: https://github.com/browndw/google_ngrams/actions/workflows/test.yml
+   :alt: Test Status
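
The Variability-Based Neighbor Clustering the new README text describes differs from ordinary agglomerative clustering in one key respect: only temporally adjacent periods may merge, with the merge cost taken from the dispersion of the pooled values. A minimal sketch of that idea, illustrating the Gries and Hilpert (2012) method rather than the package's own `TimeSeries` implementation (the helper name `vnc_merge_order` is hypothetical):

    import statistics

    def vnc_merge_order(years, values):
        """Greedily merge adjacent periods; return the merge history."""
        clusters = [([y], [v]) for y, v in zip(years, values)]
        history = []
        while len(clusters) > 1:
            # Cost of merging each adjacent pair = sd of the pooled values
            costs = [
                statistics.stdev(clusters[i][1] + clusters[i + 1][1])
                for i in range(len(clusters) - 1)
            ]
            i = costs.index(min(costs))
            ys = clusters[i][0] + clusters[i + 1][0]
            vs = clusters[i][1] + clusters[i + 1][1]
            history.append((ys[0], ys[-1], min(costs)))
            clusters[i:i + 2] = [(ys, vs)]
        return history

    # A level shift after 1900 keeps the two regimes separate until the end
    for start, end, cost in vnc_merge_order(
        [1860, 1870, 1880, 1890, 1900, 1910],
        [1.0, 1.1, 0.9, 1.0, 2.0, 2.1],
    ):
        print(f"merge {start}-{end}: sd={cost:.3f}")

Because merges are restricted to neighbors, the leaf order of the resulting dendrogram is the chronological order, which is what the package's custom plotting and truncation code preserves.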
google_ngrams-0.2.0/google_ngrams/__init__.py
@@ -0,0 +1,19 @@
+# flake8: noqa
+
+# Set version ----
+from importlib.metadata import version as _v, PackageNotFoundError as _PNF
+
+try:
+    __version__ = _v("google_ngrams")
+except _PNF:  # Fallback when running from source without installed metadata
+    __version__ = "0.0.0"
+
+del _v
+
+# Imports ----
+
+from .ngrams import google_ngram
+
+from .vnc import TimeSeries
+
+__all__ = ['google_ngram', 'TimeSeries']
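
A quick check of the public surface the new `__init__.py` defines (assuming the package is installed; the version string depends on the installed metadata):

    import google_ngrams

    print(google_ngrams.__version__)  # "0.2.0" when installed from PyPI
    print(google_ngrams.__all__)      # ['google_ngram', 'TimeSeries']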
google_ngrams-0.2.0/google_ngrams/ngrams.py
@@ -0,0 +1,341 @@
+import os
+import re
+import polars as pl
+import warnings
+import logging
+from textwrap import dedent
+from typing import List
+from .data import sources
+
+
+def google_ngram(
+        word_forms: List[str],
+        variety="eng",
+        by="decade"
+) -> pl.DataFrame:
+    """
+    Fetches Google Ngram data for specified word forms.
+
+    This function retrieves ngram data from the Google Books Ngram Viewer
+    for the given word forms. It supports different varieties of English
+    (e.g., British, American) and allows aggregation by year or decade.
+
+    Parameters
+    ----------
+    word_forms : List
+        List of word forms to search for.
+    variety : str
+        Variety of English ('eng', 'gb', 'us').
+    by : str
+        Aggregation level ('year' or 'decade').
+
+    Returns
+    -------
+    pl.DataFrame
+        DataFrame containing the ngram data.
+    """
+    variety_types = ["eng", "gb", "us"]
+    if variety not in variety_types:
+        raise ValueError("""variety_types
+                         Invalid variety type. Expected one of: %s
+                         """ % variety_types)
+    by_types = ["year", "decade"]
+    if by not in by_types:
+        raise ValueError("""variety_types
+                         Invalid by type. Expected one of: %s
+                         """ % by_types)
+    word_forms = [re.sub(r'([a-zA-Z0-9])-([a-zA-Z0-9])',
+                         r'\1 - \2', wf) for wf in word_forms]
+    word_forms = [wf.strip() for wf in word_forms]
+    n = [len(re.findall(r'\S+', wf)) for wf in word_forms]
+    n = list(set(n))
+
+    if len(n) > 1:
+        raise ValueError("""Check spelling.
+                         Word forms should be lemmas of the same word
+                         (e.g. 'teenager' and 'teenagers'
+                         or 'walk', 'walks' and 'walked'
+                         """)
+    if n[0] > 5:
+        raise ValueError("""Ngrams can be a maximum of 5 tokens.
+                         Hyphenated words are split and include the hyphen,
+                         so 'x-ray' would count as 3 tokens.
+                         """)
+
+    gram = [wf[:2] if n[0] > 1 else wf[:1] for wf in word_forms]
+    gram = list(set([g.lower() for g in gram]))
+
+    if len(gram) > 1:
+        raise ValueError("""Check spelling.
+                         Word forms should be lemmas of the same word
+                         (e.g. 'teenager' and 'teenagers'
+                         or 'walk', 'walks' and 'walked'
+                         """)
+
+    if re.match(r'^[a-z][^a-z]', gram[0]):
+        gram[0] = re.sub(r'[^a-z]', '_', gram[0])
+    if re.match(r'^[0-9]', gram[0]):
+        gram[0] = gram[0][:1]
+    if re.match(r'^[\W]', gram[0]):
+        gram[0] = "punctuation"
+
+    if any(re.match(r'^[ßæðøłœıƒþȥəħŋªºɣđijɔȝⅰʊʌʔɛȡɋⅱʃɇɑⅲ]', g) for g in gram):
+        gram[0] = "other"
+
+    gram[0] = gram[0].encode('latin-1', 'replace').decode('latin-1')
+
+    # Use HTTPS for integrity (Google Storage supports it) instead of HTTP
+    if variety == "eng":
+        repo = f"https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
+    else:
+        repo = f"https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-{variety}-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
+
+    logger = logging.getLogger(__name__)
+    logger.info(dedent(
+        """
+        Accessing repository. For larger ones
+        (e.g., ngrams containing 2 or more words).
+        This may take a few minutes...
+        """
+    ))
+
+    # Preserve exact tokens for equality filtering in non-regex fallbacks
+    tokens_exact = list(word_forms)
+    word_forms = [re.sub(
+        r'(\.|\?|\$|\^|\)|\(|\}|\{|\]|\[|\*|\+|\|)',
+        r'\\\1', wf
+    ) for wf in word_forms]
+
+    grep_words = "|".join([f"^{wf}$" for wf in word_forms])
+
+    # Read the data from the google repository and format
+    schema = {"column_1": pl.String,
+              "column_2": pl.Int64,
+              "column_3": pl.Int64,
+              "column_4": pl.Int64}
+    try:
+        df = pl.scan_csv(
+            repo,
+            separator='\t',
+            has_header=False,
+            schema=schema,
+            truncate_ragged_lines=True,
+            low_memory=True,
+            quote_char=None,
+            ignore_errors=True,
+        )
+    except TypeError:
+        # Fallback for environments/tests that monkeypatch scan_csv with a
+        # limited signature. Use minimal, widely-supported args.
+        df = pl.scan_csv(repo, separator='\t', has_header=False, schema=schema)
+    # Push down filter and projection before collection to minimize memory
+    filtered_df = (
+        df
+        .filter(pl.col("column_1").str.contains(r"(?i)" + grep_words))
+        .select([
+            pl.col("column_1").alias("Token"),
+            pl.col("column_2").alias("Year"),
+            pl.col("column_3").alias("AF"),
+        ])
+    )
+
+    # Optional: allow tuning streaming batch size via env
+    try:
+        chunk_sz = os.environ.get("POLARS_STREAMING_CHUNK_SIZE")
+        if chunk_sz:
+            pl.Config.set_streaming_chunk_size(int(chunk_sz))
+    except Exception:
+        pass
+
+    # Collect with streaming fallback for stability across polars versions
+    try:
+        logger.debug("Collecting with engine='streaming'.")
+        all_grams = filtered_df.collect(engine="streaming")
+    except Exception:
+        try:
+            # Older streaming path (deprecated in newer Polars)
+            logger.debug("Collecting with deprecated streaming=True path.")
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    category=DeprecationWarning,
+                    message=r"the `streaming` parameter was deprecated.*",
+                )
+                all_grams = filtered_df.collect(  # type: ignore[arg-type]
+                    streaming=True
+                )
+        except Exception:
+            try:
+                # Plain in-memory collect
+                logger.debug(
+                    "Collecting with in-memory engine (no streaming)."
+                )
+                all_grams = filtered_df.collect()
+            except Exception:
+                # Final memory-safe fallback: batched CSV reader with
+                # per-batch filter
+                logger.debug(
+                    "Falling back to batched CSV reader + per-batch filter."
+                )
+                batch_sz = int(
+                    os.environ.get("POLARS_CSV_BATCH_SIZE", "200000")
+                )
+                try:
+                    reader = pl.read_csv_batched(
+                        repo,
+                        separator='\t',
+                        has_header=False,
+                        ignore_errors=True,
+                        low_memory=True,
+                        batch_size=batch_sz,
+                    )
+                    filtered_batches = []
+                    # Prefer equality match for speed and stability
+                    try:
+                        for batch in reader:  # type: ignore[assignment]
+                            fb = (
+                                batch
+                                .filter(pl.col("column_1").is_in(tokens_exact))
+                                .select([
+                                    pl.col("column_1").alias("Token"),
+                                    pl.col("column_2").alias("Year"),
+                                    pl.col("column_3").alias("AF"),
+                                ])
+                            )
+                            if fb.height:
+                                filtered_batches.append(fb)
+                    except TypeError:
+                        # Fallback for alternate reader APIs
+                        while True:
+                            try:
+                                batches = reader.next_batches(1)
+                            except AttributeError:
+                                break
+                            if not batches:
+                                break
+                            batch = batches[0]
+                            fb = (
+                                batch
+                                .filter(pl.col("column_1").is_in(tokens_exact))
+                                .select([
+                                    pl.col("column_1").alias("Token"),
+                                    pl.col("column_2").alias("Year"),
+                                    pl.col("column_3").alias("AF"),
+                                ])
+                            )
+                            if fb.height:
+                                filtered_batches.append(fb)
+
+                    if filtered_batches:
+                        all_grams = pl.concat(filtered_batches)
+                    else:
+                        all_grams = pl.DataFrame({
+                            "Token": pl.Series([], dtype=pl.String),
+                            "Year": pl.Series([], dtype=pl.Int64),
+                            "AF": pl.Series([], dtype=pl.Int64),
+                        })
+                except Exception as e:
+                    # If batched reader is unavailable, re-raise with guidance
+                    raise RuntimeError(
+                        "Polars batched CSV reader fallback failed; consider "
+                        "upgrading Polars or disabling this code path via "
+                        "environment if necessary."
+                    ) from e
+
+    # read totals
+    if variety == "eng":
+        f_path = sources.get("eng_all")
+    elif variety == "gb":
+        f_path = sources.get("gb_all")
+    elif variety == "us":
+        f_path = sources.get("us_all")
+
+    total_counts = pl.read_parquet(f_path)
+    # format totals, fill missing data, and sum
+    total_counts = total_counts.cast({
+        "Year": pl.UInt32,
+        "Total": pl.UInt64,
+        "Pages": pl.UInt64,
+        "Volumes": pl.UInt64,
+    })
+
+    total_counts = (
+        total_counts
+        .with_columns(
+            pl.col("Year")
+            .cast(pl.String).str.to_datetime("%Y")
+        )
+        .sort("Year")
+        .upsample(time_column="Year", every="1y")
+        .with_columns(
+            pl.col(["Total", "Pages", "Volumes"])
+            .fill_null(strategy="zero")
+        )
+    )
+    total_counts = (
+        total_counts
+        .group_by_dynamic(
+            "Year", every="1y"
+        ).agg(pl.col("Total").sum())
+    )
+
+    # sum token totals, convert to datetime and fill in missing years
+    sum_tokens = (
+        all_grams
+        .group_by("Year", maintain_order=True)
+        .agg(pl.col("AF").sum())
+    )
+    sum_tokens = (
+        sum_tokens
+        .with_columns(
+            pl.col("Year")
+            .cast(pl.String).str.to_datetime("%Y")
+        )
+        .sort("Year")
+        .upsample(time_column="Year", every="1y")
+        .with_columns(
+            pl.col("AF")
+            .fill_null(strategy="zero")
+        )
+    )
+    # join with totals
+    sum_tokens = sum_tokens.join(total_counts, on="Year", how="right")
+    # Fill any missing AF created by the join (years with no token hits)
+    sum_tokens = sum_tokens.with_columns(
+        pl.col("AF").fill_null(strategy="zero")
+    )
+
+    if by == "decade":
+        sum_tokens = (
+            sum_tokens
+            .group_by_dynamic("Year", every="10y")
+            .agg(pl.col(["AF", "Total"]).sum())
+        )
+    # normalize RF per million tokens
+    sum_tokens = (
+        sum_tokens
+        .with_columns(
+            RF=pl.col("AF").truediv("Total").mul(1000000)
+        )
+        .with_columns(
+            pl.col("RF").fill_nan(0)
+        )
+    )
+    sum_tokens.insert_column(1, (pl.lit(word_forms)).alias("Token"))
+    sum_tokens = (
+        sum_tokens
+        .with_columns(
+            pl.col("Year").dt.year().alias("Year")
+        )
+        .drop("Total")
+    )
+
+    if by == "decade":
+        # Avoid .rename to prevent potential segfaults
+        sum_tokens = (
+            sum_tokens
+            .with_columns(pl.col("Year").alias("Decade"))
+            .drop("Year")
+        )
+
+    return sum_tokens
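
A usage sketch of the function above, based on its signature and the columns it constructs; the call streams from the public Google Books bucket, so it can take minutes on larger repositories, and the printed numbers here are illustrative, not real output:

    from google_ngrams import google_ngram

    # One lemma's word forms, aggregated by decade for American English
    df = google_ngram(["teenager", "teenagers"], variety="us", by="decade")

    # Per the code above, the result carries the summed absolute frequency
    # (AF) and a rate per million tokens (RF) for each period.
    print(df.select(["Decade", "Token", "AF", "RF"]).head())

The cascade of collect strategies (streaming engine, deprecated streaming flag, in-memory collect, batched reader) trades a little code bulk for compatibility across Polars versions while keeping peak memory bounded on the multi-gigabyte source files.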