google-ngrams 0.1.0.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/LICENSE +21 -60
  2. {google_ngrams-0.1.0/google_ngrams.egg-info → google_ngrams-0.2.0}/PKG-INFO +38 -20
  3. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/README.rst +18 -10
  4. google_ngrams-0.2.0/google_ngrams/__init__.py +19 -0
  5. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/__init__.py +1 -1
  6. google_ngrams-0.2.0/google_ngrams/ngrams.py +341 -0
  7. google_ngrams-0.2.0/google_ngrams/scatter_helpers.py +187 -0
  8. google_ngrams-0.2.0/google_ngrams/vnc.py +518 -0
  9. google_ngrams-0.2.0/google_ngrams/vnc_helpers.py +809 -0
  10. {google_ngrams-0.1.0 → google_ngrams-0.2.0/google_ngrams.egg-info}/PKG-INFO +38 -20
  11. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams.egg-info/SOURCES.txt +7 -6
  12. google_ngrams-0.2.0/google_ngrams.egg-info/requires.txt +13 -0
  13. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/pyproject.toml +25 -14
  14. google_ngrams-0.2.0/tests/test_ngrams.py +56 -0
  15. google_ngrams-0.2.0/tests/test_ngrams_realdata.py +62 -0
  16. google_ngrams-0.2.0/tests/test_vnc.py +85 -0
  17. google_ngrams-0.2.0/tests/test_vnc_compat.py +178 -0
  18. google_ngrams-0.1.0/.github/workflows/ci.yml +0 -98
  19. google_ngrams-0.1.0/.gitignore +0 -174
  20. google_ngrams-0.1.0/_quarto/_quarto.yml +0 -104
  21. google_ngrams-0.1.0/docs/.gitkeep +0 -1
  22. google_ngrams-0.1.0/docs/google_ngrams.ipynb +0 -913
  23. google_ngrams-0.1.0/google_ngrams/__init__.py +0 -16
  24. google_ngrams-0.1.0/google_ngrams/ngrams.py +0 -209
  25. google_ngrams-0.1.0/google_ngrams/vnc.py +0 -1123
  26. google_ngrams-0.1.0/google_ngrams.egg-info/requires.txt +0 -4
  27. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/googlebooks_eng_all_totalcounts_20120701.parquet +0 -0
  28. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/googlebooks_eng_gb_all_totalcounts_20120701.parquet +0 -0
  29. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/googlebooks_eng_us_all_totalcounts_20120701.parquet +0 -0
  30. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams.egg-info/dependency_links.txt +0 -0
  31. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams.egg-info/top_level.txt +0 -0
  32. {google_ngrams-0.1.0 → google_ngrams-0.2.0}/setup.cfg +0 -0
{google_ngrams-0.1.0 → google_ngrams-0.2.0}/LICENSE

@@ -1,63 +1,24 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
+ MIT License
+
+ Copyright (c) 2025 David Brown
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.

  "Contributor" shall mean Licensor and any individual or Legal Entity
  on behalf of whom a Contribution has been received by Licensor and
PKG-INFO

@@ -1,30 +1,40 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: google_ngrams
- Version: 0.1.0
+ Version: 0.2.0
  Summary: Fetch and analyze Google Ngram data for specified word forms.
  Author-email: David Brown <dwb2@andrew.cmu.edu>
  Maintainer-email: David Brown <dwb2@andrew.cmu.edu>
+ License-Expression: MIT
  Project-URL: Documentation, https://browndw.github.io/google_ngrams
- Project-URL: Homepage, https://github.com/browndw/pybiber
+ Project-URL: Homepage, https://github.com/browndw/google_ngrams
  Keywords: nlp,language
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Python: >=3.9
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Text Processing :: Linguistic
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
  Description-Content-Type: text/x-rst
  License-File: LICENSE
- Requires-Dist: importlib-resources>=6.5
+ Requires-Dist: numpy>=1.22
  Requires-Dist: matplotlib>=3.5
  Requires-Dist: polars>=1.17
- Requires-Dist: scipy>=1.15
-
-
- google_ngrams: Fetch and analyze Google Ngram data for specified word forms.
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.0; extra == "test"
+ Requires-Dist: pytest-cov>=4.0; extra == "test"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
+ Requires-Dist: build>=1.0.0; extra == "dev"
+ Requires-Dist: twine>=5.0.0; extra == "dev"
+ Dynamic: license-file
+
+
+ google_ngrams
  =======================================================================================================
- |pypi| |pypi_downloads|
+ |pypi| |pypi_downloads| |tests|

  This package has functions for processing `Google’s Ngram repositories <http://storage.googleapis.com/books/ngrams/books/datasetsv2.html>`_ without having to download them locally. These repositories vary in their size, but the larger ones (like th one for the letter *s* or common bigrams) contain multiple gigabytes.

@@ -33,15 +43,20 @@ The main function uses `scan_csv from the polars <https://docs.pola.rs/api/pytho
  vnc
  ---

- To analyze the returned data, the package als contains functions based on the work of Gries and Hilpert (2012) for `Variability-Based Neighbor Clustering <https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199922765.001.0001/oxfordhb-9780199922765-e-14>`_.
+ To analyze the returned data, the package also contains functions based on the work of Gries and Hilpert (2012) for `Variability-Based Neighbor Clustering <https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199922765.001.0001/oxfordhb-9780199922765-e-14>`_.

- The idea is to use hierarchical clustering to aid "bottom up periodization of language change. The python functions are built on `their original R code <http://global.oup.com/us/companion.websites/fdscontent/uscompanion/us/static/companion.websites/nevalainen/Gries-Hilpert_web_final/vnc.individual.html>`_.
+ The idea is to use hierarchical clustering to aid "bottom up" periodization of language change. The python functions are built on `their original R code <http://global.oup.com/us/companion.websites/fdscontent/uscompanion/us/static/companion.websites/nevalainen/Gries-Hilpert_web_final/vnc.individual.html>`_.

  Distances, therefore, are calculated in sums of standard deviations and coefficients of variation, according to their stated method.

- Dendrograms are plotted using matplotlib, following the scipy conventions for formatting coordinates. However, the package has customized functions for maintaining the plotting order of the leaves according the requirements of the method.
+ Dendrograms are plotted using matplotlib, with custom implementations for hierarchical clustering that maintain the plotting order of the leaves according to the requirements of the method.
+
+ The package also has a custom implementation of dendrogram truncation that consolidates leaves under a specified number of time periods (or clusters) while also maintaining the leaf order to facilitate the reading and interpretation of large dendrograms.
+
+ Lightweight Implementation
+ --------------------------

- The package also has an implementation of `scipy's truncate_mode <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html/>`_ that consolidates leaves under a specified number of time periods (or clusters) while also maintaining the leaf order to facilitate the reading and interpretation of large dendrograms.
+ Starting with version 0.2.0, google_ngrams uses lightweight, custom implementations for statistical computations instead of heavy dependencies like scipy and statsmodels. This design choice reduces installation overhead while maintaining full functionality for the core VNC methodology and smoothing operations.


  Installation
@@ -51,7 +66,7 @@ You can install the released version of google_ngrams from `PyPI <https://pypi.o

  .. code-block:: install-google_ngrams

- pip install google_ngrams
+ pip install google-ngrams


  Usage
@@ -113,14 +128,17 @@ For additional information, consult the `documentation <https://browndw.github.i
  License
  -------

- Code licensed under `Apache License 2.0 <https://www.apache.org/licenses/LICENSE-2.0>`_.
- See `LICENSE <https://github.com/browndw/docuscospacy/blob/master/LICENSE>`_ file.
+ Code licensed under `MIT License <https://opensource.org/licenses/MIT>`_.
+ See `LICENSE <https://github.com/browndw/google_ngrams/blob/main/LICENSE>`_ file.

  .. |pypi| image:: https://badge.fury.io/py/google_ngrams.svg
- :target: https://badge.fury.io/py/pybiber
+ :target: https://badge.fury.io/py/google_ngrams
  :alt: PyPI Version

  .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/google_ngrams
  :target: https://pypi.org/project/google_ngrams/
  :alt: Downloads from PyPI

+ .. |tests| image:: https://github.com/browndw/google_ngrams/actions/workflows/test.yml/badge.svg
+ :target: https://github.com/browndw/google_ngrams/actions/workflows/test.yml
+ :alt: Test Status
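
The package description above states that VNC distances are calculated in sums of standard deviations and coefficients of variation. As a rough, stand-alone illustration of those two summary statistics (not the package's vnc.py implementation, which is not shown in this diff view; the helper names are hypothetical), the cost of merging two adjacent periods could be sketched like this:

# Illustrative only: these helpers are not taken from google_ngrams; they
# just show the statistics the description names.
import math


def standard_deviation(values):
    mean = sum(values) / len(values)
    return math.sqrt(sum((v - mean) ** 2 for v in values) / len(values))


def coefficient_of_variation(values):
    mean = sum(values) / len(values)
    return standard_deviation(values) / mean if mean else 0.0


# e.g., relative frequencies of two adjacent decades considered for merging
print(standard_deviation([3.2, 3.9]))
print(coefficient_of_variation([3.2, 3.9]))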
{google_ngrams-0.1.0 → google_ngrams-0.2.0}/README.rst

@@ -1,7 +1,7 @@

- google_ngrams: Fetch and analyze Google Ngram data for specified word forms.
+ google_ngrams
  =======================================================================================================
- |pypi| |pypi_downloads|
+ |pypi| |pypi_downloads| |tests|

  This package has functions for processing `Google’s Ngram repositories <http://storage.googleapis.com/books/ngrams/books/datasetsv2.html>`_ without having to download them locally. These repositories vary in their size, but the larger ones (like th one for the letter *s* or common bigrams) contain multiple gigabytes.

@@ -10,15 +10,20 @@ The main function uses `scan_csv from the polars <https://docs.pola.rs/api/pytho
  vnc
  ---

- To analyze the returned data, the package als contains functions based on the work of Gries and Hilpert (2012) for `Variability-Based Neighbor Clustering <https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199922765.001.0001/oxfordhb-9780199922765-e-14>`_.
+ To analyze the returned data, the package also contains functions based on the work of Gries and Hilpert (2012) for `Variability-Based Neighbor Clustering <https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199922765.001.0001/oxfordhb-9780199922765-e-14>`_.

- The idea is to use hierarchical clustering to aid "bottom up periodization of language change. The python functions are built on `their original R code <http://global.oup.com/us/companion.websites/fdscontent/uscompanion/us/static/companion.websites/nevalainen/Gries-Hilpert_web_final/vnc.individual.html>`_.
+ The idea is to use hierarchical clustering to aid "bottom up" periodization of language change. The python functions are built on `their original R code <http://global.oup.com/us/companion.websites/fdscontent/uscompanion/us/static/companion.websites/nevalainen/Gries-Hilpert_web_final/vnc.individual.html>`_.

  Distances, therefore, are calculated in sums of standard deviations and coefficients of variation, according to their stated method.

- Dendrograms are plotted using matplotlib, following the scipy conventions for formatting coordinates. However, the package has customized functions for maintaining the plotting order of the leaves according the requirements of the method.
+ Dendrograms are plotted using matplotlib, with custom implementations for hierarchical clustering that maintain the plotting order of the leaves according to the requirements of the method.

- The package also has an implementation of `scipy's truncate_mode <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html/>`_ that consolidates leaves under a specified number of time periods (or clusters) while also maintaining the leaf order to facilitate the reading and interpretation of large dendrograms.
+ The package also has a custom implementation of dendrogram truncation that consolidates leaves under a specified number of time periods (or clusters) while also maintaining the leaf order to facilitate the reading and interpretation of large dendrograms.
+
+ Lightweight Implementation
+ --------------------------
+
+ Starting with version 0.2.0, google_ngrams uses lightweight, custom implementations for statistical computations instead of heavy dependencies like scipy and statsmodels. This design choice reduces installation overhead while maintaining full functionality for the core VNC methodology and smoothing operations.


  Installation
@@ -28,7 +33,7 @@ You can install the released version of google_ngrams from `PyPI <https://pypi.o

  .. code-block:: install-google_ngrams

- pip install google_ngrams
+ pip install google-ngrams


  Usage
@@ -90,14 +95,17 @@ For additional information, consult the `documentation <https://browndw.github.i
  License
  -------

- Code licensed under `Apache License 2.0 <https://www.apache.org/licenses/LICENSE-2.0>`_.
- See `LICENSE <https://github.com/browndw/docuscospacy/blob/master/LICENSE>`_ file.
+ Code licensed under `MIT License <https://opensource.org/licenses/MIT>`_.
+ See `LICENSE <https://github.com/browndw/google_ngrams/blob/main/LICENSE>`_ file.

  .. |pypi| image:: https://badge.fury.io/py/google_ngrams.svg
- :target: https://badge.fury.io/py/pybiber
+ :target: https://badge.fury.io/py/google_ngrams
  :alt: PyPI Version

  .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/google_ngrams
  :target: https://pypi.org/project/google_ngrams/
  :alt: Downloads from PyPI

+ .. |tests| image:: https://github.com/browndw/google_ngrams/actions/workflows/test.yml/badge.svg
+ :target: https://github.com/browndw/google_ngrams/actions/workflows/test.yml
+ :alt: Test Status
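
Note that the updated install line uses the hyphenated distribution name (google-ngrams) while the import package keeps the underscore (google_ngrams). For orientation, a minimal usage sketch based on the google_ngram() signature added in google_ngrams/ngrams.py further down in this diff; the word forms are arbitrary examples, a network connection to the Google Books repository is required, and TimeSeries (also exported) is omitted because the vnc.py contents are not shown in these hunks:

# Minimal sketch; assumes the 0.2.0 release is installed (pip install google-ngrams).
from google_ngrams import google_ngram

df = google_ngram(["teenager", "teenagers"], variety="us", by="decade")
print(df.head())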
google_ngrams-0.2.0/google_ngrams/__init__.py

@@ -0,0 +1,19 @@
+ # flake8: noqa
+
+ # Set version ----
+ from importlib.metadata import version as _v, PackageNotFoundError as _PNF
+
+ try:
+     __version__ = _v("google_ngrams")
+ except _PNF:  # Fallback when running from source without installed metadata
+     __version__ = "0.0.0"
+
+ del _v
+
+ # Imports ----
+
+ from .ngrams import google_ngram
+
+ from .vnc import TimeSeries
+
+ __all__ = ['google_ngram', 'TimeSeries']
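
The new __init__.py resolves the package version from installed metadata and falls back to "0.0.0" when running from a source checkout. A quick sanity check of the exported surface (assuming the 0.2.0 sdist or wheel is installed):

import google_ngrams

print(google_ngrams.__version__)  # "0.2.0" when installed; "0.0.0" from a bare source tree
print(google_ngrams.__all__)      # ['google_ngram', 'TimeSeries']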
{google_ngrams-0.1.0 → google_ngrams-0.2.0}/google_ngrams/data/__init__.py

@@ -1,6 +1,6 @@
  # flake8: noqa

- from importlib_resources import files as _files
+ from importlib.resources import files as _files

  sources = {
      "eng_all": _files("google_ngrams") / "data/googlebooks_eng_all_totalcounts_20120701.parquet",
google_ngrams-0.2.0/google_ngrams/ngrams.py

@@ -0,0 +1,341 @@
+ import os
+ import re
+ import polars as pl
+ import warnings
+ import logging
+ from textwrap import dedent
+ from typing import List
+ from .data import sources
+
+
+ def google_ngram(
+         word_forms: List[str],
+         variety="eng",
+         by="decade"
+ ) -> pl.DataFrame:
+     """
+     Fetches Google Ngram data for specified word forms.
+
+     This function retrieves ngram data from the Google Books Ngram Viewer
+     for the given word forms. It supports different varieties of English
+     (e.g., British, American) and allows aggregation by year or decade.
+
+     Parameters
+     ----------
+     word_forms : List
+         List of word forms to search for.
+     variety : str
+         Variety of English ('eng', 'gb', 'us').
+     by : str
+         Aggregation level ('year' or 'decade').
+
+     Returns
+     -------
+     pl.DataFrame
+         DataFrame containing the ngram data.
+     """
+     variety_types = ["eng", "gb", "us"]
+     if variety not in variety_types:
+         raise ValueError("""variety_types
+             Invalid variety type. Expected one of: %s
+             """ % variety_types)
+     by_types = ["year", "decade"]
+     if by not in by_types:
+         raise ValueError("""variety_types
+             Invalid by type. Expected one of: %s
+             """ % by_types)
+     word_forms = [re.sub(r'([a-zA-Z0-9])-([a-zA-Z0-9])',
+                          r'\1 - \2', wf) for wf in word_forms]
+     word_forms = [wf.strip() for wf in word_forms]
+     n = [len(re.findall(r'\S+', wf)) for wf in word_forms]
+     n = list(set(n))
+
+     if len(n) > 1:
+         raise ValueError("""Check spelling.
+             Word forms should be lemmas of the same word
+             (e.g. 'teenager' and 'teenagers'
+             or 'walk', 'walks' and 'walked'
+             """)
+     if n[0] > 5:
+         raise ValueError("""Ngrams can be a maximum of 5 tokens.
+             Hyphenated words are split and include the hyphen,
+             so 'x-ray' would count as 3 tokens.
+             """)
+
+     gram = [wf[:2] if n[0] > 1 else wf[:1] for wf in word_forms]
+     gram = list(set([g.lower() for g in gram]))
+
+     if len(gram) > 1:
+         raise ValueError("""Check spelling.
+             Word forms should be lemmas of the same word
+             (e.g. 'teenager' and 'teenagers'
+             or 'walk', 'walks' and 'walked'
+             """)
+
+     if re.match(r'^[a-z][^a-z]', gram[0]):
+         gram[0] = re.sub(r'[^a-z]', '_', gram[0])
+     if re.match(r'^[0-9]', gram[0]):
+         gram[0] = gram[0][:1]
+     if re.match(r'^[\W]', gram[0]):
+         gram[0] = "punctuation"
+
+     if any(re.match(r'^[ßæðøłœıƒþȥəħŋªºɣđijɔȝⅰʊʌʔɛȡɋⅱʃɇɑⅲ]', g) for g in gram):
+         gram[0] = "other"
+
+     gram[0] = gram[0].encode('latin-1', 'replace').decode('latin-1')
+
+     # Use HTTPS for integrity (Google Storage supports it) instead of HTTP
+     if variety == "eng":
+         repo = f"https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
+     else:
+         repo = f"https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-{variety}-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
+
+     logger = logging.getLogger(__name__)
+     logger.info(dedent(
+         """
+         Accessing repository. For larger ones
+         (e.g., ngrams containing 2 or more words).
+         This may take a few minutes...
+         """
+     ))
+
+     # Preserve exact tokens for equality filtering in non-regex fallbacks
+     tokens_exact = list(word_forms)
+     word_forms = [re.sub(
+         r'(\.|\?|\$|\^|\)|\(|\}|\{|\]|\[|\*|\+|\|)',
+         r'\\\1', wf
+     ) for wf in word_forms]
+
+     grep_words = "|".join([f"^{wf}$" for wf in word_forms])
+
+     # Read the data from the google repository and format
+     schema = {"column_1": pl.String,
+               "column_2": pl.Int64,
+               "column_3": pl.Int64,
+               "column_4": pl.Int64}
+     try:
+         df = pl.scan_csv(
+             repo,
+             separator='\t',
+             has_header=False,
+             schema=schema,
+             truncate_ragged_lines=True,
+             low_memory=True,
+             quote_char=None,
+             ignore_errors=True,
+         )
+     except TypeError:
+         # Fallback for environments/tests that monkeypatch scan_csv with a
+         # limited signature. Use minimal, widely-supported args.
+         df = pl.scan_csv(repo, separator='\t', has_header=False, schema=schema)
+     # Push down filter and projection before collection to minimize memory
+     filtered_df = (
+         df
+         .filter(pl.col("column_1").str.contains(r"(?i)" + grep_words))
+         .select([
+             pl.col("column_1").alias("Token"),
+             pl.col("column_2").alias("Year"),
+             pl.col("column_3").alias("AF"),
+         ])
+     )
+
+     # Optional: allow tuning streaming batch size via env
+     try:
+         chunk_sz = os.environ.get("POLARS_STREAMING_CHUNK_SIZE")
+         if chunk_sz:
+             pl.Config.set_streaming_chunk_size(int(chunk_sz))
+     except Exception:
+         pass
+
+     # Collect with streaming fallback for stability across polars versions
+     try:
+         logger.debug("Collecting with engine='streaming'.")
+         all_grams = filtered_df.collect(engine="streaming")
+     except Exception:
+         try:
+             # Older streaming path (deprecated in newer Polars)
+             logger.debug("Collecting with deprecated streaming=True path.")
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore",
+                     category=DeprecationWarning,
+                     message=r"the `streaming` parameter was deprecated.*",
+                 )
+                 all_grams = filtered_df.collect(  # type: ignore[arg-type]
+                     streaming=True
+                 )
+         except Exception:
+             try:
+                 # Plain in-memory collect
+                 logger.debug(
+                     "Collecting with in-memory engine (no streaming)."
+                 )
+                 all_grams = filtered_df.collect()
+             except Exception:
+                 # Final memory-safe fallback: batched CSV reader with
+                 # per-batch filter
+                 logger.debug(
+                     "Falling back to batched CSV reader + per-batch filter."
+                 )
+                 batch_sz = int(
+                     os.environ.get("POLARS_CSV_BATCH_SIZE", "200000")
+                 )
+                 try:
+                     reader = pl.read_csv_batched(
+                         repo,
+                         separator='\t',
+                         has_header=False,
+                         ignore_errors=True,
+                         low_memory=True,
+                         batch_size=batch_sz,
+                     )
+                     filtered_batches = []
+                     # Prefer equality match for speed and stability
+                     try:
+                         for batch in reader:  # type: ignore[assignment]
+                             fb = (
+                                 batch
+                                 .filter(pl.col("column_1").is_in(tokens_exact))
+                                 .select([
+                                     pl.col("column_1").alias("Token"),
+                                     pl.col("column_2").alias("Year"),
+                                     pl.col("column_3").alias("AF"),
+                                 ])
+                             )
+                             if fb.height:
+                                 filtered_batches.append(fb)
+                     except TypeError:
+                         # Fallback for alternate reader APIs
+                         while True:
+                             try:
+                                 batches = reader.next_batches(1)
+                             except AttributeError:
+                                 break
+                             if not batches:
+                                 break
+                             batch = batches[0]
+                             fb = (
+                                 batch
+                                 .filter(pl.col("column_1").is_in(tokens_exact))
+                                 .select([
+                                     pl.col("column_1").alias("Token"),
+                                     pl.col("column_2").alias("Year"),
+                                     pl.col("column_3").alias("AF"),
+                                 ])
+                             )
+                             if fb.height:
+                                 filtered_batches.append(fb)
+
+                     if filtered_batches:
+                         all_grams = pl.concat(filtered_batches)
+                     else:
+                         all_grams = pl.DataFrame({
+                             "Token": pl.Series([], dtype=pl.String),
+                             "Year": pl.Series([], dtype=pl.Int64),
+                             "AF": pl.Series([], dtype=pl.Int64),
+                         })
+                 except Exception as e:
+                     # If batched reader is unavailable, re-raise with guidance
+                     raise RuntimeError(
+                         "Polars batched CSV reader fallback failed; consider "
+                         "upgrading Polars or disabling this code path via "
+                         "environment if necessary."
+                     ) from e
+
+     # read totals
+     if variety == "eng":
+         f_path = sources.get("eng_all")
+     elif variety == "gb":
+         f_path = sources.get("gb_all")
+     elif variety == "us":
+         f_path = sources.get("us_all")
+
+     total_counts = pl.read_parquet(f_path)
+     # format totals, fill missing data, and sum
+     total_counts = total_counts.cast({
+         "Year": pl.UInt32,
+         "Total": pl.UInt64,
+         "Pages": pl.UInt64,
+         "Volumes": pl.UInt64,
+     })
+
+     total_counts = (
+         total_counts
+         .with_columns(
+             pl.col("Year")
+             .cast(pl.String).str.to_datetime("%Y")
+         )
+         .sort("Year")
+         .upsample(time_column="Year", every="1y")
+         .with_columns(
+             pl.col(["Total", "Pages", "Volumes"])
+             .fill_null(strategy="zero")
+         )
+     )
+     total_counts = (
+         total_counts
+         .group_by_dynamic(
+             "Year", every="1y"
+         ).agg(pl.col("Total").sum())
+     )
+
+     # sum token totals, convert to datetime and fill in missing years
+     sum_tokens = (
+         all_grams
+         .group_by("Year", maintain_order=True)
+         .agg(pl.col("AF").sum())
+     )
+     sum_tokens = (
+         sum_tokens
+         .with_columns(
+             pl.col("Year")
+             .cast(pl.String).str.to_datetime("%Y")
+         )
+         .sort("Year")
+         .upsample(time_column="Year", every="1y")
+         .with_columns(
+             pl.col("AF")
+             .fill_null(strategy="zero")
+         )
+     )
+     # join with totals
+     sum_tokens = sum_tokens.join(total_counts, on="Year", how="right")
+     # Fill any missing AF created by the join (years with no token hits)
+     sum_tokens = sum_tokens.with_columns(
+         pl.col("AF").fill_null(strategy="zero")
+     )
+
+     if by == "decade":
+         sum_tokens = (
+             sum_tokens
+             .group_by_dynamic("Year", every="10y")
+             .agg(pl.col(["AF", "Total"]).sum())
+         )
+     # normalize RF per million tokens
+     sum_tokens = (
+         sum_tokens
+         .with_columns(
+             RF=pl.col("AF").truediv("Total").mul(1000000)
+         )
+         .with_columns(
+             pl.col("RF").fill_nan(0)
+         )
+     )
+     sum_tokens.insert_column(1, (pl.lit(word_forms)).alias("Token"))
+     sum_tokens = (
+         sum_tokens
+         .with_columns(
+             pl.col("Year").dt.year().alias("Year")
+         )
+         .drop("Total")
+     )
+
+     if by == "decade":
+         # Avoid .rename to prevent potential segfaults
+         sum_tokens = (
+             sum_tokens
+             .with_columns(pl.col("Year").alias("Decade"))
+             .drop("Year")
+         )
+
+     return sum_tokens
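
The new module normalizes each period's summed absolute frequency against the bundled total counts: RF = AF / Total * 1,000,000, i.e. occurrences per million tokens, keyed by Year or by Decade. A tiny worked check of that arithmetic, independent of the package (the numbers are arbitrary):

af = 1_250                 # occurrences of the word forms in one decade
total = 2_500_000_000      # total tokens in the corpus for that decade
rf = af / total * 1_000_000
print(rf)  # 0.5 occurrences per million tokens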