google-ngrams 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
+ # flake8: noqa
+
+ # Set version ----
+ from importlib.metadata import version as _v, PackageNotFoundError as _PNF
+
+ try:
+     __version__ = _v("google_ngrams")
+ except _PNF:  # Fallback when running from source without installed metadata
+     __version__ = "0.0.0"
+
+ del _v
+
+ # Imports ----
+
+ from .ngrams import google_ngram
+
+ from .vnc import TimeSeries
+
+ __all__ = ['google_ngram', 'TimeSeries']
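For orientation, a minimal import sketch (not part of the wheel) of the public API this `__init__` exposes, per `__all__` above:

from google_ngrams import google_ngram, TimeSeries
import google_ngrams
print(google_ngrams.__version__)  # "0.2.0" from installed metadata, "0.0.0" when running from source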
@@ -0,0 +1,14 @@
+ # flake8: noqa
+
+ from importlib.resources import files as _files
+
+ sources = {
+     "eng_all": _files("google_ngrams") / "data/googlebooks_eng_all_totalcounts_20120701.parquet",
+     "gb_all": _files("google_ngrams") / "data/googlebooks_eng_gb_all_totalcounts_20120701.parquet",
+     "us_all": _files("google_ngrams") / "data/googlebooks_eng_us_all_totalcounts_20120701.parquet",
+ }
+
+
+ def __dir__():
+     return list(sources)
+
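A brief, illustrative sketch (not part of the package) of how these bundled total-count files are consumed; `google_ngram` later in this diff reads them with `pl.read_parquet`:

import polars as pl
from google_ngrams.data import sources

total_counts = pl.read_parquet(sources["eng_all"])  # Year/Total/Pages/Volumes counts for the "eng" variety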
@@ -0,0 +1,341 @@
+ import os
+ import re
+ import polars as pl
+ import warnings
+ import logging
+ from textwrap import dedent
+ from typing import List
+ from .data import sources
+
+
+ def google_ngram(
+         word_forms: List[str],
+         variety="eng",
+         by="decade"
+ ) -> pl.DataFrame:
+     """
+     Fetches Google Ngram data for specified word forms.
+
+     This function retrieves ngram data from the Google Books Ngram
+     exports (the raw data behind the Ngram Viewer) for the given word
+     forms. It supports different varieties of English (e.g., British,
+     American) and allows aggregation by year or decade.
+
+     Parameters
+     ----------
+     word_forms : List[str]
+         List of word forms to search for.
+     variety : str
+         Variety of English ('eng', 'gb', 'us').
+     by : str
+         Aggregation level ('year' or 'decade').
+
+     Returns
+     -------
+     pl.DataFrame
+         DataFrame containing the ngram data.
+     """
+     variety_types = ["eng", "gb", "us"]
+     if variety not in variety_types:
+         raise ValueError(
+             "Invalid variety type. Expected one of: %s" % variety_types
+         )
+     by_types = ["year", "decade"]
+     if by not in by_types:
+         raise ValueError(
+             "Invalid by type. Expected one of: %s" % by_types
+         )
+     word_forms = [re.sub(r'([a-zA-Z0-9])-([a-zA-Z0-9])',
+                          r'\1 - \2', wf) for wf in word_forms]
+     word_forms = [wf.strip() for wf in word_forms]
+     n = [len(re.findall(r'\S+', wf)) for wf in word_forms]
+     n = list(set(n))
+
+     if len(n) > 1:
+         raise ValueError("""Check spelling.
+             Word forms should be lemmas of the same word
+             (e.g. 'teenager' and 'teenagers'
+             or 'walk', 'walks' and 'walked').
+             """)
+     if n[0] > 5:
+         raise ValueError("""Ngrams can be a maximum of 5 tokens.
+             Hyphenated words are split and include the hyphen,
+             so 'x-ray' would count as 3 tokens.
+             """)
+
+     gram = [wf[:2] if n[0] > 1 else wf[:1] for wf in word_forms]
+     gram = list(set([g.lower() for g in gram]))
+
+     if len(gram) > 1:
+         raise ValueError("""Check spelling.
+             Word forms should be lemmas of the same word
+             (e.g. 'teenager' and 'teenagers'
+             or 'walk', 'walks' and 'walked').
+             """)
+
+     if re.match(r'^[a-z][^a-z]', gram[0]):
+         gram[0] = re.sub(r'[^a-z]', '_', gram[0])
+     if re.match(r'^[0-9]', gram[0]):
+         gram[0] = gram[0][:1]
+     if re.match(r'^[\W]', gram[0]):
+         gram[0] = "punctuation"
+
+     if any(re.match(r'^[ßæðøłœıƒþȥəħŋªºɣđijɔȝⅰʊʌʔɛȡɋⅱʃɇɑⅲ]', g) for g in gram):
+         gram[0] = "other"
+
+     gram[0] = gram[0].encode('latin-1', 'replace').decode('latin-1')
+
+     # Use HTTPS for integrity (Google Storage supports it) instead of HTTP
+     if variety == "eng":
+         repo = f"https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
+     else:
+         repo = f"https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-{variety}-all-{n[0]}gram-20120701-{gram[0]}.gz"  # noqa: E501
+
+     logger = logging.getLogger(__name__)
+     logger.info(dedent(
+         """
+         Accessing repository. For larger ones
+         (e.g., ngrams containing 2 or more words),
+         this may take a few minutes...
+         """
+     ))
+
+     # Preserve exact tokens for equality filtering in non-regex fallbacks
+     tokens_exact = list(word_forms)
+     word_forms = [re.sub(
+         r'(\.|\?|\$|\^|\)|\(|\}|\{|\]|\[|\*|\+|\|)',
+         r'\\\1', wf
+     ) for wf in word_forms]
+
+     grep_words = "|".join([f"^{wf}$" for wf in word_forms])
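+     # e.g., word_forms=["walk", "walks"] yields the anchored, alternated
+     # pattern "^walk$|^walks$", applied case-insensitively below via "(?i)"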
+
+     # Read the data from the google repository and format
+     schema = {"column_1": pl.String,
+               "column_2": pl.Int64,
+               "column_3": pl.Int64,
+               "column_4": pl.Int64}
+     try:
+         df = pl.scan_csv(
+             repo,
+             separator='\t',
+             has_header=False,
+             schema=schema,
+             truncate_ragged_lines=True,
+             low_memory=True,
+             quote_char=None,
+             ignore_errors=True,
+         )
+     except TypeError:
+         # Fallback for environments/tests that monkeypatch scan_csv with a
+         # limited signature. Use minimal, widely-supported args.
+         df = pl.scan_csv(repo, separator='\t', has_header=False, schema=schema)
+     # Push down filter and projection before collection to minimize memory
+     filtered_df = (
+         df
+         .filter(pl.col("column_1").str.contains(r"(?i)" + grep_words))
+         .select([
+             pl.col("column_1").alias("Token"),
+             pl.col("column_2").alias("Year"),
+             pl.col("column_3").alias("AF"),
+         ])
+     )
+
+     # Optional: allow tuning streaming batch size via env
+     try:
+         chunk_sz = os.environ.get("POLARS_STREAMING_CHUNK_SIZE")
+         if chunk_sz:
+             pl.Config.set_streaming_chunk_size(int(chunk_sz))
+     except Exception:
+         pass
+
+     # Collect with streaming fallback for stability across polars versions
+     try:
+         logger.debug("Collecting with engine='streaming'.")
+         all_grams = filtered_df.collect(engine="streaming")
+     except Exception:
+         try:
+             # Older streaming path (deprecated in newer Polars)
+             logger.debug("Collecting with deprecated streaming=True path.")
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore",
+                     category=DeprecationWarning,
+                     message=r"the `streaming` parameter was deprecated.*",
+                 )
+                 all_grams = filtered_df.collect(  # type: ignore[arg-type]
+                     streaming=True
+                 )
+         except Exception:
+             try:
+                 # Plain in-memory collect
+                 logger.debug(
+                     "Collecting with in-memory engine (no streaming)."
+                 )
+                 all_grams = filtered_df.collect()
+             except Exception:
+                 # Final memory-safe fallback: batched CSV reader with
+                 # per-batch filter
+                 logger.debug(
+                     "Falling back to batched CSV reader + per-batch filter."
+                 )
+                 batch_sz = int(
+                     os.environ.get("POLARS_CSV_BATCH_SIZE", "200000")
+                 )
+                 try:
+                     reader = pl.read_csv_batched(
+                         repo,
+                         separator='\t',
+                         has_header=False,
+                         ignore_errors=True,
+                         low_memory=True,
+                         batch_size=batch_sz,
+                     )
+                     filtered_batches = []
+                     # Prefer equality match for speed and stability
+                     try:
+                         for batch in reader:  # type: ignore[assignment]
+                             fb = (
+                                 batch
+                                 .filter(pl.col("column_1").is_in(tokens_exact))
+                                 .select([
+                                     pl.col("column_1").alias("Token"),
+                                     pl.col("column_2").alias("Year"),
+                                     pl.col("column_3").alias("AF"),
+                                 ])
+                             )
+                             if fb.height:
+                                 filtered_batches.append(fb)
+                     except TypeError:
+                         # Fallback for alternate reader APIs
+                         while True:
+                             try:
+                                 batches = reader.next_batches(1)
+                             except AttributeError:
+                                 break
+                             if not batches:
+                                 break
+                             batch = batches[0]
+                             fb = (
+                                 batch
+                                 .filter(pl.col("column_1").is_in(tokens_exact))
+                                 .select([
+                                     pl.col("column_1").alias("Token"),
+                                     pl.col("column_2").alias("Year"),
+                                     pl.col("column_3").alias("AF"),
+                                 ])
+                             )
+                             if fb.height:
+                                 filtered_batches.append(fb)
+
+                     if filtered_batches:
+                         all_grams = pl.concat(filtered_batches)
+                     else:
+                         all_grams = pl.DataFrame({
+                             "Token": pl.Series([], dtype=pl.String),
+                             "Year": pl.Series([], dtype=pl.Int64),
+                             "AF": pl.Series([], dtype=pl.Int64),
+                         })
+                 except Exception as e:
+                     # If batched reader is unavailable, re-raise with guidance
+                     raise RuntimeError(
+                         "Polars batched CSV reader fallback failed; consider "
+                         "upgrading Polars or disabling this code path via "
+                         "environment if necessary."
+                     ) from e
+
+     # read totals
+     if variety == "eng":
+         f_path = sources.get("eng_all")
+     elif variety == "gb":
+         f_path = sources.get("gb_all")
+     elif variety == "us":
+         f_path = sources.get("us_all")
+
+     total_counts = pl.read_parquet(f_path)
+     # format totals, fill missing data, and sum
+     total_counts = total_counts.cast({
+         "Year": pl.UInt32,
+         "Total": pl.UInt64,
+         "Pages": pl.UInt64,
+         "Volumes": pl.UInt64,
+     })
+
+     total_counts = (
+         total_counts
+         .with_columns(
+             pl.col("Year")
+             .cast(pl.String).str.to_datetime("%Y")
+         )
+         .sort("Year")
+         .upsample(time_column="Year", every="1y")
+         .with_columns(
+             pl.col(["Total", "Pages", "Volumes"])
+             .fill_null(strategy="zero")
+         )
+     )
+     total_counts = (
+         total_counts
+         .group_by_dynamic(
+             "Year", every="1y"
+         ).agg(pl.col("Total").sum())
+     )
+
+     # sum token totals, convert to datetime and fill in missing years
+     sum_tokens = (
+         all_grams
+         .group_by("Year", maintain_order=True)
+         .agg(pl.col("AF").sum())
+     )
+     sum_tokens = (
+         sum_tokens
+         .with_columns(
+             pl.col("Year")
+             .cast(pl.String).str.to_datetime("%Y")
+         )
+         .sort("Year")
+         .upsample(time_column="Year", every="1y")
+         .with_columns(
+             pl.col("AF")
+             .fill_null(strategy="zero")
+         )
+     )
+     # join with totals
+     sum_tokens = sum_tokens.join(total_counts, on="Year", how="right")
+     # Fill any missing AF created by the join (years with no token hits)
+     sum_tokens = sum_tokens.with_columns(
+         pl.col("AF").fill_null(strategy="zero")
+     )
+
+     if by == "decade":
+         sum_tokens = (
+             sum_tokens
+             .group_by_dynamic("Year", every="10y")
+             .agg(pl.col(["AF", "Total"]).sum())
+         )
+     # normalize RF per million tokens
+     sum_tokens = (
+         sum_tokens
+         .with_columns(
+             RF=pl.col("AF").truediv("Total").mul(1000000)
+         )
+         .with_columns(
+             pl.col("RF").fill_nan(0)
+         )
+     )
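+     # Worked example (illustrative): AF=120 occurrences against a Total of
+     # 2,400,000 tokens gives RF = 120 / 2_400_000 * 1_000_000 = 50
+     # occurrences per million tokens.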
324
+ sum_tokens.insert_column(1, (pl.lit(word_forms)).alias("Token"))
325
+ sum_tokens = (
326
+ sum_tokens
327
+ .with_columns(
328
+ pl.col("Year").dt.year().alias("Year")
329
+ )
330
+ .drop("Total")
331
+ )
332
+
333
+ if by == "decade":
334
+ # Avoid .rename to prevent potential segfaults
335
+ sum_tokens = (
336
+ sum_tokens
337
+ .with_columns(pl.col("Year").alias("Decade"))
338
+ .drop("Year")
339
+ )
340
+
341
+ return sum_tokens
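For orientation, a minimal usage sketch of the function defined above (illustrative; per the code, the returned frame contains `Decade` or `Year`, `Token`, `AF`, and `RF` columns):

from google_ngrams import google_ngram

df = google_ngram(["teenager", "teenagers"], variety="us", by="decade")
print(df.head())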
@@ -0,0 +1,187 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Optional
+
+ import numpy as np
+
+
+ @dataclass
+ class SmoothResult:
+     x: np.ndarray
+     y_fit: np.ndarray
+     y_lower: Optional[np.ndarray]
+     y_upper: Optional[np.ndarray]
+
+
+ def _df_from_ui(n: int, smoothing: int) -> int:
+     """Map UI smoothing (1..9) to an effective df, as in the original code.
+
+     Original mapping: df = (10 - smoothing) * 10.
+     Clamp to a feasible range given the sample size and cubic degree.
+     """
+     s_param = int(smoothing)
+     if s_param < 1:
+         s_param = 1
+     elif s_param > 9:
+         s_param = 9
+     df = (10 - s_param) * 10  # 1->90 (flexible), 9->10 (smooth)
+     # For cubic regression spline, basis dimension = 4 + t (t=interior knots).
+     max_t = max(0, n - 5)
+     max_df = max_t + 4
+     df = max(6, min(df, max_df))
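+     # e.g., smoothing=7 with n=40 points: df = (10 - 7) * 10 = 30;
+     # max_df = (40 - 5) + 4 = 39, so df stays 30 (illustrative).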
+     return df
+
+
+ def _crs_design(x: np.ndarray, knots: np.ndarray) -> np.ndarray:
+     """Cubic regression spline design matrix using truncated power basis.
+
+     Columns: [1, x, x^2, x^3, (x - t1)_+^3, ..., (x - tm)_+^3]
+     Where (a)_+ = max(a, 0).
+     """
+     x = np.asarray(x, dtype=float)
+     X = [
+         np.ones_like(x),
+         x,
+         x * x,
+         x * x * x,
+     ]
+     for t in knots:
+         z = x - float(t)
+         z[z < 0.0] = 0.0
+         X.append(z * z * z)
+     return np.column_stack(X)
+
+
+ def _fit_ridge(X: np.ndarray, y: np.ndarray, lam: float) -> np.ndarray:
+     """Solve ridge regression (X^T X + lam I) beta = X^T y."""
+     XtX = X.T @ X
+     n_feat = XtX.shape[0]
+     A = XtX + lam * np.eye(n_feat)
+     Xty = X.T @ y
+     try:
+         beta = np.linalg.solve(A, Xty)
+     except np.linalg.LinAlgError:
+         beta = np.linalg.lstsq(A, Xty, rcond=None)[0]
+     return beta
+
+
+ def gam_smoother(
+     x: np.ndarray,
+     y: np.ndarray,
+     *,
+     smoothing: int = 7,
+     ci: bool = True,
+     ci_level: float = 0.95,
+     n_boot: int = 200,
+     random_state: Optional[int] = None,
+ ) -> SmoothResult:
+     """Cubic regression spline (NumPy only) with optional bootstrap CIs.
+
+     Uses a truncated power basis with interior knots chosen at quantiles and
+     ridge regularization mapped from the UI smoothing parameter. Returns
+     predictions at the original x order and clips negatives to zero.
+     """
+     x = np.asarray(x, dtype=float)
+     y = np.asarray(y, dtype=float)
+     if x.ndim != 1 or y.ndim != 1 or len(x) != len(y):
+         raise ValueError("x and y must be 1D arrays of the same length")
+
+     order = np.argsort(x)
+     x_sorted = x[order]
+     y_sorted = y[order]
+
+     # Handle duplicate x by adding tiny jitter
+     dx = np.diff(x_sorted)
+     if np.any(dx == 0):
+         eps = 1e-9 * max(1.0, (x_sorted.max() - x_sorted.min()))
+         counts = {}
+         x_use = x_sorted.copy()
+         for i, val in enumerate(x_sorted):
+             c = counts.get(val, 0)
+             if c > 0:
+                 x_use[i] = val + c * eps
+             counts[val] = c + 1
+     else:
+         x_use = x_sorted
+
+     # Normalize x to [0,1] for numerical stability
+     xmin = float(x_use.min())
+     xmax = float(x_use.max())
+     span = xmax - xmin
+     if span <= 0:
+         span = 1.0
+     x0 = (x_use - xmin) / span
+
+     # Determine number of interior knots from df mapping
+     n = len(x0)
+     df = _df_from_ui(n, smoothing)
+     k = 3  # cubic
+     t_count = max(0, min(df - (k + 1), n - (k + 2)))
+     if t_count > 0:
+         qs = np.linspace(0, 1, t_count + 2)[1:-1]
+         knots = np.quantile(x0, qs)
+     else:
+         knots = np.array([], dtype=float)
+
+     # Build design
+     X = _crs_design(x0, knots)
+     # Standardize columns except intercept for stable ridge behavior
+     Xs = X.copy()
+     means = np.zeros(X.shape[1])
+     scales = np.ones(X.shape[1])
+     # skip intercept at col 0
+     for j in range(1, X.shape[1]):
+         col = X[:, j]
+         m = float(np.mean(col))
+         s = float(np.std(col))
+         if s <= 0.0:
+             s = 1.0
+         means[j] = m
+         scales[j] = s
+         Xs[:, j] = (col - m) / s
+
+     # Map smoothing (1..9) -> ridge lambda on a small scale
+     # smoothing=1 -> ~1e-9 (very flexible), smoothing=9 -> ~1e-4 (smoother)
+     s_param = int(np.clip(smoothing, 1, 9))
+     exp_min, exp_max = -9.0, -4.0
+     exponent = exp_min + (s_param - 1) * (exp_max - exp_min) / 8.0
+     lam = 10.0 ** exponent
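+     # e.g., smoothing=7: exponent = -9 + 6 * (5 / 8) = -5.25,
+     # so lam = 10 ** -5.25 ≈ 5.6e-6 (illustrative).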
+
+     beta = _fit_ridge(Xs, y_sorted, lam)
+     y_fit_sorted = Xs @ beta
+
+     inv = np.argsort(order)
+     y_fit = np.asarray(y_fit_sorted[inv], dtype=float)
+     y_fit[y_fit < 0] = 0.0
+
+     y_lower = None
+     y_upper = None
+     if ci:
+         rng = np.random.default_rng(random_state)
+         alpha = 1.0 - float(ci_level)
+         hi_q = 100.0 * (1.0 - alpha / 2.0)
+         resid_sorted = y_sorted - y_fit_sorted
+         boot_preds = np.empty((n_boot, len(x0)), dtype=float)
+         for b in range(n_boot):
+             resampled = rng.choice(
+                 resid_sorted, size=len(resid_sorted), replace=True
+             )
+             y_b = y_fit_sorted + resampled
+             # Refit with the same design and penalty
+             beta_b = _fit_ridge(Xs, y_b, lam)
+             boot_preds[b, :] = Xs @ beta_b
+         # Symmetric, fit-centered half-width from absolute deviations
+         hw_sorted = np.percentile(
+             np.abs(boot_preds - y_fit_sorted), hi_q, axis=0
+         )
+         y_lower_sorted = y_fit_sorted - hw_sorted
+         y_upper_sorted = y_fit_sorted + hw_sorted
+         # Map back to original x order
+         y_lower = y_lower_sorted[inv]
+         y_upper = y_upper_sorted[inv]
+         # Clip to non-negative domain
+         y_lower[y_lower < 0] = 0.0
+         y_upper[y_upper < 0] = 0.0
+
+     return SmoothResult(x=x, y_fit=y_fit, y_lower=y_lower, y_upper=y_upper)
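Finally, a hedged usage sketch of the smoother defined above; the synthetic series and the import location are illustrative only (the exact module path for `gam_smoother` is not shown in this diff):

import numpy as np

# assumes gam_smoother is importable from the package's vnc subpackage
years = np.arange(1900, 2000, dtype=float)
rf = 50 + 10 * np.sin((years - 1900) / 15.0)
rf += np.random.default_rng(0).normal(0, 2, years.size)

res = gam_smoother(years, rf, smoothing=7, ci=True, n_boot=200, random_state=0)
print(res.y_fit[:3], res.y_lower[:3], res.y_upper[:3])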