PyPI - statcast-bigquery - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statcast-bigquery 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

statcast_bigquery/__init__.py +8 -0
statcast_bigquery/_version.py +1 -0
statcast_bigquery/cli.py +194 -0
statcast_bigquery/client.py +62 -0
statcast_bigquery/docs/__init__.py +28 -0
statcast_bigquery/docs/example_queries.py +483 -0
statcast_bigquery/docs/pitfalls.py +116 -0
statcast_bigquery/docs/renderers.py +155 -0
statcast_bigquery/docs/statsapi_map.py +90 -0
statcast_bigquery/docs/taxonomy.py +30 -0
statcast_bigquery/schema.py +2473 -0
statcast_bigquery/verify/__init__.py +23 -0
statcast_bigquery/verify/base.py +76 -0
statcast_bigquery/verify/compare.py +32 -0
statcast_bigquery/verify/savant.py +263 -0
statcast_bigquery/writer.py +112 -0
statcast_bigquery-0.1.0.dist-info/METADATA +61 -0
statcast_bigquery-0.1.0.dist-info/RECORD +21 -0
statcast_bigquery-0.1.0.dist-info/WHEEL +4 -0
statcast_bigquery-0.1.0.dist-info/entry_points.txt +2 -0
statcast_bigquery-0.1.0.dist-info/licenses/LICENSE +24 -0

statcast_bigquery/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""statcast-bigquery.
+Statcast pitch-level ingestion + LLM-friendly docs + Baseball Savant verification.
+"""
+from statcast_bigquery._version import __version__
+__all__ = ["__version__"]

statcast_bigquery/_version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.1.0"

statcast_bigquery/cli.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""CLI entrypoint: statcast-bigquery {sync,docs,verify}."""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+from datetime import date, datetime, timedelta
+from google.cloud import bigquery
+from statcast_bigquery._version import __version__
+from statcast_bigquery.client import StatcastClient
+from statcast_bigquery.docs.renderers import (
+    render_bq_descriptions,
+    render_data_dictionary,
+    render_dbt_yaml,
+    render_llm_context,
+    render_markdown,
+)
+from statcast_bigquery.verify.savant import (
+    BATTING_METRIC_TO_SAVANT_FIELD,
+    PITCHING_METRIC_TO_SAVANT_FIELD,
+    BaseballSavantBattingVerifier,
+    BaseballSavantPitchingVerifier,
+)
+from statcast_bigquery.writer import BigQueryWriter, TableRef
# Configure the root logger at import time so every subcommand logs in one format.
logging.basicConfig(format="%(asctime)s [%(name)s] %(message)s", level=logging.INFO)
log = logging.getLogger("statcast-bigquery")

# Metric names, taken from the keys of the Savant verifier lookup tables.
ALL_BATTING_METRICS = [*BATTING_METRIC_TO_SAVANT_FIELD]
ALL_PITCHING_METRICS = [*PITCHING_METRIC_TO_SAVANT_FIELD]

# Values accepted by `docs --format`.
DOC_FORMATS = ["bq-apply", "llm", "dictionary", "markdown", "dbt"]
def _iso_date_arg(value: str) -> str:
    """argparse ``type=`` hook: accept *value* only if it parses as YYYY-MM-DD.

    The original string is returned unchanged, so the parsed namespace still
    carries ``str`` values for ``--start`` / ``--end``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a valid YYYY-MM-DD date;
            argparse turns this into a usage error (exit status 2).
    """
    try:
        datetime.strptime(value, "%Y-%m-%d")
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}; expected YYYY-MM-DD"
        ) from None
    return value


def build_parser() -> argparse.ArgumentParser:
    """Build the ``statcast-bigquery`` parser with its sync/docs/verify subcommands.

    Returns:
        A parser whose namespace has ``command`` set to the chosen subcommand
        name plus that subcommand's options.
    """
    parser = argparse.ArgumentParser(prog="statcast-bigquery")
    parser.add_argument("--version", action="version", version=__version__)
    sub = parser.add_subparsers(dest="command", required=True)

    # sync
    p_sync = sub.add_parser("sync", help="Pull Statcast and write to BigQuery")
    # Dates are validated up front so a typo fails as a usage error instead of
    # a traceback (or an unchecked value) deeper in the sync.
    p_sync.add_argument("--start", required=True, type=_iso_date_arg,
                        help="YYYY-MM-DD start (inclusive)")
    p_sync.add_argument("--end", required=True, type=_iso_date_arg,
                        help="YYYY-MM-DD end (inclusive)")
    p_sync.add_argument("--table", required=True, help="project.dataset.table")
    p_sync.add_argument("--chunk-by", default="year", choices=["year", "month", "range"])
    p_sync.add_argument("--resume", action="store_true",
                        help="Skip year-chunks already recorded in _statcast_ingest_runs")
    p_sync.add_argument("--dry-run", action="store_true")

    # docs
    p_docs = sub.add_parser("docs", help="Render documentation in various formats")
    p_docs.add_argument("--format", required=True, choices=DOC_FORMATS)
    p_docs.add_argument("--table", help="project.dataset.table (required for bq-apply, dictionary)")
    p_docs.add_argument("--dataset", help="for dictionary format")
    p_docs.add_argument("--output", default="-", help="path or '-' for stdout (default)")

    # verify
    p_v = sub.add_parser("verify", help="Compare aggregations to external sources")
    p_v.add_argument("--source", default="baseball-savant", choices=["baseball-savant"])
    p_v.add_argument("--aggregation", required=True,
                     choices=["player-season", "pitcher-season"])
    p_v.add_argument("--metric", required=True,
                     choices=[*ALL_BATTING_METRICS, *ALL_PITCHING_METRICS, "all"])
    p_v.add_argument("--season", required=True, type=int)
    p_v.add_argument("--table", required=True)
    p_v.add_argument("--tolerance", type=float, default=None)
    p_v.add_argument("--min-sample-size", type=int, default=50)
    p_v.add_argument("--threshold", type=float, default=0.99)
    p_v.add_argument("--output", default="-")
    return parser
def _iter_year_chunks(start: str, end: str) -> list[tuple[str, str]]:
    """Split the inclusive range [start, end] into per-calendar-year pieces.

    Both arguments are YYYY-MM-DD strings. Each returned pair is an
    (ISO start, ISO end) span that stays within one calendar year. The result
    is empty when *start* is after *end*.
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.strptime(start, day_format).date()
    final_day = datetime.strptime(end, day_format).date()

    def spans():
        cursor = first_day
        while cursor <= final_day:
            # A span stops at Dec 31 of the cursor's year or at the overall
            # end date, whichever comes first.
            stop = min(cursor.replace(month=12, day=31), final_day)
            yield cursor.isoformat(), stop.isoformat()
            cursor = stop + timedelta(days=1)

    return list(spans())
def _iter_month_chunks(start: str, end: str) -> list[tuple[str, str]]:
    """Split the inclusive range [start, end] into per-calendar-month pieces."""
    first_day = datetime.strptime(start, "%Y-%m-%d").date()
    final_day = datetime.strptime(end, "%Y-%m-%d").date()
    pieces: list[tuple[str, str]] = []
    cursor = first_day
    while cursor <= final_day:
        # First day of the following month, rolling the year over after December.
        if cursor.month == 12:
            next_month = date(cursor.year + 1, 1, 1)
        else:
            next_month = date(cursor.year, cursor.month + 1, 1)
        stop = min(next_month - timedelta(days=1), final_day)
        pieces.append((cursor.isoformat(), stop.isoformat()))
        cursor = stop + timedelta(days=1)
    return pieces


def cmd_sync(ns: argparse.Namespace) -> int:
    """Run the ``sync`` subcommand: fetch Statcast chunk by chunk and write each to BigQuery.

    Reads ``ns.table``, ``ns.start``, ``ns.end``, ``ns.chunk_by`` and
    ``ns.dry_run`` (and ``ns.resume`` when present). With ``--dry-run`` the
    planned chunks are only logged: no BigQuery client is created and nothing
    is fetched or written.

    Returns:
        0 on completion. Errors from date parsing, fetching or writing propagate.
    """
    ref = TableRef.parse(ns.table)

    # Plan the chunks before touching any remote service, so a malformed date
    # fails before a table is created.
    if ns.chunk_by == "year":
        chunks = _iter_year_chunks(ns.start, ns.end)
    elif ns.chunk_by == "month":
        chunks = _iter_month_chunks(ns.start, ns.end)
    else:
        chunks = [(ns.start, ns.end)]

    if getattr(ns, "resume", False):
        # NOTE(review): nothing in this command reads previously recorded runs,
        # so the flag is surfaced rather than silently ignored.
        log.warning("--resume has no effect: sync does not consult previously recorded runs")
    if not chunks:
        log.warning("empty date range %s -> %s; nothing to do", ns.start, ns.end)

    if ns.dry_run:
        for cs, ce in chunks:
            log.info("chunk %s -> %s", cs, ce)
        return 0

    client = bigquery.Client()
    sc = StatcastClient()
    writer = BigQueryWriter(client=client)
    writer.create_table_if_missing(ref)
    for cs, ce in chunks:
        log.info("chunk %s -> %s", cs, ce)
        df = sc.fetch(cs, ce)
        writer.write(ref, df, cs, ce)
    return 0
def cmd_docs(ns: argparse.Namespace) -> int:
    """Run the ``docs`` subcommand: render documentation or apply it to a table.

    ``bq-apply`` fetches the table named by ``ns.table``, replaces its schema
    with ``render_bq_descriptions()`` and updates it through the BigQuery
    client; it produces no text output. Every other format renders text that
    is written to ``ns.output`` (``'-'`` means stdout).

    Returns:
        0 on success, 2 when an option required by the chosen format is missing.

    Raises:
        AssertionError: if ``ns.format`` is not one of the handled formats.
    """
    if ns.format == "bq-apply":
        if not ns.table:
            log.error("--table required for bq-apply")
            return 2
        client = bigquery.Client()
        ref = TableRef.parse(ns.table)
        table = client.get_table(str(ref))
        table.schema = render_bq_descriptions()
        client.update_table(table, ["schema"])
        log.info("updated schema descriptions on %s", ref)
        return 0

    if ns.format == "llm":
        out = render_llm_context()
    elif ns.format == "dictionary":
        if not (ns.dataset and ns.table):
            log.error("--dataset and --table required for dictionary")
            return 2
        ref = TableRef.parse(ns.table)
        out = json.dumps(
            render_data_dictionary(dataset=ns.dataset, table=ref.table), indent=2
        )
    elif ns.format == "markdown":
        out = render_markdown()
    elif ns.format == "dbt":
        out = render_dbt_yaml()
    else:
        raise AssertionError(f"unhandled format {ns.format}")

    if ns.output == "-":
        # Re-wrap the stdout descriptor to force UTF-8 with no newline
        # translation. closefd=False keeps descriptor 1 open when the wrapper
        # is closed; the default would close stdout for the rest of the process.
        sys.stdout.flush()  # keep ordering with anything already buffered
        with open(sys.stdout.fileno(), mode="w", encoding="utf-8", newline="",
                  closefd=False) as f:
            f.write(out)
    else:
        with open(ns.output, "w", encoding="utf-8") as f:
            f.write(out)
    return 0
def cmd_verify(ns: argparse.Namespace) -> int:
    """Run the ``verify`` subcommand: compare table aggregations against Baseball Savant.

    One verifier is run per metric (every metric of the chosen aggregation when
    ``ns.metric`` is ``"all"``). Each result's summary and a PASS/FAIL verdict
    against ``ns.threshold`` are printed. When ``ns.output`` is not ``'-'`` the
    JSON form of all results is also written to that path.

    Returns:
        0 if every metric passed, 1 if any failed, 2 if ``ns.metric`` does not
        belong to the chosen aggregation.
    """
    if ns.aggregation == "player-season":
        verifier_cls, known_metrics = BaseballSavantBattingVerifier, ALL_BATTING_METRICS
    else:
        verifier_cls, known_metrics = BaseballSavantPitchingVerifier, ALL_PITCHING_METRICS

    # The parser accepts the union of batting and pitching metrics, so a metric
    # from the other family can reach this point; reject it before any query.
    # NOTE(review): assumes each verifier only supports its own metric table -- confirm.
    if ns.metric == "all":
        metrics = list(known_metrics)
    elif ns.metric in known_metrics:
        metrics = [ns.metric]
    else:
        log.error("metric %r is not available for --aggregation %s",
                  ns.metric, ns.aggregation)
        return 2

    client = bigquery.Client()
    overall_pass = True
    all_results: list[dict] = []
    for m in metrics:
        v = verifier_cls(
            client=client, table=ns.table, season=ns.season, metric=m,
            min_sample_size=ns.min_sample_size, tolerance=ns.tolerance,
        )
        result = v.run()
        print(result.summary())
        passed = result.passed(ns.threshold)  # evaluate once, reuse for verdict and status
        verdict = "PASS" if passed else "FAIL"
        print(f"{verdict} (threshold {ns.threshold:.2%})\n")
        if not passed:
            overall_pass = False
        all_results.append(result.to_json())

    if ns.output != "-":
        with open(ns.output, "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2)
    return 0 if overall_pass else 1
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* (``sys.argv[1:]`` when None) and run the selected subcommand.

    Returns the subcommand's integer exit status.
    """
    ns = build_parser().parse_args(argv)
    handlers = {"sync": cmd_sync, "docs": cmd_docs, "verify": cmd_verify}
    handler = handlers.get(ns.command)
    if handler is None:
        # The parser restricts `command` to the names above, so this is unreachable
        # unless a subparser is added without a handler.
        raise AssertionError(f"unhandled command {ns.command}")
    return handler(ns)


if __name__ == "__main__":
    sys.exit(main())

statcast_bigquery/client.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""StatcastClient: thin wrapper around pybaseball.statcast with retry + politeness."""
+from __future__ import annotations
+import logging
+import time
+from typing import Final
+import pandas as pd
+import pybaseball as pb
log = logging.getLogger(__name__)

# Base delay in seconds: the first retry backoff and the pause after a successful pull.
DEFAULT_SLEEP_SECONDS: Final[float] = 2.0
# Total number of attempts fetch() makes before giving up.
DEFAULT_MAX_RETRIES: Final[int] = 5


class StatcastClient:
    """Pull pitch-level Statcast data for a date range, regular-season only.

    Wraps ``pybaseball.statcast`` with exponential-backoff retries and a fixed
    pause after each successful, non-empty pull.
    """

    def __init__(
        self,
        sleep_seconds: float = DEFAULT_SLEEP_SECONDS,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        self.sleep_seconds = sleep_seconds
        self.max_retries = max_retries

    def fetch(self, start_date: str, end_date: str) -> pd.DataFrame:
        """Pull Statcast pitches between [start_date, end_date], filtered to regular season.

        Makes up to ``max_retries`` attempts. After failed attempt *k* it waits
        ``sleep_seconds * 2 ** (k - 1)`` seconds before trying again; there is
        no wait after the final failure.

        Returns:
            Rows whose ``game_type`` is ``"R"``, or an empty DataFrame when the
            pull returned no data.

        Raises:
            ValueError: if ``max_retries`` is less than 1.
            Exception: the error from the last attempt when every attempt fails.
        """
        # Checked first so a misconfigured client fails clearly instead of
        # falling through a loop that never runs.
        if self.max_retries < 1:
            raise ValueError(f"max_retries must be >= 1, got {self.max_retries}")

        log.info("statcast: pull %s -> %s", start_date, end_date)
        for attempt in range(1, self.max_retries + 1):
            try:
                df = pb.statcast(start_dt=start_date, end_dt=end_date)
            except Exception as e:  # pybaseball can raise generic Exception on rate limits
                if attempt == self.max_retries:
                    # Out of attempts: re-raise immediately rather than sleeping first.
                    log.warning("statcast attempt %d failed: %s; giving up", attempt, e)
                    raise
                backoff = self.sleep_seconds * (2 ** (attempt - 1))
                log.warning(
                    "statcast attempt %d failed: %s; backoff %.1fs", attempt, e, backoff
                )
                time.sleep(backoff)
            else:
                break

        if df is None or len(df) == 0:
            log.info("statcast: no data for %s -> %s", start_date, end_date)
            return pd.DataFrame()

        result = df[df["game_type"] == "R"].copy()
        if not isinstance(result, pd.DataFrame):
            result = pd.DataFrame(result)
        log.info("statcast: %d regular-season pitches", len(result))
        # Pause after a successful pull before the caller issues the next one.
        time.sleep(self.sleep_seconds)
        return result

statcast_bigquery/docs/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Documentation renderers + curated content (taxonomy, statsapi map, pitfalls, examples)."""
+from statcast_bigquery.docs.example_queries import EXAMPLE_QUERIES, ExampleQuery
+from statcast_bigquery.docs.pitfalls import PITFALLS, Pitfall
+from statcast_bigquery.docs.renderers import (
+    render_bq_descriptions,
+    render_data_dictionary,
+    render_dbt_yaml,
+    render_llm_context,
+    render_markdown,
+)
+from statcast_bigquery.docs.statsapi_map import STATCAST_TO_STATSAPI_MAP
+from statcast_bigquery.docs.taxonomy import SEMANTIC_GROUPS, columns_in_group
+__all__ = [
+    "EXAMPLE_QUERIES",
+    "ExampleQuery",
+    "PITFALLS",
+    "Pitfall",
+    "SEMANTIC_GROUPS",
+    "STATCAST_TO_STATSAPI_MAP",
+    "columns_in_group",
+    "render_bq_descriptions",
+    "render_data_dictionary",
+    "render_dbt_yaml",
+    "render_llm_context",
+    "render_markdown",
+]