statcast-bigquery 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ """statcast-bigquery.
2
+
3
+ Statcast pitch-level ingestion + LLM-friendly docs + Baseball Savant verification.
4
+ """
5
+
6
+ from statcast_bigquery._version import __version__
7
+
8
+ __all__ = ["__version__"]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,194 @@
1
+ """CLI entrypoint: statcast-bigquery {sync,docs,verify}."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import logging
8
+ import sys
9
+ from datetime import date, datetime, timedelta
10
+
11
+ from google.cloud import bigquery
12
+
13
+ from statcast_bigquery._version import __version__
14
+ from statcast_bigquery.client import StatcastClient
15
+ from statcast_bigquery.docs.renderers import (
16
+ render_bq_descriptions,
17
+ render_data_dictionary,
18
+ render_dbt_yaml,
19
+ render_llm_context,
20
+ render_markdown,
21
+ )
22
+ from statcast_bigquery.verify.savant import (
23
+ BATTING_METRIC_TO_SAVANT_FIELD,
24
+ PITCHING_METRIC_TO_SAVANT_FIELD,
25
+ BaseballSavantBattingVerifier,
26
+ BaseballSavantPitchingVerifier,
27
+ )
28
+ from statcast_bigquery.writer import BigQueryWriter, TableRef
29
+
30
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
31
+ log = logging.getLogger("statcast-bigquery")
32
+
33
+ ALL_BATTING_METRICS = list(BATTING_METRIC_TO_SAVANT_FIELD)
34
+ ALL_PITCHING_METRICS = list(PITCHING_METRIC_TO_SAVANT_FIELD)
35
+ DOC_FORMATS = ["bq-apply", "llm", "dictionary", "markdown", "dbt"]
36
+
37
+
38
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``statcast-bigquery`` argument parser.

    The parser exposes ``--version`` plus three mandatory subcommands
    (``sync``, ``docs``, ``verify``); the chosen subcommand name is stored
    on the namespace as ``command``.

    Returns:
        The fully configured parser.
    """
    root = argparse.ArgumentParser(prog="statcast-bigquery")
    root.add_argument("--version", action="version", version=__version__)
    commands = root.add_subparsers(dest="command", required=True)

    # Registration order is kept (sync, docs, verify) so --help lists the
    # subcommands in the same order as before.
    _add_sync_command(commands)
    _add_docs_command(commands)
    _add_verify_command(commands)
    return root


def _add_sync_command(commands) -> None:
    """Register the ``sync`` subcommand and its options on *commands*."""
    sync = commands.add_parser("sync", help="Pull Statcast and write to BigQuery")
    sync.add_argument("--start", required=True, help="YYYY-MM-DD start (inclusive)")
    sync.add_argument("--end", required=True, help="YYYY-MM-DD end (inclusive)")
    sync.add_argument("--table", required=True, help="project.dataset.table")
    sync.add_argument("--chunk-by", choices=["year", "month", "range"], default="year")
    sync.add_argument(
        "--resume",
        action="store_true",
        help="Skip year-chunks already recorded in _statcast_ingest_runs",
    )
    sync.add_argument("--dry-run", action="store_true")


def _add_docs_command(commands) -> None:
    """Register the ``docs`` subcommand and its options on *commands*."""
    docs = commands.add_parser("docs", help="Render documentation in various formats")
    docs.add_argument("--format", choices=DOC_FORMATS, required=True)
    docs.add_argument(
        "--table",
        help="project.dataset.table (required for bq-apply, dictionary)",
    )
    docs.add_argument("--dataset", help="for dictionary format")
    docs.add_argument("--output", default="-", help="path or '-' for stdout (default)")


def _add_verify_command(commands) -> None:
    """Register the ``verify`` subcommand and its options on *commands*."""
    verify = commands.add_parser("verify", help="Compare aggregations to external sources")
    verify.add_argument("--source", choices=["baseball-savant"], default="baseball-savant")
    verify.add_argument(
        "--aggregation",
        choices=["player-season", "pitcher-season"],
        required=True,
    )
    # Any batting or pitching metric is accepted here, plus the "all" shortcut.
    metric_choices = ALL_BATTING_METRICS + ALL_PITCHING_METRICS + ["all"]
    verify.add_argument("--metric", choices=metric_choices, required=True)
    verify.add_argument("--season", type=int, required=True)
    verify.add_argument("--table", required=True)
    verify.add_argument("--tolerance", type=float, default=None)
    verify.add_argument("--min-sample-size", type=int, default=50)
    verify.add_argument("--threshold", type=float, default=0.99)
    verify.add_argument("--output", default="-")
75
+
76
+
77
+ def _iter_year_chunks(start: str, end: str) -> list[tuple[str, str]]:
78
+ s = datetime.strptime(start, "%Y-%m-%d").date()
79
+ e = datetime.strptime(end, "%Y-%m-%d").date()
80
+ chunks: list[tuple[str, str]] = []
81
+ cur = s
82
+ while cur <= e:
83
+ year_end = date(cur.year, 12, 31)
84
+ last = min(year_end, e)
85
+ chunks.append((cur.isoformat(), last.isoformat()))
86
+ cur = last + timedelta(days=1)
87
+ return chunks
88
+
89
+
90
def _iter_month_chunks(start: str, end: str) -> list[tuple[str, str]]:
    """Split the inclusive range [start, end] into chunks that stay inside one calendar month.

    Both arguments are ``YYYY-MM-DD`` strings; the result is a chronological
    list of ``(chunk_start, chunk_end)`` ISO date pairs, empty when ``start``
    is after ``end``.
    """
    first_day = datetime.strptime(start, "%Y-%m-%d").date()
    final_day = datetime.strptime(end, "%Y-%m-%d").date()
    chunks: list[tuple[str, str]] = []
    cur = first_day
    while cur <= final_day:
        if cur.month == 12:
            month_end = date(cur.year, 12, 31)
        else:
            # Last day of this month = day before the first of the next month.
            month_end = date(cur.year, cur.month + 1, 1) - timedelta(days=1)
        last = min(month_end, final_day)
        chunks.append((cur.isoformat(), last.isoformat()))
        cur = last + timedelta(days=1)
    return chunks


def cmd_sync(ns: argparse.Namespace) -> int:
    """Run the ``sync`` subcommand: fetch Statcast data chunk by chunk and write it to BigQuery.

    Args:
        ns: Parsed arguments; reads ``table``, ``start``, ``end``, ``chunk_by``,
            ``dry_run`` and (if present) ``resume``.

    Returns:
        Process exit code ``0``. Errors from the Statcast client or the
        BigQuery writer propagate as exceptions.
    """
    ref = TableRef.parse(ns.table)

    # "month" used to fall through to the single-range branch even though the
    # parser advertises it as a distinct choice.
    if ns.chunk_by == "year":
        chunks = _iter_year_chunks(ns.start, ns.end)
    elif ns.chunk_by == "month":
        chunks = _iter_month_chunks(ns.start, ns.end)
    else:
        chunks = [(ns.start, ns.end)]

    if getattr(ns, "resume", False):
        # The flag is parsed but nothing in this command consults it; say so
        # instead of silently re-processing everything.
        log.warning("--resume is not implemented by this command; every chunk will be processed")

    if ns.dry_run:
        # A dry run only reports the plan, so it must not need BigQuery
        # credentials or touch the network.
        for cs, ce in chunks:
            log.info("chunk %s -> %s", cs, ce)
        return 0

    client = bigquery.Client()
    sc = StatcastClient()
    writer = BigQueryWriter(client=client)
    writer.create_table_if_missing(ref)

    for cs, ce in chunks:
        log.info("chunk %s -> %s", cs, ce)
        df = sc.fetch(cs, ce)
        writer.write(ref, df, cs, ce)
    return 0
107
+
108
+
109
def _write_text_output(text: str, destination: str) -> None:
    """Write *text* to the file at *destination*, or to stdout when it is ``"-"``."""
    if destination != "-":
        with open(destination, "w", encoding="utf-8") as f:
            f.write(text)
        return

    try:
        # Flush anything already buffered so ordering on the descriptor is kept.
        sys.stdout.flush()
        fd = sys.stdout.fileno()
    except (OSError, ValueError):
        # stdout has no real descriptor (e.g. it was replaced by an in-memory
        # stream); fall back to the stream object itself.
        sys.stdout.write(text)
        return

    # closefd=False is essential: without it, leaving the ``with`` block closes
    # file descriptor 1 and every later write to stdout in this process fails.
    # The wrapper still forces UTF-8 and disables newline translation.
    with open(fd, mode="w", encoding="utf-8", newline="", closefd=False) as f:
        f.write(text)


def cmd_docs(ns: argparse.Namespace) -> int:
    """Run the ``docs`` subcommand: render documentation in the requested format.

    ``bq-apply`` pushes the rendered schema onto an existing BigQuery table;
    every other format produces text that is written to ``ns.output``
    (``"-"`` means stdout).

    Args:
        ns: Parsed arguments; reads ``format``, ``table``, ``dataset`` and
            ``output``.

    Returns:
        ``0`` on success, ``2`` when an option required by the chosen format
        is missing.
    """
    if ns.format == "bq-apply":
        if not ns.table:
            log.error("--table required for bq-apply")
            return 2
        client = bigquery.Client()
        ref = TableRef.parse(ns.table)
        table = client.get_table(str(ref))
        table.schema = render_bq_descriptions()
        client.update_table(table, ["schema"])
        log.info("updated schema descriptions on %s", ref)
        return 0

    if ns.format == "llm":
        out = render_llm_context()
    elif ns.format == "dictionary":
        if not (ns.dataset and ns.table):
            log.error("--dataset and --table required for dictionary")
            return 2
        ref = TableRef.parse(ns.table)
        out = json.dumps(
            render_data_dictionary(dataset=ns.dataset, table=ref.table), indent=2
        )
    elif ns.format == "markdown":
        out = render_markdown()
    elif ns.format == "dbt":
        out = render_dbt_yaml()
    else:
        raise AssertionError(f"unhandled format {ns.format}")

    _write_text_output(out, ns.output)
    return 0
146
+
147
+
148
def cmd_verify(ns: argparse.Namespace) -> int:
    """Run the ``verify`` subcommand: check one or all metrics with the matching verifier.

    ``player-season`` uses the batting verifier and batting metrics; any other
    aggregation uses the pitching verifier and pitching metrics. A summary and
    a PASS/FAIL verdict are printed per metric; when ``ns.output`` is not
    ``"-"`` the JSON form of every result is also written to that path.

    Args:
        ns: Parsed arguments; reads ``aggregation``, ``metric``, ``table``,
            ``season``, ``min_sample_size``, ``tolerance``, ``threshold`` and
            ``output``.

    Returns:
        ``0`` if every metric passed at ``ns.threshold``, ``1`` if any failed,
        ``2`` if the requested metric does not belong to the chosen aggregation.
    """
    if ns.aggregation == "player-season":
        verifier_cls = BaseballSavantBattingVerifier
        valid_metrics = ALL_BATTING_METRICS
    else:
        verifier_cls = BaseballSavantPitchingVerifier
        valid_metrics = ALL_PITCHING_METRICS

    if ns.metric == "all":
        metrics = list(valid_metrics)
    elif ns.metric in valid_metrics:
        metrics = [ns.metric]
    else:
        # The parser accepts the union of batting and pitching metrics, so a
        # mismatched pair can get this far; reject it before doing any work.
        log.error(
            "metric %r is not available for aggregation %r (choose from: %s)",
            ns.metric, ns.aggregation, ", ".join(valid_metrics),
        )
        return 2

    client = bigquery.Client()
    overall_pass = True
    all_results: list[dict] = []
    for m in metrics:
        v = verifier_cls(
            client=client, table=ns.table, season=ns.season, metric=m,
            min_sample_size=ns.min_sample_size, tolerance=ns.tolerance,
        )
        result = v.run()
        print(result.summary())
        # Evaluate the verdict once and reuse it for both the message and the
        # overall exit status.
        passed = result.passed(ns.threshold)
        verdict = "PASS" if passed else "FAIL"
        print(f"{verdict} (threshold {ns.threshold:.2%})\n")
        if not passed:
            overall_pass = False
        all_results.append(result.to_json())

    if ns.output != "-":
        with open(ns.output, "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2)

    return 0 if overall_pass else 1
179
+
180
+
181
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the handler of the selected subcommand.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read the
            process arguments.

    Returns:
        The exit code returned by the subcommand handler.

    Raises:
        AssertionError: If the parsed command has no registered handler.
    """
    ns = build_parser().parse_args(argv)
    handlers = {
        "sync": cmd_sync,
        "docs": cmd_docs,
        "verify": cmd_verify,
    }
    handler = handlers.get(ns.command)
    if handler is None:
        raise AssertionError(f"unhandled command {ns.command}")
    return handler(ns)
191
+
192
+
193
+ if __name__ == "__main__":
194
+ sys.exit(main())
@@ -0,0 +1,62 @@
1
+ """StatcastClient: thin wrapper around pybaseball.statcast with retry + politeness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import time
7
+ from typing import Final
8
+
9
+ import pandas as pd
10
+ import pybaseball as pb
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+ DEFAULT_SLEEP_SECONDS: Final[float] = 2.0
15
+ DEFAULT_MAX_RETRIES: Final[int] = 5
16
+
17
+
18
class StatcastClient:
    """Pull pitch-level Statcast data for a date range, regular-season only.

    Wraps ``pybaseball.statcast`` with exponential-backoff retries and a fixed
    pause after each successful non-empty pull.
    """

    def __init__(
        self,
        sleep_seconds: float = DEFAULT_SLEEP_SECONDS,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ) -> None:
        """Store the retry/politeness settings.

        Args:
            sleep_seconds: Base delay; used both as the first backoff interval
                and as the pause after a successful pull.
            max_retries: Maximum number of attempts per ``fetch`` call (total
                attempts, not extra retries). Values below 1 are treated as 1.
        """
        self.sleep_seconds = sleep_seconds
        self.max_retries = max_retries

    def fetch(self, start_date: str, end_date: str) -> pd.DataFrame:
        """Pull Statcast pitches between [start_date, end_date], filtered to regular season.

        Args:
            start_date: Start of the range, passed to ``pybaseball.statcast``
                as ``start_dt``.
            end_date: End of the range, passed as ``end_dt``.

        Returns:
            A copy of the rows whose ``game_type`` equals ``"R"``, or an empty
            DataFrame when the pull returned nothing.

        Raises:
            Exception: Whatever the final attempt raised, once all attempts
                have failed.
        """
        log.info("statcast: pull %s -> %s", start_date, end_date)
        # Always make at least one attempt; a non-positive max_retries used to
        # skip the loop entirely and die on an assert instead of fetching.
        attempts = max(1, self.max_retries)
        df = None
        for attempt in range(1, attempts + 1):
            try:
                df = pb.statcast(start_dt=start_date, end_dt=end_date)
                break
            except Exception as e:  # pybaseball can raise generic Exception on rate limits
                if attempt == attempts:
                    # Out of attempts: re-raise immediately rather than
                    # sleeping through one more (and the longest) backoff.
                    log.warning("statcast attempt %d failed: %s; giving up", attempt, e)
                    raise
                backoff = self.sleep_seconds * (2 ** (attempt - 1))
                log.warning(
                    "statcast attempt %d failed: %s; backoff %.1fs", attempt, e, backoff
                )
                time.sleep(backoff)

        if df is None or len(df) == 0:
            log.info("statcast: no data for %s -> %s", start_date, end_date)
            return pd.DataFrame()

        result = df[df["game_type"] == "R"].copy()
        if not isinstance(result, pd.DataFrame):
            result = pd.DataFrame(result)
        log.info("statcast: %d regular-season pitches", len(result))
        # Politeness pause before the caller issues its next request.
        time.sleep(self.sleep_seconds)
        return result
@@ -0,0 +1,28 @@
1
+ """Documentation renderers + curated content (taxonomy, statsapi map, pitfalls, examples)."""
2
+
3
+ from statcast_bigquery.docs.example_queries import EXAMPLE_QUERIES, ExampleQuery
4
+ from statcast_bigquery.docs.pitfalls import PITFALLS, Pitfall
5
+ from statcast_bigquery.docs.renderers import (
6
+ render_bq_descriptions,
7
+ render_data_dictionary,
8
+ render_dbt_yaml,
9
+ render_llm_context,
10
+ render_markdown,
11
+ )
12
+ from statcast_bigquery.docs.statsapi_map import STATCAST_TO_STATSAPI_MAP
13
+ from statcast_bigquery.docs.taxonomy import SEMANTIC_GROUPS, columns_in_group
14
+
15
+ __all__ = [
16
+ "EXAMPLE_QUERIES",
17
+ "ExampleQuery",
18
+ "PITFALLS",
19
+ "Pitfall",
20
+ "SEMANTIC_GROUPS",
21
+ "STATCAST_TO_STATSAPI_MAP",
22
+ "columns_in_group",
23
+ "render_bq_descriptions",
24
+ "render_data_dictionary",
25
+ "render_dbt_yaml",
26
+ "render_llm_context",
27
+ "render_markdown",
28
+ ]