statcast-bigquery 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. statcast_bigquery-0.1.0/.github/workflows/release.yml +41 -0
  2. statcast_bigquery-0.1.0/.github/workflows/test.yml +30 -0
  3. statcast_bigquery-0.1.0/.gitignore +19 -0
  4. statcast_bigquery-0.1.0/.python-version +1 -0
  5. statcast_bigquery-0.1.0/CHANGELOG.md +11 -0
  6. statcast_bigquery-0.1.0/CONTRIBUTING.md +15 -0
  7. statcast_bigquery-0.1.0/LICENSE +24 -0
  8. statcast_bigquery-0.1.0/PKG-INFO +61 -0
  9. statcast_bigquery-0.1.0/README.md +28 -0
  10. statcast_bigquery-0.1.0/pyproject.toml +64 -0
  11. statcast_bigquery-0.1.0/scripts/_pybaseball_columns_2024-04-01.txt +124 -0
  12. statcast_bigquery-0.1.0/scripts/discover_pybaseball_columns.py +22 -0
  13. statcast_bigquery-0.1.0/statcast_bigquery/__init__.py +8 -0
  14. statcast_bigquery-0.1.0/statcast_bigquery/_version.py +1 -0
  15. statcast_bigquery-0.1.0/statcast_bigquery/cli.py +194 -0
  16. statcast_bigquery-0.1.0/statcast_bigquery/client.py +62 -0
  17. statcast_bigquery-0.1.0/statcast_bigquery/docs/__init__.py +28 -0
  18. statcast_bigquery-0.1.0/statcast_bigquery/docs/example_queries.py +483 -0
  19. statcast_bigquery-0.1.0/statcast_bigquery/docs/pitfalls.py +116 -0
  20. statcast_bigquery-0.1.0/statcast_bigquery/docs/renderers.py +155 -0
  21. statcast_bigquery-0.1.0/statcast_bigquery/docs/statsapi_map.py +90 -0
  22. statcast_bigquery-0.1.0/statcast_bigquery/docs/taxonomy.py +30 -0
  23. statcast_bigquery-0.1.0/statcast_bigquery/schema.py +2473 -0
  24. statcast_bigquery-0.1.0/statcast_bigquery/verify/__init__.py +23 -0
  25. statcast_bigquery-0.1.0/statcast_bigquery/verify/base.py +76 -0
  26. statcast_bigquery-0.1.0/statcast_bigquery/verify/compare.py +32 -0
  27. statcast_bigquery-0.1.0/statcast_bigquery/verify/savant.py +263 -0
  28. statcast_bigquery-0.1.0/statcast_bigquery/writer.py +112 -0
  29. statcast_bigquery-0.1.0/tests/__init__.py +0 -0
  30. statcast_bigquery-0.1.0/tests/fixtures/__init__.py +1 -0
  31. statcast_bigquery-0.1.0/tests/fixtures/make_savant_fixture.py +31 -0
  32. statcast_bigquery-0.1.0/tests/fixtures/make_statcast_fixture.py +22 -0
  33. statcast_bigquery-0.1.0/tests/fixtures/savant_batter_2024.parquet +0 -0
  34. statcast_bigquery-0.1.0/tests/fixtures/savant_leaderboard_2024.parquet +0 -0
  35. statcast_bigquery-0.1.0/tests/fixtures/savant_pitcher_2024.parquet +0 -0
  36. statcast_bigquery-0.1.0/tests/fixtures/statcast_sample_2024-04-01.parquet +0 -0
  37. statcast_bigquery-0.1.0/tests/test_cli.py +56 -0
  38. statcast_bigquery-0.1.0/tests/test_client.py +56 -0
  39. statcast_bigquery-0.1.0/tests/test_example_queries.py +55 -0
  40. statcast_bigquery-0.1.0/tests/test_idempotency.py +40 -0
  41. statcast_bigquery-0.1.0/tests/test_pitfalls.py +27 -0
  42. statcast_bigquery-0.1.0/tests/test_renderers.py +70 -0
  43. statcast_bigquery-0.1.0/tests/test_schema.py +190 -0
  44. statcast_bigquery-0.1.0/tests/test_taxonomy.py +40 -0
  45. statcast_bigquery-0.1.0/tests/test_verify_compare.py +73 -0
  46. statcast_bigquery-0.1.0/tests/test_verify_savant.py +126 -0
  47. statcast_bigquery-0.1.0/tests/test_writer.py +80 -0
  48. statcast_bigquery-0.1.0/uv.lock +1811 -0
@@ -0,0 +1,41 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ outputs:
12
+ dist-path: ${{ steps.build.outputs.dist-path }}
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - uses: astral-sh/setup-uv@v3
16
+ - run: uv python install 3.13
17
+ - name: Create venv
18
+ run: uv venv
19
+ - name: Install build tools
20
+ run: uv pip install build
21
+ - id: build
22
+ run: |
23
+ uv run python -m build
24
+ echo "dist-path=dist" >> $GITHUB_OUTPUT
25
+ - uses: actions/upload-artifact@v4
26
+ with:
27
+ name: dist
28
+ path: dist/
29
+
30
+ publish:
31
+ needs: build
32
+ runs-on: ubuntu-latest
33
+ environment: pypi
34
+ permissions:
35
+ id-token: write
36
+ steps:
37
+ - uses: actions/download-artifact@v4
38
+ with:
39
+ name: dist
40
+ path: dist/
41
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,30 @@
1
+ name: test
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.11", "3.12", "3.13"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: astral-sh/setup-uv@v3
17
+ - name: Set up Python ${{ matrix.python-version }}
18
+ run: uv python install ${{ matrix.python-version }}
19
+ - name: Create venv
20
+ run: uv venv
21
+ - name: Install
22
+ run: uv pip install -e ".[dev]"
23
+ - name: Lint
24
+ run: uv run ruff check .
25
+ - name: Type check
26
+ run: uv run pyright
27
+ - name: Test
28
+ run: uv run pytest -v --cov=statcast_bigquery --cov-report=xml
29
+ - name: Build
30
+ run: uv run python -m build
@@ -0,0 +1,19 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ .uv-cache/
5
+ .ruff_cache/
6
+ .pytest_cache/
7
+ .coverage
8
+ htmlcov/
9
+ dist/
10
+ build/
11
+ *.egg-info/
12
+ .python-version-local
13
+ .env
14
+ .env.*
15
+ !.env.example
16
+ .vscode/
17
+ .idea/
18
+ .DS_Store
19
+ Thumbs.db
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 2026-05-?? (planned)
4
+
5
+ ### Added
6
+ - Initial release.
7
+ - `statcast-bigquery sync` — idempotent Statcast pitch-level ingestion to BigQuery.
8
+ - `statcast-bigquery docs` — 5 documentation renderers (BQ-native, LLM, dictionary, markdown, dbt).
9
+ - `statcast-bigquery verify` — Baseball Savant leaderboard verification (8 metrics).
10
+ - 25 vetted example queries; a catalog of 10+ pitfalls; statsapi cross-reference.
11
+ - Schema spans pybaseball Statcast columns (~118 fields); auto-applied BQ-native column descriptions.
@@ -0,0 +1,15 @@
1
+ # Contributing
2
+
3
+ Bug reports + small PRs welcome. Please open an issue before large changes.
4
+
5
+ This is a hobbyist project; review cadence is best-effort.
6
+
7
+ ## Dev setup
8
+
9
+ uv venv
10
+ uv pip install -e ".[dev]"
11
+ uv run pytest
12
+
13
+ ## Style
14
+
15
+ Ruff + pyright. Run `uv run ruff check . && uv run pyright` before opening a PR.
@@ -0,0 +1,24 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jason Blahovec
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ This software does not include or distribute MLB data. MLB data accessed via
24
+ this software is governed by its source's terms (Baseball Savant / MLBAM).
@@ -0,0 +1,61 @@
1
+ Metadata-Version: 2.4
2
+ Name: statcast-bigquery
3
+ Version: 0.1.0
4
+ Summary: Statcast → BigQuery: idempotent ingestion + LLM-friendly docs + Baseball Savant verification
5
+ Project-URL: Homepage, https://github.com/blahovec-labs/statcast-bigquery
6
+ Project-URL: Issues, https://github.com/blahovec-labs/statcast-bigquery/issues
7
+ Project-URL: Changelog, https://github.com/blahovec-labs/statcast-bigquery/blob/main/CHANGELOG.md
8
+ Author: Jason Blahovec
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: baseball,bigquery,data-engineering,mlb,statcast
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Database
19
+ Classifier: Topic :: Scientific/Engineering
20
+ Requires-Python: >=3.11
21
+ Requires-Dist: google-cloud-bigquery<4.0,>=3.20
22
+ Requires-Dist: pandas<3.0,>=2.0
23
+ Requires-Dist: pyarrow<19.0,>=15.0
24
+ Requires-Dist: pybaseball<3.0,>=2.2.7
25
+ Provides-Extra: dev
26
+ Requires-Dist: build>=1.2.0; extra == 'dev'
27
+ Requires-Dist: duckdb<2.0,>=1.0; extra == 'dev'
28
+ Requires-Dist: pyright>=1.1.380; extra == 'dev'
29
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
30
+ Requires-Dist: pytest>=8.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.6.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # statcast-bigquery
35
+
36
+ Idempotent Statcast → BigQuery ingestion, with first-class documentation for SQL/LLM agents and round-trip validation against Baseball Savant.
37
+
38
+ ## Install
39
+
40
+ pip install statcast-bigquery
41
+
42
+ ## Quickstart
43
+
44
+ gcloud auth application-default login
45
+ statcast-bigquery sync \
46
+ --start 2024-04-01 --end 2024-10-31 \
47
+ --table myproject.mydataset.statcast_pitches
48
+
49
+ ## Documentation
50
+
51
+ statcast-bigquery docs --format llm > STATCAST_FOR_LLMS.md
52
+
53
+ ## Verification
54
+
55
+ statcast-bigquery verify \
56
+ --source baseball-savant \
57
+ --aggregation player-season \
58
+ --metric all --season 2024 \
59
+ --table myproject.mydataset.statcast_pitches
60
+
61
+ MIT licensed. This software does not include or distribute MLB data.
@@ -0,0 +1,28 @@
1
+ # statcast-bigquery
2
+
3
+ Idempotent Statcast → BigQuery ingestion, with first-class documentation for SQL/LLM agents and round-trip validation against Baseball Savant.
4
+
5
+ ## Install
6
+
7
+ pip install statcast-bigquery
8
+
9
+ ## Quickstart
10
+
11
+ gcloud auth application-default login
12
+ statcast-bigquery sync \
13
+ --start 2024-04-01 --end 2024-10-31 \
14
+ --table myproject.mydataset.statcast_pitches
15
+
16
+ ## Documentation
17
+
18
+ statcast-bigquery docs --format llm > STATCAST_FOR_LLMS.md
19
+
20
+ ## Verification
21
+
22
+ statcast-bigquery verify \
23
+ --source baseball-savant \
24
+ --aggregation player-season \
25
+ --metric all --season 2024 \
26
+ --table myproject.mydataset.statcast_pitches
27
+
28
+ MIT licensed. This software does not include or distribute MLB data.
@@ -0,0 +1,64 @@
1
+ [project]
2
+ name = "statcast-bigquery"
3
+ version = "0.1.0"
4
+ description = "Statcast → BigQuery: idempotent ingestion + LLM-friendly docs + Baseball Savant verification"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [{ name = "Jason Blahovec" }]
8
+ requires-python = ">=3.11"
9
+ keywords = ["mlb", "baseball", "statcast", "bigquery", "data-engineering"]
10
+ classifiers = [
11
+ "Development Status :: 4 - Beta",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3.11",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Programming Language :: Python :: 3.13",
17
+ "Topic :: Database",
18
+ "Topic :: Scientific/Engineering",
19
+ ]
20
+ dependencies = [
21
+ "pybaseball>=2.2.7,<3.0",
22
+ "pandas>=2.0,<3.0",
23
+ "google-cloud-bigquery>=3.20,<4.0",
24
+ "pyarrow>=15.0,<19.0",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ dev = [
29
+ "pytest>=8.0",
30
+ "pytest-cov>=5.0",
31
+ "ruff>=0.6.0",
32
+ "pyright>=1.1.380",
33
+ "build>=1.2.0",
34
+ "duckdb>=1.0,<2.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ statcast-bigquery = "statcast_bigquery.cli:main"
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/blahovec-labs/statcast-bigquery"
42
+ Issues = "https://github.com/blahovec-labs/statcast-bigquery/issues"
43
+ Changelog = "https://github.com/blahovec-labs/statcast-bigquery/blob/main/CHANGELOG.md"
44
+
45
+ [build-system]
46
+ requires = ["hatchling"]
47
+ build-backend = "hatchling.build"
48
+
49
+ [tool.hatch.build.targets.wheel]
50
+ packages = ["statcast_bigquery"]
51
+
52
[tool.ruff]
line-length = 100
# Target the oldest supported interpreter (requires-python = ">=3.11") so the
# pyupgrade ("UP") rules never suggest syntax that only newer Pythons can parse.
target-version = "py311"
55
+
56
+ [tool.ruff.lint]
57
+ select = ["E", "F", "I", "N", "UP", "W"]
58
+
59
[tool.pyright]
# Type-check against the oldest supported interpreter (requires-python = ">=3.11")
# so use of newer-only stdlib APIs or syntax is reported instead of silently accepted.
pythonVersion = "3.11"
typeCheckingMode = "standard"
62
+
63
+ [tool.pytest.ini_options]
64
+ testpaths = ["tests"]
@@ -0,0 +1,124 @@
1
+ This is a large query, it may take a moment to complete
2
+
3
  0%| | 0/1 [00:00<?, ?it/s]C:\Users\jason\Projects\statcast-bigquery\.venv\Lib\site-packages\pybaseball\datahelpers\postprocessing.py:59: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead
4
+ data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
5
+
6
+ Rows: 4190
7
+ Columns: 118
8
+ 'pitch_type' object
9
+ 'game_date' datetime64[ns]
10
+ 'release_speed' Float64
11
+ 'release_pos_x' Float64
12
+ 'release_pos_z' Float64
13
+ 'player_name' object
14
+ 'batter' Int64
15
+ 'pitcher' Int64
16
+ 'events' object
17
+ 'description' object
18
+ 'spin_dir' Int64
19
+ 'spin_rate_deprecated' Int64
20
+ 'break_angle_deprecated' Int64
21
+ 'break_length_deprecated' Int64
22
+ 'zone' Int64
23
+ 'des' object
24
+ 'game_type' object
25
+ 'stand' object
26
+ 'p_throws' object
27
+ 'home_team' object
28
+ 'away_team' object
29
+ 'type' object
30
+ 'hit_location' Int64
31
+ 'bb_type' object
32
+ 'balls' Int64
33
+ 'strikes' Int64
34
+ 'game_year' Int64
35
+ 'pfx_x' Float64
36
+ 'pfx_z' Float64
37
+ 'plate_x' Float64
38
+ 'plate_z' Float64
39
+ 'on_3b' Int64
40
+ 'on_2b' Int64
41
+ 'on_1b' Int64
42
+ 'outs_when_up' Int64
43
+ 'inning' Int64
44
+ 'inning_topbot' object
45
+ 'hc_x' Float64
46
+ 'hc_y' Float64
47
+ 'tfs_deprecated' Int64
48
+ 'tfs_zulu_deprecated' Int64
49
+ 'umpire' Int64
50
+ 'sv_id' Int64
51
+ 'vx0' Float64
52
+ 'vy0' Float64
53
+ 'vz0' Float64
54
+ 'ax' Float64
55
+ 'ay' Float64
56
+ 'az' Float64
57
+ 'sz_top' Float64
58
+ 'sz_bot' Float64
59
+ 'hit_distance_sc' Int64
60
+ 'launch_speed' Float64
61
+ 'launch_angle' Int64
62
+ 'effective_speed' Float64
63
+ 'release_spin_rate' Int64
64
+ 'release_extension' Float64
65
+ 'game_pk' Int64
66
+ 'fielder_2' Int64
67
+ 'fielder_3' Int64
68
+ 'fielder_4' Int64
69
+ 'fielder_5' Int64
70
+ 'fielder_6' Int64
71
+ 'fielder_7' Int64
72
+ 'fielder_8' Int64
73
+ 'fielder_9' Int64
74
+ 'release_pos_y' Float64
75
+ 'estimated_ba_using_speedangle' Float64
76
+ 'estimated_woba_using_speedangle' Float64
77
+ 'woba_value' Float64
78
+ 'woba_denom' Int64
79
+ 'babip_value' Int64
80
+ 'iso_value' Int64
81
+ 'launch_speed_angle' Int64
82
+ 'at_bat_number' Int64
83
+ 'pitch_number' Int64
84
+ 'pitch_name' object
85
+ 'home_score' Int64
86
+ 'away_score' Int64
87
+ 'bat_score' Int64
88
+ 'fld_score' Int64
89
+ 'post_away_score' Int64
90
+ 'post_home_score' Int64
91
+ 'post_bat_score' Int64
92
+ 'post_fld_score' Int64
93
+ 'if_fielding_alignment' object
94
+ 'of_fielding_alignment' object
95
+ 'spin_axis' Int64
96
+ 'delta_home_win_exp' Float64
97
+ 'delta_run_exp' Float64
98
+ 'bat_speed' Int64
99
+ 'swing_length' Int64
100
+ 'estimated_slg_using_speedangle' Float64
101
+ 'delta_pitcher_run_exp' Float64
102
+ 'hyper_speed' Float64
103
+ 'home_score_diff' Int64
104
+ 'bat_score_diff' Int64
105
+ 'home_win_exp' Float64
106
+ 'bat_win_exp' Float64
107
+ 'age_pit_legacy' Int64
108
+ 'age_bat_legacy' Int64
109
+ 'age_pit' Int64
110
+ 'age_bat' Int64
111
+ 'n_thruorder_pitcher' Int64
112
+ 'n_priorpa_thisgame_player_at_bat' Int64
113
+ 'pitcher_days_since_prev_game' Int64
114
+ 'batter_days_since_prev_game' Int64
115
+ 'pitcher_days_until_next_game' Int64
116
+ 'batter_days_until_next_game' Int64
117
+ 'api_break_z_with_gravity' Float64
118
+ 'api_break_x_arm' Float64
119
+ 'api_break_x_batter_in' Float64
120
+ 'arm_angle' Float64
121
+ 'attack_angle' Int64
122
+ 'attack_direction' Int64
123
+ 'swing_path_tilt' Int64
124
+ 'intercept_ball_minus_batter_pos_x_inches' Int64
125
+ 'intercept_ball_minus_batter_pos_y_inches' Int64
@@ -0,0 +1,22 @@
1
+ """Run pybaseball.statcast for one day; print column names + dtypes.
2
+
3
+ Use this once at the start of Task 3 to ensure PITCHES_SCHEMA covers every column.
4
+ Output is informational only; not committed as a test.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import pybaseball as pb
10
+
11
+
12
def main() -> None:
    """Fetch Statcast data for 2024-04-01 and print every column with its dtype."""
    frame = pb.statcast(start_dt="2024-04-01", end_dt="2024-04-01")
    columns = frame.columns
    print(f"Rows: {len(frame)}")
    print(f"Columns: {len(columns)}")
    for name in columns:
        # repr() of the column name, left-aligned in a 35-character field.
        print(f" {name!r:<35} {frame[name].dtype}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,8 @@
1
+ """statcast-bigquery.
2
+
3
+ Statcast pitch-level ingestion + LLM-friendly docs + Baseball Savant verification.
4
+ """
5
+
6
+ from statcast_bigquery._version import __version__
7
+
8
+ __all__ = ["__version__"]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,194 @@
1
+ """CLI entrypoint: statcast-bigquery {sync,docs,verify}."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import logging
8
+ import sys
9
+ from datetime import date, datetime, timedelta
10
+
11
+ from google.cloud import bigquery
12
+
13
+ from statcast_bigquery._version import __version__
14
+ from statcast_bigquery.client import StatcastClient
15
+ from statcast_bigquery.docs.renderers import (
16
+ render_bq_descriptions,
17
+ render_data_dictionary,
18
+ render_dbt_yaml,
19
+ render_llm_context,
20
+ render_markdown,
21
+ )
22
+ from statcast_bigquery.verify.savant import (
23
+ BATTING_METRIC_TO_SAVANT_FIELD,
24
+ PITCHING_METRIC_TO_SAVANT_FIELD,
25
+ BaseballSavantBattingVerifier,
26
+ BaseballSavantPitchingVerifier,
27
+ )
28
+ from statcast_bigquery.writer import BigQueryWriter, TableRef
29
+
30
# Configure logging at import time: INFO and above, each record prefixed with a
# timestamp and the logger name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
log = logging.getLogger("statcast-bigquery")

# Metric names offered by `verify --metric`, obtained by iterating the Savant field maps.
ALL_BATTING_METRICS = list(BATTING_METRIC_TO_SAVANT_FIELD)
ALL_PITCHING_METRICS = list(PITCHING_METRIC_TO_SAVANT_FIELD)
# Values accepted by `docs --format`.
DOC_FORMATS = ["bq-apply", "llm", "dictionary", "markdown", "dbt"]
36
+
37
+
38
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser with its ``sync``, ``docs`` and ``verify`` subcommands.

    The chosen subcommand name is stored on the parsed namespace as ``command``;
    omitting a subcommand is a parse error.
    """
    root = argparse.ArgumentParser(prog="statcast-bigquery")
    root.add_argument("--version", action="version", version=__version__)
    commands = root.add_subparsers(dest="command", required=True)

    # --- sync -----------------------------------------------------------
    sync_cmd = commands.add_parser("sync", help="Pull Statcast and write to BigQuery")
    sync_cmd.add_argument("--start", required=True, help="YYYY-MM-DD start (inclusive)")
    sync_cmd.add_argument("--end", required=True, help="YYYY-MM-DD end (inclusive)")
    sync_cmd.add_argument("--table", required=True, help="project.dataset.table")
    sync_cmd.add_argument("--chunk-by", default="year", choices=["year", "month", "range"])
    sync_cmd.add_argument(
        "--resume",
        action="store_true",
        help="Skip year-chunks already recorded in _statcast_ingest_runs",
    )
    sync_cmd.add_argument("--dry-run", action="store_true")

    # --- docs -----------------------------------------------------------
    docs_cmd = commands.add_parser("docs", help="Render documentation in various formats")
    docs_cmd.add_argument("--format", required=True, choices=DOC_FORMATS)
    docs_cmd.add_argument(
        "--table",
        help="project.dataset.table (required for bq-apply, dictionary)",
    )
    docs_cmd.add_argument("--dataset", help="for dictionary format")
    docs_cmd.add_argument("--output", default="-", help="path or '-' for stdout (default)")

    # --- verify ---------------------------------------------------------
    verify_cmd = commands.add_parser("verify", help="Compare aggregations to external sources")
    verify_cmd.add_argument("--source", default="baseball-savant", choices=["baseball-savant"])
    verify_cmd.add_argument(
        "--aggregation",
        required=True,
        choices=["player-season", "pitcher-season"],
    )
    # Metric choices are the union of both families plus the "all" shorthand.
    metric_choices = ALL_BATTING_METRICS + ALL_PITCHING_METRICS + ["all"]
    verify_cmd.add_argument("--metric", required=True, choices=metric_choices)
    verify_cmd.add_argument("--season", required=True, type=int)
    verify_cmd.add_argument("--table", required=True)
    verify_cmd.add_argument("--tolerance", type=float, default=None)
    verify_cmd.add_argument("--min-sample-size", type=int, default=50)
    verify_cmd.add_argument("--threshold", type=float, default=0.99)
    verify_cmd.add_argument("--output", default="-")

    return root
75
+
76
+
77
+ def _iter_year_chunks(start: str, end: str) -> list[tuple[str, str]]:
78
+ s = datetime.strptime(start, "%Y-%m-%d").date()
79
+ e = datetime.strptime(end, "%Y-%m-%d").date()
80
+ chunks: list[tuple[str, str]] = []
81
+ cur = s
82
+ while cur <= e:
83
+ year_end = date(cur.year, 12, 31)
84
+ last = min(year_end, e)
85
+ chunks.append((cur.isoformat(), last.isoformat()))
86
+ cur = last + timedelta(days=1)
87
+ return chunks
88
+
89
+
90
+ def cmd_sync(ns: argparse.Namespace) -> int:
91
+ client = bigquery.Client()
92
+ sc = StatcastClient()
93
+ writer = BigQueryWriter(client=client)
94
+ ref = TableRef.parse(ns.table)
95
+ if not ns.dry_run:
96
+ writer.create_table_if_missing(ref)
97
+
98
+ chunks = _iter_year_chunks(ns.start, ns.end) if ns.chunk_by == "year" \
99
+ else [(ns.start, ns.end)]
100
+ for cs, ce in chunks:
101
+ log.info("chunk %s -> %s", cs, ce)
102
+ if ns.dry_run:
103
+ continue
104
+ df = sc.fetch(cs, ce)
105
+ writer.write(ref, df, cs, ce)
106
+ return 0
107
+
108
+
109
def cmd_docs(ns: argparse.Namespace) -> int:
    """Render documentation in the requested format, or apply it to a live table.

    ``bq-apply`` fetches ``--table``, replaces its schema with the result of
    ``render_bq_descriptions()`` and updates the table. Every other format produces
    text that is written to ``--output`` (``-`` means stdout).

    Returns:
        0 on success, 2 when an option required by the chosen format is missing.

    Raises:
        AssertionError: if ``ns.format`` is not one of the handled formats.
    """
    if ns.format == "bq-apply":
        if not ns.table:
            log.error("--table required for bq-apply")
            return 2
        client = bigquery.Client()
        ref = TableRef.parse(ns.table)
        table = client.get_table(str(ref))
        table.schema = render_bq_descriptions()
        client.update_table(table, ["schema"])
        log.info("updated schema descriptions on %s", ref)
        return 0

    if ns.format == "llm":
        out = render_llm_context()
    elif ns.format == "dictionary":
        if not (ns.dataset and ns.table):
            log.error("--dataset and --table required for dictionary")
            return 2
        ref = TableRef.parse(ns.table)
        out = json.dumps(
            render_data_dictionary(dataset=ns.dataset, table=ref.table), indent=2
        )
    elif ns.format == "markdown":
        out = render_markdown()
    elif ns.format == "dbt":
        out = render_dbt_yaml()
    else:
        raise AssertionError(f"unhandled format {ns.format}")

    if ns.output == "-":
        # Re-wrap the stdout descriptor to force UTF-8 and disable newline translation.
        # closefd=False leaves the descriptor open when the wrapper is closed, so later
        # writes to sys.stdout keep working. Flush first so text already buffered on
        # sys.stdout stays ahead of the rendered output.
        sys.stdout.flush()
        with open(
            sys.stdout.fileno(), mode="w", encoding="utf-8", newline="", closefd=False
        ) as f:
            f.write(out)
    else:
        with open(ns.output, "w", encoding="utf-8") as f:
            f.write(out)
    return 0
146
+
147
+
148
def cmd_verify(ns: argparse.Namespace) -> int:
    """Run one verifier per requested metric and report pass/fail for each.

    ``player-season`` uses the batting verifier and batting metrics; any other
    aggregation uses the pitching verifier and pitching metrics. ``--metric all``
    expands to every metric of the selected family. Each result's summary and a
    PASS/FAIL verdict against ``--threshold`` are printed; when ``--output`` is not
    ``-`` the JSON form of all results is also written to that path.

    Returns:
        0 when every metric passes, 1 when at least one fails, and 2 when the
        requested metric does not belong to the selected aggregation's family.
    """
    batting = ns.aggregation == "player-season"
    family = ALL_BATTING_METRICS if batting else ALL_PITCHING_METRICS
    if ns.metric == "all":
        metrics = [*family]
    elif ns.metric in family:
        metrics = [ns.metric]
    else:
        # argparse accepts the union of both families because the aggregation is
        # not known at parse time; reject a cross-family metric before any query runs.
        log.error("metric %r is not available for --aggregation %s", ns.metric, ns.aggregation)
        return 2

    client = bigquery.Client()
    verifier_cls = BaseballSavantBattingVerifier if batting else BaseballSavantPitchingVerifier

    overall_pass = True
    all_results: list[dict] = []
    for m in metrics:
        v = verifier_cls(
            client=client, table=ns.table, season=ns.season, metric=m,
            min_sample_size=ns.min_sample_size, tolerance=ns.tolerance,
        )
        result = v.run()
        print(result.summary())
        passed = result.passed(ns.threshold)
        verdict = "PASS" if passed else "FAIL"
        print(f"{verdict} (threshold {ns.threshold:.2%})\n")
        if not passed:
            overall_pass = False
        all_results.append(result.to_json())

    if ns.output != "-":
        with open(ns.output, "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2)

    return 0 if overall_pass else 1
179
+
180
+
181
+ def main(argv: list[str] | None = None) -> int:
182
+ parser = build_parser()
183
+ ns = parser.parse_args(argv)
184
+ if ns.command == "sync":
185
+ return cmd_sync(ns)
186
+ if ns.command == "docs":
187
+ return cmd_docs(ns)
188
+ if ns.command == "verify":
189
+ return cmd_verify(ns)
190
+ raise AssertionError(f"unhandled command {ns.command}")
191
+
192
+
193
+ if __name__ == "__main__":
194
+ sys.exit(main())