statcast-bigquery 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statcast_bigquery-0.1.0/.github/workflows/release.yml +41 -0
- statcast_bigquery-0.1.0/.github/workflows/test.yml +30 -0
- statcast_bigquery-0.1.0/.gitignore +19 -0
- statcast_bigquery-0.1.0/.python-version +1 -0
- statcast_bigquery-0.1.0/CHANGELOG.md +11 -0
- statcast_bigquery-0.1.0/CONTRIBUTING.md +15 -0
- statcast_bigquery-0.1.0/LICENSE +24 -0
- statcast_bigquery-0.1.0/PKG-INFO +61 -0
- statcast_bigquery-0.1.0/README.md +28 -0
- statcast_bigquery-0.1.0/pyproject.toml +64 -0
- statcast_bigquery-0.1.0/scripts/_pybaseball_columns_2024-04-01.txt +124 -0
- statcast_bigquery-0.1.0/scripts/discover_pybaseball_columns.py +22 -0
- statcast_bigquery-0.1.0/statcast_bigquery/__init__.py +8 -0
- statcast_bigquery-0.1.0/statcast_bigquery/_version.py +1 -0
- statcast_bigquery-0.1.0/statcast_bigquery/cli.py +194 -0
- statcast_bigquery-0.1.0/statcast_bigquery/client.py +62 -0
- statcast_bigquery-0.1.0/statcast_bigquery/docs/__init__.py +28 -0
- statcast_bigquery-0.1.0/statcast_bigquery/docs/example_queries.py +483 -0
- statcast_bigquery-0.1.0/statcast_bigquery/docs/pitfalls.py +116 -0
- statcast_bigquery-0.1.0/statcast_bigquery/docs/renderers.py +155 -0
- statcast_bigquery-0.1.0/statcast_bigquery/docs/statsapi_map.py +90 -0
- statcast_bigquery-0.1.0/statcast_bigquery/docs/taxonomy.py +30 -0
- statcast_bigquery-0.1.0/statcast_bigquery/schema.py +2473 -0
- statcast_bigquery-0.1.0/statcast_bigquery/verify/__init__.py +23 -0
- statcast_bigquery-0.1.0/statcast_bigquery/verify/base.py +76 -0
- statcast_bigquery-0.1.0/statcast_bigquery/verify/compare.py +32 -0
- statcast_bigquery-0.1.0/statcast_bigquery/verify/savant.py +263 -0
- statcast_bigquery-0.1.0/statcast_bigquery/writer.py +112 -0
- statcast_bigquery-0.1.0/tests/__init__.py +0 -0
- statcast_bigquery-0.1.0/tests/fixtures/__init__.py +1 -0
- statcast_bigquery-0.1.0/tests/fixtures/make_savant_fixture.py +31 -0
- statcast_bigquery-0.1.0/tests/fixtures/make_statcast_fixture.py +22 -0
- statcast_bigquery-0.1.0/tests/fixtures/savant_batter_2024.parquet +0 -0
- statcast_bigquery-0.1.0/tests/fixtures/savant_leaderboard_2024.parquet +0 -0
- statcast_bigquery-0.1.0/tests/fixtures/savant_pitcher_2024.parquet +0 -0
- statcast_bigquery-0.1.0/tests/fixtures/statcast_sample_2024-04-01.parquet +0 -0
- statcast_bigquery-0.1.0/tests/test_cli.py +56 -0
- statcast_bigquery-0.1.0/tests/test_client.py +56 -0
- statcast_bigquery-0.1.0/tests/test_example_queries.py +55 -0
- statcast_bigquery-0.1.0/tests/test_idempotency.py +40 -0
- statcast_bigquery-0.1.0/tests/test_pitfalls.py +27 -0
- statcast_bigquery-0.1.0/tests/test_renderers.py +70 -0
- statcast_bigquery-0.1.0/tests/test_schema.py +190 -0
- statcast_bigquery-0.1.0/tests/test_taxonomy.py +40 -0
- statcast_bigquery-0.1.0/tests/test_verify_compare.py +73 -0
- statcast_bigquery-0.1.0/tests/test_verify_savant.py +126 -0
- statcast_bigquery-0.1.0/tests/test_writer.py +80 -0
- statcast_bigquery-0.1.0/uv.lock +1811 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
outputs:
|
|
12
|
+
dist-path: ${{ steps.build.outputs.dist-path }}
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: astral-sh/setup-uv@v3
|
|
16
|
+
- run: uv python install 3.13
|
|
17
|
+
- name: Create venv
|
|
18
|
+
run: uv venv
|
|
19
|
+
- name: Install build tools
|
|
20
|
+
run: uv pip install build
|
|
21
|
+
- id: build
|
|
22
|
+
run: |
|
|
23
|
+
uv run python -m build
|
|
24
|
+
echo "dist-path=dist" >> $GITHUB_OUTPUT
|
|
25
|
+
- uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: dist
|
|
28
|
+
path: dist/
|
|
29
|
+
|
|
30
|
+
publish:
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
environment: pypi
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/download-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
name: dist
|
|
40
|
+
path: dist/
|
|
41
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: astral-sh/setup-uv@v3
|
|
17
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
18
|
+
run: uv python install ${{ matrix.python-version }}
|
|
19
|
+
- name: Create venv
|
|
20
|
+
run: uv venv
|
|
21
|
+
- name: Install
|
|
22
|
+
run: uv pip install -e ".[dev]"
|
|
23
|
+
- name: Lint
|
|
24
|
+
run: uv run ruff check .
|
|
25
|
+
- name: Type check
|
|
26
|
+
run: uv run pyright
|
|
27
|
+
- name: Test
|
|
28
|
+
run: uv run pytest -v --cov=statcast_bigquery --cov-report=xml
|
|
29
|
+
- name: Build
|
|
30
|
+
run: uv run python -m build
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2026-05-?? (planned)
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Initial release.
|
|
7
|
+
- `statcast-bigquery sync` — idempotent Statcast pitch-level ingestion to BigQuery.
|
|
8
|
+
- `statcast-bigquery docs` — 5 documentation renderers (BQ-native, LLM, dictionary, markdown, dbt).
|
|
9
|
+
- `statcast-bigquery verify` — Baseball Savant leaderboard verification (8 metrics).
|
|
10
|
+
- 25 vetted example queries; a catalog of 10+ pitfalls; statsapi cross-reference.
|
|
11
|
+
- Schema spans pybaseball Statcast columns (~118 fields); auto-applied BQ-native column descriptions.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Bug reports + small PRs welcome. Please open an issue before large changes.
|
|
4
|
+
|
|
5
|
+
This is a hobbyist project; review cadence is best-effort.
|
|
6
|
+
|
|
7
|
+
## Dev setup
|
|
8
|
+
|
|
9
|
+
uv venv
|
|
10
|
+
uv pip install -e ".[dev]"
|
|
11
|
+
pytest
|
|
12
|
+
|
|
13
|
+
## Style
|
|
14
|
+
|
|
15
|
+
Ruff + pyright. Run `ruff check . && pyright` before opening a PR.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jason Blahovec
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
This software does not include or distribute MLB data. MLB data accessed via
|
|
24
|
+
this software is governed by its source's terms (Baseball Savant / MLBAM).
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: statcast-bigquery
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Statcast → BigQuery: idempotent ingestion + LLM-friendly docs + Baseball Savant verification
|
|
5
|
+
Project-URL: Homepage, https://github.com/blahovec-labs/statcast-bigquery
|
|
6
|
+
Project-URL: Issues, https://github.com/blahovec-labs/statcast-bigquery/issues
|
|
7
|
+
Project-URL: Changelog, https://github.com/blahovec-labs/statcast-bigquery/blob/main/CHANGELOG.md
|
|
8
|
+
Author: Jason Blahovec
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: baseball,bigquery,data-engineering,mlb,statcast
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Database
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: google-cloud-bigquery<4.0,>=3.20
|
|
22
|
+
Requires-Dist: pandas<3.0,>=2.0
|
|
23
|
+
Requires-Dist: pyarrow<19.0,>=15.0
|
|
24
|
+
Requires-Dist: pybaseball<3.0,>=2.2.7
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: build>=1.2.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: duckdb<2.0,>=1.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pyright>=1.1.380; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.6.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# statcast-bigquery
|
|
35
|
+
|
|
36
|
+
Idempotent Statcast → BigQuery ingestion, with first-class documentation for SQL/LLM agents and round-trip validation against Baseball Savant.
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
pip install statcast-bigquery
|
|
41
|
+
|
|
42
|
+
## Quickstart
|
|
43
|
+
|
|
44
|
+
gcloud auth application-default login
|
|
45
|
+
statcast-bigquery sync \
|
|
46
|
+
--start 2024-04-01 --end 2024-10-31 \
|
|
47
|
+
--table myproject.mydataset.statcast_pitches
|
|
48
|
+
|
|
49
|
+
## Documentation
|
|
50
|
+
|
|
51
|
+
statcast-bigquery docs --format llm > STATCAST_FOR_LLMS.md
|
|
52
|
+
|
|
53
|
+
## Verification
|
|
54
|
+
|
|
55
|
+
statcast-bigquery verify \
|
|
56
|
+
--source baseball-savant \
|
|
57
|
+
--aggregation player-season \
|
|
58
|
+
--metric all --season 2024 \
|
|
59
|
+
--table myproject.mydataset.statcast_pitches
|
|
60
|
+
|
|
61
|
+
MIT licensed. This software does not include or distribute MLB data.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# statcast-bigquery
|
|
2
|
+
|
|
3
|
+
Idempotent Statcast → BigQuery ingestion, with first-class documentation for SQL/LLM agents and round-trip validation against Baseball Savant.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
pip install statcast-bigquery
|
|
8
|
+
|
|
9
|
+
## Quickstart
|
|
10
|
+
|
|
11
|
+
gcloud auth application-default login
|
|
12
|
+
statcast-bigquery sync \
|
|
13
|
+
--start 2024-04-01 --end 2024-10-31 \
|
|
14
|
+
--table myproject.mydataset.statcast_pitches
|
|
15
|
+
|
|
16
|
+
## Documentation
|
|
17
|
+
|
|
18
|
+
statcast-bigquery docs --format llm > STATCAST_FOR_LLMS.md
|
|
19
|
+
|
|
20
|
+
## Verification
|
|
21
|
+
|
|
22
|
+
statcast-bigquery verify \
|
|
23
|
+
--source baseball-savant \
|
|
24
|
+
--aggregation player-season \
|
|
25
|
+
--metric all --season 2024 \
|
|
26
|
+
--table myproject.mydataset.statcast_pitches
|
|
27
|
+
|
|
28
|
+
MIT licensed. This software does not include or distribute MLB data.
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "statcast-bigquery"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Statcast → BigQuery: idempotent ingestion + LLM-friendly docs + Baseball Savant verification"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [{ name = "Jason Blahovec" }]
|
|
8
|
+
requires-python = ">=3.11"
|
|
9
|
+
keywords = ["mlb", "baseball", "statcast", "bigquery", "data-engineering"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 4 - Beta",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
"Programming Language :: Python :: 3.13",
|
|
17
|
+
"Topic :: Database",
|
|
18
|
+
"Topic :: Scientific/Engineering",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"pybaseball>=2.2.7,<3.0",
|
|
22
|
+
"pandas>=2.0,<3.0",
|
|
23
|
+
"google-cloud-bigquery>=3.20,<4.0",
|
|
24
|
+
"pyarrow>=15.0,<19.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
dev = [
|
|
29
|
+
"pytest>=8.0",
|
|
30
|
+
"pytest-cov>=5.0",
|
|
31
|
+
"ruff>=0.6.0",
|
|
32
|
+
"pyright>=1.1.380",
|
|
33
|
+
"build>=1.2.0",
|
|
34
|
+
"duckdb>=1.0,<2.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
statcast-bigquery = "statcast_bigquery.cli:main"
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/blahovec-labs/statcast-bigquery"
|
|
42
|
+
Issues = "https://github.com/blahovec-labs/statcast-bigquery/issues"
|
|
43
|
+
Changelog = "https://github.com/blahovec-labs/statcast-bigquery/blob/main/CHANGELOG.md"
|
|
44
|
+
|
|
45
|
+
[build-system]
|
|
46
|
+
requires = ["hatchling"]
|
|
47
|
+
build-backend = "hatchling.build"
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["statcast_bigquery"]
|
|
51
|
+
|
|
52
|
+
[tool.ruff]
line-length = 100
# Keep this at the oldest interpreter allowed by `requires-python` (>=3.11).
# A newer target lets the pyupgrade (UP) rules rewrite code into syntax that
# the 3.11 and 3.12 entries of the CI test matrix cannot parse.
target-version = "py311"
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint]
|
|
57
|
+
select = ["E", "F", "I", "N", "UP", "W"]
|
|
58
|
+
|
|
59
|
+
[tool.pyright]
|
|
60
|
+
pythonVersion = "3.13"
|
|
61
|
+
typeCheckingMode = "standard"
|
|
62
|
+
|
|
63
|
+
[tool.pytest.ini_options]
|
|
64
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
This is a large query, it may take a moment to complete
|
|
2
|
+
|
|
3
|
0%| | 0/1 [00:00<?, ?it/s]C:\Users\jason\Projects\statcast-bigquery\.venv\Lib\site-packages\pybaseball\datahelpers\postprocessing.py:59: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead
|
|
4
|
+
data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
|
|
5
|
+
|
|
6
|
+
Rows: 4190
|
|
7
|
+
Columns: 118
|
|
8
|
+
'pitch_type' object
|
|
9
|
+
'game_date' datetime64[ns]
|
|
10
|
+
'release_speed' Float64
|
|
11
|
+
'release_pos_x' Float64
|
|
12
|
+
'release_pos_z' Float64
|
|
13
|
+
'player_name' object
|
|
14
|
+
'batter' Int64
|
|
15
|
+
'pitcher' Int64
|
|
16
|
+
'events' object
|
|
17
|
+
'description' object
|
|
18
|
+
'spin_dir' Int64
|
|
19
|
+
'spin_rate_deprecated' Int64
|
|
20
|
+
'break_angle_deprecated' Int64
|
|
21
|
+
'break_length_deprecated' Int64
|
|
22
|
+
'zone' Int64
|
|
23
|
+
'des' object
|
|
24
|
+
'game_type' object
|
|
25
|
+
'stand' object
|
|
26
|
+
'p_throws' object
|
|
27
|
+
'home_team' object
|
|
28
|
+
'away_team' object
|
|
29
|
+
'type' object
|
|
30
|
+
'hit_location' Int64
|
|
31
|
+
'bb_type' object
|
|
32
|
+
'balls' Int64
|
|
33
|
+
'strikes' Int64
|
|
34
|
+
'game_year' Int64
|
|
35
|
+
'pfx_x' Float64
|
|
36
|
+
'pfx_z' Float64
|
|
37
|
+
'plate_x' Float64
|
|
38
|
+
'plate_z' Float64
|
|
39
|
+
'on_3b' Int64
|
|
40
|
+
'on_2b' Int64
|
|
41
|
+
'on_1b' Int64
|
|
42
|
+
'outs_when_up' Int64
|
|
43
|
+
'inning' Int64
|
|
44
|
+
'inning_topbot' object
|
|
45
|
+
'hc_x' Float64
|
|
46
|
+
'hc_y' Float64
|
|
47
|
+
'tfs_deprecated' Int64
|
|
48
|
+
'tfs_zulu_deprecated' Int64
|
|
49
|
+
'umpire' Int64
|
|
50
|
+
'sv_id' Int64
|
|
51
|
+
'vx0' Float64
|
|
52
|
+
'vy0' Float64
|
|
53
|
+
'vz0' Float64
|
|
54
|
+
'ax' Float64
|
|
55
|
+
'ay' Float64
|
|
56
|
+
'az' Float64
|
|
57
|
+
'sz_top' Float64
|
|
58
|
+
'sz_bot' Float64
|
|
59
|
+
'hit_distance_sc' Int64
|
|
60
|
+
'launch_speed' Float64
|
|
61
|
+
'launch_angle' Int64
|
|
62
|
+
'effective_speed' Float64
|
|
63
|
+
'release_spin_rate' Int64
|
|
64
|
+
'release_extension' Float64
|
|
65
|
+
'game_pk' Int64
|
|
66
|
+
'fielder_2' Int64
|
|
67
|
+
'fielder_3' Int64
|
|
68
|
+
'fielder_4' Int64
|
|
69
|
+
'fielder_5' Int64
|
|
70
|
+
'fielder_6' Int64
|
|
71
|
+
'fielder_7' Int64
|
|
72
|
+
'fielder_8' Int64
|
|
73
|
+
'fielder_9' Int64
|
|
74
|
+
'release_pos_y' Float64
|
|
75
|
+
'estimated_ba_using_speedangle' Float64
|
|
76
|
+
'estimated_woba_using_speedangle' Float64
|
|
77
|
+
'woba_value' Float64
|
|
78
|
+
'woba_denom' Int64
|
|
79
|
+
'babip_value' Int64
|
|
80
|
+
'iso_value' Int64
|
|
81
|
+
'launch_speed_angle' Int64
|
|
82
|
+
'at_bat_number' Int64
|
|
83
|
+
'pitch_number' Int64
|
|
84
|
+
'pitch_name' object
|
|
85
|
+
'home_score' Int64
|
|
86
|
+
'away_score' Int64
|
|
87
|
+
'bat_score' Int64
|
|
88
|
+
'fld_score' Int64
|
|
89
|
+
'post_away_score' Int64
|
|
90
|
+
'post_home_score' Int64
|
|
91
|
+
'post_bat_score' Int64
|
|
92
|
+
'post_fld_score' Int64
|
|
93
|
+
'if_fielding_alignment' object
|
|
94
|
+
'of_fielding_alignment' object
|
|
95
|
+
'spin_axis' Int64
|
|
96
|
+
'delta_home_win_exp' Float64
|
|
97
|
+
'delta_run_exp' Float64
|
|
98
|
+
'bat_speed' Int64
|
|
99
|
+
'swing_length' Int64
|
|
100
|
+
'estimated_slg_using_speedangle' Float64
|
|
101
|
+
'delta_pitcher_run_exp' Float64
|
|
102
|
+
'hyper_speed' Float64
|
|
103
|
+
'home_score_diff' Int64
|
|
104
|
+
'bat_score_diff' Int64
|
|
105
|
+
'home_win_exp' Float64
|
|
106
|
+
'bat_win_exp' Float64
|
|
107
|
+
'age_pit_legacy' Int64
|
|
108
|
+
'age_bat_legacy' Int64
|
|
109
|
+
'age_pit' Int64
|
|
110
|
+
'age_bat' Int64
|
|
111
|
+
'n_thruorder_pitcher' Int64
|
|
112
|
+
'n_priorpa_thisgame_player_at_bat' Int64
|
|
113
|
+
'pitcher_days_since_prev_game' Int64
|
|
114
|
+
'batter_days_since_prev_game' Int64
|
|
115
|
+
'pitcher_days_until_next_game' Int64
|
|
116
|
+
'batter_days_until_next_game' Int64
|
|
117
|
+
'api_break_z_with_gravity' Float64
|
|
118
|
+
'api_break_x_arm' Float64
|
|
119
|
+
'api_break_x_batter_in' Float64
|
|
120
|
+
'arm_angle' Float64
|
|
121
|
+
'attack_angle' Int64
|
|
122
|
+
'attack_direction' Int64
|
|
123
|
+
'swing_path_tilt' Int64
|
|
124
|
+
'intercept_ball_minus_batter_pos_x_inches' Int64
|
|
125
|
+
'intercept_ball_minus_batter_pos_y_inches' Int64
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Run pybaseball.statcast for one day; print column names + dtypes.
|
|
2
|
+
|
|
3
|
+
Use this once at the start of Task 3 to ensure PITCHES_SCHEMA covers every column.
|
|
4
|
+
Output is informational only; not committed as a test.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import pybaseball as pb
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main() -> None:
    """Fetch Statcast data for 2024-04-01 and print its shape and column dtypes.

    Prints the row count, the column count, and then one line per column
    containing the column name (repr, left-aligned to 35 characters) and dtype.
    """
    frame = pb.statcast(start_dt="2024-04-01", end_dt="2024-04-01")
    column_names = frame.columns
    print(f"Rows: {len(frame)}")
    print(f"Columns: {len(column_names)}")
    for name in column_names:
        print(f" {name!r:<35} {frame[name].dtype}")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
if __name__ == "__main__":
|
|
22
|
+
main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""CLI entrypoint: statcast-bigquery {sync,docs,verify}."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import sys
|
|
9
|
+
from datetime import date, datetime, timedelta
|
|
10
|
+
|
|
11
|
+
from google.cloud import bigquery
|
|
12
|
+
|
|
13
|
+
from statcast_bigquery._version import __version__
|
|
14
|
+
from statcast_bigquery.client import StatcastClient
|
|
15
|
+
from statcast_bigquery.docs.renderers import (
|
|
16
|
+
render_bq_descriptions,
|
|
17
|
+
render_data_dictionary,
|
|
18
|
+
render_dbt_yaml,
|
|
19
|
+
render_llm_context,
|
|
20
|
+
render_markdown,
|
|
21
|
+
)
|
|
22
|
+
from statcast_bigquery.verify.savant import (
|
|
23
|
+
BATTING_METRIC_TO_SAVANT_FIELD,
|
|
24
|
+
PITCHING_METRIC_TO_SAVANT_FIELD,
|
|
25
|
+
BaseballSavantBattingVerifier,
|
|
26
|
+
BaseballSavantPitchingVerifier,
|
|
27
|
+
)
|
|
28
|
+
from statcast_bigquery.writer import BigQueryWriter, TableRef
|
|
29
|
+
|
|
30
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
|
|
31
|
+
log = logging.getLogger("statcast-bigquery")
|
|
32
|
+
|
|
33
|
+
ALL_BATTING_METRICS = list(BATTING_METRIC_TO_SAVANT_FIELD)
|
|
34
|
+
ALL_PITCHING_METRICS = list(PITCHING_METRIC_TO_SAVANT_FIELD)
|
|
35
|
+
DOC_FORMATS = ["bq-apply", "llm", "dictionary", "markdown", "dbt"]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with its sync, docs and verify subcommands.

    Returns:
        An ``argparse.ArgumentParser`` whose parsed namespace carries the chosen
        subcommand name in ``command`` (a subcommand is mandatory).
    """
    root = argparse.ArgumentParser(prog="statcast-bigquery")
    root.add_argument("--version", action="version", version=__version__)
    commands = root.add_subparsers(dest="command", required=True)

    # Each subcommand is declared as data: (name, help, ((flag, add_argument kwargs), ...)).
    # Flags are registered in the listed order.
    layout = (
        (
            "sync",
            "Pull Statcast and write to BigQuery",
            (
                ("--start", {"required": True, "help": "YYYY-MM-DD start (inclusive)"}),
                ("--end", {"required": True, "help": "YYYY-MM-DD end (inclusive)"}),
                ("--table", {"required": True, "help": "project.dataset.table"}),
                ("--chunk-by", {"default": "year", "choices": ["year", "month", "range"]}),
                (
                    "--resume",
                    {
                        "action": "store_true",
                        "help": "Skip year-chunks already recorded in _statcast_ingest_runs",
                    },
                ),
                ("--dry-run", {"action": "store_true"}),
            ),
        ),
        (
            "docs",
            "Render documentation in various formats",
            (
                ("--format", {"required": True, "choices": DOC_FORMATS}),
                (
                    "--table",
                    {"help": "project.dataset.table (required for bq-apply, dictionary)"},
                ),
                ("--dataset", {"help": "for dictionary format"}),
                ("--output", {"default": "-", "help": "path or '-' for stdout (default)"}),
            ),
        ),
        (
            "verify",
            "Compare aggregations to external sources",
            (
                ("--source", {"default": "baseball-savant", "choices": ["baseball-savant"]}),
                (
                    "--aggregation",
                    {"required": True, "choices": ["player-season", "pitcher-season"]},
                ),
                (
                    "--metric",
                    {
                        "required": True,
                        "choices": [*ALL_BATTING_METRICS, *ALL_PITCHING_METRICS, "all"],
                    },
                ),
                ("--season", {"required": True, "type": int}),
                ("--table", {"required": True}),
                ("--tolerance", {"type": float, "default": None}),
                ("--min-sample-size", {"type": int, "default": 50}),
                ("--threshold", {"type": float, "default": 0.99}),
                ("--output", {"default": "-"}),
            ),
        ),
    )

    for name, summary, flags in layout:
        command = commands.add_parser(name, help=summary)
        for flag, options in flags:
            command.add_argument(flag, **options)

    return root
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _iter_year_chunks(start: str, end: str) -> list[tuple[str, str]]:
|
|
78
|
+
s = datetime.strptime(start, "%Y-%m-%d").date()
|
|
79
|
+
e = datetime.strptime(end, "%Y-%m-%d").date()
|
|
80
|
+
chunks: list[tuple[str, str]] = []
|
|
81
|
+
cur = s
|
|
82
|
+
while cur <= e:
|
|
83
|
+
year_end = date(cur.year, 12, 31)
|
|
84
|
+
last = min(year_end, e)
|
|
85
|
+
chunks.append((cur.isoformat(), last.isoformat()))
|
|
86
|
+
cur = last + timedelta(days=1)
|
|
87
|
+
return chunks
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _iter_month_chunks(start: str, end: str) -> list[tuple[str, str]]:
    """Split the inclusive date range ``start``..``end`` at calendar-month boundaries."""
    s = datetime.strptime(start, "%Y-%m-%d").date()
    e = datetime.strptime(end, "%Y-%m-%d").date()
    chunks: list[tuple[str, str]] = []
    cur = s
    while cur <= e:
        # Last day of cur's month = first day of the following month minus one day.
        if cur.month == 12:
            month_end = date(cur.year, 12, 31)
        else:
            month_end = date(cur.year, cur.month + 1, 1) - timedelta(days=1)
        last = min(month_end, e)
        chunks.append((cur.isoformat(), last.isoformat()))
        cur = last + timedelta(days=1)
    return chunks


def cmd_sync(ns: argparse.Namespace) -> int:
    """Run the ``sync`` subcommand: fetch Statcast data chunk by chunk and write it.

    Args:
        ns: Parsed arguments; reads ``table``, ``start``, ``end``, ``chunk_by``,
            ``dry_run`` and (if present) ``resume``.

    Returns:
        0 on completion.

    Raises:
        ValueError: If ``start``/``end`` are not ``YYYY-MM-DD`` when chunking by
            year or month.
    """
    # Validate the table name and the date range before touching any remote service.
    ref = TableRef.parse(ns.table)

    if ns.chunk_by == "year":
        chunks = _iter_year_chunks(ns.start, ns.end)
    elif ns.chunk_by == "month":
        # "month" is an advertised --chunk-by choice, so split on month boundaries
        # instead of falling through to the single-range case.
        chunks = _iter_month_chunks(ns.start, ns.end)
    else:
        chunks = [(ns.start, ns.end)]

    if getattr(ns, "resume", False):
        # The flag is accepted by the parser but nothing here consults previous runs.
        log.warning("--resume is not implemented yet; every chunk will be processed")

    if ns.dry_run:
        # A dry run only reports the plan, so no BigQuery client is created.
        for cs, ce in chunks:
            log.info("chunk %s -> %s", cs, ce)
        return 0

    client = bigquery.Client()
    sc = StatcastClient()
    writer = BigQueryWriter(client=client)
    writer.create_table_if_missing(ref)

    for cs, ce in chunks:
        log.info("chunk %s -> %s", cs, ce)
        df = sc.fetch(cs, ce)
        writer.write(ref, df, cs, ce)
    return 0
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def cmd_docs(ns: argparse.Namespace) -> int:
    """Run the ``docs`` subcommand.

    For ``bq-apply`` the rendered schema is written onto the BigQuery table named
    by ``--table``. Every other format is rendered to text and written to
    ``--output`` (``-`` means stdout).

    Args:
        ns: Parsed arguments; reads ``format``, ``table``, ``dataset`` and ``output``.

    Returns:
        0 on success, 2 when an option required by the chosen format is missing.
    """
    if ns.format == "bq-apply":
        if not ns.table:
            log.error("--table required for bq-apply")
            return 2
        client = bigquery.Client()
        ref = TableRef.parse(ns.table)
        table = client.get_table(str(ref))
        table.schema = render_bq_descriptions()
        client.update_table(table, ["schema"])
        log.info("updated schema descriptions on %s", ref)
        return 0

    if ns.format == "llm":
        out = render_llm_context()
    elif ns.format == "dictionary":
        if not (ns.dataset and ns.table):
            log.error("--dataset and --table required for dictionary")
            return 2
        ref = TableRef.parse(ns.table)
        out = json.dumps(
            render_data_dictionary(dataset=ns.dataset, table=ref.table), indent=2
        )
    elif ns.format == "markdown":
        out = render_markdown()
    elif ns.format == "dbt":
        out = render_dbt_yaml()
    else:
        raise AssertionError(f"unhandled format {ns.format}")

    if ns.output == "-":
        # Wrap fd 1 to force UTF-8 and untranslated newlines. closefd=False keeps
        # the descriptor open when the wrapper closes, so stdout stays usable.
        with open(
            sys.stdout.fileno(),
            mode="w",
            encoding="utf-8",
            newline="",
            closefd=False,
        ) as f:
            f.write(out)
    else:
        with open(ns.output, "w", encoding="utf-8") as f:
            f.write(out)
    return 0
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def cmd_verify(ns: argparse.Namespace) -> int:
    """Run the ``verify`` subcommand for one metric or for all metrics of an aggregation.

    Prints each verifier's summary and a PASS/FAIL verdict. When ``--output`` is
    not ``-``, the JSON form of every result is also written to that path.

    Args:
        ns: Parsed arguments; reads ``aggregation``, ``metric``, ``table``,
            ``season``, ``min_sample_size``, ``tolerance``, ``threshold`` and
            ``output``.

    Returns:
        0 if every metric passed the threshold, 1 if any failed, 2 if the
        requested metric does not belong to the chosen aggregation.
    """
    batting = ns.aggregation == "player-season"
    valid_metrics = ALL_BATTING_METRICS if batting else ALL_PITCHING_METRICS

    # The parser accepts batting and pitching metric names for either aggregation,
    # so reject a mismatched pair here, before any BigQuery client is created.
    if ns.metric == "all":
        metrics = [*valid_metrics]
    elif ns.metric in valid_metrics:
        metrics = [ns.metric]
    else:
        log.error(
            "metric %r is not valid for --aggregation %s; choose one of: %s",
            ns.metric,
            ns.aggregation,
            ", ".join([*valid_metrics, "all"]),
        )
        return 2

    client = bigquery.Client()
    verifier_cls = BaseballSavantBattingVerifier if batting else BaseballSavantPitchingVerifier

    overall_pass = True
    all_results: list[dict] = []
    for m in metrics:
        v = verifier_cls(
            client=client, table=ns.table, season=ns.season, metric=m,
            min_sample_size=ns.min_sample_size, tolerance=ns.tolerance,
        )
        result = v.run()
        print(result.summary())
        # Evaluate the threshold once and reuse it for the verdict and the exit status.
        passed = result.passed(ns.threshold)
        verdict = "PASS" if passed else "FAIL"
        print(f"{verdict} (threshold {ns.threshold:.2%})\n")
        if not passed:
            overall_pass = False
        all_results.append(result.to_json())

    if ns.output != "-":
        with open(ns.output, "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2)

    return 0 if overall_pass else 1
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def main(argv: list[str] | None = None) -> int:
|
|
182
|
+
parser = build_parser()
|
|
183
|
+
ns = parser.parse_args(argv)
|
|
184
|
+
if ns.command == "sync":
|
|
185
|
+
return cmd_sync(ns)
|
|
186
|
+
if ns.command == "docs":
|
|
187
|
+
return cmd_docs(ns)
|
|
188
|
+
if ns.command == "verify":
|
|
189
|
+
return cmd_verify(ns)
|
|
190
|
+
raise AssertionError(f"unhandled command {ns.command}")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
if __name__ == "__main__":
|
|
194
|
+
sys.exit(main())
|