pybaseballstats 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.2
2
+ Name: pybaseballstats
3
+ Version: 0.0.1
4
+ Summary: A Python package for scraping baseball data.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: aiohttp>=3.11.11
8
+ Requires-Dist: asyncio>=3.4.3
9
+ Requires-Dist: bs4>=0.0.2
10
+ Requires-Dist: pandas>=2.2.3
11
+ Requires-Dist: polars>=1.20.0
12
+ Requires-Dist: pytest>=8.3.4
13
+ Requires-Dist: ruff>=0.9.3
14
+ Requires-Dist: setuptools>=75.8.0
15
+ Requires-Dist: tqdm>=4.67.1
16
+
17
+ # pybaseballstats
18
+
19
+ A Python package for scraping baseball statistics from the web. Inspired by the pybaseball package by James LeDoux. This package is a work in progress and is not yet ready for use.
20
+
21
+ ## Available Sources
22
+
23
+ 1. Baseball Savant
24
+
25
+ ## Usage
26
+
27
+ Usage documentation can be found at /usage_docs/
@@ -0,0 +1,11 @@
1
+ # pybaseballstats
2
+
3
+ A Python package for scraping baseball statistics from the web. Inspired by the pybaseball package by James LeDoux. This package is a work in progress and is not yet ready for use.
4
+
5
+ ## Available Sources
6
+
7
+ 1. Baseball Savant
8
+
9
+ ## Usage
10
+
11
+ Usage documentation can be found at /usage_docs/
@@ -0,0 +1,4 @@
1
+ from .fangraphs import ( # noqa: F401
2
+ show_fangraphs_batting_stat_types,
3
+ )
4
+ from .statcast import statcast_date_range, statcast_single_game # noqa: F401
@@ -0,0 +1,214 @@
1
+ from enum import Enum
2
+ from typing import List
3
+
4
+ import pandas as pd
5
+ import polars as pl
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+
9
+ url = "https://www.fangraphs.com/leaders/major-league?pos={pos}&stats=bat&lg={league}&qual={qual}&type={stat_type}&season={end_season}&season1={start_season}&ind=0&startdate={start_date}&enddate={end_date}&month=0&team=0&pagenum=1&pageitems=2000000000"
10
+
11
+ # pos options: c, 1b, 2b, 3b, ss, lf, cf, rf, dh, of, p, all
12
+ # qual options: y, n
13
+ # league options: "", "al", "nl"
14
+ # start date, end date are strings in the format "yyyy-mm-dd"
15
+ # stat options: 8 (dashboard), 0 (standard), 1 (advanced), 2 (batted ball), 3 (win probability), 6 (value), 23 (+stats), 24 (statcast), 48 (violations)
16
+
17
+
18
+ # Define the available stat types as an Enum
19
class FangraphsBattingStatType(Enum):
    """Batting leaderboard stat groups that can be requested from Fangraphs.

    Each member's value is the integer placed in the ``type`` query parameter
    of the leaderboard URL. Iterating the enum yields members in the order
    they are declared below.
    """

    DASHBOARD = 8  # dashboard
    STANDARD = 0  # standard
    ADVANCED = 1  # advanced
    BATTED_BALL = 2  # batted ball
    WIN_PROBABILITY = 3  # win probability
    VALUE = 6  # value
    PLUS_STATS = 23  # +stats
    STATCAST = 24  # statcast
    VIOLATIONS = 48  # violations
29
+
30
+
31
def _parse_fangraphs_cell(cell):
    """Convert one leaderboard ``<td>`` into a Python value.

    Link or span text is returned as-is; otherwise the text is stripped of
    whitespace and ``%`` and converted to float/int where possible.
    """
    link = cell.find("a")
    if link:
        return link.text
    span = cell.find("span")
    if span:
        return span.text
    text = cell.text.strip().replace("%", "")
    if text == "":
        return None
    try:
        return float(text) if "." in text else int(text)
    except ValueError:
        # Not numeric (e.g. a team abbreviation); keep the raw text.
        return text


def get_table_data(
    stat_type, pos, league, start_date, end_date, qual, start_season, end_season
):
    """Fetch one Fangraphs batting leaderboard page and parse it into a DataFrame.

    Args:
        stat_type: value for the ``type`` query parameter; either the integer
            itself or a ``FangraphsBattingStatType`` member.
        pos: position filter, e.g. "all", "c", "1b".
        league: league filter: "" (all), "al" or "nl".
        start_date: start date as "yyyy-mm-dd", or "" when querying by season.
        end_date: end date as "yyyy-mm-dd", or "" when querying by season.
        qual: "y" for qualified batters only, "n" to include unqualified.
        start_season: first season, or "" when querying by date.
        end_season: last season, or "" when querying by date.

    Returns:
        pl.DataFrame: one row per table row, one column per ``data-col-id``
        found in the table header (the "divider" column is skipped).

    Raises:
        requests.HTTPError: if Fangraphs answers with an error status.
        ValueError: if the leaderboard table is not present in the page.
    """
    if isinstance(stat_type, Enum):
        stat_type = stat_type.value

    # The caller's filters are forwarded to the URL; previously they were
    # ignored in favour of hard-coded values.
    response = requests.get(
        url.format(
            pos=pos,
            league=league,
            stat_type=stat_type,
            start_season=start_season,
            end_season=end_season,
            qual=qual,
            start_date=start_date,
            end_date=end_date,
        ),
        timeout=60,
    )
    response.raise_for_status()
    cont = response.content.decode("utf-8")

    soup = BeautifulSoup(cont, "html.parser")

    # The selector is tied to the page's current (generated) CSS class names.
    main_table = soup.select_one(
        "#content > div.leaders-major_leaders-major__table__hcmbm > div.fg-data-grid.table-type > div.table-wrapper-outer > div > div.table-scroll > table"
    )
    if main_table is None:
        raise ValueError(
            "Could not find the leaderboard table in the Fangraphs response."
        )

    thead = main_table.find("thead")
    tbody = main_table.find("tbody")
    if thead is None or tbody is None:
        raise ValueError("Fangraphs leaderboard table is missing its header or body.")

    # Column names come from the data-col-id attribute of each <th>.
    headers = [
        th["data-col-id"]
        for th in thead.find_all("th")
        if "data-col-id" in th.attrs and th["data-col-id"] != "divider"
    ]

    data = []
    for row in tbody.find_all("tr"):
        # Start every row with all known columns so missing cells become None.
        row_data = {header: None for header in headers}
        for cell in row.find_all("td"):
            col_id = cell.get("data-col-id")
            if col_id and col_id != "divider":
                row_data[col_id] = _parse_fangraphs_cell(cell)
        data.append(row_data)

    return pl.DataFrame(data)
98
+
99
+
100
def show_fangraphs_batting_stat_types():
    """Print every FangraphsBattingStatType member, one per line."""
    print(*FangraphsBattingStatType, sep="\n")
103
+
104
+
105
def show_batting_pos_options():
    """Print the accepted values for the ``pos`` argument as one comma-separated line."""
    options = "c,1b,2b,3b,ss,lf,cf,rf,dh,of,p,all"
    print(options)
107
+
108
+
109
def fangraphs_batting_date_range(
    start_date: str,
    end_date: str,
    stat_types: List[FangraphsBattingStatType] = None,
    return_pandas: bool = False,
    pos: str = "all",
    league: str = "",
    qual: str = "y",
) -> pl.DataFrame | pd.DataFrame | None:
    """Pulls Fangraphs batting data for a date range.

    One leaderboard request is made per stat type. The resulting frames are
    stacked with ``how="diagonal"``, so columns absent from a given stat type
    are filled with nulls, and rows without a ``Name`` are dropped.

    Args:
        start_date (str): format "yyyy-mm-dd", ex) "2021-04-01"
        end_date (str): format "yyyy-mm-dd", ex) "2021-04-01"
        stat_types (List[FangraphsBattingStatType], optional): which Fangraphs stat types to include; call show_fangraphs_batting_stat_types() to list them. Defaults to None, meaning all stat types will be returned.
        return_pandas (bool, optional): whether to return a Polars DataFrame (False) or a Pandas DataFrame (True). Defaults to False.
        pos (str, optional): what positions to return data for; call show_batting_pos_options() to list them. Defaults to "all".
        league (str, optional): what league to return data for, options are "" (all), "nl", "al". Defaults to "".
        qual (str, optional): whether or not to restrict to qualified batters; pass "n" to include unqualified batters. Defaults to "y".

    Returns:
        pl.DataFrame | pd.DataFrame | None: the requested data, or None when
        an empty ``stat_types`` list is given or no data could be fetched.
    """
    # Normalise to a list so both None (all types) and any iterable work.
    stat_types = (
        list(FangraphsBattingStatType) if stat_types is None else list(stat_types)
    )
    if len(stat_types) == 0:
        print(
            "Warning: No stat types provided, returning None, to return all stattypes, pass in None."
        )
        return None

    df_list = []
    for stat_type in stat_types:
        print(f"Fetching data for {stat_type}...")
        df = get_table_data(
            # Pass the enum's integer value; indexing stat_types by that
            # value (as before) raised or selected the wrong stat type.
            stat_type=stat_type.value,
            pos=pos,
            league=league,
            start_date=start_date,
            end_date=end_date,
            qual=qual,
            start_season="",
            end_season="",
        )
        if df is not None:
            print(f"Data fetched for {stat_type}")
            df_list.append(df)
        else:
            print(f"Warning: No data returned for {stat_type}")

    if not df_list:
        # pl.concat raises on an empty list.
        return None

    df = pl.concat(df_list, how="diagonal")
    # Keep every column and only drop rows lacking a player name; selecting
    # just "Name" (as before) threw away all of the statistics.
    df = df.filter(pl.col("Name").is_not_null())
    return df.to_pandas() if return_pandas else df
160
+
161
+
162
def fangraphs_batting_season_range(
    start_season,
    end_season,
    stat_types: List[FangraphsBattingStatType] = None,
    return_pandas: bool = False,
    pos: str = "all",
    league: str = "",
    qual: str = "y",
) -> pl.DataFrame | pd.DataFrame | None:
    """Pulls Fangraphs batting data for a range of seasons.

    One leaderboard request is made per stat type. The resulting frames are
    stacked with ``how="diagonal"``, so columns absent from a given stat type
    are filled with nulls, and rows without a ``Name`` are dropped.

    Args:
        start_season: first season to include (placed in the ``season1`` query parameter), ex) 2021
        end_season: last season to include (placed in the ``season`` query parameter), ex) 2023
        stat_types (List[FangraphsBattingStatType], optional): which Fangraphs stat types to include; call show_fangraphs_batting_stat_types() to list them. Defaults to None, meaning all stat types will be returned.
        return_pandas (bool, optional): whether to return a Polars DataFrame (False) or a Pandas DataFrame (True). Defaults to False.
        pos (str, optional): what positions to return data for; call show_batting_pos_options() to list them. Defaults to "all".
        league (str, optional): what league to return data for, options are "" (all), "nl", "al". Defaults to "".
        qual (str, optional): whether or not to restrict to qualified batters; pass "n" to include unqualified batters. Defaults to "y".

    Returns:
        pl.DataFrame | pd.DataFrame | None: the requested data, or None when
        an empty ``stat_types`` list is given or no data could be fetched.
    """
    # Normalise to a list so both None (all types) and any iterable work.
    stat_types = (
        list(FangraphsBattingStatType) if stat_types is None else list(stat_types)
    )
    if len(stat_types) == 0:
        print(
            "Warning: No stat types provided, returning None, to return all stattypes, pass in None."
        )
        return None

    df_list = []
    for stat_type in stat_types:
        print(f"Fetching data for {stat_type}...")
        df = get_table_data(
            # Pass the enum's integer value; indexing stat_types by that
            # value (as before) raised or selected the wrong stat type.
            stat_type=stat_type.value,
            pos=pos,
            league=league,
            start_date="",
            end_date="",
            qual=qual,
            start_season=start_season,
            end_season=end_season,
        )
        if df is not None:
            print(f"Data fetched for {stat_type}")
            df_list.append(df)
        else:
            print(f"Warning: No data returned for {stat_type}")

    if not df_list:
        # pl.concat raises on an empty list.
        return None

    df = pl.concat(df_list, how="diagonal")
    # Keep every column and only drop rows lacking a player name; selecting
    # just "Name" (as before) threw away all of the statistics.
    df = df.filter(pl.col("Name").is_not_null())
    return df.to_pandas() if return_pandas else df
199
+
200
+
201
def fangraphs_pitching_date_range():
    """Placeholder for the pitching date-range query; only prints a notice."""
    notice = "Not implemented yet."
    print(notice)
203
+
204
+
205
def fangraphs_pitching_season_range():
    """Placeholder for the pitching season-range query; only prints a notice."""
    notice = "Not implemented yet."
    print(notice)
207
+
208
+
209
def fangraphs_fielding_date_range():
    """Placeholder for the fielding date-range query; only prints a notice."""
    notice = "Not implemented yet."
    print(notice)
211
+
212
+
213
def fangraphs_fielding_season_range():
    """Placeholder for the fielding season-range query; only prints a notice."""
    notice = "Not implemented yet."
    print(notice)
@@ -0,0 +1,76 @@
1
+ import asyncio
2
+ import io
3
+ import logging as logger
4
+
5
+ import pandas as pd
6
+ import polars as pl
7
+ import requests
8
+
9
+ from .statcast_utils import (
10
+ ROOT_URL,
11
+ SINGLE_GAME,
12
+ _add_extra_stats,
13
+ _statcast_date_range_helper,
14
+ )
15
+
16
+
17
def statcast_single_game(
    game_pk: int, extra_stats: bool, return_pandas: bool = False
) -> pl.LazyFrame | pd.DataFrame:
    """Download the Statcast CSV for one game and load it as a frame.

    Args:
        game_pk (int): identifier of the game, substituted into the single-game URL
        extra_stats (bool): when True, the frame is passed through _add_extra_stats before being returned
        return_pandas (bool, optional): return a Pandas DataFrame instead of a Polars LazyFrame. Defaults to False.

    Returns:
        pl.LazyFrame | pd.DataFrame: the game's Statcast data; an empty frame
        of the requested kind if the download raised an exception.
    """
    try:
        response = requests.get(
            ROOT_URL + SINGLE_GAME.format(game_pk=game_pk), timeout=None
        )
        raw_csv = response.content
    except Exception as e:
        # Best effort: log the failure and hand back an empty frame.
        logger.error(f"Failed to pull data for game_pk: {game_pk}. {str(e)}")
        if return_pandas:
            return pd.DataFrame()
        return pl.LazyFrame()

    csv_text = raw_csv.decode("utf-8")

    if extra_stats:
        game_frame = pl.scan_csv(io.StringIO(csv_text))
        first_date = game_frame.select(pl.col("game_date").min())
        last_date = game_frame.select(pl.col("game_date").max())
        return asyncio.run(
            _add_extra_stats(game_frame, first_date, last_date, return_pandas)
        )

    if return_pandas:
        return pd.read_csv(io.StringIO(csv_text))
    return pl.scan_csv(io.StringIO(csv_text))
48
+
49
+
50
def statcast_date_range(
    start_dt: str,
    end_dt: str,
    team: str = None,
    extra_stats: bool = False,
    return_pandas: bool = False,
) -> pl.LazyFrame | pd.DataFrame:
    """
    Synchronous entry point for pulling Statcast data over a date range.

    Runs the async helper ``_statcast_date_range_helper`` to completion with
    ``asyncio.run`` and returns its result.

    Args:
        start_dt: the start date in 'YYYY-MM-DD' format
        end_dt: the end date in 'YYYY-MM-DD' format
        team: the team abbreviation (e.g. 'WSH'). If None, data for all teams will be returned.
        extra_stats: whether to include extra stats
        return_pandas: whether to return a pandas DataFrame (default is False, returning a Polars LazyFrame)

    Returns:
        A DataFrame of statcast data for the date range.
    """
    helper_coro = _statcast_date_range_helper(
        start_dt, end_dt, team, extra_stats, return_pandas
    )
    return asyncio.run(helper_coro)
@@ -0,0 +1,178 @@
1
+ import asyncio
2
+ import datetime as dt
3
+ from typing import Iterator, Tuple
4
+
5
+ import aiohttp
6
+ import pandas as pd
7
+ import polars as pl
8
+ from tqdm import tqdm
9
+ from tqdm.asyncio import tqdm_asyncio
10
+
11
+ # https://github.com/jldbc/pybaseball/blob/master/pybaseball/statcast.py
12
+ # used for root_url, single_game, date_range
13
+ ROOT_URL = "https://baseballsavant.mlb.com"
14
+ SINGLE_GAME = "/statcast_search/csv?all=true&type=details&game_pk={game_pk}"
15
+ DATE_RANGE = "/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={start_dt}&game_date_lt={end_dt}&team={team}&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&"
16
+ # my own url
17
+ EXTRA_STATS = "/statcast_search/csv?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C2023%7C2022%7C2021%7C2020%7C2019%7C2018%7C2017%7C2016%7C2015%7C2014%7C2013%7C2012%7C2011%7C2010%7C2009%7C2008%7C&hfSit=&player_type={pos}&game_date_gt=&game_date_lt=&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=is%5C.%5C.remove%5C.%5C.bunts%7Cis%5C.%5C.competitive%7C&metric_1=&group_by=name&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&chk_stats_pa=on&chk_stats_abs=on&chk_stats_bip=on&chk_stats_hits=on&chk_stats_singles=on&chk_stats_dbls=on&chk_stats_triples=on&chk_stats_hrs=on&chk_stats_so=on&chk_stats_k_percent=on&chk_stats_bb=on&chk_stats_bb_percent=on&chk_stats_whiffs=on&chk_stats_swings=on&chk_stats_api_break_z_with_gravity=on&chk_stats_api_break_x_arm=on&chk_stats_api_break_z_induced=on&chk_stats_api_break_x_batter_in=on&chk_stats_ba=on&chk_stats_xba=on&chk_stats_xbadiff=on&chk_stats_obp=on&chk_stats_xobp=on&chk_stats_xobpdiff=on&chk_stats_slg=on&chk_stats_xslg=on&chk_stats_xslgdiff=on&chk_stats_woba=on&chk_stats_xwoba=on&chk_stats_wobadiff=on&chk_stats_barrels_total=on&chk_stats_babip=on&chk_stats_iso=on&chk_stats_run_exp=on&chk_stats_pitcher_run_exp=on&chk_stats_swing_miss_percent=on&chk_stats_batter_run_value_per_100=on&chk_stats_pitcher_run_value_per_100=on&chk_stats_velocity=on&chk_stats_effective_speed=on&chk_stats_spin_rate=on&chk_stats_release_pos_z=on&chk_stats_release_pos_x=on&chk_stats_release_extension=on&chk_stats_plate_x=on&chk_stats_plate_z=on&chk_stats_arm_angle=on&chk_stats_launch_speed=on&chk_stats_hyper_speed=on&chk_stats_sweetspot_speed_mph=on&chk_stats_launch_angle=on&chk_stats_bbdist=on&chk_stats_swing_length=on&chk_stats_hardhit_percent=on&chk_stats_barrels_per_bbe_percent=on&chk_stats_barrels_per_pa_percent=on&chk_stats_pos3_int_start_distance=on&chk_stat
s_pos4_int_start_distance=on&chk_stats_pos5_int_start_distance=on&chk_stats_pos6_int_start_distance=on&chk_stats_pos7_int_start_distance=on&chk_stats_pos8_int_start_distance=on&chk_stats_pos9_int_start_distance=on#results"
18
# Per-year (start, end) date window consulted by _create_date_ranges: dates
# before the start are moved up to it, and dates after the end jump to the
# following year's start. Years not listed here fall back to Mar 15 - Nov 15.
# NOTE(review): presumably these are the first and last game dates of each
# season - confirm against the source they were taken from.
YEAR_RANGES = {
    2022: (dt.date(2022, 3, 17), dt.date(2022, 11, 5)),
    2016: (dt.date(2016, 4, 3), dt.date(2016, 11, 2)),
    2019: (dt.date(2019, 3, 20), dt.date(2019, 10, 30)),
    2017: (dt.date(2017, 4, 2), dt.date(2017, 11, 1)),
    2023: (dt.date(2023, 3, 15), dt.date(2023, 11, 1)),
    2020: (dt.date(2020, 7, 23), dt.date(2020, 10, 27)),
    2018: (dt.date(2018, 3, 29), dt.date(2018, 10, 28)),
    2015: (dt.date(2015, 4, 5), dt.date(2015, 11, 1)),
    2024: (dt.date(2024, 3, 15), dt.date(2024, 10, 25)),
    2021: (dt.date(2021, 3, 15), dt.date(2021, 11, 2)),
}
30
+
31
+ STATCAST_DATE_FORMAT = "%Y-%m-%d"
32
+
33
+
34
async def _fetch_data(session, url, retries=2):
    """GET ``url`` through ``session`` and return the raw response body.

    A request that fails with ``aiohttp.ClientPayloadError`` is retried after
    a one-second pause until ``retries`` attempts have been made; the last
    failure is re-raised. Other exceptions propagate immediately. With
    ``retries`` of 0 no request is made and None is returned.
    """
    # `remaining` counts the attempts still available after the current one.
    for remaining in reversed(range(retries)):
        try:
            async with session.get(url) as response:
                return await response.read()
        except aiohttp.ClientPayloadError as e:
            if remaining == 0:
                print(f"Failed to fetch data from {url}.")
                raise e
            await asyncio.sleep(1)  # Wait before retrying
            print(f"Retrying... {remaining} attempts left.")
47
+
48
+
49
async def _fetch_all_data(urls):
    """Fetch every URL in ``urls`` over one shared session, showing a progress bar.

    Returns whatever ``tqdm_asyncio.gather`` yields for the per-URL
    ``_fetch_data`` coroutines.
    """
    async with aiohttp.ClientSession() as session:
        pending = []
        for target in urls:
            pending.append(_fetch_data(session, target))
        return await tqdm_asyncio.gather(*pending, desc="Fetching data")
53
+
54
+
55
async def _statcast_date_range_helper(
    start_dt: str,
    end_dt: str,
    team: str | None = None,
    extra_stats: bool = False,
    return_pandas: bool = False,
) -> pl.LazyFrame | pd.DataFrame:
    """
    Pulls statcast data for a date range, one request per day.

    Args:
        start_dt: the start date in 'YYYY-MM-DD' format
        end_dt: the end date in 'YYYY-MM-DD' format
        team: the team abbreviation (e.g. 'WSH'). If None, data for all teams will be returned.
        extra_stats: whether to join the extra per-player stats via _add_extra_stats
        return_pandas: whether to return a pandas DataFrame instead of a Polars LazyFrame

    Returns:
        A LazyFrame (or pandas DataFrame) of statcast data for the date range;
        an empty frame if the range contains no dates to fetch.

    Raises:
        ValueError: if either date is None, or start_dt is after end_dt.
    """
    if start_dt is None or end_dt is None:
        raise ValueError("Both start_dt and end_dt must be provided.")
    print(f"Pulling data for date range: {start_dt} to {end_dt}.")
    start_dt, end_dt = _handle_dates(start_dt, end_dt)
    date_ranges = list(_create_date_ranges(start_dt, end_dt, 1))

    if not date_ranges:
        # The whole range was skipped (e.g. offseason); pl.concat would raise
        # on an empty list, so return an empty frame of the requested kind.
        print("No dates to fetch in the requested range.")
        return pd.DataFrame() if return_pandas else pl.LazyFrame()

    urls = [
        ROOT_URL
        + DATE_RANGE.format(
            start_dt=str(start),
            end_dt=str(end),
            team=team if team else "",
        )
        for start, end in date_ranges
    ]

    data_list = []
    schema = None
    responses = await _fetch_all_data(urls)
    for data in tqdm(responses, desc="Processing regular data"):
        data = pl.scan_csv(data)
        # Cast every later frame to the first frame's schema so the
        # per-day frames can be concatenated.
        if schema is None:
            schema = data.collect_schema()
        else:
            data = data.cast(schema)
        data_list.append(data)

    print("Concatenating data.")
    df = pl.concat(data_list)
    print("Data concatenated.")

    if extra_stats:
        return await _add_extra_stats(df, start_dt, end_dt, return_pandas=return_pandas)

    print("Done")
    # A LazyFrame has no to_pandas(); it must be collected first.
    return df.collect().to_pandas() if return_pandas else df
108
+
109
+
110
async def _add_extra_stats(
    df: pl.LazyFrame, start_dt: dt.date, end_dt: dt.date, return_pandas: bool = False
) -> pl.LazyFrame | pd.DataFrame:
    """Left-join per-pitcher and per-batter summary stats onto ``df``.

    Two CSVs are downloaded from the EXTRA_STATS URL (player_type "pitcher"
    and "batter"). Each has ``player_name`` dropped and its remaining columns
    suffixed with ``_pitcher`` / ``_batter``, then is joined to ``df`` on the
    ``pitcher`` / ``batter`` id columns.

    Args:
        df: pitch-level statcast data containing ``pitcher`` and ``batter`` columns
        start_dt: passed to the URL template
        end_dt: passed to the URL template
        return_pandas: whether to return a pandas DataFrame instead of a Polars LazyFrame

    Returns:
        The joined data as a LazyFrame, or as a pandas DataFrame when
        ``return_pandas`` is True.
    """
    # NOTE(review): the EXTRA_STATS template shown in this file only has a
    # {pos} placeholder, so start_dt/end_dt appear to have no effect on the
    # request - confirm whether date filtering was intended.
    urls = [
        ROOT_URL + EXTRA_STATS.format(pos=pos, start_dt=start_dt, end_dt=end_dt)
        for pos in ["pitcher", "batter"]
    ]
    responses = await _fetch_all_data(urls)
    df_list = [
        pl.scan_csv(data) for data in tqdm(responses, desc="Processing extra data")
    ]

    # Responses come back in the same order as `urls`: pitcher, then batter.
    p_df, b_df = df_list
    p_df = p_df.drop("player_name").rename(lambda x: f"{x}_pitcher")
    b_df = b_df.drop("player_name").rename(lambda x: f"{x}_batter")

    print("Joining data.")
    df = df.join(p_df, left_on="pitcher", right_on="player_id_pitcher", how="left")
    df = df.join(b_df, left_on="batter", right_on="player_id_batter", how="left")
    print("Done")
    # A LazyFrame has no to_pandas(); it must be collected first.
    return df.collect().to_pandas() if return_pandas else df
132
+
133
+
134
def _handle_dates(start_dt: str, end_dt: str) -> Tuple[dt.date, dt.date]:
    """
    Parse the two date strings and check that they are in order.

    Args:
        start_dt: the start date in 'YYYY-MM-DD' format
        end_dt: the end date in 'YYYY-MM-DD' format

    Returns:
        A tuple of datetime.date objects for the start and end dates.

    Raises:
        ValueError: if a string does not match the expected format, or if the
            start date is later than the end date.
    """
    first_day, last_day = (
        dt.datetime.strptime(raw, STATCAST_DATE_FORMAT).date()
        for raw in (start_dt, end_dt)
    )
    if first_day > last_day:
        raise ValueError("start_dt must be before end_dt.")
    return first_day, last_day
150
+
151
+
152
+ # this function comes from https://github.com/jldbc/pybaseball/blob/master/pybaseball/statcast.py
153
def _create_date_ranges(
    start: dt.date, stop: dt.date, step: int, verbose: bool = True
) -> Iterator[Tuple[dt.date, dt.date]]:
    """
    Yield (begin, end) date pairs covering ``start`` through ``stop`` in
    segments of ``step`` days, skipping dates outside each year's window.

    The window for a year comes from YEAR_RANGES; years not listed there use
    March 15 through November 15. The range is inclusive of ``stop``, and the
    last segment is clipped to it. ``verbose`` is accepted but not used.
    """
    cursor = start
    stride = dt.timedelta(days=step)

    while cursor <= stop:
        fallback_window = (
            cursor.replace(month=3, day=15),
            cursor.replace(month=11, day=15),
        )
        window_open, window_close = YEAR_RANGES.get(cursor.year, fallback_window)

        if cursor < window_open:
            # Before this year's window: move up to its first day.
            cursor = window_open
        elif cursor > window_close:
            # Past this year's window: jump to the next year's first day.
            next_year = cursor.year + 1
            next_fallback = (dt.date(month=3, day=15, year=next_year), None)
            cursor = YEAR_RANGES.get(next_year, next_fallback)[0]

        if cursor > stop:
            break

        segment_end = min(cursor + dt.timedelta(step - 1), stop)
        yield cursor, segment_end
        cursor += stride
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.2
2
+ Name: pybaseballstats
3
+ Version: 0.0.1
4
+ Summary: A Python package for scraping baseball data.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: aiohttp>=3.11.11
8
+ Requires-Dist: asyncio>=3.4.3
9
+ Requires-Dist: bs4>=0.0.2
10
+ Requires-Dist: pandas>=2.2.3
11
+ Requires-Dist: polars>=1.20.0
12
+ Requires-Dist: pytest>=8.3.4
13
+ Requires-Dist: ruff>=0.9.3
14
+ Requires-Dist: setuptools>=75.8.0
15
+ Requires-Dist: tqdm>=4.67.1
16
+
17
+ # pybaseballstats
18
+
19
+ A Python package for scraping baseball statistics from the web. Inspired by the pybaseball package by James LeDoux. This package is a work in progress and is not yet ready for use.
20
+
21
+ ## Available Sources
22
+
23
+ 1. Baseball Savant
24
+
25
+ ## Usage
26
+
27
+ Usage documentation can be found at /usage_docs/
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ pybaseballstats/__init__.py
5
+ pybaseballstats/fangraphs.py
6
+ pybaseballstats/statcast.py
7
+ pybaseballstats/statcast_utils.py
8
+ pybaseballstats.egg-info/PKG-INFO
9
+ pybaseballstats.egg-info/SOURCES.txt
10
+ pybaseballstats.egg-info/dependency_links.txt
11
+ pybaseballstats.egg-info/requires.txt
12
+ pybaseballstats.egg-info/top_level.txt
@@ -0,0 +1,9 @@
1
+ aiohttp>=3.11.11
2
+ asyncio>=3.4.3
3
+ bs4>=0.0.2
4
+ pandas>=2.2.3
5
+ polars>=1.20.0
6
+ pytest>=8.3.4
7
+ ruff>=0.9.3
8
+ setuptools>=75.8.0
9
+ tqdm>=4.67.1
@@ -0,0 +1 @@
1
+ pybaseballstats
@@ -0,0 +1,17 @@
1
[project]
name = "pybaseballstats"
version = "0.0.1"
description = "A Python package for scraping baseball data."
readme = "README.md"
requires-python = ">=3.13"
# Runtime dependencies only. `asyncio` is part of the standard library (the
# PyPI distribution of that name is an obsolete backport), so it is not listed.
dependencies = [
    "aiohttp>=3.11.11",
    "bs4>=0.0.2",
    "pandas>=2.2.3",
    "polars>=1.20.0",
    # Imported by pybaseballstats/fangraphs.py and pybaseballstats/statcast.py.
    "requests>=2.32.3",
    "setuptools>=75.8.0",
    "tqdm>=4.67.1",
]

[project.optional-dependencies]
# Development-only tooling; install with `pip install "pybaseballstats[dev]"`.
dev = [
    "pytest>=8.3.4",
    "ruff>=0.9.3",
]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ from setuptools import find_packages, setup
2
+
3
# setuptools entry point: declares the distribution name and version and
# which packages are shipped.
setup(
    name="pybaseballstats",
    version="0.0.1",
    # Ship the top-level package and any subpackages beneath it.
    packages=find_packages(include=["pybaseballstats", "pybaseballstats.*"]),
    # other setup arguments
)