wsba-hockey 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/api/api/index.py +1 -8
- wsba_hockey/tools/scraping.py +19 -13
- wsba_hockey/wsba_main.py +321 -114
- {wsba_hockey-1.1.6.dist-info → wsba_hockey-1.1.7.dist-info}/METADATA +1 -1
- {wsba_hockey-1.1.6.dist-info → wsba_hockey-1.1.7.dist-info}/RECORD +9 -9
- {wsba_hockey-1.1.6.dist-info → wsba_hockey-1.1.7.dist-info}/WHEEL +0 -0
- {wsba_hockey-1.1.6.dist-info → wsba_hockey-1.1.7.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-1.1.6.dist-info → wsba_hockey-1.1.7.dist-info}/top_level.txt +0 -0
wsba_hockey/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
from wsba_hockey.wsba_main import nhl_scrape_game,nhl_scrape_schedule,nhl_scrape_season,nhl_scrape_seasons_info,nhl_scrape_standings,nhl_scrape_roster,nhl_scrape_draft_rankings,nhl_scrape_prospects,nhl_calculate_stats,
|
1
|
+
from wsba_hockey.wsba_main import nhl_scrape_game,nhl_scrape_schedule,nhl_scrape_season,nhl_scrape_seasons_info,nhl_scrape_standings,nhl_scrape_roster,nhl_scrape_draft_rankings,nhl_scrape_prospects,nhl_calculate_stats,nhl_apply_xG,nhl_plot_skaters_shots,nhl_plot_games,repo_load_rosters,repo_load_schedule,repo_load_teaminfo,repo_load_pbp,repo_load_seasons
|
wsba_hockey/api/api/index.py
CHANGED
@@ -90,14 +90,7 @@ def schedule_info(season: int):
|
|
90
90
|
|
91
91
|
@app.get("/nhl/games/{game_id}")
|
92
92
|
def pbp(game_id: int):
|
93
|
-
|
94
|
-
|
95
|
-
season = info['season']
|
96
|
-
dataset = ds.dataset(f's3://weakside-breakout/pbp/parquet/nhl_pbp_{season}.parquet', format='parquet')
|
97
|
-
filter_expr = (ds.field('game_id')==game_id)
|
98
|
-
|
99
|
-
table = dataset.to_table(use_threads=True,filter=filter_expr)
|
100
|
-
df = table.to_pandas()
|
93
|
+
df = pd.read_csv(f'data/sources/20242025/{game_id}.csv')
|
101
94
|
|
102
95
|
df = df.fillna('')
|
103
96
|
|
wsba_hockey/tools/scraping.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import re
|
2
2
|
import warnings
|
3
3
|
import os
|
4
|
+
import asyncio
|
4
5
|
import numpy as np
|
5
6
|
import pandas as pd
|
6
7
|
import requests as rs
|
@@ -179,7 +180,7 @@ def get_game_info(game_id):
|
|
179
180
|
'coaches':get_game_coaches(game_id),
|
180
181
|
'json_shifts':json_shifts}
|
181
182
|
|
182
|
-
def parse_json(info):
|
183
|
+
async def parse_json(info):
|
183
184
|
#Given game info, return JSON document
|
184
185
|
|
185
186
|
#Retrieve data
|
@@ -340,7 +341,7 @@ def clean_html_pbp(info):
|
|
340
341
|
|
341
342
|
return cleaned_html
|
342
343
|
|
343
|
-
def parse_html(info):
|
344
|
+
async def parse_html(info):
|
344
345
|
#Given game info, return HTML event data
|
345
346
|
|
346
347
|
#Retrieve game information and html events
|
@@ -561,7 +562,7 @@ def espn_game_id(date,away,home):
|
|
561
562
|
#Return: ESPN game id
|
562
563
|
return game_id
|
563
564
|
|
564
|
-
def parse_espn(date,away,home):
|
565
|
+
async def parse_espn(date,away,home):
|
565
566
|
#Given a date formatted as YYYY-MM-DD and teams, return game events
|
566
567
|
game_id = espn_game_id(date,away,home)
|
567
568
|
url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'
|
@@ -711,15 +712,24 @@ def assign_target(data):
|
|
711
712
|
#Revert sort and return dataframe
|
712
713
|
return data.reset_index()
|
713
714
|
|
714
|
-
def combine_pbp(info,sources):
|
715
|
+
async def combine_pbp(info,sources):
|
715
716
|
#Given game info, return complete play-by-play data for provided game
|
716
717
|
|
717
|
-
|
718
|
+
#Create tasks
|
719
|
+
html_task = asyncio.create_task(parse_html(info))
|
720
|
+
if info['season'] in [20052006, 20062007, 20072008, 20082009, 20092010]:
|
721
|
+
json_task = asyncio.create_task(parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']))
|
722
|
+
json_type = 'espn'
|
723
|
+
else:
|
724
|
+
json_task = asyncio.create_task(parse_json(info))
|
725
|
+
json_type = 'nhl'
|
718
726
|
|
727
|
+
html_pbp, json_pbp = await asyncio.gather(html_task, json_task)
|
728
|
+
|
719
729
|
#Route data combining - json if season is after 2009-2010:
|
720
|
-
if
|
730
|
+
if json_type == 'espn':
|
721
731
|
#ESPN x HTML
|
722
|
-
espn_pbp =
|
732
|
+
espn_pbp = json_pbp.rename(columns={'coords_x':'x',"coords_y":'y'}).sort_values(['period','seconds_elapsed']).reset_index()
|
723
733
|
merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']
|
724
734
|
|
725
735
|
#Merge pbp
|
@@ -727,8 +737,6 @@ def combine_pbp(info,sources):
|
|
727
737
|
|
728
738
|
else:
|
729
739
|
#JSON x HTML
|
730
|
-
json_pbp = parse_json(info)
|
731
|
-
|
732
740
|
if sources:
|
733
741
|
dirs_html = f'sources/{info['season']}/HTML/'
|
734
742
|
dirs_json = f'sources/{info['season']}/JSON/'
|
@@ -1077,12 +1085,10 @@ def combine_shifts(info,sources):
|
|
1077
1085
|
#Return: full shifts data converted to play-by-play format
|
1078
1086
|
return full_shifts
|
1079
1087
|
|
1080
|
-
def combine_data(info,sources):
|
1088
|
+
async def combine_data(info,sources):
|
1081
1089
|
#Given game info, return complete play-by-play data
|
1082
1090
|
|
1083
|
-
|
1084
|
-
|
1085
|
-
pbp = combine_pbp(info,sources)
|
1091
|
+
pbp = await combine_pbp(info,sources)
|
1086
1092
|
shifts = combine_shifts(info,sources)
|
1087
1093
|
|
1088
1094
|
#Combine data
|
wsba_hockey/wsba_main.py
CHANGED
@@ -2,7 +2,9 @@ import random
|
|
2
2
|
import os
|
3
3
|
import requests as rs
|
4
4
|
import pandas as pd
|
5
|
+
import asyncio
|
5
6
|
import time
|
7
|
+
from typing import Literal, Union
|
6
8
|
from datetime import datetime, timedelta, date
|
7
9
|
from wsba_hockey.tools.scraping import *
|
8
10
|
from wsba_hockey.tools.xg_model import *
|
@@ -112,29 +114,41 @@ INFO_PATH = os.path.join(DIR,'tools\\teaminfo\\nhl_teaminfo.csv')
|
|
112
114
|
DEFAULT_ROSTER = os.path.join(DIR,'tools\\rosters\\nhl_rosters.csv')
|
113
115
|
|
114
116
|
## SCRAPE FUNCTIONS ##
|
115
|
-
def nhl_scrape_game(game_ids,split_shifts = False, remove = [
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
117
|
+
def nhl_scrape_game(game_ids:list[int], split_shifts:bool = False, remove:list[str] = [], verbose:bool = False, sources:bool = False, errors:bool = False):
|
118
|
+
"""
|
119
|
+
Given a set of game_ids (NHL API), return complete play-by-play information as requested.
|
120
|
+
|
121
|
+
Args:
|
122
|
+
game_ids (List[int] or ['random', int, int, int]):
|
123
|
+
List of NHL game IDs to scrape or use ['random', n, start_year, end_year] to fetch n random games.
|
124
|
+
split_shifts (bool, optional):
|
125
|
+
If True, returns a dict with separate 'pbp' and 'shifts' DataFrames. Default is False.
|
126
|
+
remove (List[str], optional):
|
127
|
+
List of event types to remove from the result. Default is an empty list.
|
128
|
+
verbose (bool, optional):
|
129
|
+
If True, generates extra event features (such as those required to calculate xG). Default is False.
|
130
|
+
sources (bool, optional):
|
131
|
+
If True, saves raw HTML, JSON, SHIFTS, and single-game full play-by-play to a separate folder in the working directory. Default is False.
|
132
|
+
errors (bool, optional):
|
133
|
+
If True, includes a list of game IDs that failed to scrape in the return. Default is False.
|
134
|
+
|
135
|
+
Returns:
|
136
|
+
pd.DataFrame:
|
137
|
+
If split_shifts is False, returns a single DataFrame of play-by-play data.
|
138
|
+
dict[str, pd.DataFrame]:
|
139
|
+
If split_shifts is True, returns a dictionary with keys:
|
140
|
+
- 'pbp': play-by-play events
|
141
|
+
- 'shifts': shift change events
|
142
|
+
- 'errors' (optional): list of game IDs that failed if errors=True
|
143
|
+
"""
|
144
|
+
|
125
145
|
pbps = []
|
126
146
|
if game_ids[0] == 'random':
|
127
147
|
#Randomize selection of game_ids
|
128
148
|
#Some ids returned may be invalid (for example, 2020021300)
|
129
149
|
num = game_ids[1]
|
130
|
-
|
131
|
-
|
132
|
-
except:
|
133
|
-
start = 2007
|
134
|
-
try:
|
135
|
-
end = game_ids[3]
|
136
|
-
except:
|
137
|
-
end = (date.today().year)-1
|
150
|
+
start = game_ids[2] if len(game_ids) > 1 else 2007
|
151
|
+
end = game_ids[3] if len(game_ids) > 2 else (date.today().year)-1
|
138
152
|
|
139
153
|
game_ids = []
|
140
154
|
i = 0
|
@@ -161,13 +175,13 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
|
|
161
175
|
error_ids = []
|
162
176
|
prog = 0
|
163
177
|
for game_id in game_ids:
|
164
|
-
print(
|
178
|
+
print(f'Scraping data from game {game_id}...',end='')
|
165
179
|
start = time.perf_counter()
|
166
180
|
|
167
181
|
try:
|
168
182
|
#Retrieve data
|
169
183
|
info = get_game_info(game_id)
|
170
|
-
data = combine_data(info, sources)
|
184
|
+
data = asyncio.run(combine_data(info, sources))
|
171
185
|
|
172
186
|
#Append data to list
|
173
187
|
pbps.append(data)
|
@@ -186,19 +200,19 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
|
|
186
200
|
data.to_csv(f'{dirs}{info['game_id']}.csv',index=False)
|
187
201
|
|
188
202
|
print(f" finished in {secs:.2f} seconds. {prog}/{len(game_ids)} ({(prog/len(game_ids))*100:.2f}%)")
|
189
|
-
except:
|
203
|
+
except Exception as e:
|
190
204
|
#Games such as the all-star game and pre-season games will incur this error
|
191
205
|
#Other games have known problems
|
192
206
|
if game_id in KNOWN_PROBS.keys():
|
193
207
|
print(f"\nGame {game_id} has a known problem: {KNOWN_PROBS[game_id]}")
|
194
208
|
else:
|
195
|
-
print(f"\nUnable to scrape game {game_id}.
|
209
|
+
print(f"\nUnable to scrape game {game_id}. Exception: {e}")
|
196
210
|
|
197
211
|
#Track error
|
198
212
|
error_ids.append(game_id)
|
199
213
|
|
200
214
|
#Add all pbps together
|
201
|
-
if
|
215
|
+
if not pbps:
|
202
216
|
print("\rNo data returned.")
|
203
217
|
return pd.DataFrame()
|
204
218
|
df = pd.concat(pbps)
|
@@ -210,7 +224,7 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
|
|
210
224
|
""
|
211
225
|
|
212
226
|
#Print final message
|
213
|
-
if
|
227
|
+
if error_ids:
|
214
228
|
print(f'\rScrape of provided games finished.\nThe following games failed to scrape: {error_ids}')
|
215
229
|
else:
|
216
230
|
print('\rScrape of provided games finished.')
|
@@ -218,7 +232,7 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
|
|
218
232
|
#Split pbp and shift events if necessary
|
219
233
|
#Return: complete play-by-play with data removed or split as necessary
|
220
234
|
|
221
|
-
if split_shifts
|
235
|
+
if split_shifts:
|
222
236
|
remove.append('change')
|
223
237
|
|
224
238
|
#Return: dict with pbp and shifts separated
|
@@ -242,22 +256,40 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
|
|
242
256
|
else:
|
243
257
|
return pbp
|
244
258
|
|
245
|
-
def nhl_scrape_schedule(season,start =
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
259
|
+
def nhl_scrape_schedule(season:int, start:str = '', end:str = ''):
|
260
|
+
"""
|
261
|
+
Given season and an optional date range, retrieve NHL schedule data.
|
262
|
+
|
263
|
+
Args:
|
264
|
+
season (int):
|
265
|
+
The NHL season formatted such as "20242025".
|
266
|
+
start (str, optional):
|
267
|
+
The date string (MM-DD) to start the schedule scrape at. Default is a blank string.
|
268
|
+
end (str, optional):
|
269
|
+
The date string (MM-DD) to end the schedule scrape at. Default is a blank string.
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
pd.DataFrame:
|
273
|
+
A DataFrame containing the schedule data for the specified season and date range.
|
274
|
+
"""
|
250
275
|
|
251
276
|
api = "https://api-web.nhle.com/v1/schedule/"
|
252
277
|
|
253
|
-
#
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
278
|
+
#If either start or end are blank then find start and endpoints for specified season
|
279
|
+
if start == '' or end == '':
|
280
|
+
season_data = rs.get('https://api.nhle.com/stats/rest/en/season').json()['data']
|
281
|
+
season_data = [s for s in season_data if s['id'] == season][0]
|
282
|
+
start = season_data['startDate'][0:10]
|
283
|
+
end = season_data['endDate'][0:10]
|
258
284
|
else:
|
259
|
-
|
260
|
-
|
285
|
+
#Determine how to approach scraping; if month in season is after the new year the year must be adjusted
|
286
|
+
new_year = ["01","02","03","04","05","06"]
|
287
|
+
if start[:2] in new_year:
|
288
|
+
start = f'{int(str(season)[:4])+1}-{start}'
|
289
|
+
end = f'{str(season)[:-4]}-{end}'
|
290
|
+
else:
|
291
|
+
start = f'{int(str(season)[:4])}-{start}'
|
292
|
+
end = f'{str(season)[:-4]}-{end}'
|
261
293
|
|
262
294
|
form = '%Y-%m-%d'
|
263
295
|
|
@@ -274,9 +306,9 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
|
|
274
306
|
for i in range(day):
|
275
307
|
#For each day, call NHL api and retrieve info on all games of selected day
|
276
308
|
inc = start+timedelta(days=i)
|
277
|
-
print(
|
309
|
+
print(f'Scraping games on {str(inc)[:10]}...')
|
278
310
|
|
279
|
-
get = rs.get(api
|
311
|
+
get = rs.get(f'{api}{str(inc)[:10]}').json()
|
280
312
|
gameWeek = pd.json_normalize(list(pd.json_normalize(get['gameWeek'])['games'])[0])
|
281
313
|
|
282
314
|
#Return nothing if there's nothing
|
@@ -302,43 +334,81 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
|
|
302
334
|
#Return: specified schedule data
|
303
335
|
return df
|
304
336
|
|
305
|
-
def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove = [
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
337
|
+
def nhl_scrape_season(season:int, split_shifts:bool = False, season_types:list[int] = [2,3], remove:list[str] = [], start:str = '', end:str = '', local:bool=False, local_path:str = SCHEDULE_PATH, verbose:bool = False, sources:bool = False, errors:bool = False):
|
338
|
+
"""
|
339
|
+
Given season, scrape all play-by-play occurring within the season.
|
340
|
+
|
341
|
+
Args:
|
342
|
+
season (int):
|
343
|
+
The NHL season formatted such as "20242025".
|
344
|
+
split_shifts (bool, optional):
|
345
|
+
If True, returns a dict with separate 'pbp' and 'shifts' DataFrames. Default is False.
|
346
|
+
season_types (List[int], optional):
|
347
|
+
List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3 respectively.
|
348
|
+
remove (List[str], optional):
|
349
|
+
List of event types to remove from the result. Default is an empty list.
|
350
|
+
start (str, optional):
|
351
|
+
The date string (MM-DD) to start the schedule scrape at. Default is a blank string.
|
352
|
+
end (str, optional):
|
353
|
+
The date string (MM-DD) to end the schedule scrape at. Default is a blank string.
|
354
|
+
local (bool, optional):
|
355
|
+
If True, use local file to retrieve schedule data.
|
356
|
+
local_path (str, optional):
|
357
|
+
Path to the file with schedule data necessary to scrape a season's games (only relevant if local = True).
|
358
|
+
verbose (bool, optional):
|
359
|
+
If True, generates extra event features (such as those required to calculate xG). Default is False.
|
360
|
+
sources (bool, optional):
|
361
|
+
If True, saves raw HTML, JSON, SHIFTS, and single-game full play-by-play to a separate folder in the working directory. Default is False.
|
362
|
+
errors (bool, optional):
|
363
|
+
If True, includes a list of game IDs that failed to scrape in the return. Default is False.
|
364
|
+
|
365
|
+
Returns:
|
366
|
+
pd.DataFrame:
|
367
|
+
If split_shifts is False, returns a single DataFrame of play-by-play data.
|
368
|
+
dict[str, pd.DataFrame]:
|
369
|
+
If split_shifts is True, returns a dictionary with keys:
|
370
|
+
- 'pbp': play-by-play events
|
371
|
+
- 'shifts': shift change events
|
372
|
+
- 'errors' (optional): list of game IDs that failed if errors=True
|
373
|
+
"""
|
374
|
+
|
318
375
|
#Determine whether to use schedule data in repository or to scrape
|
319
376
|
if local:
|
320
377
|
load = pd.read_csv(local_path)
|
321
378
|
load['date'] = pd.to_datetime(load['date'])
|
322
|
-
|
323
|
-
start
|
324
|
-
|
325
|
-
|
326
|
-
|
379
|
+
|
380
|
+
if start == '' or end == '':
|
381
|
+
season_data = rs.get('https://api.nhle.com/stats/rest/en/season').json()['data']
|
382
|
+
season_data = [s for s in season_data if s['id'] == season][0]
|
383
|
+
start = season_data['startDate'][0:10]
|
384
|
+
end = season_data['endDate'][0:10]
|
385
|
+
|
386
|
+
form = '%Y-%m-%d'
|
387
|
+
|
388
|
+
#Create datetime values from dates
|
389
|
+
start = datetime.strptime(start,form)
|
390
|
+
end = datetime.strptime(end,form)
|
391
|
+
|
392
|
+
else:
|
393
|
+
start = f'{(str(season)[0:4] if int(start[0:2])>=9 else str(season)[4:8])}-{start[0:2]}-{start[3:5]}'
|
394
|
+
end = f'{(str(season)[0:4] if int(end[0:2])>=9 else str(season)[4:8])}-{end[0:2]}-{end[3:5]}'
|
395
|
+
|
396
|
+
load = load.loc[(load['season']==season)&
|
327
397
|
(load['season_type'].isin(season_types))&
|
328
398
|
(load['date']>=start)&(load['date']<=end)]
|
329
399
|
|
330
|
-
game_ids =
|
400
|
+
game_ids = load['id'].to_list()
|
331
401
|
else:
|
332
402
|
load = nhl_scrape_schedule(season,start,end)
|
333
|
-
load = load.loc[(load['season']
|
334
|
-
game_ids =
|
403
|
+
load = load.loc[(load['season']==season)&(load['season_type'].isin(season_types))]
|
404
|
+
game_ids = load['id'].to_list()
|
335
405
|
|
336
406
|
#If no games found, terminate the process
|
337
407
|
if not game_ids:
|
338
408
|
print('No games found for dates in season...')
|
339
409
|
return ""
|
340
410
|
|
341
|
-
print(f"Scraping games from {season[0:4]}-{season[4:8]} season...")
|
411
|
+
print(f"Scraping games from {str(season)[0:4]}-{str(season)[4:8]} season...")
|
342
412
|
start = time.perf_counter()
|
343
413
|
|
344
414
|
#Perform scrape
|
@@ -354,11 +424,22 @@ def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove
|
|
354
424
|
#Return: Complete pbp and shifts data for specified season as well as dataframe of game_ids which failed to return data
|
355
425
|
return data
|
356
426
|
|
357
|
-
def nhl_scrape_seasons_info(seasons = []):
|
358
|
-
|
427
|
+
def nhl_scrape_seasons_info(seasons:list[int] = []):
|
428
|
+
"""
|
429
|
+
Returns info related to NHL seasons (by default, all seasons are included)
|
430
|
+
Args:
|
431
|
+
seasons (List[int], optional):
|
432
|
+
The NHL season formatted such as "20242025".
|
433
|
+
|
434
|
+
Returns:
|
435
|
+
pd.DataFrame:
|
436
|
+
A DataFrame containing the information for requested seasons.
|
437
|
+
"""
|
438
|
+
|
439
|
+
#
|
359
440
|
# param 'season' - list of seasons to include
|
360
441
|
|
361
|
-
print(
|
442
|
+
print(f'Scraping info for seasons: {seasons}')
|
362
443
|
api = "https://api.nhle.com/stats/rest/en/season"
|
363
444
|
info = "https://api-web.nhle.com/v1/standings-season"
|
364
445
|
data = rs.get(api).json()['data']
|
@@ -374,12 +455,20 @@ def nhl_scrape_seasons_info(seasons = []):
|
|
374
455
|
else:
|
375
456
|
return df.sort_values(by=['id'])
|
376
457
|
|
377
|
-
def nhl_scrape_standings(arg = "now", season_type = 2):
|
378
|
-
|
379
|
-
|
380
|
-
|
458
|
+
def nhl_scrape_standings(arg:str = "now", season_type:int = 2):
|
459
|
+
"""
|
460
|
+
Returns standings or playoff bracket
|
461
|
+
Args:
|
462
|
+
arg (str, optional):
|
463
|
+
Date formatted as 'YYYY-MM-DD' to scrape standings for specific date or 'now' for current standings. Default is 'now'.
|
464
|
+
season_type (int, optional):
|
465
|
+
Part of season to scrape. If 3 (playoffs) then scrape the playoff bracket for the season implied by arg. When arg = 'now' this is ignored. Default is 2.
|
466
|
+
|
467
|
+
Returns:
|
468
|
+
pd.DataFrame:
|
469
|
+
A DataFrame containing the standings information (or playoff bracket).
|
470
|
+
"""
|
381
471
|
|
382
|
-
#arg param is ignored when set to "now" if season_type param is 3
|
383
472
|
if season_type == 3:
|
384
473
|
if arg == "now":
|
385
474
|
arg = NEW
|
@@ -404,9 +493,19 @@ def nhl_scrape_standings(arg = "now", season_type = 2):
|
|
404
493
|
|
405
494
|
return pd.json_normalize(data)
|
406
495
|
|
407
|
-
def nhl_scrape_roster(season):
|
408
|
-
|
409
|
-
|
496
|
+
def nhl_scrape_roster(season: int):
|
497
|
+
"""
|
498
|
+
Returns rosters for all teams in a given season.
|
499
|
+
|
500
|
+
Args:
|
501
|
+
season (int):
|
502
|
+
The NHL season formatted such as "20242025".
|
503
|
+
|
504
|
+
Returns:
|
505
|
+
pd.DataFrame:
|
506
|
+
A DataFrame containing the rosters for all teams in the specified season.
|
507
|
+
"""
|
508
|
+
|
410
509
|
print("Scrpaing rosters for the "+ season + "season...")
|
411
510
|
teaminfo = pd.read_csv(info_path)
|
412
511
|
|
@@ -435,8 +534,18 @@ def nhl_scrape_roster(season):
|
|
435
534
|
|
436
535
|
return pd.concat(rosts)
|
437
536
|
|
438
|
-
def nhl_scrape_prospects(team):
|
439
|
-
|
537
|
+
def nhl_scrape_prospects(team:str):
|
538
|
+
"""
|
539
|
+
Returns prospects for specified team
|
540
|
+
|
541
|
+
Args:
|
542
|
+
team (str):
|
543
|
+
Three character team abbreviation such as 'BOS'
|
544
|
+
|
545
|
+
Returns:
|
546
|
+
pd.DataFrame:
|
547
|
+
A DataFrame containing the prospect data for the specified team.
|
548
|
+
"""
|
440
549
|
|
441
550
|
api = f'https://api-web.nhle.com/v1/prospects/{team}'
|
442
551
|
|
@@ -452,10 +561,20 @@ def nhl_scrape_prospects(team):
|
|
452
561
|
#Return: team prospects
|
453
562
|
return prospects
|
454
563
|
|
455
|
-
def nhl_scrape_team_info(country = False):
|
456
|
-
|
564
|
+
def nhl_scrape_team_info(country:bool = False):
|
565
|
+
"""
|
566
|
+
Returns team or country information from the NHL API.
|
567
|
+
|
568
|
+
Args:
|
569
|
+
country (bool, optional):
|
570
|
+
If True, returns country information instead of NHL team information.
|
457
571
|
|
458
|
-
|
572
|
+
Returns:
|
573
|
+
pd.DataFrame:
|
574
|
+
A DataFrame containing team or country information from the NHL API.
|
575
|
+
"""
|
576
|
+
|
577
|
+
print(f'Scraping {'country' if country else 'team'} information...')
|
459
578
|
api = f'https://api.nhle.com/stats/rest/en/{'country' if country else 'team'}'
|
460
579
|
|
461
580
|
data = pd.json_normalize(rs.get(api).json()['data'])
|
@@ -467,8 +586,19 @@ def nhl_scrape_team_info(country = False):
|
|
467
586
|
|
468
587
|
return data.sort_values(by=(['country3Code','countryCode','iocCode','countryName'] if country else ['fullName','triCode','id']))
|
469
588
|
|
470
|
-
def nhl_scrape_player_data(player_ids):
|
471
|
-
|
589
|
+
def nhl_scrape_player_data(player_ids:list[int]):
|
590
|
+
"""
|
591
|
+
Returns player data for specified players.
|
592
|
+
|
593
|
+
Args:
|
594
|
+
player_ids (list[int]):
|
595
|
+
List of NHL API player IDs to retrieve information for.
|
596
|
+
|
597
|
+
Returns:
|
598
|
+
pd.DataFrame:
|
599
|
+
A DataFrame containing player data for specified players.
|
600
|
+
"""
|
601
|
+
|
472
602
|
infos = []
|
473
603
|
for player_id in player_ids:
|
474
604
|
player_id = int(player_id)
|
@@ -489,15 +619,28 @@ def nhl_scrape_player_data(player_ids):
|
|
489
619
|
else:
|
490
620
|
return pd.DataFrame()
|
491
621
|
|
492
|
-
def nhl_scrape_draft_rankings(arg = 'now', category =
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
622
|
+
def nhl_scrape_draft_rankings(arg:str = 'now', category:int = 0):
|
623
|
+
"""
|
624
|
+
Returns draft rankings
|
625
|
+
Args:
|
626
|
+
arg (str, optional):
|
627
|
+
Date formatted as 'YYYY-MM-DD' to scrape draft rankings for specific date or 'now' for current draft rankings. Default is 'now'.
|
628
|
+
category (int, optional):
|
629
|
+
Category number for prospects. When arg = 'now' this does not apply.
|
630
|
+
|
631
|
+
- Category 1 is North American Skaters.
|
632
|
+
- Category 2 is International Skaters.
|
633
|
+
- Category 3 is North American Goalie.
|
634
|
+
- Category 4 is International Goalie
|
635
|
+
|
636
|
+
Default is 0 (all prospects).
|
637
|
+
Returns:
|
638
|
+
pd.DataFrame:
|
639
|
+
A DataFrame containing draft rankings.
|
640
|
+
"""
|
498
641
|
|
499
642
|
#Player category only applies when requesting a specific season
|
500
|
-
api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}" if category
|
643
|
+
api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}" if category > 0 else f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
|
501
644
|
data = pd.json_normalize(rs.get(api).json()['rankings'])
|
502
645
|
|
503
646
|
#Add player name columns
|
@@ -506,10 +649,16 @@ def nhl_scrape_draft_rankings(arg = 'now', category = ''):
|
|
506
649
|
#Return: prospect rankings
|
507
650
|
return data
|
508
651
|
|
509
|
-
def nhl_apply_xG(pbp):
|
510
|
-
|
511
|
-
|
512
|
-
|
652
|
+
def nhl_apply_xG(pbp: pd.DataFrame):
|
653
|
+
"""
|
654
|
+
Given play-by-play data, return this data with xG-related columns
|
655
|
+
Args:
|
656
|
+
pbp (pd.DataFrame):
|
657
|
+
A DataFrame containing play-by-play data generated within the WSBA Hockey package.
|
658
|
+
Returns:
|
659
|
+
pd.DataFrame:
|
660
|
+
A DataFrame containing input play-by-play data with xG column.
|
661
|
+
"""
|
513
662
|
|
514
663
|
print(f'Applying WSBA xG to model with seasons: {pbp['season'].drop_duplicates().to_list()}')
|
515
664
|
|
@@ -518,7 +667,7 @@ def nhl_apply_xG(pbp):
|
|
518
667
|
|
519
668
|
return pbp
|
520
669
|
|
521
|
-
def
|
670
|
+
def shooting_impacts(agg, type):
|
522
671
|
#Given stats table generated from the nhl_calculate_stats function, return table with shot impacts
|
523
672
|
#Only 5v5 is supported as of now
|
524
673
|
|
@@ -868,7 +1017,7 @@ def nhl_shooting_impacts(agg,type):
|
|
868
1017
|
#Return: skater stats with shooting impacts
|
869
1018
|
return df
|
870
1019
|
|
871
|
-
def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,roster_path=DEFAULT_ROSTER,shot_impact=False):
|
1020
|
+
def nhl_calculate_stats(pbp:pd.DataFrame, type:Literal['skater','goalie','team'], season_types:list[int], game_strength: Union[Literal['all'], list[str]], split_game:bool = False, roster_path:str = DEFAULT_ROSTER, shot_impact:bool = False):
|
872
1021
|
#Given play-by-play, seasonal information, game_strength, rosters, and xG model, return aggregated stats
|
873
1022
|
# param 'pbp' - play-by-play dataframe
|
874
1023
|
# param 'type' - type of stats to calculate ('skater', 'goalie', or 'team')
|
@@ -879,6 +1028,33 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
|
|
879
1028
|
# param 'roster_path' - path to roster file
|
880
1029
|
# param 'shot_impact' - boolean determining if the shot impact model will be applied to the dataset
|
881
1030
|
|
1031
|
+
"""
|
1032
|
+
Given play-by-play data, seasonal information, game strength, rosters, and an xG model,
|
1033
|
+
return aggregated statistics at the skater, goalie, or team level.
|
1034
|
+
|
1035
|
+
Args:
|
1036
|
+
pbp (pd.DataFrame):
|
1037
|
+
A DataFrame containing play-by-play event data.
|
1038
|
+
type (Literal['skater', 'goalie', 'team']):
|
1039
|
+
Type of statistics to calculate. Must be one of 'skater', 'goalie', or 'team'.
|
1040
|
+
season (int):
|
1041
|
+
The NHL season formatted such as "20242025".
|
1042
|
+
season_types (List[int], optional):
|
1043
|
+
List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3 respectively.
|
1044
|
+
game_strength (str or list[str]):
|
1045
|
+
List of game strength states to include (e.g., ['5v5','5v4','4v5']).
|
1046
|
+
split_game (bool, optional):
|
1047
|
+
If True, aggregates stats separately for each game; otherwise, stats are aggregated across all games. Default is False.
|
1048
|
+
roster_path (str, optional):
|
1049
|
+
File path to the roster data used for mapping players and teams.
|
1050
|
+
shot_impact (bool, optional):
|
1051
|
+
If True, applies shot impact metrics to the stats DataFrame. Default is False.
|
1052
|
+
|
1053
|
+
Returns:
|
1054
|
+
pd.DataFrame:
|
1055
|
+
A DataFrame containing the aggregated statistics according to the selected parameters.
|
1056
|
+
"""
|
1057
|
+
|
882
1058
|
print(f"Calculating statistics for all games in the provided play-by-play data at {game_strength} for {type}s...\nSeasons included: {pbp['season'].drop_duplicates().to_list()}...")
|
883
1059
|
start = time.perf_counter()
|
884
1060
|
|
@@ -970,7 +1146,7 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
|
|
970
1146
|
|
971
1147
|
#Apply shot impacts if necessary
|
972
1148
|
if shot_impact:
|
973
|
-
complete =
|
1149
|
+
complete = shooting_impacts(complete,'goalie')
|
974
1150
|
|
975
1151
|
end = time.perf_counter()
|
976
1152
|
length = end-start
|
@@ -1014,7 +1190,7 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
|
|
1014
1190
|
]+[f'{stat}/60' for stat in PER_SIXTY[11:len(PER_SIXTY)]]]
|
1015
1191
|
#Apply shot impacts if necessary
|
1016
1192
|
if shot_impact:
|
1017
|
-
complete =
|
1193
|
+
complete = shooting_impacts(complete,'team')
|
1018
1194
|
|
1019
1195
|
end = time.perf_counter()
|
1020
1196
|
length = end-start
|
@@ -1117,7 +1293,7 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
|
|
1117
1293
|
|
1118
1294
|
#Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
|
1119
1295
|
if shot_impact:
|
1120
|
-
complete =
|
1296
|
+
complete = shooting_impacts(complete,'skater')
|
1121
1297
|
|
1122
1298
|
end = time.perf_counter()
|
1123
1299
|
length = end-start
|
@@ -1125,16 +1301,34 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
|
|
1125
1301
|
|
1126
1302
|
return complete
|
1127
1303
|
|
1128
|
-
def nhl_plot_skaters_shots(pbp,skater_dict,strengths,marker_dict=event_markers,onice = 'indv',title = True,legend=False):
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1304
|
+
def nhl_plot_skaters_shots(pbp:pd.DataFrame, skater_dict:dict, strengths:Union[Literal['all'], list[str]], marker_dict:dict = event_markers, onice:Literal['indv','for','against'] = ['indv'], title:bool = True, legend:bool = False):
|
1305
|
+
"""
|
1306
|
+
Return a dictionary of shot plots for the specified skaters.
|
1307
|
+
|
1308
|
+
Args:
|
1309
|
+
pbp (pd.DataFrame):
|
1310
|
+
A DataFrame containing play-by-play event data to be visualized.
|
1311
|
+
skater_dict (dict[str, list[str]]):
|
1312
|
+
Dictionary of skaters to plot, where each key is a player name and the value is a list
|
1313
|
+
with season and team info (e.g., {'Patrice Bergeron': ['20242025', 'BOS']}).
|
1314
|
+
strengths (str or list[str]):
|
1315
|
+
List of game strength states to include (e.g., ['5v5','5v4','4v5']).
|
1316
|
+
marker_dict (dict[str, dict], optional):
|
1317
|
+
Dictionary of event types mapped to marker styles used in plotting.
|
1318
|
+
onice (Literal['indv', 'for', 'against'], optional):
|
1319
|
+
Determines which shot events to include for the player:
|
1320
|
+
- 'indv': only the player's own shots,
|
1321
|
+
- 'for': shots taken by the player's team while they are on ice,
|
1322
|
+
- 'against': shots taken by the opposing team while the player is on ice.
|
1323
|
+
title (bool, optional):
|
1324
|
+
Whether to include a plot title.
|
1325
|
+
legend (bool, optional):
|
1326
|
+
Whether to include a legend on the plots.
|
1327
|
+
|
1328
|
+
Returns:
|
1329
|
+
dict[str, matplotlib.figure.Figure]:
|
1330
|
+
A dictionary mapping each skater’s name to their corresponding matplotlib shot plot figure.
|
1331
|
+
"""
|
1138
1332
|
|
1139
1333
|
print(f'Plotting the following skater shots: {skater_dict}...')
|
1140
1334
|
|
@@ -1149,15 +1343,28 @@ def nhl_plot_skaters_shots(pbp,skater_dict,strengths,marker_dict=event_markers,o
|
|
1149
1343
|
#Return: dictionary of plotted skater shot charts
|
1150
1344
|
return skater_plots
|
1151
1345
|
|
1152
|
-
def nhl_plot_games(pbp,events,strengths,game_ids='all',marker_dict=event_markers,team_colors={'away':'primary','home':'primary'},legend=False):
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1346
|
+
def nhl_plot_games(pbp:pd.DataFrame, events:list[str], strengths:Union[Literal['all'], list[str]], game_ids: Union[Literal['all'], list[int]] = 'all', marker_dict:dict = event_markers, team_colors:dict = {'away':'primary','home':'primary'}, legend:bool =False):
|
1347
|
+
"""
|
1348
|
+
Returns a dictionary of event plots for the specified games.
|
1349
|
+
|
1350
|
+
Args:
|
1351
|
+
pbp (pd.DataFrame):
|
1352
|
+
A DataFrame containing play-by-play event data.
|
1353
|
+
events (list[str]):
|
1354
|
+
List of event types to include in the plot (e.g., ['shot-on-goal', 'goal']).
|
1355
|
+
strengths (str or list[str]):
|
1356
|
+
Either the string 'all' or a list of game strength states to include (e.g., ['5v5','5v4','4v5']).
|
1357
|
+
game_ids (str or list[int]):
|
1358
|
+
List of game IDs to plot. If set to 'all', plots will be generated for all games in the DataFrame.
|
1359
|
+
marker_dict (dict[str, dict]):
|
1360
|
+
Dictionary mapping event types to marker styles and/or colors used in plotting.
|
1361
|
+
legend (bool):
|
1362
|
+
Whether to include a legend on the plots.
|
1363
|
+
|
1364
|
+
Returns:
|
1365
|
+
dict[int, matplotlib.figure.Figure]:
|
1366
|
+
A dictionary mapping each game ID to its corresponding matplotlib event plot figure.
|
1367
|
+
"""
|
1161
1368
|
|
1162
1369
|
#Find games to plot
|
1163
1370
|
if game_ids == 'all':
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: wsba_hockey
|
3
|
-
Version: 1.1.
|
3
|
+
Version: 1.1.7
|
4
4
|
Summary: WeakSide Breakout's complete Python package of access to hockey data, primarily including the scraping of National Hockey League schedule, play-by-play, and shifts information.
|
5
5
|
Author-email: Owen Singh <owenbksingh@gmail.com>
|
6
6
|
Project-URL: Homepage, https://github.com/owensingh38/wsba_hockey/
|
@@ -1,8 +1,8 @@
|
|
1
|
-
wsba_hockey/__init__.py,sha256=
|
1
|
+
wsba_hockey/__init__.py,sha256=qye0rq22KeaUzBPH__pqjBA_igwsmHemOAbaY_G2tNY,356
|
2
2
|
wsba_hockey/data_pipelines.py,sha256=SITapG3nbea6-_EsXujMW2JBQxtRaQ33XMcE6ohn2Ko,10853
|
3
3
|
wsba_hockey/workspace.py,sha256=MwuyqyLW0dHNa06WEm60RkvbFoCn8LBXhnki66V-ttY,954
|
4
|
-
wsba_hockey/wsba_main.py,sha256=
|
5
|
-
wsba_hockey/api/api/index.py,sha256=
|
4
|
+
wsba_hockey/wsba_main.py,sha256=N5i1y1QtP4jsnsSNKIR_lcAjl_V8oqAlH2YRNTWSUZk,62347
|
5
|
+
wsba_hockey/api/api/index.py,sha256=r2keq105Ve8V0JAsSZMIPs9geVHX2Fuxyi4MqnzCt48,6537
|
6
6
|
wsba_hockey/evidence/weakside-breakout/node_modules/duckdb/vendor.py,sha256=lmu0TB0rIYkAuV9-csFJgW-1hJojso_-EZpEoorUUKM,4949
|
7
7
|
wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/flatted.py,sha256=ke8FuEflns-WlphCcQ9CC0qJqWqX3zEEuak74o6rgE8,3879
|
8
8
|
wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/test.py,sha256=uTOn6HJd7KeY_PTRvvufv60dmvON3KWp3nnqACj8IlA,2129
|
@@ -134,15 +134,15 @@ wsba_hockey/flask/app.py,sha256=J51iA65h9xyJfLgdH0h2sVSbfIR7xgGd2Oy8bJsmpAk,1873
|
|
134
134
|
wsba_hockey/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
135
135
|
wsba_hockey/tools/agg.py,sha256=OkIYd-ApvGVYe2JJLOI21jnDIN5LH8nkeH7eo0reWFI,23364
|
136
136
|
wsba_hockey/tools/plotting.py,sha256=81hBaM7tcwUNB4-tovPn7QreOUz6B2NuI_SR4-djVSk,6029
|
137
|
-
wsba_hockey/tools/scraping.py,sha256
|
137
|
+
wsba_hockey/tools/scraping.py,sha256=-sv29886AWAMhhpJ14282WTolBZni8eXBvj4OtNVY-U,52863
|
138
138
|
wsba_hockey/tools/xg_model.py,sha256=nOr_2RBijLgPmJ0TTs4wbSsORYmRqWCKRjLKDm7sAhI,18342
|
139
139
|
wsba_hockey/tools/archive/old_scraping.py,sha256=hEjMI1RtfeZnf0RBiJFI38oXkLZ3WofeH5xqcF4pzgM,49585
|
140
140
|
wsba_hockey/tools/utils/__init__.py,sha256=vccXhOtzARoR99fmEWU1OEI3qCIdQ9Z42AlRA_BUhrs,114
|
141
141
|
wsba_hockey/tools/utils/config.py,sha256=D3Uk05-YTyrhfReMTTLfNI3HN_rON2uo_CDE9oER3Lg,351
|
142
142
|
wsba_hockey/tools/utils/save_pages.py,sha256=CsyL_0n-b-4pJoUauwU3HpnCO6n69-RlBMJQBd_qGDc,4979
|
143
143
|
wsba_hockey/tools/utils/shared.py,sha256=dH_JwZfia5fib8rksy5sW-mBp0pluBPvw37Vdr8Kap0,14211
|
144
|
-
wsba_hockey-1.1.
|
145
|
-
wsba_hockey-1.1.
|
146
|
-
wsba_hockey-1.1.
|
147
|
-
wsba_hockey-1.1.
|
148
|
-
wsba_hockey-1.1.
|
144
|
+
wsba_hockey-1.1.7.dist-info/licenses/LICENSE,sha256=Nr_Um1Pd5FQJTWWgm7maZArdtYMbDhzXYSwyJIZDGik,1114
|
145
|
+
wsba_hockey-1.1.7.dist-info/METADATA,sha256=O_B4EEwc9nbOpnAO8KVoA1Vv-mJHIUEuyqNP_OLrx7s,3566
|
146
|
+
wsba_hockey-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
147
|
+
wsba_hockey-1.1.7.dist-info/top_level.txt,sha256=acU7s3x-RZC1zGiqCOmO0g267iqCg34lzIfdmYxxGmQ,12
|
148
|
+
wsba_hockey-1.1.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|