wsba-hockey 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wsba_hockey/__init__.py CHANGED
@@ -1 +1 @@
1
- from wsba_hockey.wsba_main import nhl_scrape_game,nhl_scrape_schedule,nhl_scrape_season,nhl_scrape_seasons_info,nhl_scrape_standings,nhl_scrape_roster,nhl_scrape_draft_rankings,nhl_scrape_prospects,nhl_calculate_stats,nhl_shooting_impacts,nhl_apply_xG,nhl_plot_skaters_shots,nhl_plot_games,repo_load_rosters,repo_load_schedule,repo_load_teaminfo,repo_load_pbp,repo_load_seasons
1
+ from wsba_hockey.wsba_main import nhl_scrape_game,nhl_scrape_schedule,nhl_scrape_season,nhl_scrape_seasons_info,nhl_scrape_standings,nhl_scrape_roster,nhl_scrape_draft_rankings,nhl_scrape_prospects,nhl_calculate_stats,nhl_apply_xG,nhl_plot_skaters_shots,nhl_plot_games,repo_load_rosters,repo_load_schedule,repo_load_teaminfo,repo_load_pbp,repo_load_seasons
@@ -90,14 +90,7 @@ def schedule_info(season: int):
90
90
 
91
91
  @app.get("/nhl/games/{game_id}")
92
92
  def pbp(game_id: int):
93
- info = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play').json()
94
-
95
- season = info['season']
96
- dataset = ds.dataset(f's3://weakside-breakout/pbp/parquet/nhl_pbp_{season}.parquet', format='parquet')
97
- filter_expr = (ds.field('game_id')==game_id)
98
-
99
- table = dataset.to_table(use_threads=True,filter=filter_expr)
100
- df = table.to_pandas()
93
+ df = pd.read_csv(f'data/sources/20242025/{game_id}.csv')
101
94
 
102
95
  df = df.fillna('')
103
96
 
@@ -1,6 +1,7 @@
1
1
  import re
2
2
  import warnings
3
3
  import os
4
+ import asyncio
4
5
  import numpy as np
5
6
  import pandas as pd
6
7
  import requests as rs
@@ -179,7 +180,7 @@ def get_game_info(game_id):
179
180
  'coaches':get_game_coaches(game_id),
180
181
  'json_shifts':json_shifts}
181
182
 
182
- def parse_json(info):
183
+ async def parse_json(info):
183
184
  #Given game info, return JSON document
184
185
 
185
186
  #Retrieve data
@@ -340,7 +341,7 @@ def clean_html_pbp(info):
340
341
 
341
342
  return cleaned_html
342
343
 
343
- def parse_html(info):
344
+ async def parse_html(info):
344
345
  #Given game info, return HTML event data
345
346
 
346
347
  #Retrieve game information and html events
@@ -561,7 +562,7 @@ def espn_game_id(date,away,home):
561
562
  #Return: ESPN game id
562
563
  return game_id
563
564
 
564
- def parse_espn(date,away,home):
565
+ async def parse_espn(date,away,home):
565
566
  #Given a date formatted as YYYY-MM-DD and teams, return game events
566
567
  game_id = espn_game_id(date,away,home)
567
568
  url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'
@@ -711,15 +712,24 @@ def assign_target(data):
711
712
  #Revert sort and return dataframe
712
713
  return data.reset_index()
713
714
 
714
- def combine_pbp(info,sources):
715
+ async def combine_pbp(info,sources):
715
716
  #Given game info, return complete play-by-play data for provided game
716
717
 
717
- html_pbp = parse_html(info)
718
+ #Create tasks
719
+ html_task = asyncio.create_task(parse_html(info))
720
+ if info['season'] in [20052006, 20062007, 20072008, 20082009, 20092010]:
721
+ json_task = asyncio.create_task(parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']))
722
+ json_type = 'espn'
723
+ else:
724
+ json_task = asyncio.create_task(parse_json(info))
725
+ json_type = 'nhl'
718
726
 
727
+ html_pbp, json_pbp = await asyncio.gather(html_task, json_task)
728
+
719
729
  #Route data combining - json if season is after 2009-2010:
720
- if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
730
+ if json_type == 'espn':
721
731
  #ESPN x HTML
722
- espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'}).sort_values(['period','seconds_elapsed']).reset_index()
732
+ espn_pbp = json_pbp.rename(columns={'coords_x':'x',"coords_y":'y'}).sort_values(['period','seconds_elapsed']).reset_index()
723
733
  merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']
724
734
 
725
735
  #Merge pbp
@@ -727,8 +737,6 @@ def combine_pbp(info,sources):
727
737
 
728
738
  else:
729
739
  #JSON x HTML
730
- json_pbp = parse_json(info)
731
-
732
740
  if sources:
733
741
  dirs_html = f'sources/{info['season']}/HTML/'
734
742
  dirs_json = f'sources/{info['season']}/JSON/'
@@ -1077,12 +1085,10 @@ def combine_shifts(info,sources):
1077
1085
  #Return: full shifts data converted to play-by-play format
1078
1086
  return full_shifts
1079
1087
 
1080
- def combine_data(info,sources):
1088
+ async def combine_data(info,sources):
1081
1089
  #Given game info, return complete play-by-play data
1082
1090
 
1083
- game_id = info['game_id']
1084
-
1085
- pbp = combine_pbp(info,sources)
1091
+ pbp = await combine_pbp(info,sources)
1086
1092
  shifts = combine_shifts(info,sources)
1087
1093
 
1088
1094
  #Combine data
wsba_hockey/wsba_main.py CHANGED
@@ -2,7 +2,9 @@ import random
2
2
  import os
3
3
  import requests as rs
4
4
  import pandas as pd
5
+ import asyncio
5
6
  import time
7
+ from typing import Literal, Union
6
8
  from datetime import datetime, timedelta, date
7
9
  from wsba_hockey.tools.scraping import *
8
10
  from wsba_hockey.tools.xg_model import *
@@ -112,29 +114,41 @@ INFO_PATH = os.path.join(DIR,'tools\\teaminfo\\nhl_teaminfo.csv')
112
114
  DEFAULT_ROSTER = os.path.join(DIR,'tools\\rosters\\nhl_rosters.csv')
113
115
 
114
116
  ## SCRAPE FUNCTIONS ##
115
- def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','period-end','challenge','stoppage','shootout-complete','game-end'],verbose = False, sources = False, errors = False):
116
- #Given a set of game_ids (NHL API), return complete play-by-play information as requested
117
- # param 'game_ids' - NHL game ids (or list formatted as ['random', num_of_games, start_year, end_year])
118
- # param 'split_shifts' - boolean which splits pbp and shift events if true
119
- # param 'remove' - list of events to remove from final dataframe
120
- # param 'xg' - xG model to apply to pbp for aggregation
121
- # param 'verbose' - boolean which adds additional event info if true
122
- # param 'sources - boolean scraping the html and json sources to a master directory if true
123
- # param 'errors' - boolean returning game ids which did not scrape if true
124
-
117
+ def nhl_scrape_game(game_ids:list[int], split_shifts:bool = False, remove:list[str] = [], verbose:bool = False, sources:bool = False, errors:bool = False):
118
+ """
119
+ Given a set of game_ids (NHL API), return complete play-by-play information as requested.
120
+
121
+ Args:
122
+ game_ids (List[int] or ['random', int, int, int]):
123
+ List of NHL game IDs to scrape or use ['random', n, start_year, end_year] to fetch n random games.
124
+ split_shifts (bool, optional):
125
+ If True, returns a dict with separate 'pbp' and 'shifts' DataFrames. Default is False.
126
+ remove (List[str], optional):
127
+ List of event types to remove from the result. Default is an empty list.
128
+ verbose (bool, optional):
129
+ If True, generates extra event features (such as those required to calculate xG). Default is False.
130
+ sources (bool, optional):
131
+ If True, saves raw HTML, JSON, SHIFTS, and single-game full play-by-play to a separate folder in the working directory. Default is False.
132
+ errors (bool, optional):
133
+ If True, includes a list of game IDs that failed to scrape in the return. Default is False.
134
+
135
+ Returns:
136
+ pd.DataFrame:
137
+ If split_shifts is False, returns a single DataFrame of play-by-play data.
138
+ dict[str, pd.DataFrame]:
139
+ If split_shifts is True, returns a dictionary with keys:
140
+ - 'pbp': play-by-play events
141
+ - 'shifts': shift change events
142
+ - 'errors' (optional): list of game IDs that failed if errors=True
143
+ """
144
+
125
145
  pbps = []
126
146
  if game_ids[0] == 'random':
127
147
  #Randomize selection of game_ids
128
148
  #Some ids returned may be invalid (for example, 2020021300)
129
149
  num = game_ids[1]
130
- try:
131
- start = game_ids[2]
132
- except:
133
- start = 2007
134
- try:
135
- end = game_ids[3]
136
- except:
137
- end = (date.today().year)-1
150
+ start = game_ids[2] if len(game_ids) > 1 else 2007
151
+ end = game_ids[3] if len(game_ids) > 2 else (date.today().year)-1
138
152
 
139
153
  game_ids = []
140
154
  i = 0
@@ -161,13 +175,13 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
161
175
  error_ids = []
162
176
  prog = 0
163
177
  for game_id in game_ids:
164
- print("Scraping data from game " + str(game_id) + "...",end="")
178
+ print(f'Scraping data from game {game_id}...',end='')
165
179
  start = time.perf_counter()
166
180
 
167
181
  try:
168
182
  #Retrieve data
169
183
  info = get_game_info(game_id)
170
- data = combine_data(info, sources)
184
+ data = asyncio.run(combine_data(info, sources))
171
185
 
172
186
  #Append data to list
173
187
  pbps.append(data)
@@ -186,19 +200,19 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
186
200
  data.to_csv(f'{dirs}{info['game_id']}.csv',index=False)
187
201
 
188
202
  print(f" finished in {secs:.2f} seconds. {prog}/{len(game_ids)} ({(prog/len(game_ids))*100:.2f}%)")
189
- except:
203
+ except Exception as e:
190
204
  #Games such as the all-star game and pre-season games will incur this error
191
205
  #Other games have known problems
192
206
  if game_id in KNOWN_PROBS.keys():
193
207
  print(f"\nGame {game_id} has a known problem: {KNOWN_PROBS[game_id]}")
194
208
  else:
195
- print(f"\nUnable to scrape game {game_id}. Ensure the ID is properly inputted and formatted.")
209
+ print(f"\nUnable to scrape game {game_id}. Exception: {e}")
196
210
 
197
211
  #Track error
198
212
  error_ids.append(game_id)
199
213
 
200
214
  #Add all pbps together
201
- if len(pbps) == 0:
215
+ if not pbps:
202
216
  print("\rNo data returned.")
203
217
  return pd.DataFrame()
204
218
  df = pd.concat(pbps)
@@ -210,7 +224,7 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
210
224
  ""
211
225
 
212
226
  #Print final message
213
- if len(error_ids) > 0:
227
+ if error_ids:
214
228
  print(f'\rScrape of provided games finished.\nThe following games failed to scrape: {error_ids}')
215
229
  else:
216
230
  print('\rScrape of provided games finished.')
@@ -218,7 +232,7 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
218
232
  #Split pbp and shift events if necessary
219
233
  #Return: complete play-by-play with data removed or split as necessary
220
234
 
221
- if split_shifts == True:
235
+ if split_shifts:
222
236
  remove.append('change')
223
237
 
224
238
  #Return: dict with pbp and shifts separated
@@ -242,22 +256,40 @@ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','per
242
256
  else:
243
257
  return pbp
244
258
 
245
- def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
246
- #Given a season, return schedule data
247
- # param 'season' - NHL season to scrape
248
- # param 'start' - Start date in season
249
- # param 'end' - End date in season
259
+ def nhl_scrape_schedule(season:int, start:str = '', end:str = ''):
260
+ """
261
+ Given season and an optional date range, retrieve NHL schedule data.
262
+
263
+ Args:
264
+ season (int):
265
+ The NHL season formatted such as "20242025".
266
+ start (str, optional):
267
+ The date string (MM-DD) to start the schedule scrape at. Default is a blank string.
268
+ end (str, optional):
269
+ The date string (MM-DD) to end the schedule scrape at. Default is a blank string.
270
+
271
+ Returns:
272
+ pd.DataFrame:
273
+ A DataFrame containing the schedule data for the specified season and date range.
274
+ """
250
275
 
251
276
  api = "https://api-web.nhle.com/v1/schedule/"
252
277
 
253
- #Determine how to approach scraping; if month in season is after the new year the year must be adjusted
254
- new_year = ["01","02","03","04","05","06"]
255
- if start[:2] in new_year:
256
- start = str(int(season[:4])+1)+"-"+start
257
- end = str(season[:-4])+"-"+end
278
+ #If either start or end are blank then find start and endpoints for specified season
279
+ if start == '' or end == '':
280
+ season_data = rs.get('https://api.nhle.com/stats/rest/en/season').json()['data']
281
+ season_data = [s for s in season_data if s['id'] == season][0]
282
+ start = season_data['startDate'][0:10]
283
+ end = season_data['endDate'][0:10]
258
284
  else:
259
- start = str(season[:4])+"-"+start
260
- end = str(season[:-4])+"-"+end
285
+ #Determine how to approach scraping; if month in season is after the new year the year must be adjusted
286
+ new_year = ["01","02","03","04","05","06"]
287
+ if start[:2] in new_year:
288
+ start = f'{int(str(season)[:4])+1}-{start}'
289
+ end = f'{str(season)[:-4]}-{end}'
290
+ else:
291
+ start = f'{int(str(season)[:4])}-{start}'
292
+ end = f'{str(season)[:-4]}-{end}'
261
293
 
262
294
  form = '%Y-%m-%d'
263
295
 
@@ -274,9 +306,9 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
274
306
  for i in range(day):
275
307
  #For each day, call NHL api and retrieve info on all games of selected day
276
308
  inc = start+timedelta(days=i)
277
- print("Scraping games on " + str(inc)[:10]+"...")
309
+ print(f'Scraping games on {str(inc)[:10]}...')
278
310
 
279
- get = rs.get(api+str(inc)[:10]).json()
311
+ get = rs.get(f'{api}{str(inc)[:10]}').json()
280
312
  gameWeek = pd.json_normalize(list(pd.json_normalize(get['gameWeek'])['games'])[0])
281
313
 
282
314
  #Return nothing if there's nothing
@@ -302,43 +334,81 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
302
334
  #Return: specified schedule data
303
335
  return df
304
336
 
305
- def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = SCHEDULE_PATH, verbose = False, sources = False, errors = False):
306
- #Given season, scrape all play-by-play occuring within the season
307
- # param 'season' - NHL season to scrape
308
- # param 'split_shifts' - boolean which splits pbp and shift events if true
309
- # param 'remove' - list of events to remove from final dataframe
310
- # param 'start' - Start date in season
311
- # param 'end' - End date in season
312
- # param 'local' - boolean indicating whether to use local file to scrape game_ids
313
- # param 'local_path' - path of local file
314
- # param 'verbose' - boolean which adds additional event info if true
315
- # param 'sources - boolean scraping the html and json sources to a master directory if true
316
- # param 'errors' - boolean returning game ids which did not scrape if true
317
-
337
+ def nhl_scrape_season(season:int, split_shifts:bool = False, season_types:list[int] = [2,3], remove:list[str] = [], start:str = '', end:str = '', local:bool=False, local_path:str = SCHEDULE_PATH, verbose:bool = False, sources:bool = False, errors:bool = False):
338
+ """
339
+ Given season, scrape all play-by-play occurring within the season.
340
+
341
+ Args:
342
+ season (int):
343
+ The NHL season formatted such as "20242025".
344
+ split_shifts (bool, optional):
345
+ If True, returns a dict with separate 'pbp' and 'shifts' DataFrames. Default is False.
346
+ season_types (List[int], optional):
347
+ List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3 respectively.
348
+ remove (List[str], optional):
349
+ List of event types to remove from the result. Default is an empty list.
350
+ start (str, optional):
351
+ The date string (MM-DD) to start the schedule scrape at. Default is a blank string.
352
+ end (str, optional):
353
+ The date string (MM-DD) to end the schedule scrape at. Default is a blank string.
354
+ local (bool, optional):
355
+ If True, use local file to retrieve schedule data. Default is False.
356
+ local_path (str, optional):
357
+ Path to the file with schedule data necessary to scrape a season's games (only relevant if local = True). Default is SCHEDULE_PATH.
358
+ verbose (bool, optional):
359
+ If True, generates extra event features (such as those required to calculate xG). Default is False.
360
+ sources (bool, optional):
361
+ If True, saves raw HTML, JSON, SHIFTS, and single-game full play-by-play to a separate folder in the working directory. Default is False.
362
+ errors (bool, optional):
363
+ If True, includes a list of game IDs that failed to scrape in the return. Default is False.
364
+
365
+ Returns:
366
+ pd.DataFrame:
367
+ If split_shifts is False, returns a single DataFrame of play-by-play data.
368
+ dict[str, pd.DataFrame]:
369
+ If split_shifts is True, returns a dictionary with keys:
370
+ - 'pbp': play-by-play events
371
+ - 'shifts': shift change events
372
+ - 'errors' (optional): list of game IDs that failed if errors=True
373
+ """
374
+
318
375
  #Determine whether to use schedule data in repository or to scrape
319
376
  if local:
320
377
  load = pd.read_csv(local_path)
321
378
  load['date'] = pd.to_datetime(load['date'])
322
-
323
- start = f'{(season[0:4] if int(start[0:2])>=9 else season[4:8])}-{int(start[0:2])}-{int(start[3:5])}'
324
- end = f'{(season[0:4] if int(end[0:2])>=9 else season[4:8])}-{int(end[0:2])}-{int(end[3:5])}'
325
-
326
- load = load.loc[(load['season'].astype(str)==season)&
379
+
380
+ if start == '' or end == '':
381
+ season_data = rs.get('https://api.nhle.com/stats/rest/en/season').json()['data']
382
+ season_data = [s for s in season_data if s['id'] == season][0]
383
+ start = season_data['startDate'][0:10]
384
+ end = season_data['endDate'][0:10]
385
+
386
+ form = '%Y-%m-%d'
387
+
388
+ #Create datetime values from dates
389
+ start = datetime.strptime(start,form)
390
+ end = datetime.strptime(end,form)
391
+
392
+ else:
393
+ start = f'{(str(season)[0:4] if int(start[0:2])>=9 else str(season)[4:8])}-{start[0:2]}-{start[3:5]}'
394
+ end = f'{(str(season)[0:4] if int(end[0:2])>=9 else str(season)[4:8])}-{end[0:2]}-{end[3:5]}'
395
+
396
+ load = load.loc[(load['season']==season)&
327
397
  (load['season_type'].isin(season_types))&
328
398
  (load['date']>=start)&(load['date']<=end)]
329
399
 
330
- game_ids = list(load['id'].astype(str))
400
+ game_ids = load['id'].to_list()
331
401
  else:
332
402
  load = nhl_scrape_schedule(season,start,end)
333
- load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
334
- game_ids = list(load['id'].astype(str))
403
+ load = load.loc[(load['season']==season)&(load['season_type'].isin(season_types))]
404
+ game_ids = load['id'].to_list()
335
405
 
336
406
  #If no games found, terminate the process
337
407
  if not game_ids:
338
408
  print('No games found for dates in season...')
339
409
  return ""
340
410
 
341
- print(f"Scraping games from {season[0:4]}-{season[4:8]} season...")
411
+ print(f"Scraping games from {str(season)[0:4]}-{str(season)[4:8]} season...")
342
412
  start = time.perf_counter()
343
413
 
344
414
  #Perform scrape
@@ -354,11 +424,22 @@ def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove
354
424
  #Return: Complete pbp and shifts data for specified season as well as dataframe of game_ids which failed to return data
355
425
  return data
356
426
 
357
- def nhl_scrape_seasons_info(seasons = []):
358
- #Returns info related to NHL seasons (by default, all seasons are included)
427
+ def nhl_scrape_seasons_info(seasons:list[int] = []):
428
+ """
429
+ Returns info related to NHL seasons (by default, all seasons are included)
430
+ Args:
431
+ seasons (List[int], optional):
432
+ List of NHL seasons to include, each formatted such as 20242025. Default is an empty list (all seasons are included).
433
+
434
+ Returns:
435
+ pd.DataFrame:
436
+ A DataFrame containing the information for requested seasons.
437
+ """
438
+
439
+ #
359
440
  # param 'season' - list of seasons to include
360
441
 
361
- print("Scraping info for seasons: " + str(seasons))
442
+ print(f'Scraping info for seasons: {seasons}')
362
443
  api = "https://api.nhle.com/stats/rest/en/season"
363
444
  info = "https://api-web.nhle.com/v1/standings-season"
364
445
  data = rs.get(api).json()['data']
@@ -374,12 +455,20 @@ def nhl_scrape_seasons_info(seasons = []):
374
455
  else:
375
456
  return df.sort_values(by=['id'])
376
457
 
377
- def nhl_scrape_standings(arg = "now", season_type = 2):
378
- #Returns standings
379
- # param 'arg' - by default, this is "now" returning active NHL standings. May also be a specific date formatted as YYYY-MM-DD, a season (scrapes the last standings date for the season) or a year (for playoffs).
380
- # param 'season_type' - by default, this scrapes the regular season standings. If set to 3, it returns the playoff bracket for the specified season
458
+ def nhl_scrape_standings(arg:str = "now", season_type:int = 2):
459
+ """
460
+ Returns standings or playoff bracket
461
+ Args:
462
+ arg (str, optional):
463
+ Date formatted as 'YYYY-MM-DD' to scrape standings for specific date or 'now' for current standings. Default is 'now'.
464
+ season_type (int, optional):
465
+ Part of season to scrape. If 3 (playoffs) then scrape the playoff bracket for the season implied by arg. When arg = 'now' this is ignored. Default is 2.
466
+
467
+ Returns:
468
+ pd.DataFrame:
469
+ A DataFrame containing the standings information (or playoff bracket).
470
+ """
381
471
 
382
- #arg param is ignored when set to "now" if season_type param is 3
383
472
  if season_type == 3:
384
473
  if arg == "now":
385
474
  arg = NEW
@@ -404,9 +493,19 @@ def nhl_scrape_standings(arg = "now", season_type = 2):
404
493
 
405
494
  return pd.json_normalize(data)
406
495
 
407
- def nhl_scrape_roster(season):
408
- #Given a nhl season, return rosters for all participating teams
409
- # param 'season' - NHL season to scrape
496
+ def nhl_scrape_roster(season: int):
497
+ """
498
+ Returns rosters for all teams in a given season.
499
+
500
+ Args:
501
+ season (int):
502
+ The NHL season formatted such as "20242025".
503
+
504
+ Returns:
505
+ pd.DataFrame:
506
+ A DataFrame containing the rosters for all teams in the specified season.
507
+ """
508
+
410
509
  print("Scrpaing rosters for the "+ season + "season...")
411
510
  teaminfo = pd.read_csv(info_path)
412
511
 
@@ -435,8 +534,18 @@ def nhl_scrape_roster(season):
435
534
 
436
535
  return pd.concat(rosts)
437
536
 
438
- def nhl_scrape_prospects(team):
439
- #Given team abbreviation, retreive current team prospects
537
+ def nhl_scrape_prospects(team:str):
538
+ """
539
+ Returns prospects for specified team
540
+
541
+ Args:
542
+ team (str):
543
+ Three character team abbreviation such as 'BOS'
544
+
545
+ Returns:
546
+ pd.DataFrame:
547
+ A DataFrame containing the prospect data for the specified team.
548
+ """
440
549
 
441
550
  api = f'https://api-web.nhle.com/v1/prospects/{team}'
442
551
 
@@ -452,10 +561,20 @@ def nhl_scrape_prospects(team):
452
561
  #Return: team prospects
453
562
  return prospects
454
563
 
455
- def nhl_scrape_team_info(country = False):
456
- #Given option to return franchise or country, return team information
564
+ def nhl_scrape_team_info(country:bool = False):
565
+ """
566
+ Returns team or country information from the NHL API.
567
+
568
+ Args:
569
+ country (bool, optional):
570
+ If True, returns country information instead of NHL team information.
457
571
 
458
- print('Scraping team information...')
572
+ Returns:
573
+ pd.DataFrame:
574
+ A DataFrame containing team or country information from the NHL API.
575
+ """
576
+
577
+ print(f'Scraping {'country' if country else 'team'} information...')
459
578
  api = f'https://api.nhle.com/stats/rest/en/{'country' if country else 'team'}'
460
579
 
461
580
  data = pd.json_normalize(rs.get(api).json()['data'])
@@ -467,8 +586,19 @@ def nhl_scrape_team_info(country = False):
467
586
 
468
587
  return data.sort_values(by=(['country3Code','countryCode','iocCode','countryName'] if country else ['fullName','triCode','id']))
469
588
 
470
- def nhl_scrape_player_data(player_ids):
471
- #Given player id, return player information
589
+ def nhl_scrape_player_data(player_ids:list[int]):
590
+ """
591
+ Returns player data for specified players.
592
+
593
+ Args:
594
+ player_ids (list[int]):
595
+ List of NHL API player IDs to retrieve information for.
596
+
597
+ Returns:
598
+ pd.DataFrame:
599
+ A DataFrame containing player data for specified players.
600
+ """
601
+
472
602
  infos = []
473
603
  for player_id in player_ids:
474
604
  player_id = int(player_id)
@@ -489,15 +619,28 @@ def nhl_scrape_player_data(player_ids):
489
619
  else:
490
620
  return pd.DataFrame()
491
621
 
492
- def nhl_scrape_draft_rankings(arg = 'now', category = ''):
493
- #Given url argument for timeframe and prospect category, return draft rankings
494
- #Category 1 is North American Skaters
495
- #Category 2 is International Skaters
496
- #Category 3 is North American Goalie
497
- #Category 4 is International Goalie
622
+ def nhl_scrape_draft_rankings(arg:str = 'now', category:int = 0):
623
+ """
624
+ Returns draft rankings
625
+ Args:
626
+ arg (str, optional):
627
+ Date formatted as 'YYYY-MM-DD' to scrape draft rankings for specific date or 'now' for current draft rankings. Default is 'now'.
628
+ category (int, optional):
629
+ Category number for prospects. When arg = 'now' this does not apply.
630
+
631
+ - Category 1 is North American Skaters.
632
+ - Category 2 is International Skaters.
633
+ - Category 3 is North American Goalie.
634
+ - Category 4 is International Goalie
635
+
636
+ Default is 0 (all prospects).
637
+ Returns:
638
+ pd.DataFrame:
639
+ A DataFrame containing draft rankings.
640
+ """
498
641
 
499
642
  #Player category only applies when requesting a specific season
500
- api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}" if category != "" else f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
643
+ api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}" if category > 0 else f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
501
644
  data = pd.json_normalize(rs.get(api).json()['rankings'])
502
645
 
503
646
  #Add player name columns
@@ -506,10 +649,16 @@ def nhl_scrape_draft_rankings(arg = 'now', category = ''):
506
649
  #Return: prospect rankings
507
650
  return data
508
651
 
509
- def nhl_apply_xG(pbp):
510
- #Given play-by-play data, return this data with xG-related columns
511
-
512
- #param 'pbp' - play-by-play data
652
+ def nhl_apply_xG(pbp: pd.DataFrame):
653
+ """
654
+ Given play-by-play data, return this data with xG-related columns
655
+ Args:
656
+ pbp (pd.DataFrame):
657
+ A DataFrame containing play-by-play data generated within the WSBA Hockey package.
658
+ Returns:
659
+ pd.DataFrame:
660
+ A DataFrame containing input play-by-play data with xG column.
661
+ """
513
662
 
514
663
  print(f'Applying WSBA xG to model with seasons: {pbp['season'].drop_duplicates().to_list()}')
515
664
 
@@ -518,7 +667,7 @@ def nhl_apply_xG(pbp):
518
667
 
519
668
  return pbp
520
669
 
521
- def nhl_shooting_impacts(agg,type):
670
+ def shooting_impacts(agg, type):
522
671
  #Given stats table generated from the nhl_calculate_stats function, return table with shot impacts
523
672
  #Only 5v5 is supported as of now
524
673
 
@@ -868,7 +1017,7 @@ def nhl_shooting_impacts(agg,type):
868
1017
  #Return: skater stats with shooting impacts
869
1018
  return df
870
1019
 
871
- def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,roster_path=DEFAULT_ROSTER,shot_impact=False):
1020
+ def nhl_calculate_stats(pbp:pd.DataFrame, type:Literal['skater','goalie','team'], season_types:list[int], game_strength: Union[Literal['all'], list[str]], split_game:bool = False, roster_path:str = DEFAULT_ROSTER, shot_impact:bool = False):
872
1021
  #Given play-by-play, seasonal information, game_strength, rosters, and xG model, return aggregated stats
873
1022
  # param 'pbp' - play-by-play dataframe
874
1023
  # param 'type' - type of stats to calculate ('skater', 'goalie', or 'team')
@@ -879,6 +1028,33 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
879
1028
  # param 'roster_path' - path to roster file
880
1029
  # param 'shot_impact' - boolean determining if the shot impact model will be applied to the dataset
881
1030
 
1031
+ """
1032
+ Given play-by-play data, seasonal information, game strength, rosters, and an xG model,
1033
+ return aggregated statistics at the skater, goalie, or team level.
1034
+
1035
+ Args:
1036
+ pbp (pd.DataFrame):
1037
+ A DataFrame containing play-by-play event data.
1038
+ type (Literal['skater', 'goalie', 'team']):
1039
+ Type of statistics to calculate. Must be one of 'skater', 'goalie', or 'team'.
1040
+ season (int):
1041
+ The NHL season formatted such as "20242025".
1042
+ season_types (List[int]):
1043
+ List of season_types to include in the aggregated statistics (regular season and playoff games are 2 and 3 respectively).
1044
+ game_strength (str or list[str]):
1045
+ List of game strength states to include (e.g., ['5v5','5v4','4v5']), or the string 'all'.
1046
+ split_game (bool, optional):
1047
+ If True, aggregates stats separately for each game; otherwise, stats are aggregated across all games. Default is False.
1048
+ roster_path (str, optional):
1049
+ File path to the roster data used for mapping players and teams.
1050
+ shot_impact (bool, optional):
1051
+ If True, applies shot impact metrics to the stats DataFrame. Default is False.
1052
+
1053
+ Returns:
1054
+ pd.DataFrame:
1055
+ A DataFrame containing the aggregated statistics according to the selected parameters.
1056
+ """
1057
+
882
1058
  print(f"Calculating statistics for all games in the provided play-by-play data at {game_strength} for {type}s...\nSeasons included: {pbp['season'].drop_duplicates().to_list()}...")
883
1059
  start = time.perf_counter()
884
1060
 
@@ -970,7 +1146,7 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
970
1146
 
971
1147
  #Apply shot impacts if necessary
972
1148
  if shot_impact:
973
- complete = nhl_shooting_impacts(complete,'goalie')
1149
+ complete = shooting_impacts(complete,'goalie')
974
1150
 
975
1151
  end = time.perf_counter()
976
1152
  length = end-start
@@ -1014,7 +1190,7 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
1014
1190
  ]+[f'{stat}/60' for stat in PER_SIXTY[11:len(PER_SIXTY)]]]
1015
1191
  #Apply shot impacts if necessary
1016
1192
  if shot_impact:
1017
- complete = nhl_shooting_impacts(complete,'team')
1193
+ complete = shooting_impacts(complete,'team')
1018
1194
 
1019
1195
  end = time.perf_counter()
1020
1196
  length = end-start
@@ -1117,7 +1293,7 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
1117
1293
 
1118
1294
  #Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
1119
1295
  if shot_impact:
1120
- complete = nhl_shooting_impacts(complete,'skater')
1296
+ complete = shooting_impacts(complete,'skater')
1121
1297
 
1122
1298
  end = time.perf_counter()
1123
1299
  length = end-start
@@ -1125,16 +1301,34 @@ def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,ros
1125
1301
 
1126
1302
  return complete
1127
1303
 
1128
- def nhl_plot_skaters_shots(pbp,skater_dict,strengths,marker_dict=event_markers,onice = 'indv',title = True,legend=False):
1129
- #Returns dict of plots for specified skaters
1130
- # param 'pbp' - pbp to plot data
1131
- # param 'skater_dict' - skaters to plot shots for (format: {'Patrice Bergeron':['20242025','BOS']})
1132
- # param 'strengths' - strengths to include in plotting
1133
- # param 'marker_dict' - dict with markers to use for events
1134
- # param 'onice' - can set which shots to include in plotting for the specified skater ('indv', 'for', 'against')
1135
- # param 'title' - bool including title when true
1136
- # param 'legend' - bool which includes legend if true
1137
- # param 'xg' - xG model to apply to pbp for plotting
1304
+ def nhl_plot_skaters_shots(pbp:pd.DataFrame, skater_dict:dict, strengths:Union[Literal['all'], list[str]], marker_dict:dict = event_markers, onice:Literal['indv','for','against'] = ['indv'], title:bool = True, legend:bool = False):
1305
+ """
1306
+ Return a dictionary of shot plots for the specified skaters.
1307
+
1308
+ Args:
1309
+ pbp (pd.DataFrame):
1310
+ A DataFrame containing play-by-play event data to be visualized.
1311
+ skater_dict (dict[str, list[str]]):
1312
+ Dictionary of skaters to plot, where each key is a player name and the value is a list
1313
+ with season and team info (e.g., {'Patrice Bergeron': ['20242025', 'BOS']}).
1314
+ strengths (str or list[str]):
1315
+ Game strength states to include: either 'all' or a list of states (e.g., ['5v5','5v4','4v5']).
1316
+ marker_dict (dict[str, dict], optional):
1317
+ Dictionary of event types mapped to marker styles used in plotting.
1318
+ onice (Literal['indv', 'for', 'against'], optional):
1319
+ Determines which shot events to include for the player:
1320
+ - 'indv': only the player's own shots,
1321
+ - 'for': shots taken by the player's team while they are on ice,
1322
+ - 'against': shots taken by the opposing team while the player is on ice.
1323
+ title (bool, optional):
1324
+ Whether to include a plot title.
1325
+ legend (bool, optional):
1326
+ Whether to include a legend on the plots.
1327
+
1328
+ Returns:
1329
+ dict[str, matplotlib.figure.Figure]:
1330
+ A dictionary mapping each skater’s name to their corresponding matplotlib shot plot figure.
1331
+ """
1138
1332
 
1139
1333
  print(f'Plotting the following skater shots: {skater_dict}...')
1140
1334
 
@@ -1149,15 +1343,28 @@ def nhl_plot_skaters_shots(pbp,skater_dict,strengths,marker_dict=event_markers,o
1149
1343
  #Return: dict of plotted skater shot charts
1150
1344
  return skater_plots
1151
1345
 
1152
- def nhl_plot_games(pbp,events,strengths,game_ids='all',marker_dict=event_markers,team_colors={'away':'primary','home':'primary'},legend=False):
1153
- #Returns dict of plots for specified games
1154
- # param 'pbp' - pbp to plot data
1155
- # param 'events' - type of events to plot
1156
- # param 'strengths' - strengths to include in plotting
1157
- # param 'game_ids' - games to plot (list if not set to 'all')
1158
- # param 'marker_dict' - dict with colors to use for events
1159
- # param 'legend' - bool which includes legend if true
1160
- # param 'xg' - xG model to apply to pbp for plotting
1346
+ def nhl_plot_games(pbp:pd.DataFrame, events:list[str], strengths:Union[Literal['all'], list[str]], game_ids: Union[Literal['all'], list[int]] = 'all', marker_dict:dict = event_markers, team_colors:dict = {'away':'primary','home':'primary'}, legend:bool =False):
1347
+ """
1348
+ Returns a dictionary of event plots for the specified games.
1349
+
1350
+ Args:
1351
+ pbp (pd.DataFrame):
1352
+ A DataFrame containing play-by-play event data.
1353
+ events (list[str]):
1354
+ List of event types to include in the plot (e.g., ['shot-on-goal', 'goal']).
1355
+ strengths (str or list[str]):
1356
+ Game strength states to include: either 'all' or a list of states (e.g., ['5v5','5v4','4v5']).
1357
+ game_ids (str or list[int]):
1358
+ List of game IDs to plot. If set to 'all', plots will be generated for all games in the DataFrame.
1359
+ marker_dict (dict[str, dict]):
1360
+ Dictionary mapping event types to marker styles and/or colors used in plotting.
1361
+ legend (bool):
1362
+ Whether to include a legend on the plots.
1363
+
1364
+ Returns:
1365
+ dict[int, matplotlib.figure.Figure]:
1366
+ A dictionary mapping each game ID to its corresponding matplotlib event plot figure.
1367
+ """
1161
1368
 
1162
1369
  #Find games to scrape
1163
1370
  if game_ids == 'all':
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wsba_hockey
3
- Version: 1.1.6
3
+ Version: 1.1.7
4
4
  Summary: WeakSide Breakout's complete Python package of access to hockey data, primarily including the scraping of National Hockey League schedule, play-by-play, and shifts information.
5
5
  Author-email: Owen Singh <owenbksingh@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/owensingh38/wsba_hockey/
@@ -1,8 +1,8 @@
1
- wsba_hockey/__init__.py,sha256=yfr8z5PA503iaIQv30ngancwT_WnsuK-tZETKlHcI0M,377
1
+ wsba_hockey/__init__.py,sha256=qye0rq22KeaUzBPH__pqjBA_igwsmHemOAbaY_G2tNY,356
2
2
  wsba_hockey/data_pipelines.py,sha256=SITapG3nbea6-_EsXujMW2JBQxtRaQ33XMcE6ohn2Ko,10853
3
3
  wsba_hockey/workspace.py,sha256=MwuyqyLW0dHNa06WEm60RkvbFoCn8LBXhnki66V-ttY,954
4
- wsba_hockey/wsba_main.py,sha256=Ucies8d27gWtzf8xprnu7hEcqGGHvOza8HCE0O80X-s,54031
5
- wsba_hockey/api/api/index.py,sha256=tABWg5cYCY-fPaNJ6W_bMJKEYrjn93YGy84VlkHzIXA,6853
4
+ wsba_hockey/wsba_main.py,sha256=N5i1y1QtP4jsnsSNKIR_lcAjl_V8oqAlH2YRNTWSUZk,62347
5
+ wsba_hockey/api/api/index.py,sha256=r2keq105Ve8V0JAsSZMIPs9geVHX2Fuxyi4MqnzCt48,6537
6
6
  wsba_hockey/evidence/weakside-breakout/node_modules/duckdb/vendor.py,sha256=lmu0TB0rIYkAuV9-csFJgW-1hJojso_-EZpEoorUUKM,4949
7
7
  wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/flatted.py,sha256=ke8FuEflns-WlphCcQ9CC0qJqWqX3zEEuak74o6rgE8,3879
8
8
  wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/test.py,sha256=uTOn6HJd7KeY_PTRvvufv60dmvON3KWp3nnqACj8IlA,2129
@@ -134,15 +134,15 @@ wsba_hockey/flask/app.py,sha256=J51iA65h9xyJfLgdH0h2sVSbfIR7xgGd2Oy8bJsmpAk,1873
134
134
  wsba_hockey/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
135
  wsba_hockey/tools/agg.py,sha256=OkIYd-ApvGVYe2JJLOI21jnDIN5LH8nkeH7eo0reWFI,23364
136
136
  wsba_hockey/tools/plotting.py,sha256=81hBaM7tcwUNB4-tovPn7QreOUz6B2NuI_SR4-djVSk,6029
137
- wsba_hockey/tools/scraping.py,sha256=h6C016U0qmNQpHWMh7Xvn3ud57zKzRbRQ06Odl-rC_I,52573
137
+ wsba_hockey/tools/scraping.py,sha256=-sv29886AWAMhhpJ14282WTolBZni8eXBvj4OtNVY-U,52863
138
138
  wsba_hockey/tools/xg_model.py,sha256=nOr_2RBijLgPmJ0TTs4wbSsORYmRqWCKRjLKDm7sAhI,18342
139
139
  wsba_hockey/tools/archive/old_scraping.py,sha256=hEjMI1RtfeZnf0RBiJFI38oXkLZ3WofeH5xqcF4pzgM,49585
140
140
  wsba_hockey/tools/utils/__init__.py,sha256=vccXhOtzARoR99fmEWU1OEI3qCIdQ9Z42AlRA_BUhrs,114
141
141
  wsba_hockey/tools/utils/config.py,sha256=D3Uk05-YTyrhfReMTTLfNI3HN_rON2uo_CDE9oER3Lg,351
142
142
  wsba_hockey/tools/utils/save_pages.py,sha256=CsyL_0n-b-4pJoUauwU3HpnCO6n69-RlBMJQBd_qGDc,4979
143
143
  wsba_hockey/tools/utils/shared.py,sha256=dH_JwZfia5fib8rksy5sW-mBp0pluBPvw37Vdr8Kap0,14211
144
- wsba_hockey-1.1.6.dist-info/licenses/LICENSE,sha256=Nr_Um1Pd5FQJTWWgm7maZArdtYMbDhzXYSwyJIZDGik,1114
145
- wsba_hockey-1.1.6.dist-info/METADATA,sha256=2CLs8qTA1iS8P7ToF4My86KkMRrt5zYoX9ynbQTS4zk,3566
146
- wsba_hockey-1.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
147
- wsba_hockey-1.1.6.dist-info/top_level.txt,sha256=acU7s3x-RZC1zGiqCOmO0g267iqCg34lzIfdmYxxGmQ,12
148
- wsba_hockey-1.1.6.dist-info/RECORD,,
144
+ wsba_hockey-1.1.7.dist-info/licenses/LICENSE,sha256=Nr_Um1Pd5FQJTWWgm7maZArdtYMbDhzXYSwyJIZDGik,1114
145
+ wsba_hockey-1.1.7.dist-info/METADATA,sha256=O_B4EEwc9nbOpnAO8KVoA1Vv-mJHIUEuyqNP_OLrx7s,3566
146
+ wsba_hockey-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
147
+ wsba_hockey-1.1.7.dist-info/top_level.txt,sha256=acU7s3x-RZC1zGiqCOmO0g267iqCg34lzIfdmYxxGmQ,12
148
+ wsba_hockey-1.1.7.dist-info/RECORD,,