wsba-hockey 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wsba_hockey/wsba_main.py CHANGED
@@ -1,79 +1,230 @@
1
1
  import requests as rs
2
2
  import pandas as pd
3
3
  import numpy as np
4
- from datetime import datetime, timedelta
5
- from wsba_hockey.tools.scraping import *
4
+ from datetime import datetime, timedelta, date
5
+ import time
6
+ import random
7
+ from tools.scraping import *
8
+ from tools.xg_model import *
9
+ from tools.agg import *
10
+ from tools.plotting import *
6
11
 
7
12
  ### WSBA HOCKEY ###
8
13
  ## Provided below are all integral functions in the WSBA Hockey Python package. ##
9
14
 
15
+ ## GLOBAL VARIABLES ##
16
+ seasons = [
17
+ '20072008',
18
+ '20082009',
19
+ '20092010',
20
+ '20102011',
21
+ '20112012',
22
+ '20122013',
23
+ '20132014',
24
+ '20142015',
25
+ '20152016',
26
+ '20162017',
27
+ '20172018',
28
+ '20182019',
29
+ '20192020',
30
+ '20202021',
31
+ '20212022',
32
+ '20222023',
33
+ '20232024',
34
+ '20242025'
35
+ ]
36
+
37
+ convert_seasons = {'2007': '20072008',
38
+ '2008': '20082009',
39
+ '2009': '20092010',
40
+ '2010': '20102011',
41
+ '2011': '20112012',
42
+ '2012': '20122013',
43
+ '2013': '20132014',
44
+ '2014': '20142015',
45
+ '2015': '20152016',
46
+ '2016': '20162017',
47
+ '2017': '20172018',
48
+ '2018': '20182019',
49
+ '2019': '20192020',
50
+ '2020': '20202021',
51
+ '2021': '20212022',
52
+ '2022': '20222023',
53
+ '2023': '20232024',
54
+ '2024': '20242025'}
55
+
56
+ convert_team_abbr = {'L.A':'LAK',
57
+ 'N.J':'NJD',
58
+ 'S.J':'SJS',
59
+ 'T.B':'TBL',
60
+ 'PHX':'ARI'}
61
+
62
+ per_sixty = ['Fi','xGi','Gi','A1','A2','P1','P','FF','FA','xGF','xGA','GF','GA']
63
+
64
+ #Some games in the API are specifically known to cause errors in scraping.
65
+ #This list is updated as frequently as necessary
66
+ known_probs ={
67
+ '2007020011':'Missing shifts data for game between Chicago and Minnesota.',
68
+ '2007021178':'Game between the Bruins and Sabres is missing data after the second period, for some reason.',
69
+ '2008020259':'HTML data is completely missing for this game.',
70
+ '2008020409':'HTML data is completely missing for this game.',
71
+ '2008021077':'HTML data is completely missing for this game.',
72
+ '2009020081':'HTML pbp for this game between Pittsburgh and Carolina is missing all but the period start and first faceoff events, for some reason.',
73
+ '2009020658':'Missing shifts data for game between New York Islanders and Dallas.',
74
+ '2009020885':'Missing shifts data for game between Sharks and Blue Jackets.',
75
+ '2010020124':'Game between Capitals and Hurricanes is sporadically missing player on-ice data',
76
+ '2013020971':'On March 10th, 2014, Stars forward Rich Peverley suffered from a cardiac episode midgame and as a result, the remainder of the game was postponed. \nThe game resumed on April 9th, and the only goal scorer in the game, Blue Jackets forward Nathan Horton, did not appear in the resumed game due to injury. Interestingly, Horton would never play in the NHL again.',
77
+ '2018021133':'Game between Lightning and Capitals has incorrectly labeled event teams (i.e. WSH TAKEAWAY - #71 CIRELLI (Cirelli is a Tampa Bay skater in this game)).',
78
+ '2019020876':'Due to the frightening collapse of Blues defensemen Jay Bouwmeester, a game on February 2nd, 2020 between the Ducks and Blues was postponed. \nWhen the game resumed, Ducks defensemen Hampus Lindholm, who assisted on a goal in the inital game, did not play in the resumed match.'
79
+ }
80
+
81
+ name_change = {
82
+ "":"",
83
+ }
84
+
85
+ shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
86
+
87
+ new = 2024
88
+
89
+ standings_end = {
90
+ '20072008':'04-06',
91
+ '20082009':'04-12',
92
+ '20092010':'04-11',
93
+ '20102011':'04-10',
94
+ '20112012':'04-07',
95
+ '20122013':'04-28',
96
+ '20132014':'04-13',
97
+ '20142015':'04-11',
98
+ '20152016':'04-10',
99
+ '20162017':'04-09',
100
+ '20172018':'04-08',
101
+ '20182019':'04-06',
102
+ '20192020':'03-11',
103
+ '20202021':'05-19',
104
+ '20212022':'04-01',
105
+ '20222023':'04-14',
106
+ '20232024':'04-18',
107
+ '20242025':'04-17'
108
+ }
109
+
10
110
  ## SCRAPE FUNCTIONS ##
11
- def nhl_scrape_game(game_ids,split_shifts = False,remove = ['period-start','period-end','challenge','stoppage']):
111
+ def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','period-end','challenge','stoppage'],verbose = False, errors = False):
12
112
  #Given a set of game_ids (NHL API), return complete play-by-play information as requested
13
- # param 'game_ids' - NHL game ids
113
+ # param 'game_ids' - NHL game ids (or list formatted as ['random', num_of_games, start_year, end_year])
14
114
  # param 'split_shifts' - boolean which splits pbp and shift events if true
15
115
  # param 'remove' - list of events to remove from final dataframe
116
+ # param 'xg' - xG model to apply to pbp for aggregation
117
+ # param 'verbose' - boolean which adds additional event info if true
118
+ # param 'errors' - boolean returning game ids which did not scrape if true
16
119
 
17
120
  pbps = []
18
- for game_id in game_ids:
19
- print("Scraping data from game " + str(game_id) + "...")
20
-
21
- game_id = str(game_id)
22
- season = str(game_id[:4])+str(int(game_id[:4])+1)
23
-
24
- api = "https://api-web.nhle.com/v1/gamecenter/"+game_id+"/play-by-play"
25
- home_log = "https://www.nhl.com/scores/htmlreports/"+season+"/TH"+str(game_id)[-6:]+".HTM"
26
- away_log = "https://www.nhl.com/scores/htmlreports/"+season+"/TV"+str(game_id)[-6:]+".HTM"
27
-
28
- #Retrieve raw data
121
+ if game_ids[0] == 'random':
122
+ #Randomize selection of game_ids
123
+ #Some ids returned may be invalid (for example, 2020021300)
124
+ num = game_ids[1]
29
125
  try:
30
- json = rs.get(api).json()
31
- home_shift = rs.get(home_log).content
32
- away_shift = rs.get(away_log).content
33
-
34
- if int(game_id[:4]) < 2010:
35
- print()
36
- raise Exception('Games before 2010-2011 are not available yet.')
37
- else:
38
- #Parse Json
39
- pbp = parse_json(json)
40
-
41
- #Create shifts
42
- #If no shifts data exists only export play-by-play
126
+ start = game_ids[2]
127
+ except:
128
+ start = 2007
129
+ try:
130
+ end = game_ids[3]
131
+ except:
132
+ end = (date.today().year)-1
133
+
134
+ game_ids = []
135
+ i = 0
136
+ print("Finding valid, random game ids...")
137
+ while i is not num:
138
+ print(f"\rGame IDs found in range {start}-{end}: {i}/{num}",end="")
139
+ rand_year = random.randint(start,end)
140
+ rand_season_type = random.randint(2,3)
141
+ rand_game = random.randint(1,1312)
142
+
143
+ #Ensure id validity (and that number of scraped games is equal to specified value)
144
+ rand_id = f'{rand_year}{rand_season_type:02d}{rand_game:04d}'
43
145
  try:
44
- shifts = fix_names(combine_shifts(home_shift,away_shift,json,game_id),json)
45
- data = combine_data(pbp,shifts)
146
+ rs.get(f"https://api-web.nhle.com/v1/gamecenter/{rand_id}/play-by-play").json()
147
+ i += 1
148
+ game_ids.append(rand_id)
149
+ except:
150
+ continue
151
+
152
+ print(f"\rGame IDs found in range {start}-{end}: {i}/{num}")
46
153
 
47
- except:
48
- print(f"Cannot find or create shifts for game {game_id}...")
49
- data = combine_data(pbp,pd.DataFrame(columns=get_col()))
154
+ #Scrape each game
155
+ #Track Errors
156
+ error_ids = []
157
+ for game_id in game_ids:
158
+ print("Scraping data from game " + str(game_id) + "...",end="")
159
+ start = time.perf_counter()
50
160
 
51
- #Combine and append data to list
161
+ try:
162
+ #Retrieve data
163
+ info = get_game_info(game_id)
164
+ data = combine_data(info)
165
+
166
+ #Append data to list
52
167
  pbps.append(data)
53
- except:
54
- print(f"Unable to scrape game {game_id}. Ensure the ID is properly inputted and formatted.")
55
- pbps.append(pd.DataFrame())
56
168
 
169
+ end = time.perf_counter()
170
+ secs = end - start
171
+ print(f" finished in {secs:.2f} seconds.")
172
+
173
+ except:
174
+ #Games such as the all-star game and pre-season games will incur this error
175
+ #Other games have known problems
176
+ if game_id in known_probs.keys():
177
+ print(f"\nGame {game_id} has a known problem: {known_probs[game_id]}")
178
+ else:
179
+ print(f"\nUnable to scrape game {game_id}. Ensure the ID is properly inputted and formatted.")
180
+
181
+ #Track error
182
+ error_ids.append(game_id)
183
+
57
184
  #Add all pbps together
185
+ if len(pbps) == 0:
186
+ print("\rNo data returned.")
187
+ return pd.DataFrame()
58
188
  df = pd.concat(pbps)
59
189
 
190
+ #If verbose is true features required to calculate xG are added to dataframe
191
+ if verbose:
192
+ df = prep_xG_data(df)
193
+ else:
194
+ ""
195
+
196
+ #Print final message
197
+ if len(error_ids) > 0:
198
+ print(f'\rScrape of provided games finished.\nThe following games failed to scrape: {error_ids}')
199
+ else:
200
+ print('\rScrape of provided games finished.')
201
+
60
202
  #Split pbp and shift events if necessary
61
203
  #Return: complete play-by-play with data removed or split as necessary
62
- try: df['event_type']
63
- except KeyError:
64
- raise KeyError("No data is available to return.")
65
-
204
+
66
205
  if split_shifts == True:
67
- if len(remove) == 0:
68
- remove = ['change']
206
+ remove.append('change')
69
207
 
70
208
  #Return: dict with pbp and shifts seperated
71
- return {"pbp":df.loc[~df['event_type'].isin(remove)].dropna(axis=1,how='all'),
72
- "shifts":df.loc[df['event_type']=='change'].dropna(axis=1,how='all')
209
+ pbp_dict = {"pbp":df.loc[~df['event_type'].isin(remove)],
210
+ "shifts":df.loc[df['event_type']=='change']
73
211
  }
212
+
213
+ if errors:
214
+ pbp_dict.update({'errors':error_ids})
215
+
216
+ return pbp_dict
74
217
  else:
75
218
  #Return: all events that are not set for removal by the provided list
76
- return df.loc[~df['event_type'].isin(remove)]
219
+ pbp = df.loc[~df['event_type'].isin(remove)]
220
+
221
+ if errors:
222
+ pbp_dict = {'pbp':pbp,
223
+ 'errors':error_ids}
224
+
225
+ return pbp_dict
226
+ else:
227
+ return pbp
77
228
 
78
229
  def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
79
230
  #Given a season, return schedule data
@@ -117,16 +268,18 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
117
268
  "id": [gameWeek[i]['id']],
118
269
  "season": [gameWeek[i]['season']],
119
270
  "season_type":[gameWeek[i]['gameType']],
271
+ "away_team_abbr":[gameWeek[i]['awayTeam']['abbrev']],
272
+ "home_team_abbr":[gameWeek[i]['homeTeam']['abbrev']],
120
273
  "gamecenter_link":[gameWeek[i]['gameCenterLink']]
121
274
  }))
122
275
 
123
276
  #Concatenate all games
124
277
  df = pd.concat(game)
125
278
 
126
- #Return: specificed schedule data (excluding preseason games)
127
- return df.loc[df['season_type']>1]
279
+ #Return: specificed schedule data
280
+ return df
128
281
 
129
- def nhl_scrape_season(season,split_shifts = False, remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = "schedule/schedule.csv"):
282
+ def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = "schedule/schedule.csv", verbose = False, errors = False):
130
283
  #Given season, scrape all play-by-play occuring within the season
131
284
  # param 'season' - NHL season to scrape
132
285
  # param 'split_shifts' - boolean which splits pbp and shift events if true
@@ -135,50 +288,61 @@ def nhl_scrape_season(season,split_shifts = False, remove = ['period-start','per
135
288
  # param 'end' - End date in season
136
289
  # param 'local' - boolean indicating whether to use local file to scrape game_ids
137
290
  # param 'local_path' - path of local file
291
+ # param 'verbose' - boolean which adds additional event info if true
292
+ # param 'errors' - boolean returning game ids which did not scrape if true
138
293
 
139
- #While the default value of local is false, schedule data is provided in the package files; enabling local will automatically find and scrape games in a specified season, saving time otherwise spent scraping a season's schedule
294
+ #Determine whether to use schedule data in repository or to scrape
140
295
  if local == True:
141
296
  load = pd.read_csv(local_path)
142
- load = load.loc[load['season'].astype(str)==season]
297
+ load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
143
298
  game_ids = list(load['id'].astype(str))
144
299
  else:
145
- game_ids = list(nhl_scrape_schedule(season,start,end)['id'].astype(str))
146
-
147
- df = []
148
- df_s = []
149
-
150
- errors = []
151
- for game_id in game_ids:
152
- try:
153
- if split_shifts == True:
154
- data = nhl_scrape_game([game_id],split_shifts=True,remove=remove)
155
- df.append(data['pbp'])
156
- df_s.append(data['shifts'])
157
- else:
158
- data = nhl_scrape_game([game_id],remove=remove)
159
- df.append(data)
300
+ load = nhl_scrape_schedule(season,start,end)
301
+ load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
302
+ game_ids = list(load['id'].astype(str))
160
303
 
161
- except:
162
- #Errors should be rare; testing of eight full-season scraped produced just one missing regular season game due to error
163
- continue
304
+ #If no games found, terminate the process
305
+ if not game_ids:
306
+ print('No games found for dates in season...')
307
+ return ""
308
+
309
+ print(f"Scraping games from {season[0:4]}-{season[4:8]} season...")
310
+ start = time.perf_counter()
164
311
 
165
- #Missing data handled as a KeyError
166
- try: pbp = pd.concat(df)
167
- except:
168
- raise KeyError("No data is available to return.")
169
-
312
+ #Perform scrape
170
313
  if split_shifts == True:
171
- try: shifts = pd.concat(df_s)
172
- except: raise KeyError("No data is available to return.")
314
+ data = nhl_scrape_game(game_ids,split_shifts=True,remove=remove,verbose=verbose,errors=errors)
173
315
  else:
174
- ""
175
-
316
+ data = nhl_scrape_game(game_ids,remove=remove,verbose=verbose,errors=errors)
317
+
318
+ end = time.perf_counter()
319
+ secs = end - start
320
+
321
+ print(f'Finished season scrape in {(secs/60)/60:.2f} hours.')
176
322
  #Return: Complete pbp and shifts data for specified season as well as dataframe of game_ids which failed to return data
177
323
  if split_shifts == True:
178
- return {"pbp":pbp,
179
- 'shifts':shifts}
324
+ pbp_dict = {'pbp':data['pbp'],
325
+ 'shifts':data['shifts']}
326
+
327
+ if errors:
328
+ pbp_dict.update({'errors':data['errors']})
329
+ return pbp_dict
180
330
  else:
181
- return pbp
331
+ pbp = data['pbp']
332
+
333
+ if errors:
334
+ pbp_dict = {'pbp':pbp,
335
+ 'errors':data['errors']}
336
+ return pbp_dict
337
+ else:
338
+ return pbp
339
+
340
+ #errors = []
341
+ #for season in seasons[10:12]:
342
+ # data = nhl_scrape_season(season,remove=[],local=True,errors=True)
343
+ # errors.append(data['errors'])
344
+ # data['pbp'].to_csv(f'pbp/csv/nhl_pbp_{season}.csv',index=False)
345
+ #print(f'Errors: {errors}')
182
346
 
183
347
  def nhl_scrape_seasons_info(seasons = []):
184
348
  #Returns info related to NHL seasons (by default, all seasons are included)
@@ -200,19 +364,38 @@ def nhl_scrape_seasons_info(seasons = []):
200
364
  else:
201
365
  return df.sort_values(by=['id'])
202
366
 
203
- def nhl_scrape_standings(arg = "now"):
367
+ def nhl_scrape_standings(arg = "now", season_type = 2):
204
368
  #Returns standings
205
- # param 'arg' - by default, this is "now" returning active NHL standings. May also be a specific date formatted as YYYY-MM-DD
206
-
207
- if arg == "now":
208
- print("Scraping standings as of now...")
369
+ # param 'arg' - by default, this is "now" returning active NHL standings. May also be a specific date formatted as YYYY-MM-DD, a season (scrapes the last standings date for the season) or a year (for playoffs).
370
+ # param 'season_type' - by default, this scrapes the regular season standings. If set to 3, it returns the playoff bracket for the specified season
371
+
372
+ #arg param is ignored when set to "now" if season_type param is 3
373
+ if season_type == 3:
374
+ if arg == "now":
375
+ arg = new
376
+
377
+ print(f"Scraping playoff bracket for date: {arg}")
378
+ api = f"https://api-web.nhle.com/v1/playoff-bracket/{arg}"
379
+
380
+ data = rs.get(api).json()['series']
381
+
382
+ return pd.json_normalize(data)
383
+
209
384
  else:
210
- print("Scraping standings for season: "+arg)
211
- api = "https://api-web.nhle.com/v1/standings/"+arg
212
-
213
- data = rs.get(api).json()['standings']
385
+ if arg == "now":
386
+ print("Scraping standings as of now...")
387
+ elif arg in seasons:
388
+ print(f'Scraping standings for season: {arg}')
389
+ else:
390
+ print(f"Scraping standings for date: {arg}")
391
+
392
+ api = f"https://api-web.nhle.com/v1/standings/{arg[4:8]}-{standings_end[arg]}"
393
+ data = rs.get(api).json()['standings']
394
+
395
+ return pd.json_normalize(data)
214
396
 
215
- return pd.json_normalize(data)
397
+ #stand = [nhl_scrape_standings(season) for season in seasons]
398
+ #pd.concat(stand).to_csv('teaminfo/nhl_standings.csv',index=False)
216
399
 
217
400
  def nhl_scrape_roster(season):
218
401
  #Given a nhl season, return rosters for all participating teams
@@ -245,51 +428,410 @@ def nhl_scrape_roster(season):
245
428
 
246
429
  return pd.concat(rosts)
247
430
 
248
- def nhl_scrape_player_info(roster):
249
- #Given compiled roster information from the nhl_scrape_roster function, return a list of all players (seperated into team and season) and associated information
250
- # param 'roster' - dataframe of roster information from the nhl_scrape_roster function
431
+ def nhl_scrape_prospects(team):
432
+ #Given team abbreviation, retreive current team prospects
251
433
 
252
- data = roster
434
+ api = f'https://api-web.nhle.com/v1/prospects/{team}'
253
435
 
254
- print("Creating player info for provided roster data...")
436
+ data = rs.get(api).json()
437
+
438
+ #Iterate through positions
439
+ players = [pd.json_normalize(data[pos]) for pos in ['forwards','defensemen','goalies']]
255
440
 
256
- alt_name_col = ['firstName.cs', 'firstName.de', 'firstName.es', 'firstName.fi', 'firstName.sk', 'firstName.sv']
257
- for i in range(len(alt_name_col)):
258
- try: data['fullName.'+str(i+1)] = np.where(data[alt_name_col[i]].notna(),(data[alt_name_col[i]].astype(str)+" "+data['lastName.default'].astype(str)).str.upper(),np.nan)
259
- except: continue
441
+ prospects = pd.concat(players)
442
+ #Add name columns
443
+ prospects['fullName'] = (prospects['firstName.default']+" "+prospects['lastName.default']).str.upper()
260
444
 
261
- name_col = ['fullName', 'fullName.1', 'fullName.2', 'fullName.3', 'fullName.4', 'fullName.5', 'fullName.6']
445
+ #Return: team prospects
446
+ return prospects
262
447
 
263
- for name in name_col:
264
- try: data[name]
265
- except:
266
- data[name] = np.nan
267
-
268
- infos = []
269
- for name in name_col:
270
- infos.append(data[[name,"id","season","team_abbr","headshot",
271
- "sweaterNumber","headingPosition",
272
- "positionCode",'shootsCatches',
273
- 'heightInInches','weightInPounds',
274
- 'birthDate','birthCountry']].rename(columns={
275
- name:'Player',
276
- 'id':"API",
277
- "season":"Season",
278
- "team_abbr":"Team",
279
- 'headshot':'Headshot',
280
- 'sweaterNumber':"Number",
281
- 'headingPosition':"Primary Position",
282
- 'positionCode':'Position',
283
- 'shootsCatches':'Handedness',
284
- 'heightInInches':'Height',
285
- 'weightInPounds':'Weight',
286
- 'birthDate':'Birthday',
287
- 'birthCountry':'Nationality'}))
288
- players = pd.concat(infos)
289
- players['Season'] = players['Season'].astype(str)
290
- players['Player'] = players['Player'].replace(r'^\s*$', np.nan, regex=True)
291
-
292
- return players.loc[players['Player'].notna()].sort_values(by=['Player','Season','Team'])
448
+ def nhl_scrape_team_info(country = False):
449
+ #Given option to return franchise or country, return team information
450
+
451
+ print('Scraping team information...')
452
+ api = f'https://api.nhle.com/stats/rest/en/{'country' if country else 'team'}'
453
+
454
+ data = pd.json_normalize(rs.get(api).json()['data'])
455
+
456
+ #Add logos if necessary
457
+ if not country:
458
+ data['logo_light'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_light.svg'
459
+ data['logo_dark'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_dark.svg'
460
+
461
+ return data.sort_values(by=(['country3Code','countryCode','iocCode','countryName'] if country else ['fullName','triCode','id']))
462
+
463
+ def nhl_scrape_player_data(player_id):
464
+ #Given player id, return player information
465
+ api = f'https://api-web.nhle.com/v1/player/{player_id}/landing'
466
+
467
+ data = pd.json_normalize(rs.get(api).json())
468
+
469
+ #Add name column
470
+ data['fullName'] = (data['firstName.default'] + " " + data['lastName.default']).str.upper()
471
+
472
+ #Return: player data
473
+ return data
474
+
475
+ def nhl_scrape_draft_rankings(arg = 'now', category = ''):
476
+ #Given url argument for timeframe and prospect category, return draft rankings
477
+ #Category 1 is North American Skaters
478
+ #Category 2 is International Skaters
479
+ #Category 3 is North American Goalie
480
+ #Category 4 is International Goalie
481
+
482
+ #Player category only applies when requesting a specific season
483
+ api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}" if category != "" else f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
484
+ data = pd.json_normalize(rs.get(api).json()['rankings'])
485
+
486
+ #Add player name columns
487
+ data['fullName'] = (data['firstName']+" "+data['lastName']).str.upper()
488
+
489
+ #Return: prospect rankings
490
+ return data
491
+
492
+ def nhl_shooting_impacts(agg,team=False):
493
+ #Given stats table generated from the nhl_calculate_stats function, return table with shot impacts
494
+ #Only 5v5 is supported as of now
495
+
496
+ #param 'agg' - stats table
497
+ #param 'team' - boolean determining if team stats are calculated instead of skater stats
498
+
499
+ #COMPOSITE IMPACT EVALUATIONS:
500
+
501
+ #SR = Shot Rate
502
+ #SQ = Shot Quality
503
+ #FN = Finishing
504
+
505
+ #I = Impact
506
+
507
+ #INDV = Individual
508
+ #OOFF = On-Ice Offense
509
+ #ODEF = On-Ice Defense
510
+
511
+ #Grouping-Metric Code: XXXX-YYI
512
+
513
+ #Goal Composition Formula
514
+ #The aggregation of goals is composed of three factors: shot rate, shot quality, and finishing
515
+ #These are represented by their own metrics in which Goals = (Fenwick*(League Average Fenwick SH%)) + ((xGoals/Fenwick - League Average Fenwick SH%)*Fenwick) + (Goals - xGoals)
516
+ def goal_comp(fenwick,xg_fen,xg,g,fsh):
517
+ rate = fenwick * fsh
518
+ qual = (xg_fen-fsh)*fenwick
519
+ fini = g-xg
520
+
521
+ return rate+qual+fini
522
+
523
+ if team:
524
+ pos = agg
525
+ for group in [('OOFF','F'),('ODEF','A')]:
526
+ #Have to set this columns for compatibility with df.apply
527
+ pos['fsh'] = pos[f'Fsh{group[1]}%']
528
+ pos['fenwick'] = pos[f'F{group[1]}/60']
529
+ pos['xg'] = pos[f'xG{group[1]}/60']
530
+ pos['g'] = pos[f'G{group[1]}/60']
531
+ pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
532
+ pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
533
+
534
+ #Find average for position in frame
535
+ avg_fen = pos['fenwick'].mean()
536
+ avg_xg = pos['xg'].mean()
537
+ avg_g = pos['g'].mean()
538
+ avg_fsh = avg_g/avg_fen
539
+ avg_xg_fen = avg_xg/avg_fen
540
+
541
+ #Calculate composite percentiles
542
+ pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
543
+ pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
544
+ pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
545
+
546
+ #Calculate shot rate, shot quality, and finishing impacts
547
+ pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
548
+ pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
549
+ pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
550
+
551
+ #Add extra metrics
552
+ pos['RushF/60'] = (pos['RushF']/pos['TOI'])*60
553
+ pos['RushA/60'] = (pos['RushA']/pos['TOI'])*60
554
+ pos['Rushes FF'] = pos['RushF/60'].rank(pct=True)
555
+ pos['Rushes FA'] = pos['RushA/60'].rank(pct=True)
556
+ pos['RushFxG/60'] = (pos['RushFxG']/pos['TOI'])*60
557
+ pos['RushAxG/60'] = (pos['RushAxG']/pos['TOI'])*60
558
+ pos['Rushes xGF'] = pos['RushFxG/60'].rank(pct=True)
559
+ pos['Rushes xGA'] = pos['RushAxG/60'].rank(pct=True)
560
+ pos['RushFG/60'] = (pos['RushFG']/pos['TOI'])*60
561
+ pos['RushAG/60'] = (pos['RushAG']/pos['TOI'])*60
562
+ pos['Rushes GF'] = pos['RushFG/60'].rank(pct=True)
563
+ pos['Rushes GA'] = pos['RushAG/60'].rank(pct=True)
564
+
565
+ #Flip against metric percentiles
566
+ pos['ODEF-SR'] = 1-pos['ODEF-SR']
567
+ pos['ODEF-SQ'] = 1-pos['ODEF-SQ']
568
+ pos['ODEF-FN'] = 1-pos['ODEF-FN']
569
+
570
+ #Return: team stats with shooting impacts
571
+ return pos.drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Season','Team'])
572
+
573
+
574
+ else:
575
+ #Remove skaters with less than 150 minutes of TOI then split between forwards and dmen
576
+ agg = agg.loc[agg['TOI']>=150]
577
+ forwards = agg.loc[agg['Position']!='D']
578
+ defensemen = agg.loc[agg['Position']=='D']
579
+
580
+ #Loop through both positions, all groupings (INDV, OOFF, and ODEF) generating impacts
581
+ for pos in [forwards,defensemen]:
582
+ for group in [('INDV','i'),('OOFF','F'),('ODEF','A')]:
583
+ #Have to set this columns for compatibility with df.apply
584
+ pos['fsh'] = pos[f'Fsh{group[1]}%']
585
+ pos['fenwick'] = pos[f'F{group[1]}/60']
586
+ pos['xg'] = pos[f'xG{group[1]}/60']
587
+ pos['g'] = pos[f'G{group[1]}/60']
588
+ pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
589
+ pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
590
+
591
+ #Find average for position in frame
592
+ avg_fen = pos['fenwick'].mean()
593
+ avg_xg = pos['xg'].mean()
594
+ avg_g = pos['g'].mean()
595
+ avg_fsh = avg_g/avg_fen
596
+ avg_xg_fen = avg_xg/avg_fen
597
+
598
+ #Calculate composite percentiles
599
+ pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
600
+ pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
601
+ pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
602
+
603
+ #Calculate shot rate, shot quality, and finishing impacts
604
+ pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
605
+ pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
606
+ pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
607
+
608
+ #Calculate On-Ice Involvement Percentiles
609
+ pos['Fenwick'] = pos['FC%'].rank(pct=True)
610
+ pos['xG'] = pos['xGC%'].rank(pct=True)
611
+ pos['Goal Factor'] = pos['GI%'].rank(pct=True)
612
+ pos['Goal Scoring'] = pos['GC%'].rank(pct=True)
613
+ pos['Rush/60'] = (pos['Rush']/pos['TOI'])*60
614
+ pos['RushxG/60'] = (pos['Rush xG']/pos['TOI'])*60
615
+ pos['Rushes xG'] = pos['RushxG/60'].rank(pct=True)
616
+ pos['Rushes FF'] = pos['Rush/60'].rank(pct=True)
617
+
618
+ #Add positions back together
619
+ complete = pd.concat([forwards,defensemen])
620
+
621
+ #Flip against metric percentiles
622
+ complete['ODEF-SR'] = 1-complete['ODEF-SR']
623
+ complete['ODEF-SQ'] = 1-complete['ODEF-SQ']
624
+ complete['ODEF-FN'] = 1-complete['ODEF-FN']
625
+
626
+ #Extraneous Values
627
+ complete['Extraneous Gi'] = complete['INDV-SRI']+complete['INDV-SQI']+complete['INDV-FNI']
628
+ complete['Extraneous xGi'] = complete['INDV-SRI']+complete['INDV-SQI']
629
+ complete['Extraneous GF'] = complete['OOFF-SRI']+complete['OOFF-SQI']+complete['OOFF-FNI']
630
+ complete['Extraneous xGF'] = complete['OOFF-SRI']+complete['OOFF-SQI']
631
+ complete['Extraneous GA'] = complete['ODEF-SRI']+complete['ODEF-SQI']+complete['ODEF-FNI']
632
+ complete['Extraneous xGA'] = complete['ODEF-SRI']+complete['ODEF-SQI']
633
+
634
+ #Goal Composites
635
+ complete['Linemate Extraneous Goals'] = complete['Extraneous GF'] - complete['Extraneous Gi']
636
+ complete['Linemate Goal Induction'] = complete['Linemate Extraneous Goals']*complete['AC%']
637
+ complete['Composite Goal Impact'] = complete['Extraneous Gi'] + complete['Linemate Goal Induction']
638
+ complete['Linemate Rel. Goal Impact'] = complete['Composite Goal Impact'] - (complete['Extraneous GF']-complete['Composite Goal Impact'])
639
+ complete['Net Goal Impact'] = complete['Extraneous GF'] - complete['Extraneous GA']
640
+ complete['Net xGoal Impact'] = complete['Extraneous xGF'] - complete['Extraneous xGA']
641
+
642
+ #Return: skater stats with shooting impacts
643
+ return complete.drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Player','Season','Team','ID'])
644
+
645
+ def nhl_calculate_stats(pbp,type,season_types,game_strength,roster_path="rosters/nhl_rosters.csv",xg="moneypuck",shot_impact=False):
646
+ #Given play-by-play, seasonal information, game_strength, rosters, and xG model, return aggregated stats
647
+ # param 'pbp' - play-by-play dataframe
648
+ # param 'type' - type of stats to calculate ('skater', 'goaltender', or 'team')
649
+ # param 'season' - season or timeframe of events in play-by-play
650
+ # param 'season_type' - list of season types (preseason, regular season, or playoffs) to include in aggregation
651
+ # param 'game_strength' - list of game_strengths to include in aggregation
652
+ # param 'roster_path' - path to roster file
653
+ # param 'xg' - xG model to apply to pbp for aggregation
654
+ # param 'shot_impact' - boolean determining if the shot impact model will be applied to the dataset
655
+
656
+ print(f"Calculating statistics for all games in the provided play-by-play data...\nSeasons included: {pbp['season'].drop_duplicates().to_list()}...")
657
+ start = time.perf_counter()
658
+
659
+ #Add extra data and apply team changes
660
+ pbp = prep_xG_data(pbp).replace(convert_team_abbr)
661
+
662
+ #Check if xG column exists and apply model if it does not
663
+ try:
664
+ pbp['xG']
665
+ except KeyError:
666
+ if xg == 'wsba':
667
+ pbp = wsba_xG(pbp)
668
+ else:
669
+ pbp = moneypuck_xG(pbp)
670
+
671
+ #Filter by season types and remove shootouts
672
+ pbp = pbp.loc[(pbp['season_type'].isin(season_types)) & (pbp['period'] < 5)]
673
+
674
+ # Filter by game strength if not "all"
675
+ if game_strength != "all":
676
+ pbp = pbp.loc[pbp['strength_state'].isin(game_strength)]
677
+
678
+ #Split calculation
679
+ if type == 'team':
680
+ complete = calc_team(pbp)
681
+
682
+ #Set TOI to minute
683
+ complete['TOI'] = complete['TOI']/60
684
+
685
+ #Add per 60 stats
686
+ for stat in per_sixty[7:13]:
687
+ complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60
688
+
689
+ end = time.perf_counter()
690
+ length = end-start
691
+ print(f'...finished in {(length if length <60 else length/60):.2f} {'seconds' if length <60 else 'minutes'}.')
692
+ #Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
693
+ if shot_impact:
694
+ return nhl_shooting_impacts(complete,True)
695
+ else:
696
+ return complete
697
+ else:
698
+ indv_stats = calc_indv(pbp)
699
+ onice_stats = calc_onice(pbp)
700
+
701
+ #IDs sometimes set as objects
702
+ indv_stats['ID'] = indv_stats['ID'].astype(float)
703
+ onice_stats['ID'] = onice_stats['ID'].astype(float)
704
+
705
+ #Merge and add columns for extra stats
706
+ complete = pd.merge(indv_stats,onice_stats,how="outer",on=['ID','Team','Season'])
707
+ complete['GC%'] = complete['Gi']/complete['GF']
708
+ complete['AC%'] = (complete['A1']+complete['A2'])/complete['GF']
709
+ complete['GI%'] = (complete['Gi']+complete['A1']+complete['A2'])/complete['GF']
710
+ complete['FC%'] = complete['Fi']/complete['FF']
711
+ complete['xGC%'] = complete['xGi']/complete['xGF']
712
+
713
+ #Remove entries with no ID listed
714
+ complete = complete.loc[complete['ID'].notna()]
715
+
716
+ #Import rosters and player info
717
+ rosters = pd.read_csv(roster_path)
718
+ names = rosters[['id','fullName',
719
+ 'headshot','positionCode','shootsCatches',
720
+ 'heightInInches','weightInPounds',
721
+ 'birthDate','birthCountry']].drop_duplicates(subset=['id','fullName'],keep='last')
722
+
723
+ #Add names
724
+ complete = pd.merge(complete,names,how='left',left_on='ID',right_on='id')
725
+
726
+ #Rename if there are no missing names
727
+ complete = complete.rename(columns={'fullName':'Player',
728
+ 'headshot':'Headshot',
729
+ 'positionCode':'Position',
730
+ 'shootsCatches':'Handedness',
731
+ 'heightInInches':'Height (in)',
732
+ 'weightInPounds':'Weight (lbs)',
733
+ 'birthDate':'Birthday',
734
+ 'birthCountry':'Nationality'})
735
+
736
+ #Set TOI to minute
737
+ complete['TOI'] = complete['TOI']/60
738
+
739
+ #Add per 60 stats
740
+ for stat in per_sixty:
741
+ complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60
742
+
743
+ #Add player age
744
+ complete['Birthday'] = pd.to_datetime(complete['Birthday'])
745
+ complete['season_year'] = complete['Season'].astype(str).str[4:8].astype(int)
746
+ complete['Age'] = complete['season_year'] - complete['Birthday'].dt.year
747
+
748
+ #Find player headshot
749
+ complete['Headshot'] = 'https://assets.nhle.com/mugs/nhl/'+complete['Season'].astype(str)+'/'+complete['Team']+'/'+complete['ID'].astype(int).astype(str)+'.png'
750
+
751
+ end = time.perf_counter()
752
+ length = end-start
753
+ #Remove goalies that occasionally appear in a set
754
+ complete = complete.loc[complete['Position']!='G']
755
+ #Add WSBA ID
756
+ complete['WSBA'] = complete['Player']+complete['Season'].astype(str)+complete['Team']
757
+
758
+ #Shot Type Metrics
759
+ type_metrics = []
760
+ for type in shot_types:
761
+ for stat in per_sixty[:3]:
762
+ type_metrics.append(f'{type.capitalize()}{stat}')
763
+
764
+ complete = complete[[
765
+ 'Player','ID',
766
+ "Season","Team",'WSBA',
767
+ 'Headshot','Position','Handedness',
768
+ 'Height (in)','Weight (lbs)',
769
+ 'Birthday','Age','Nationality',
770
+ 'GP','TOI',
771
+ "Gi","A1","A2",'P1','P',
772
+ "Fi","xGi",'xGi/Fi',"Gi/xGi","Fshi%",
773
+ "GF","FF","xGF","xGF/FF","GF/xGF","FshF%",
774
+ "GA","FA","xGA","xGA/FA","GA/xGA","FshA%",
775
+ 'Rush',"Rush xG",'Rush G',"GC%","AC%","GI%","FC%","xGC%",
776
+ ]+[f'{stat}/60' for stat in per_sixty]+type_metrics].fillna(0).sort_values(['Player','Season','Team','ID'])
777
+
778
+ print(f'...finished in {(length if length <60 else length/60):.2f} {'seconds' if length <60 else 'minutes'}.')
779
+ #Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
780
+ if shot_impact:
781
+ return nhl_shooting_impacts(complete,False)
782
+ else:
783
+ return complete
784
+
785
+ #stats = []
786
+ #for season in seasons[6:18]:
787
+ # pbp = pd.read_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet')
788
+ # stat = nhl_calculate_stats(pbp,'skater',[2],['5v5'],shot_impact=True)
789
+ # stat.to_csv(f'stats/skater/wsba_nhl_{season}.csv',index=False)
790
+ # stats.append(stat)
791
+ #pd.concat(stats).to_csv('stats/db/wsba_nhl_skater_db.csv',index=False)
792
+
793
def nhl_plot_skaters_shots(pbp,skater_dict,strengths,color_dict=event_colors,legend=False,xg='moneypuck'):
    #Returns list of shot plots for the specified skaters
    # param 'pbp' - pbp to plot data
    # param 'skater_dict' - skaters to plot shots for (format: {'Patrice Bergeron':['20242025','BOS']})
    # param 'strengths' - strengths to include in plotting
    # param 'color_dict' - dict with colors to use for events
    # param 'legend' - bool which includes legend if true
    # param 'xg' - xG model to apply to pbp for plotting

    print(f'Plotting the following skater shots: {skater_dict}...')

    #Build one shot chart per skater; info holds [season, team] for each entry
    skater_plots = [
        plot_skater_shots(
            pbp, skater, info[0], info[1], strengths,
            f'{skater} Fenwick Shots for {info[1]} in {info[0][2:4]}-{info[0][6:8]}',
            color_dict, legend, xg,
        )
        for skater, info in skater_dict.items()
    ]

    #Return: list of plotted skater shot charts
    return skater_plots
813
+
814
def nhl_plot_games(pbp,events,strengths,game_ids='all',color_dict=event_colors,legend=False,xg='moneypuck'):
    #Returns list of plots for the specified games
    # param 'pbp' - pbp to plot data
    # param 'events' - type of events to plot
    # param 'strengths' - strengths to include in plotting
    # param 'game_ids' - games to plot (list if not set to 'all')
    # param 'color_dict' - dict with colors to use for events
    # param 'legend' - bool which includes legend if true
    # param 'xg' - xG model to apply to pbp for plotting

    #When 'all' is requested, plot every distinct game present in the pbp data
    if game_ids == 'all':
        game_ids = pbp['game_id'].drop_duplicates().to_list()

    print(f'Plotting the following games: {game_ids}...')

    #Iterate through games, adding each plot to the list
    game_plots = []
    for game in game_ids:
        game_plots.append(plot_game_events(pbp, game, events, strengths, color_dict, legend, xg))

    #Return: list of plotted game events
    return game_plots
293
835
 
294
836
  def repo_load_rosters(seasons = []):
295
837
  #Returns roster data from repository
@@ -314,4 +856,29 @@ def repo_load_schedule(seasons = []):
314
856
def repo_load_teaminfo():
    #Returns team data from repository
    # Reads the static team-info table bundled with the repo (relative path).
    path = "teaminfo/nhl_teaminfo.csv"
    return pd.read_csv(path)
860
+
861
def repo_load_pbp(seasons = []):
    #Returns play-by-play data from repository
    # param 'seasons' - list of seasons to include (e.g. ['20232024'])
    #
    # Downloads one parquet file per season from the wsba_hockey GitHub repo
    # and concatenates them into a single DataFrame.
    # Raises ValueError when no seasons are provided (previously pd.concat([])
    # failed with the opaque "No objects to concatenate" message).

    if not seasons:
        raise ValueError('repo_load_pbp requires at least one season (e.g. ["20232024"]).')

    #Add parquet to total
    print(f'Loading play-by-play from the following seasons: {seasons}...')
    dfs = [pd.read_parquet(f"https://github.com/owensingh38/wsba_hockey/raw/refs/heads/main/src/wsba_hockey/pbp/parquet/nhl_pbp_{season}.parquet") for season in seasons]

    return pd.concat(dfs)
870
+
871
def repo_load_seasons():
    #List of available seasons to scrape
    # Returns the module-level 'seasons' list (season strings like '20072008').
    # NOTE(review): returns the global list itself, not a copy — callers that
    # mutate it will affect the whole module.
    return seasons
875
+
876
def admin_convert_to_parquet(seasons):
    #Converts repository play-by-play CSVs to parquet format
    # param 'seasons' - seasons whose pbp/csv files should be converted
    for season in seasons:
        source = f'pbp/csv/nhl_pbp_{season}.csv'
        target = f'pbp/parquet/nhl_pbp_{season}.parquet'
        pd.read_csv(source).to_parquet(target, index=False)
881
+
882
+ #for season in seasons[6:12]:
883
+ # data = pd.read_csv(f"pbp/csv/nhl_pbp_{season}.csv")
884
+ # data.to_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet',index=False)