wsba-hockey 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1213 +0,0 @@
1
- import random
2
- import os
3
- import requests as rs
4
- import pandas as pd
5
- import time
6
- from datetime import datetime, timedelta, date
7
- from tools.scraping import *
8
- from tools.xg_model import *
9
- from tools.agg import *
10
- from tools.plotting import *
11
-
12
### WSBA HOCKEY ###
## Provided below are all integral functions in the WSBA Hockey Python package. ##

## GLOBAL VARIABLES ##
#All NHL seasons supported by the scraper (eight-digit format: start year + end year)
seasons = [
    '20072008',
    '20082009',
    '20092010',
    '20102011',
    '20112012',
    '20122013',
    '20132014',
    '20142015',
    '20152016',
    '20162017',
    '20172018',
    '20182019',
    '20192020',
    '20202021',
    '20212022',
    '20222023',
    '20232024',
    '20242025'
]

#Map a season's start year to its eight-digit season code
convert_seasons = {'2007': '20072008',
                   '2008': '20082009',
                   '2009': '20092010',
                   '2010': '20102011',
                   '2011': '20112012',
                   '2012': '20122013',
                   '2013': '20132014',
                   '2014': '20142015',
                   '2015': '20152016',
                   '2016': '20162017',
                   '2017': '20172018',
                   '2018': '20182019',
                   '2019': '20192020',
                   '2020': '20202021',
                   '2021': '20212022',
                   '2022': '20222023',
                   '2023': '20232024',
                   '2024': '20242025'}

#Map legacy/alternate team abbreviations to their modern three-letter codes
convert_team_abbr = {'L.A':'LAK',
                     'N.J':'NJD',
                     'S.J':'SJS',
                     'T.B':'TBL',
                     'PHX':'ARI'}

#Stats that are also expressed as per-sixty-minute rates during aggregation
per_sixty = ['Fi','xGi','Gi','A1','A2','P1','P','OZF','NZF','DZF','FF','FA','xGF','xGA','GF','GA','CF','CA','HF','HA','Give','Take','Penl','Penl2','Penl5','Draw','Block']

#Some games in the API are specifically known to cause errors in scraping.
#This list is updated as frequently as necessary
known_probs = {
    '2007020011':'Missing shifts data for game between Chicago and Minnesota.',
    '2007021178':'Game between the Bruins and Sabres is missing data after the second period, for some reason.',
    '2008020259':'HTML data is completely missing for this game.',
    '2008020409':'HTML data is completely missing for this game.',
    '2008021077':'HTML data is completely missing for this game.',
    '2009020081':'HTML pbp for this game between Pittsburgh and Carolina is missing all but the period start and first faceoff events, for some reason.',
    '2009020658':'Missing shifts data for game between New York Islanders and Dallas.',
    '2009020885':'Missing shifts data for game between Sharks and Blue Jackets.',
    '2010020124':'Game between Capitals and Hurricanes is sporadically missing player on-ice data',
    '2012020018':'HTML events contain mislabeled events.',
    '2013020971':'On March 10th, 2014, Stars forward Rich Peverley suffered from a cardiac episode midgame and as a result, the remainder of the game was postponed. \nThe game resumed on April 9th, and the only goal scorer in the game, Blue Jackets forward Nathan Horton, did not appear in the resumed game due to injury. Interestingly, Horton would never play in the NHL again.',
    '2018021133':'Game between Lightning and Capitals has incorrectly labeled event teams (i.e. WSH TAKEAWAY - #71 CIRELLI (Cirelli is a Tampa Bay skater in this game)).',
    '2019020876':'Due to the frightening collapse of Blues defensemen Jay Bouwmeester, a game on February 2nd, 2020 between the Ducks and Blues was postponed. \nWhen the game resumed, Ducks defensemen Hampus Lindholm, who assisted on a goal in the inital game, did not play in the resumed match.'
}

#Shot type labels as they appear in the NHL play-by-play feed
shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']

#Start year of the most recent season
new = 2024

#Final day (MM-DD) of the regular-season standings for each season
standings_end = {
    '20072008':'04-06',
    '20082009':'04-12',
    '20092010':'04-11',
    '20102011':'04-10',
    '20112012':'04-07',
    '20122013':'04-28',
    '20132014':'04-13',
    '20142015':'04-11',
    '20152016':'04-10',
    '20162017':'04-09',
    '20172018':'04-08',
    '20182019':'04-06',
    '20192020':'03-11',
    '20202021':'05-19',
    '20212022':'04-01',
    '20222023':'04-14',
    '20232024':'04-18',
    '20242025':'04-17'
}

#Event types retained for analysis
events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal','penalty']

#Repository data files.
#Passing each path segment to os.path.join separately keeps the paths portable;
#the previous hard-coded backslashes ('tools\\schedule\\schedule.csv') only worked on Windows.
dir = os.path.dirname(os.path.realpath(__file__))
schedule_path = os.path.join(dir, 'tools', 'schedule', 'schedule.csv')
info_path = os.path.join(dir, 'tools', 'teaminfo', 'nhl_teaminfo.csv')
default_roster = os.path.join(dir, 'tools', 'rosters', 'nhl_rosters.csv')
113
-
114
- ## SCRAPE FUNCTIONS ##
115
def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','period-end','challenge','stoppage','shootout-complete','game-end'],verbose = False, sources = False, errors = False):
    #Given a set of game_ids (NHL API), return complete play-by-play information as requested
    # param 'game_ids' - NHL game ids (or list formatted as ['random', num_of_games, start_year, end_year])
    # param 'split_shifts' - boolean which splits pbp and shift events if true
    # param 'remove' - list of events to remove from final dataframe
    # param 'verbose' - boolean which adds additional event info if true
    # param 'sources' - boolean scraping the html and json sources to a master directory if true
    # param 'errors' - boolean returning game ids which did not scrape if true

    pbps = []
    if game_ids[0] == 'random':
        #Randomize selection of game_ids
        #Some ids returned may be invalid (for example, 2020021300)
        num = game_ids[1]
        try:
            start = game_ids[2]
        except IndexError:
            start = 2007
        try:
            end = game_ids[3]
        except IndexError:
            end = (date.today().year)-1

        game_ids = []
        i = 0
        print("Finding valid, random game ids...")
        #Value comparison ('<') rather than identity ('is not') - identity checks on
        #ints are only reliable inside CPython's small-int cache and could loop forever
        while i < num:
            print(f"\rGame IDs found in range {start}-{end}: {i}/{num}",end="")
            rand_year = random.randint(start,end)
            rand_season_type = random.randint(2,3)
            rand_game = random.randint(1,1312)

            #Ensure id validity (and that number of scraped games is equal to specified value)
            rand_id = f'{rand_year}{rand_season_type:02d}{rand_game:04d}'
            try:
                rs.get(f"https://api-web.nhle.com/v1/gamecenter/{rand_id}/play-by-play").json()
                i += 1
                game_ids.append(rand_id)
            except Exception:
                #Invalid id; draw again
                continue

        print(f"\rGame IDs found in range {start}-{end}: {i}/{num}")

    #Scrape each game
    #Track Errors
    error_ids = []
    prog = 0
    for game_id in game_ids:
        print("Scraping data from game " + str(game_id) + "...",end="")
        start = time.perf_counter()

        try:
            #Retrieve data
            info = get_game_info(game_id)
            data = combine_data(info, sources)

            #Append data to list
            pbps.append(data)

            end = time.perf_counter()
            secs = end - start
            prog += 1

            #Export if sources is true
            #(outer f-strings use double quotes: nesting same-quote literals inside
            #an f-string is a SyntaxError before Python 3.12)
            if sources:
                dirs = f"sources/{info['season']}/"

                if not os.path.exists(dirs):
                    os.makedirs(dirs)

                data.to_csv(f"{dirs}{info['game_id']}.csv",index=False)

            print(f" finished in {secs:.2f} seconds. {prog}/{len(game_ids)} ({(prog/len(game_ids))*100:.2f}%)")
        except Exception:
            #Games such as the all-star game and pre-season games will incur this error
            #Other games have known problems
            if game_id in known_probs.keys():
                print(f"\nGame {game_id} has a known problem: {known_probs[game_id]}")
            else:
                print(f"\nUnable to scrape game {game_id}. Ensure the ID is properly inputted and formatted.")

            #Track error
            error_ids.append(game_id)

    #Add all pbps together
    if len(pbps) == 0:
        print("\rNo data returned.")
        return pd.DataFrame()
    df = pd.concat(pbps)

    #If verbose is true features required to calculate xG are added to dataframe
    if verbose:
        df = prep_xG_data(df)

    #Print final message
    if len(error_ids) > 0:
        print(f'\rScrape of provided games finished.\nThe following games failed to scrape: {error_ids}')
    else:
        print('\rScrape of provided games finished.')

    #Split pbp and shift events if necessary
    #Return: complete play-by-play with data removed or split as necessary
    if split_shifts == True:
        #Copy before extending - appending to the 'remove' default list would mutate
        #the shared default argument across calls
        remove = list(remove) + ['change']

        #Return: dict with pbp and shifts seperated
        pbp_dict = {"pbp":df.loc[~df['event_type'].isin(remove)],
                    "shifts":df.loc[df['event_type']=='change']
                    }

        if errors:
            pbp_dict.update({'errors':error_ids})

        return pbp_dict
    else:
        #Return: all events that are not set for removal by the provided list
        pbp = df.loc[~df['event_type'].isin(remove)]

        if errors:
            pbp_dict = {'pbp':pbp,
                        'errors':error_ids}

            return pbp_dict
        else:
            return pbp
244
-
245
def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
    #Given a season, return schedule data
    # param 'season' - NHL season to scrape (eight digits, i.e. '20232024')
    # param 'start' - start date within the season (MM-DD)
    # param 'end' - end date within the season (MM-DD)

    api = "https://api-web.nhle.com/v1/schedule/"

    #Determine how to approach scraping; if month in season is after the new year the year must be adjusted
    #Months January-June belong to the season's second calendar year
    new_year = ["01","02","03","04","05","06"]
    if start[:2] in new_year:
        start = str(int(season[:4])+1)+"-"+start
        end = str(season[:-4])+"-"+end
    else:
        start = str(season[:4])+"-"+start
        end = str(season[:-4])+"-"+end

    form = '%Y-%m-%d'

    #Create datetime values from dates
    start = datetime.strptime(start,form)
    end = datetime.strptime(end,form)

    game = []

    day = (end-start).days+1
    if day < 0:
        #Handles dates which are over a year apart
        #(the end date is always stamped with the season's first year, so a span that
        #crosses New Year comes out negative and is wrapped forward here)
        day = 365 + day
    for i in range(day):
        #For each day, call NHL api and retrieve info on all games of the selected date
        inc = start+timedelta(days=i)
        print("Scraping games on " + str(inc)[:10]+"...")

        get = rs.get(api+str(inc)[:10]).json()
        #'gameWeek' nests one row per day; take the games list of the first (requested) day
        gameWeek = pd.json_normalize(list(pd.json_normalize(get['gameWeek'])['games'])[0])

        #Return nothing if there's nothing
        #(empty frames are still appended so the final concat always has input)
        if gameWeek.empty:
            game.append(gameWeek)
        else:
            gameWeek['date'] = get['gameWeek'][0]['date']

            #Rename API columns to the package's conventions
            gameWeek['season_type'] = gameWeek['gameType']
            gameWeek['away_team_abbr'] = gameWeek['awayTeam.abbrev']
            gameWeek['home_team_abbr'] = gameWeek['homeTeam.abbrev']
            gameWeek['game_title'] = gameWeek['away_team_abbr'] + " @ " + gameWeek['home_team_abbr'] + " - " + gameWeek['date']
            gameWeek['estStartTime'] = pd.to_datetime(gameWeek['startTimeUTC']).dt.tz_convert('US/Eastern').dt.strftime("%I:%M %p")

            #Move identifying columns to the front of the frame
            front_col = ['id','season','date','season_type','game_title','away_team_abbr','home_team_abbr','estStartTime']
            gameWeek = gameWeek[front_col+[col for col in gameWeek.columns.to_list() if col not in front_col]]

            game.append(gameWeek)

    #Concatenate all games
    df = pd.concat(game)

    #Return: specified schedule data
    return df
304
-
305
def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = schedule_path, verbose = False, sources = False, errors = False):
    #Given season, scrape all play-by-play occuring within the season
    # param 'season' - NHL season to scrape (eight digits, i.e. '20232024')
    # param 'split_shifts' - boolean which splits pbp and shift events if true
    # param 'season_types' - game types to include (2 = regular season, 3 = playoffs)
    # param 'remove' - list of events to remove from final dataframe
    # param 'start' - start date within the season (MM-DD)
    # param 'end' - end date within the season (MM-DD)
    # param 'local' - boolean indicating whether to use local file to scrape game_ids
    # param 'local_path' - path of local file
    # param 'verbose' - boolean which adds additional event info if true
    # param 'sources' - boolean scraping the html and json sources to a master directory if true
    # param 'errors' - boolean returning game ids which did not scrape if true

    #Determine whether to use schedule data in repository or to scrape
    if local:
        load = pd.read_csv(local_path)
        load['date'] = pd.to_datetime(load['date'])

        #Rebuild full dates: months September-December belong to the season's first
        #year, January onward to the second
        #NOTE(review): month/day are not zero-padded here (e.g. '2023-9-1'); pandas
        #appears to parse these when comparing against the datetime column - confirm
        start = f'{(season[0:4] if int(start[0:2])>=9 else season[4:8])}-{int(start[0:2])}-{int(start[3:5])}'
        end = f'{(season[0:4] if int(end[0:2])>=9 else season[4:8])}-{int(end[0:2])}-{int(end[3:5])}'

        #Filter schedule to the requested season, game types, and date window
        load = load.loc[(load['season'].astype(str)==season)&
                        (load['season_type'].isin(season_types))&
                        (load['date']>=start)&(load['date']<=end)]

        game_ids = list(load['id'].astype(str))
    else:
        load = nhl_scrape_schedule(season,start,end)
        load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
        game_ids = list(load['id'].astype(str))

    #If no games found, terminate the process
    if not game_ids:
        print('No games found for dates in season...')
        return ""

    print(f"Scraping games from {season[0:4]}-{season[4:8]} season...")
    #'start' is reused here as a timer after its use as a date above
    start = time.perf_counter()

    #Perform scrape
    if split_shifts:
        data = nhl_scrape_game(game_ids,split_shifts=True,remove=remove,verbose=verbose,sources=sources,errors=errors)
    else:
        data = nhl_scrape_game(game_ids,remove=remove,verbose=verbose,sources=sources,errors=errors)

    end = time.perf_counter()
    secs = end - start

    print(f'Finished season scrape in {(secs/60)/60:.2f} hours.')
    #Return: Complete pbp and shifts data for specified season as well as dataframe of game_ids which failed to return data
    return data
356
-
357
def nhl_scrape_seasons_info(seasons = []):
    #Returns info related to NHL seasons (by default, all seasons are included)
    # param 'seasons' - list of seasons to include

    print("Scraping info for seasons: " + str(seasons))

    #Pull both season endpoints and normalize each payload into a frame
    stats_api = "https://api.nhle.com/stats/rest/en/season"
    standings_api = "https://api-web.nhle.com/v1/standings-season"
    stats_info = pd.json_normalize(rs.get(stats_api).json()['data'])
    standings_info = pd.json_normalize(rs.get(standings_api).json()['seasons'])

    #Join the two sources on season id, keeping rows found in either
    merged = pd.merge(stats_info,standings_info,how='outer',on=['id'])

    #Restrict to the requested seasons when any were provided
    if len(seasons) > 0:
        merged = merged.loc[merged['id'].astype(str).isin(seasons)]

    return merged.sort_values(by=['id'])
376
-
377
def nhl_scrape_standings(arg = "now", season_type = 2):
    #Returns standings
    # param 'arg' - by default, this is "now" returning active NHL standings. May also be a specific date formatted as YYYY-MM-DD, a season (scrapes the last standings date for the season) or a year (for playoffs).
    # param 'season_type' - by default, this scrapes the regular season standings. If set to 3, it returns the playoff bracket for the specified season

    #arg param is ignored when set to "now" if season_type param is 3
    if season_type == 3:
        if arg == "now":
            arg = new

        print(f"Scraping playoff bracket for date: {arg}")
        api = f"https://api-web.nhle.com/v1/playoff-bracket/{arg}"

        data = rs.get(api).json()['series']

        return pd.json_normalize(data)

    else:
        #Build the endpoint per argument kind; previously every branch used
        #standings_end[arg], which raised a KeyError for "now" and plain dates
        if arg == "now":
            print("Scraping standings as of now...")
            api = "https://api-web.nhle.com/v1/standings/now"
        elif arg in seasons:
            print(f'Scraping standings for season: {arg}')
            #Season code: request the final standings date of that season
            api = f"https://api-web.nhle.com/v1/standings/{arg[4:8]}-{standings_end[arg]}"
        else:
            print(f"Scraping standings for date: {arg}")
            #Assumed to be a YYYY-MM-DD date, passed through directly
            api = f"https://api-web.nhle.com/v1/standings/{arg}"

        data = rs.get(api).json()['standings']

        return pd.json_normalize(data)
406
-
407
def nhl_scrape_roster(season):
    #Given a nhl season, return rosters for all participating teams
    # param 'season' - NHL season to scrape (eight digits, i.e. '20232024')

    #Fixed message typo and spacing ("Scrpaing ... the 20232024season...")
    print("Scraping rosters for the " + season + " season...")
    teaminfo = pd.read_csv(info_path)

    rosts = []
    for team in list(teaminfo['Team']):
        try:
            print("Scraping " + team + " roster...")
            api = "https://api-web.nhle.com/v1/roster/"+team+"/"+season

            data = rs.get(api).json()
            #Tag each position group before combining so the origin survives the concat
            forwards = pd.json_normalize(data['forwards'])
            forwards['headingPosition'] = "F"
            dmen = pd.json_normalize(data['defensemen'])
            dmen['headingPosition'] = "D"
            goalies = pd.json_normalize(data['goalies'])
            goalies['headingPosition'] = "G"

            roster = pd.concat([forwards,dmen,goalies]).reset_index(drop=True)
            roster['fullName'] = (roster['firstName.default']+" "+roster['lastName.default']).str.upper()
            roster['season'] = str(season)
            roster['team_abbr'] = team

            rosts.append(roster)
        except Exception:
            #Best-effort: teams without a roster for this season are skipped
            print("No roster found for " + team + "...")

    #Previously pd.concat([]) raised a ValueError when every team failed;
    #return an empty frame instead
    if not rosts:
        return pd.DataFrame()

    return pd.concat(rosts)
437
-
438
def nhl_scrape_prospects(team):
    #Given team abbreviation, retrieve current team prospects

    data = rs.get(f'https://api-web.nhle.com/v1/prospects/{team}').json()

    #Collect each position group into its own frame
    frames = []
    for position in ['forwards','defensemen','goalies']:
        frames.append(pd.json_normalize(data[position]))

    prospects = pd.concat(frames)

    #Add name columns
    prospects['fullName'] = (prospects['firstName.default']+" "+prospects['lastName.default']).str.upper()

    #Return: team prospects
    return prospects
454
-
455
def nhl_scrape_team_info(country = False):
    #Given option to return franchise or country, return team information
    # param 'country' - if true, scrape country info instead of team info

    print('Scraping team information...')
    #Build the endpoint outside the f-string: nesting same-quote string literals
    #inside an f-string is a SyntaxError before Python 3.12
    endpoint = 'country' if country else 'team'
    api = f'https://api.nhle.com/stats/rest/en/{endpoint}'

    data = pd.json_normalize(rs.get(api).json()['data'])

    #Add logos if necessary
    if not country:
        data['logo_light'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_light.svg'
        data['logo_dark'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_dark.svg'

    #Return: team (or country) info sorted by its identifying columns
    return data.sort_values(by=(['country3Code','countryCode','iocCode','countryName'] if country else ['fullName','triCode','id']))
469
-
470
def nhl_scrape_player_data(player_ids):
    #Given player ids, return player information
    # param 'player_ids' - iterable of NHL player ids

    frames = []
    for pid in player_ids:
        api = f'https://api-web.nhle.com/v1/player/{int(pid)}/landing'
        info = pd.json_normalize(rs.get(api).json())

        #Add name column
        info['fullName'] = (info['firstName.default'] + " " + info['lastName.default']).str.upper()
        frames.append(info)

    #Return: player data (empty frame when no ids were provided)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
491
-
492
def nhl_scrape_draft_rankings(arg = 'now', category = ''):
    #Given url argument for timeframe and prospect category, return draft rankings
    #Category 1 is North American Skaters
    #Category 2 is International Skaters
    #Category 3 is North American Goalie
    #Category 4 is International Goalie

    #Player category only applies when requesting a specific season
    base = f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
    api = base if category == "" else f"{base}/{category}"

    data = pd.json_normalize(rs.get(api).json()['rankings'])

    #Add player name columns
    data['fullName'] = (data['firstName']+" "+data['lastName']).str.upper()

    #Return: prospect rankings
    return data
508
-
509
def nhl_apply_xG(pbp):
    #Given play-by-play data, return this data with xG-related columns

    #param 'pbp' - play-by-play data

    #Outer f-string uses double quotes: nesting same-quote string literals inside
    #an f-string is a SyntaxError before Python 3.12
    print(f"Applying WSBA xG to model with seasons: {pbp['season'].drop_duplicates().to_list()}")

    #Apply xG model
    pbp = wsba_xG(pbp)

    return pbp
520
-
521
- def nhl_shooting_impacts(agg,type):
522
- #Given stats table generated from the nhl_calculate_stats function, return table with shot impacts
523
- #Only 5v5 is supported as of now
524
-
525
- #param 'agg' - stats table
526
- #param 'type' - type of stats to calculate ('skater', 'goalie', or 'team')
527
-
528
- #COMPOSITE IMPACT EVALUATIONS:
529
-
530
- #SR = Shot Rate
531
- #SQ = Shot Quality
532
- #FN = Finishing
533
-
534
- #I = Impact
535
-
536
- #INDV = Individual
537
- #OOFF = On-Ice Offense
538
- #ODEF = On-Ice Defense
539
-
540
- #Grouping-Metric Code: XXXX-YYI
541
-
542
- #Goal Composition Formula
543
- #The aggregation of goals is composed of three factors: shot rate, shot quality, and finishing
544
- #These are represented by their own metrics in which Goals = (Fenwick*(League Average Fenwick SH%)) + ((xGoals/Fenwick - League Average Fenwick SH%)*Fenwick) + (Goals - xGoals)
545
- def goal_comp(fenwick,xg_fen,xg,g,fsh):
546
- rate = fenwick * fsh
547
- qual = (xg_fen-fsh)*fenwick
548
- fini = g-xg
549
-
550
- return rate+qual+fini
551
-
552
- if type == 'goalie':
553
- pos = agg
554
- for group in [('OOFF','F'),('ODEF','A')]:
555
- #Have to set this columns for compatibility with df.apply
556
- pos['fsh'] = pos[f'Fsh{group[1]}%']
557
- pos['fenwick'] = pos[f'F{group[1]}/60']
558
- pos['xg'] = pos[f'xG{group[1]}/60']
559
- pos['g'] = pos[f'G{group[1]}/60']
560
- pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
561
- pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
562
-
563
- #Find average for position in frame
564
- avg_fen = pos['fenwick'].mean()
565
- avg_xg = pos['xg'].mean()
566
- avg_g = pos['g'].mean()
567
- avg_fsh = avg_g/avg_fen
568
- avg_xg_fen = avg_xg/avg_fen
569
-
570
- #Calculate composite percentiles
571
- pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
572
- pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
573
- pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
574
-
575
- #Calculate shot rate, shot quality, and finishing impacts
576
- pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
577
- pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
578
- pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
579
-
580
- #Convert impacts to totals
581
- #Calculate shot rate, shot quality, and finishing impacts
582
- pos[f'{group[0]}-SRI-T'] = (pos[f'{group[0]}-SRI']/60)*pos['TOI']
583
- pos[f'{group[0]}-SQI-T'] = (pos[f'{group[0]}-SQI']/60)*pos['TOI']
584
- pos[f'{group[0]}-FNI-T'] = (pos[f'{group[0]}-FNI']/60)*pos['TOI']
585
-
586
- #Rank per 60 stats
587
- for stat in ['FF','FA','xGF','xGA','GF','GA','CF','CA','GSAx']:
588
- pos[f'{stat}/60-P'] = pos[f'{stat}/60'].rank(pct=True)
589
-
590
- #Flip percentiles for against stats
591
- for stat in ['FA','xGA','GA','CA']:
592
- pos[f'{stat}/60-P'] = 1-pos[f'{stat}/60-P']
593
-
594
- #Add extra metrics
595
- pos['RushF/60'] = (pos['RushF']/pos['TOI'])*60
596
- pos['RushA/60'] = (pos['RushA']/pos['TOI'])*60
597
- pos['RushesFF'] = pos['RushF/60'].rank(pct=True)
598
- pos['RushesFA'] = 1 - pos['RushA/60'].rank(pct=True)
599
- pos['RushFxG/60'] = (pos['RushFxG']/pos['TOI'])*60
600
- pos['RushAxG/60'] = (pos['RushAxG']/pos['TOI'])*60
601
- pos['RushesxGF'] = pos['RushFxG/60'].rank(pct=True)
602
- pos['RushesxGA'] = 1 - pos['RushAxG/60'].rank(pct=True)
603
- pos['RushFG/60'] = (pos['RushFG']/pos['TOI'])*60
604
- pos['RushAG/60'] = (pos['RushAG']/pos['TOI'])*60
605
- pos['RushesGF'] = pos['RushFG/60'].rank(pct=True)
606
- pos['RushesGA'] = 1 - pos['RushAG/60'].rank(pct=True)
607
-
608
- #Flip against metric percentiles
609
- pos['ODEF-SR'] = 1-pos['ODEF-SR']
610
- pos['ODEF-SQ'] = 1-pos['ODEF-SQ']
611
- pos['ODEF-FN'] = 1-pos['ODEF-FN']
612
-
613
- #Extraneous Values
614
- pos['EGF'] = pos['OOFF-SRI']+pos['OOFF-SQI']+pos['OOFF-FNI']
615
- pos['ExGF'] = pos['OOFF-SRI']+pos['OOFF-SQI']
616
- pos['EGA'] = pos['ODEF-SRI']+pos['ODEF-SQI']+pos['ODEF-FNI']
617
- pos['ExGA'] = pos['ODEF-SRI']+pos['ODEF-SQI']
618
-
619
- #...and their percentiles
620
- pos['EGF-P'] = pos['EGF'].rank(pct=True)
621
- pos['ExGF-P'] = pos['ExGF'].rank(pct=True)
622
- pos['EGA-P'] = pos['EGA'].rank(pct=True)
623
- pos['ExGA-P'] = pos['ExGA'].rank(pct=True)
624
-
625
- pos['EGA-P'] = 1-pos['EGA']
626
- pos['ExGA-P'] = 1-pos['ExGA']
627
-
628
- #...and then their totals
629
- pos['EGF-T'] = (pos['EGF']/60)*pos['TOI']
630
- pos['ExGF-T'] = (pos['ExGF']/60)*pos['TOI']
631
- pos['EGA-T'] = (pos['EGA']/60)*pos['TOI']
632
- pos['ExGA-T'] = (pos['ExGA']/60)*pos['TOI']
633
-
634
- #Goal Composites...
635
- pos['Team-Adjusted-EGI'] = pos['ODEF-FNI']-pos['ExGA']
636
- pos['GISAx'] = pos['ExGA']-pos['EGA']
637
- pos['NetGI'] = pos['EGF'] - pos['EGA']
638
- pos['NetxGI'] = pos['ExGF'] - pos['ExGA']
639
-
640
- #...and their percentiles
641
- pos['Team-Adjusted-EGI-P'] = pos['Team-Adjusted-EGI'].rank(pct=True)
642
- pos['GISAx-P'] = pos['GISAx'].rank(pct=True)
643
- pos['NetGI-P'] = pos['NetGI'].rank(pct=True)
644
- pos['NetxGI-P'] = pos['NetxGI'].rank(pct=True)
645
-
646
- #...and then their totals
647
- pos['Team-Adjusted-EGI-T'] = (pos['Team-Adjusted-EGI']/60)*pos['TOI']
648
- pos['GISAx-T'] = (pos['GISAx']/60)*pos['TOI']
649
- pos['NetGI-T'] = (pos['NetGI']/60)*pos['TOI']
650
- pos['NetxGI-T'] = (pos['NetxGI']/60)*pos['TOI']
651
-
652
- #Return: team stats with shooting impacts
653
- return pos.drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Goalie','Season','Team'])
654
-
655
- elif type =='team':
656
- pos = agg
657
- for group in [('OOFF','F'),('ODEF','A')]:
658
- #Have to set this columns for compatibility with df.apply
659
- pos['fsh'] = pos[f'Fsh{group[1]}%']
660
- pos['fenwick'] = pos[f'F{group[1]}/60']
661
- pos['xg'] = pos[f'xG{group[1]}/60']
662
- pos['g'] = pos[f'G{group[1]}/60']
663
- pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
664
- pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
665
-
666
- #Find average for position in frame
667
- avg_fen = pos['fenwick'].mean()
668
- avg_xg = pos['xg'].mean()
669
- avg_g = pos['g'].mean()
670
- avg_fsh = avg_g/avg_fen
671
- avg_xg_fen = avg_xg/avg_fen
672
-
673
- #Calculate composite percentiles
674
- pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
675
- pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
676
- pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
677
-
678
- #Calculate shot rate, shot quality, and finishing impacts
679
- pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
680
- pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
681
- pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
682
-
683
- #Convert impacts to totals
684
- #Calculate shot rate, shot quality, and finishing impacts
685
- pos[f'{group[0]}-SRI-T'] = (pos[f'{group[0]}-SRI']/60)*pos['TOI']
686
- pos[f'{group[0]}-SQI-T'] = (pos[f'{group[0]}-SQI']/60)*pos['TOI']
687
- pos[f'{group[0]}-FNI-T'] = (pos[f'{group[0]}-FNI']/60)*pos['TOI']
688
-
689
- #Rank per 60 stats
690
- for stat in per_sixty[10:len(per_sixty)]:
691
- pos[f'{stat}/60-P'] = pos[f'{stat}/60'].rank(pct=True)
692
-
693
- #Flip percentiles for against stats
694
- for stat in ['FA','xGA','GA','CA','HA','Give','Penl','Penl2','Penl5']:
695
- pos[f'{stat}/60-P'] = 1-pos[f'{stat}/60-P']
696
-
697
- #Add extra metrics
698
- pos['RushF/60'] = (pos['RushF']/pos['TOI'])*60
699
- pos['RushA/60'] = (pos['RushA']/pos['TOI'])*60
700
- pos['RushesFF'] = pos['RushF/60'].rank(pct=True)
701
- pos['RushesFA'] = 1 - pos['RushA/60'].rank(pct=True)
702
- pos['RushFxG/60'] = (pos['RushFxG']/pos['TOI'])*60
703
- pos['RushAxG/60'] = (pos['RushAxG']/pos['TOI'])*60
704
- pos['RushesxGF'] = pos['RushFxG/60'].rank(pct=True)
705
- pos['RushesxGA'] = 1 - pos['RushAxG/60'].rank(pct=True)
706
- pos['RushFG/60'] = (pos['RushFG']/pos['TOI'])*60
707
- pos['RushAG/60'] = (pos['RushAG']/pos['TOI'])*60
708
- pos['RushesGF'] = pos['RushFG/60'].rank(pct=True)
709
- pos['RushesGA'] = 1 - pos['RushAG/60'].rank(pct=True)
710
-
711
- #Flip against metric percentiles
712
- pos['ODEF-SR'] = 1-pos['ODEF-SR']
713
- pos['ODEF-SQ'] = 1-pos['ODEF-SQ']
714
- pos['ODEF-FN'] = 1-pos['ODEF-FN']
715
-
716
- pos['EGF'] = pos['OOFF-SRI']+pos['OOFF-SQI']+pos['OOFF-FNI']
717
- pos['ExGF'] = pos['OOFF-SRI']+pos['OOFF-SQI']
718
- pos['EGA'] = pos['ODEF-SRI']+pos['ODEF-SQI']+pos['ODEF-FNI']
719
- pos['ExGA'] = pos['ODEF-SRI']+pos['ODEF-SQI']
720
-
721
- #...and their percentiles
722
- pos['EGF-P'] = pos['EGF'].rank(pct=True)
723
- pos['ExGF-P'] = pos['ExGF'].rank(pct=True)
724
- pos['EGA-P'] = pos['EGA'].rank(pct=True)
725
- pos['ExGA-P'] = pos['ExGA'].rank(pct=True)
726
-
727
- pos['EGA-P'] = 1-pos['EGA']
728
- pos['ExGA-P'] = 1-pos['ExGA']
729
-
730
- #...and then their totals
731
- pos['EGF-T'] = (pos['EGF']/60)*pos['TOI']
732
- pos['ExGF-T'] = (pos['ExGF']/60)*pos['TOI']
733
- pos['EGA-T'] = (pos['EGA']/60)*pos['TOI']
734
- pos['ExGA-T'] = (pos['ExGA']/60)*pos['TOI']
735
-
736
- #Return: team stats with shooting impacts
737
- return pos.drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Season','Team'])
738
-
739
- else:
740
- #Remove skaters with less than 150 minutes of TOI then split between forwards and dmen
741
- #These are added back in after the fact
742
- forwards = agg.loc[(agg['Position']!='D')&(agg['TOI']>=150)]
743
- defensemen = agg.loc[(agg['Position']=='D')&(agg['TOI']>=150)]
744
- non_players = agg.loc[agg['TOI']<150]
745
-
746
- #Loop through both positions, all groupings (INDV, OOFF, and ODEF) generating impacts
747
- for pos in [forwards,defensemen]:
748
- for group in [('INDV','i'),('OOFF','F'),('ODEF','A')]:
749
- #Have to set this columns for compatibility with df.apply
750
- pos['fsh'] = pos[f'Fsh{group[1]}%']
751
- pos['fenwick'] = pos[f'F{group[1]}/60']
752
- pos['xg'] = pos[f'xG{group[1]}/60']
753
- pos['g'] = pos[f'G{group[1]}/60']
754
- pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
755
- pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
756
-
757
- #Find average for position in frame
758
- avg_fen = pos['fenwick'].mean()
759
- avg_xg = pos['xg'].mean()
760
- avg_g = pos['g'].mean()
761
- avg_fsh = avg_g/avg_fen
762
- avg_xg_fen = avg_xg/avg_fen
763
-
764
- #Calculate composite percentiles
765
- pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
766
- pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
767
- pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
768
-
769
- #Calculate shot rate, shot quality, and finishing impacts
770
- pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
771
- pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
772
- pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
773
-
774
- #Convert impacts to totals
775
- #Calculate shot rate, shot quality, and finishing impacts
776
- pos[f'{group[0]}-SRI-T'] = (pos[f'{group[0]}-SRI']/60)*pos['TOI']
777
- pos[f'{group[0]}-SQI-T'] = (pos[f'{group[0]}-SQI']/60)*pos['TOI']
778
- pos[f'{group[0]}-FNI-T'] = (pos[f'{group[0]}-FNI']/60)*pos['TOI']
779
-
780
- #Calculate On-Ice Involvement Percentiles
781
- pos['Fi/F'] = pos['FC%'].rank(pct=True)
782
- pos['xGi/F'] = pos['xGC%'].rank(pct=True)
783
- pos['Pi/F'] = pos['GI%'].rank(pct=True)
784
- pos['Gi/F'] = pos['GC%'].rank(pct=True)
785
- pos['RushFi/60'] = (pos['Rush']/pos['TOI'])*60
786
- pos['RushxGi/60'] = (pos['Rush xG']/pos['TOI'])*60
787
- pos['RushesxGi'] = pos['RushxGi/60'].rank(pct=True)
788
- pos['RushesFi'] = pos['RushFi/60'].rank(pct=True)
789
-
790
- #Rank per 60 stats
791
- for stat in per_sixty:
792
- pos[f'{stat}/60-P'] = pos[f'{stat}/60'].rank(pct=True)
793
-
794
- #Flip percentiles for against stats
795
- for stat in ['FA','xGA','GA','CA','HA','Give','Penl','Penl2','Penl5']:
796
- pos[f'{stat}/60-P'] = 1-pos[f'{stat}/60-P']
797
-
798
- #Add positions back together
799
- complete = pd.concat([forwards,defensemen])
800
-
801
- #Flip against metric percentiles
802
- complete['ODEF-SR'] = 1-complete['ODEF-SR']
803
- complete['ODEF-SQ'] = 1-complete['ODEF-SQ']
804
- complete['ODEF-FN'] = 1-complete['ODEF-FN']
805
-
806
- #Extraneous Values
807
- complete['EGi'] = complete['INDV-SRI']+complete['INDV-SQI']+complete['INDV-FNI']
808
- complete['ExGi'] = complete['INDV-SRI']+complete['INDV-SQI']
809
- complete['EGF'] = complete['OOFF-SRI']+complete['OOFF-SQI']+complete['OOFF-FNI']
810
- complete['ExGF'] = complete['OOFF-SRI']+complete['OOFF-SQI']
811
- complete['EGA'] = complete['ODEF-SRI']+complete['ODEF-SQI']+complete['ODEF-FNI']
812
- complete['ExGA'] = complete['ODEF-SRI']+complete['ODEF-SQI']
813
-
814
- #...and their percentiles
815
- complete['EGi-P'] = complete['EGi'].rank(pct=True)
816
- complete['ExGi-P'] = complete['ExGi'].rank(pct=True)
817
- complete['EGF-P'] = complete['EGF'].rank(pct=True)
818
- complete['ExGF-P'] = complete['ExGF'].rank(pct=True)
819
- complete['EGA-P'] = complete['EGA'].rank(pct=True)
820
- complete['ExGA-P'] = complete['ExGA'].rank(pct=True)
821
-
822
- complete['EGA-P'] = 1-complete['EGA']
823
- complete['ExGA-P'] = 1-complete['ExGA']
824
-
825
- #...and then their totals
826
- complete['EGi-T'] = (complete['EGi']/60)*complete['TOI']
827
- complete['ExGi-T'] = (complete['ExGi']/60)*complete['TOI']
828
- complete['EGF-T'] = (complete['EGF']/60)*complete['TOI']
829
- complete['ExGF-T'] = (complete['ExGF']/60)*complete['TOI']
830
- complete['EGA-T'] = (complete['EGA']/60)*complete['TOI']
831
- complete['ExGA-T'] = (complete['ExGA']/60)*complete['TOI']
832
-
833
- #Goal Composites...
834
- complete['LiEG'] = complete['EGF'] - complete['EGi']
835
- complete['LiExG'] = complete['ExGF'] - complete['ExGi']
836
- complete['LiGIn'] = complete['LiEG']*complete['AC%']
837
- complete['LixGIn'] = complete['LiExG']*complete['AC%']
838
- complete['ALiGIn'] = complete['LiGIn']-complete['LixGIn']
839
- complete['CompGI'] = complete['EGi'] + complete['LiGIn']
840
- complete['LiRelGI'] = complete['CompGI'] - (complete['EGF']-complete['CompGI'])
841
- complete['NetGI'] = complete['EGF'] - complete['EGA']
842
- complete['NetxGI'] = complete['ExGF'] - complete['ExGA']
843
-
844
- #...and their percentiles
845
- complete['LiEG-P'] = complete['LiEG'].rank(pct=True)
846
- complete['LiExG-P'] = complete['LiExG'].rank(pct=True)
847
- complete['LiGIn-P'] = complete['LiGIn'].rank(pct=True)
848
- complete['LixGIn-P'] = complete['LixGIn'].rank(pct=True)
849
- complete['ALiGIn-P'] = complete['ALiGIn'].rank(pct=True)
850
- complete['CompGI-P'] = complete['CompGI'].rank(pct=True)
851
- complete['LiRelGI-P'] = complete['LiRelGI'].rank(pct=True)
852
- complete['NetGI-P'] = complete['NetGI'].rank(pct=True)
853
- complete['NetxGI-P'] = complete['NetxGI'].rank(pct=True)
854
-
855
- #..and then their totals
856
- complete['LiEG-T'] = (complete['LiEG']/60)*complete['TOI']
857
- complete['LiExG-T'] = (complete['LiExG']/60)*complete['TOI']
858
- complete['LiGIn-T'] = (complete['LiGIn']/60)*complete['TOI']
859
- complete['LixGIn-T'] = (complete['LixGIn']/60)*complete['TOI']
860
- complete['ALiGIn-T'] = (complete['ALiGIn']/60)*complete['TOI']
861
- complete['CompGI-T'] = (complete['CompGI']/60)*complete['TOI']
862
- complete['LiRelGI-T'] = (complete['LiRelGI']/60)*complete['TOI']
863
- complete['NetGI-T'] = (complete['NetGI']/60)*complete['TOI']
864
- complete['NetxGI-T'] = (complete['NetxGI']/60)*complete['TOI']
865
-
866
- #Add back skaters with less than 150 minutes TOI
867
- df = pd.concat([complete,non_players]).drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Player','Season','Team','ID'])
868
- #Return: skater stats with shooting impacts
869
- return df
870
-
871
def _report_elapsed(start):
    #Print time elapsed since 'start' in seconds (or minutes when over a minute)
    length = time.perf_counter() - start
    unit = 'seconds' if length < 60 else 'minutes'
    print(f'...finished in {(length if length < 60 else length/60):.2f} {unit}.')

def _attach_player_info(complete,roster_path,name_col):
    #Merge roster biographical info onto an aggregated frame
    # param 'complete' - aggregated stats frame with an 'ID' column
    # param 'roster_path' - path to roster file
    # param 'name_col' - label to give the 'fullName' column ('Goalie' or 'Player')
    rosters = pd.read_csv(roster_path)
    names = rosters[['id','fullName',
                     'headshot','positionCode','shootsCatches',
                     'heightInInches','weightInPounds',
                     'birthDate','birthCountry']].drop_duplicates(subset=['id','fullName'],keep='last')

    #Add names
    complete = pd.merge(complete,names,how='left',left_on='ID',right_on='id')

    #Rename if there are no missing names
    complete = complete.rename(columns={'fullName':name_col,
                                        'headshot':'Headshot',
                                        'positionCode':'Position',
                                        'shootsCatches':'Handedness',
                                        'heightInInches':'Height (in)',
                                        'weightInPounds':'Weight (lbs)',
                                        'birthDate':'Birthday',
                                        'birthCountry':'Nationality'})

    #Add player age (season end year minus birth year)
    complete['Birthday'] = pd.to_datetime(complete['Birthday'])
    complete['season_year'] = complete['Season'].astype(str).str[4:8].astype(int)
    complete['Age'] = complete['season_year'] - complete['Birthday'].dt.year

    #Find player headshot
    complete['Headshot'] = 'https://assets.nhle.com/mugs/nhl/'+complete['Season'].astype(str)+'/'+complete['Team']+'/'+complete['ID'].astype(int).astype(str)+'.png'

    return complete

def nhl_calculate_stats(pbp,type,season_types,game_strength,split_game=False,roster_path=default_roster,shot_impact=False):
    #Given play-by-play, seasonal information, game_strength, rosters, and xG model, return aggregated stats
    # param 'pbp' - play-by-play dataframe
    # param 'type' - type of stats to calculate ('skater', 'goalie', or 'team')
    # param 'season_types' - list of season types (preseason, regular season, or playoffs) to include in aggregation
    # param 'game_strength' - list of game_strengths to include in aggregation
    # param 'split_game' - boolean which if true groups aggregation by game
    # param 'roster_path' - path to roster file
    # param 'shot_impact' - boolean determining if the shot impact model will be applied to the dataset

    print(f"Calculating statistics for all games in the provided play-by-play data at {game_strength} for {type}s...\nSeasons included: {pbp['season'].drop_duplicates().to_list()}...")
    start = time.perf_counter()

    #Apply xG model if the play-by-play does not already carry an xG column
    if 'xG' not in pbp.columns:
        pbp = wsba_xG(pbp)

    #Apply season_type filter
    pbp = pbp.loc[(pbp['season_type'].isin(season_types))]

    #Convert all columns with player ids to float in order to avoid merging errors
    for col in get_col():
        if "_id" in col:
            try: pbp[col] = pbp[col].astype(float)
            except KeyError: continue

    #Split by game if specified
    second_group = ['season','game_id'] if split_game else ['season']

    #Split calculation
    if type == 'goalie':
        complete = calc_goalie(pbp,game_strength,second_group)

        #Set TOI to minutes
        complete['TOI'] = complete['TOI']/60

        #Add per 60 stats
        for stat in ['FF','FA','xGF','xGA','GF','GA','CF','CA','GSAx']:
            complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60

        complete['GF%'] = complete['GF']/(complete['GF']+complete['GA'])
        complete['xGF%'] = complete['xGF']/(complete['xGF']+complete['xGA'])
        complete['FF%'] = complete['FF']/(complete['FF']+complete['FA'])
        complete['CF%'] = complete['CF']/(complete['CF']+complete['CA'])

        #Remove entries with no ID listed
        complete = complete.loc[complete['ID'].notna()]

        #Import rosters and player info
        complete = _attach_player_info(complete,roster_path,'Goalie')

        #WSBA ID
        complete['WSBA'] = complete['Goalie']+complete['Team']+complete['Season'].astype(str)

        head = ['Goalie','ID','Game'] if 'Game' in complete.columns else ['Goalie','ID']
        complete = complete[head+[
            "Season","Team",'WSBA',
            'Headshot','Position','Handedness',
            'Height (in)','Weight (lbs)',
            'Birthday','Age','Nationality',
            'GP','TOI',
            "GF","FF","xGF","xGF/FF","GF/xGF","FshF%",
            "GA","FA","xGA","xGA/FA","GA/xGA","FshA%",
            'CF','CA',
            'GSAx',
            'RushF','RushA','RushFxG','RushAxG','RushFG','RushAG'
        ]+[f'{stat}/60' for stat in ['FF','FA','xGF','xGA','GF','GA','CF','CA','GSAx']]]

        #Apply shot impacts if necessary
        if shot_impact:
            complete = nhl_shooting_impacts(complete,'goalie')

        _report_elapsed(start)

        return complete

    elif type == 'team':
        complete = calc_team(pbp,game_strength,second_group)

        #WSBA ID
        complete['WSBA'] = complete['Team']+complete['Season'].astype(str)

        #Set TOI to minutes
        complete['TOI'] = complete['TOI']/60

        #Add per 60 stats (only the on-ice portion of per_sixty applies to teams)
        for stat in per_sixty[10:]:
            complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60

        complete['GF%'] = complete['GF']/(complete['GF']+complete['GA'])
        complete['xGF%'] = complete['xGF']/(complete['xGF']+complete['xGA'])
        complete['FF%'] = complete['FF']/(complete['FF']+complete['FA'])
        complete['CF%'] = complete['CF']/(complete['CF']+complete['CA'])

        head = ['Team','Game'] if 'Game' in complete.columns else ['Team']
        complete = complete[head+[
            'Season','WSBA',
            'GP','TOI',
            "GF","FF","xGF","xGF/FF","GF/xGF","FshF%",
            "GA","FA","xGA","xGA/FA","GA/xGA","FshA%",
            'CF','CA',
            'GF%','FF%','xGF%','CF%',
            'HF','HA','HF%',
            'Penl','Penl2','Penl5','PIM','Draw','PENL%',
            'Give','Take','PM%',
            'Block',
            'RushF','RushA','RushFxG','RushAxG','RushFG','RushAG'
        ]+[f'{stat}/60' for stat in per_sixty[10:]]]

        #Apply shot impacts if necessary
        if shot_impact:
            complete = nhl_shooting_impacts(complete,'team')

        _report_elapsed(start)

        return complete
    else:
        indv_stats = calc_indv(pbp,game_strength,second_group)
        onice_stats = calc_onice(pbp,game_strength,second_group)

        #IDs sometimes set as objects
        indv_stats['ID'] = indv_stats['ID'].astype(float)
        onice_stats['ID'] = onice_stats['ID'].astype(float)

        #Merge and add columns for extra stats
        complete = pd.merge(indv_stats,onice_stats,how="outer",on=['ID','Team','Season']+(['Game'] if 'game_id' in second_group else []))
        complete['GC%'] = complete['Gi']/complete['GF']
        complete['AC%'] = (complete['A1']+complete['A2'])/complete['GF']
        complete['GI%'] = (complete['Gi']+complete['A1']+complete['A2'])/complete['GF']
        complete['FC%'] = complete['Fi']/complete['FF']
        complete['xGC%'] = complete['xGi']/complete['xGF']
        complete['GF%'] = complete['GF']/(complete['GF']+complete['GA'])
        complete['xGF%'] = complete['xGF']/(complete['xGF']+complete['xGA'])
        complete['FF%'] = complete['FF']/(complete['FF']+complete['FA'])
        complete['CF%'] = complete['CF']/(complete['CF']+complete['CA'])

        #Remove entries with no ID listed
        complete = complete.loc[complete['ID'].notna()]

        #Import rosters and player info
        complete = _attach_player_info(complete,roster_path,'Player')

        #Set TOI to minutes
        complete['TOI'] = complete['TOI']/60

        #Remove goalies that occasionally appear in a set
        complete = complete.loc[complete['Position']!='G']
        #Add WSBA ID
        complete['WSBA'] = complete['Player']+complete['Season'].astype(str)+complete['Team']

        #Add per 60 stats
        for stat in per_sixty:
            complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60

        #Shot Type Metrics (renamed loop var; original clobbered the 'type' parameter)
        type_metrics = []
        for shot_type in shot_types:
            for stat in per_sixty[:3]:
                type_metrics.append(f'{shot_type.capitalize()}{stat}')

        head = ['Player','ID','Game'] if 'Game' in complete.columns else ['Player','ID']
        complete = complete[head+[
            "Season","Team",'WSBA',
            'Headshot','Position','Handedness',
            'Height (in)','Weight (lbs)',
            'Birthday','Age','Nationality',
            'GP','TOI',
            "Gi","A1","A2",'P1','P',
            'Give','Take','PM%','HF','HA','HF%',
            "Fi","xGi",'xGi/Fi',"Gi/xGi","Fshi%",
            "GF","FF","xGF","xGF/FF","GF/xGF","FshF%",
            "GA","FA","xGA","xGA/FA","GA/xGA","FshA%",
            'Ci','CF','CA','CF%',
            'FF%','xGF%','GF%',
            'Rush',"Rush xG",'Rush G',"GC%","AC%","GI%","FC%","xGC%",
            'F','FW','FL','F%',
            'Penl','Penl2','Penl5',
            'Draw','PIM','PENL%',
            'Block',
            'OZF','NZF','DZF',
            'OZF%','NZF%','DZF%',
        ]+[f'{stat}/60' for stat in per_sixty]+type_metrics].fillna(0).sort_values(['Player','Season','Team','ID'])

        #Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
        if shot_impact:
            complete = nhl_shooting_impacts(complete,'skater')

        _report_elapsed(start)

        return complete
1127
def nhl_plot_skaters_shots(pbp,skater_dict,strengths,marker_dict=event_markers,onice = 'indv',title = True,legend=False):
    #Returns dict of plots for specified skaters
    # param 'pbp' - pbp to plot data
    # param 'skater_dict' - skaters to plot shots for (format: {'Patrice Bergeron':['20242025','BOS']})
    # param 'strengths' - strengths to include in plotting
    # param 'marker_dict' - dict with markers to use for events
    # param 'onice' - can set which shots to include in plotting for the specified skater ('indv', 'for', 'against')
    # param 'title' - bool including title when true
    # param 'legend' - bool which includes legend if true

    print(f'Plotting the following skater shots: {skater_dict}...')

    #Iterate through skaters, adding plots to dict
    skater_plots = {}
    for skater, skater_info in skater_dict.items():
        #Use a separate local for the formatted title; the original rebound the
        #'title' flag itself, corrupting the boolean for later iterations
        plot_title = f'{skater} Fenwick Shots for {skater_info[1]} in {skater_info[0][2:4]}-{skater_info[0][6:8]}' if title else ''
        #Key is formatted as PLAYERSEASONTEAM (i.e. PATRICE BERGERON20212022BOS)
        skater_plots[f'{skater}{skater_info[0]}{skater_info[1]}'] = [plot_skater_shots(pbp,skater,skater_info[0],skater_info[1],strengths,plot_title,marker_dict,onice,legend)]

    #Return: dict of plotted skater shot charts
    return skater_plots
1151
def nhl_plot_games(pbp,events,strengths,game_ids='all',marker_dict=event_markers,team_colors=None,legend=False):
    #Returns dict of plots for specified games
    # param 'pbp' - pbp to plot data
    # param 'events' - type of events to plot
    # param 'strengths' - strengths to include in plotting
    # param 'game_ids' - games to plot (list if not set to 'all')
    # param 'marker_dict' - dict with colors to use for events
    # param 'team_colors' - dict of team color choices (defaults to primary for both sides)
    # param 'legend' - bool which includes legend if true

    #Avoid a shared mutable default argument; build the default per call
    if team_colors is None:
        team_colors = {'away':'primary','home':'primary'}

    #Find games to plot
    if game_ids == 'all':
        game_ids = pbp['game_id'].drop_duplicates().to_list()

    print(f'Plotting the following games: {game_ids}...')

    game_plots = {}
    #Iterate through games, adding plot to dict
    for game in game_ids:
        game_plots[game] = [plot_game_events(pbp,game,events,strengths,marker_dict,team_colors,legend)]

    #Return: dict of plotted game events
    return game_plots
1175
def repo_load_rosters(seasons = []):
    #Returns roster data from repository
    # param 'seasons' - list of seasons to include (empty list loads every season)

    rosters = pd.read_csv(default_roster)

    #Only filter when specific seasons were requested
    if seasons:
        rosters = rosters.loc[rosters['season'].isin(seasons)]

    return rosters
1185
def repo_load_schedule(seasons = []):
    #Returns schedule data from repository
    # param 'seasons' - list of seasons to include (empty list loads every season)

    schedule = pd.read_csv(schedule_path)

    #Only filter when specific seasons were requested
    if seasons:
        schedule = schedule.loc[schedule['season'].isin(seasons)]

    return schedule
1195
def repo_load_teaminfo():
    #Returns team data from repository

    #Read the packaged team info CSV and hand it straight back
    info = pd.read_csv(info_path)
    return info
1200
def repo_load_pbp(seasons = []):
    #Returns play-by-play data from repository
    # param 'seasons' - list of seasons to include

    print(f'Loading play-by-play from the following seasons: {seasons}...')

    #Guard the empty case: pd.concat raises ValueError on an empty list,
    #while the sibling repo_load_* functions accept an empty 'seasons'
    if not seasons:
        return pd.DataFrame()

    #Add parquet to total
    dfs = [pd.read_parquet(f"https://weakside-breakout.s3.us-east-2.amazonaws.com/pbp/{season}.parquet") for season in seasons]

    return pd.concat(dfs)
1210
def repo_load_seasons():
    #Returns the list of seasons available to scrape

    #Expose the module-level season list
    available = seasons
    return available