wsba-hockey 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +858 -377
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +374 -0
- wsba_hockey/wsba_main.py +725 -123
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/METADATA +49 -11
- wsba_hockey-0.1.4.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.1.dist-info/RECORD +0 -8
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/top_level.txt +0 -0
wsba_hockey/wsba_main.py
CHANGED
@@ -1,66 +1,230 @@
|
|
1
1
|
import requests as rs
|
2
2
|
import pandas as pd
|
3
3
|
import numpy as np
|
4
|
-
from datetime import datetime, timedelta
|
5
|
-
|
4
|
+
from datetime import datetime, timedelta, date
|
5
|
+
import time
|
6
|
+
import random
|
7
|
+
from tools.scraping import *
|
8
|
+
from tools.xg_model import *
|
9
|
+
from tools.agg import *
|
10
|
+
from tools.plotting import *
|
6
11
|
|
7
12
|
### WSBA HOCKEY ###
|
8
13
|
## Provided below are all integral functions in the WSBA Hockey Python package. ##
|
9
14
|
|
15
|
+
## GLOBAL VARIABLES ##
|
16
|
+
#Every season identifier handled by the package: start year followed by end year,
#from 2007-08 through 2024-25
seasons = [f'{year}{year + 1}' for year in range(2007, 2025)]

#Lookup from a season's starting year (as a string) to its full identifier
convert_seasons = {season[:4]: season for season in seasons}

#Team abbreviation replacements (applied to play-by-play with DataFrame.replace)
convert_team_abbr = {
    'L.A':'LAK',
    'N.J':'NJD',
    'S.J':'SJS',
    'T.B':'TBL',
    'PHX':'ARI',
}

#Stats which receive a per-sixty-minute version; the slice [7:13] holds the on-ice (team-level) stats
per_sixty = [
    'Fi','xGi','Gi','A1','A2','P1','P',
    'FF','FA','xGF','xGA','GF','GA',
]

#Some games in the API are specifically known to cause errors in scraping.
#Maps game id -> explanation printed when that game fails to scrape.
#This list is updated as frequently as necessary
known_probs = {
    '2007020011':'Missing shifts data for game between Chicago and Minnesota.',
    '2007021178':'Game between the Bruins and Sabres is missing data after the second period, for some reason.',
    '2008020259':'HTML data is completely missing for this game.',
    '2008020409':'HTML data is completely missing for this game.',
    '2008021077':'HTML data is completely missing for this game.',
    '2009020081':'HTML pbp for this game between Pittsburgh and Carolina is missing all but the period start and first faceoff events, for some reason.',
    '2009020658':'Missing shifts data for game between New York Islanders and Dallas.',
    '2009020885':'Missing shifts data for game between Sharks and Blue Jackets.',
    '2010020124':'Game between Capitals and Hurricanes is sporadically missing player on-ice data',
    '2013020971':'On March 10th, 2014, Stars forward Rich Peverley suffered from a cardiac episode midgame and as a result, the remainder of the game was postponed. \nThe game resumed on April 9th, and the only goal scorer in the game, Blue Jackets forward Nathan Horton, did not appear in the resumed game due to injury. Interestingly, Horton would never play in the NHL again.',
    '2018021133':'Game between Lightning and Capitals has incorrectly labeled event teams (i.e. WSH TAKEAWAY - #71 CIRELLI (Cirelli is a Tampa Bay skater in this game)).',
    '2019020876':'Due to the frightening collapse of Blues defensemen Jay Bouwmeester, a game on February 2nd, 2020 between the Ducks and Blues was postponed. \nWhen the game resumed, Ducks defensemen Hampus Lindholm, who assisted on a goal in the inital game, did not play in the resumed match.',
}

#Placeholder mapping; currently holds only an empty entry
name_change = {"": ""}

shot_types = [
    'wrist','deflected','tip-in','slap','backhand','snap',
    'wrap-around','poke','bat','cradle','between-legs',
]

#Year used for the playoff bracket when standings are requested with arg "now"
new = 2024

#Month-day suffix (MM-DD) joined to a season's end year to form the standings date for that season
standings_end = {
    '20072008':'04-06',
    '20082009':'04-12',
    '20092010':'04-11',
    '20102011':'04-10',
    '20112012':'04-07',
    '20122013':'04-28',
    '20132014':'04-13',
    '20142015':'04-11',
    '20152016':'04-10',
    '20162017':'04-09',
    '20172018':'04-08',
    '20182019':'04-06',
    '20192020':'03-11',
    '20202021':'05-19',
    '20212022':'04-01',
    '20222023':'04-14',
    '20232024':'04-18',
    '20242025':'04-17',
}
|
109
|
+
|
10
110
|
## SCRAPE FUNCTIONS ##
|
11
|
-
def nhl_scrape_game(game_ids,split_shifts = False,remove = ['period-start','period-end','challenge','stoppage']):
|
111
|
+
def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','period-end','challenge','stoppage'],verbose = False, errors = False):
    #Given a set of game_ids (NHL API), return complete play-by-play information as requested
    # param 'game_ids' - NHL game ids (or list formatted as ['random', num_of_games, start_year, end_year])
    # param 'split_shifts' - boolean which splits pbp and shift events if true
    # param 'remove' - list of events to remove from final dataframe
    # param 'verbose' - boolean which adds additional event info if true
    # param 'errors' - boolean returning game ids which did not scrape if true
    #
    # Returns:
    #   - an empty DataFrame if no game could be scraped
    #   - a dict with 'pbp' and 'shifts' (plus 'errors' if requested) when split_shifts is true
    #   - a dict with 'pbp' and 'errors' when only errors is true
    #   - otherwise the play-by-play DataFrame itself

    #Work on a copy: 'change' is appended below when splitting shifts, and that must not
    #leak into the shared default list or into a list owned by the caller
    remove = list(remove)

    pbps = []
    if len(game_ids) > 0 and game_ids[0] == 'random':
        #Randomize selection of game_ids
        #Some ids returned may be invalid (for example, 2020021300)
        request = game_ids
        num = request[1]
        first_year = request[2] if len(request) > 2 else 2007
        last_year = request[3] if len(request) > 3 else (date.today().year)-1

        game_ids = []
        print("Finding valid, random game ids...")
        #NOTE: there is no retry cap; this loops until 'num' ids have answered with JSON
        while len(game_ids) < num:
            print(f"\rGame IDs found in range {first_year}-{last_year}: {len(game_ids)}/{num}",end="")
            rand_year = random.randint(first_year,last_year)
            rand_season_type = random.randint(2,3)
            rand_game = random.randint(1,1312)

            #Ensure id validity (and that number of scraped games is equal to specified value)
            rand_id = f'{rand_year}{rand_season_type:02d}{rand_game:04d}'
            try:
                rs.get(f"https://api-web.nhle.com/v1/gamecenter/{rand_id}/play-by-play").json()
            except (rs.exceptions.RequestException, ValueError):
                #Request failed or the body was not JSON: treat the id as invalid and draw again
                continue
            game_ids.append(rand_id)

        print(f"\rGame IDs found in range {first_year}-{last_year}: {len(game_ids)}/{num}")

    #Scrape each game
    #Track Errors
    error_ids = []
    for game_id in game_ids:
        print("Scraping data from game " + str(game_id) + "...",end="")
        began = time.perf_counter()

        try:
            #Retrieve data
            info = get_game_info(game_id)
            data = combine_data(info)
        except Exception:
            #Games such as the all-star game and pre-season games will incur this error
            #Other games have known problems
            #(Exception rather than a bare except so that Ctrl-C still stops a long scrape)
            problem = known_probs.get(str(game_id))
            if problem is not None:
                print(f"\nGame {game_id} has a known problem: {problem}")
            else:
                print(f"\nUnable to scrape game {game_id}.  Ensure the ID is properly inputted and formatted.")

            #Track error
            error_ids.append(game_id)
            continue

        #Append data to list
        pbps.append(data)

        secs = time.perf_counter() - began
        print(f" finished in {secs:.2f} seconds.")

    #Add all pbps together
    if not pbps:
        print("\rNo data returned.")
        return pd.DataFrame()
    df = pd.concat(pbps)

    #If verbose is true features required to calculate xG are added to dataframe
    if verbose:
        df = prep_xG_data(df)

    #Print final message
    if error_ids:
        print(f'\rScrape of provided games finished.\nThe following games failed to scrape: {error_ids}')
    else:
        print('\rScrape of provided games finished.')

    #Split pbp and shift events if necessary
    #Return: complete play-by-play with data removed or split as necessary
    if split_shifts:
        #Shift changes go to their own frame, so they are excluded from the pbp frame
        remove.append('change')

        #Return: dict with pbp and shifts separated
        pbp_dict = {"pbp":df.loc[~df['event_type'].isin(remove)],
                    "shifts":df.loc[df['event_type']=='change']
                    }

        if errors:
            pbp_dict.update({'errors':error_ids})

        return pbp_dict

    #Return: all events that are not set for removal by the provided list
    pbp = df.loc[~df['event_type'].isin(remove)]

    if errors:
        return {'pbp':pbp,
                'errors':error_ids}
    return pbp
|
64
228
|
|
65
229
|
def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
|
66
230
|
#Given a season, return schedule data
|
@@ -104,16 +268,18 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
|
|
104
268
|
"id": [gameWeek[i]['id']],
|
105
269
|
"season": [gameWeek[i]['season']],
|
106
270
|
"season_type":[gameWeek[i]['gameType']],
|
271
|
+
"away_team_abbr":[gameWeek[i]['awayTeam']['abbrev']],
|
272
|
+
"home_team_abbr":[gameWeek[i]['homeTeam']['abbrev']],
|
107
273
|
"gamecenter_link":[gameWeek[i]['gameCenterLink']]
|
108
274
|
}))
|
109
275
|
|
110
276
|
#Concatenate all games
|
111
277
|
df = pd.concat(game)
|
112
278
|
|
113
|
-
#Return: specificed schedule data
|
114
|
-
return df
|
279
|
+
#Return: specified schedule data
|
280
|
+
return df
|
115
281
|
|
116
|
-
def nhl_scrape_season(season,split_shifts = False, remove = ['period-start','period-end','
|
282
|
+
def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = "schedule/schedule.csv", verbose = False, errors = False):
    #Given season, scrape all play-by-play occurring within the season
    # param 'season' - NHL season to scrape
    # param 'split_shifts' - boolean which splits pbp and shift events if true
    # param 'season_types' - list of season types to keep from the schedule
    # param 'remove' - list of events to remove from final dataframe
    # param 'start' - Start date in season
    # param 'end' - End date in season
    # param 'local' - boolean indicating whether to use local file to scrape game_ids
    # param 'local_path' - path of local file
    # param 'verbose' - boolean which adds additional event info if true
    # param 'errors' - boolean returning game ids which did not scrape if true
    #
    # Returns "" when the schedule holds no matching games; otherwise the same shapes as
    # nhl_scrape_game (DataFrame, or dict with 'pbp' plus 'shifts'/'errors' as requested)

    #Determine whether to use schedule data in repository or to scrape
    if local == True:
        load = pd.read_csv(local_path)
    else:
        load = nhl_scrape_schedule(season,start,end)
    load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
    game_ids = list(load['id'].astype(str))

    #If no games found, terminate the process
    if not game_ids:
        print('No games found for dates in season...')
        return ""

    print(f"Scraping games from {season[0:4]}-{season[4:8]} season...")
    began = time.perf_counter()

    #Perform scrape
    #A copy of 'remove' is passed so the scraper can never alter this function's default list
    split = split_shifts == True
    data = nhl_scrape_game(game_ids,split_shifts=split,remove=list(remove),verbose=verbose,errors=errors)

    secs = time.perf_counter() - began

    print(f'Finished season scrape in {(secs/60)/60:.2f} hours.')

    #nhl_scrape_game hands back a bare DataFrame in two situations: no split/errors were
    #requested, or no game scraped at all (empty frame). Indexing it with ['pbp'] would raise.
    if isinstance(data, pd.DataFrame):
        if not (split or errors):
            #Return: complete pbp for specified season
            return data
        #Nothing scraped: every requested game failed
        data = {'pbp':data,
                'shifts':pd.DataFrame(),
                'errors':list(game_ids)}

    #Return: Complete pbp and shifts data for specified season as well as game_ids which failed to return data
    pbp_dict = {'pbp':data['pbp']}
    if split:
        pbp_dict['shifts'] = data['shifts']
    if errors:
        pbp_dict['errors'] = data['errors']
    return pbp_dict
|
339
|
+
|
340
|
+
#errors = []
|
341
|
+
#for season in seasons[10:12]:
|
342
|
+
# data = nhl_scrape_season(season,remove=[],local=True,errors=True)
|
343
|
+
# errors.append(data['errors'])
|
344
|
+
# data['pbp'].to_csv(f'pbp/csv/nhl_pbp_{season}.csv',index=False)
|
345
|
+
#print(f'Errors: {errors}')
|
172
346
|
|
173
347
|
def nhl_scrape_seasons_info(seasons = []):
|
174
348
|
#Returns info related to NHL seasons (by default, all seasons are included)
|
@@ -190,19 +364,38 @@ def nhl_scrape_seasons_info(seasons = []):
|
|
190
364
|
else:
|
191
365
|
return df.sort_values(by=['id'])
|
192
366
|
|
193
|
-
def nhl_scrape_standings(arg = "now"):
|
367
|
+
def nhl_scrape_standings(arg = "now", season_type = 2):
    #Returns standings
    # param 'arg' - by default, this is "now" returning active NHL standings.  May also be a specific date formatted as YYYY-MM-DD, a season (scrapes the last standings date for the season) or a year (for playoffs).
    # param 'season_type' - by default, this scrapes the regular season standings.  If set to 3, it returns the playoff bracket for the specified season
    #
    # Returns a DataFrame normalized from the 'series' (playoffs) or 'standings' JSON field

    #arg param is ignored when set to "now" if season_type param is 3
    if season_type == 3:
        if arg == "now":
            arg = new

        print(f"Scraping playoff bracket for date: {arg}")
        api = f"https://api-web.nhle.com/v1/playoff-bracket/{arg}"

        data = rs.get(api).json()['series']

        return pd.json_normalize(data)

    #Each kind of argument needs its own URL suffix; only seasons are looked up in standings_end
    if arg == "now":
        print("Scraping standings as of now...")
        target = "now"
    elif arg in seasons:
        print(f'Scraping standings for season: {arg}')
        #End year of the season joined with the season's final standings date (MM-DD)
        target = f"{arg[4:8]}-{standings_end[arg]}"
    else:
        print(f"Scraping standings for date: {arg}")
        target = arg

    api = f"https://api-web.nhle.com/v1/standings/{target}"
    data = rs.get(api).json()['standings']

    return pd.json_normalize(data)
|
396
|
+
|
397
|
+
#stand = [nhl_scrape_standings(season) for season in seasons]
|
398
|
+
#pd.concat(stand).to_csv('teaminfo/nhl_standings.csv',index=False)
|
206
399
|
|
207
400
|
def nhl_scrape_roster(season):
|
208
401
|
#Given a nhl season, return rosters for all participating teams
|
@@ -235,48 +428,457 @@ def nhl_scrape_roster(season):
|
|
235
428
|
|
236
429
|
return pd.concat(rosts)
|
237
430
|
|
238
|
-
def
|
239
|
-
#Given
|
240
|
-
|
431
|
+
def nhl_scrape_prospects(team):
    #Given team abbreviation, retrieve current team prospects
    # param 'team' - team abbreviation placed into the prospects endpoint URL

    response = rs.get(f'https://api-web.nhle.com/v1/prospects/{team}')
    payload = response.json()

    #One frame per position group, stacked in the order forwards, defensemen, goalies
    frames = []
    for position in ('forwards','defensemen','goalies'):
        frames.append(pd.json_normalize(payload[position]))
    prospects = pd.concat(frames)

    #Add name columns (upper-cased "FIRST LAST")
    first = prospects['firstName.default']
    last = prospects['lastName.default']
    prospects['fullName'] = (first+" "+last).str.upper()

    #Return: team prospects
    return prospects
|
250
447
|
|
251
|
-
|
448
|
+
def nhl_scrape_team_info(country = False):
    #Given option to return franchise or country, return team information
    # param 'country' - boolean; if true the 'country' endpoint is scraped instead of 'team'
    #
    # Returns a sorted DataFrame normalized from the response's 'data' field

    print('Scraping team information...')
    #Pick the endpoint outside the f-string: reusing the same quote character inside an
    #f-string replacement field is a syntax error on Python versions before 3.12
    endpoint = 'country' if country else 'team'
    api = f'https://api.nhle.com/stats/rest/en/{endpoint}'

    data = pd.json_normalize(rs.get(api).json()['data'])

    #Add logos if necessary
    if not country:
        data['logo_light'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_light.svg'
        data['logo_dark'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_dark.svg'

    if country:
        sort_cols = ['country3Code','countryCode','iocCode','countryName']
    else:
        sort_cols = ['fullName','triCode','id']

    return data.sort_values(by=sort_cols)
|
462
|
+
|
463
|
+
def nhl_scrape_player_data(player_id):
    #Given player id, return player information
    # param 'player_id' - id placed into the player landing endpoint URL

    response = rs.get(f'https://api-web.nhle.com/v1/player/{player_id}/landing')
    data = pd.json_normalize(response.json())

    #Add name column (upper-cased "FIRST LAST")
    first = data['firstName.default']
    last = data['lastName.default']
    data['fullName'] = (first + " " + last).str.upper()

    #Return: player data
    return data
|
474
|
+
|
475
|
+
def nhl_scrape_draft_rankings(arg = 'now', category = ''):
    #Given url argument for timeframe and prospect category, return draft rankings
    #Category 1 is North American Skaters
    #Category 2 is International Skaters
    #Category 3 is North American Goalie
    #Category 4 is International Goalie

    #Player category only applies when requesting a specific season
    #The category is appended as an extra path segment only when one was provided
    api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
    if category != "":
        api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}"

    rankings = rs.get(api).json()['rankings']
    data = pd.json_normalize(rankings)

    #Add player name columns (upper-cased "FIRST LAST")
    names = data['firstName']+" "+data['lastName']
    data['fullName'] = names.str.upper()

    #Return: prospect rankings
    return data
|
491
|
+
|
492
|
+
def nhl_shooting_impacts(agg,team=False):
    #Given stats table generated from the nhl_calculate_stats function, return table with shot impacts
    #Only 5v5 is supported as of now

    #param 'agg' - stats table (left unmodified; a new table is returned)
    #param 'team' - boolean determining if team stats are calculated instead of skater stats

    #COMPOSITE IMPACT EVALUATIONS:

    #SR = Shot Rate
    #SQ = Shot Quality
    #FN = Finishing

    #I = Impact

    #INDV = Individual
    #OOFF = On-Ice Offense
    #ODEF = On-Ice Defense

    #Grouping-Metric Code: XXXX-YYI

    if team:
        #Copy so the new columns are not written into the caller's table
        pos = agg.copy()
        for label, suffix in [('OOFF','F'),('ODEF','A')]:
            _add_group_impacts(pos,label,suffix)

        #Add extra metrics: rush events per sixty minutes and their percentiles
        for raw, shown in [('','F'),('xG','xG'),('G','G')]:
            pos[f'RushF{raw}/60'] = (pos[f'RushF{raw}']/pos['TOI'])*60
            pos[f'RushA{raw}/60'] = (pos[f'RushA{raw}']/pos['TOI'])*60
            pos[f'Rushes {shown}F'] = pos[f'RushF{raw}/60'].rank(pct=True)
            pos[f'Rushes {shown}A'] = pos[f'RushA{raw}/60'].rank(pct=True)

        #Flip against metric percentiles
        _flip_against(pos)

        #Return: team stats with shooting impacts
        return pos.sort_values(['Season','Team'])

    #Remove skaters with less than 150 minutes of TOI then split between forwards and dmen
    #(explicit copies so column assignment below never targets a view of 'agg')
    agg = agg.loc[agg['TOI']>=150]
    forwards = agg.loc[agg['Position']!='D'].copy()
    defensemen = agg.loc[agg['Position']=='D'].copy()

    #Loop through both positions, all groupings (INDV, OOFF, and ODEF) generating impacts
    for pos in [forwards,defensemen]:
        for label, suffix in [('INDV','i'),('OOFF','F'),('ODEF','A')]:
            _add_group_impacts(pos,label,suffix)

        #Calculate On-Ice Involvement Percentiles
        pos['Fenwick'] = pos['FC%'].rank(pct=True)
        pos['xG'] = pos['xGC%'].rank(pct=True)
        pos['Goal Factor'] = pos['GI%'].rank(pct=True)
        pos['Goal Scoring'] = pos['GC%'].rank(pct=True)
        pos['Rush/60'] = (pos['Rush']/pos['TOI'])*60
        pos['RushxG/60'] = (pos['Rush xG']/pos['TOI'])*60
        pos['Rushes xG'] = pos['RushxG/60'].rank(pct=True)
        pos['Rushes FF'] = pos['Rush/60'].rank(pct=True)

    #Add positions back together
    complete = pd.concat([forwards,defensemen])

    #Flip against metric percentiles
    _flip_against(complete)

    #Extraneous Values
    complete['Extraneous Gi'] = complete['INDV-SRI']+complete['INDV-SQI']+complete['INDV-FNI']
    complete['Extraneous xGi'] = complete['INDV-SRI']+complete['INDV-SQI']
    complete['Extraneous GF'] = complete['OOFF-SRI']+complete['OOFF-SQI']+complete['OOFF-FNI']
    complete['Extraneous xGF'] = complete['OOFF-SRI']+complete['OOFF-SQI']
    complete['Extraneous GA'] = complete['ODEF-SRI']+complete['ODEF-SQI']+complete['ODEF-FNI']
    complete['Extraneous xGA'] = complete['ODEF-SRI']+complete['ODEF-SQI']

    #Goal Composites
    complete['Linemate Extraneous Goals'] = complete['Extraneous GF'] - complete['Extraneous Gi']
    complete['Linemate Goal Induction'] = complete['Linemate Extraneous Goals']*complete['AC%']
    complete['Composite Goal Impact'] = complete['Extraneous Gi'] + complete['Linemate Goal Induction']
    complete['Linemate Rel. Goal Impact'] = complete['Composite Goal Impact'] - (complete['Extraneous GF']-complete['Composite Goal Impact'])
    complete['Net Goal Impact'] = complete['Extraneous GF'] - complete['Extraneous GA']
    complete['Net xGoal Impact'] = complete['Extraneous xGF'] - complete['Extraneous xGA']

    #Return: skater stats with shooting impacts
    return complete.sort_values(['Player','Season','Team','ID'])


def _goal_comp(fenwick,xg_fen,xg,g,fsh):
    #Goal Composition Formula
    #The aggregation of goals is composed of three factors: shot rate, shot quality, and finishing
    #These are represented by their own metrics in which Goals = (Fenwick*(League Average Fenwick SH%)) + ((xGoals/Fenwick - League Average Fenwick SH%)*Fenwick) + (Goals - xGoals)
    #Works on scalars and on whole columns alike (plain arithmetic only)
    rate = fenwick * fsh
    qual = (xg_fen-fsh)*fenwick
    fini = g-xg

    return rate+qual+fini


def _add_group_impacts(frame,label,suffix):
    #Add percentile (-SR/-SQ/-FN) and impact (-SRI/-SQI/-FNI) columns for one grouping to 'frame' in place
    # 'label' is the grouping code (INDV, OOFF, ODEF); 'suffix' selects the stat columns (i, F, A)
    fenwick = frame[f'F{suffix}/60']
    xg = frame[f'xG{suffix}/60']
    g = frame[f'G{suffix}/60']
    xg_fen = frame[f'xG{suffix}/F{suffix}']
    finishing = frame[f'G{suffix}/xG{suffix}']

    #Find average for position in frame
    avg_fen = fenwick.mean()
    avg_xg = xg.mean()
    avg_g = g.mean()
    avg_fsh = avg_g/avg_fen
    avg_xg_fen = avg_xg/avg_fen

    #Calculate composite percentiles
    frame[f'{label}-SR'] = fenwick.rank(pct=True)
    frame[f'{label}-SQ'] = xg_fen.rank(pct=True)
    frame[f'{label}-FN'] = finishing.rank(pct=True)

    #Calculate shot rate, shot quality, and finishing impacts:
    #actual goal rate minus the goal composition with one factor replaced by the frame average
    frame[f'{label}-SRI'] = g - _goal_comp(avg_fen,xg_fen,xg,g,avg_fsh)
    frame[f'{label}-SQI'] = g - _goal_comp(fenwick,avg_xg_fen,xg,g,avg_fsh)
    frame[f'{label}-FNI'] = g - _goal_comp(fenwick,xg_fen,avg_xg,avg_g,avg_fsh)


def _flip_against(frame):
    #Invert the against-side percentiles in place (each becomes 1 minus its value)
    for metric in ('SR','SQ','FN'):
        frame[f'ODEF-{metric}'] = 1-frame[f'ODEF-{metric}']
|
644
|
+
|
645
|
+
def nhl_calculate_stats(pbp,type,season_types,game_strength,roster_path="rosters/nhl_rosters.csv",xg="moneypuck",shot_impact=False):
|
646
|
+
#Given play-by-play, seasonal information, game_strength, rosters, and xG model, return aggregated stats
|
647
|
+
# param 'pbp' - play-by-play dataframe
|
648
|
+
# param 'type' - type of stats to calculate ('skater', 'goaltender', or 'team')
|
649
|
+
# param 'season' - season or timeframe of events in play-by-play
|
650
|
+
# param 'season_types' - list of season types (preseason, regular season, or playoffs) to include in aggregation
|
651
|
+
# param 'game_strength' - list of game_strengths to include in aggregation
|
652
|
+
# param 'roster_path' - path to roster file
|
653
|
+
# param 'xg' - xG model to apply to pbp for aggregation
|
654
|
+
# param 'shot_impact' - boolean determining if the shot impact model will be applied to the dataset
|
655
|
+
|
656
|
+
print(f"Calculating statistics for all games in the provided play-by-play data...\nSeasons included: {pbp['season'].drop_duplicates().to_list()}...")
|
657
|
+
start = time.perf_counter()
|
658
|
+
|
659
|
+
#Add extra data and apply team changes
|
660
|
+
pbp = prep_xG_data(pbp).replace(convert_team_abbr)
|
661
|
+
|
662
|
+
#Check if xG column exists and apply model if it does not
|
663
|
+
try:
|
664
|
+
pbp['xG']
|
665
|
+
except KeyError:
|
666
|
+
if xg == 'wsba':
|
667
|
+
pbp = wsba_xG(pbp)
|
668
|
+
else:
|
669
|
+
pbp = moneypuck_xG(pbp)
|
670
|
+
|
671
|
+
#Filter by season types and remove shootouts
|
672
|
+
pbp = pbp.loc[(pbp['season_type'].isin(season_types)) & (pbp['period'] < 5)]
|
673
|
+
|
674
|
+
# Filter by game strength if not "all"
|
675
|
+
if game_strength != "all":
|
676
|
+
pbp = pbp.loc[pbp['strength_state'].isin(game_strength)]
|
677
|
+
|
678
|
+
#Split calculation
|
679
|
+
if type == 'team':
|
680
|
+
complete = calc_team(pbp)
|
681
|
+
|
682
|
+
#Set TOI to minute
|
683
|
+
complete['TOI'] = complete['TOI']/60
|
684
|
+
|
685
|
+
#Add per 60 stats
|
686
|
+
for stat in per_sixty[7:13]:
|
687
|
+
complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60
|
688
|
+
|
689
|
+
end = time.perf_counter()
|
690
|
+
length = end-start
|
691
|
+
print(f'...finished in {(length if length <60 else length/60):.2f} {'seconds' if length <60 else 'minutes'}.')
|
692
|
+
#Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
|
693
|
+
if shot_impact:
|
694
|
+
return nhl_shooting_impacts(complete,True)
|
695
|
+
else:
|
696
|
+
return complete
|
697
|
+
else:
|
698
|
+
indv_stats = calc_indv(pbp)
|
699
|
+
onice_stats = calc_onice(pbp)
|
700
|
+
|
701
|
+
#IDs sometimes set as objects
|
702
|
+
indv_stats['ID'] = indv_stats['ID'].astype(float)
|
703
|
+
onice_stats['ID'] = onice_stats['ID'].astype(float)
|
704
|
+
|
705
|
+
#Merge and add columns for extra stats
|
706
|
+
complete = pd.merge(indv_stats,onice_stats,how="outer",on=['ID','Team','Season'])
|
707
|
+
complete['GC%'] = complete['Gi']/complete['GF']
|
708
|
+
complete['AC%'] = (complete['A1']+complete['A2'])/complete['GF']
|
709
|
+
complete['GI%'] = (complete['Gi']+complete['A1']+complete['A2'])/complete['GF']
|
710
|
+
complete['FC%'] = complete['Fi']/complete['FF']
|
711
|
+
complete['xGC%'] = complete['xGi']/complete['xGF']
|
712
|
+
|
713
|
+
#Remove entries with no ID listed
|
714
|
+
complete = complete.loc[complete['ID'].notna()]
|
715
|
+
|
716
|
+
#Import rosters and player info
|
717
|
+
rosters = pd.read_csv(roster_path)
|
718
|
+
names = rosters[['id','fullName',
|
719
|
+
'headshot','positionCode','shootsCatches',
|
720
|
+
'heightInInches','weightInPounds',
|
721
|
+
'birthDate','birthCountry']].drop_duplicates(subset=['id','fullName'],keep='last')
|
722
|
+
|
723
|
+
#Add names
|
724
|
+
complete = pd.merge(complete,names,how='left',left_on='ID',right_on='id')
|
725
|
+
|
726
|
+
#Rename if there are no missing names
|
727
|
+
complete = complete.rename(columns={'fullName':'Player',
|
728
|
+
'headshot':'Headshot',
|
729
|
+
'positionCode':'Position',
|
730
|
+
'shootsCatches':'Handedness',
|
731
|
+
'heightInInches':'Height (in)',
|
732
|
+
'weightInPounds':'Weight (lbs)',
|
733
|
+
'birthDate':'Birthday',
|
734
|
+
'birthCountry':'Nationality'})
|
735
|
+
|
736
|
+
#Set TOI to minute
|
737
|
+
complete['TOI'] = complete['TOI']/60
|
738
|
+
|
739
|
+
#Add per 60 stats
|
740
|
+
for stat in per_sixty:
|
741
|
+
complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60
|
742
|
+
|
743
|
+
#Add player age
|
744
|
+
complete['Birthday'] = pd.to_datetime(complete['Birthday'])
|
745
|
+
complete['season_year'] = complete['Season'].astype(str).str[4:8].astype(int)
|
746
|
+
complete['Age'] = complete['season_year'] - complete['Birthday'].dt.year
|
747
|
+
|
748
|
+
#Find player headshot
|
749
|
+
complete['Headshot'] = 'https://assets.nhle.com/mugs/nhl/'+complete['Season'].astype(str)+'/'+complete['Team']+'/'+complete['ID'].astype(int).astype(str)+'.png'
|
750
|
+
|
751
|
+
end = time.perf_counter()
|
752
|
+
length = end-start
|
753
|
+
#Remove goalies that occasionally appear in a set
|
754
|
+
complete = complete.loc[complete['Position']!='G']
|
755
|
+
#Add WSBA ID
|
756
|
+
complete['WSBA'] = complete['Player']+complete['Season'].astype(str)+complete['Team']
|
757
|
+
|
758
|
+
#Shot Type Metrics
|
759
|
+
type_metrics = []
|
760
|
+
for type in shot_types:
|
761
|
+
for stat in per_sixty[:3]:
|
762
|
+
type_metrics.append(f'{type.capitalize()}{stat}')
|
763
|
+
|
764
|
+
complete = complete[[
|
765
|
+
'Player','ID',
|
766
|
+
"Season","Team",'WSBA',
|
767
|
+
'Headshot','Position','Handedness',
|
768
|
+
'Height (in)','Weight (lbs)',
|
769
|
+
'Birthday','Age','Nationality',
|
770
|
+
'GP','TOI',
|
771
|
+
"Gi","A1","A2",'P1','P',
|
772
|
+
"Fi","xGi",'xGi/Fi',"Gi/xGi","Fshi%",
|
773
|
+
"GF","FF","xGF","xGF/FF","GF/xGF","FshF%",
|
774
|
+
"GA","FA","xGA","xGA/FA","GA/xGA","FshA%",
|
775
|
+
'Rush',"Rush xG",'Rush G',"GC%","AC%","GI%","FC%","xGC%",
|
776
|
+
]+[f'{stat}/60' for stat in per_sixty]+type_metrics].fillna(0).sort_values(['Player','Season','Team','ID'])
|
777
|
+
|
778
|
+
print(f'...finished in {(length if length <60 else length/60):.2f} {'seconds' if length <60 else 'minutes'}.')
|
779
|
+
#Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
|
780
|
+
if shot_impact:
|
781
|
+
return nhl_shooting_impacts(complete,False)
|
782
|
+
else:
|
783
|
+
return complete
|
784
|
+
|
785
|
+
#stats = []
|
786
|
+
#for season in seasons[6:18]:
|
787
|
+
# pbp = pd.read_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet')
|
788
|
+
# stat = nhl_calculate_stats(pbp,'skater',[2],['5v5'],shot_impact=True)
|
789
|
+
# stat.to_csv(f'stats/skater/wsba_nhl_{season}.csv',index=False)
|
790
|
+
# stats.append(stat)
|
791
|
+
#pd.concat(stats).to_csv('stats/db/wsba_nhl_skater_db.csv',index=False)
|
792
|
+
|
793
|
+
def nhl_plot_skaters_shots(pbp,skater_dict,strengths,color_dict=event_colors,legend=False,xg='moneypuck'):
    #Builds one Fenwick shot chart per requested skater and returns them as a list
    # param 'pbp' - play-by-play data passed through to plot_skater_shots
    # param 'skater_dict' - maps skater name to [season, team] (format: {'Patrice Bergeron':['20242025','BOS']})
    # param 'strengths' - strengths to include in plotting
    # param 'color_dict' - dict with colors to use for events
    # param 'legend' - bool which includes legend if true
    # param 'xg' - xG model to apply to pbp for plotting

    print(f'Plotting the following skater shots: {skater_dict}...')

    def chart_for(name,info):
        #info[0] is the season string and info[1] the team abbreviation
        season,team = info[0],info[1]
        #Title shows the last two digits of each year in the season (e.g. '20242025' -> '24-25')
        title = f'{name} Fenwick Shots for {team} in {season[2:4]}-{season[6:8]}'
        return plot_skater_shots(pbp,name,season,team,strengths,title,color_dict,legend,xg)

    #Return: list of plotted skater shot charts, one per entry in skater_dict
    return [chart_for(name,info) for name,info in skater_dict.items()]
|
813
|
+
|
814
|
+
def nhl_plot_games(pbp,events,strengths,game_ids='all',color_dict=event_colors,legend=False,xg='moneypuck'):
    #Returns list of plots for specified games
    # param 'pbp' - pbp to plot data
    # param 'events' - type of events to plot
    # param 'strengths' - strengths to include in plotting
    # param 'game_ids' - games to plot (list or other iterable of game ids if not set to 'all')
    # param 'color_dict' - dict with colors to use for events
    # param 'legend' - bool which includes legend if true
    # param 'xg' - xG model to apply to pbp for plotting

    #Find games to plot
    #The isinstance guard matters: comparing a pandas Series or numpy array of ids to 'all'
    #is evaluated element-wise, and using that result in an 'if' raises an ambiguous truth-value error
    if isinstance(game_ids,str) and game_ids == 'all':
        game_ids = pbp['game_id'].drop_duplicates().to_list()

    print(f'Plotting the following games: {game_ids}...')

    #Iterate through games, adding plot to list
    game_plots = [plot_game_events(pbp,game,events,strengths,color_dict,legend,xg) for game in game_ids]

    #Return: list of plotted game events
    return game_plots
|
835
|
+
|
836
|
+
def repo_load_rosters(seasons = []):
    #Loads roster data from the repository csv (path is relative to the working directory)
    # param 'seasons' - list of seasons to include; an empty list keeps every season

    roster_frame = pd.read_csv("rosters/nhl_rosters.csv")

    #Nothing to filter on, so hand back the full table
    if len(seasons) == 0:
        return roster_frame

    #Keep only rows whose 'season' value is one of the requested seasons
    return roster_frame.loc[roster_frame['season'].isin(seasons)]
|
845
|
+
|
846
|
+
def repo_load_schedule(seasons = []):
    #Loads schedule data from the repository csv (path is relative to the working directory)
    # param 'seasons' - list of seasons to include; an empty list keeps every season

    schedule_frame = pd.read_csv("schedule/schedule.csv")

    #Only filter on the 'season' column when at least one season was requested
    wants_filter = len(seasons)>0
    return schedule_frame.loc[schedule_frame['season'].isin(seasons)] if wants_filter else schedule_frame
|
855
|
+
|
856
|
+
def repo_load_teaminfo():
    #Loads team data from the repository csv (path is relative to the working directory)

    team_frame = pd.read_csv("teaminfo/nhl_teaminfo.csv")

    #Return: the full team info table, unfiltered
    return team_frame
|
860
|
+
|
861
|
+
def repo_load_pbp(seasons = []):
    #Returns play-by-play data from repository
    # param 'seasons' - list of seasons to include
    #Note: with no seasons requested (the default) an empty DataFrame is returned and nothing is downloaded

    print(f'Loading play-by-play from the following seasons: {seasons}...')

    #pd.concat raises ValueError when given an empty list, so short-circuit before it is reached
    if len(seasons) == 0:
        return pd.DataFrame()

    #Add parquet to total (one remote parquet file is read per season)
    dfs = [pd.read_parquet(f"https://github.com/owensingh38/wsba_hockey/raw/refs/heads/main/src/wsba_hockey/pbp/parquet/nhl_pbp_{season}.parquet") for season in seasons]

    #Return: all requested seasons stacked into a single DataFrame
    return pd.concat(dfs)
|
870
|
+
|
871
|
+
def repo_load_seasons():
    #Provides the seasons available to scrape

    #Return: the module-level 'seasons' list itself (not a copy)
    available_seasons = seasons
    return available_seasons
|
875
|
+
|
876
|
+
def admin_convert_to_parquet(seasons):
    #Converts each season's play-by-play csv into a parquet file
    # param 'seasons' - seasons whose 'pbp/csv' files are rewritten under 'pbp/parquet'

    for season in seasons:
        csv_path = f'pbp/csv/nhl_pbp_{season}.csv'
        parquet_path = f'pbp/parquet/nhl_pbp_{season}.parquet'

        #Read the csv and write it straight back out as parquet without the index column
        pd.read_csv(csv_path).to_parquet(parquet_path,index=False)
|
881
|
+
|
882
|
+
#for season in seasons[6:12]:
|
883
|
+
# data = pd.read_csv(f"pbp/csv/nhl_pbp_{season}.csv")
|
884
|
+
# data.to_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet',index=False)
|