wsba-hockey 0.1.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +836 -369
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +275 -47
- wsba_hockey/wsba_main.py +699 -132
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/METADATA +42 -11
- wsba_hockey-1.0.0.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.2.dist-info/RECORD +0 -9
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/top_level.txt +0 -0
wsba_hockey/wsba_main.py
CHANGED
@@ -1,79 +1,230 @@
|
|
1
1
|
import requests as rs
|
2
2
|
import pandas as pd
|
3
3
|
import numpy as np
|
4
|
-
from datetime import datetime, timedelta
|
5
|
-
|
4
|
+
from datetime import datetime, timedelta, date
|
5
|
+
import time
|
6
|
+
import random
|
7
|
+
from .tools.scraping import *
|
8
|
+
from .tools.xg_model import *
|
9
|
+
from .tools.agg import *
|
10
|
+
from .tools.plotting import *
|
6
11
|
|
7
12
|
### WSBA HOCKEY ###
|
8
13
|
## Provided below are all integral functions in the WSBA Hockey Python package. ##
|
9
14
|
|
15
|
+
## GLOBAL VARIABLES ##
|
16
|
+
seasons = [
|
17
|
+
'20072008',
|
18
|
+
'20082009',
|
19
|
+
'20092010',
|
20
|
+
'20102011',
|
21
|
+
'20112012',
|
22
|
+
'20122013',
|
23
|
+
'20132014',
|
24
|
+
'20142015',
|
25
|
+
'20152016',
|
26
|
+
'20162017',
|
27
|
+
'20172018',
|
28
|
+
'20182019',
|
29
|
+
'20192020',
|
30
|
+
'20202021',
|
31
|
+
'20212022',
|
32
|
+
'20222023',
|
33
|
+
'20232024',
|
34
|
+
'20242025'
|
35
|
+
]
|
36
|
+
|
37
|
+
convert_seasons = {'2007': '20072008',
|
38
|
+
'2008': '20082009',
|
39
|
+
'2009': '20092010',
|
40
|
+
'2010': '20102011',
|
41
|
+
'2011': '20112012',
|
42
|
+
'2012': '20122013',
|
43
|
+
'2013': '20132014',
|
44
|
+
'2014': '20142015',
|
45
|
+
'2015': '20152016',
|
46
|
+
'2016': '20162017',
|
47
|
+
'2017': '20172018',
|
48
|
+
'2018': '20182019',
|
49
|
+
'2019': '20192020',
|
50
|
+
'2020': '20202021',
|
51
|
+
'2021': '20212022',
|
52
|
+
'2022': '20222023',
|
53
|
+
'2023': '20232024',
|
54
|
+
'2024': '20242025'}
|
55
|
+
|
56
|
+
convert_team_abbr = {'L.A':'LAK',
|
57
|
+
'N.J':'NJD',
|
58
|
+
'S.J':'SJS',
|
59
|
+
'T.B':'TBL',
|
60
|
+
'PHX':'ARI'}
|
61
|
+
|
62
|
+
per_sixty = ['Fi','xGi','Gi','A1','A2','P1','P','FF','FA','xGF','xGA','GF','GA']
|
63
|
+
|
64
|
+
#Some games in the API are specifically known to cause errors in scraping.
|
65
|
+
#This list is updated as frequently as necessary
|
66
|
+
known_probs ={
|
67
|
+
'2007020011':'Missing shifts data for game between Chicago and Minnesota.',
|
68
|
+
'2007021178':'Game between the Bruins and Sabres is missing data after the second period, for some reason.',
|
69
|
+
'2008020259':'HTML data is completely missing for this game.',
|
70
|
+
'2008020409':'HTML data is completely missing for this game.',
|
71
|
+
'2008021077':'HTML data is completely missing for this game.',
|
72
|
+
'2009020081':'HTML pbp for this game between Pittsburgh and Carolina is missing all but the period start and first faceoff events, for some reason.',
|
73
|
+
'2009020658':'Missing shifts data for game between New York Islanders and Dallas.',
|
74
|
+
'2009020885':'Missing shifts data for game between Sharks and Blue Jackets.',
|
75
|
+
'2010020124':'Game between Capitals and Hurricanes is sporadically missing player on-ice data',
|
76
|
+
'2013020971':'On March 10th, 2014, Stars forward Rich Peverley suffered from a cardiac episode midgame and as a result, the remainder of the game was postponed. \nThe game resumed on April 9th, and the only goal scorer in the game, Blue Jackets forward Nathan Horton, did not appear in the resumed game due to injury. Interestingly, Horton would never play in the NHL again.',
|
77
|
+
'2018021133':'Game between Lightning and Capitals has incorrectly labeled event teams (i.e. WSH TAKEAWAY - #71 CIRELLI (Cirelli is a Tampa Bay skater in this game)).',
|
78
|
+
'2019020876':'Due to the frightening collapse of Blues defensemen Jay Bouwmeester, a game on February 2nd, 2020 between the Ducks and Blues was postponed. \nWhen the game resumed, Ducks defensemen Hampus Lindholm, who assisted on a goal in the inital game, did not play in the resumed match.'
|
79
|
+
}
|
80
|
+
|
81
|
+
name_change = {
|
82
|
+
"":"",
|
83
|
+
}
|
84
|
+
|
85
|
+
shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
|
86
|
+
|
87
|
+
new = 2024
|
88
|
+
|
89
|
+
standings_end = {
|
90
|
+
'20072008':'04-06',
|
91
|
+
'20082009':'04-12',
|
92
|
+
'20092010':'04-11',
|
93
|
+
'20102011':'04-10',
|
94
|
+
'20112012':'04-07',
|
95
|
+
'20122013':'04-28',
|
96
|
+
'20132014':'04-13',
|
97
|
+
'20142015':'04-11',
|
98
|
+
'20152016':'04-10',
|
99
|
+
'20162017':'04-09',
|
100
|
+
'20172018':'04-08',
|
101
|
+
'20182019':'04-06',
|
102
|
+
'20192020':'03-11',
|
103
|
+
'20202021':'05-19',
|
104
|
+
'20212022':'04-01',
|
105
|
+
'20222023':'04-14',
|
106
|
+
'20232024':'04-18',
|
107
|
+
'20242025':'04-17'
|
108
|
+
}
|
109
|
+
|
10
110
|
## SCRAPE FUNCTIONS ##
|
11
|
-
def nhl_scrape_game(game_ids,split_shifts = False,remove = ['period-start','period-end','challenge','stoppage']):
|
111
|
+
def nhl_scrape_game(game_ids,split_shifts = False, remove = ['period-start','period-end','challenge','stoppage'],verbose = False, errors = False):
|
12
112
|
#Given a set of game_ids (NHL API), return complete play-by-play information as requested
|
13
|
-
# param 'game_ids' - NHL game ids
|
113
|
+
# param 'game_ids' - NHL game ids (or list formatted as ['random', num_of_games, start_year, end_year])
|
14
114
|
# param 'split_shifts' - boolean which splits pbp and shift events if true
|
15
115
|
# param 'remove' - list of events to remove from final dataframe
|
116
|
+
# param 'xg' - xG model to apply to pbp for aggregation
|
117
|
+
# param 'verbose' - boolean which adds additional event info if true
|
118
|
+
# param 'errors' - boolean returning game ids which did not scrape if true
|
16
119
|
|
17
120
|
pbps = []
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
season = str(game_id[:4])+str(int(game_id[:4])+1)
|
23
|
-
|
24
|
-
api = "https://api-web.nhle.com/v1/gamecenter/"+game_id+"/play-by-play"
|
25
|
-
home_log = "https://www.nhl.com/scores/htmlreports/"+season+"/TH"+str(game_id)[-6:]+".HTM"
|
26
|
-
away_log = "https://www.nhl.com/scores/htmlreports/"+season+"/TV"+str(game_id)[-6:]+".HTM"
|
27
|
-
|
28
|
-
#Retrieve raw data
|
121
|
+
if game_ids[0] == 'random':
|
122
|
+
#Randomize selection of game_ids
|
123
|
+
#Some ids returned may be invalid (for example, 2020021300)
|
124
|
+
num = game_ids[1]
|
29
125
|
try:
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
126
|
+
start = game_ids[2]
|
127
|
+
except:
|
128
|
+
start = 2007
|
129
|
+
try:
|
130
|
+
end = game_ids[3]
|
131
|
+
except:
|
132
|
+
end = (date.today().year)-1
|
133
|
+
|
134
|
+
game_ids = []
|
135
|
+
i = 0
|
136
|
+
print("Finding valid, random game ids...")
|
137
|
+
while i is not num:
|
138
|
+
print(f"\rGame IDs found in range {start}-{end}: {i}/{num}",end="")
|
139
|
+
rand_year = random.randint(start,end)
|
140
|
+
rand_season_type = random.randint(2,3)
|
141
|
+
rand_game = random.randint(1,1312)
|
142
|
+
|
143
|
+
#Ensure id validity (and that number of scraped games is equal to specified value)
|
144
|
+
rand_id = f'{rand_year}{rand_season_type:02d}{rand_game:04d}'
|
43
145
|
try:
|
44
|
-
|
45
|
-
|
146
|
+
rs.get(f"https://api-web.nhle.com/v1/gamecenter/{rand_id}/play-by-play").json()
|
147
|
+
i += 1
|
148
|
+
game_ids.append(rand_id)
|
149
|
+
except:
|
150
|
+
continue
|
151
|
+
|
152
|
+
print(f"\rGame IDs found in range {start}-{end}: {i}/{num}")
|
46
153
|
|
47
|
-
|
48
|
-
|
49
|
-
|
154
|
+
#Scrape each game
|
155
|
+
#Track Errors
|
156
|
+
error_ids = []
|
157
|
+
for game_id in game_ids:
|
158
|
+
print("Scraping data from game " + str(game_id) + "...",end="")
|
159
|
+
start = time.perf_counter()
|
50
160
|
|
51
|
-
|
161
|
+
try:
|
162
|
+
#Retrieve data
|
163
|
+
info = get_game_info(game_id)
|
164
|
+
data = combine_data(info)
|
165
|
+
|
166
|
+
#Append data to list
|
52
167
|
pbps.append(data)
|
53
|
-
except:
|
54
|
-
print(f"Unable to scrape game {game_id}. Ensure the ID is properly inputted and formatted.")
|
55
|
-
pbps.append(pd.DataFrame())
|
56
168
|
|
169
|
+
end = time.perf_counter()
|
170
|
+
secs = end - start
|
171
|
+
print(f" finished in {secs:.2f} seconds.")
|
172
|
+
|
173
|
+
except:
|
174
|
+
#Games such as the all-star game and pre-season games will incur this error
|
175
|
+
#Other games have known problems
|
176
|
+
if game_id in known_probs.keys():
|
177
|
+
print(f"\nGame {game_id} has a known problem: {known_probs[game_id]}")
|
178
|
+
else:
|
179
|
+
print(f"\nUnable to scrape game {game_id}. Ensure the ID is properly inputted and formatted.")
|
180
|
+
|
181
|
+
#Track error
|
182
|
+
error_ids.append(game_id)
|
183
|
+
|
57
184
|
#Add all pbps together
|
185
|
+
if len(pbps) == 0:
|
186
|
+
print("\rNo data returned.")
|
187
|
+
return pd.DataFrame()
|
58
188
|
df = pd.concat(pbps)
|
59
189
|
|
190
|
+
#If verbose is true features required to calculate xG are added to dataframe
|
191
|
+
if verbose:
|
192
|
+
df = prep_xG_data(df)
|
193
|
+
else:
|
194
|
+
""
|
195
|
+
|
196
|
+
#Print final message
|
197
|
+
if len(error_ids) > 0:
|
198
|
+
print(f'\rScrape of provided games finished.\nThe following games failed to scrape: {error_ids}')
|
199
|
+
else:
|
200
|
+
print('\rScrape of provided games finished.')
|
201
|
+
|
60
202
|
#Split pbp and shift events if necessary
|
61
203
|
#Return: complete play-by-play with data removed or split as necessary
|
62
|
-
|
63
|
-
except KeyError:
|
64
|
-
raise KeyError("No data is available to return.")
|
65
|
-
|
204
|
+
|
66
205
|
if split_shifts == True:
|
67
|
-
|
68
|
-
remove = ['change']
|
206
|
+
remove.append('change')
|
69
207
|
|
70
208
|
#Return: dict with pbp and shifts separated
|
71
|
-
|
72
|
-
"shifts":df.loc[df['event_type']=='change']
|
209
|
+
pbp_dict = {"pbp":df.loc[~df['event_type'].isin(remove)],
|
210
|
+
"shifts":df.loc[df['event_type']=='change']
|
73
211
|
}
|
212
|
+
|
213
|
+
if errors:
|
214
|
+
pbp_dict.update({'errors':error_ids})
|
215
|
+
|
216
|
+
return pbp_dict
|
74
217
|
else:
|
75
218
|
#Return: all events that are not set for removal by the provided list
|
76
|
-
|
219
|
+
pbp = df.loc[~df['event_type'].isin(remove)]
|
220
|
+
|
221
|
+
if errors:
|
222
|
+
pbp_dict = {'pbp':pbp,
|
223
|
+
'errors':error_ids}
|
224
|
+
|
225
|
+
return pbp_dict
|
226
|
+
else:
|
227
|
+
return pbp
|
77
228
|
|
78
229
|
def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
|
79
230
|
#Given a season, return schedule data
|
@@ -117,16 +268,18 @@ def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
|
|
117
268
|
"id": [gameWeek[i]['id']],
|
118
269
|
"season": [gameWeek[i]['season']],
|
119
270
|
"season_type":[gameWeek[i]['gameType']],
|
271
|
+
"away_team_abbr":[gameWeek[i]['awayTeam']['abbrev']],
|
272
|
+
"home_team_abbr":[gameWeek[i]['homeTeam']['abbrev']],
|
120
273
|
"gamecenter_link":[gameWeek[i]['gameCenterLink']]
|
121
274
|
}))
|
122
275
|
|
123
276
|
#Concatenate all games
|
124
277
|
df = pd.concat(game)
|
125
278
|
|
126
|
-
#Return: specified schedule data
|
127
|
-
return df
|
279
|
+
#Return: specified schedule data
|
280
|
+
return df
|
128
281
|
|
129
|
-
def nhl_scrape_season(season,split_shifts = False, remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = "schedule/schedule.csv"):
|
282
|
+
def nhl_scrape_season(season,split_shifts = False, season_types = [2,3], remove = ['period-start','period-end','game-end','challenge','stoppage'], start = "09-01", end = "08-01", local=False, local_path = "schedule/schedule.csv", verbose = False, errors = False):
|
130
283
|
#Given season, scrape all play-by-play occurring within the season
|
131
284
|
# param 'season' - NHL season to scrape
|
132
285
|
# param 'split_shifts' - boolean which splits pbp and shift events if true
|
@@ -135,50 +288,61 @@ def nhl_scrape_season(season,split_shifts = False, remove = ['period-start','per
|
|
135
288
|
# param 'end' - End date in season
|
136
289
|
# param 'local' - boolean indicating whether to use local file to scrape game_ids
|
137
290
|
# param 'local_path' - path of local file
|
291
|
+
# param 'verbose' - boolean which adds additional event info if true
|
292
|
+
# param 'errors' - boolean returning game ids which did not scrape if true
|
138
293
|
|
139
|
-
#
|
294
|
+
#Determine whether to use schedule data in repository or to scrape
|
140
295
|
if local == True:
|
141
296
|
load = pd.read_csv(local_path)
|
142
|
-
load = load.loc[load['season'].astype(str)==season]
|
297
|
+
load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
|
143
298
|
game_ids = list(load['id'].astype(str))
|
144
299
|
else:
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
df_s = []
|
149
|
-
|
150
|
-
errors = []
|
151
|
-
for game_id in game_ids:
|
152
|
-
try:
|
153
|
-
if split_shifts == True:
|
154
|
-
data = nhl_scrape_game([game_id],split_shifts=True,remove=remove)
|
155
|
-
df.append(data['pbp'])
|
156
|
-
df_s.append(data['shifts'])
|
157
|
-
else:
|
158
|
-
data = nhl_scrape_game([game_id],remove=remove)
|
159
|
-
df.append(data)
|
300
|
+
load = nhl_scrape_schedule(season,start,end)
|
301
|
+
load = load.loc[(load['season'].astype(str)==season)&(load['season_type'].isin(season_types))]
|
302
|
+
game_ids = list(load['id'].astype(str))
|
160
303
|
|
161
|
-
|
162
|
-
|
163
|
-
|
304
|
+
#If no games found, terminate the process
|
305
|
+
if not game_ids:
|
306
|
+
print('No games found for dates in season...')
|
307
|
+
return ""
|
308
|
+
|
309
|
+
print(f"Scraping games from {season[0:4]}-{season[4:8]} season...")
|
310
|
+
start = time.perf_counter()
|
164
311
|
|
165
|
-
#
|
166
|
-
try: pbp = pd.concat(df)
|
167
|
-
except:
|
168
|
-
raise KeyError("No data is available to return.")
|
169
|
-
|
312
|
+
#Perform scrape
|
170
313
|
if split_shifts == True:
|
171
|
-
|
172
|
-
except: raise KeyError("No data is available to return.")
|
314
|
+
data = nhl_scrape_game(game_ids,split_shifts=True,remove=remove,verbose=verbose,errors=errors)
|
173
315
|
else:
|
174
|
-
|
175
|
-
|
316
|
+
data = nhl_scrape_game(game_ids,remove=remove,verbose=verbose,errors=errors)
|
317
|
+
|
318
|
+
end = time.perf_counter()
|
319
|
+
secs = end - start
|
320
|
+
|
321
|
+
print(f'Finished season scrape in {(secs/60)/60:.2f} hours.')
|
176
322
|
#Return: Complete pbp and shifts data for specified season as well as dataframe of game_ids which failed to return data
|
177
323
|
if split_shifts == True:
|
178
|
-
|
179
|
-
'shifts':shifts}
|
324
|
+
pbp_dict = {'pbp':data['pbp'],
|
325
|
+
'shifts':data['shifts']}
|
326
|
+
|
327
|
+
if errors:
|
328
|
+
pbp_dict.update({'errors':data['errors']})
|
329
|
+
return pbp_dict
|
180
330
|
else:
|
181
|
-
|
331
|
+
pbp = data['pbp']
|
332
|
+
|
333
|
+
if errors:
|
334
|
+
pbp_dict = {'pbp':pbp,
|
335
|
+
'errors':data['errors']}
|
336
|
+
return pbp_dict
|
337
|
+
else:
|
338
|
+
return pbp
|
339
|
+
|
340
|
+
#errors = []
|
341
|
+
#for season in seasons[10:12]:
|
342
|
+
# data = nhl_scrape_season(season,remove=[],local=True,errors=True)
|
343
|
+
# errors.append(data['errors'])
|
344
|
+
# data['pbp'].to_csv(f'pbp/csv/nhl_pbp_{season}.csv',index=False)
|
345
|
+
#print(f'Errors: {errors}')
|
182
346
|
|
183
347
|
def nhl_scrape_seasons_info(seasons = []):
|
184
348
|
#Returns info related to NHL seasons (by default, all seasons are included)
|
@@ -200,19 +364,38 @@ def nhl_scrape_seasons_info(seasons = []):
|
|
200
364
|
else:
|
201
365
|
return df.sort_values(by=['id'])
|
202
366
|
|
203
|
-
def nhl_scrape_standings(arg = "now"):
|
367
|
+
def nhl_scrape_standings(arg = "now", season_type = 2):
|
204
368
|
#Returns standings
|
205
|
-
# param 'arg' - by default, this is "now" returning active NHL standings. May also be a specific date formatted as YYYY-MM-DD
|
206
|
-
|
207
|
-
|
208
|
-
|
369
|
+
# param 'arg' - by default, this is "now" returning active NHL standings. May also be a specific date formatted as YYYY-MM-DD, a season (scrapes the last standings date for the season) or a year (for playoffs).
|
370
|
+
# param 'season_type' - by default, this scrapes the regular season standings. If set to 3, it returns the playoff bracket for the specified season
|
371
|
+
|
372
|
+
#arg param is ignored when set to "now" if season_type param is 3
|
373
|
+
if season_type == 3:
|
374
|
+
if arg == "now":
|
375
|
+
arg = new
|
376
|
+
|
377
|
+
print(f"Scraping playoff bracket for date: {arg}")
|
378
|
+
api = f"https://api-web.nhle.com/v1/playoff-bracket/{arg}"
|
379
|
+
|
380
|
+
data = rs.get(api).json()['series']
|
381
|
+
|
382
|
+
return pd.json_normalize(data)
|
383
|
+
|
209
384
|
else:
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
385
|
+
if arg == "now":
|
386
|
+
print("Scraping standings as of now...")
|
387
|
+
elif arg in seasons:
|
388
|
+
print(f'Scraping standings for season: {arg}')
|
389
|
+
else:
|
390
|
+
print(f"Scraping standings for date: {arg}")
|
391
|
+
|
392
|
+
api = f"https://api-web.nhle.com/v1/standings/{arg[4:8]}-{standings_end[arg]}"
|
393
|
+
data = rs.get(api).json()['standings']
|
394
|
+
|
395
|
+
return pd.json_normalize(data)
|
214
396
|
|
215
|
-
|
397
|
+
#stand = [nhl_scrape_standings(season) for season in seasons]
|
398
|
+
#pd.concat(stand).to_csv('teaminfo/nhl_standings.csv',index=False)
|
216
399
|
|
217
400
|
def nhl_scrape_roster(season):
|
218
401
|
#Given a nhl season, return rosters for all participating teams
|
@@ -245,51 +428,410 @@ def nhl_scrape_roster(season):
|
|
245
428
|
|
246
429
|
return pd.concat(rosts)
|
247
430
|
|
248
|
-
def
|
249
|
-
#Given
|
250
|
-
# param 'roster' - dataframe of roster information from the nhl_scrape_roster function
|
431
|
+
def nhl_scrape_prospects(team):
|
432
|
+
#Given team abbreviation, retrieve current team prospects
|
251
433
|
|
252
|
-
|
434
|
+
api = f'https://api-web.nhle.com/v1/prospects/{team}'
|
253
435
|
|
254
|
-
|
436
|
+
data = rs.get(api).json()
|
437
|
+
|
438
|
+
#Iterate through positions
|
439
|
+
players = [pd.json_normalize(data[pos]) for pos in ['forwards','defensemen','goalies']]
|
255
440
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
except: continue
|
441
|
+
prospects = pd.concat(players)
|
442
|
+
#Add name columns
|
443
|
+
prospects['fullName'] = (prospects['firstName.default']+" "+prospects['lastName.default']).str.upper()
|
260
444
|
|
261
|
-
|
445
|
+
#Return: team prospects
|
446
|
+
return prospects
|
262
447
|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
448
|
+
def nhl_scrape_team_info(country = False):
|
449
|
+
#Given option to return franchise or country, return team information
|
450
|
+
|
451
|
+
print('Scraping team information...')
|
452
|
+
api = f'https://api.nhle.com/stats/rest/en/{'country' if country else 'team'}'
|
453
|
+
|
454
|
+
data = pd.json_normalize(rs.get(api).json()['data'])
|
455
|
+
|
456
|
+
#Add logos if necessary
|
457
|
+
if not country:
|
458
|
+
data['logo_light'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_light.svg'
|
459
|
+
data['logo_dark'] = 'https://assets.nhle.com/logos/nhl/svg/'+data['triCode']+'_dark.svg'
|
460
|
+
|
461
|
+
return data.sort_values(by=(['country3Code','countryCode','iocCode','countryName'] if country else ['fullName','triCode','id']))
|
462
|
+
|
463
|
+
def nhl_scrape_player_data(player_id):
|
464
|
+
#Given player id, return player information
|
465
|
+
api = f'https://api-web.nhle.com/v1/player/{player_id}/landing'
|
466
|
+
|
467
|
+
data = pd.json_normalize(rs.get(api).json())
|
468
|
+
|
469
|
+
#Add name column
|
470
|
+
data['fullName'] = (data['firstName.default'] + " " + data['lastName.default']).str.upper()
|
471
|
+
|
472
|
+
#Return: player data
|
473
|
+
return data
|
474
|
+
|
475
|
+
def nhl_scrape_draft_rankings(arg = 'now', category = ''):
|
476
|
+
#Given url argument for timeframe and prospect category, return draft rankings
|
477
|
+
#Category 1 is North American Skaters
|
478
|
+
#Category 2 is International Skaters
|
479
|
+
#Category 3 is North American Goalie
|
480
|
+
#Category 4 is International Goalie
|
481
|
+
|
482
|
+
#Player category only applies when requesting a specific season
|
483
|
+
api = f"https://api-web.nhle.com/v1/draft/rankings/{arg}/{category}" if category != "" else f"https://api-web.nhle.com/v1/draft/rankings/{arg}"
|
484
|
+
data = pd.json_normalize(rs.get(api).json()['rankings'])
|
485
|
+
|
486
|
+
#Add player name columns
|
487
|
+
data['fullName'] = (data['firstName']+" "+data['lastName']).str.upper()
|
488
|
+
|
489
|
+
#Return: prospect rankings
|
490
|
+
return data
|
491
|
+
|
492
|
+
def nhl_shooting_impacts(agg,team=False):
|
493
|
+
#Given stats table generated from the nhl_calculate_stats function, return table with shot impacts
|
494
|
+
#Only 5v5 is supported as of now
|
495
|
+
|
496
|
+
#param 'agg' - stats table
|
497
|
+
#param 'team' - boolean determining if team stats are calculated instead of skater stats
|
498
|
+
|
499
|
+
#COMPOSITE IMPACT EVALUATIONS:
|
500
|
+
|
501
|
+
#SR = Shot Rate
|
502
|
+
#SQ = Shot Quality
|
503
|
+
#FN = Finishing
|
504
|
+
|
505
|
+
#I = Impact
|
506
|
+
|
507
|
+
#INDV = Individual
|
508
|
+
#OOFF = On-Ice Offense
|
509
|
+
#ODEF = On-Ice Defense
|
510
|
+
|
511
|
+
#Grouping-Metric Code: XXXX-YYI
|
512
|
+
|
513
|
+
#Goal Composition Formula
|
514
|
+
#The aggregation of goals is composed of three factors: shot rate, shot quality, and finishing
|
515
|
+
#These are represented by their own metrics in which Goals = (Fenwick*(League Average Fenwick SH%)) + ((xGoals/Fenwick - League Average Fenwick SH%)*Fenwick) + (Goals - xGoals)
|
516
|
+
def goal_comp(fenwick,xg_fen,xg,g,fsh):
|
517
|
+
rate = fenwick * fsh
|
518
|
+
qual = (xg_fen-fsh)*fenwick
|
519
|
+
fini = g-xg
|
520
|
+
|
521
|
+
return rate+qual+fini
|
522
|
+
|
523
|
+
if team:
|
524
|
+
pos = agg
|
525
|
+
for group in [('OOFF','F'),('ODEF','A')]:
|
526
|
+
#Have to set these columns for compatibility with df.apply
|
527
|
+
pos['fsh'] = pos[f'Fsh{group[1]}%']
|
528
|
+
pos['fenwick'] = pos[f'F{group[1]}/60']
|
529
|
+
pos['xg'] = pos[f'xG{group[1]}/60']
|
530
|
+
pos['g'] = pos[f'G{group[1]}/60']
|
531
|
+
pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
|
532
|
+
pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
|
533
|
+
|
534
|
+
#Find average for position in frame
|
535
|
+
avg_fen = pos['fenwick'].mean()
|
536
|
+
avg_xg = pos['xg'].mean()
|
537
|
+
avg_g = pos['g'].mean()
|
538
|
+
avg_fsh = avg_g/avg_fen
|
539
|
+
avg_xg_fen = avg_xg/avg_fen
|
540
|
+
|
541
|
+
#Calculate composite percentiles
|
542
|
+
pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
|
543
|
+
pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
|
544
|
+
pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
|
545
|
+
|
546
|
+
#Calculate shot rate, shot quality, and finishing impacts
|
547
|
+
pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
|
548
|
+
pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
|
549
|
+
pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
|
550
|
+
|
551
|
+
#Add extra metrics
|
552
|
+
pos['RushF/60'] = (pos['RushF']/pos['TOI'])*60
|
553
|
+
pos['RushA/60'] = (pos['RushA']/pos['TOI'])*60
|
554
|
+
pos['Rushes FF'] = pos['RushF/60'].rank(pct=True)
|
555
|
+
pos['Rushes FA'] = pos['RushA/60'].rank(pct=True)
|
556
|
+
pos['RushFxG/60'] = (pos['RushFxG']/pos['TOI'])*60
|
557
|
+
pos['RushAxG/60'] = (pos['RushAxG']/pos['TOI'])*60
|
558
|
+
pos['Rushes xGF'] = pos['RushFxG/60'].rank(pct=True)
|
559
|
+
pos['Rushes xGA'] = pos['RushAxG/60'].rank(pct=True)
|
560
|
+
pos['RushFG/60'] = (pos['RushFG']/pos['TOI'])*60
|
561
|
+
pos['RushAG/60'] = (pos['RushAG']/pos['TOI'])*60
|
562
|
+
pos['Rushes GF'] = pos['RushFG/60'].rank(pct=True)
|
563
|
+
pos['Rushes GA'] = pos['RushAG/60'].rank(pct=True)
|
564
|
+
|
565
|
+
#Flip against metric percentiles
|
566
|
+
pos['ODEF-SR'] = 1-pos['ODEF-SR']
|
567
|
+
pos['ODEF-SQ'] = 1-pos['ODEF-SQ']
|
568
|
+
pos['ODEF-FN'] = 1-pos['ODEF-FN']
|
569
|
+
|
570
|
+
#Return: team stats with shooting impacts
|
571
|
+
return pos.drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Season','Team'])
|
572
|
+
|
573
|
+
|
574
|
+
else:
|
575
|
+
#Remove skaters with less than 150 minutes of TOI then split between forwards and dmen
|
576
|
+
agg = agg.loc[agg['TOI']>=150]
|
577
|
+
forwards = agg.loc[agg['Position']!='D']
|
578
|
+
defensemen = agg.loc[agg['Position']=='D']
|
579
|
+
|
580
|
+
#Loop through both positions, all groupings (INDV, OOFF, and ODEF) generating impacts
|
581
|
+
for pos in [forwards,defensemen]:
|
582
|
+
for group in [('INDV','i'),('OOFF','F'),('ODEF','A')]:
|
583
|
+
#Have to set these columns for compatibility with df.apply
|
584
|
+
pos['fsh'] = pos[f'Fsh{group[1]}%']
|
585
|
+
pos['fenwick'] = pos[f'F{group[1]}/60']
|
586
|
+
pos['xg'] = pos[f'xG{group[1]}/60']
|
587
|
+
pos['g'] = pos[f'G{group[1]}/60']
|
588
|
+
pos['xg_fen'] = pos[f'xG{group[1]}/F{group[1]}']
|
589
|
+
pos['finishing'] = pos[f'G{group[1]}/xG{group[1]}']
|
590
|
+
|
591
|
+
#Find average for position in frame
|
592
|
+
avg_fen = pos['fenwick'].mean()
|
593
|
+
avg_xg = pos['xg'].mean()
|
594
|
+
avg_g = pos['g'].mean()
|
595
|
+
avg_fsh = avg_g/avg_fen
|
596
|
+
avg_xg_fen = avg_xg/avg_fen
|
597
|
+
|
598
|
+
#Calculate composite percentiles
|
599
|
+
pos[f'{group[0]}-SR'] = pos['fenwick'].rank(pct=True)
|
600
|
+
pos[f'{group[0]}-SQ'] = pos['xg_fen'].rank(pct=True)
|
601
|
+
pos[f'{group[0]}-FN'] = pos['finishing'].rank(pct=True)
|
602
|
+
|
603
|
+
#Calculate shot rate, shot quality, and finishing impacts
|
604
|
+
pos[f'{group[0]}-SRI'] = pos['g'] - pos.apply(lambda x: goal_comp(avg_fen,x.xg_fen,x.xg,x.g,avg_fsh),axis=1)
|
605
|
+
pos[f'{group[0]}-SQI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,avg_xg_fen,x.xg,x.g,avg_fsh),axis=1)
|
606
|
+
pos[f'{group[0]}-FNI'] = pos['g'] - pos.apply(lambda x: goal_comp(x.fenwick,x.xg_fen,avg_xg,avg_g,avg_fsh),axis=1)
|
607
|
+
|
608
|
+
#Calculate On-Ice Involvement Percentiles
|
609
|
+
pos['Fenwick'] = pos['FC%'].rank(pct=True)
|
610
|
+
pos['xG'] = pos['xGC%'].rank(pct=True)
|
611
|
+
pos['Goal Factor'] = pos['GI%'].rank(pct=True)
|
612
|
+
pos['Goal Scoring'] = pos['GC%'].rank(pct=True)
|
613
|
+
pos['Rush/60'] = (pos['Rush']/pos['TOI'])*60
|
614
|
+
pos['RushxG/60'] = (pos['Rush xG']/pos['TOI'])*60
|
615
|
+
pos['Rushes xG'] = pos['RushxG/60'].rank(pct=True)
|
616
|
+
pos['Rushes FF'] = pos['Rush/60'].rank(pct=True)
|
617
|
+
|
618
|
+
#Add positions back together
|
619
|
+
complete = pd.concat([forwards,defensemen])
|
620
|
+
|
621
|
+
#Flip against metric percentiles
|
622
|
+
complete['ODEF-SR'] = 1-complete['ODEF-SR']
|
623
|
+
complete['ODEF-SQ'] = 1-complete['ODEF-SQ']
|
624
|
+
complete['ODEF-FN'] = 1-complete['ODEF-FN']
|
625
|
+
|
626
|
+
#Extraneous Values
|
627
|
+
complete['Extraneous Gi'] = complete['INDV-SRI']+complete['INDV-SQI']+complete['INDV-FNI']
|
628
|
+
complete['Extraneous xGi'] = complete['INDV-SRI']+complete['INDV-SQI']
|
629
|
+
complete['Extraneous GF'] = complete['OOFF-SRI']+complete['OOFF-SQI']+complete['OOFF-FNI']
|
630
|
+
complete['Extraneous xGF'] = complete['OOFF-SRI']+complete['OOFF-SQI']
|
631
|
+
complete['Extraneous GA'] = complete['ODEF-SRI']+complete['ODEF-SQI']+complete['ODEF-FNI']
|
632
|
+
complete['Extraneous xGA'] = complete['ODEF-SRI']+complete['ODEF-SQI']
|
633
|
+
|
634
|
+
#Goal Composites
|
635
|
+
complete['Linemate Extraneous Goals'] = complete['Extraneous GF'] - complete['Extraneous Gi']
|
636
|
+
complete['Linemate Goal Induction'] = complete['Linemate Extraneous Goals']*complete['AC%']
|
637
|
+
complete['Composite Goal Impact'] = complete['Extraneous Gi'] + complete['Linemate Goal Induction']
|
638
|
+
complete['Linemate Rel. Goal Impact'] = complete['Composite Goal Impact'] - (complete['Extraneous GF']-complete['Composite Goal Impact'])
|
639
|
+
complete['Net Goal Impact'] = complete['Extraneous GF'] - complete['Extraneous GA']
|
640
|
+
complete['Net xGoal Impact'] = complete['Extraneous xGF'] - complete['Extraneous xGA']
|
641
|
+
|
642
|
+
#Return: skater stats with shooting impacts
|
643
|
+
return complete.drop(columns=['fsh','fenwick','xg_fen','xg','g','finishing']).sort_values(['Player','Season','Team','ID'])
|
644
|
+
|
645
|
+
def nhl_calculate_stats(pbp,type,season_types,game_strength,roster_path="rosters/nhl_rosters.csv",xg="moneypuck",shot_impact=False):
|
646
|
+
#Given play-by-play, seasonal information, game_strength, rosters, and xG model, return aggregated stats
|
647
|
+
# param 'pbp' - play-by-play dataframe
|
648
|
+
# param 'type' - type of stats to calculate ('skater', 'goaltender', or 'team')
|
649
|
+
# param 'season' - season or timeframe of events in play-by-play
|
650
|
+
# param 'season_types' - list of season types (preseason, regular season, or playoffs) to include in aggregation
|
651
|
+
# param 'game_strength' - list of game_strengths to include in aggregation
|
652
|
+
# param 'roster_path' - path to roster file
|
653
|
+
# param 'xg' - xG model to apply to pbp for aggregation
|
654
|
+
# param 'shot_impact' - boolean determining if the shot impact model will be applied to the dataset
|
655
|
+
|
656
|
+
print(f"Calculating statistics for all games in the provided play-by-play data...\nSeasons included: {pbp['season'].drop_duplicates().to_list()}...")
|
657
|
+
start = time.perf_counter()
|
658
|
+
|
659
|
+
#Add extra data and apply team changes
|
660
|
+
pbp = prep_xG_data(pbp).replace(convert_team_abbr)
|
661
|
+
|
662
|
+
#Check if xG column exists and apply model if it does not
|
663
|
+
try:
|
664
|
+
pbp['xG']
|
665
|
+
except KeyError:
|
666
|
+
if xg == 'wsba':
|
667
|
+
pbp = wsba_xG(pbp)
|
668
|
+
else:
|
669
|
+
pbp = moneypuck_xG(pbp)
|
670
|
+
|
671
|
+
#Filter by season types and remove shootouts
|
672
|
+
pbp = pbp.loc[(pbp['season_type'].isin(season_types)) & (pbp['period'] < 5)]
|
673
|
+
|
674
|
+
# Filter by game strength if not "all"
|
675
|
+
if game_strength != "all":
|
676
|
+
pbp = pbp.loc[pbp['strength_state'].isin(game_strength)]
|
677
|
+
|
678
|
+
#Split calculation
|
679
|
+
if type == 'team':
|
680
|
+
complete = calc_team(pbp)
|
681
|
+
|
682
|
+
#Set TOI to minutes
|
683
|
+
complete['TOI'] = complete['TOI']/60
|
684
|
+
|
685
|
+
#Add per 60 stats
|
686
|
+
for stat in per_sixty[7:13]:
|
687
|
+
complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60
|
688
|
+
|
689
|
+
end = time.perf_counter()
|
690
|
+
length = end-start
|
691
|
+
print(f'...finished in {(length if length <60 else length/60):.2f} {'seconds' if length <60 else 'minutes'}.')
|
692
|
+
#Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
|
693
|
+
if shot_impact:
|
694
|
+
return nhl_shooting_impacts(complete,True)
|
695
|
+
else:
|
696
|
+
return complete
|
697
|
+
else:
|
698
|
+
indv_stats = calc_indv(pbp)
|
699
|
+
onice_stats = calc_onice(pbp)
|
700
|
+
|
701
|
+
#IDs sometimes set as objects
|
702
|
+
indv_stats['ID'] = indv_stats['ID'].astype(float)
|
703
|
+
onice_stats['ID'] = onice_stats['ID'].astype(float)
|
704
|
+
|
705
|
+
#Merge and add columns for extra stats
|
706
|
+
complete = pd.merge(indv_stats,onice_stats,how="outer",on=['ID','Team','Season'])
|
707
|
+
complete['GC%'] = complete['Gi']/complete['GF']
|
708
|
+
complete['AC%'] = (complete['A1']+complete['A2'])/complete['GF']
|
709
|
+
complete['GI%'] = (complete['Gi']+complete['A1']+complete['A2'])/complete['GF']
|
710
|
+
complete['FC%'] = complete['Fi']/complete['FF']
|
711
|
+
complete['xGC%'] = complete['xGi']/complete['xGF']
|
712
|
+
|
713
|
+
#Remove entries with no ID listed
|
714
|
+
complete = complete.loc[complete['ID'].notna()]
|
715
|
+
|
716
|
+
#Import rosters and player info
|
717
|
+
rosters = pd.read_csv(roster_path)
|
718
|
+
names = rosters[['id','fullName',
|
719
|
+
'headshot','positionCode','shootsCatches',
|
720
|
+
'heightInInches','weightInPounds',
|
721
|
+
'birthDate','birthCountry']].drop_duplicates(subset=['id','fullName'],keep='last')
|
722
|
+
|
723
|
+
#Add names
|
724
|
+
complete = pd.merge(complete,names,how='left',left_on='ID',right_on='id')
|
725
|
+
|
726
|
+
#Rename if there are no missing names
|
727
|
+
complete = complete.rename(columns={'fullName':'Player',
|
728
|
+
'headshot':'Headshot',
|
729
|
+
'positionCode':'Position',
|
730
|
+
'shootsCatches':'Handedness',
|
731
|
+
'heightInInches':'Height (in)',
|
732
|
+
'weightInPounds':'Weight (lbs)',
|
733
|
+
'birthDate':'Birthday',
|
734
|
+
'birthCountry':'Nationality'})
|
735
|
+
|
736
|
+
#Set TOI to minute
|
737
|
+
complete['TOI'] = complete['TOI']/60
|
738
|
+
|
739
|
+
#Add per 60 stats
|
740
|
+
for stat in per_sixty:
|
741
|
+
complete[f'{stat}/60'] = (complete[stat]/complete['TOI'])*60
|
742
|
+
|
743
|
+
#Add player age
|
744
|
+
complete['Birthday'] = pd.to_datetime(complete['Birthday'])
|
745
|
+
complete['season_year'] = complete['Season'].astype(str).str[4:8].astype(int)
|
746
|
+
complete['Age'] = complete['season_year'] - complete['Birthday'].dt.year
|
747
|
+
|
748
|
+
#Find player headshot
|
749
|
+
complete['Headshot'] = 'https://assets.nhle.com/mugs/nhl/'+complete['Season'].astype(str)+'/'+complete['Team']+'/'+complete['ID'].astype(int).astype(str)+'.png'
|
750
|
+
|
751
|
+
end = time.perf_counter()
|
752
|
+
length = end-start
|
753
|
+
#Remove goalies that occasionally appear in a set
|
754
|
+
complete = complete.loc[complete['Position']!='G']
|
755
|
+
#Add WSBA ID
|
756
|
+
complete['WSBA'] = complete['Player']+complete['Season'].astype(str)+complete['Team']
|
757
|
+
|
758
|
+
#Shot Type Metrics
|
759
|
+
type_metrics = []
|
760
|
+
for type in shot_types:
|
761
|
+
for stat in per_sixty[:3]:
|
762
|
+
type_metrics.append(f'{type.capitalize()}{stat}')
|
763
|
+
|
764
|
+
complete = complete[[
|
765
|
+
'Player','ID',
|
766
|
+
"Season","Team",'WSBA',
|
767
|
+
'Headshot','Position','Handedness',
|
768
|
+
'Height (in)','Weight (lbs)',
|
769
|
+
'Birthday','Age','Nationality',
|
770
|
+
'GP','TOI',
|
771
|
+
"Gi","A1","A2",'P1','P',
|
772
|
+
"Fi","xGi",'xGi/Fi',"Gi/xGi","Fshi%",
|
773
|
+
"GF","FF","xGF","xGF/FF","GF/xGF","FshF%",
|
774
|
+
"GA","FA","xGA","xGA/FA","GA/xGA","FshA%",
|
775
|
+
'Rush',"Rush xG",'Rush G',"GC%","AC%","GI%","FC%","xGC%",
|
776
|
+
]+[f'{stat}/60' for stat in per_sixty]+type_metrics].fillna(0).sort_values(['Player','Season','Team','ID'])
|
777
|
+
|
778
|
+
print(f'...finished in {(length if length <60 else length/60):.2f} {'seconds' if length <60 else 'minutes'}.')
|
779
|
+
#Apply shot impacts if necessary (Note: this will remove skaters with fewer than 150 minutes of TOI due to the shot impact TOI rule)
|
780
|
+
if shot_impact:
|
781
|
+
return nhl_shooting_impacts(complete,False)
|
782
|
+
else:
|
783
|
+
return complete
|
784
|
+
|
785
|
+
#stats = []
|
786
|
+
#for season in seasons[6:18]:
|
787
|
+
# pbp = pd.read_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet')
|
788
|
+
# stat = nhl_calculate_stats(pbp,'skater',[2],['5v5'],shot_impact=True)
|
789
|
+
# stat.to_csv(f'stats/skater/wsba_nhl_{season}.csv',index=False)
|
790
|
+
# stats.append(stat)
|
791
|
+
#pd.concat(stats).to_csv('stats/db/wsba_nhl_skater_db.csv',index=False)
|
792
|
+
|
793
|
+
def nhl_plot_skaters_shots(pbp,skater_dict,strengths,color_dict=event_colors,legend=False,xg='moneypuck'):
    #Builds one shot chart per skater listed in 'skater_dict'
    # param 'pbp' - pbp to plot data
    # param 'skater_dict' - skaters to plot shots for, mapping name to [season, team] (format: {'Patrice Bergeron':['20242025','BOS']})
    # param 'strengths' - strengths to include in plotting
    # param 'color_dict' - dict with colors to use for events
    # param 'legend' - bool which includes legend if true
    # param 'xg' - xG model to apply to pbp for plotting

    print(f'Plotting the following skater shots: {skater_dict}...')

    def chart(name,info):
        #info[0] is the season string and info[1] the team; the title shows the season as 'YY-YY'
        season = info[0]
        team = info[1]
        title = f'{name} Fenwick Shots for {team} in {season[2:4]}-{season[6:8]}'
        return plot_skater_shots(pbp,name,season,team,strengths,title,color_dict,legend,xg)

    #Return: list of plotted skater shot charts, in the order the skaters appear in 'skater_dict'
    return [chart(name,info) for name,info in skater_dict.items()]
|
813
|
+
|
814
|
+
def nhl_plot_games(pbp,events,strengths,game_ids='all',color_dict=event_colors,legend=False,xg='moneypuck'):
    #Returns list of plots for specified games
    # param 'pbp' - pbp to plot data
    # param 'events' - type of events to plot
    # param 'strengths' - strengths to include in plotting
    # param 'game_ids' - games to plot (list or other iterable of ids if not set to 'all')
    # param 'color_dict' - dict with colors to use for events
    # param 'legend' - bool which includes legend if true
    # param 'xg' - xG model to apply to pbp for plotting

    #Find games to plot
    #Only a string can mean 'all'; without the isinstance guard a Series/array of ids is compared element-wise and the 'if' raises ValueError
    if isinstance(game_ids,str) and game_ids == 'all':
        game_ids = pbp['game_id'].drop_duplicates().to_list()

    print(f'Plotting the following games: {game_ids}...')

    #Iterate through games, adding plot to list
    game_plots = [plot_game_events(pbp,game,events,strengths,color_dict,legend,xg) for game in game_ids]

    #Return: list of plotted game events
    return game_plots
|
293
835
|
|
294
836
|
def repo_load_rosters(seasons = []):
|
295
837
|
#Returns roster data from repository
|
@@ -314,4 +856,29 @@ def repo_load_schedule(seasons = []):
|
|
314
856
|
def repo_load_teaminfo():
    #Loads the team information table
    #The csv path is relative, so it is resolved against the current working directory

    teaminfo = pd.read_csv("teaminfo/nhl_teaminfo.csv")

    #Return: dataframe with the contents of nhl_teaminfo.csv
    return teaminfo
|
860
|
+
|
861
|
+
def repo_load_pbp(seasons = []):
    #Returns play-by-play data from repository
    # param 'seasons' - list of seasons to include (each one is substituted into the parquet file name, e.g. '20232024')
    #Return: dataframe with the rows of every requested season concatenated in the given order (original row indices are kept),
    #        or an empty dataframe when no seasons are requested

    print(f'Loading play-by-play from the following seasons: {seasons}...')

    #Add parquet to total (one remote read per season)
    dfs = [pd.read_parquet(f"https://github.com/owensingh38/wsba_hockey/raw/refs/heads/main/src/wsba_hockey/pbp/parquet/nhl_pbp_{season}.parquet") for season in seasons]

    #pd.concat raises "No objects to concatenate" on an empty list, which made the default call crash
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs)
|
870
|
+
|
871
|
+
def repo_load_seasons():
    #Provides the seasons available to scrape
    #Return: the module-level 'seasons' list itself (not a copy)

    available = seasons
    return available
|
875
|
+
|
876
|
+
def admin_convert_to_parquet(seasons):
    #Rewrites the csv play-by-play file of each given season as a parquet file
    # param 'seasons' - iterable of seasons; each one is substituted into the csv and parquet file names
    #Paths are relative, so they are resolved against the current working directory

    for year in seasons:
        frame = pd.read_csv(f'pbp/csv/nhl_pbp_{year}.csv')

        #index=False keeps the dataframe index out of the parquet file
        frame.to_parquet(f'pbp/parquet/nhl_pbp_{year}.parquet',index=False)
|
881
|
+
|
882
|
+
#for season in seasons[6:12]:
|
883
|
+
# data = pd.read_csv(f"pbp/csv/nhl_pbp_{season}.csv")
|
884
|
+
# data.to_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet',index=False)
|