wsba-hockey 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +836 -369
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +275 -47
- wsba_hockey/wsba_main.py +699 -132
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-0.1.4.dist-info}/METADATA +42 -11
- wsba_hockey-0.1.4.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-0.1.4.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.2.dist-info/RECORD +0 -9
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-0.1.4.dist-info}/top_level.txt +0 -0
wsba_hockey/tools/archive/old_scraping.py
@@ -0,0 +1,1104 @@
import re
from bs4 import BeautifulSoup
import requests as rs
import json as json_lib
from tools.utils.shared import *
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### SCRAPING FUNCTIONS ###
# Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #

### JSON GAME INFO ###
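# Note: get_contents, get_soup, convert_to_seconds, and get_team used below appear to be
# helpers pulled in by the wildcard import from tools.utils.shared.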
def get_game_roster(json):
    #Given raw json data, return game rosters
    roster = pd.json_normalize(json['rosterSpots'])

    alt_name_col = ['firstName.cs','firstName.de','firstName.es','firstName.fi','firstName.sk','firstName.sv']

    #Add alt-names pattern (appropriately replaces names in shift html)
    roster['fullName.default'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()
    for col in alt_name_col:
        try:
            #str.replace avoids nesting single quotes inside the f-string (a syntax error before Python 3.12)
            lang = col.replace('firstName.','')
            roster[f'fullName.{lang}'] = (roster[col] + " " + roster['lastName.default']).str.upper()
        except:
            continue
    return roster

def get_game_info(json):
    #Given raw json data, return game information

    base = pd.json_normalize(json)
    game_id = base['id'][0]
    season = base['season'][0]
    season_type = base['gameType'][0]
    game_date = base['gameDate'][0]
    start_time = base['startTimeUTC'][0]
    venue = base['venue.default'][0]
    venue_location = base['venueLocation.default'][0]
    away_team_id = base['awayTeam.id'][0]
    away_team_abbr = base['awayTeam.abbrev'][0]
    home_team_id = base['homeTeam.id'][0]
    home_team_abbr = base['homeTeam.abbrev'][0]

    return {"game_id":game_id,
            "season":season,
            "season_type":season_type,
            "game_date":game_date,
            "start_time":start_time,
            'venue':venue,
            'venue_location':venue_location,
            'away_team_id':away_team_id,
            'away_team_abbr':away_team_abbr,
            'home_team_id':home_team_id,
            'home_team_abbr':home_team_abbr,
            'rosters':get_game_roster(json)}
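
# Usage sketch (assumes the NHL GameCenter play-by-play feed; the game id below is hypothetical):
#   json = rs.get('https://api-web.nhle.com/v1/gamecenter/2023020001/play-by-play').json()
#   info = get_game_info(json)
#   info['home_team_abbr']  # e.g. 'BOS'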

### HTML PBP DATA ###
def strip_html_pbp(td,json):
    #Harry Shomer's Code (modified)

    #JSON Prep
    info = get_game_info(json)
    roster = info['rosters']

    away = info['away_team_id'] #Away team in the HTML is the seventh column
    home = info['home_team_id'] #Home team in the HTML is the eighth column
    away_players = roster.loc[roster['teamId']==away][['fullName.default','playerId','sweaterNumber']]
    home_players = roster.loc[roster['teamId']==home][['fullName.default','playerId','sweaterNumber']]

    rosters = {"away":away_players.set_index("sweaterNumber")[['playerId','fullName.default']].to_dict(),
               "home":home_players.set_index("sweaterNumber")[['playerId','fullName.default']].to_dict()
               }

    #HTML Parsing
    for y in range(len(td)):
        # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
        if y == 3:
            td[y] = td[y].get_text()   # This gets us elapsed and remaining combined -> 3:0017:00
            index = td[y].find(':')
            td[y] = td[y][:index+3]
        elif (y == 6 or y == 7) and td[0] != '#':
            # 6 & 7 -> These are the on-ice player columns
            # The second statement controls for when it's just a header
            baz = td[y].find_all('td')
            bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]  # Because of previous step we get repeats...delete some

            # The setup in the list is now: Name/Number->Position->Blank...and repeat
            # Now strip all the html
            players = []
            for i in range(len(bar)):
                if i % 3 == 0:
                    try:
                        #Using the supplied json we can bind player name and id to number and team
                        #Find number and team of player then lookup roster dictionary

                        number = bar[i].get_text().strip('\n')  # Get number and strip leading/trailing newlines
                        if y == 6:
                            team = 'away'
                        else:
                            team = 'home'

                        id = rosters[team]['playerId'][int(number)]
                        name = rosters[team]['fullName.default'][int(number)]

                    except KeyError:
                        name = ''
                        number = ''
                        id = ''
                elif i % 3 == 1:
                    if name != '':
                        position = bar[i].get_text()
                        players.append([name, number, position, id])

            td[y] = players
        else:
            td[y] = td[y].get_text()

    return td


def clean_html_pbp(html,json):
    #Harry Shomer's Code (modified)
    soup = get_contents(html)

    # Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
    td = [soup[i:i + 8] for i in range(0, len(soup), 8)]

    cleaned_html = [strip_html_pbp(x,json) for x in td]

    return cleaned_html
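
# Each cleaned event is an 8-item list mirroring the HTML report columns:
#   [0] event number ('#' on header rows), [1] period, [2] strength, [3] time,
#   [4] event type code, [5] description, [6] away on-ice players, [7] home on-ice players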

def get_html_roster(html,json,teams):
    #Given raw html and teams, return available roster data
    events = clean_html_pbp(html,json)

    #Roster dict
    roster_dict = {teams['away']:{},
                   teams['home']:{}}

    for event in events:
        if event[0] == "#":
            continue
        else:
            #Players are keys, value is a list with number, position, and description id
            #The same parsing applies to the away (event[6]) and home (event[7]) on-ice columns
            for col, team in ((6, teams['away']), (7, teams['home'])):
                for i in range(len(event[col])):
                    player = event[col][i][0]
                    num = event[col][i][1]
                    pos = event[col][i][2]
                    id = event[col][i][3]

                    #Accounting for players with three or more parts in their name
                    if len(player.split())>2:
                        last = " ".join(player.split()[1:len(player.split())])
                    else:
                        last = player.split()[len(player.split())-1]

                    desc_id = f'#{num} {last}'
                    roster_dict[team].update({
                        desc_id:[num,pos,player,team,id]
                    })

    return roster_dict

def get_json_coaches(game_id):
    #Given game id, return head coaches for away and home team

    #Retrieve data
    json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
    data = json['gameInfo']

    #Add coaches
    try:
        away = data['awayTeam']['headCoach']['default'].upper()
        home = data['homeTeam']['headCoach']['default'].upper()

        coaches = {'away':away,
                   'home':home}
    except KeyError:
        return {}

    #Return: dict with coaches
    return coaches
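
# Usage sketch (the game id is hypothetical):
#   get_json_coaches(2023020001)  # -> {'away': ..., 'home': ...}, or {} when the feed omits coaches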

def parse_html_event(event,roster,teams):
    #Given event from html events list and game roster, return event data

    events_dict = dict()
    if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM','SPC','PBOX']:
        return pd.DataFrame()
    else:
        #Event info
        events_dict['event_num'] = int(event[0])
        events_dict['period'] = int(event[1])
        events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
        events_dict['period_time_elapsed'] = event[3]
        events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
        events_dict['event_type'] = event[4]
        desc = re.sub(u'\xa0'," ",event[5])
        events_dict['description'] = desc

        events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
        zone = [x for x in desc.split(',') if 'Zone' in x]
        if not zone:
            events_dict['zone_code'] = None
        elif zone[0].find("Off") != -1:
            events_dict['zone_code'] = 'O'
        elif zone[0].find("Neu") != -1:
            events_dict['zone_code'] = 'N'
        elif zone[0].find("Def") != -1:
            events_dict['zone_code'] = 'D'

        #Convert team names for compatibility
        replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
        for name, repl in replace:
            teams['away'] = teams['away'].replace(repl,name)
            teams['home'] = teams['home'].replace(repl,name)
            desc = desc.replace(repl,name)

        event_team = desc[0:3] if desc[0:3] in [teams['away'],teams['home']] else ""
        events_dict['event_team_abbr'] = event_team

        events_dict['away_team_abbr'] = teams['away']
        events_dict['home_team_abbr'] = teams['home']
        event_skaters = []

        away_skaters = 0
        away_goalie = 0
        #Away on-ice
        for i in range(len(event[6])):
            player = event[6][i][0]
            num = event[6][i][1]
            pos = event[6][i][2]
            id = event[6][i][3]

            if pos == 'G':
                events_dict['away_goalie'] = player
                events_dict['away_goalie_id'] = id
                away_goalie += 1
            else:
                events_dict[f'away_on_{i+1}'] = player
                events_dict[f'away_on_{i+1}_id'] = id
                away_skaters += 1

        home_skaters = 0
        home_goalie = 0
        #Home on-ice
        for i in range(len(event[7])):
            player = event[7][i][0]
            num = event[7][i][1]
            pos = event[7][i][2]
            id = event[7][i][3]

            if pos == 'G':
                events_dict['home_goalie'] = player
                events_dict['home_goalie_id'] = id
                home_goalie += 1
            else:
                events_dict[f'home_on_{i+1}'] = player
                events_dict[f'home_on_{i+1}_id'] = id
                home_skaters += 1

        #Determine parsing route based on event (single player events are left)
        if event[4] in ['FAC','HIT','BLOCK','PENL']:
            #Regex to find team and player number involved (finds all for each event)
            #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
            regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
            fac = regex.findall(desc)

            try: team_1,num_1 = fac[0]
            except: team_1 = ''
            try: team_2,num_2 = fac[1]
            except: team_2 = ''

            try: rost_1 = roster[team_1]
            except: rost_1 = {}
            try: rost_2 = roster[team_2]
            except: rost_2 = {}

            #Filter incorrectly parsed teams
            repl = []
            for team, num in fac:
                if team in [teams['home'],teams['away']]:
                    repl.append((team,num))

            fac = repl

            #Determine append order (really only applies to faceoffs)
            if len(fac) == 0:
                #No data
                pass
            else:
                if len(fac) == 1:
                    #Find event players using given roster
                    for desc_id,info in rost_1.items():
                        if desc_id in desc:
                            event_skaters.append([info[2],info[1],info[4]])
                else:
                    if team_1 == event_team:
                        for desc_id,info in rost_1.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
                        for desc_id,info in rost_2.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
                    else:
                        for desc_id,info in rost_2.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
                        for desc_id,info in rost_1.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
        else:
            #Parse goal
            if event[4] == 'GOAL':
                regex = re.compile(r'#(\d+)\s+')
                goal = regex.findall(desc)

                goal_team = roster[event_team]
                #Search through individual element in goal (adds skaters in order from goal, first assist, second assist)
                for point in goal:
                    for info in goal_team.values():
                        if info[0] == point:
                            event_skaters.append([info[2],info[1],info[4]])
                            break
            else:
                #Parse single player or no player events
                combined = roster[teams['away']] | roster[teams['home']]
                for desc_id,info in combined.items():
                    if desc_id in desc:
                        event_skaters.append([info[2],info[1],info[4]])

        for i in range(len(event_skaters)):
            events_dict[f'event_player_{i+1}_name'] = event_skaters[i][0]
            events_dict[f'event_player_{i+1}_id'] = event_skaters[i][2]
            events_dict[f'event_player_{i+1}_pos'] = event_skaters[i][1]

        events_dict['away_skaters'] = away_skaters
        events_dict['home_skaters'] = home_skaters
        events_dict['away_goalie_in'] = away_goalie
        events_dict['home_goalie_in'] = home_goalie

        event_skaters = away_skaters if teams['away'] == event_team else home_skaters
        event_skaters_against = away_skaters if teams['home'] == event_team else home_skaters
        events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
        events_dict['event_skaters'] = np.where(event_team == teams['home'],home_skaters,away_skaters)

    #Return: dataframe of event in a single row
    return pd.DataFrame([events_dict])
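
# strength_state example: with 5 away skaters and 4 home skaters on the ice,
# an away-team event is recorded as '5v4' and a home-team event as '4v5'.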

def parse_html(game_id,html,json):
    #Given the game id, raw html document to a provided game, and json data, return parsed HTML play-by-play

    #Retrieve cleaned html data (from Harry Shomer's hockey_scraper package)
    events = clean_html_pbp(html,json)

    json_info = pd.json_normalize(json)
    teams = {
        'away':json_info['awayTeam.abbrev'][0],
        'home':json_info['homeTeam.abbrev'][0]
    }

    roster = get_html_roster(html,json,teams)
    event_log = []
    for event in events:
        event_log.append(parse_html_event(event,roster,teams))

    data = pd.concat(event_log)
    data['event_type'] = data['event_type'].replace({
        "PGSTR": "pre-game-start",
        "PGEND": "pre-game-end",
        'GSTR':"game-start",
        "ANTHEM":"anthem",
        "PSTR":"period-start",
        'FAC':"faceoff",
        "SHOT":"shot-on-goal",
        "BLOCK":"blocked-shot",
        "STOP":"stoppage",
        "MISS":"missed-shot",
        "HIT":"hit",
        "GOAL":"goal",
        "GIVE":"giveaway",
        "TAKE":"takeaway",
        "DELPEN":"delayed-penalty",
        "PENL":"penalty",
        "CHL":"challenge",
        "PEND":"period-end",
        "GEND":"game-end"
    })

    check_col = ['event_player_1_id','event_player_2_id','event_player_3_id',
                 'away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
                 'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
                 'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
                 'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id']

    for col in check_col:
        try: data[col]
        except:
            data[col] = ""

    #Return: HTML play-by-play
    return data

### JSON PBP DATA ###
def parse_json(json):
    #Given json data from an NHL API call, return play-by-play data.

    events = pd.json_normalize(json['plays']).reset_index(drop=True)
    info = pd.json_normalize(json)
    roster = get_game_roster(json)

    #Return error if game is set in the future
    if info['gameState'][0] == 'FUT':
        raise ValueError(f"Game {info['id'][0]} has not occurred yet.")

    away = info['awayTeam.id'][0]
    home = info['homeTeam.id'][0]
    teams = {
        away:info['awayTeam.abbrev'][0],
        home:info['homeTeam.abbrev'][0]
    }

    #Create player information dicts used to create event_player columns
    players = {}
    for id, player in zip(list(roster['playerId']),list(roster['fullName.default'])):
        players.update({id:player.upper()})

    #Test columns
    cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide',
            'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType',
            'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId',
            'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl',
            'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId',
            'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal',
            'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl',
            'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr',
            'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG',
            'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId',
            'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId',
            'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId',
            'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']

    for col in cols:
        try: events[col]
        except:
            events[col] = ""

    #Event_player columns include players in a given set of events; the lower the number, the more central the player was to the play
    events['event_player_1_id'] = events['details.winningPlayerId'].combine_first(events['details.scoringPlayerId'])\
                                                                   .combine_first(events['details.shootingPlayerId'])\
                                                                   .combine_first(events['details.playerId'])\
                                                                   .combine_first(events['details.hittingPlayerId'])\
                                                                   .combine_first(events['details.committedByPlayerId'])

    events['event_player_2_id'] = events['details.losingPlayerId'].combine_first(events['details.assist1PlayerId'])\
                                                                  .combine_first(events['details.hitteePlayerId'])\
                                                                  .combine_first(events['details.drawnByPlayerId'])\
                                                                  .combine_first(events['details.blockingPlayerId'])

    events['event_player_3_id'] = events['details.assist2PlayerId']

    events['event_team_status'] = np.where(events['details.eventOwnerTeamId']==home,"home","away")

    #Coordinate adjustments:
    #The WSBA NHL Scraper includes three sets of coordinates per event:
    # x, y - Raw coordinates from JSON pbp
    # x_fixed, y_fixed - Coordinates fixed to the right side of the ice (x is always greater than 0)
    # x_adj, y_adj - Adjusted coordinates: away events take negative x values while home events are always positive

    #Some games (mostly preseason and all star games) do not include coordinates.
    try:
        events['x_fixed'] = abs(events['details.xCoord'])
        events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
        events['x_adj'] = np.where(events['event_team_status']=="home",events['x_fixed'],-events['x_fixed'])
        events['y_adj'] = np.where(events['event_team_status']=="home",events['y_fixed'],-events['y_fixed'])
        events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
        events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
    except TypeError:
        print(f"No coordinates found for game {info['id'][0]}...")

        events['x_fixed'] = np.nan
        events['y_fixed'] = np.nan
        events['x_adj'] = np.nan
        events['y_adj'] = np.nan
        events['event_distance'] = np.nan
        events['event_angle'] = np.nan
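
    # Worked example: a shot at (x, y) = (-60, 20) flips to x_fixed = 60, y_fixed = -20;
    # its distance to the net (at x = 89) is sqrt((89-60)^2 + 20^2) ≈ 35.2 ft
    # and its angle is degrees(arctan2(20, 29)) ≈ 34.6°.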

    events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(teams)
    events['event_goalie'] = events['details.goalieInNetId'].replace(players)

    #Rename columns to follow WSBA naming conventions
    events = events.rename(columns={
        "eventId":"event_id",
        "periodDescriptor.number":"period",
        "periodDescriptor.periodType":"period_type",
        "timeInPeriod":"period_time_elapsed",
        "timeRemaining":"period_time_remaining",
        "situationCode":"situation_code",
        "homeTeamDefendingSide":"home_team_defending_side",
        "typeCode":"event_type_code",
        "typeDescKey":"event_type",
        "details.shotType":"shot_type",
        "details.duration":"penalty_duration",
        "details.descKey":"penalty_description",
        "details.reason":"reason",
        "details.zoneCode":"zone_code",
        "details.xCoord":"x",
        "details.yCoord":"y",
        "details.goalieInNetId": "event_goalie_id",
        "details.awaySOG":"away_SOG",
        "details.homeSOG":"home_SOG"
    })

    #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
    events['period_time_simple'] = events['period_time_elapsed'].str.replace(":","",regex=True)
    events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
                                                ((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
                                                ((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
    events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']
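
    # Worked example: '7:45' in period 2 -> period_time_simple '745' (length 3),
    # so period_seconds_elapsed = 7*60 + 45 = 465 and seconds_elapsed = 1200 + 465 = 1665.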

    events = events.loc[(events['event_type']!="")]

    #Assign score and fenwick for each event
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    ag = 0
    ags = []
    hg = 0
    hgs = []

    af = 0
    afs = []
    hf = 0
    hfs = []
    for event,team in zip(list(events['event_type']),list(events['event_team_status'])):
        if event in fenwick_events:
            if team == "home":
                hf += 1
                if event == 'goal':
                    hg += 1
            else:
                af += 1
                if event == 'goal':
                    ag += 1

        ags.append(ag)
        hgs.append(hg)
        afs.append(af)
        hfs.append(hf)

    events['away_score'] = ags
    events['home_score'] = hgs
    events['away_fenwick'] = afs
    events['home_fenwick'] = hfs

    #Return: dataframe with parsed game
    return events

def combine_pbp(game_id,html,json):
    #Given game id, html data, and json data, return complete play-by-play data for provided game

    html_pbp = parse_html(game_id,html,json)
    info = get_game_info(json)

    #Route data combining - ESPN for 2009-2010 and earlier, json for every season after:
    if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
        #ESPN x HTML
        espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'})
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']

        df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)

    else:
        #JSON x HTML
        json_pbp = parse_json(json)
        #Modify merge conditions and merge pbps
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
        html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'])

        df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col)

    #Add game info
    info_col = ['season','season_type','game_id','game_date',"start_time","venue","venue_location",
                'away_team_abbr','home_team_abbr']

    for col in info_col:
        df[col] = info[col]

    #Fill period_type column and assign shifts a sub-500 event code
    df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
    try: df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
    except:
        pass
    df = df.sort_values(['period','seconds_elapsed']).reset_index()

    df['event_team_status'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))

    col = [col for col in get_col() if col in df.columns.to_list()]
    #Return: complete play-by-play information for provided game
    return df[col]
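
# Note: the JSON merge adds event_player_1_id to the keys, likely so that same-type events
# logged at the same second (e.g. two hits) pair with the correct JSON rows.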

### ESPN SCRAPING FUNCTIONS ###
def espn_game_id(date,away,home):
    #Given a date formatted as YYYY-MM-DD and teams, return game id from ESPN schedule
    date = date.replace("-","")

    #Retrieve data
    api = f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date}"
    schedule = pd.json_normalize(rs.get(api).json()['events'])

    #Create team abbreviation columns
    schedule['away_team_abbr'] = schedule['shortName'].str[:3].str.strip(" ")
    schedule['home_team_abbr'] = schedule['shortName'].str[-3:].str.strip(" ")

    #Modify team abbreviations as necessary
    schedule = schedule.replace({
        "LA":"LAK",
        "NJ":"NJD",
        "SJ":"SJS",
        "TB":"TBL",
    })

    #Retrieve game id
    game_id = schedule.loc[(schedule['away_team_abbr']==away)&
                           (schedule['home_team_abbr']==home),'id'].tolist()[0]

    #Return: ESPN game id
    return game_id
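
# Usage sketch (date and matchup are hypothetical):
#   espn_game_id('2009-10-01','BOS','WSH')  # -> ESPN's id for that game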

def parse_espn(date,away,home):
    #Given a date formatted as YYYY-MM-DD and teams, return game events
    game_id = espn_game_id(date,away,home)
    url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'

    #Code modified from Patrick Bacon

    #Retrieve game events as json
    page = rs.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
    soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
    json = json_lib.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])

    #DataFrame of time-related info for events
    #(pd.concat replaces the private DataFrame._append, which newer pandas versions remove)
    clock_df = pd.DataFrame()

    for period in range(0, len(json)):
        clock_df = pd.concat([clock_df, pd.DataFrame(json[period])])

    clock_df = clock_df[~pd.isna(clock_df.clock)]

    # Needed to add .split(',"st":3')[0] for playoffs

    #DataFrame of coordinates for events
    coords_df = pd.DataFrame(json_lib.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))

    clock_df = clock_df.assign(
        clock = clock_df.clock.apply(lambda x: x['displayValue'])
    )

    coords_df = coords_df.assign(
        coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
        coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
        event_player_1_name = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
    )

    #Combine
    espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])

    espn_events = espn_events.assign(
        period = espn_events['period'].apply(lambda x: x['number']),
        minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
        seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
        event_type = espn_events['type'].apply(lambda x: x['txt'])
    )

    espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
                                                         (espn_events.event_type=='Face Off'), 0, espn_events.coords_x),
                                     coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
                                                         (espn_events.event_type=='Face Off'), 0, espn_events.coords_y))

    espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1_name))]

    espn_events = espn_events.assign(
        coords_x = espn_events.coords_x.astype(int),
        coords_y = espn_events.coords_y.astype(int)
    )

    #Rename events
    #The turnover event includes just one player in the event information, meaning takeaways will have no coordinates for play-by-plays created by ESPN scraping
    espn_events['event_type'] = espn_events['event_type'].replace({
        "Face Off":'faceoff',
        "Hit":'hit',
        "Shot":'shot-on-goal',
        "Missed":'missed-shot',
        "Blocked":'blocked-shot',
        "Goal":'goal',
        "Turnover":'giveaway',
        "Delayed Penalty":'delayed-penalty',
        "Penalty":'penalty',
    })

    #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
    espn_events['period_time_simple'] = espn_events['clock'].str.replace(":","",regex=True)
    espn_events['period_seconds_elapsed'] = np.where(espn_events['period_time_simple'].str.len()==3,
                                                     ((espn_events['period_time_simple'].str[0].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)),
                                                     ((espn_events['period_time_simple'].str[0:2].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)))
    espn_events['seconds_elapsed'] = ((espn_events['period']-1)*1200)+espn_events['period_seconds_elapsed']

    espn_events = espn_events.rename(columns = {'text':'description'})

    #Add event team
    espn_events['event_team_abbr'] = espn_events['homeAway'].replace({
        "away":away,
        "home":home
    })

    #Some games (mostly preseason and all star games) do not include coordinates.
    try:
        espn_events['x_fixed'] = abs(espn_events['coords_x'])
        espn_events['y_fixed'] = np.where(espn_events['coords_x']<0,-espn_events['coords_y'],espn_events['coords_y'])
        espn_events['x_adj'] = np.where(espn_events['homeAway']=="home",espn_events['x_fixed'],-espn_events['x_fixed'])
        espn_events['y_adj'] = np.where(espn_events['homeAway']=="home",espn_events['y_fixed'],-espn_events['y_fixed'])
        espn_events['event_distance'] = np.sqrt(((89 - espn_events['x_fixed'])**2) + (espn_events['y_fixed']**2))
        espn_events['event_angle'] = np.degrees(np.arctan2(abs(espn_events['y_fixed']), abs(89 - espn_events['x_fixed'])))
    except TypeError:
        print("No coordinates found for ESPN game...")

        espn_events['x_fixed'] = np.nan
        espn_events['y_fixed'] = np.nan
        espn_events['x_adj'] = np.nan
        espn_events['y_adj'] = np.nan
        espn_events['event_distance'] = np.nan
        espn_events['event_angle'] = np.nan

    #Assign score and fenwick for each event
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    ag = 0
    ags = []
    hg = 0
    hgs = []

    af = 0
    afs = []
    hf = 0
    hfs = []
    for event,team in zip(list(espn_events['event_type']),list(espn_events['homeAway'])):
        if event in fenwick_events:
            if team == "home":
                hf += 1
                if event == 'goal':
                    hg += 1
            else:
                af += 1
                if event == 'goal':
                    ag += 1

        ags.append(ag)
        hgs.append(hg)
        afs.append(af)
        hfs.append(hf)

    espn_events['away_score'] = ags
    espn_events['home_score'] = hgs
    espn_events['away_fenwick'] = afs
    espn_events['home_fenwick'] = hfs
    #Return: play-by-play events in supplied game from ESPN
    return espn_events

### SHIFT SCRAPING FUNCTIONS ###
def analyze_shifts(shift, id, name, pos, team):
    #Parses a single shift row from the shifts html (parsed by Beautiful Soup)
    #Modified version of Harry Shomer's analyze_shifts function in the hockey_scraper package
    shifts = dict()

    shifts['player_name'] = name.upper()
    shifts['player_id'] = id
    shifts['player_pos'] = pos
    shifts['period'] = '4' if shift[1] == 'OT' else '5' if shift[1] == 'SO' else shift[1]
    shifts['event_team_abbr'] = get_team(team.strip(' '))
    shifts['start'] = convert_to_seconds(shift[2].split('/')[0])
    shifts['duration'] = convert_to_seconds(shift[4].split('/')[0])

    # This field is unreliable...if the end time contains no digits, derive it from start + duration
    if re.compile(r'\d+').findall(shift[3].split('/')[0]):
        shifts['end'] = convert_to_seconds(shift[3].split('/')[0])
    else:
        shifts['end'] = shifts['start'] + shifts['duration']
    return shifts
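
# Each raw shift row is a 5-item list: [shift #, period, start, end, duration],
# with the time fields given as paired 'elapsed / remaining' strings (hence the split('/') above).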

def parse_shifts_html(html,json):
    #Parsing of shifts data for a single team in a provided game
    #Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package

    #JSON Prep
    info = get_game_info(json)
    roster = info['rosters']

    away = info['away_team_id']
    home = info['home_team_id']
    away_players = roster.loc[roster['teamId']==away][['playerId','fullName.default','positionCode','sweaterNumber']]
    home_players = roster.loc[roster['teamId']==home][['playerId','fullName.default','positionCode','sweaterNumber']]

    #Create roster dict
    rosters = {"away":away_players.set_index("sweaterNumber")[['playerId','fullName.default','positionCode']].to_dict(),
               "home":home_players.set_index("sweaterNumber")[['playerId','fullName.default','positionCode']].to_dict()
               }

    all_shifts = []
    #columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
    td, teams = get_soup(html)

    team = teams[0]
    home_team = teams[1]
    players = dict()
    status = 'home' if team == home_team else 'away'

    # Iterates through each player shifts table with the following data:
    # Shift #, Period, Start, End, and Duration.
    for t in td:
        t = t.get_text()
        if ',' in t:  # If a comma exists it is a player
            name = t

            name = name.split(',')
            number = int(name[0][:2].strip())
            id = rosters[status]['playerId'][number]
            players[id] = dict()

            #HTML shift functions assess one team at a time, which simplifies the lookup from number to name and id

            players[id]['name'] = rosters[status]['fullName.default'][number]
            players[id]['pos'] = rosters[status]['positionCode'][number]

            players[id]['shifts'] = []
        else:
            players[id]['shifts'].extend([t])

    for key in players.keys():
        # Create lists of shifts-table columns for analysis
        players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]

        name = players[key]['name']
        pos = players[key]['pos']

        # Parsing
        shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
        all_shifts.extend(shifts)

    df = pd.DataFrame(all_shifts)

    shifts_raw = df[df['duration'] > 0]

    #Return: single-team individual shifts by player
    return shifts_raw

def parse_shift_events(html,json,home):
    #Given shift document and home team conditional, parse and convert document to shift events congruent to html play-by-play
    shift = parse_shifts_html(html,json)
    rosters = get_game_roster(json)

    # Identify shift starts for each shift event
    shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
        num_on=('player_name', 'size'),
        players_on=('player_name', lambda x: ', '.join(x)),
        ids_on=('player_id', lambda x: ', '.join(map(str,x))),
    ).reset_index()

    shifts_on = shifts_on.rename(columns={
        'start':"seconds_elapsed"
    })

    # Identify shift stops for each shift event
    shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
        num_off=('player_name', 'size'),
        players_off=('player_name', lambda x: ', '.join(x)),
        ids_off=('player_id', lambda x: ', '.join(map(str,x))),
    ).reset_index()

    shifts_off = shifts_off.rename(columns={
        'end':"seconds_elapsed"
    })

    # Merge and sort by time in game
    shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')

    shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
    shifts['event_type'] = 'change'

    #Shift events similar to html (remove shootout shifts)
    shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])
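
    # On-ice reconstruction: for each player, an indicator is +1 at any change event where their id
    # appears in ids_on and -1 where it appears in ids_off; the running cumulative sum is therefore
    # 1 while they are on the ice and 0 while they are off.
    # e.g. on at changes 3 and 7, off at change 5 -> cumsum: 0, 0, 1, 1, 0, 0, 1, ...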

    #Generate on-ice columns
    skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
    goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
    team = list(shift['event_team_abbr'])[0]

    skaters = pd.DataFrame()
    goalies = pd.DataFrame()
    for player in skater_names:
        #For each player in the game, determine when they began and ended shifts.
        #With player ids as columns, 1 marks a shift event the player was on the ice for and 0 marks off the ice
        on_ice = (np.cumsum(
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x)))) -
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x))))
        ))
        skaters[player] = on_ice

    skaters = skaters.fillna(0).astype(int)

    on_skaters = (skaters == 1).stack().reset_index()
    on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()

    max_players = 6
    for i in range(max_players):
        on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")

    on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

    #Repeat this process with goaltenders
    for player in goalie_names:
        on_ice = (np.cumsum(
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x)))) -
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x))))
        ))
        goalies[player] = on_ice

    goalies = goalies.fillna(0).astype(int)

    on_goalies = (goalies == 1).stack().reset_index()
    on_goalies = on_goalies[on_goalies[0]].groupby("level_0")["level_1"].apply(list).reset_index()

    max_players = 1
    for i in range(max_players):
        on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")

    on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

    #combine on-ice skaters and goaltenders for each shift event
    on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])

    shifts['row'] = shifts.index

    if home:
        shifts['home_team_abbr'] = team
    else:
        shifts['away_team_abbr'] = team
    #Return: shift events with newly added on-ice columns (NaN values are filled with empty strings)
    return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")

def combine_shifts(away_html,home_html,json):
    #JSON Prep
    info = get_game_info(json)
    del info['rosters']

    roster = get_game_roster(json)
    #Parse each team's shifts document
    away = parse_shift_events(away_html,json,False)
    home = parse_shift_events(home_html,json,True)

    #Combine shifts
    data = pd.concat([away,home]).sort_values(['period','seconds_elapsed'])

    #Create info columns
    for col in info.keys():
        data[col] = info[col]

    #Create player information dicts to create on-ice names
    players = {}
    for id, player in zip(list(roster['playerId']),list(roster['fullName.default'])):
        players.update({str(id):player.upper()})

    for i in range(0,7):
        if i == 6:
            data['away_goalie'] = data['away_goalie_id'].replace(players)
            data['home_goalie'] = data['home_goalie_id'].replace(players)
        else:
            data[f'away_on_{i+1}'] = data[f'away_on_{i+1}_id'].replace(players)
            data[f'home_on_{i+1}'] = data[f'home_on_{i+1}_id'].replace(players)

    data = data.sort_values(['period','seconds_elapsed'])
    #Fill on-ice columns down
    on_ice_col = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
                  'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
                  'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
                  'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',
                  'away_goalie','home_goalie','away_goalie_id','home_goalie_id']

    for col in on_ice_col:
        data[col] = data[col].ffill()

    #Create strength state information
    away_on = ['away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',]
    home_on = ['home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',]
    data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    data['strength_state'] = np.where(data['event_team_abbr']==data['away_team_abbr'],
                                      data['away_skaters'].astype(str)+"v"+data['home_skaters'].astype(str),
                                      data['home_skaters'].astype(str)+"v"+data['away_skaters'].astype(str))

    #Return: full shifts data converted to play-by-play format
    col = [col for col in get_col() if col in data.columns.to_list()]
    return data[col]

### FINALIZE PBP ###
def get_col():
    return [
        'season','season_type','game_id','game_date',"start_time","venue","venue_location",
        'away_team_abbr','home_team_abbr','event_num','period','period_type',
        'seconds_elapsed',"situation_code","strength_state","home_team_defending_side",
        "event_type_code","event_type","description","penalty_duration",
        "event_team_abbr",'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
        "event_team_status",
        "event_player_1_name","event_player_2_name","event_player_3_name",
        "event_player_1_id","event_player_2_id","event_player_3_id",
        "event_player_1_pos","event_player_2_pos","event_player_3_pos",
        "event_goalie","event_goalie_id",
        "shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
        "event_skaters","away_skaters","home_skaters",
        "event_distance","event_angle","away_score","home_score", "away_fenwick", "home_fenwick",
        "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
        "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
        "away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
        "home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
        "event_coach","away_coach","home_coach"
    ]

def combine_data(game_id,html_pbp,away_shifts,home_shifts,json):
    #Given game_id, html pbp, away and home shifts, and json pbp, return the complete game play-by-play with additional and corrected details
    #Create dfs
    pbp = combine_pbp(game_id,html_pbp,json)
    shifts = combine_shifts(away_shifts,home_shifts,json)

    #Combine data
    df = pd.concat([pbp,shifts])

    #Create priority columns designed to order events that occur at the same time in a game
    even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
    df['priority'] = np.where(df['event_type'].isin(even_pri),1,
                     np.where(df['event_type']=='goal',2,
                     np.where(df['event_type']=='stoppage',3,
                     np.where(df['event_type']=='delayed-penalty',4,
                     np.where(df['event_type']=='penalty',5,
                     np.where(df['event_type']=='period-end',6,
                     np.where(df['event_type']=='change',7,
                     np.where(df['event_type']=='game-end',8,
                     np.where(df['event_type']=='period-start',9,
                     np.where(df['event_type']=='faceoff',10,0))))))))))
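
    # Ordering example: a goal (priority 2) and a line change (7) logged at the same second
    # sort with the goal first, then the change, then the ensuing faceoff (10).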

    df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
    df = df.sort_values(['period','seconds_elapsed','priority'])

    #Recalibrate event_num column to accurately depict the order of all events, including changes
    df.reset_index(inplace=True,drop=True)
    df['event_num'] = df.index+1
    df['event_team_status'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
    df['event_type_last'] = df['event_type'].shift(1)
    df['event_type_last_2'] = df['event_type_last'].shift(1)
    df['event_type_next'] = df['event_type'].shift(-1)
    lag_events = ['stoppage','goal','period-end']
    lead_events = ['faceoff','period-end']
    period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
    #Define shifts by "line-change" or "on-the-fly"
    #(the original three-argument np.logical_or silently passed its third condition as the
    # 'out' parameter; the conditions are combined with '|' instead)
    stoppage_change = (df['event_type_last'].isin(lag_events) |
                       df['event_type_last_2'].isin(lag_events) |
                       df['event_type_next'].isin(lead_events) |
                       df['seconds_elapsed'].isin(period_end_secs))
    df['shift_type'] = np.where(df['event_type']=='change',np.where(stoppage_change,"line-change","on-the-fly"),"")
    df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
    try:
        df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
    except:
        pass

    #Retrieve coaches
    coaches = get_json_coaches(game_id)
    if not coaches:
        df['away_coach'] = ""
        df['home_coach'] = ""
        df['event_coach'] = ""
    else:
        df['away_coach'] = coaches['away']
        df['home_coach'] = coaches['home']
        df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))

    #Forward fill as necessary
    cols = ['period_type','home_team_defending_side','away_score','away_fenwick','home_score','home_fenwick','away_coach','home_coach']
    for col in cols:
        try: df[col]
        except: df[col] = ""
        df[col] = df[col].ffill()

    #Return: complete play-by-play with all important data for each event in a provided game
    return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)