wsba-hockey 1.1.1-py3-none-any.whl → 1.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/api/api/index.py +1 -1
- wsba_hockey/tools/plotting.py +1 -1
- wsba_hockey/tools/scraping.py +1 -1
- wsba_hockey/tools/xg_model.py +2 -2
- {wsba_hockey-1.1.1.dist-info → wsba_hockey-1.1.2.dist-info}/METADATA +1 -1
- {wsba_hockey-1.1.1.dist-info → wsba_hockey-1.1.2.dist-info}/RECORD +9 -21
- wsba_hockey/api/api/main.py +0 -4
- wsba_hockey/api/api/tools/__init__.py +0 -0
- wsba_hockey/api/api/tools/agg.py +0 -374
- wsba_hockey/api/api/tools/archive/old_scraping.py +0 -1104
- wsba_hockey/api/api/tools/plotting.py +0 -144
- wsba_hockey/api/api/tools/scraping.py +0 -1000
- wsba_hockey/api/api/tools/utils/__init__.py +0 -1
- wsba_hockey/api/api/tools/utils/config.py +0 -14
- wsba_hockey/api/api/tools/utils/save_pages.py +0 -133
- wsba_hockey/api/api/tools/utils/shared.py +0 -450
- wsba_hockey/api/api/tools/xg_model.py +0 -455
- wsba_hockey/api/api/wsba_main.py +0 -1213
- {wsba_hockey-1.1.1.dist-info → wsba_hockey-1.1.2.dist-info}/WHEEL +0 -0
- {wsba_hockey-1.1.1.dist-info → wsba_hockey-1.1.2.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-1.1.1.dist-info → wsba_hockey-1.1.2.dist-info}/top_level.txt +0 -0
wsba_hockey/api/api/tools/scraping.py (removed in 1.1.2)
@@ -1,1000 +0,0 @@
```python
import re
import warnings
import os
import numpy as np
import pandas as pd
import requests as rs
import json as json_lib
from bs4 import BeautifulSoup
from tools.utils.shared import *
warnings.filterwarnings('ignore')

### SCRAPING FUNCTIONS ###
# Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #

## ORDER OF OPERATIONS ##
# Create game information to use with all functions
# Retrieve JSON data
# Parse JSON data
# Retrieve and clean HTML pbp with player information
# Parse HTML pbp, return parsed HTML
# Combine pbp data
# Retrieve and analyze HTML shifts with player information for home and away teams
# Parse shift events
# Combine all data, return complete play-by-play

## UTILITY FUNCTIONS ##
def get_col():
    return [
        'season','season_type','game_id','game_date',"start_time","venue","venue_location",
        'away_team_abbr','home_team_abbr','event_num','period','period_type',
        'seconds_elapsed','period_time','game_time',"strength_state","strength_state_venue","home_team_defending_side",
        "event_type_code","event_type","description","event_reason",
        "penalty_type","penalty_duration","penalty_attribution",
        "event_team_abbr","event_team_venue",
        'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
        "event_player_1_name","event_player_2_name","event_player_3_name",
        "event_player_1_id","event_player_2_id","event_player_3_id",
        "event_player_1_pos","event_player_2_pos","event_player_3_pos",
        "event_goalie_name","event_goalie_id",
        "shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
        "event_skaters","away_skaters","home_skaters",
        "event_distance","event_angle","event_length","seconds_since_last",
        "away_score","home_score", "away_fenwick", "home_fenwick",
        "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
        "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
        "away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
        "home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
        "event_coach","away_coach","home_coach"
    ]

def med_x_coord(group):
    #Calculate the median x coordinate of a corsi shot for a team in a period to determine the direction they are shooting in that period (for coordinate adjustments and geometric calculations)
    med_x = group.loc[group['event_type'].isin(['blocked-shot','missed-shot','shot-on-goal','goal']),'x'].median(skipna=True)
    group['med_x'] = med_x

    return group

def adjust_coords(pbp):
    #Given JSON or ESPN pbp data, return pbp with adjusted coordinates

    #Recalibrate coordinates
    #Determine the direction teams are shooting in a given period
    pbp = pbp.groupby(['event_team_venue','period','game_id'],group_keys=False).apply(med_x_coord)

    pbp = pbp.reset_index(drop=True)

    #Adjust coordinates
    pbp['x_adj'] = np.where((((pbp['event_team_venue']=='home')&(pbp['med_x'] < 0))|((pbp['event_team_venue']=='away')&(pbp['med_x'] > 0))),-pbp['x'],pbp['x'])

    #Adjust y if necessary
    pbp['y_adj'] = np.where((pbp['x']==pbp['x_adj']),pbp['y'],-pbp['y'])

    #Calculate event distance and angle relative to venue location
    pbp['event_distance'] = np.where(pbp['event_team_venue']=='home',np.sqrt(((89 - pbp['x_adj'])**2) + (pbp['y_adj']**2)),np.sqrt((((-89) - pbp['x_adj'])**2) + (pbp['y_adj']**2)))
    pbp['event_angle'] = np.where(pbp['event_team_venue']=='away',np.degrees(np.arctan2(abs(pbp['y_adj']), abs(89 - pbp['x_adj']))),np.degrees(np.arctan2(abs(pbp['y_adj']), abs((-89) - pbp['x_adj']))))

    #Return: pbp with adjusted coordinates
    return pbp
```
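The distance and angle formulas in `adjust_coords` measure each shot against the goal line at x = ±89 after the flip. Reduced to scalars for a net at x = +89, the math checks out like this (the coordinates are illustrative):

```python
import math

# Illustrative adjusted shot coordinates, attacking the net at x = +89.
x_adj, y_adj = 54.0, 20.0

# Same distance/angle math as adjust_coords, applied to a single point.
event_distance = math.sqrt((89 - x_adj) ** 2 + y_adj ** 2)
event_angle = math.degrees(math.atan2(abs(y_adj), abs(89 - x_adj)))

print(round(event_distance, 1), round(event_angle, 1))  # 40.3 29.7
```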
```python
## JSON FUNCTIONS ##
def get_game_roster(json):
    #Given raw json data, return game rosters
    roster = pd.json_normalize(json['rosterSpots'])
    roster['full_name'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()

    #Return: roster information
    return roster

def get_game_coaches(game_id):
    #Given game info, return head coaches for away and home team

    #Retrieve data
    json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
    data = json['gameInfo']

    #Add coaches
    try:
        away = data['awayTeam']['headCoach']['default'].upper()
        home = data['homeTeam']['headCoach']['default'].upper()

        coaches = {'away':away,
                   'home':home}
    except KeyError:
        return {}

    #Return: dict with coaches
    return coaches

def get_game_info(game_id):
    #Given game_id, return game information

    #Retrieve data
    api = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
    json = rs.get(api).json()

    #Games don't always have JSON shifts, for whatever reason
    shifts = f"https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={game_id}"
    shifts = rs.get(shifts).json()
    json_shifts = pd.json_normalize(shifts['data'])

    if shifts['total'] == 0:
        json_shifts = pd.DataFrame()

    #Split information
    base = pd.json_normalize(json)
    game_id = base['id'][0]
    season = base['season'][0]
    season_type = base['gameType'][0]
    game_date = base['gameDate'][0]
    game_state = base['gameState'][0]
    start_time = base['startTimeUTC'][0]
    venue = base['venue.default'][0]
    venue_location = base['venueLocation.default'][0]
    away_team_id = base['awayTeam.id'][0]
    away_team_abbr = base['awayTeam.abbrev'][0]
    home_team_id = base['homeTeam.id'][0]
    home_team_abbr = base['homeTeam.abbrev'][0]

    #Add roster
    roster = get_game_roster(json)
    #In the HTML parsing process, players are identified by a regex pattern (ABB #00 such as BOS #37) or by number and name in the following format: #00 NAME (i.e. #37 BERGERON), so these are added as IDs of sorts.
    roster['descID'] = '#'+roster['sweaterNumber'].astype(str)+" "+roster['lastName.default'].str.upper()
    roster['team_abbr'] = roster['teamId'].replace({
        away_team_id:[away_team_abbr],
        home_team_id:[home_team_abbr]
    })
    roster['key'] = roster['team_abbr'] + " #" + roster['sweaterNumber'].astype(str)

    #Create an additional roster dictionary for use with HTML parsing
    #Roster dict
    roster_dict = {'away':{},
                   'home':{}}

    #Evaluate and add players by team
    for team in ['away','home']:
        abbr = (away_team_abbr if team == 'away' else home_team_abbr)
        rost = roster.loc[roster['team_abbr']==abbr]

        #Now iterate through team players
        for player,id,num,pos,team_abbr,key in zip(rost['full_name'],rost['playerId'],rost['sweaterNumber'],rost['positionCode'],rost['team_abbr'],rost['key']):
            roster_dict[team].update({str(num):[key, pos, player, team_abbr, id]})

    #Return: game information
    return {"game_id":str(game_id),
            "season":season,
            "season_type":season_type,
            "game_date":game_date,
            "game_state":game_state,
            "start_time":start_time,
            'venue':venue,
            'venue_location':venue_location,
            'away_team_id':away_team_id,
            'away_team_abbr':away_team_abbr,
            'home_team_id':home_team_id,
            'home_team_abbr':home_team_abbr,
            'events':pd.json_normalize(json['plays']).reset_index(drop=True),
            'rosters':roster,
            'HTML_rosters':roster_dict,
            'coaches':get_game_coaches(game_id),
            'json_shifts':json_shifts}
```
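`get_game_info` is the entry point the rest of the file builds on: it needs nothing beyond an NHL game id, fetches everything else from the two api-web.nhle.com endpoints above, and returns the dict every downstream parser consumes. A hypothetical call (the game id is illustrative):

```python
# Hypothetical usage of get_game_info; the game id is illustrative.
info = get_game_info(2024020001)

print(info['away_team_abbr'], '@', info['home_team_abbr'])
print(len(info['events']), 'JSON events')
print(list(info['HTML_rosters']['home'])[:5])  # sweater numbers keyed as strings
```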
```python
def parse_json(info):
    #Given game info, return JSON document

    #Retrieve data
    events = info['events']

    #Return error if game is set in the future
    if info['game_state'] == 'FUT':
        raise ValueError(f"Game {info['game_id']} has not occurred yet.")

    #Test columns
    cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']

    for col in cols:
        try: events[col]
        except: events[col] = ""

    #Event_player columns include players in a given set of events; the higher the number, the greater the importance of the event player to the play
    events['event_player_1_id'] = events['details.winningPlayerId'].combine_first(events['details.scoringPlayerId'])\
                                                                   .combine_first(events['details.shootingPlayerId'])\
                                                                   .combine_first(events['details.playerId'])\
                                                                   .combine_first(events['details.hittingPlayerId'])\
                                                                   .combine_first(events['details.committedByPlayerId'])

    events['event_player_2_id'] = events['details.losingPlayerId'].combine_first(events['details.assist1PlayerId'])\
                                                                  .combine_first(events['details.hitteePlayerId'])\
                                                                  .combine_first(events['details.drawnByPlayerId'])\
                                                                  .combine_first(events['details.blockingPlayerId'])

    events['event_player_3_id'] = events['details.assist2PlayerId']

    events['event_team_venue'] = np.where(events['details.eventOwnerTeamId']==info['home_team_id'],"home","away")

    events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace({
        info['away_team_id']:[info['away_team_abbr']],
        info['home_team_id']:[info['home_team_abbr']]
    })

    #Rename columns to follow WSBA naming conventions
    events = events.rename(columns={
        "eventId":"event_id",
        "periodDescriptor.number":"period",
        "periodDescriptor.periodType":"period_type",
        "timeInPeriod":"period_time_elasped",
        "timeRemaining":"period_time_remaining",
        "situationCode":"situation_code",
        "homeTeamDefendingSide":"home_team_defending_side",
        "typeCode":"event_type_code",
        "typeDescKey":"event_type",
        "details.shotType":"shot_type",
        "details.duration":"penalty_duration",
        "details.descKey":"penalty_type",
        "details.typeCode":'penalty_attribution',
        "details.reason":"event_reason",
        "details.zoneCode":"zone_code",
        "details.xCoord":"x",
        "details.yCoord":"y",
        "details.goalieInNetId": "event_goalie_id",
        "details.awaySOG":"away_sog",
        "details.homeSOG":"home_sog"
    })

    #Coordinate adjustments:
    # x, y - Raw coordinates from JSON pbp
    # x_adj, y_adj - Adjusted coordinates configuring the away offensive zone to the left and the home offensive zone to the right
    #Some games (mostly preseason and all-star games) do not include coordinates.

    try:
        events = adjust_coords(events)

    except KeyError:
        print(f"No coordinates found for game {info['game_id']}...")

        events['x_adj'] = np.nan
        events['y_adj'] = np.nan
        events['event_distance'] = np.nan
        events['event_angle'] = np.nan

    #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
    events['period_seconds_elapsed'] = events['period_time_elasped'].apply(convert_to_seconds)
    events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']

    events = events.loc[(events['event_type']!="")]

    #Return: dataframe with parsed game
    return events
```
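`parse_json` (and much of what follows) relies on `convert_to_seconds` from `tools.utils.shared`, which is star-imported at the top of the file and not shown in this diff. A minimal stand-in, assuming `MM:SS` clock strings, behaves like this:

```python
def convert_to_seconds(time_str):
    # Minimal stand-in for tools.utils.shared.convert_to_seconds (not shown in
    # this diff): convert an 'MM:SS' clock reading into total seconds.
    minutes, seconds = str(time_str).split(':')
    return int(minutes) * 60 + int(seconds)

print(convert_to_seconds('17:26'))  # 1046
```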
```python
## HTML PBP FUNCTIONS ##
def strip_html_pbp(td,rosters):
    #Given html row, parse data from HTML pbp
    #Harry Shomer's Code (modified)

    #HTML Parsing
    for y in range(len(td)):
        # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
        if y == 3:
            td[y] = td[y].get_text()  # This gets us elapsed and remaining combined -> 3:0017:00
            index = td[y].find(':')
            td[y] = td[y][:index+3]
        elif (y == 6 or y == 7) and td[0] != '#':
            # 6 & 7 -> These are the on-ice player columns
            # The second statement controls for when it's just a header
            baz = td[y].find_all('td')
            bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]  # Because of previous step we get repeats...delete some

            # The setup in the list is now: Name/Number->Position->Blank...and repeat
            # Now strip all the html
            players = []
            for i in range(len(bar)):
                if i % 3 == 0:
                    try:
                        #Using the supplied json we can bind player name and id to number and team
                        #Find number and team of player then look up the roster dictionary

                        number = bar[i].get_text().strip('\n')  # Get number and strip leading/trailing newlines
                        if y == 6:
                            team = 'away'
                        else:
                            team = 'home'

                        id = rosters[team][str(number)][4]
                        name = rosters[team][str(number)][2]
                        position = rosters[team][str(number)][1]

                    except KeyError:
                        name = ''
                        number = ''
                        id = ''
                elif i % 3 == 1:
                    if name != '':
                        players.append([name, number, position, id])

            td[y] = players
        else:
            td[y] = td[y].get_text()

    return td


def clean_html_pbp(info):
    #Harry Shomer's Code (modified)

    game_id = info['game_id']
    #Retrieve data
    season = info['season']
    doc = f"https://www.nhl.com/scores/htmlreports/{season}/PL{game_id[-6:]}.HTM"
    html = rs.get(doc).content
    soup = get_contents(html)

    #Rosters
    rosters = info['HTML_rosters']

    # Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
    td = [soup[i:i + 8] for i in range(0, len(soup), 8)]

    cleaned_html = [strip_html_pbp(x,rosters) for x in td]

    return cleaned_html

def parse_html(info):
    #Given game info, return HTML event data

    #Retrieve game information and html events
    rosters = info['HTML_rosters']
    events = clean_html_pbp(info)

    teams = {info['away_team_abbr']:['away'],
             info['home_team_abbr']:['home']}

    #Parsing
    event_log = []
    for event in events:
        events_dict = {}
        if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM', 'SPC', 'PBOX', 'EISTR', 'EIEND','EGPID'] or event[3]=='-16:0-':
            continue
        else:
            #Event info
            events_dict['event_num'] = int(event[0])
            events_dict['period'] = int(event[1])
            events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
            events_dict['period_time_elapsed'] = event[3]
            events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
            events_dict['event_type'] = event[4]

            desc = re.sub(u'\xa0'," ",event[5])
            events_dict['description'] = desc

            events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
            zone = [x for x in desc.split(',') if 'Zone' in x]
            if not zone:
                events_dict['zone_code'] = None
            elif zone[0].find("Off") != -1:
                events_dict['zone_code'] = 'O'
            elif zone[0].find("Neu") != -1:
                events_dict['zone_code'] = 'N'
            elif zone[0].find("Def") != -1:
                events_dict['zone_code'] = 'D'

            #Convert team names for compatibility
            replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
            for name, repl in replace:
                desc = desc.replace(repl,name)

            event_team = desc[0:3] if desc[0:3] in teams.keys() else ""
            events_dict['event_team_abbr'] = event_team

            events_dict['away_team_abbr'] = info['away_team_abbr']
            events_dict['home_team_abbr'] = info['home_team_abbr']

            away_skaters = 0
            away_goalie = 0
            #Away on-ice
            for i in range(len(event[6])):
                player = event[6][i][0]
                pos = event[6][i][2]
                id = event[6][i][3]

                if pos == 'G':
                    events_dict['away_goalie'] = player
                    events_dict['away_goalie_id'] = id
                    away_goalie += 1
                else:
                    events_dict[f'away_on_{i+1}'] = player
                    events_dict[f'away_on_{i+1}_id'] = id
                    away_skaters += 1

            home_skaters = 0
            home_goalie = 0
            #Home on-ice
            for i in range(len(event[7])):
                player = event[7][i][0]
                pos = event[7][i][2]
                id = event[7][i][3]

                if pos == 'G':
                    events_dict['home_goalie'] = player
                    events_dict['home_goalie_id'] = id
                    home_goalie += 1
                else:
                    events_dict[f'home_on_{i+1}'] = player
                    events_dict[f'home_on_{i+1}_id'] = id
                    home_skaters += 1

            event_players = []
            #Determine parsing route based on event
            if event[4] in ['FAC','HIT','BLOCK','PENL']:
                #Regex to find team and player number involved (finds all for each event)
                #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
                regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
                fac = regex.findall(desc)
                #Filter incorrectly parsed teams
                repl = []
                for team, num in fac:
                    if team in teams.keys():
                        repl.append((team,num))
                fac = repl

                #Find first event player
                ep1_num = ''
                for i in range(len(fac)):
                    team, num = fac[i]
                    if team == event_team:
                        ep1_num = num
                        event_players.append(fac[i])
                    else:
                        continue

                #Find other players
                for i in range(len(fac)):
                    team, num = fac[i]
                    if num == ep1_num:
                        continue
                    else:
                        event_players.append(fac[i])
            elif event[4]=='GOAL':
                #Parse goal
                regex = re.compile(r'#(\d+)\s+')
                goal = regex.findall(desc)

                #Add all involved players
                for point in goal:
                    #In this loop, point is a player number. We can assign event_team to all players in a goal
                    event_players.append((event_team,str(point)))
            elif event[4]=='DELPEN':
                #Don't parse DELPEN events
                #These events typically have no text, but when they do it is often erroneous or otherwise problematic
                pass
            else:
                #Parse single or no player events
                regex = re.compile(r'#\d+')
                fac = regex.findall(desc)

                for i in range(len(fac)):
                    num = fac[i].replace("#","")
                    event_players.append((event_team,str(num)))

            for i in range(len(event_players)):
                #For each player, evaluate their event data, then retrieve information from rosters
                team, num = event_players[i]

                status = teams[team]
                data = rosters[status[0]]

                #In rare instances the event player is not on the event team (i.e. "WSH TAKEAWAY - #71 CIRELLI, Off. Zone" when #71 CIRELLI is on TBL)
                try:
                    events_dict[f'event_player_{i+1}_name'] = data[str(num)][2]
                    events_dict[f'event_player_{i+1}_id'] = data[str(num)][4]
                    events_dict[f'event_player_{i+1}_pos'] = data[str(num)][1]
                except:
                    pass

            #Event skaters and strength-state information
            events_dict['away_skaters'] = away_skaters
            events_dict['home_skaters'] = home_skaters
            events_dict['away_goalie_in'] = away_goalie
            events_dict['home_goalie_in'] = home_goalie

            event_skaters = away_skaters if info['away_team_abbr'] == event_team else home_skaters
            event_skaters_against = away_skaters if info['home_team_abbr'] == event_team else home_skaters
            events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
            events_dict['event_skaters'] = np.where(event_team == info['home_team_abbr'],home_skaters,away_skaters)

        event_log.append(pd.DataFrame([events_dict]))

    data = pd.concat(event_log)
    data['event_type'] = data['event_type'].replace({
        "PGSTR": "pre-game-start",
        "PGEND": "pre-game-end",
        'GSTR':"game-start",
        "ANTHEM":"anthem",
        "PSTR":"period-start",
        "FAC":"faceoff",
        "SHOT":"shot-on-goal",
        "BLOCK":"blocked-shot",
        "STOP":"stoppage",
        "MISS":"missed-shot",
        "HIT":"hit",
        "GOAL":"goal",
        "GIVE":"giveaway",
        "TAKE":"takeaway",
        "DELPEN":"delayed-penalty",
        "PENL":"penalty",
        "CHL":"challenge",
        "SOC":'shootout-complete',
        "PEND":"period-end",
        "GEND":"game-end"
    })

    #Return: parsed HTML pbp
    return data
```
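The attribution regexes in `parse_html` carry most of the weight. The first (used for FAC, HIT, BLOCK, and PENL events) captures `(team, number)` pairs while tolerating the dotted abbreviations the reports use; the description string below is illustrative:

```python
import re

# Same pattern as the FAC/HIT/BLOCK/PENL branch of parse_html.
regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')

# Illustrative event description, after the L.A -> LAK style replacements.
desc = 'BOS #37 BERGERON vs LAK #11 KOPITAR'
print(regex.findall(desc))  # [('BOS', '37'), ('LAK', '11')]
```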
```python
def assign_target(data):
    #Assign target number to plays to assist with merging

    #New sort
    data = data.sort_values(['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id','event_player_2_id'])

    #Target number distinguishes events that occur in the same second to assist in merging the JSON and HTML
    #Sometimes the target number may not reflect the same order as the event number in either document (especially in earlier seasons where the events are out of order in the HTML or JSON)
    data['target_num'] = np.where(data['event_type'].isin(['penalty','blocked-shot','missed-shot','shot-on-goal','goal']),data['event_type'].isin(['penalty','blocked-shot','missed-shot','shot-on-goal','goal']).cumsum(),0)

    #Revert sort and return dataframe
    return data.reset_index()
```
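`assign_target` is easiest to see on a toy frame: event types that can be matched between the HTML and JSON documents get a running count, and everything else gets 0, so two shots in the same second can still be paired:

```python
import numpy as np
import pandas as pd

# Toy frame mirroring the cumsum trick in assign_target.
toy = pd.DataFrame({'event_type': ['faceoff', 'shot-on-goal', 'hit', 'goal', 'penalty']})
key = toy['event_type'].isin(['penalty', 'blocked-shot', 'missed-shot', 'shot-on-goal', 'goal'])
toy['target_num'] = np.where(key, key.cumsum(), 0)

print(toy['target_num'].tolist())  # [0, 1, 0, 2, 3]
```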
```python
def combine_pbp(info,sources):
    #Given game info, return complete play-by-play data for provided game

    html_pbp = parse_html(info)

    #Route data combining - json if season is after 2009-2010:
    if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
        #ESPN x HTML
        #espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'}).sort_values(['period','seconds_elapsed']).reset_index()
        #merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']

        #Merge pbp
        #df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)
        print('In-repair, please try again later...')

    else:
        #JSON x HTML
        json_pbp = parse_json(info)

        if sources:
            dirs_html = f"sources/{info['season']}/HTML/"
            dirs_json = f"sources/{info['season']}/JSON/"

            if not os.path.exists(dirs_html):
                os.makedirs(dirs_html)
            if not os.path.exists(dirs_json):
                os.makedirs(dirs_json)

            html_pbp.to_csv(f"{dirs_html}{info['game_id']}_HTML.csv",index=False)
            json_pbp.to_csv(f"{dirs_json}{info['game_id']}_JSON.csv",index=False)

        #Assign target numbers
        html_pbp = assign_target(html_pbp)
        json_pbp = assign_target(json_pbp)

        #Merge on index if the df lengths are the same and the events are in the same general order; merge on columns otherwise
        if (len(html_pbp) == len(json_pbp)) and (html_pbp['event_type'].equals(json_pbp['event_type'])) and (html_pbp['seconds_elapsed'].equals(json_pbp['seconds_elapsed'])):
            html_pbp = html_pbp.drop(columns=['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id','event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore').reset_index()
            df = pd.merge(html_pbp,json_pbp,how='left',left_index=True,right_index=True).sort_values(['event_num'])
        else:
            print(f' merging on columns...',end="")
            #Modify merge conditions and merge pbps
            merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id','target_num']
            html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore')

            #While rare, sometimes column 'event_player_1_id' is interpreted differently between the two dataframes.
            html_pbp['event_player_1_id'] = html_pbp['event_player_1_id'].astype(object)
            json_pbp['event_player_1_id'] = json_pbp['event_player_1_id'].astype(object)

            #Merge pbp
            df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col).sort_values(['event_num'])

    #Add game info
    info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
                'away_team_abbr','home_team_abbr']

    for col in info_col:
        df[col] = info[col]

    #Fill period_type column and assign shifts a sub-500 event code
    df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
    try: df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
    except:
        pass
    df = df.sort_values(['period','seconds_elapsed']).reset_index()

    df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))

    #Correct strength state for penalty shots and shootouts - most games don't have shifts in the shootout and are excluded otherwise
    df['strength_state'] = np.where((df['period'].astype(str)=='5')&(df['event_type'].isin(['missed-shot','shot-on-goal','goal']))&(df['season_type']==2),"1v0",df['strength_state'])
    df['strength_state'] = np.where(df['description'].str.contains('Penalty Shot',case=False),"1v0",df['strength_state'])

    col = [col for col in get_col() if col in df.columns.to_list()]
    #Return: complete play-by-play information for provided game
    return df[col]
```
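`combine_pbp` only merges on the index when the two documents agree exactly on length and event ordering; `Series.equals` is what makes that gate cheap. On toy stand-ins:

```python
import pandas as pd

# Toy stand-ins for the HTML and JSON event streams.
html_ev = pd.Series(['faceoff', 'shot-on-goal', 'goal'])
json_ev = pd.Series(['faceoff', 'shot-on-goal', 'goal'])

# Mirrors the gate in combine_pbp: same length and same order -> merge on index,
# otherwise fall back to merging on columns plus target_num.
print(len(html_ev) == len(json_ev) and html_ev.equals(json_ev))  # True
```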
```python
## SHIFT SCRAPING FUNCTIONS ##
def parse_shifts_json(info):
    #Given game info, return json shift chart

    log = info['json_shifts']
    #Filter non-shift events and duplicate events
    log = log.loc[log['detailCode']==0].drop_duplicates(subset=['playerId','shiftNumber'])

    #Add full name columns
    log['player_name'] = (log['firstName'] + " " + log['lastName']).str.upper()

    log = log.rename(columns={
        'playerId':'player_id',
        'teamAbbrev':'event_team_abbr',
        'startTime':'start',
        'endTime':'end'
    })

    #Convert time columns
    log['start'] = log['start'].astype(str).apply(convert_to_seconds)
    log['end'] = log['end'].astype(str).apply(convert_to_seconds)
    log = log[['player_name','player_id',
               'period','event_team_abbr',
               'start','duration','end']]

    #Recalibrate duration
    log['duration'] = log['end'] - log['start']

    #Return: JSON shifts (separated by team)
    away = log.loc[log['event_team_abbr']==info['away_team_abbr']]
    home = log.loc[log['event_team_abbr']==info['home_team_abbr']]

    return {'away':away,
            'home':home}

def analyze_shifts(shift, id, name, pos, team):
    #Collects teams in given shifts html (parsed by Beautiful Soup)
    #Modified version of Harry Shomer's analyze_shifts function in the hockey_scraper package
    shifts = dict()

    shifts['player_name'] = name.upper()
    shifts['player_id'] = id
    shifts['player_pos'] = pos
    shifts['period'] = '4' if shift[1] == 'OT' else '5' if shift[1] == 'SO' else shift[1]
    shifts['event_team_abbr'] = get_team(team.strip(' '))
    shifts['start'] = convert_to_seconds(shift[2].split('/')[0])
    shifts['duration'] = convert_to_seconds(shift[4].split('/')[0])

    #Sometimes there are no digits
    if re.compile(r'\d+').findall(shift[3].split('/')[0]):
        shifts['end'] = convert_to_seconds(shift[3].split('/')[0])
    else:
        shifts['end'] = shifts['start'] + shifts['duration']
    return shifts

def parse_shifts_html(info,home):
    #Parsing of shifts data for a single team in a provided game
    #Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package

    #Roster info prep
    roster = info['HTML_rosters']

    rosters = roster['home' if home else 'away']

    all_shifts = []
    #columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']

    #Retrieve HTML
    game_id = info['game_id']
    season = info['season']
    link = f"https://www.nhl.com/scores/htmlreports/{season}/T{'H' if home else 'V'}{game_id[-6:]}.HTM"
    doc = rs.get(link).content
    td, teams = get_soup(doc)

    team = teams[0]
    players = dict()

    # Iterates through each player shifts table with the following data:
    # Shift #, Period, Start, End, and Duration.
    for t in td:
        t = t.get_text()
        if ',' in t:  # If a comma exists it is a player
            name = t

            name = name.split(',')
            number = int(name[0][:2].strip())
            id = rosters[str(number)][4]
            players[id] = dict()

            #HTML shift functions assess one team at a time, which simplifies the lookup process from number to name and id

            players[id]['name'] = rosters[str(number)][2]
            players[id]['pos'] = rosters[str(number)][1]

            players[id]['shifts'] = []
        else:
            players[id]['shifts'].extend([t])

    for key in players.keys():
        # Create lists of shifts-table columns for analysis
        players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]

        name = players[key]['name']
        pos = players[key]['pos']

        # Parsing
        shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
        all_shifts.extend(shifts)

    df = pd.DataFrame(all_shifts)

    shifts_raw = df[df['duration'] > 0]

    #Return: single-team individual shifts by player
    return shifts_raw
```
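Both HTML parsers rebuild table rows from a flat list of cells with the same slicing idiom: `soup[i:i + 8]` for the eight-column play-by-play report, and the five-cell version used on the shift tables above. On a flat list of strings it behaves like this (the cell values are illustrative):

```python
# Illustrative flat list of shift-table cells: Shift #, Period, Start, End, Duration.
cells = ['1', '1', '0:45 / 19:15', '1:20 / 18:40', '0:35',
         '2', '1', '3:05 / 16:55', '3:50 / 16:10', '0:45']

rows = [cells[i:i + 5] for i in range(0, len(cells), 5)]
print(len(rows), rows[1][4])  # 2 0:45
```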
```python
def parse_shift_events(info,home):
    #Given game info and home team conditional, parse and convert document to shift events congruent to html play-by-play

    #Determine whether to use JSON shifts or HTML shifts
    if len(info['json_shifts']) == 0:
        shift = parse_shifts_html(info,home)
    else:
        shift = parse_shifts_json(info)['home' if home else 'away']

    rosters = info['rosters']

    # Identify shift starts for each shift event
    shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
        num_on=('player_name', 'size'),
        players_on=('player_name', lambda x: ', '.join(x)),
        ids_on=('player_id', lambda x: ', '.join(map(str,x))),
    ).reset_index()

    shifts_on = shifts_on.rename(columns={
        'start':"seconds_elapsed"
    })

    # Identify shift stops for each shift event
    shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
        num_off=('player_name', 'size'),
        players_off=('player_name', lambda x: ', '.join(x)),
        ids_off=('player_id', lambda x: ', '.join(map(str,x))),
    ).reset_index()

    shifts_off = shifts_off.rename(columns={
        'end':"seconds_elapsed"
    })

    # Merge and sort by time in game
    shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')

    shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
    shifts['event_type'] = 'change'

    #Shift events similar to html (remove shootout shifts)
    shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])

    #Generate on-ice columns
    skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
    goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
    team = list(shift['event_team_abbr'])[0]

    skaters = pd.DataFrame()
    goalies = pd.DataFrame()
    for player in skater_names:
        #For each player in the game, determine when they began and ended shifts.
        #With player names as columns, 1 represents a shift event a player was on the ice for while 0 represents off the ice
        on_ice = (np.cumsum(
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x)))) -
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x))))
        ))
        skaters[player] = on_ice

    skaters = skaters.fillna(0).astype(int)

    on_skaters = (skaters == 1).stack().reset_index()
    on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()

    max_players = 6
    for i in range(max_players):
        on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")

    on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

    #Repeat this process with goaltenders
    for player in goalie_names:
        on_ice = (np.cumsum(
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x)))) -
            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x))))
        ))
        goalies[player] = on_ice

    goalies = goalies.fillna(0).astype(int)

    on_goalies = (goalies == 1).stack().reset_index()
    on_goalies = on_goalies[on_goalies[0]].groupby("level_0")["level_1"].apply(list).reset_index()

    max_players = 1
    for i in range(max_players):
        on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")

    on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

    #Combine on-ice skaters and goaltenders for each shift event
    on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])

    shifts['row'] = shifts.index

    if home:
        shifts['home_team_abbr'] = team
    else:
        shifts['away_team_abbr'] = team
    #Return: shift events with newly added on-ice columns. NaN values are replaced with empty strings to create proper on-ice columns for the json pbp
    return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")
```
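The on/off bookkeeping in `parse_shift_events` reduces to a running sum: each shift event contributes +1 when a player's id appears in `ids_on` and -1 when it appears in `ids_off`, so the cumulative total is 1 exactly while that player is on the ice. The same idea on a toy series:

```python
import numpy as np
import pandas as pd

# One player's id across five shift events: on, (no change), off, on, off.
player = '8478402'  # illustrative NHL player id
ids_on  = pd.Series([player, '', '', player, ''])
ids_off = pd.Series(['', '', player, '', player])

on_ice = np.cumsum(ids_on.str.contains(player).astype(int)
                   - ids_off.str.contains(player).astype(int))
print(on_ice.tolist())  # [1, 1, 0, 1, 0]
```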
```python
## FINALIZE PBP FUNCTIONS ##
def combine_shifts(info,sources):
    #Given game info, return complete shift events

    #JSON Prep
    roster = info['rosters']

    #Quickly combine shifts data
    away = parse_shift_events(info,False)
    home = parse_shift_events(info,True)

    #Combine shifts
    data = pd.concat([away,home]).sort_values(['period','seconds_elapsed'])

    #Add game info
    info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
                'away_team_abbr','home_team_abbr']

    for col in info_col:
        data[col] = info[col]

    #Create player information dicts to create on-ice names
    roster['playerId'] = roster['playerId'].astype(str)
    players = roster.set_index("playerId")['full_name'].to_dict()

    for i in range(0,7):
        if i == 6:
            data['away_goalie'] = data['away_goalie_id'].replace(players)
            data['home_goalie'] = data['home_goalie_id'].replace(players)
        else:
            data[f'away_on_{i+1}'] = data[f'away_on_{i+1}_id'].replace(players)
            data[f'home_on_{i+1}'] = data[f'home_on_{i+1}_id'].replace(players)

    data = data.sort_values(['period','seconds_elapsed'])
    #Fill on-ice columns down
    on_ice_col = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
                  'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
                  'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
                  'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',
                  'away_goalie','home_goalie','away_goalie_id','home_goalie_id']

    for col in on_ice_col:
        data[col] = data[col].ffill()

    #Create strength state information
    away_on = ['away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',]
    home_on = ['home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',]
    data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    data['strength_state'] = np.where(data['event_team_abbr']==data['away_team_abbr'],data['away_skaters'].astype(str)+"v"+data['home_skaters'].astype(str),data['home_skaters'].astype(str)+"v"+data['away_skaters'].astype(str))

    #Create final shifts df
    col = [col for col in get_col() if col in data.columns.to_list()]
    full_shifts = data[col]

    #Export sources if true
    if sources:
        dirs = f"sources/{info['season']}/SHIFTS/"

        if not os.path.exists(dirs):
            os.makedirs(dirs)

        full_shifts.to_csv(f"{dirs}{info['game_id']}_SHIFTS.csv",index=False)

    #Return: full shifts data converted to play-by-play format
    return full_shifts

def combine_data(info,sources):
    #Given game info, return complete play-by-play data

    game_id = info['game_id']

    pbp = combine_pbp(info,sources)
    shifts = combine_shifts(info,sources)

    #Combine data
    df = pd.concat([pbp,shifts])

    df['event_num'] = df['event_num'].replace(np.nan,0)

    #Create priority columns designed to order events that occur at the same time in a game
    even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
    df['priority'] = np.where(df['event_type'].isin(even_pri),1,
                     np.where(df['event_type']=='goal',2,
                     np.where(df['event_type']=='stoppage',3,
                     np.where(df['event_type']=='delayed-penalty',4,
                     np.where(df['event_type']=='penalty',5,
                     np.where(df['event_type']=='period-end',6,
                     np.where(df['event_type']=='change',7,
                     np.where(df['event_type']=='game-end',8,
                     np.where(df['event_type']=='period-start',9,
                     np.where(df['event_type']=='faceoff',10,0))))))))))

    df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
    df = df.sort_values(['period','seconds_elapsed','event_num','priority'])

    #Recalibrate event_num column to accurately depict the order of all events, including changes
    df.reset_index(inplace=True,drop=True)
    df['event_num'] = df.index+1
    df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
    df['event_type_last'] = df['event_type'].shift(1)
    df['event_type_last_2'] = df['event_type_last'].shift(1)
    df['event_type_next'] = df['event_type'].shift(-1)
    lag_events = ['stoppage','goal','period-end']
    lead_events = ['faceoff','period-end']
    period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
    #Define shifts as "line-change" or "on-the-fly"
    df['shift_type'] = np.where(df['event_type']=='change',np.where(np.logical_or(np.logical_or(np.logical_or(df['event_type_last'].isin(lag_events),df['event_type_last_2'].isin(lag_events)),df['event_type_next'].isin(lead_events)),df['seconds_elapsed'].isin(period_end_secs)),"line-change","on-the-fly"),"")
    df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
    try:
        df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
    except:
        pass

    #Add time since last event and overall event length
    df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
    df['event_length'] = df['seconds_since_last'].shift(-1)

    #Add fixed strength state column
    df['strength_state_venue'] = df['away_skaters'].astype(str)+'v'+df['home_skaters'].astype(str)

    #Retrieve coaches
    coaches = info['coaches']
    if not coaches:
        df['away_coach'] = ""
        df['home_coach'] = ""
        df['event_coach'] = ""
    else:
        df['away_coach'] = coaches['away']
        df['home_coach'] = coaches['home']
        df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))

    #Assign score, corsi, fenwick, and penalties for each event
    for venue in ['away','home']:
        df[f'{venue}_score'] = ((df['event_team_venue']==venue)&(df['event_type']=='goal')).cumsum()
        df[f'{venue}_corsi'] = ((df['event_team_venue']==venue)&(df['event_type'].isin(['blocked-shot','missed-shot','shot-on-goal','goal']))).cumsum()
        df[f'{venue}_fenwick'] = ((df['event_team_venue']==venue)&(df['event_type'].isin(['missed-shot','shot-on-goal','goal']))).cumsum()
        df[f'{venue}_penalties'] = ((df['event_team_venue']==venue)&(df['event_type']=='penalty')).cumsum()

    #Add time adjustments
    df['period_time'] = np.trunc((df['seconds_elapsed']-((df['period']-1)*1200))/60).astype(str).str.replace('.0','')+":"+(df['seconds_elapsed'] % 60).astype(str).str.pad(2,'left','0')
    df['game_time'] = np.trunc(df['seconds_elapsed']/60).astype(str).str.replace('.0','')+":"+(df['seconds_elapsed'] % 60).astype(str).str.pad(2,'left','0')

    #Forward fill as necessary
    cols = ['period_type','home_team_defending_side','away_coach','home_coach']
    for col in cols:
        try: df[col]
        except: df[col] = ""
        df[col] = df[col].ffill()

    #Return: complete play-by-play with all important data for each event in a provided game
    return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)
```
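Finally, the nested `np.where` ladder in `combine_data` is just a priority lookup; the effect is easier to read as a dict, which gives identical output on the event types it names (a sketch, not the shipped implementation):

```python
import pandas as pd

# Equivalent lookup for the priority ladder in combine_data (sketch).
priority = {'takeaway': 1, 'giveaway': 1, 'missed-shot': 1, 'hit': 1,
            'shot-on-goal': 1, 'blocked-shot': 1, 'goal': 2, 'stoppage': 3,
            'delayed-penalty': 4, 'penalty': 5, 'period-end': 6, 'change': 7,
            'game-end': 8, 'period-start': 9, 'faceoff': 10}

events = pd.Series(['faceoff', 'shot-on-goal', 'change', 'goal'])
print(events.map(priority).fillna(0).astype(int).tolist())  # [10, 1, 7, 2]
```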