wsba-hockey 0.1.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +836 -369
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +275 -47
- wsba_hockey/wsba_main.py +699 -132
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/METADATA +42 -11
- wsba_hockey-1.0.0.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.2.dist-info/RECORD +0 -9
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/top_level.txt +0 -0
wsba_hockey/tools/scraping.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
import re
|
2
2
|
from bs4 import BeautifulSoup
|
3
|
-
import
|
3
|
+
import requests as rs
|
4
|
+
import json as json_lib
|
5
|
+
from .utils.shared import *
|
4
6
|
import numpy as np
|
5
7
|
import pandas as pd
|
6
8
|
import warnings
|
@@ -9,97 +11,156 @@ warnings.filterwarnings('ignore')
|
|
9
11
|
### SCRAPING FUNCTIONS ###
|
10
12
|
# Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #
|
11
13
|
|
12
|
-
##
|
13
|
-
|
14
|
-
|
14
|
+
## ORDER OF OPERATIONS ##
|
15
|
+
# Create game information to use with all functions
|
16
|
+
# Retrieve JSON data
|
17
|
+
# Parse JSON data
|
18
|
+
# Retrieve and clean HTML pbp with player information
|
19
|
+
# Parse HTML pbp, return parsed HTML
|
20
|
+
# Combine pbp data
|
21
|
+
# Retrieve and analyze HTML shifts with player information for home and away teams
|
22
|
+
# Parse shift events
|
23
|
+
# Combine all data, return complete play-by-play
|
24
|
+
|
25
|
+
## UTILITY FUNCTIONS ##
|
26
|
+
def get_col():
|
27
|
+
return [
|
28
|
+
'season','season_type','game_id','game_date',"start_time","venue","venue_location",
|
29
|
+
'away_team_abbr','home_team_abbr','event_num','period','period_type',
|
30
|
+
'seconds_elapsed',"situation_code","strength_state","strength_state_venue","home_team_defending_side",
|
31
|
+
"event_type_code","event_type","description","penalty_duration",
|
32
|
+
"event_team_abbr","event_team_venue",
|
33
|
+
'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
|
34
|
+
"event_player_1_name","event_player_2_name","event_player_3_name",
|
35
|
+
"event_player_1_id","event_player_2_id","event_player_3_id",
|
36
|
+
"event_player_1_pos","event_player_2_pos","event_player_3_pos",
|
37
|
+
"event_goalie_name","event_goalie_id",
|
38
|
+
"shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
|
39
|
+
"event_skaters","away_skaters","home_skaters",
|
40
|
+
"event_distance","event_angle","event_length","seconds_since_last",
|
41
|
+
"away_score","home_score", "away_fenwick", "home_fenwick","away_sog","home_sog",
|
42
|
+
"away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
|
43
|
+
"home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
|
44
|
+
"away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
|
45
|
+
"home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
|
46
|
+
"event_coach","away_coach","home_coach"
|
47
|
+
]
|
48
|
+
|
49
|
+
|
50
|
+
## JSON FUNCTIONS ##
|
51
|
+
def get_game_roster(json):
|
52
|
+
#Given raw json data, return game rosters
|
15
53
|
roster = pd.json_normalize(json['rosterSpots'])
|
16
|
-
|
17
|
-
home = info['homeTeam.id'][0]
|
18
|
-
away = info['awayTeam.id'][0]
|
19
|
-
|
20
|
-
#Add up to four alternative names for each player in the game
|
21
|
-
roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
|
22
|
-
try: roster['playerName_2'] = roster['firstName.cs']+" "+roster['lastName.default']
|
23
|
-
except: roster['playerName_2'] = ""
|
24
|
-
try: roster['playerName_3'] = roster['firstName.de']+" "+roster['lastName.default']
|
25
|
-
except: roster['playerName_3'] = ""
|
26
|
-
try: roster['playerName_4'] = roster['firstName.es']+" "+roster['lastName.default']
|
27
|
-
except: roster['playerName_4'] = ""
|
28
|
-
|
29
|
-
#For each home/away player their name is included as a key and their id or position is the value
|
30
|
-
home_players = {}
|
31
|
-
home_id = roster.loc[roster['teamId']==home]
|
32
|
-
hid = list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])
|
33
|
-
hpos = list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])
|
34
|
-
hp = list(home_id['playerName'])+list(home_id['playerName_2'])+list(home_id['playerName_3'])+list(home_id['playerName_4'])
|
35
|
-
|
36
|
-
for id, pos, player in zip(hid,hpos,hp):
|
37
|
-
try: home_players.update({player.upper():
|
38
|
-
{result:id if result == 'id' else pos}})
|
39
|
-
except:
|
40
|
-
continue
|
54
|
+
roster['full_name'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()
|
41
55
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
for id, pos, player in zip(aid,apos,ap):
|
49
|
-
try: away_players.update({player.upper():
|
50
|
-
{result:id if result == 'id' else pos}})
|
51
|
-
except:
|
52
|
-
continue
|
56
|
+
#Return: roster information
|
57
|
+
return roster
|
58
|
+
|
59
|
+
def get_game_coaches(game_id):
|
60
|
+
#Given game_id, return head coaches for away and home team
|
53
61
|
|
54
|
-
#
|
55
|
-
|
56
|
-
|
57
|
-
'away':away_players
|
58
|
-
}
|
62
|
+
#Retrieve data
|
63
|
+
json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
|
64
|
+
data = json['gameInfo']
|
59
65
|
|
60
|
-
|
61
|
-
|
66
|
+
#Add coaches
|
67
|
+
try:
|
68
|
+
away = data['awayTeam']['headCoach']['default'].upper()
|
69
|
+
home = data['homeTeam']['headCoach']['default'].upper()
|
70
|
+
|
71
|
+
coaches = {'away':away,
|
72
|
+
'home':home}
|
73
|
+
except KeyError:
|
74
|
+
return {}
|
62
75
|
|
63
|
-
|
64
|
-
|
65
|
-
|
76
|
+
#Return: dict with coaches
|
77
|
+
return coaches
|
78
|
+
|
79
|
+
def get_game_info(game_id):
|
80
|
+
#Given game_id, return game information
|
81
|
+
|
82
|
+
#Retrieve data
|
83
|
+
api = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
|
84
|
+
json = rs.get(api).json()
|
85
|
+
|
86
|
+
#Games don't always have JSON shifts, for whatever reason
|
87
|
+
shifts = f"https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={game_id}"
|
88
|
+
shifts = rs.get(shifts).json()
|
89
|
+
json_shifts = pd.json_normalize(shifts['data'])
|
90
|
+
|
91
|
+
if shifts['total'] == 0:
|
92
|
+
json_shifts = pd.DataFrame()
|
93
|
+
|
94
|
+
#Split information
|
95
|
+
base = pd.json_normalize(json)
|
96
|
+
game_id = base['id'][0]
|
97
|
+
season = base['season'][0]
|
98
|
+
season_type = base['gameType'][0]
|
99
|
+
game_date = base['gameDate'][0]
|
100
|
+
game_state = base['gameState'][0]
|
101
|
+
start_time = base['startTimeUTC'][0]
|
102
|
+
venue = base['venue.default'][0]
|
103
|
+
venue_location = base['venueLocation.default'][0]
|
104
|
+
away_team_id = base['awayTeam.id'][0]
|
105
|
+
away_team_abbr = base['awayTeam.abbrev'][0]
|
106
|
+
home_team_id = base['homeTeam.id'][0]
|
107
|
+
home_team_abbr = base['homeTeam.abbrev'][0]
|
108
|
+
|
109
|
+
#Add roster
|
110
|
+
roster = get_game_roster(json)
|
111
|
+
#In the HTML parsing process, players are identified by a regex pattern (ABB #00 such as BOS #37) or by number and name in the following format: #00 NAME (e.g. #37 BERGERON), so these are added as IDs of sorts.
|
112
|
+
roster['descID'] = '#'+roster['sweaterNumber'].astype(str)+" "+roster['lastName.default'].str.upper()
|
113
|
+
roster['team_abbr'] = roster['teamId'].replace({
|
114
|
+
away_team_id:[away_team_abbr],
|
115
|
+
home_team_id:[home_team_abbr]
|
116
|
+
})
|
117
|
+
roster['key'] = roster['team_abbr'] + " #" + roster['sweaterNumber'].astype(str)
|
118
|
+
|
119
|
+
#Create an additional roster dictionary for use with HTML parsing
|
120
|
+
#Roster dict
|
121
|
+
roster_dict = {'away':{},
|
122
|
+
'home':{}}
|
123
|
+
|
124
|
+
#Evaluate and add players by team
|
125
|
+
for team in ['away','home']:
|
126
|
+
abbr = (away_team_abbr if team == 'away' else home_team_abbr)
|
127
|
+
rost = roster.loc[roster['team_abbr']==abbr]
|
128
|
+
|
129
|
+
#Now iterate through team players
|
130
|
+
for player,id,num,pos,team_abbr,key in zip(rost['full_name'],rost['playerId'],rost['sweaterNumber'],rost['positionCode'],rost['team_abbr'],rost['key']):
|
131
|
+
roster_dict[team].update({str(num):[key, pos, player, team_abbr, id]})
|
132
|
+
|
133
|
+
#Return: game information
|
134
|
+
return {"game_id":str(game_id),
|
135
|
+
"season":season,
|
136
|
+
"season_type":season_type,
|
137
|
+
"game_date":game_date,
|
138
|
+
"game_state":game_state,
|
139
|
+
"start_time":start_time,
|
140
|
+
'venue':venue,
|
141
|
+
'venue_location':venue_location,
|
142
|
+
'away_team_id':away_team_id,
|
143
|
+
'away_team_abbr':away_team_abbr,
|
144
|
+
'home_team_id':home_team_id,
|
145
|
+
'home_team_abbr':home_team_abbr,
|
146
|
+
'events':pd.json_normalize(json['plays']).reset_index(drop=True),
|
147
|
+
'rosters':roster,
|
148
|
+
'HTML_rosters':roster_dict,
|
149
|
+
'coaches':get_game_coaches(game_id),
|
150
|
+
'json_shifts':json_shifts}
|
151
|
+
|
152
|
+
def parse_json(info):
|
153
|
+
#Given game info, return JSON document
|
154
|
+
|
155
|
+
#Retrieve data
|
156
|
+
events = info['events']
|
66
157
|
|
67
158
|
#Return error if game is set in the future
|
68
|
-
if info['
|
159
|
+
if info['game_state'] == 'FUT':
|
69
160
|
raise ValueError(f"Game {info['id'][0]} has not occured yet.")
|
70
|
-
|
71
|
-
#Game information
|
72
|
-
events['game_id'] = info['id'][0]
|
73
|
-
events['season'] = info['season'][0]
|
74
|
-
events['season_type'] = info['gameType'][0]
|
75
|
-
events['game_date'] = info['gameDate'][0]
|
76
|
-
events['start_time'] = info['startTimeUTC'][0]
|
77
|
-
events['venue'] = info['venue.default'][0]
|
78
|
-
events['venue_location'] = info['venueLocation.default'][0]
|
79
|
-
events['away_team_id'] = info['awayTeam.id'][0]
|
80
|
-
events['away_team_abbr'] = info['awayTeam.abbrev'][0]
|
81
|
-
events['home_team_id'] = info['homeTeam.id'][0]
|
82
|
-
events['home_team_abbr'] = info['homeTeam.abbrev'][0]
|
83
|
-
|
84
|
-
teams = {
|
85
|
-
info['awayTeam.id'][0]:info['awayTeam.abbrev'][0],
|
86
|
-
info['homeTeam.id'][0]:info['homeTeam.abbrev'][0]
|
87
|
-
}
|
88
|
-
|
89
|
-
#Create player information dicts used to create event_player columns
|
90
|
-
roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
|
91
|
-
players = {}
|
92
|
-
players_pos = {}
|
93
|
-
ids = {}
|
94
|
-
for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
|
95
|
-
players.update({id:player.upper()})
|
96
|
-
for id, pos in zip(list(roster['playerId']),list(roster['positionCode'])):
|
97
|
-
players_pos.update({id:pos.upper()})
|
98
|
-
for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
|
99
|
-
ids.update({player.upper():id})
|
100
|
-
|
161
|
+
|
101
162
|
#Test columns
|
102
|
-
cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date'
|
163
|
+
cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']
|
103
164
|
|
104
165
|
for col in cols:
|
105
166
|
try:events[col]
|
@@ -120,7 +181,7 @@ def parse_json(json):
|
|
120
181
|
|
121
182
|
events['event_player_3_id'] = events['details.assist2PlayerId']
|
122
183
|
|
123
|
-
events['
|
184
|
+
events['event_team_venue'] = np.where(events['details.eventOwnerTeamId']==info['home_team_id'],"home","away")
|
124
185
|
|
125
186
|
#Coordinate adjustments:
|
126
187
|
#The WSBA NHL Scraper includes three sets of coordinates per event:
|
@@ -132,8 +193,8 @@ def parse_json(json):
|
|
132
193
|
try:
|
133
194
|
events['x_fixed'] = abs(events['details.xCoord'])
|
134
195
|
events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
|
135
|
-
events['x_adj'] = np.where(events['
|
136
|
-
events['y_adj'] = np.where(events['
|
196
|
+
events['x_adj'] = np.where(events['event_team_venue']=="home",events['x_fixed'],-events['x_fixed'])
|
197
|
+
events['y_adj'] = np.where(events['event_team_venue']=="home",events['y_fixed'],-events['y_fixed'])
|
137
198
|
events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
|
138
199
|
events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
|
139
200
|
except TypeError:
|
@@ -147,32 +208,11 @@ def parse_json(json):
|
|
147
208
|
events['event_angle'] = np.nan
|
148
209
|
|
149
210
|
|
150
|
-
events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
events['event_player_2_name'] = events['event_player_2_id'].replace(players)
|
155
|
-
events['event_player_3_name'] = events['event_player_3_id'].replace(players)
|
156
|
-
|
157
|
-
events['event_player_1_pos'] = events['event_player_1_id'].replace(players_pos)
|
158
|
-
events['event_player_2_pos'] = events['event_player_2_id'].replace(players_pos)
|
159
|
-
events['event_player_3_pos'] = events['event_player_3_id'].replace(players_pos)
|
160
|
-
|
161
|
-
events['event_goalie_name'] = events['details.goalieInNetId'].replace(players)
|
162
|
-
|
163
|
-
#Create situations given situation code (this is reconfigured with on ice skaters when provided shifts data)
|
164
|
-
events['away_skaters'] = events['situationCode'].astype(str).str.slice(start=1,stop=2)
|
165
|
-
events['home_skaters'] = events['situationCode'].astype(str).str.slice(start=2,stop=3)
|
166
|
-
events['event_skaters'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['home_skaters'],events['away_skaters'])
|
167
|
-
events['event_skaters_against'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['away_skaters'],events['home_skaters'])
|
211
|
+
events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace({
|
212
|
+
info['away_team_id']:[info['away_team_abbr']],
|
213
|
+
info['home_team_id']:[info['home_team_abbr']]
|
214
|
+
})
|
168
215
|
|
169
|
-
events['strength_state'] = events['event_skaters']+"v"+events['event_skaters_against']
|
170
|
-
events['strength'] = np.where(events['event_skaters']==events['event_skaters_against'],
|
171
|
-
"EV",np.where(
|
172
|
-
events['event_skaters']>events['event_skaters_against'],
|
173
|
-
"PP","SH"
|
174
|
-
))
|
175
|
-
|
176
216
|
#Rename columns to follow WSBA naming conventions
|
177
217
|
events = events.rename(columns={
|
178
218
|
"eventId":"event_id",
|
@@ -197,14 +237,12 @@ def parse_json(json):
|
|
197
237
|
})
|
198
238
|
|
199
239
|
#Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
|
200
|
-
events['
|
201
|
-
events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
|
202
|
-
((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
|
203
|
-
((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
|
204
|
-
events['period_seconds_remaining'] = 1200-events['period_seconds_elapsed']
|
240
|
+
events['period_seconds_elapsed'] = events['period_time_elasped'].apply(convert_to_seconds)
|
205
241
|
events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']
|
242
|
+
|
243
|
+
events = events.loc[(events['event_type']!="")]
|
206
244
|
|
207
|
-
#
|
245
|
+
#Assign score and fenwick for each event
|
208
246
|
fenwick_events = ['missed-shot','shot-on-goal','goal']
|
209
247
|
ag = 0
|
210
248
|
ags = []
|
@@ -215,16 +253,16 @@ def parse_json(json):
|
|
215
253
|
afs = []
|
216
254
|
hf = 0
|
217
255
|
hfs = []
|
218
|
-
for event,team in zip(list(events['event_type']),list(events['
|
256
|
+
for event,team in zip(list(events['event_type']),list(events['event_team_venue'])):
|
219
257
|
if event in fenwick_events:
|
220
258
|
if team == "home":
|
221
|
-
hf
|
259
|
+
hf += 1
|
222
260
|
if event == 'goal':
|
223
|
-
hg
|
261
|
+
hg += 1
|
224
262
|
else:
|
225
|
-
af
|
263
|
+
af += 1
|
226
264
|
if event == 'goal':
|
227
|
-
ag
|
265
|
+
ag += 1
|
228
266
|
|
229
267
|
ags.append(ag)
|
230
268
|
hgs.append(hg)
|
@@ -235,84 +273,561 @@ def parse_json(json):
|
|
235
273
|
events['home_score'] = hgs
|
236
274
|
events['away_fenwick'] = afs
|
237
275
|
events['home_fenwick'] = hfs
|
238
|
-
|
239
|
-
events = events.loc[(events['event_type']!="")&(events['event_type']!="game-end")]
|
240
276
|
|
241
|
-
#Return: dataframe with parsed
|
277
|
+
#Return: dataframe with parsed game
|
242
278
|
return events
|
243
279
|
|
280
|
+
### ESPN SCRAPING FUNCTIONS ###
|
281
|
+
def espn_game_id(date,away,home):
|
282
|
+
#Given a date formatted as YYYY-MM-DD and teams, return game id from ESPN schedule
|
283
|
+
date = date.replace("-","")
|
244
284
|
|
285
|
+
#Retrieve data
|
286
|
+
api = f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date}"
|
287
|
+
schedule = pd.json_normalize(rs.get(api).json()['events'])
|
245
288
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
289
|
+
#Create team abbreviation columns
|
290
|
+
schedule['away_team_abbr'] = schedule['shortName'].str[:3].str.strip(" ")
|
291
|
+
schedule['home_team_abbr'] = schedule['shortName'].str[-3:].str.strip(" ")
|
292
|
+
|
293
|
+
#Modify team abbreviations as necessary
|
294
|
+
schedule = schedule.replace({
|
295
|
+
"LA":"LAK",
|
296
|
+
"NJ":"NJD",
|
297
|
+
"SJ":"SJS",
|
298
|
+
"TB":"TBL",
|
299
|
+
})
|
251
300
|
|
252
|
-
|
253
|
-
|
254
|
-
|
301
|
+
#Retrieve game id
|
302
|
+
game_id = schedule.loc[(schedule['away_team_abbr']==away)&
|
303
|
+
(schedule['home_team_abbr']==home),'id'].tolist()[0]
|
255
304
|
|
256
|
-
|
257
|
-
|
258
|
-
return td, get_teams(soup)
|
305
|
+
#Return: ESPN game id
|
306
|
+
return game_id
|
259
307
|
|
308
|
+
def parse_espn(date,away,home):
|
309
|
+
#Given a date formatted as YYYY-MM-DD and teams, return game events
|
310
|
+
game_id = espn_game_id(date,away,home)
|
311
|
+
url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'
|
312
|
+
|
313
|
+
#Code modified from Patrick Bacon
|
260
314
|
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
315
|
+
#Retrieve game events as json
|
316
|
+
page = rs.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
|
317
|
+
soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
|
318
|
+
json = json_lib.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])
|
319
|
+
|
320
|
+
#DataFrame of time-related info for events
|
321
|
+
clock_df = pd.DataFrame()
|
322
|
+
|
323
|
+
for period in range(0, len(json)):
|
324
|
+
clock_df = clock_df._append(pd.DataFrame(json[period]))
|
325
|
+
|
326
|
+
clock_df = clock_df[~pd.isna(clock_df.clock)]
|
327
|
+
|
328
|
+
# Needed to add .split(',"st":3')[0] for playoffs
|
329
|
+
|
330
|
+
#DataFrame of coordinates for events
|
331
|
+
coords_df = pd.DataFrame(json_lib.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))
|
332
|
+
|
333
|
+
clock_df = clock_df.assign(
|
334
|
+
clock = clock_df.clock.apply(lambda x: x['displayValue'])
|
335
|
+
)
|
336
|
+
|
337
|
+
coords_df = coords_df.assign(
|
338
|
+
coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
|
339
|
+
coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
|
340
|
+
event_player_1_name = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
|
341
|
+
)
|
342
|
+
|
343
|
+
#Combine
|
344
|
+
espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])
|
345
|
+
|
346
|
+
espn_events = espn_events.assign(
|
347
|
+
period = espn_events['period'].apply(lambda x: x['number']),
|
348
|
+
minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
|
349
|
+
seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
|
350
|
+
event_type = espn_events['type'].apply(lambda x: x['txt'])
|
351
|
+
)
|
352
|
+
|
353
|
+
espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
|
354
|
+
(espn_events.event_type=='Face Off'), 0, espn_events.coords_x
|
355
|
+
),
|
356
|
+
coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
|
357
|
+
(espn_events.event_type=='Face Off'), 0, espn_events.coords_y))
|
358
|
+
|
359
|
+
espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1_name))]
|
360
|
+
|
361
|
+
espn_events = espn_events.assign(
|
362
|
+
coords_x = espn_events.coords_x.astype(int),
|
363
|
+
coords_y = espn_events.coords_y.astype(int)
|
364
|
+
)
|
365
|
+
|
366
|
+
#Rename events
|
367
|
+
#The turnover event includes just one player in the event information, meaning takeaways will have no coordinates for play-by-plays created by ESPN scraping
|
368
|
+
espn_events['event_type'] = espn_events['event_type'].replace({
|
369
|
+
"Face Off":'faceoff',
|
370
|
+
"Hit":'hit',
|
371
|
+
"Shot":'shot-on-goal',
|
372
|
+
"Missed":'missed-shot',
|
373
|
+
"Blocked":'blocked-shot',
|
374
|
+
"Goal":'goal',
|
375
|
+
"Turnover":'giveaway',
|
376
|
+
"Delayed Penalty":'delayed-penalty',
|
377
|
+
"Penalty":'penalty',
|
378
|
+
})
|
379
|
+
|
380
|
+
#Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
|
381
|
+
espn_events['period_time_simple'] = espn_events['clock'].str.replace(":","",regex=True)
|
382
|
+
espn_events['period_seconds_elapsed'] = np.where(espn_events['period_time_simple'].str.len()==3,
|
383
|
+
((espn_events['period_time_simple'].str[0].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)),
|
384
|
+
((espn_events['period_time_simple'].str[0:2].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)))
|
385
|
+
espn_events['seconds_elapsed'] = ((espn_events['period']-1)*1200)+espn_events['period_seconds_elapsed']
|
386
|
+
|
387
|
+
espn_events = espn_events.rename(columns = {'text':'description'})
|
388
|
+
|
389
|
+
#Add event team
|
390
|
+
espn_events['event_team_abbr'] = espn_events['homeAway'].replace({
|
391
|
+
"away":away,
|
392
|
+
"home":home
|
393
|
+
})
|
394
|
+
|
395
|
+
#Some games (mostly preseason and all star games) do not include coordinates.
|
396
|
+
try:
|
397
|
+
espn_events['x_fixed'] = abs(espn_events['coords_x'])
|
398
|
+
espn_events['y_fixed'] = np.where(espn_events['coords_x']<0,-espn_events['coords_y'],espn_events['coords_y'])
|
399
|
+
espn_events['x_adj'] = np.where(espn_events['homeAway']=="home",espn_events['x_fixed'],-espn_events['x_fixed'])
|
400
|
+
espn_events['y_adj'] = np.where(espn_events['homeAway']=="home",espn_events['y_fixed'],-espn_events['y_fixed'])
|
401
|
+
espn_events['event_distance'] = np.sqrt(((89 - espn_events['x_fixed'])**2) + (espn_events['y_fixed']**2))
|
402
|
+
espn_events['event_angle'] = np.degrees(np.arctan2(abs(espn_events['y_fixed']), abs(89 - espn_events['x_fixed'])))
|
403
|
+
except TypeError:
|
404
|
+
print(f"No coordinates found for ESPN game...")
|
405
|
+
|
406
|
+
espn_events['x_fixed'] = np.nan
|
407
|
+
espn_events['y_fixed'] = np.nan
|
408
|
+
espn_events['x_adj'] = np.nan
|
409
|
+
espn_events['y_adj'] = np.nan
|
410
|
+
espn_events['event_distance'] = np.nan
|
411
|
+
espn_events['event_angle'] = np.nan
|
412
|
+
|
413
|
+
#Assign score and fenwick for each event
|
414
|
+
fenwick_events = ['missed-shot','shot-on-goal','goal']
|
415
|
+
ag = 0
|
416
|
+
ags = []
|
417
|
+
hg = 0
|
418
|
+
hgs = []
|
419
|
+
|
420
|
+
af = 0
|
421
|
+
afs = []
|
422
|
+
hf = 0
|
423
|
+
hfs = []
|
424
|
+
for event,team in zip(list(espn_events['event_type']),list(espn_events['homeAway'])):
|
425
|
+
if event in fenwick_events:
|
426
|
+
if team == "home":
|
427
|
+
hf += 1
|
428
|
+
if event == 'goal':
|
429
|
+
hg += 1
|
430
|
+
else:
|
431
|
+
af += 1
|
432
|
+
if event == 'goal':
|
433
|
+
ag += 1
|
434
|
+
|
435
|
+
ags.append(ag)
|
436
|
+
hgs.append(hg)
|
437
|
+
afs.append(af)
|
438
|
+
hfs.append(hf)
|
439
|
+
|
440
|
+
espn_events['away_score'] = ags
|
441
|
+
espn_events['home_score'] = hgs
|
442
|
+
espn_events['away_fenwick'] = afs
|
443
|
+
espn_events['home_fenwick'] = hfs
|
444
|
+
#Return: play-by-play events in supplied game from ESPN
|
445
|
+
return espn_events
|
446
|
+
|
447
|
+
## HTML PBP FUNCTIONS ##
|
448
|
+
def strip_html_pbp(td,rosters):
|
449
|
+
#Given html row, parse data from HTML pbp
|
450
|
+
#Harry Shomer's Code (modified)
|
451
|
+
|
452
|
+
#HTML Parsing
|
453
|
+
for y in range(len(td)):
|
454
|
+
# Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
|
455
|
+
if y == 3:
|
456
|
+
td[y] = td[y].get_text() # This gets us elapsed and remaining combined-< 3:0017:00
|
457
|
+
index = td[y].find(':')
|
458
|
+
td[y] = td[y][:index+3]
|
459
|
+
elif (y == 6 or y == 7) and td[0] != '#':
|
460
|
+
# 6 & 7 -> These are the on-ice player columns (6 = away, 7 = home)
|
461
|
+
# The second statement controls for when it's just a header
|
462
|
+
baz = td[y].find_all('td')
|
463
|
+
bar = [baz[z] for z in range(len(baz)) if z % 4 != 0] # Because of previous step we get repeats...delete some
|
464
|
+
|
465
|
+
# The setup in the list is now: Name/Number->Position->Blank...and repeat
|
466
|
+
# Now strip all the html
|
467
|
+
players = []
|
468
|
+
for i in range(len(bar)):
|
469
|
+
if i % 3 == 0:
|
470
|
+
try:
|
471
|
+
#Using the supplied json we can bind player name and id to number and team
|
472
|
+
#Find number and team of player then lookup roster dictionary
|
473
|
+
|
474
|
+
number = bar[i].get_text().strip('\n') # Get number and strip leading/trailing newlines
|
475
|
+
if y == 6:
|
476
|
+
team = 'away'
|
477
|
+
else:
|
478
|
+
team = 'home'
|
479
|
+
|
480
|
+
id = rosters[team][str(number)][4]
|
481
|
+
name = rosters[team][str(number)][2]
|
482
|
+
position = rosters[team][str(number)][1]
|
483
|
+
|
484
|
+
except KeyError:
|
485
|
+
name = ''
|
486
|
+
number = ''
|
487
|
+
id = ''
|
488
|
+
elif i % 3 == 1:
|
489
|
+
if name != '':
|
490
|
+
players.append([name, number, position, id])
|
491
|
+
|
492
|
+
td[y] = players
|
493
|
+
else:
|
494
|
+
td[y] = td[y].get_text()
|
495
|
+
|
496
|
+
return td
|
266
497
|
|
267
|
-
# Get Home Team
|
268
|
-
teams = soup.find_all('td', {'align': 'center', 'style': 'font-size: 10px;font-weight:bold'})
|
269
|
-
regex = re.compile(r'>(.*)<br/?>')
|
270
|
-
home_team = regex.findall(str(teams[7]))
|
271
498
|
|
272
|
-
|
499
|
+
def clean_html_pbp(info):
|
500
|
+
#Harry Shomer's Code (modified)
|
273
501
|
|
274
|
-
|
275
|
-
|
502
|
+
game_id = info['game_id']
|
503
|
+
#Retrieve data
|
504
|
+
season = info['season']
|
505
|
+
doc = f"https://www.nhl.com/scores/htmlreports/{season}/PL{game_id[-6:]}.HTM"
|
506
|
+
html = rs.get(doc).content
|
507
|
+
soup = get_contents(html)
|
508
|
+
|
509
|
+
#Rosters
|
510
|
+
rosters = info['HTML_rosters']
|
511
|
+
|
512
|
+
# Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
|
513
|
+
td = [soup[i:i + 8] for i in range(0, len(soup), 8)]
|
514
|
+
|
515
|
+
cleaned_html = [strip_html_pbp(x,rosters) for x in td]
|
516
|
+
|
517
|
+
return cleaned_html
|
518
|
+
|
519
|
+
def parse_html(info):
|
520
|
+
#Given game info, return HTML event data
|
521
|
+
|
522
|
+
#Retrieve game information and html events
|
523
|
+
rosters = info['HTML_rosters']
|
524
|
+
events = clean_html_pbp(info)
|
525
|
+
|
526
|
+
teams = {info['away_team_abbr']:['away'],
|
527
|
+
info['home_team_abbr']:['home']}
|
528
|
+
|
529
|
+
#Parsing
|
530
|
+
event_log = []
|
531
|
+
for event in events:
|
532
|
+
events_dict = {}
|
533
|
+
if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM','SPC','PBOX','SOC'] or event[3]=='-16:0-':
|
534
|
+
continue
|
535
|
+
else:
|
536
|
+
#Event info
|
537
|
+
events_dict['event_num'] = int(event[0])
|
538
|
+
events_dict['period'] = int(event[1])
|
539
|
+
events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
|
540
|
+
events_dict['period_time_elapsed'] = event[3]
|
541
|
+
events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
|
542
|
+
events_dict['event_type'] = event[4]
|
543
|
+
|
544
|
+
desc = re.sub(u'\xa0'," ",event[5])
|
545
|
+
events_dict['description'] = desc
|
546
|
+
|
547
|
+
events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
|
548
|
+
zone = [x for x in desc.split(',') if 'Zone' in x]
|
549
|
+
if not zone:
|
550
|
+
events_dict['zone_code'] = None
|
551
|
+
elif zone[0].find("Off") != -1:
|
552
|
+
events_dict['zone_code'] = 'O'
|
553
|
+
elif zone[0].find("Neu") != -1:
|
554
|
+
events_dict['zone_code'] = 'N'
|
555
|
+
elif zone[0].find("Def") != -1:
|
556
|
+
events_dict['zone_code'] = 'D'
|
557
|
+
|
558
|
+
#Convert team names for compatiblity
|
559
|
+
replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
|
560
|
+
for name, repl in replace:
|
561
|
+
desc = desc.replace(repl,name)
|
562
|
+
|
563
|
+
event_team = desc[0:3] if desc[0:3] in teams.keys() else ""
|
564
|
+
events_dict['event_team_abbr'] = event_team
|
565
|
+
|
566
|
+
events_dict['away_team_abbr'] = info['away_team_abbr']
|
567
|
+
events_dict['home_team_abbr'] = info['home_team_abbr']
|
568
|
+
|
569
|
+
away_skaters = 0
|
570
|
+
away_goalie = 0
|
571
|
+
#Away on-ice
|
572
|
+
for i in range(len(event[6])):
|
573
|
+
player = event[6][i][0]
|
574
|
+
pos = event[6][i][2]
|
575
|
+
id = event[6][i][3]
|
576
|
+
|
577
|
+
if pos == 'G':
|
578
|
+
events_dict['away_goalie'] = player
|
579
|
+
events_dict['away_goalie_id'] = id
|
580
|
+
away_goalie += 1
|
581
|
+
else:
|
582
|
+
events_dict[f'away_on_{i+1}'] = player
|
583
|
+
events_dict[f'away_on_{i+1}_id'] = id
|
584
|
+
away_skaters += 1
|
585
|
+
|
586
|
+
home_skaters = 0
|
587
|
+
home_goalie = 0
|
588
|
+
#Home on-ice
|
589
|
+
for i in range(len(event[7])):
|
590
|
+
player = event[7][i][0]
|
591
|
+
pos = event[7][i][2]
|
592
|
+
id = event[7][i][3]
|
593
|
+
|
594
|
+
if pos == 'G':
|
595
|
+
events_dict['home_goalie'] = player
|
596
|
+
events_dict['home_goalie_id'] = id
|
597
|
+
home_goalie += 1
|
598
|
+
else:
|
599
|
+
events_dict[f'home_on_{i+1}'] = player
|
600
|
+
events_dict[f'home_on_{i+1}_id'] = id
|
601
|
+
home_skaters += 1
|
602
|
+
|
603
|
+
event_players = []
|
604
|
+
#Determine parsing route based on event
|
605
|
+
if event[4] in ['FAC','HIT','BLOCK','PENL']:
|
606
|
+
#Regex to find team and player number involved (finds all for each event)
|
607
|
+
#Code is modified from Harry Shomer in order to account for periods in a team abbreviation
|
608
|
+
regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
|
609
|
+
fac = regex.findall(desc)
|
610
|
+
#Filter incorrectly parsed teams
|
611
|
+
repl = []
|
612
|
+
for team, num in fac:
|
613
|
+
if team in teams.keys():
|
614
|
+
repl.append((team,num))
|
615
|
+
fac = repl
|
616
|
+
|
617
|
+
#Find first event player
|
618
|
+
ep1_num = ''
|
619
|
+
for i in range(len(fac)):
|
620
|
+
team, num = fac[i]
|
621
|
+
if team == event_team:
|
622
|
+
ep1_num = num
|
623
|
+
event_players.append(fac[i])
|
624
|
+
else:
|
625
|
+
continue
|
626
|
+
|
627
|
+
#Find other players
|
628
|
+
for i in range(len(fac)):
|
629
|
+
team, num = fac[i]
|
630
|
+
if num == ep1_num:
|
631
|
+
continue
|
632
|
+
else:
|
633
|
+
event_players.append(fac[i])
|
634
|
+
elif event[4]=='GOAL':
|
635
|
+
#Parse goal
|
636
|
+
regex = re.compile(r'#(\d+)\s+')
|
637
|
+
goal = regex.findall(desc)
|
638
|
+
|
639
|
+
#Add all involved players
|
640
|
+
for point in goal:
|
641
|
+
#In this loop, point is a player number. We can assign event_team to all players in a goal
|
642
|
+
event_players.append((event_team,str(point)))
|
643
|
+
elif event[4]=='DELPEN':
|
644
|
+
#Don't parse DELPEN events
|
645
|
+
#These events typically have no text but when they do it is often erroneous or otherwise problematic
|
646
|
+
|
647
|
+
""
|
648
|
+
else:
|
649
|
+
#Parse single or no player events
|
650
|
+
regex = re.compile(r'#\d+')
|
651
|
+
fac = regex.findall(desc)
|
652
|
+
|
653
|
+
for i in range(len(fac)):
|
654
|
+
num = fac[i].replace("#","")
|
655
|
+
event_players.append((event_team,str(num)))
|
656
|
+
|
657
|
+
for i in range(len(event_players)):
|
658
|
+
#For each player, evaluate their event data, then retreive information from rosters
|
659
|
+
team, num = event_players[i]
|
660
|
+
|
661
|
+
status = teams[team]
|
662
|
+
data = rosters[status[0]]
|
663
|
+
|
664
|
+
events_dict[f'event_player_{i+1}_name'] = data[str(num)][2]
|
665
|
+
events_dict[f'event_player_{i+1}_id'] = data[str(num)][4]
|
666
|
+
events_dict[f'event_player_{i+1}_pos'] = data[str(num)][1]
|
667
|
+
|
668
|
+
events_dict['away_skaters'] = away_skaters
|
669
|
+
events_dict['home_skaters'] = home_skaters
|
670
|
+
events_dict['away_goalie_in'] = away_goalie
|
671
|
+
events_dict['home_goalie_in'] = home_goalie
|
672
|
+
|
673
|
+
event_skaters = away_skaters if info['away_team_abbr'] == event_team else home_skaters
|
674
|
+
event_skaters_against = away_skaters if info['home_team_abbr'] == event_team else home_skaters
|
675
|
+
events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
|
676
|
+
events_dict['event_skaters'] = np.where(event_team == info['home_team_abbr'],home_skaters,away_skaters)
|
677
|
+
|
678
|
+
event_log.append(pd.DataFrame([events_dict]))
|
679
|
+
|
680
|
+
data = pd.concat(event_log)
|
681
|
+
data['event_type'] = data['event_type'].replace({
|
682
|
+
"PGSTR": "pre-game-start",
|
683
|
+
"PGEND": "pre-game-end",
|
684
|
+
'GSTR':"game-start",
|
685
|
+
"ANTHEM":"anthem",
|
686
|
+
"PSTR":"period-start",
|
687
|
+
'FAC':"faceoff",
|
688
|
+
"SHOT":"shot-on-goal",
|
689
|
+
"BLOCK":"blocked-shot",
|
690
|
+
"STOP":"stoppage",
|
691
|
+
"MISS":"missed-shot",
|
692
|
+
"HIT":"hit",
|
693
|
+
"GOAL":"goal",
|
694
|
+
"GIVE":"giveaway",
|
695
|
+
"TAKE":"takeaway",
|
696
|
+
"DELPEN":"delayed-penalty",
|
697
|
+
"PENL":"penalty",
|
698
|
+
"CHL":"challenge",
|
699
|
+
"PEND":"period-end",
|
700
|
+
"GEND":"game-end"
|
701
|
+
})
|
702
|
+
|
703
|
+
#Return: parsed HTML pbp
|
704
|
+
return data
|
705
|
+
|
706
|
+
def combine_pbp(info):
    #Given game info, return complete play-by-play data for provided game
    #
    #The parsed HTML events are the left side of the merge; coordinates and the other
    #supplementary columns come from ESPN (seasons 2005-06 through 2009-10) or from the
    #JSON play-by-play (every other season).
    #Reads from info: 'season', 'game_date', 'away_team_abbr', 'home_team_abbr' and every
    #key listed in info_col below.

    html_pbp = parse_html(info)

    #Route data combining - ESPN for the listed seasons, json otherwise
    if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
        #ESPN x HTML
        espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr'])
        espn_pbp = espn_pbp.rename(columns={'coords_x':'x',"coords_y":'y'}).drop(columns=['event_player_1_name'])
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']

        df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)

    else:
        #JSON x HTML
        json_pbp = parse_json(info)
        #Modify merge conditions and merge pbps
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
        html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore')

        #While rare sometimes column 'event_player_1_id' is interpreted differently between the two dataframes.
        html_pbp['event_player_1_id'] = html_pbp['event_player_1_id'].astype(object)
        json_pbp['event_player_1_id'] = json_pbp['event_player_1_id'].astype(object)

        df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col)

    #Add game info
    info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
                'away_team_abbr','home_team_abbr']

    for col in info_col:
        df[col] = info[col]

    #Fill period_type column and assign shifts a sub-500 event code
    df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
    #Only some sources provide event codes; skip explicitly instead of swallowing every error
    if 'event_type_code' in df.columns:
        df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
    df = df.sort_values(['period','seconds_elapsed']).reset_index()

    #Events without a team carry an empty string (not NaN) from the HTML parser, so the
    #venue is matched against both teams and left blank when neither matches
    df['event_team_venue'] = np.where(df['event_team_abbr']==df['home_team_abbr'],"home",
                             np.where(df['event_team_abbr']==df['away_team_abbr'],"away",""))

    #Correct strength state for penalty shots and shootouts - most games don't have shifts in the shootout and are excluded otherwise
    #NOTE(review): the period-5 rule below does not look at season_type, unlike period_type above - confirm this is intended
    df['strength_state'] = np.where(np.logical_and(df['period'].astype(str)=='5',df['event_type'].isin(['missed-shot','shot-on-goal','goal'])),"1v0",df['strength_state'])
    #na=False keeps a missing description from being treated as a match
    df['strength_state'] = np.where(df['description'].str.contains('Penalty Shot',case=False,na=False),"1v0",df['strength_state'])

    col = [col for col in get_col() if col in df.columns.to_list()]
    #Return: complete play-by-play information for provided game
    return df[col]
|
755
|
+
|
756
|
+
## SHIFT SCRAPING FUNCTIONS ##
|
757
|
+
def parse_shifts_json(info):
    #Given game info, return the json shift chart split by team
    #Reads info['json_shifts'] plus the away/home team abbreviations from info

    raw = info['json_shifts']

    #Keep shift rows only (detailCode of 0) and a single row per player shift number
    is_shift = raw['detailCode']==0
    chart = raw.loc[is_shift].drop_duplicates(subset=['playerId','shiftNumber'])

    #Upper-case full name built from the first and last name columns
    full_name = chart['firstName'] + " " + chart['lastName']
    chart['player_name'] = full_name.str.upper()

    renamed = {
        'playerId':'player_id',
        'teamAbbrev':'event_team_abbr',
        'startTime':'start',
        'endTime':'end'
    }
    chart = chart.rename(columns=renamed)

    #Shift boundaries are converted with convert_to_seconds
    for bound in ('start','end'):
        chart[bound] = chart[bound].astype(str).apply(convert_to_seconds)

    keep = ['player_name','player_id',
            'period','event_team_abbr',
            'start','duration','end']
    chart = chart[keep]

    #Duration is recomputed from the converted boundaries
    chart['duration'] = chart['end'] - chart['start']

    #Return: JSON shifts (separated by team)
    by_team = {}
    for venue in ('away','home'):
        by_team[venue] = chart.loc[chart['event_team_abbr']==info[f'{venue}_team_abbr']]
    return by_team
|
790
|
+
|
791
|
+
def analyze_shifts(shift, id, name, pos, team):
    #Builds a single shift record from one row of a player's HTML shift table
    #Modified version of Harry Shomer's analyze_shifts function in the hockey_scraper package
    #Row cells read here: [1] period label, [2] start, [3] end, [4] duration
    #(only the text before the first '/' of each time cell is used)

    def before_slash(cell):
        return cell.split('/')[0]

    player_name = name.upper()

    #Overtime and shootout labels are stored as periods '4' and '5'
    label = shift[1]
    if label == 'OT':
        period = '4'
    elif label == 'SO':
        period = '5'
    else:
        period = label

    team_abbr = get_team(team.strip(' '))
    began = convert_to_seconds(before_slash(shift[2]))
    lasted = convert_to_seconds(before_slash(shift[4]))

    #Sometimes there are no digits in the end cell; fall back to start + duration
    end_text = before_slash(shift[3])
    if re.search(r'\d+', end_text):
        ended = convert_to_seconds(end_text)
    else:
        ended = began + lasted

    return {
        'player_name': player_name,
        'player_id': id,
        'player_pos': pos,
        'period': period,
        'event_team_abbr': team_abbr,
        'start': began,
        'duration': lasted,
        'end': ended,
    }
|
301
810
|
|
302
|
-
def
|
303
|
-
#
|
304
|
-
#Stage one: create dataframe with raw individual shifts
|
305
|
-
#Stage two: convert shift events to play-by-play structure created with json_parsing
|
811
|
+
def parse_shifts_html(info,home):
|
812
|
+
#Parsing of shifts data for a single team in a provided game
|
306
813
|
#Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package
|
307
814
|
|
815
|
+
#Roster info prep
|
816
|
+
roster = info['HTML_rosters']
|
308
817
|
|
818
|
+
rosters = roster['home' if home else 'away']
|
819
|
+
|
309
820
|
all_shifts = []
|
310
|
-
columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
|
821
|
+
#columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
|
311
822
|
|
312
|
-
|
823
|
+
#Retrieve HTML
|
824
|
+
game_id = info['game_id']
|
825
|
+
season = info['season']
|
826
|
+
link = f"https://www.nhl.com/scores/htmlreports/{season}/T{'H' if home else 'V'}{game_id[-6:]}.HTM"
|
827
|
+
doc = rs.get(link).content
|
828
|
+
td, teams = get_soup(doc)
|
313
829
|
|
314
830
|
team = teams[0]
|
315
|
-
home_team = teams[1]
|
316
831
|
players = dict()
|
317
832
|
|
318
833
|
# Iterates through each player shifts table with the following data:
|
@@ -321,37 +836,55 @@ def parse_shifts(html, player_ids, game_id):
|
|
321
836
|
t = t.get_text()
|
322
837
|
if ',' in t: # If a comma exists it is a player
|
323
838
|
name = t
|
839
|
+
|
324
840
|
name = name.split(',')
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
841
|
+
number = int(name[0][:2].strip())
|
842
|
+
id = rosters[str(number)][4]
|
843
|
+
players[id] = dict()
|
844
|
+
|
845
|
+
#HTML shift functions assess one team at a time, which simplifies the lookup process with number to name and id
|
846
|
+
|
847
|
+
players[id]['name'] = rosters[str(number)][2]
|
848
|
+
players[id]['pos'] = rosters[str(number)][1]
|
849
|
+
|
850
|
+
players[id]['shifts'] = []
|
331
851
|
else:
|
332
|
-
players[
|
852
|
+
players[id]['shifts'].extend([t])
|
333
853
|
|
334
854
|
for key in players.keys():
|
335
855
|
# Create lists of shifts-table columns for analysis
|
336
856
|
players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]
|
337
857
|
|
858
|
+
name = players[key]['name']
|
859
|
+
pos = players[key]['pos']
|
860
|
+
|
338
861
|
# Parsing
|
339
|
-
shifts = [analyze_shifts(shift, key,
|
862
|
+
shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
|
340
863
|
all_shifts.extend(shifts)
|
341
864
|
|
342
865
|
df = pd.DataFrame(all_shifts)
|
343
|
-
df['game_id'] = str(game_id)
|
344
866
|
|
345
|
-
shifts_raw = df[
|
867
|
+
shifts_raw = df[df['duration'] > 0]
|
346
868
|
|
347
|
-
|
869
|
+
#Return: single-team individual shifts by player
|
870
|
+
return shifts_raw
|
871
|
+
|
872
|
+
def parse_shift_events(info,home):
|
873
|
+
#Given game info and home team conditional, parse and convert document to shift events congruent to html play-by-play
|
874
|
+
|
875
|
+
#Determine whether to use JSON shifts or HTML shifts
|
876
|
+
if len(info['json_shifts']) == 0:
|
877
|
+
shift = parse_shifts_html(info,home)
|
878
|
+
else:
|
879
|
+
shift = parse_shifts_json(info)['home' if home else 'away']
|
880
|
+
|
881
|
+
rosters = info['rosters']
|
348
882
|
|
349
|
-
# Second-stage beginds here
|
350
883
|
# Identify shift starts for each shift event
|
351
|
-
shifts_on =
|
884
|
+
shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
|
352
885
|
num_on=('player_name', 'size'),
|
353
886
|
players_on=('player_name', lambda x: ', '.join(x)),
|
354
|
-
ids_on=('player_id', lambda x: ', '.join(map(str,
|
887
|
+
ids_on=('player_id', lambda x: ', '.join(map(str,x))),
|
355
888
|
).reset_index()
|
356
889
|
|
357
890
|
shifts_on = shifts_on.rename(columns={
|
@@ -359,10 +892,10 @@ def parse_shifts(html, player_ids, game_id):
|
|
359
892
|
})
|
360
893
|
|
361
894
|
# Identify shift stops for each shift event
|
362
|
-
shifts_off =
|
895
|
+
shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
|
363
896
|
num_off=('player_name', 'size'),
|
364
897
|
players_off=('player_name', lambda x: ', '.join(x)),
|
365
|
-
ids_off=('player_id', lambda x: ', '.join(map(str,
|
898
|
+
ids_off=('player_id', lambda x: ', '.join(map(str,x))),
|
366
899
|
).reset_index()
|
367
900
|
|
368
901
|
shifts_off = shifts_off.rename(columns={
|
@@ -370,57 +903,29 @@ def parse_shifts(html, player_ids, game_id):
|
|
370
903
|
})
|
371
904
|
|
372
905
|
# Merge and sort by time in game
|
373
|
-
shifts = pd.merge(shifts_on, shifts_off, on=['
|
906
|
+
shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')
|
374
907
|
|
375
|
-
shifts = shifts
|
376
|
-
|
377
|
-
#Modify columns of new total shifts dataframe
|
378
|
-
shifts['period'] = shifts['period'].astype(int)
|
908
|
+
shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
|
379
909
|
shifts['event_type'] = 'change'
|
380
|
-
shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200 * (shifts['period']-1))
|
381
|
-
shifts['game_seconds_remaining'] = 3600 - shifts['seconds_elapsed']
|
382
|
-
|
383
|
-
# Handle missing values at the start and end of periods
|
384
|
-
shifts['players_on'] = shifts['players_on'].fillna('None')
|
385
|
-
shifts['players_off'] = shifts['players_off'].fillna('None')
|
386
|
-
shifts['ids_on'] = shifts['ids_on'].fillna('0')
|
387
|
-
shifts['ids_off'] = shifts['ids_off'].fillna('0')
|
388
|
-
shifts['num_on'] = shifts['num_on'].fillna(0).astype(int)
|
389
|
-
shifts['num_off'] = shifts['num_off'].fillna(0).astype(int)
|
390
|
-
|
391
|
-
#Manual Team Rename
|
392
|
-
shifts['team_abbr'] = shifts['team_abbr'].replace({
|
393
|
-
"L.A":"LAK",
|
394
|
-
"N.J":"NJD",
|
395
|
-
"S.J":"SJS",
|
396
|
-
"T.B":"TBL"
|
397
|
-
})
|
398
910
|
|
399
|
-
#
|
400
|
-
|
911
|
+
#Shift events similar to html (remove shootout shifts)
|
912
|
+
shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])
|
913
|
+
|
914
|
+
#Generate on-ice columns
|
915
|
+
skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
|
916
|
+
goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
|
917
|
+
team = list(shift['event_team_abbr'])[0]
|
401
918
|
|
402
|
-
def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
|
403
|
-
#Given roster info (from the retreive_players function), shifts df, and team, generate on_ice columns for shift events
|
404
|
-
#These on-ice columns configure the on-ice players for events in the json play by play as well
|
405
919
|
skaters = pd.DataFrame()
|
406
920
|
goalies = pd.DataFrame()
|
407
|
-
|
408
|
-
team = {key:value for key, value in rosters['home'].items() if value['pos'] != "G"}
|
409
|
-
else:
|
410
|
-
team = {key:value for key, value in rosters['away'].items() if value['pos'] != "G"}
|
411
|
-
|
412
|
-
names = list(team.keys())
|
413
|
-
try: names.remove("")
|
414
|
-
except ValueError: ""
|
415
|
-
|
416
|
-
for player in names:
|
921
|
+
for player in skater_names:
|
417
922
|
#For each player in the game, determine when they began and ended shifts.
|
418
923
|
#With player names as columns, 1 represents a shift event a player was on the ice for while 0 represents off the ice
|
419
924
|
on_ice = (np.cumsum(
|
420
|
-
shifts.loc[(shifts['event_team_abbr'] ==
|
925
|
+
shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
|
421
926
|
.apply(str)
|
422
927
|
.apply(lambda x: int(bool(re.search(player, x)))) -
|
423
|
-
shifts.loc[(shifts['event_team_abbr'] ==
|
928
|
+
shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
|
424
929
|
.apply(str)
|
425
930
|
.apply(lambda x: int(bool(re.search(player, x))))
|
426
931
|
))
|
@@ -428,32 +933,22 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
|
|
428
933
|
|
429
934
|
skaters = skaters.fillna(0).astype(int)
|
430
935
|
|
431
|
-
|
432
936
|
on_skaters = (skaters == 1).stack().reset_index()
|
433
937
|
on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()
|
434
938
|
|
435
939
|
max_players = 6
|
436
940
|
for i in range(max_players):
|
437
|
-
on_skaters[f"{'home' if home else 'away'}_on_{i+1}"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
|
941
|
+
on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
|
438
942
|
|
439
943
|
on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
|
440
944
|
|
441
|
-
#Repeat
|
442
|
-
|
443
|
-
team = {key:value for key, value in rosters['home'].items() if value['pos'] == "G"}
|
444
|
-
else:
|
445
|
-
team = {key:value for key, value in rosters['away'].items() if value['pos'] == "G"}
|
446
|
-
|
447
|
-
names = list(team.keys())
|
448
|
-
try: names.remove("")
|
449
|
-
except ValueError: ""
|
450
|
-
|
451
|
-
for player in names:
|
945
|
+
#Repeat this process with goaltenders
|
946
|
+
for player in goalie_names:
|
452
947
|
on_ice = (np.cumsum(
|
453
|
-
shifts.loc[(shifts['event_team_abbr'] ==
|
948
|
+
shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
|
454
949
|
.apply(str)
|
455
950
|
.apply(lambda x: int(bool(re.search(player, x)))) -
|
456
|
-
shifts.loc[(shifts['event_team_abbr'] ==
|
951
|
+
shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
|
457
952
|
.apply(str)
|
458
953
|
.apply(lambda x: int(bool(re.search(player, x))))
|
459
954
|
))
|
@@ -466,7 +961,7 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
|
|
466
961
|
|
467
962
|
max_players = 1
|
468
963
|
for i in range(max_players):
|
469
|
-
on_goalies[f"{'home' if home else 'away'}
|
964
|
+
on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
|
470
965
|
|
471
966
|
on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
|
472
967
|
|
@@ -474,104 +969,100 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
|
|
474
969
|
on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])
|
475
970
|
|
476
971
|
shifts['row'] = shifts.index
|
477
|
-
|
972
|
+
|
973
|
+
if home:
|
974
|
+
shifts['home_team_abbr'] = team
|
975
|
+
else:
|
976
|
+
shifts['away_team_abbr'] = team
|
478
977
|
#Return: shift events with newly added on-ice columns. NaN values are replaced with an empty string ("") as a means to create proper on-ice columns for json pbp
|
479
|
-
return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"
|
978
|
+
return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")
|
480
979
|
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
data_id = retreive_players(json)
|
980
|
+
## FINALIZE PBP FUNCTIONS ##
|
981
|
+
def combine_shifts(info):
    #Given game info, return complete shift events for both teams in play-by-play format
    #Note: the 'playerId' column of info['rosters'] is converted to str in place

    venues = ('away','home')
    slots = range(1,7)

    #JSON Prep
    roster = info['rosters']

    #Shift events for each team (away first), stacked and ordered by game time
    team_events = [parse_shift_events(info,False), parse_shift_events(info,True)]
    data = pd.concat(team_events).sort_values(['period','seconds_elapsed'])

    #Add game info
    for key in ('season','season_type','game_id','game_date',"venue","venue_location",
                'away_team_abbr','home_team_abbr'):
        data[key] = info[key]

    #Player id -> full name lookup used to create the on-ice name columns
    roster['playerId'] = roster['playerId'].astype(str)
    players = roster.set_index("playerId")['full_name'].to_dict()

    for slot in slots:
        for venue in venues:
            data[f'{venue}_on_{slot}'] = data[f'{venue}_on_{slot}_id'].replace(players)
    for venue in venues:
        data[f'{venue}_goalie'] = data[f'{venue}_goalie_id'].replace(players)

    data = data.sort_values(['period','seconds_elapsed'])

    #Fill on-ice columns down
    on_ice_col = []
    for venue in venues:
        on_ice_col += [f'{venue}_on_{slot}' for slot in slots]
        on_ice_col += [f'{venue}_on_{slot}_id' for slot in slots]
    on_ice_col += ['away_goalie','home_goalie','away_goalie_id','home_goalie_id']

    for name in on_ice_col:
        data[name] = data[name].ffill()

    #Create strength state information (blank on-ice id cells do not count as a skater)
    def count_skaters(venue):
        id_cols = [f'{venue}_on_{slot}_id' for slot in slots]
        return data[id_cols].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)

    data['away_skaters'] = count_skaters('away')
    data['home_skaters'] = count_skaters('home')

    away_text = data['away_skaters'].astype(str)
    home_text = data['home_skaters'].astype(str)
    away_is_event_team = data['event_team_abbr']==data['away_team_abbr']
    data['strength_state'] = np.where(away_is_event_team,away_text+"v"+home_text,home_text+"v"+away_text)

    #Return: full shifts data converted to play-by-play format
    present = data.columns.to_list()
    return data[[name for name in get_col() if name in present]]
|
523
1034
|
|
524
|
-
|
525
|
-
|
526
|
-
if alt == np.nan or alt == "" or str(alt) == 'nan':
|
527
|
-
continue
|
528
|
-
else:
|
529
|
-
replace.update({alt:default})
|
530
|
-
|
531
|
-
return shifts_df.replace(replace,regex=True)
|
1035
|
+
def combine_data(info):
|
1036
|
+
#Given game info, return complete play-by-play data
|
532
1037
|
|
533
|
-
|
534
|
-
return [
|
535
|
-
'season','season_type','game_id','game_date',"start_time","venue","venue_location",
|
536
|
-
'away_team_abbr','home_team_abbr','event_num','period','period_type',
|
537
|
-
'seconds_elapsed', "situation_code","strength_state","home_team_defending_side","shift_type",
|
538
|
-
"event_type_code","event_type","description","reason","penalty_duration","penalty_description",
|
539
|
-
"event_team_abbr",'num_on', 'players_on', 'ids_on', 'num_off', 'players_off', 'ids_off',
|
540
|
-
"event_team_status","event_player_1_id","event_player_2_id","event_player_3_id",
|
541
|
-
"event_player_1_name","event_player_2_name","event_player_3_name","event_player_1_pos","event_player_2_pos",
|
542
|
-
"event_player_3_pos","event_goalie_id",
|
543
|
-
"event_goalie_name","shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
|
544
|
-
"event_skaters","away_skaters","home_skaters",
|
545
|
-
"event_distance","event_angle","away_score","home_score", "away_fenwick", "home_fenwick",
|
546
|
-
"away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
|
547
|
-
"home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie"
|
548
|
-
]
|
1038
|
+
game_id = info['game_id']
|
549
1039
|
|
550
|
-
|
551
|
-
|
552
|
-
df = pd.concat([json,html])
|
1040
|
+
pbp = combine_pbp(info)
|
1041
|
+
shifts = combine_shifts(info)
|
553
1042
|
|
554
|
-
#
|
555
|
-
df
|
556
|
-
df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
|
1043
|
+
#Combine data
|
1044
|
+
df = pd.concat([pbp,shifts])
|
557
1045
|
|
558
1046
|
#Create priority columns designed to order events that occur at the same time in a game
|
559
|
-
start_pri = ['period-start','game-start']
|
560
1047
|
even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
|
561
|
-
df['priority'] = np.where(df['event_type'].isin(
|
562
|
-
np.where(df['event_type'].isin(even_pri),1,
|
1048
|
+
df['priority'] = np.where(df['event_type'].isin(even_pri),1,
|
563
1049
|
np.where(df['event_type']=='goal',2,
|
564
1050
|
np.where(df['event_type']=='stoppage',3,
|
565
|
-
np.where(df['event_type']=='penalty',4,
|
566
|
-
np.where(df['event_type']=='
|
1051
|
+
np.where(df['event_type']=='delayed-penalty',4,
|
1052
|
+
np.where(df['event_type']=='penalty',5,
|
567
1053
|
np.where(df['event_type']=='period-end',6,
|
568
|
-
np.where(df['event_type']=='
|
569
|
-
np.where(df['event_type']=='
|
1054
|
+
np.where(df['event_type']=='change',7,
|
1055
|
+
np.where(df['event_type']=='game-end',8,
|
1056
|
+
np.where(df['event_type']=='period-start',9,
|
1057
|
+
np.where(df['event_type']=='faceoff',10,0))))))))))
|
1058
|
+
|
1059
|
+
df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
|
1060
|
+
df = df.sort_values(['period','seconds_elapsed','priority'])
|
570
1061
|
|
571
|
-
|
572
|
-
|
1062
|
+
#Recalibrate event_num column to accurately depict the order of all events, including changes
|
1063
|
+
df.reset_index(inplace=True,drop=True)
|
573
1064
|
df['event_num'] = df.index+1
|
574
|
-
df['
|
1065
|
+
df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
|
575
1066
|
df['event_type_last'] = df['event_type'].shift(1)
|
576
1067
|
df['event_type_last_2'] = df['event_type_last'].shift(1)
|
577
1068
|
df['event_type_next'] = df['event_type'].shift(-1)
|
@@ -580,60 +1071,36 @@ def combine_data(json,html):
|
|
580
1071
|
period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
|
581
1072
|
#Define shifts by "line-change" or "on-the-fly"
|
582
1073
|
df['shift_type'] = np.where(df['event_type']=='change',np.where(np.logical_or(np.logical_or(df['event_type_last'].isin(lag_events),df['event_type_last_2'].isin(lag_events),df['event_type_next'].isin(lead_events)),df['seconds_elapsed'].isin(period_end_secs)),"line-change","on-the-fly"),"")
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
df['
|
591
|
-
df['
|
592
|
-
|
593
|
-
|
594
|
-
df['
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
'away_team_abbr','home_team_abbr','home_team_defending_side',
|
613
|
-
'away_score','away_fenwick',
|
614
|
-
'home_score','home_fenwick',
|
615
|
-
'away_goalie','home_goalie']
|
616
|
-
away_on = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6']
|
617
|
-
home_on = ['home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6']
|
618
|
-
|
619
|
-
#Forward fill appropriate columns
|
620
|
-
for col in ffill_col+away_on+home_on:
|
1074
|
+
df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
|
1075
|
+
try:
|
1076
|
+
df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
|
1077
|
+
except:
|
1078
|
+
""
|
1079
|
+
|
1080
|
+
#Add time since last event and overall event length
|
1081
|
+
df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
|
1082
|
+
df['event_length'] = df['seconds_since_last'].shift(-1)
|
1083
|
+
|
1084
|
+
#Add fixed strength state column
|
1085
|
+
df['strength_state_venue'] = df['away_skaters'].astype(str)+'v'+df['home_skaters'].astype(str)
|
1086
|
+
|
1087
|
+
#Retrieve coaches
|
1088
|
+
coaches = info['coaches']
|
1089
|
+
if not coaches:
|
1090
|
+
df['away_coach'] = ""
|
1091
|
+
df['home_coach'] = ""
|
1092
|
+
df['event_coach'] = ""
|
1093
|
+
else:
|
1094
|
+
df['away_coach'] = coaches['away']
|
1095
|
+
df['home_coach'] = coaches['home']
|
1096
|
+
df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))
|
1097
|
+
|
1098
|
+
#Forward fill as necessary
|
1099
|
+
cols = ['period_type','home_team_defending_side','away_score','away_fenwick','home_score','home_fenwick','away_coach','home_coach']
|
1100
|
+
for col in cols:
|
1101
|
+
try: df[col]
|
1102
|
+
except: df[col] = ""
|
621
1103
|
df[col] = df[col].ffill()
|
622
1104
|
|
623
|
-
#Now that forward fill is complete, replace "REMOVE" with nan
|
624
|
-
df.replace("REMOVE",np.nan,inplace=True)
|
625
|
-
|
626
|
-
#Reconfigure strength state and sitution codes
|
627
|
-
df['away_skaters'] = df[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
|
628
|
-
df['home_skaters'] = df[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
|
629
|
-
df['away_goalie_in'] = np.where(df['away_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
|
630
|
-
df['home_goalie_in'] = np.where(df['home_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
|
631
|
-
|
632
|
-
df['event_skaters'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['home_skaters'],df['away_skaters'])
|
633
|
-
df['event_skaters_against'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['away_skaters'],df['home_skaters'])
|
634
|
-
|
635
|
-
df['strength_state'] = df['event_skaters'].astype(str) + "v" + df['event_skaters_against'].astype(str)
|
636
|
-
df['situation_code'] = np.where(df['situation_code'].isna(),df['away_goalie_in'].astype(str) + df['away_skaters'].astype(str) + df['home_skaters'].astype(str) + df['home_goalie_in'].astype(str),df['situation_code'])
|
637
|
-
|
638
1105
|
#Return: complete play-by-play with all important data for each event in a provided game
|
639
|
-
return df[get_col()].replace(r'^\s*$', np.nan, regex=True)
|
1106
|
+
return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)
|