wsba-hockey 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +858 -377
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +374 -0
- wsba_hockey/wsba_main.py +725 -123
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/METADATA +49 -11
- wsba_hockey-0.1.4.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.1.dist-info/RECORD +0 -8
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.1.dist-info → wsba_hockey-0.1.4.dist-info}/top_level.txt +0 -0
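The headline change in this release is a rewrite of the scraping pipeline in wsba_hockey/tools/scraping.py, diffed below. As a rough sketch of how the new per-game flow fits together, based on the functions added in this diff (the import path is an assumption; the released wheel may re-export these helpers through wsba_main instead):

    # Hypothetical driver for the 0.1.4 scraping flow: get_game_info() builds
    # the shared game-info dict; combine_data() merges JSON pbp, HTML pbp, and
    # shift events into one play-by-play DataFrame.
    from wsba_hockey.tools.scraping import get_game_info, combine_data

    info = get_game_info('2023020001')   # NHL game id; value is illustrative
    pbp = combine_data(info)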
wsba_hockey/tools/scraping.py
CHANGED
@@ -1,105 +1,166 @@
 import re
-from bs4 import BeautifulSoup
-import
-import
-
+from bs4 import BeautifulSoup
+import requests as rs
+import json as json_lib
+from tools.utils.shared import *
 import numpy as np
 import pandas as pd
 import warnings
-import requests as rs
-from zipfile import ZipFile
 warnings.filterwarnings('ignore')
 
 ### SCRAPING FUNCTIONS ###
 # Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #
 
-##
-
-
+## ORDER OF OPERATIONS ##
+# Create game information to use with all functions
+# Retreive JSON data
+# Parse JSON data
+# Retreive and clean HTML pbp with player information
+# Parse HTML pbp, return parsed HTML
+# Combine pbp data
+# Retreive and analyze HTML shifts with player information for home and away teams
+# Parse shift events
+# Combine all data, return complete play-by-play
+
+## UTILITY FUNCTIONS ##
+def get_col():
+    return [
+        'season','season_type','game_id','game_date',"start_time","venue","venue_location",
+        'away_team_abbr','home_team_abbr','event_num','period','period_type',
+        'seconds_elapsed',"situation_code","strength_state","strength_state_venue","home_team_defending_side",
+        "event_type_code","event_type","description","penalty_duration",
+        "event_team_abbr","event_team_venue",
+        'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
+        "event_player_1_name","event_player_2_name","event_player_3_name",
+        "event_player_1_id","event_player_2_id","event_player_3_id",
+        "event_player_1_pos","event_player_2_pos","event_player_3_pos",
+        "event_goalie_name","event_goalie_id",
+        "shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
+        "event_skaters","away_skaters","home_skaters",
+        "event_distance","event_angle","event_length","seconds_since_last",
+        "away_score","home_score", "away_fenwick", "home_fenwick","away_sog","home_sog",
+        "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
+        "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
+        "away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
+        "home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
+        "event_coach","away_coach","home_coach"
+    ]
+
+
+## JSON FUNCTIONS ##
+def get_game_roster(json):
+    #Given raw json data, return game rosters
     roster = pd.json_normalize(json['rosterSpots'])
-
-    home = info['homeTeam.id'][0]
-    away = info['awayTeam.id'][0]
-
-    #Add up to four alternative names for each player in the game
-    roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
-    try: roster['playerName_2'] = roster['firstName.cs']+" "+roster['lastName.default']
-    except: roster['playerName_2'] = ""
-    try: roster['playerName_3'] = roster['firstName.de']+" "+roster['lastName.default']
-    except: roster['playerName_3'] = ""
-    try: roster['playerName_4'] = roster['firstName.es']+" "+roster['lastName.default']
-    except: roster['playerName_4'] = ""
-
-    #For each home/away player their name is included as a key and their id or position is the value
-    home_players = {}
-    home_id = roster.loc[roster['teamId']==home]
-    hid = list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])
-    hpos = list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])
-    hp = list(home_id['playerName'])+list(home_id['playerName_2'])+list(home_id['playerName_3'])+list(home_id['playerName_4'])
-
-    for id, pos, player in zip(hid,hpos,hp):
-        try: home_players.update({player.upper():
-            {result:id if result == 'id' else pos}})
-        except:
-            continue
+    roster['full_name'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()
 
-
-
-    aid = list(away_id['playerId'])+list(away_id['playerId'])+list(away_id['playerId'])+list(away_id['playerId'])
-    apos = list(away_id['positionCode'])+list(away_id['positionCode'])+list(away_id['positionCode'])+list(away_id['positionCode'])
-    ap = list(away_id['playerName'])+list(away_id['playerName_2'])+list(away_id['playerName_3'])+list(away_id['playerName_4'])
-
-    for id, pos, player in zip(aid,apos,ap):
-        try: away_players.update({player.upper():
-            {result:id if result == 'id' else pos}})
-        except:
-            continue
-
-    #Return: Dict of away and home players keyed with id or position as value
-    return {
-        'home':home_players,
-        'away':away_players
-    }
+    #Return: roster information
+    return roster
 
-def
-#Given
+def get_game_coaches(game_id):
+    #Given game info, return head coaches for away and home team
+
+    #Retreive data
+    json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
+    data = json['gameInfo']
 
-
-
-
+    #Add coaches
+    try:
+        away = data['awayTeam']['headCoach']['default'].upper()
+        home = data['homeTeam']['headCoach']['default'].upper()
+
+        coaches = {'away':away,
+                   'home':home}
+    except KeyError:
+        return {}
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
+    #Return: dict with coaches
+    return coaches
+
+def get_game_info(game_id):
+    #Given game_id, return game information
+
+    #Retreive data
+    api = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
+    json = rs.get(api).json()
+
+    #Games don't always have JSON shifts, for whatever reason
+    shifts = f"https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={game_id}"
+    shifts = rs.get(shifts).json()
+    json_shifts = pd.json_normalize(shifts['data'])
+
+    if shifts['total'] == 0:
+        json_shifts = pd.DataFrame()
+
+    #Split information
+    base = pd.json_normalize(json)
+    game_id = base['id'][0]
+    season = base['season'][0]
+    season_type = base['gameType'][0]
+    game_date = base['gameDate'][0]
+    game_state = base['gameState'][0]
+    start_time = base['startTimeUTC'][0]
+    venue = base['venue.default'][0]
+    venue_location = base['venueLocation.default'][0]
+    away_team_id = base['awayTeam.id'][0]
+    away_team_abbr = base['awayTeam.abbrev'][0]
+    home_team_id = base['homeTeam.id'][0]
+    home_team_abbr = base['homeTeam.abbrev'][0]
+
+    #Add roster
+    roster = get_game_roster(json)
+    #In the HTML parsing process, player are identified by a regex pattern (ABB #00 such as BOS #37) or number and name in the following format: #00 NAME (i.e. #37 BERGERON) so these are added as IDs of sorts.
+    roster['descID'] = '#'+roster['sweaterNumber'].astype(str)+" "+roster['lastName.default'].str.upper()
+    roster['team_abbr'] = roster['teamId'].replace({
+        away_team_id:[away_team_abbr],
+        home_team_id:[home_team_abbr]
+    })
+    roster['key'] = roster['team_abbr'] + " #" + roster['sweaterNumber'].astype(str)
 
+    #Create an additional roster dictionary for use with HTML parsing
+    #Roster dict
+    roster_dict = {'away':{},
+                   'home':{}}
+
+    #Evaluate and add players by team
+    for team in ['away','home']:
+        abbr = (away_team_abbr if team == 'away' else home_team_abbr)
+        rost = roster.loc[roster['team_abbr']==abbr]
+
+        #Now iterate through team players
+        for player,id,num,pos,team_abbr,key in zip(rost['full_name'],rost['playerId'],rost['sweaterNumber'],rost['positionCode'],rost['team_abbr'],rost['key']):
+            roster_dict[team].update({str(num):[key, pos, player, team_abbr, id]})
+
+    #Return: game information
+    return {"game_id":str(game_id),
+            "season":season,
+            "season_type":season_type,
+            "game_date":game_date,
+            "game_state":game_state,
+            "start_time":start_time,
+            'venue':venue,
+            'venue_location':venue_location,
+            'away_team_id':away_team_id,
+            'away_team_abbr':away_team_abbr,
+            'home_team_id':home_team_id,
+            'home_team_abbr':home_team_abbr,
+            'events':pd.json_normalize(json['plays']).reset_index(drop=True),
+            'rosters':roster,
+            'HTML_rosters':roster_dict,
+            'coaches':get_game_coaches(game_id),
+            'json_shifts':json_shifts}
+
+def parse_json(info):
+    #Given game info, return JSON document
+
+    #Retreive data
+    events = info['events']
+
+    #Return error if game is set in the future
+    if info['game_state'] == 'FUT':
+        raise ValueError(f"Game {info['id'][0]} has not occured yet.")
+
     #Test columns
-    cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date'
+    cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']
 
     for col in cols:
         try:events[col]
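The HTML_rosters dictionary built above keys each team's players by sweater number (as a string) and stores a five-element list per player. An illustrative entry, using the "#37 BERGERON" example from the code's own comment (treat the whole row, including the id, as hypothetical):

    # rosters[team][number] -> [key, position, full name, team abbr, player id]
    rosters = {'home': {'37': ['BOS #37', 'C', 'PATRICE BERGERON', 'BOS', 8470638]},
               'away': {}}
    key, pos, name, abbr, player_id = rosters['home']['37']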
@@ -120,46 +181,38 @@ def parse_json(json):
 
     events['event_player_3_id'] = events['details.assist2PlayerId']
 
-    events['
+    events['event_team_venue'] = np.where(events['details.eventOwnerTeamId']==info['home_team_id'],"home","away")
 
     #Coordinate adjustments:
     #The WSBA NHL Scraper includes three sets of coordinates per event:
     # x, y - Raw coordinates from JSON pbpp
     # x_fixed, y_fixed - Coordinates fixed to the right side of the ice (x is always greater than 0)
     # x_adj, y_adj - Adjusted coordinates configuring away events with negative x vlaues while home events are always positive
-    events['x_fixed'] = abs(events['details.xCoord'])
-    events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
-    events['x_adj'] = np.where(events['event_team_status']=="home",events['x_fixed'],-events['x_fixed'])
-    events['y_adj'] = np.where(events['event_team_status']=="home",events['y_fixed'],-events['y_fixed'])
-    events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
-    events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
-
-    events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(teams)
-
-    #Event player information includes ids (included in the JSON events), names (from "rosterSpots"), and positions (also from "rosterSpots")
-    events['event_player_1_name'] = events['event_player_1_id'].replace(players)
-    events['event_player_2_name'] = events['event_player_2_id'].replace(players)
-    events['event_player_3_name'] = events['event_player_3_id'].replace(players)
-
-    events['event_player_1_pos'] = events['event_player_1_id'].replace(players_pos)
-    events['event_player_2_pos'] = events['event_player_2_id'].replace(players_pos)
-    events['event_player_3_pos'] = events['event_player_3_id'].replace(players_pos)
-
-    events['event_goalie_name'] = events['details.goalieInNetId'].replace(players)
-
-    #Create situations given situation code (this is reconfigured with on ice skaters when provided shifts data)
-    events['away_skaters'] = events['situationCode'].astype(str).str.slice(start=1,stop=2)
-    events['home_skaters'] = events['situationCode'].astype(str).str.slice(start=2,stop=3)
-    events['event_skaters'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['home_skaters'],events['away_skaters'])
-    events['event_skaters_against'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['away_skaters'],events['home_skaters'])
-
-    events['strength_state'] = events['event_skaters']+"v"+events['event_skaters_against']
-    events['strength'] = np.where(events['event_skaters']==events['event_skaters_against'],
-                                  "EV",np.where(
-                                  events['event_skaters']>events['event_skaters_against'],
-                                  "PP","SH"
-    ))
 
+    #Some games (mostly preseason and all star games) do not include coordinates.
+    try:
+        events['x_fixed'] = abs(events['details.xCoord'])
+        events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
+        events['x_adj'] = np.where(events['event_team_venue']=="home",events['x_fixed'],-events['x_fixed'])
+        events['y_adj'] = np.where(events['event_team_venue']=="home",events['y_fixed'],-events['y_fixed'])
+        events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
+        events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
+    except TypeError:
+        print(f"No coordinates found for game {info['id'][0]}...")
+
+        events['x_fixed'] = np.nan
+        events['y_fixed'] = np.nan
+        events['x_adj'] = np.nan
+        events['y_adj'] = np.nan
+        events['event_distance'] = np.nan
+        events['event_angle'] = np.nan
+
+
+    events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace({
+        info['away_team_id']:[info['away_team_abbr']],
+        info['home_team_id']:[info['home_team_abbr']]
+    })
+
     #Rename columns to follow WSBA naming conventions
     events = events.rename(columns={
         "eventId":"event_id",
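A worked example of the coordinate sets and the derived shot geometry above, measured against the net at (89, 0): a home-team shot recorded at raw (x, y) = (-60, 22) reflects to x_fixed = 60, y_fixed = -22, keeps its sign as x_adj = 60, y_adj = -22, and yields:

    import numpy as np

    x, y = -60, 22                  # raw JSON coordinates, home-team shot
    x_fixed, y_fixed = abs(x), -y   # flipped to the right side since x < 0
    distance = np.sqrt((89 - x_fixed)**2 + y_fixed**2)               # ~36.4 ft
    angle = np.degrees(np.arctan2(abs(y_fixed), abs(89 - x_fixed)))  # ~37.2 deg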
@@ -184,14 +237,12 @@ def parse_json(json):
     })
 
     #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
-    events['
-    events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
-                                                ((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
-                                                ((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
-    events['period_seconds_remaining'] = 1200-events['period_seconds_elapsed']
+    events['period_seconds_elapsed'] = events['period_time_elasped'].apply(convert_to_seconds)
     events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']
+
+    events = events.loc[(events['event_type']!="")]
 
-    #
+    #Assign score and fenwick for each event
     fenwick_events = ['missed-shot','shot-on-goal','goal']
     ag = 0
     ags = []
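convert_to_seconds replaces the old string arithmetic; it lives in the new tools/utils/shared.py, which this diff view does not expand. A plausible minimal version consistent with how it is applied to MM:SS period clocks here (the actual helper may handle more formats):

    def convert_to_seconds(time_str):
        # 'MM:SS' elapsed in the period -> total seconds, e.g. '12:34' -> 754
        minutes, seconds = time_str.split(':')
        return int(minutes) * 60 + int(seconds)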
@@ -202,16 +253,16 @@ def parse_json(json):
     afs = []
     hf = 0
     hfs = []
-    for event,team in zip(list(events['event_type']),list(events['
+    for event,team in zip(list(events['event_type']),list(events['event_team_venue'])):
         if event in fenwick_events:
             if team == "home":
-                hf
+                hf += 1
                 if event == 'goal':
-                    hg
+                    hg += 1
             else:
-                af
+                af += 1
                 if event == 'goal':
-                    ag
+                    ag += 1
 
         ags.append(ag)
         hgs.append(hg)
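The rewritten loop keeps four running totals and appends them after processing each event, so every row carries the score and fenwick count including that event. A short illustrative trace:

    fenwick_events = ['missed-shot', 'shot-on-goal', 'goal']
    events = [('faceoff', 'home'), ('shot-on-goal', 'home'), ('goal', 'away')]

    hf = af = hg = ag = 0
    for event, team in events:
        if event in fenwick_events:
            if team == 'home':
                hf += 1
                hg += (event == 'goal')
            else:
                af += 1
                ag += (event == 'goal')
        print(event, '-> home_fenwick:', hf, 'away_score:', ag)
    # faceoff -> home_fenwick: 0 away_score: 0
    # shot-on-goal -> home_fenwick: 1 away_score: 0
    # goal -> home_fenwick: 1 away_score: 1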
@@ -222,84 +273,561 @@ def parse_json(json):
     events['home_score'] = hgs
     events['away_fenwick'] = afs
     events['home_fenwick'] = hfs
-
-    events = events.loc[(events['event_type']!="")&(events['event_type']!="game-end")]
 
-    #Return: dataframe with parsed
+    #Return: dataframe with parsed game
     return events
 
+### ESPN SCRAPING FUNCTIONS ###
+def espn_game_id(date,away,home):
+    #Given a date formatted as YYYY-MM-DD and teams, return game id from ESPN schedule
+    date = date.replace("-","")
+
+    #Retreive data
+    api = f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date}"
+    schedule = pd.json_normalize(rs.get(api).json()['events'])
+
+    #Create team abbreviation columns
+    schedule['away_team_abbr'] = schedule['shortName'].str[:3].str.strip(" ")
+    schedule['home_team_abbr'] = schedule['shortName'].str[-3:].str.strip(" ")
+
+    #Modify team abbreviations as necessary
+    schedule = schedule.replace({
+        "LA":"LAK",
+        "NJ":"NJD",
+        "SJ":"SJS",
+        "TB":"TBL",
+    })
+
+    #Retreive game id
+    game_id = schedule.loc[(schedule['away_team_abbr']==away)&
+                           (schedule['home_team_abbr']==home),'id'].tolist()[0]
 
+    #Return: ESPN game id
+    return game_id
 
-
-
-
-
-
+def parse_espn(date,away,home):
+    #Given a date formatted as YYYY-MM-DD and teams, return game events
+    game_id = espn_game_id(date,away,home)
+    url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'
+
+    #Code modified from Patrick Bacon
 
-
-
-
+    #Retreive game events as json
+    page = rs.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
+    soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
+    json = json_lib.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])
 
-
-
-    return td, get_teams(soup)
+    #DataFrame of time-related info for events
+    clock_df = pd.DataFrame()
 
+    for period in range(0, len(json)):
+        clock_df = clock_df._append(pd.DataFrame(json[period]))
 
-
-
-    #
-
-
+    clock_df = clock_df[~pd.isna(clock_df.clock)]
+
+    # Needed to add .split(',"st":3')[0] for playoffs
+
+    #DataFrame of coordinates for events
+    coords_df = pd.DataFrame(json_lib.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))
+
+    clock_df = clock_df.assign(
+        clock = clock_df.clock.apply(lambda x: x['displayValue'])
+    )
+
+    coords_df = coords_df.assign(
+        coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
+        coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
+        event_player_1_name = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
+    )
+
+    #Combine
+    espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])
+
+    espn_events = espn_events.assign(
+        period = espn_events['period'].apply(lambda x: x['number']),
+        minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
+        seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
+        event_type = espn_events['type'].apply(lambda x: x['txt'])
+    )
+
+    espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
+                                     (espn_events.event_type=='Face Off'), 0, espn_events.coords_x
+                                     ),
+                                     coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
+                                     (espn_events.event_type=='Face Off'), 0, espn_events.coords_y))
+
+    espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1_name))]
+
+    espn_events = espn_events.assign(
+        coords_x = espn_events.coords_x.astype(int),
+        coords_y = espn_events.coords_y.astype(int)
+    )
+
+    #Rename events
+    #The turnover event includes just one player in the event information, meaning takeaways will have no coordinates for play-by-plays created by ESPN scraping
+    espn_events['event_type'] = espn_events['event_type'].replace({
+        "Face Off":'faceoff',
+        "Hit":'hit',
+        "Shot":'shot-on-goal',
+        "Missed":'missed-shot',
+        "Blocked":'blocked-shot',
+        "Goal":'goal',
+        "Turnover":'giveaway',
+        "Delayed Penalty":'delayed-penalty',
+        "Penalty":'penalty',
+    })
+
+    #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
+    espn_events['period_time_simple'] = espn_events['clock'].str.replace(":","",regex=True)
+    espn_events['period_seconds_elapsed'] = np.where(espn_events['period_time_simple'].str.len()==3,
+                                                     ((espn_events['period_time_simple'].str[0].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)),
+                                                     ((espn_events['period_time_simple'].str[0:2].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)))
+    espn_events['seconds_elapsed'] = ((espn_events['period']-1)*1200)+espn_events['period_seconds_elapsed']
+
+    espn_events = espn_events.rename(columns = {'text':'description'})
+
+    #Add event team
+    espn_events['event_team_abbr'] = espn_events['homeAway'].replace({
+        "away":away,
+        "home":home
+    })
+
+    #Some games (mostly preseason and all star games) do not include coordinates.
+    try:
+        espn_events['x_fixed'] = abs(espn_events['coords_x'])
+        espn_events['y_fixed'] = np.where(espn_events['coords_x']<0,-espn_events['coords_y'],espn_events['coords_y'])
+        espn_events['x_adj'] = np.where(espn_events['homeAway']=="home",espn_events['x_fixed'],-espn_events['x_fixed'])
+        espn_events['y_adj'] = np.where(espn_events['homeAway']=="home",espn_events['y_fixed'],-espn_events['y_fixed'])
+        espn_events['event_distance'] = np.sqrt(((89 - espn_events['x_fixed'])**2) + (espn_events['y_fixed']**2))
+        espn_events['event_angle'] = np.degrees(np.arctan2(abs(espn_events['y_fixed']), abs(89 - espn_events['x_fixed'])))
+    except TypeError:
+        print(f"No coordinates found for ESPN game...")
+
+        espn_events['x_fixed'] = np.nan
+        espn_events['y_fixed'] = np.nan
+        espn_events['x_adj'] = np.nan
+        espn_events['y_adj'] = np.nan
+        espn_events['event_distance'] = np.nan
+        espn_events['event_angle'] = np.nan
+
+    #Assign score and fenwick for each event
+    fenwick_events = ['missed-shot','shot-on-goal','goal']
+    ag = 0
+    ags = []
+    hg = 0
+    hgs = []
+
+    af = 0
+    afs = []
+    hf = 0
+    hfs = []
+    for event,team in zip(list(espn_events['event_type']),list(espn_events['homeAway'])):
+        if event in fenwick_events:
+            if team == "home":
+                hf += 1
+                if event == 'goal':
+                    hg += 1
+            else:
+                af += 1
+                if event == 'goal':
+                    ag += 1
+
+        ags.append(ag)
+        hgs.append(hg)
+        afs.append(af)
+        hfs.append(hf)
+
+    espn_events['away_score'] = ags
+    espn_events['home_score'] = hgs
+    espn_events['away_fenwick'] = afs
+    espn_events['home_fenwick'] = hfs
+    #Return: play-by-play events in supplied game from ESPN
+    return espn_events
+
+## HTML PBP FUNCTIONS ##
+def strip_html_pbp(td,rosters):
+    #Given html row, parse data from HTML pbp
+    #Harry Shomer's Code (modified)
+
+    #HTML Parsing
+    for y in range(len(td)):
+        # Get the 'br' tag for the time column...this get's us time remaining instead of elapsed and remaining combined
+        if y == 3:
+            td[y] = td[y].get_text()   # This gets us elapsed and remaining combined-< 3:0017:00
+            index = td[y].find(':')
+            td[y] = td[y][:index+3]
+        elif (y == 6 or y == 7) and td[0] != '#':
+            # 6 & 7-> These are the player 1 ice one's
+            # The second statement controls for when it's just a header
+            baz = td[y].find_all('td')
+            bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]  # Because of previous step we get repeats...delete some
+
+            # The setup in the list is now: Name/Number->Position->Blank...and repeat
+            # Now strip all the html
+            players = []
+            for i in range(len(bar)):
+                if i % 3 == 0:
+                    try:
+                        #Using the supplied json we can bind player name and id to number and team
+                        #Find number and team of player then lookup roster dictionary
+
+                        number = bar[i].get_text().strip('\n')  # Get number and strip leading/trailing newlines
+                        if y == 6:
+                            team = 'away'
+                        else:
+                            team = 'home'
+
+                        id = rosters[team][str(number)][4]
+                        name = rosters[team][str(number)][2]
+                        position = rosters[team][str(number)][1]
+
+                    except KeyError:
+                        name = ''
+                        number = ''
+                        id = ''
+                elif i % 3 == 1:
+                    if name != '':
+                        players.append([name, number, position, id])
+
+            td[y] = players
+        else:
+            td[y] = td[y].get_text()
+
+    return td
+
+
+def clean_html_pbp(info):
+    #Harry Shomer's Code (modified)
 
-
-
-
-
+    game_id = info['game_id']
+    #Retreive data
+    season = info['season']
+    doc = f"https://www.nhl.com/scores/htmlreports/{season}/PL{game_id[-6:]}.HTM"
+    html = rs.get(doc).content
+    soup = get_contents(html)
 
-
+    #Rosters
+    rosters = info['HTML_rosters']
 
-    #
-
+    # Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
+    td = [soup[i:i + 8] for i in range(0, len(soup), 8)]
+
+    cleaned_html = [strip_html_pbp(x,rosters) for x in td]
+
+    return cleaned_html
+
+def parse_html(info):
+    #Given game info, return HTML event data
+
+    #Retreive game information and html events
+    rosters = info['HTML_rosters']
+    events = clean_html_pbp(info)
+
+    teams = {info['away_team_abbr']:['away'],
+             info['home_team_abbr']:['home']}
+
+    #Parsing
+    event_log = []
+    for event in events:
+        events_dict = {}
+        if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM','SPC','PBOX','SOC'] or event[3]=='-16:0-':
+            continue
+        else:
+            #Event info
+            events_dict['event_num'] = int(event[0])
+            events_dict['period'] = int(event[1])
+            events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
+            events_dict['period_time_elapsed'] = event[3]
+            events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
+            events_dict['event_type'] = event[4]
+
+            desc = re.sub(u'\xa0'," ",event[5])
+            events_dict['description'] = desc
+
+            events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
+            zone = [x for x in desc.split(',') if 'Zone' in x]
+            if not zone:
+                events_dict['zone_code'] = None
+            elif zone[0].find("Off") != -1:
+                events_dict['zone_code'] = 'O'
+            elif zone[0].find("Neu") != -1:
+                events_dict['zone_code'] = 'N'
+            elif zone[0].find("Def") != -1:
+                events_dict['zone_code'] = 'D'
+
+            #Convert team names for compatiblity
+            replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
+            for name, repl in replace:
+                desc = desc.replace(repl,name)
+
+            event_team = desc[0:3] if desc[0:3] in teams.keys() else ""
+            events_dict['event_team_abbr'] = event_team
+
+            events_dict['away_team_abbr'] = info['away_team_abbr']
+            events_dict['home_team_abbr'] = info['home_team_abbr']
+
+            away_skaters = 0
+            away_goalie = 0
+            #Away on-ice
+            for i in range(len(event[6])):
+                player = event[6][i][0]
+                pos = event[6][i][2]
+                id = event[6][i][3]
+
+                if pos == 'G':
+                    events_dict['away_goalie'] = player
+                    events_dict['away_goalie_id'] = id
+                    away_goalie += 1
+                else:
+                    events_dict[f'away_on_{i+1}'] = player
+                    events_dict[f'away_on_{i+1}_id'] = id
+                    away_skaters += 1
+
+            home_skaters = 0
+            home_goalie = 0
+            #Home on-ice
+            for i in range(len(event[7])):
+                player = event[7][i][0]
+                pos = event[7][i][2]
+                id = event[7][i][3]
+
+                if pos == 'G':
+                    events_dict['home_goalie'] = player
+                    events_dict['home_goalie_id'] = id
+                    home_goalie += 1
+                else:
+                    events_dict[f'home_on_{i+1}'] = player
+                    events_dict[f'home_on_{i+1}_id'] = id
+                    home_skaters += 1
+
+            event_players = []
+            #Determine parsing route based on event
+            if event[4] in ['FAC','HIT','BLOCK','PENL']:
+                #Regex to find team and player number involved (finds all for each event)
+                #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
+                regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
+                fac = regex.findall(desc)
+                #Filter incorrectly parsed teams
+                repl = []
+                for team, num in fac:
+                    if team in teams.keys():
+                        repl.append((team,num))
+                fac = repl
+
+                #Find first event player
+                ep1_num = ''
+                for i in range(len(fac)):
+                    team, num = fac[i]
+                    if team == event_team:
+                        ep1_num = num
+                        event_players.append(fac[i])
+                    else:
+                        continue
+
+                #Find other players
+                for i in range(len(fac)):
+                    team, num = fac[i]
+                    if num == ep1_num:
+                        continue
+                    else:
+                        event_players.append(fac[i])
+            elif event[4]=='GOAL':
+                #Parse goal
+                regex = re.compile(r'#(\d+)\s+')
+                goal = regex.findall(desc)
+
+                #Add all involved players
+                for point in goal:
+                    #In this loop, point is a player number. We can assign event_team to all players in a goal
+                    event_players.append((event_team,str(point)))
+            elif event[4]=='DELPEN':
+                #Don't parse DELPEN events
+                #These events typically have no text but when they do it is often erroneous or otherwise problematic
+
+                ""
+            else:
+                #Parse single or no player events
+                regex = re.compile(r'#\d+')
+                fac = regex.findall(desc)
+
+                for i in range(len(fac)):
+                    num = fac[i].replace("#","")
+                    event_players.append((event_team,str(num)))
+
+            for i in range(len(event_players)):
+                #For each player, evaluate their event data, then retreive information from rosters
+                team, num = event_players[i]
+
+                status = teams[team]
+                data = rosters[status[0]]
+
+                events_dict[f'event_player_{i+1}_name'] = data[str(num)][2]
+                events_dict[f'event_player_{i+1}_id'] = data[str(num)][4]
+                events_dict[f'event_player_{i+1}_pos'] = data[str(num)][1]
+
+            events_dict['away_skaters'] = away_skaters
+            events_dict['home_skaters'] = home_skaters
+            events_dict['away_goalie_in'] = away_goalie
+            events_dict['home_goalie_in'] = home_goalie
+
+            event_skaters = away_skaters if info['away_team_abbr'] == event_team else home_skaters
+            event_skaters_against = away_skaters if info['home_team_abbr'] == event_team else home_skaters
+            events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
+            events_dict['event_skaters'] = np.where(event_team == info['home_team_abbr'],home_skaters,away_skaters)
+
+        event_log.append(pd.DataFrame([events_dict]))
+
+    data = pd.concat(event_log)
+    data['event_type'] = data['event_type'].replace({
+        "PGSTR": "pre-game-start",
+        "PGEND": "pre-game-end",
+        'GSTR':"game-start",
+        "ANTHEM":"anthem",
+        "PSTR":"period-start",
+        'FAC':"faceoff",
+        "SHOT":"shot-on-goal",
+        "BLOCK":"blocked-shot",
+        "STOP":"stoppage",
+        "MISS":"missed-shot",
+        "HIT":"hit",
+        "GOAL":"goal",
+        "GIVE":"giveaway",
+        "TAKE":"takeaway",
+        "DELPEN":"delayed-penalty",
+        "PENL":"penalty",
+        "CHL":"challenge",
+        "PEND":"period-end",
+        "GEND":"game-end"
+    })
+
+    #Return: parsed HTML pbp
+    return data
+
+def combine_pbp(info):
+    #Given game info, return complete play-by-play data for provided game
+
+    html_pbp = parse_html(info)
+
+    #Route data combining - json if season is after 2009-2010:
+    if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
+        #ESPN x HTML
+        espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'}).drop(columns=['event_player_1_name'])
+        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']
+
+        df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)
+
+    else:
+        #JSON x HTML
+        json_pbp = parse_json(info)
+        #Modify merge conditions and merge pbps
+        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
+        html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore')
+
+        #While rare sometimes column 'event_player_1_id' is interpreted differently between the two dataframes.
+        html_pbp['event_player_1_id'] = html_pbp['event_player_1_id'].astype(object)
+        json_pbp['event_player_1_id'] = json_pbp['event_player_1_id'].astype(object)
+
+        df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col)
+
+    #Add game info
+    info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
+                'away_team_abbr','home_team_abbr']
+
+    for col in info_col:
+        df[col] = info[col]
+
+    #Fill period_type column and assign shifts a sub-500 event code
+    df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
+    try: df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
+    except:
+        ""
+    df = df.sort_values(['period','seconds_elapsed']).reset_index()
+
+    df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
+
+    #Correct strength state for penalty shots and shootouts - most games dont have shifts in shootout and are disculuded otherwise
+    df['strength_state'] = np.where(np.logical_and(df['period'].astype(str)=='5',df['event_type'].isin(['missed-shot','shot-on-goal','goal'])),"1v0",df['strength_state'])
+    df['strength_state'] = np.where(df['description'].str.contains('Penalty Shot',case=False),"1v0",df['strength_state'])
+
+    col = [col for col in get_col() if col in df.columns.to_list()]
+    #Return: complete play-by-play information for provided game
+    return df[col]
+
+## SHIFT SCRAPING FUNCTIONS ##
+def parse_shifts_json(info):
+    #Given game info, return json shift chart
+
+    log = info['json_shifts']
+    #Filter non-shift events and duplicate events
+    log = log.loc[log['detailCode']==0].drop_duplicates(subset=['playerId','shiftNumber'])
+
+    #Add full name columns
+    log['player_name'] = (log['firstName'] + " " + log['lastName']).str.upper()
+
+    log = log.rename(columns={
+        'playerId':'player_id',
+        'teamAbbrev':'event_team_abbr',
+        'startTime':'start',
+        'endTime':'end'
+    })
+
+    #Convert time columns
+    log['start'] = log['start'].astype(str).apply(convert_to_seconds)
+    log['end'] = log['end'].astype(str).apply(convert_to_seconds)
+    log = log[['player_name','player_id',
+               'period','event_team_abbr',
+               'start','duration','end']]
+
+    #Recalibrate duration
+    log['duration'] = log['end'] - log['start']
+
+    #Return: JSON shifts (seperated by team)
+    away = log.loc[log['event_team_abbr']==info['away_team_abbr']]
+    home = log.loc[log['event_team_abbr']==info['home_team_abbr']]
+
+    return {'away':away,
+            'home':home}
+
+def analyze_shifts(shift, id, name, pos, team):
     #Collects teams in given shifts html (parsed by Beautiful Soup)
     #Modified version of Harry Shomer's analyze_shifts function in the hockey_scraper package
     shifts = dict()
 
     shifts['player_name'] = name.upper()
+    shifts['player_id'] = id
+    shifts['player_pos'] = pos
     shifts['period'] = '4' if shift[1] == 'OT' else '5' if shift[1] == 'SO' else shift[1]
-    shifts['
-    shifts['start'] =
-    shifts['duration'] =
+    shifts['event_team_abbr'] = get_team(team.strip(' '))
+    shifts['start'] = convert_to_seconds(shift[2].split('/')[0])
+    shifts['duration'] = convert_to_seconds(shift[4].split('/')[0])
 
-    #
+    #Sometimes there are no digits
     if re.compile(r'\d+').findall(shift[3].split('/')[0]):
-        shifts['end'] =
+        shifts['end'] = convert_to_seconds(shift[3].split('/')[0])
     else:
         shifts['end'] = shifts['start'] + shifts['duration']
-
-    try:
-        if home_team == team:
-            shifts['player_id'] = player_ids['home'][name.upper()]['id']
-        else:
-            shifts['player_id'] = player_ids['away'][name.upper()]['id']
-    except KeyError:
-        shifts['player_id'] = None
-
     return shifts
 
-def
-#
-#Stage one: create dataframe with raw individual shifts
-#Stage two: convert shift events to play-by-play structure created with json_parsing
+def parse_shifts_html(info,home):
+    #Parsing of shifts data for a single team in a provided game
     #Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package
 
+    #Roster info prep
+    roster = info['HTML_rosters']
 
+    rosters = roster['home' if home else 'away']
+
     all_shifts = []
-    columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
+    #columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
 
-
+    #Retreive HTML
+    game_id = info['game_id']
+    season = info['season']
+    link = f"https://www.nhl.com/scores/htmlreports/{season}/T{'H' if home else 'V'}{game_id[-6:]}.HTM"
+    doc = rs.get(link).content
+    td, teams = get_soup(doc)
 
     team = teams[0]
-    home_team = teams[1]
     players = dict()
 
     # Iterates through each player shifts table with the following data:
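One detail worth pulling out of parse_html above: the event-player pattern r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)' is widened from Harry Shomer's original so dotted report abbreviations still match, even though the function also normalizes L.A/N.J/S.J/T.B to three-letter codes a few lines earlier. A quick demonstration on a made-up description:

    import re

    regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
    desc = 'BOS #37 vs L.A #11'         # illustrative HTML description
    print(regex.findall(desc))          # [('BOS', '37'), ('L.A', '11')]

parse_html then drops any captured team that is not one of the game's two abbreviations.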
@@ -308,37 +836,55 @@ def parse_shifts(html, player_ids, game_id):
         t = t.get_text()
         if ',' in t:  # If a comma exists it is a player
             name = t
+
             name = name.split(',')
-
-
-
-
-
+            number = int(name[0][:2].strip())
+            id = rosters[str(number)][4]
+            players[id] = dict()
+
+            #HTML shift functions assess one team at a time, which simplifies the lookup process with number to name and id
+
+            players[id]['name'] = rosters[str(number)][2]
+            players[id]['pos'] = rosters[str(number)][1]
+
+            players[id]['shifts'] = []
         else:
-            players[
+            players[id]['shifts'].extend([t])
 
     for key in players.keys():
         # Create lists of shifts-table columns for analysis
         players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]
 
+        name = players[key]['name']
+        pos = players[key]['pos']
+
         # Parsing
-        shifts = [analyze_shifts(shift, key,
+        shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
         all_shifts.extend(shifts)
 
     df = pd.DataFrame(all_shifts)
-    df['game_id'] = str(game_id)
 
-    shifts_raw = df[
+    shifts_raw = df[df['duration'] > 0]
 
-
+    #Return: single-team individual shifts by player
+    return shifts_raw
+
+def parse_shift_events(info,home):
+    #Given game info and home team conditional, parse and convert document to shift events congruent to html play-by-play
+
+    #Determine whether to use JSON shifts or HTML shifts
+    if len(info['json_shifts']) == 0:
+        shift = parse_shifts_html(info,home)
+    else:
+        shift = parse_shifts_json(info)['home' if home else 'away']
+
+    rosters = info['rosters']
 
-    # Second-stage beginds here
     # Identify shift starts for each shift event
-    shifts_on =
+    shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
         num_on=('player_name', 'size'),
         players_on=('player_name', lambda x: ', '.join(x)),
-        ids_on=('player_id', lambda x: ', '.join(map(str,
+        ids_on=('player_id', lambda x: ', '.join(map(str,x))),
     ).reset_index()
 
     shifts_on = shifts_on.rename(columns={
@@ -346,10 +892,10 @@ def parse_shifts(html, player_ids, game_id):
     })
 
     # Identify shift stops for each shift event
-    shifts_off =
+    shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
         num_off=('player_name', 'size'),
         players_off=('player_name', lambda x: ', '.join(x)),
-        ids_off=('player_id', lambda x: ', '.join(map(str,
+        ids_off=('player_id', lambda x: ', '.join(map(str,x))),
     ).reset_index()
 
     shifts_off = shifts_off.rename(columns={
@@ -357,57 +903,29 @@ def parse_shifts(html, player_ids, game_id):
     })
 
     # Merge and sort by time in game
-    shifts = pd.merge(shifts_on, shifts_off, on=['
+    shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')
 
-    shifts = shifts
-
-    #Modify columns of new total shifts dataframe
-    shifts['period'] = shifts['period'].astype(int)
+    shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
     shifts['event_type'] = 'change'
-    shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200 * (shifts['period']-1))
-    shifts['game_seconds_remaining'] = 3600 - shifts['seconds_elapsed']
-
-    # Handle missing values at the start and end of periods
-    shifts['players_on'] = shifts['players_on'].fillna('None')
-    shifts['players_off'] = shifts['players_off'].fillna('None')
-    shifts['ids_on'] = shifts['ids_on'].fillna('0')
-    shifts['ids_off'] = shifts['ids_off'].fillna('0')
-    shifts['num_on'] = shifts['num_on'].fillna(0).astype(int)
-    shifts['num_off'] = shifts['num_off'].fillna(0).astype(int)
-
-    #Manual Team Rename
-    shifts['team_abbr'] = shifts['team_abbr'].replace({
-        "L.A":"LAK",
-        "N.J":"NJD",
-        "S.J":"SJS",
-        "T.B":"TBL"
-    })
 
-    #
-
+    #Shift events similar to html (remove shootout shifts)
+    shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])
+
+    #Generate on-ice columns
+    skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
+    goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
+    team = list(shift['event_team_abbr'])[0]
 
-def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
-    #Given roster info (from the retreive_players function), shifts df, and team, generate on_ice columns for shift events
-    #These on-ice columns configure the on-ice players for events in the json play by play as well
     skaters = pd.DataFrame()
     goalies = pd.DataFrame()
-
-        team = {key:value for key, value in rosters['home'].items() if value['pos'] != "G"}
-    else:
-        team = {key:value for key, value in rosters['away'].items() if value['pos'] != "G"}
-
-    names = list(team.keys())
-    try: names.remove("")
-    except ValueError: ""
-
-    for player in names:
+    for player in skater_names:
         #For each player in the game, determine when they began and ended shifts.
         #With player names as columns, 1 represents a shift event a player was on the ice for while 0 represents off the ice
         on_ice = (np.cumsum(
-            shifts.loc[(shifts['event_team_abbr'] ==
+            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x)))) -
-            shifts.loc[(shifts['event_team_abbr'] ==
+            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x))))
        ))
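The cumulative-sum trick above is compact enough to miss: each shift event contributes +1 when a player's id appears in ids_on and -1 when it appears in ids_off, so the running sum is 1 exactly while the player is on the ice. A toy illustration with hypothetical ids:

    import re
    import numpy as np
    import pandas as pd

    player = '8470638'                        # hypothetical player id
    shifts = pd.DataFrame({
        'ids_on':  ['8470638, 8471214', '', ''],
        'ids_off': ['', '', '8470638'],
    })
    on  = shifts['ids_on'].apply(lambda x: int(bool(re.search(player, x))))
    off = shifts['ids_off'].apply(lambda x: int(bool(re.search(player, x))))
    print(np.cumsum(on - off).tolist())       # [1, 1, 0]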
@@ -415,32 +933,22 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
 
     skaters = skaters.fillna(0).astype(int)
 
-
     on_skaters = (skaters == 1).stack().reset_index()
     on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()
 
     max_players = 6
     for i in range(max_players):
-        on_skaters[f"{'home' if home else 'away'}_on_{i+1}"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
+        on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
 
     on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
 
-    #Repeat
-
-        team = {key:value for key, value in rosters['home'].items() if value['pos'] == "G"}
-    else:
-        team = {key:value for key, value in rosters['away'].items() if value['pos'] == "G"}
-
-    names = list(team.keys())
-    try: names.remove("")
-    except ValueError: ""
-
-    for player in names:
+    #Repeat this process with goaltenders
+    for player in goalie_names:
         on_ice = (np.cumsum(
-            shifts.loc[(shifts['event_team_abbr'] ==
+            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x)))) -
-            shifts.loc[(shifts['event_team_abbr'] ==
+            shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
            .apply(str)
            .apply(lambda x: int(bool(re.search(player, x))))
        ))
@@ -453,7 +961,7 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
 
     max_players = 1
     for i in range(max_players):
-        on_goalies[f"{'home' if home else 'away'}
+        on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
 
     on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
 
@@ -461,87 +969,100 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
|
|
461
969
|
on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])
|
462
970
|
|
463
971
|
shifts['row'] = shifts.index
|
464
|
-
|
972
|
+
|
973
|
+
if home:
|
974
|
+
shifts['home_team_abbr'] = team
|
975
|
+
else:
|
976
|
+
shifts['away_team_abbr'] = team
|
465
977
|
#Return: shift events with newly added on-ice columns. NAN values are replaced with string "REMOVE" as means to create proper on-ice columns for json pbp
|
466
|
-
return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"
|
978
|
+
return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")
|
467
979
|
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
data_id = retreive_players(json)
|
980
|
+
## FINALIZE PBP FUNCTIONS ##
|
981
|
+
def combine_shifts(info):
|
982
|
+
#Given game info, return complete shift events
|
472
983
|
|
473
|
-
|
474
|
-
|
984
|
+
#JSON Prep
|
985
|
+
roster = info['rosters']
|
475
986
|
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
away_shifts = construct_skaters_matrix(data,away,pd.json_normalize(json)['awayTeam.abbrev'][0],False).fillna("REMOVE")
|
480
|
-
home_shifts = construct_skaters_matrix(data,home,pd.json_normalize(json)['homeTeam.abbrev'][0],True).fillna("REMOVE")
|
987
|
+
#Quickly combine shifts data
|
988
|
+
away = parse_shift_events(info,False)
|
989
|
+
home = parse_shift_events(info,True)
|
481
990
|
|
482
|
-
shifts
|
991
|
+
#Combine shifts
|
992
|
+
data = pd.concat([away,home]).sort_values(['period','seconds_elapsed'])
|
993
|
+
|
994
|
+
#Add game info
|
995
|
+
info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
|
996
|
+
'away_team_abbr','home_team_abbr']
|
483
997
|
|
484
|
-
|
485
|
-
|
998
|
+
for col in info_col:
|
999
|
+
data[col] = info[col]
|
1000
|
+
|
1001
|
+
#Create player information dicts to create on-ice names
|
1002
|
+
roster['playerId'] = roster['playerId'].astype(str)
|
1003
|
+
players = roster.set_index("playerId")['full_name'].to_dict()
|
486
1004
|
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
1005
|
+
for i in range(0,7):
|
1006
|
+
if i == 6:
|
1007
|
+
data['away_goalie'] = data['away_goalie_id'].replace(players)
|
1008
|
+
data['home_goalie'] = data['home_goalie_id'].replace(players)
|
1009
|
+
else:
|
1010
|
+
data[f'away_on_{i+1}'] = data[f'away_on_{i+1}_id'].replace(players)
|
1011
|
+
data[f'home_on_{i+1}'] = data[f'home_on_{i+1}_id'].replace(players)
|
491
1012
|
|
-
-
-
-
+    data = data.sort_values(['period','seconds_elapsed'])
+    #Fill on-ice columns down
+    on_ice_col = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
+                  'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
+                  'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
+                  'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',
+                  'away_goalie','home_goalie','away_goalie_id','home_goalie_id']
 
-
+    for col in on_ice_col:
+        data[col] = data[col].ffill()
 
-
-
-
-
+    #Create strength state information
+    away_on = ['away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',]
+    home_on = ['home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',]
+    data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
+    data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
+    data['strength_state'] = np.where(data['event_team_abbr']==data['away_team_abbr'],data['away_skaters'].astype(str)+"v"+data['home_skaters'].astype(str),data['home_skaters'].astype(str)+"v"+data['away_skaters'].astype(str))
 
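Skater counts fall out of the on-ice columns directly: empty strings are blanked to NaN so notna() counts only occupied slots, and the strength state string is then oriented from the event team's perspective. A two-row sketch of the counting idiom:

import numpy as np
import pandas as pd

data = pd.DataFrame({
    'away_on_1_id': ['1', '1'], 'away_on_2_id': ['2', '2'],
    'away_on_3_id': ['3', '3'], 'away_on_4_id': ['4', '4'],
    'away_on_5_id': ['5', ''],  'away_on_6_id': ['', ''],
    'home_on_1_id': ['6', '6'], 'home_on_2_id': ['7', '7'],
    'home_on_3_id': ['8', '8'], 'home_on_4_id': ['9', '9'],
    'home_on_5_id': ['10', '10'], 'home_on_6_id': ['', ''],
})
away_on = [f'away_on_{i}_id' for i in range(1, 7)]
home_on = [f'home_on_{i}_id' for i in range(1, 7)]

# Blank strings become NaN so notna() counts only filled on-ice slots
away_skaters = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
home_skaters = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
print(away_skaters.tolist(), home_skaters.tolist())  # [5, 4] [5, 5]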
-
-    for
-
-        "fullName":'default'}))
+    #Return: full shifts data converted to play-by-play format
+    col = [col for col in get_col() if col in data.columns.to_list()]
+    return data[col]
 
-
+def combine_data(info):
+    #Given game info, return complete play-by-play data
 
-
-        for default, alt in zip(names_df['default'],names_df['alt']):
-            if alt == np.nan or alt == "" or str(alt) == 'nan':
-                continue
-            else:
-                replace.update({alt:default})
-
-        return shifts_df.replace(replace,regex=True)
+    game_id = info['game_id']
 
-
-
-    df = pd.concat([json,html])
+    pbp = combine_pbp(info)
+    shifts = combine_shifts(info)
 
-    #
-    df
-    df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
+    #Combine data
+    df = pd.concat([pbp,shifts])
 
     #Create priority columns designed to order events that occur at the same time in a game
-    start_pri = ['period-start','game-start']
     even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
-    df['priority'] = np.where(df['event_type'].isin(
-                     np.where(df['event_type'].isin(even_pri),1,
+    df['priority'] = np.where(df['event_type'].isin(even_pri),1,
                      np.where(df['event_type']=='goal',2,
                      np.where(df['event_type']=='stoppage',3,
-                     np.where(df['event_type']=='penalty',4,
-                     np.where(df['event_type']=='
+                     np.where(df['event_type']=='delayed-penalty',4,
+                     np.where(df['event_type']=='penalty',5,
                      np.where(df['event_type']=='period-end',6,
-                     np.where(df['event_type']=='
-                     np.where(df['event_type']=='
+                     np.where(df['event_type']=='change',7,
+                     np.where(df['event_type']=='game-end',8,
+                     np.where(df['event_type']=='period-start',9,
+                     np.where(df['event_type']=='faceoff',10,0))))))))))
+
+    df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
+    df = df.sort_values(['period','seconds_elapsed','priority'])
 
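The nested np.where ladder gives every event type a numeric sort key so simultaneous events resolve sensibly: a goal sorts ahead of the stoppage, change, and faceoff that share its timestamp. A compact sketch of the effect, with a plain dict standing in for the ladder:

import pandas as pd

# Same keys the ladder assigns; a dict lookup replaces the nested np.where here
priority = {'goal': 2, 'stoppage': 3, 'change': 7, 'faceoff': 10}

df = pd.DataFrame({
    'seconds_elapsed': [754, 754, 754, 754],
    'event_type': ['faceoff', 'goal', 'stoppage', 'change'],
})
df['priority'] = df['event_type'].map(priority).fillna(0).astype(int)
df = df.sort_values(['seconds_elapsed', 'priority'])
print(df['event_type'].tolist())  # ['goal', 'stoppage', 'change', 'faceoff']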
-
-
+    #Recalibrate event_num column to accurately depict the order of all events, including changes
+    df.reset_index(inplace=True,drop=True)
     df['event_num'] = df.index+1
-    df['
+    df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
     df['event_type_last'] = df['event_type'].shift(1)
     df['event_type_last_2'] = df['event_type_last'].shift(1)
     df['event_type_next'] = df['event_type'].shift(-1)
@@ -550,76 +1071,36 @@ def combine_data(json,html):
 
     period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
     #Define shifts by "line-change" or "on-the-fly"
     df['shift_type'] = np.where(df['event_type']=='change',np.where(np.logical_or(np.logical_or(df['event_type_last'].isin(lag_events),df['event_type_last_2'].isin(lag_events),df['event_type_next'].isin(lead_events)),df['seconds_elapsed'].isin(period_end_secs)),"line-change","on-the-fly"),"")
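A change is tagged "line-change" when it borders a stoppage-like event or a period boundary, and "on-the-fly" otherwise, by peeking at the lagged and leading event types. A simplified sketch of that classification; lag_events and lead_events are defined earlier in scraping.py, so their contents here are assumed:

import numpy as np
import pandas as pd

lag_events = ['stoppage', 'goal']  # assumed stoppage-style types
lead_events = ['faceoff']          # assumed

df = pd.DataFrame({
    'event_type':      ['stoppage', 'change', 'shot-on-goal', 'change'],
    'seconds_elapsed': [300, 300, 415, 430],
})
df['event_type_last'] = df['event_type'].shift(1)
df['event_type_next'] = df['event_type'].shift(-1)

# Changes bordered by a stoppage-like event are line changes; the rest are on-the-fly
df['shift_type'] = np.where(
    df['event_type'] == 'change',
    np.where(df['event_type_last'].isin(lag_events) | df['event_type_next'].isin(lead_events),
             'line-change', 'on-the-fly'),
    '')
print(df['shift_type'].tolist())  # ['', 'line-change', '', 'on-the-fly']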
-
-
-
-
-
-
-
-    df['
-    df['
-
-
-    df['
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        'away_team_abbr','home_team_abbr','home_team_defending_side',
-        'away_score','away_fenwick',
-        'home_score','home_fenwick',
-        'away_goalie','home_goalie']
-    away_on = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6']
-    home_on = ['home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6']
-
-    #Forward fill appropriate columns
-    for col in ffill_col+away_on+home_on:
+    df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
+    try:
+        df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
+    except:
+        ""
+
+    #Add time since last event and overall event length
+    df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
+    df['event_length'] = df['seconds_since_last'].shift(-1)
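seconds_since_last looks back one row and event_length looks ahead, so every event carries the time until the next one. The shift mechanics on a toy clock:

import pandas as pd

df = pd.DataFrame({'seconds_elapsed': [0, 12, 30, 45]})
df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
df['event_length'] = df['seconds_since_last'].shift(-1)
print(df['event_length'].tolist())  # [12.0, 18.0, 15.0, nan]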
+
+    #Add fixed strength state column
+    df['strength_state_venue'] = df['away_skaters'].astype(str)+'v'+df['home_skaters'].astype(str)
+
+    #Retrieve coaches
+    coaches = info['coaches']
+    if not coaches:
+        df['away_coach'] = ""
+        df['home_coach'] = ""
+        df['event_coach'] = ""
+    else:
+        df['away_coach'] = coaches['away']
+        df['home_coach'] = coaches['home']
+        df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))
+
+    #Forward fill as necessary
+    cols = ['period_type','home_team_defending_side','away_score','away_fenwick','home_score','home_fenwick','away_coach','home_coach']
+    for col in cols:
+        try: df[col]
+        except: df[col] = ""
         df[col] = df[col].ffill()
 
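The loop guards against columns that never materialized for a given game before forward-filling: merely referencing df[col] raises KeyError when the column is missing, and the except seeds an empty one. The same guard in isolation:

import pandas as pd

df = pd.DataFrame({'away_score': [0, None, 1]})
for col in ['away_score', 'home_coach']:  # 'home_coach' is deliberately absent
    try:
        df[col]
    except KeyError:
        df[col] = ""
    df[col] = df[col].ffill()
print(df['away_score'].tolist())  # [0.0, 0.0, 1.0]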
-    #Now that forward fill is complete, replace "REMOVE" with nan
-    df.replace("REMOVE",np.nan,inplace=True)
-
-    #Reconfigure strength state and sitution codes
-    df['away_skaters'] = df[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
-    df['home_skaters'] = df[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
-    df['away_goalie_in'] = np.where(df['away_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
-    df['home_goalie_in'] = np.where(df['home_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
-
-    df['event_skaters'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['home_skaters'],df['away_skaters'])
-    df['event_skaters_against'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['away_skaters'],df['home_skaters'])
-
-    df['strength_state'] = df['event_skaters'].astype(str) + "v" + df['event_skaters_against'].astype(str)
-    df['situation_code'] = np.where(df['situation_code'].isna(),df['away_goalie_in'].astype(str) + df['away_skaters'].astype(str) + df['home_skaters'].astype(str) + df['home_goalie_in'].astype(str),df['situation_code'])
-
-    col = [
-        'season','season_type','game_id','game_date',"start_time","venue","venue_location",
-        'away_team_abbr','home_team_abbr','event_num','period','period_type',
-        'seconds_elapsed', "situation_code","strength_state","home_team_defending_side","shift_type",
-        "event_type_code","event_type","description","reason","penalty_duration","penalty_description",
-        "event_team_abbr",'num_on', 'players_on', 'ids_on', 'num_off', 'players_off', 'ids_off',
-        "event_team_status","event_player_1_id","event_player_2_id","event_player_3_id",
-        "event_player_1_name","event_player_2_name","event_player_3_name","event_player_1_pos","event_player_2_pos",
-        "event_player_3_pos","event_goalie_id",
-        "event_goalie_name","shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
-        "event_skaters","away_skaters","home_skaters",
-        "event_distance","event_angle","away_score","home_score", "away_fenwick", "home_fenwick",
-        "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
-        "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie"
-        ]
-
     #Return: complete play-by-play with all important data for each event in a provided game
-    return df[col].replace(r'^\s*$', np.nan, regex=True)
+    return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)
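The new return filters to the shared master column list from get_col() while tolerating columns a given game never produced; intersecting in get_col() order keeps output schemas consistent across games. The idiom on its own (master_cols stands in for get_col()):

import pandas as pd

master_cols = ['game_id', 'period', 'strength_state', 'x', 'y']  # stand-in for get_col()
df = pd.DataFrame({'period': [1], 'game_id': [2023020001], 'x': [45]})

# Keep master order; silently skip columns this game never produced
cols = [col for col in master_cols if col in df.columns.to_list()]
print(df[cols].columns.tolist())  # ['game_id', 'period', 'x']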