wsba-hockey 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
1
  import re
2
2
  from bs4 import BeautifulSoup
3
- import hockey_scraper.utils.shared as shared
3
+ import requests as rs
4
+ import json as json_lib
5
+ from tools.utils.shared import *
4
6
  import numpy as np
5
7
  import pandas as pd
6
8
  import warnings
@@ -9,97 +11,156 @@ warnings.filterwarnings('ignore')
9
11
  ### SCRAPING FUNCTIONS ###
10
12
  # Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #
11
13
 
12
- ## JSON DATA ##
13
- def retreive_players(json,result = "id"):
14
- #Given json data from an NHL API call, return dictionary with home and away players and either their id or their position.
14
+ ## ORDER OF OPERATIONS ##
15
+ # Create game information to use with all functions
16
+ # Retrieve JSON data
17
+ # Parse JSON data
18
+ # Retrieve and clean HTML pbp with player information
19
+ # Parse HTML pbp, return parsed HTML
20
+ # Combine pbp data
21
+ # Retrieve and analyze HTML shifts with player information for home and away teams
22
+ # Parse shift events
23
+ # Combine all data, return complete play-by-play
24
+
25
+ ## UTILITY FUNCTIONS ##
26
+ def get_col():
27
+ return [
28
+ 'season','season_type','game_id','game_date',"start_time","venue","venue_location",
29
+ 'away_team_abbr','home_team_abbr','event_num','period','period_type',
30
+ 'seconds_elapsed',"situation_code","strength_state","strength_state_venue","home_team_defending_side",
31
+ "event_type_code","event_type","description","penalty_duration",
32
+ "event_team_abbr","event_team_venue",
33
+ 'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
34
+ "event_player_1_name","event_player_2_name","event_player_3_name",
35
+ "event_player_1_id","event_player_2_id","event_player_3_id",
36
+ "event_player_1_pos","event_player_2_pos","event_player_3_pos",
37
+ "event_goalie_name","event_goalie_id",
38
+ "shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
39
+ "event_skaters","away_skaters","home_skaters",
40
+ "event_distance","event_angle","event_length","seconds_since_last",
41
+ "away_score","home_score", "away_fenwick", "home_fenwick","away_sog","home_sog",
42
+ "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
43
+ "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
44
+ "away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
45
+ "home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
46
+ "event_coach","away_coach","home_coach"
47
+ ]
48
+
49
+
50
+ ## JSON FUNCTIONS ##
51
+ def get_game_roster(json):
52
+ #Given raw json data, return game rosters
15
53
  roster = pd.json_normalize(json['rosterSpots'])
16
- info = pd.json_normalize(json)
17
- home = info['homeTeam.id'][0]
18
- away = info['awayTeam.id'][0]
19
-
20
- #Add up to four alternative names for each player in the game
21
- roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
22
- try: roster['playerName_2'] = roster['firstName.cs']+" "+roster['lastName.default']
23
- except: roster['playerName_2'] = ""
24
- try: roster['playerName_3'] = roster['firstName.de']+" "+roster['lastName.default']
25
- except: roster['playerName_3'] = ""
26
- try: roster['playerName_4'] = roster['firstName.es']+" "+roster['lastName.default']
27
- except: roster['playerName_4'] = ""
28
-
29
- #For each home/away player their name is included as a key and their id or position is the value
30
- home_players = {}
31
- home_id = roster.loc[roster['teamId']==home]
32
- hid = list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])
33
- hpos = list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])
34
- hp = list(home_id['playerName'])+list(home_id['playerName_2'])+list(home_id['playerName_3'])+list(home_id['playerName_4'])
35
-
36
- for id, pos, player in zip(hid,hpos,hp):
37
- try: home_players.update({player.upper():
38
- {result:id if result == 'id' else pos}})
39
- except:
40
- continue
54
+ roster['full_name'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()
41
55
 
42
- away_players = {}
43
- away_id = roster.loc[roster['teamId']==away]
44
- aid = list(away_id['playerId'])+list(away_id['playerId'])+list(away_id['playerId'])+list(away_id['playerId'])
45
- apos = list(away_id['positionCode'])+list(away_id['positionCode'])+list(away_id['positionCode'])+list(away_id['positionCode'])
46
- ap = list(away_id['playerName'])+list(away_id['playerName_2'])+list(away_id['playerName_3'])+list(away_id['playerName_4'])
47
-
48
- for id, pos, player in zip(aid,apos,ap):
49
- try: away_players.update({player.upper():
50
- {result:id if result == 'id' else pos}})
51
- except:
52
- continue
56
+ #Return: roster information
57
+ return roster
58
+
59
+ def get_game_coaches(game_id):
60
+ #Given game info, return head coaches for away and home team
53
61
 
54
- #Return: Dict of away and home players keyed with id or position as value
55
- return {
56
- 'home':home_players,
57
- 'away':away_players
58
- }
62
+ #Retrieve data
63
+ json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
64
+ data = json['gameInfo']
59
65
 
60
- def parse_json(json):
61
- #Given json data from an NHL API call, return play-by-play data.
66
+ #Add coaches
67
+ try:
68
+ away = data['awayTeam']['headCoach']['default'].upper()
69
+ home = data['homeTeam']['headCoach']['default'].upper()
70
+
71
+ coaches = {'away':away,
72
+ 'home':home}
73
+ except KeyError:
74
+ return {}
62
75
 
63
- events = pd.json_normalize(json['plays']).reset_index(drop=True)
64
- info = pd.json_normalize(json)
65
- roster = pd.json_normalize(json['rosterSpots'])
76
+ #Return: dict with coaches
77
+ return coaches
78
+
79
+ def get_game_info(game_id):
80
+ #Given game_id, return game information
81
+
82
+ #Retrieve data
83
+ api = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
84
+ json = rs.get(api).json()
85
+
86
+ #Games don't always have JSON shifts, for whatever reason
87
+ shifts = f"https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={game_id}"
88
+ shifts = rs.get(shifts).json()
89
+ json_shifts = pd.json_normalize(shifts['data'])
90
+
91
+ if shifts['total'] == 0:
92
+ json_shifts = pd.DataFrame()
93
+
94
+ #Split information
95
+ base = pd.json_normalize(json)
96
+ game_id = base['id'][0]
97
+ season = base['season'][0]
98
+ season_type = base['gameType'][0]
99
+ game_date = base['gameDate'][0]
100
+ game_state = base['gameState'][0]
101
+ start_time = base['startTimeUTC'][0]
102
+ venue = base['venue.default'][0]
103
+ venue_location = base['venueLocation.default'][0]
104
+ away_team_id = base['awayTeam.id'][0]
105
+ away_team_abbr = base['awayTeam.abbrev'][0]
106
+ home_team_id = base['homeTeam.id'][0]
107
+ home_team_abbr = base['homeTeam.abbrev'][0]
108
+
109
+ #Add roster
110
+ roster = get_game_roster(json)
111
+ #In the HTML parsing process, players are identified either by a regex pattern (ABB #00, such as BOS #37) or by number and name in the format #00 NAME (e.g. #37 BERGERON), so both forms are added to the roster as IDs of sorts.
112
+ roster['descID'] = '#'+roster['sweaterNumber'].astype(str)+" "+roster['lastName.default'].str.upper()
113
+ roster['team_abbr'] = roster['teamId'].replace({
114
+ away_team_id:[away_team_abbr],
115
+ home_team_id:[home_team_abbr]
116
+ })
117
+ roster['key'] = roster['team_abbr'] + " #" + roster['sweaterNumber'].astype(str)
118
+
119
+ #Create an additional roster dictionary for use with HTML parsing
120
+ #Roster dict
121
+ roster_dict = {'away':{},
122
+ 'home':{}}
123
+
124
+ #Evaluate and add players by team
125
+ for team in ['away','home']:
126
+ abbr = (away_team_abbr if team == 'away' else home_team_abbr)
127
+ rost = roster.loc[roster['team_abbr']==abbr]
128
+
129
+ #Now iterate through team players
130
+ for player,id,num,pos,team_abbr,key in zip(rost['full_name'],rost['playerId'],rost['sweaterNumber'],rost['positionCode'],rost['team_abbr'],rost['key']):
131
+ roster_dict[team].update({str(num):[key, pos, player, team_abbr, id]})
132
+
133
+ #Return: game information
134
+ return {"game_id":str(game_id),
135
+ "season":season,
136
+ "season_type":season_type,
137
+ "game_date":game_date,
138
+ "game_state":game_state,
139
+ "start_time":start_time,
140
+ 'venue':venue,
141
+ 'venue_location':venue_location,
142
+ 'away_team_id':away_team_id,
143
+ 'away_team_abbr':away_team_abbr,
144
+ 'home_team_id':home_team_id,
145
+ 'home_team_abbr':home_team_abbr,
146
+ 'events':pd.json_normalize(json['plays']).reset_index(drop=True),
147
+ 'rosters':roster,
148
+ 'HTML_rosters':roster_dict,
149
+ 'coaches':get_game_coaches(game_id),
150
+ 'json_shifts':json_shifts}
151
+
152
+ def parse_json(info):
153
+ #Given game info, return the parsed JSON play-by-play events as a dataframe
154
+
155
+ #Retrieve data
156
+ events = info['events']
66
157
 
67
158
  #Return error if game is set in the future
68
- if info['gameState'][0] == 'FUT':
159
+ if info['game_state'] == 'FUT':
69
160
  raise ValueError(f"Game {info['id'][0]} has not occured yet.")
70
-
71
- #Game information
72
- events['game_id'] = info['id'][0]
73
- events['season'] = info['season'][0]
74
- events['season_type'] = info['gameType'][0]
75
- events['game_date'] = info['gameDate'][0]
76
- events['start_time'] = info['startTimeUTC'][0]
77
- events['venue'] = info['venue.default'][0]
78
- events['venue_location'] = info['venueLocation.default'][0]
79
- events['away_team_id'] = info['awayTeam.id'][0]
80
- events['away_team_abbr'] = info['awayTeam.abbrev'][0]
81
- events['home_team_id'] = info['homeTeam.id'][0]
82
- events['home_team_abbr'] = info['homeTeam.abbrev'][0]
83
-
84
- teams = {
85
- info['awayTeam.id'][0]:info['awayTeam.abbrev'][0],
86
- info['homeTeam.id'][0]:info['homeTeam.abbrev'][0]
87
- }
88
-
89
- #Create player information dicts used to create event_player columns
90
- roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
91
- players = {}
92
- players_pos = {}
93
- ids = {}
94
- for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
95
- players.update({id:player.upper()})
96
- for id, pos in zip(list(roster['playerId']),list(roster['positionCode'])):
97
- players_pos.update({id:pos.upper()})
98
- for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
99
- ids.update({player.upper():id})
100
-
161
+
101
162
  #Test columns
102
- cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date', 'away_team_id', 'away_team_abbr', 'home_team_id', 'home_team_abbr']
163
+ cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']
103
164
 
104
165
  for col in cols:
105
166
  try:events[col]
@@ -120,7 +181,7 @@ def parse_json(json):
120
181
 
121
182
  events['event_player_3_id'] = events['details.assist2PlayerId']
122
183
 
123
- events['event_team_status'] = np.where(events['home_team_id']==events['details.eventOwnerTeamId'],"home","away")
184
+ events['event_team_venue'] = np.where(events['details.eventOwnerTeamId']==info['home_team_id'],"home","away")
124
185
 
125
186
  #Coordinate adjustments:
126
187
  #The WSBA NHL Scraper includes three sets of coordinates per event:
@@ -132,8 +193,8 @@ def parse_json(json):
132
193
  try:
133
194
  events['x_fixed'] = abs(events['details.xCoord'])
134
195
  events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
135
- events['x_adj'] = np.where(events['event_team_status']=="home",events['x_fixed'],-events['x_fixed'])
136
- events['y_adj'] = np.where(events['event_team_status']=="home",events['y_fixed'],-events['y_fixed'])
196
+ events['x_adj'] = np.where(events['event_team_venue']=="home",events['x_fixed'],-events['x_fixed'])
197
+ events['y_adj'] = np.where(events['event_team_venue']=="home",events['y_fixed'],-events['y_fixed'])
137
198
  events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
138
199
  events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
139
200
  except TypeError:
@@ -147,32 +208,11 @@ def parse_json(json):
147
208
  events['event_angle'] = np.nan
148
209
 
149
210
 
150
- events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(teams)
151
-
152
- #Event player information includes ids (included in the JSON events), names (from "rosterSpots"), and positions (also from "rosterSpots")
153
- events['event_player_1_name'] = events['event_player_1_id'].replace(players)
154
- events['event_player_2_name'] = events['event_player_2_id'].replace(players)
155
- events['event_player_3_name'] = events['event_player_3_id'].replace(players)
156
-
157
- events['event_player_1_pos'] = events['event_player_1_id'].replace(players_pos)
158
- events['event_player_2_pos'] = events['event_player_2_id'].replace(players_pos)
159
- events['event_player_3_pos'] = events['event_player_3_id'].replace(players_pos)
160
-
161
- events['event_goalie_name'] = events['details.goalieInNetId'].replace(players)
162
-
163
- #Create situations given situation code (this is reconfigured with on ice skaters when provided shifts data)
164
- events['away_skaters'] = events['situationCode'].astype(str).str.slice(start=1,stop=2)
165
- events['home_skaters'] = events['situationCode'].astype(str).str.slice(start=2,stop=3)
166
- events['event_skaters'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['home_skaters'],events['away_skaters'])
167
- events['event_skaters_against'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['away_skaters'],events['home_skaters'])
211
+ events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace({
212
+ info['away_team_id']:[info['away_team_abbr']],
213
+ info['home_team_id']:[info['home_team_abbr']]
214
+ })
168
215
 
169
- events['strength_state'] = events['event_skaters']+"v"+events['event_skaters_against']
170
- events['strength'] = np.where(events['event_skaters']==events['event_skaters_against'],
171
- "EV",np.where(
172
- events['event_skaters']>events['event_skaters_against'],
173
- "PP","SH"
174
- ))
175
-
176
216
  #Rename columns to follow WSBA naming conventions
177
217
  events = events.rename(columns={
178
218
  "eventId":"event_id",
@@ -197,14 +237,12 @@ def parse_json(json):
197
237
  })
198
238
 
199
239
  #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
200
- events['period_time_simple'] = events['period_time_elasped'].str.replace(":","",regex=True)
201
- events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
202
- ((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
203
- ((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
204
- events['period_seconds_remaining'] = 1200-events['period_seconds_elapsed']
240
+ events['period_seconds_elapsed'] = events['period_time_elasped'].apply(convert_to_seconds)
205
241
  events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']
242
+
243
+ events = events.loc[(events['event_type']!="")]
206
244
 
207
- #The following code is utilized to generate score and fenwick columns for each event
245
+ #Assign score and fenwick for each event
208
246
  fenwick_events = ['missed-shot','shot-on-goal','goal']
209
247
  ag = 0
210
248
  ags = []
@@ -215,16 +253,16 @@ def parse_json(json):
215
253
  afs = []
216
254
  hf = 0
217
255
  hfs = []
218
- for event,team in zip(list(events['event_type']),list(events['event_team_status'])):
256
+ for event,team in zip(list(events['event_type']),list(events['event_team_venue'])):
219
257
  if event in fenwick_events:
220
258
  if team == "home":
221
- hf = hf+1
259
+ hf += 1
222
260
  if event == 'goal':
223
- hg = hg+1
261
+ hg += 1
224
262
  else:
225
- af = af+1
263
+ af += 1
226
264
  if event == 'goal':
227
- ag = ag+1
265
+ ag += 1
228
266
 
229
267
  ags.append(ag)
230
268
  hgs.append(hg)
@@ -235,84 +273,561 @@ def parse_json(json):
235
273
  events['home_score'] = hgs
236
274
  events['away_fenwick'] = afs
237
275
  events['home_fenwick'] = hfs
238
-
239
- events = events.loc[(events['event_type']!="")&(events['event_type']!="game-end")]
240
276
 
241
- #Return: dataframe with parsed games in event
277
+ #Return: dataframe with parsed game
242
278
  return events
243
279
 
280
+ ### ESPN SCRAPING FUNCTIONS ###
281
+ def espn_game_id(date,away,home):
282
+ #Given a date formatted as YYYY-MM-DD and teams, return game id from ESPN schedule
283
+ date = date.replace("-","")
244
284
 
285
+ #Retrieve data
286
+ api = f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date}"
287
+ schedule = pd.json_normalize(rs.get(api).json()['events'])
245
288
 
246
- ## HTML DATA ##
247
- def get_soup(shifts_html):
248
- #Parses provided shifts html with BeautifulSoup
249
- #Utilizes method from Harry Shomer's hockey_scraper package
250
- parsers = ["lxml", "html.parser", "html5lib"]
289
+ #Create team abbreviation columns
290
+ schedule['away_team_abbr'] = schedule['shortName'].str[:3].str.strip(" ")
291
+ schedule['home_team_abbr'] = schedule['shortName'].str[-3:].str.strip(" ")
292
+
293
+ #Modify team abbreviations as necessary
294
+ schedule = schedule.replace({
295
+ "LA":"LAK",
296
+ "NJ":"NJD",
297
+ "SJ":"SJS",
298
+ "TB":"TBL",
299
+ })
251
300
 
252
- for parser in parsers:
253
- soup = BeautifulSoup(shifts_html, parser)
254
- td = soup.findAll(True, {'class': ['playerHeading + border', 'lborder + bborder']})
301
+ #Retrieve game id
302
+ game_id = schedule.loc[(schedule['away_team_abbr']==away)&
303
+ (schedule['home_team_abbr']==home),'id'].tolist()[0]
255
304
 
256
- if len(td) > 0:
257
- break
258
- return td, get_teams(soup)
305
+ #Return: ESPN game id
306
+ return game_id
259
307
 
308
+ def parse_espn(date,away,home):
309
+ #Given a date formatted as YYYY-MM-DD and teams, return game events
310
+ game_id = espn_game_id(date,away,home)
311
+ url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'
312
+
313
+ #Code modified from Patrick Bacon
260
314
 
261
- def get_teams(soup):
262
- #Collects teams in given shifts html (parsed by Beautiful Soup)
263
- #Utilizes method from Harry Shomer's hockey_scraper package
264
- team = soup.find('td', class_='teamHeading + border') # Team for shifts
265
- team = team.get_text()
315
+ #Retrieve game events as json
316
+ page = rs.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
317
+ soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
318
+ json = json_lib.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])
319
+
320
+ #DataFrame of time-related info for events
321
+ clock_df = pd.DataFrame()
322
+
323
+ for period in range(0, len(json)):
324
+ clock_df = clock_df._append(pd.DataFrame(json[period]))
325
+
326
+ clock_df = clock_df[~pd.isna(clock_df.clock)]
327
+
328
+ # Needed to add .split(',"st":3')[0] for playoffs
329
+
330
+ #DataFrame of coordinates for events
331
+ coords_df = pd.DataFrame(json_lib.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))
332
+
333
+ clock_df = clock_df.assign(
334
+ clock = clock_df.clock.apply(lambda x: x['displayValue'])
335
+ )
336
+
337
+ coords_df = coords_df.assign(
338
+ coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
339
+ coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
340
+ event_player_1_name = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
341
+ )
342
+
343
+ #Combine
344
+ espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])
345
+
346
+ espn_events = espn_events.assign(
347
+ period = espn_events['period'].apply(lambda x: x['number']),
348
+ minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
349
+ seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
350
+ event_type = espn_events['type'].apply(lambda x: x['txt'])
351
+ )
352
+
353
+ espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
354
+ (espn_events.event_type=='Face Off'), 0, espn_events.coords_x
355
+ ),
356
+ coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
357
+ (espn_events.event_type=='Face Off'), 0, espn_events.coords_y))
358
+
359
+ espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1_name))]
360
+
361
+ espn_events = espn_events.assign(
362
+ coords_x = espn_events.coords_x.astype(int),
363
+ coords_y = espn_events.coords_y.astype(int)
364
+ )
365
+
366
+ #Rename events
367
+ #The turnover event includes just one player in the event information, meaning takeaways will have no coordinates for play-by-plays created by ESPN scraping
368
+ espn_events['event_type'] = espn_events['event_type'].replace({
369
+ "Face Off":'faceoff',
370
+ "Hit":'hit',
371
+ "Shot":'shot-on-goal',
372
+ "Missed":'missed-shot',
373
+ "Blocked":'blocked-shot',
374
+ "Goal":'goal',
375
+ "Turnover":'giveaway',
376
+ "Delayed Penalty":'delayed-penalty',
377
+ "Penalty":'penalty',
378
+ })
379
+
380
+ #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
381
+ espn_events['period_time_simple'] = espn_events['clock'].str.replace(":","",regex=True)
382
+ espn_events['period_seconds_elapsed'] = np.where(espn_events['period_time_simple'].str.len()==3,
383
+ ((espn_events['period_time_simple'].str[0].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)),
384
+ ((espn_events['period_time_simple'].str[0:2].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)))
385
+ espn_events['seconds_elapsed'] = ((espn_events['period']-1)*1200)+espn_events['period_seconds_elapsed']
386
+
387
+ espn_events = espn_events.rename(columns = {'text':'description'})
388
+
389
+ #Add event team
390
+ espn_events['event_team_abbr'] = espn_events['homeAway'].replace({
391
+ "away":away,
392
+ "home":home
393
+ })
394
+
395
+ #Some games (mostly preseason and all star games) do not include coordinates.
396
+ try:
397
+ espn_events['x_fixed'] = abs(espn_events['coords_x'])
398
+ espn_events['y_fixed'] = np.where(espn_events['coords_x']<0,-espn_events['coords_y'],espn_events['coords_y'])
399
+ espn_events['x_adj'] = np.where(espn_events['homeAway']=="home",espn_events['x_fixed'],-espn_events['x_fixed'])
400
+ espn_events['y_adj'] = np.where(espn_events['homeAway']=="home",espn_events['y_fixed'],-espn_events['y_fixed'])
401
+ espn_events['event_distance'] = np.sqrt(((89 - espn_events['x_fixed'])**2) + (espn_events['y_fixed']**2))
402
+ espn_events['event_angle'] = np.degrees(np.arctan2(abs(espn_events['y_fixed']), abs(89 - espn_events['x_fixed'])))
403
+ except TypeError:
404
+ print(f"No coordinates found for ESPN game...")
405
+
406
+ espn_events['x_fixed'] = np.nan
407
+ espn_events['y_fixed'] = np.nan
408
+ espn_events['x_adj'] = np.nan
409
+ espn_events['y_adj'] = np.nan
410
+ espn_events['event_distance'] = np.nan
411
+ espn_events['event_angle'] = np.nan
412
+
413
+ #Assign score and fenwick for each event
414
+ fenwick_events = ['missed-shot','shot-on-goal','goal']
415
+ ag = 0
416
+ ags = []
417
+ hg = 0
418
+ hgs = []
419
+
420
+ af = 0
421
+ afs = []
422
+ hf = 0
423
+ hfs = []
424
+ for event,team in zip(list(espn_events['event_type']),list(espn_events['homeAway'])):
425
+ if event in fenwick_events:
426
+ if team == "home":
427
+ hf += 1
428
+ if event == 'goal':
429
+ hg += 1
430
+ else:
431
+ af += 1
432
+ if event == 'goal':
433
+ ag += 1
434
+
435
+ ags.append(ag)
436
+ hgs.append(hg)
437
+ afs.append(af)
438
+ hfs.append(hf)
439
+
440
+ espn_events['away_score'] = ags
441
+ espn_events['home_score'] = hgs
442
+ espn_events['away_fenwick'] = afs
443
+ espn_events['home_fenwick'] = hfs
444
+ #Return: play-by-play events in supplied game from ESPN
445
+ return espn_events
446
+
447
+ ## HTML PBP FUNCTIONS ##
448
+ def strip_html_pbp(td,rosters):
449
+ #Given html row, parse data from HTML pbp
450
+ #Harry Shomer's Code (modified)
451
+
452
+ #HTML Parsing
453
+ for y in range(len(td)):
454
+ # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
455
+ if y == 3:
456
+ td[y] = td[y].get_text() # This gets us elapsed and remaining combined-< 3:0017:00
457
+ index = td[y].find(':')
458
+ td[y] = td[y][:index+3]
459
+ elif (y == 6 or y == 7) and td[0] != '#':
460
+ # 6 & 7-> These are the player 1 ice one's
461
+ # The second statement controls for when it's just a header
462
+ baz = td[y].find_all('td')
463
+ bar = [baz[z] for z in range(len(baz)) if z % 4 != 0] # Because of previous step we get repeats...delete some
464
+
465
+ # The setup in the list is now: Name/Number->Position->Blank...and repeat
466
+ # Now strip all the html
467
+ players = []
468
+ for i in range(len(bar)):
469
+ if i % 3 == 0:
470
+ try:
471
+ #Using the supplied json we can bind player name and id to number and team
472
+ #Find number and team of player then lookup roster dictionary
473
+
474
+ number = bar[i].get_text().strip('\n') # Get number and strip leading/trailing newlines
475
+ if y == 6:
476
+ team = 'away'
477
+ else:
478
+ team = 'home'
479
+
480
+ id = rosters[team][str(number)][4]
481
+ name = rosters[team][str(number)][2]
482
+ position = rosters[team][str(number)][1]
483
+
484
+ except KeyError:
485
+ name = ''
486
+ number = ''
487
+ id = ''
488
+ elif i % 3 == 1:
489
+ if name != '':
490
+ players.append([name, number, position, id])
491
+
492
+ td[y] = players
493
+ else:
494
+ td[y] = td[y].get_text()
495
+
496
+ return td
266
497
 
267
- # Get Home Team
268
- teams = soup.find_all('td', {'align': 'center', 'style': 'font-size: 10px;font-weight:bold'})
269
- regex = re.compile(r'>(.*)<br/?>')
270
- home_team = regex.findall(str(teams[7]))
271
498
 
272
- return [team, home_team[0]]
499
+ def clean_html_pbp(info):
500
+ #Harry Shomer's Code (modified)
273
501
 
274
- #PARSE FUNCTIONS
275
- def analyze_shifts(shift, name, team, home_team, player_ids):
502
+ game_id = info['game_id']
503
+ #Retrieve data
504
+ season = info['season']
505
+ doc = f"https://www.nhl.com/scores/htmlreports/{season}/PL{game_id[-6:]}.HTM"
506
+ html = rs.get(doc).content
507
+ soup = get_contents(html)
508
+
509
+ #Rosters
510
+ rosters = info['HTML_rosters']
511
+
512
+ # Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
513
+ td = [soup[i:i + 8] for i in range(0, len(soup), 8)]
514
+
515
+ cleaned_html = [strip_html_pbp(x,rosters) for x in td]
516
+
517
+ return cleaned_html
518
+
519
+ def parse_html(info):
520
+ #Given game info, return HTML event data
521
+
522
+ #Retrieve game information and html events
523
+ rosters = info['HTML_rosters']
524
+ events = clean_html_pbp(info)
525
+
526
+ teams = {info['away_team_abbr']:['away'],
527
+ info['home_team_abbr']:['home']}
528
+
529
+ #Parsing
530
+ event_log = []
531
+ for event in events:
532
+ events_dict = {}
533
+ if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM','SPC','PBOX','SOC'] or event[3]=='-16:0-':
534
+ continue
535
+ else:
536
+ #Event info
537
+ events_dict['event_num'] = int(event[0])
538
+ events_dict['period'] = int(event[1])
539
+ events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
540
+ events_dict['period_time_elapsed'] = event[3]
541
+ events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
542
+ events_dict['event_type'] = event[4]
543
+
544
+ desc = re.sub(u'\xa0'," ",event[5])
545
+ events_dict['description'] = desc
546
+
547
+ events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
548
+ zone = [x for x in desc.split(',') if 'Zone' in x]
549
+ if not zone:
550
+ events_dict['zone_code'] = None
551
+ elif zone[0].find("Off") != -1:
552
+ events_dict['zone_code'] = 'O'
553
+ elif zone[0].find("Neu") != -1:
554
+ events_dict['zone_code'] = 'N'
555
+ elif zone[0].find("Def") != -1:
556
+ events_dict['zone_code'] = 'D'
557
+
558
+ #Convert team names for compatibility
559
+ replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
560
+ for name, repl in replace:
561
+ desc = desc.replace(repl,name)
562
+
563
+ event_team = desc[0:3] if desc[0:3] in teams.keys() else ""
564
+ events_dict['event_team_abbr'] = event_team
565
+
566
+ events_dict['away_team_abbr'] = info['away_team_abbr']
567
+ events_dict['home_team_abbr'] = info['home_team_abbr']
568
+
569
+ away_skaters = 0
570
+ away_goalie = 0
571
+ #Away on-ice
572
+ for i in range(len(event[6])):
573
+ player = event[6][i][0]
574
+ pos = event[6][i][2]
575
+ id = event[6][i][3]
576
+
577
+ if pos == 'G':
578
+ events_dict['away_goalie'] = player
579
+ events_dict['away_goalie_id'] = id
580
+ away_goalie += 1
581
+ else:
582
+ events_dict[f'away_on_{i+1}'] = player
583
+ events_dict[f'away_on_{i+1}_id'] = id
584
+ away_skaters += 1
585
+
586
+ home_skaters = 0
587
+ home_goalie = 0
588
+ #Home on-ice
589
+ for i in range(len(event[7])):
590
+ player = event[7][i][0]
591
+ pos = event[7][i][2]
592
+ id = event[7][i][3]
593
+
594
+ if pos == 'G':
595
+ events_dict['home_goalie'] = player
596
+ events_dict['home_goalie_id'] = id
597
+ home_goalie += 1
598
+ else:
599
+ events_dict[f'home_on_{i+1}'] = player
600
+ events_dict[f'home_on_{i+1}_id'] = id
601
+ home_skaters += 1
602
+
603
+ event_players = []
604
+ #Determine parsing route based on event
605
+ if event[4] in ['FAC','HIT','BLOCK','PENL']:
606
+ #Regex to find team and player number involved (finds all for each event)
607
+ #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
608
+ regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
609
+ fac = regex.findall(desc)
610
+ #Filter incorrectly parsed teams
611
+ repl = []
612
+ for team, num in fac:
613
+ if team in teams.keys():
614
+ repl.append((team,num))
615
+ fac = repl
616
+
617
+ #Find first event player
618
+ ep1_num = ''
619
+ for i in range(len(fac)):
620
+ team, num = fac[i]
621
+ if team == event_team:
622
+ ep1_num = num
623
+ event_players.append(fac[i])
624
+ else:
625
+ continue
626
+
627
+ #Find other players
628
+ for i in range(len(fac)):
629
+ team, num = fac[i]
630
+ if num == ep1_num:
631
+ continue
632
+ else:
633
+ event_players.append(fac[i])
634
+ elif event[4]=='GOAL':
635
+ #Parse goal
636
+ regex = re.compile(r'#(\d+)\s+')
637
+ goal = regex.findall(desc)
638
+
639
+ #Add all involved players
640
+ for point in goal:
641
+ #In this loop, point is a player number. We can assign event_team to all players in a goal
642
+ event_players.append((event_team,str(point)))
643
+ elif event[4]=='DELPEN':
644
+ #Don't parse DELPEN events
645
+ #These events typically have no text but when they do it is often erroneous or otherwise problematic
646
+
647
+ ""
648
+ else:
649
+ #Parse single or no player events
650
+ regex = re.compile(r'#\d+')
651
+ fac = regex.findall(desc)
652
+
653
+ for i in range(len(fac)):
654
+ num = fac[i].replace("#","")
655
+ event_players.append((event_team,str(num)))
656
+
657
+ for i in range(len(event_players)):
658
+ #For each player, evaluate their event data, then retreive information from rosters
659
+ team, num = event_players[i]
660
+
661
+ status = teams[team]
662
+ data = rosters[status[0]]
663
+
664
+ events_dict[f'event_player_{i+1}_name'] = data[str(num)][2]
665
+ events_dict[f'event_player_{i+1}_id'] = data[str(num)][4]
666
+ events_dict[f'event_player_{i+1}_pos'] = data[str(num)][1]
667
+
668
+ events_dict['away_skaters'] = away_skaters
669
+ events_dict['home_skaters'] = home_skaters
670
+ events_dict['away_goalie_in'] = away_goalie
671
+ events_dict['home_goalie_in'] = home_goalie
672
+
673
+ event_skaters = away_skaters if info['away_team_abbr'] == event_team else home_skaters
674
+ event_skaters_against = away_skaters if info['home_team_abbr'] == event_team else home_skaters
675
+ events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
676
+ events_dict['event_skaters'] = np.where(event_team == info['home_team_abbr'],home_skaters,away_skaters)
677
+
678
+ event_log.append(pd.DataFrame([events_dict]))
679
+
680
+ data = pd.concat(event_log)
681
+ data['event_type'] = data['event_type'].replace({
682
+ "PGSTR": "pre-game-start",
683
+ "PGEND": "pre-game-end",
684
+ 'GSTR':"game-start",
685
+ "ANTHEM":"anthem",
686
+ "PSTR":"period-start",
687
+ 'FAC':"faceoff",
688
+ "SHOT":"shot-on-goal",
689
+ "BLOCK":"blocked-shot",
690
+ "STOP":"stoppage",
691
+ "MISS":"missed-shot",
692
+ "HIT":"hit",
693
+ "GOAL":"goal",
694
+ "GIVE":"giveaway",
695
+ "TAKE":"takeaway",
696
+ "DELPEN":"delayed-penalty",
697
+ "PENL":"penalty",
698
+ "CHL":"challenge",
699
+ "PEND":"period-end",
700
+ "GEND":"game-end"
701
+ })
702
+
703
+ #Return: parsed HTML pbp
704
+ return data
705
+
706
def combine_pbp(info):
    """Return the merged play-by-play for the game described by ``info``.

    The HTML play-by-play (``parse_html``) is the left side of a left merge.
    For seasons 2005-06 through 2009-10 it is merged with the ESPN
    play-by-play (``parse_espn``); for every other season it is merged with
    the JSON play-by-play (``parse_json``).

    Parameters
    ----------
    info : mapping
        Game information. This function reads the keys 'season',
        'season_type', 'game_id', 'game_date', 'venue', 'venue_location',
        'away_team_abbr' and 'home_team_abbr'; the parsers it calls may read
        more.

    Returns
    -------
    pandas.DataFrame
        The merged events restricted to (and ordered by) the columns listed
        in ``get_col()`` that are present in the merged frame.
    """
    html_pbp = parse_html(info)

    #Route data combining - ESPN for the listed seasons, JSON for all others
    espn_seasons = ('20052006','20062007','20072008','20082009','20092010')
    if str(info['season']) in espn_seasons:
        #ESPN x HTML
        espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr'])
        espn_pbp = espn_pbp.rename(columns={'coords_x':'x',"coords_y":'y'}).drop(columns=['event_player_1_name'])
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']

        df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)
    else:
        #JSON x HTML
        json_pbp = parse_json(info)
        #Modify merge conditions and merge pbps
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
        #These columns are dropped from the HTML side before merging (ignored when absent)
        html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore')

        #While rare sometimes column 'event_player_1_id' is interpreted differently between the two dataframes.
        html_pbp['event_player_1_id'] = html_pbp['event_player_1_id'].astype(object)
        json_pbp['event_player_1_id'] = json_pbp['event_player_1_id'].astype(object)

        df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col)

    #Add game info
    info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
                'away_team_abbr','home_team_abbr']

    for col in info_col:
        df[col] = info[col]

    #Fill period_type column: periods 1-3 are "REG", period 5 with season_type 2 is "SO", anything else is "OT"
    df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))

    #Assign shifts a sub-500 event code.
    #The column may be missing from the merged frame; skip explicitly in that case instead of
    #swallowing every exception with a bare except.
    if 'event_type_code' in df.columns:
        df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)

    df = df.sort_values(['period','seconds_elapsed']).reset_index()

    df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))

    #Correct strength state for penalty shots and shootouts - most games don't have shifts in the shootout
    shootout_shot = np.logical_and(df['period'].astype(str)=='5',df['event_type'].isin(['missed-shot','shot-on-goal','goal']))
    df['strength_state'] = np.where(shootout_shot,"1v0",df['strength_state'])
    #na=False: a missing description must not count as a penalty shot (NaN would otherwise be truthy in np.where)
    penalty_shot = df['description'].str.contains('Penalty Shot',case=False,na=False)
    df['strength_state'] = np.where(penalty_shot,"1v0",df['strength_state'])

    #Return: complete play-by-play information for provided game
    present = set(df.columns)
    col = [col for col in get_col() if col in present]
    return df[col]
755
+
756
+ ## SHIFT SCRAPING FUNCTIONS ##
757
def parse_shifts_json(info):
    """Return the JSON shift chart for a game, split by team.

    Reads the DataFrame stored under ``info['json_shifts']``, keeps rows whose
    'detailCode' is 0, removes duplicate ('playerId', 'shiftNumber') pairs,
    converts the start/end clock strings with ``convert_to_seconds`` and
    recomputes 'duration' as ``end - start``.

    Returns a dict with keys 'away' and 'home'; each value holds the rows whose
    'event_team_abbr' equals ``info['away_team_abbr']`` / ``info['home_team_abbr']``
    with columns player_name, player_id, period, event_team_abbr, start,
    duration, end.
    """
    raw = info['json_shifts']

    #Filter non-shift events and duplicate events
    is_shift = raw['detailCode']==0
    shifts = raw.loc[is_shift].drop_duplicates(subset=['playerId','shiftNumber'])

    #Add full name column (upper-cased "first last")
    full_names = shifts['firstName'] + " " + shifts['lastName']
    shifts['player_name'] = full_names.str.upper()

    renames = {
        'playerId':'player_id',
        'teamAbbrev':'event_team_abbr',
        'startTime':'start',
        'endTime':'end'
    }
    shifts = shifts.rename(columns=renames)

    #Convert time columns
    for bound in ('start','end'):
        shifts[bound] = shifts[bound].astype(str).apply(convert_to_seconds)

    keep = ['player_name','player_id',
            'period','event_team_abbr',
            'start','duration','end']
    shifts = shifts[keep]

    #Recalibrate duration from the converted start and end values
    shifts['duration'] = shifts['end'] - shifts['start']

    #Return: JSON shifts (separated by team)
    return {
        venue: shifts.loc[shifts['event_team_abbr']==info[f'{venue}_team_abbr']]
        for venue in ('away','home')
    }
790
+
791
def analyze_shifts(shift, id, name, pos, team):
    """Convert one row of an HTML shift table into a shift record.

    Modified version of Harry Shomer's analyze_shifts function in the
    hockey_scraper package.

    Parameters
    ----------
    shift : sequence of str
        Cells of one shift row. Index 1 is the period ('OT' and 'SO' are
        mapped to '4' and '5'), index 2 the start, index 3 the end and
        index 4 the duration; for the time cells only the text before the
        first '/' is used. Index 0 is not read here.
    id : player id stored as-is under 'player_id'.
    name : str
        Player name; stored upper-cased.
    pos : player position stored as-is under 'player_pos'.
    team : str
        Team text; spaces are stripped before it is passed to ``get_team``.

    Returns
    -------
    dict
        Keys: player_name, player_id, player_pos, period, event_team_abbr,
        start, duration, end.
    """
    period_cell = shift[1]
    if period_cell == 'OT':
        period = '4'
    elif period_cell == 'SO':
        period = '5'
    else:
        period = period_cell

    record = {
        'player_name': name.upper(),
        'player_id': id,
        'player_pos': pos,
        'period': period,
        'event_team_abbr': get_team(team.strip(' ')),
        'start': convert_to_seconds(shift[2].split('/')[0]),
        'duration': convert_to_seconds(shift[4].split('/')[0]),
    }

    #Sometimes the end cell has no digits; fall back to start + duration
    end_text = shift[3].split('/')[0]
    if re.search(r'\d', end_text):
        record['end'] = convert_to_seconds(end_text)
    else:
        record['end'] = record['start'] + record['duration']

    return record
301
810
 
302
- def parse_shifts(html, player_ids, game_id):
303
- #Two-stage parsing of shifts data for a single team in a provided game
304
- #Stage one: create dataframe with raw individual shifts
305
- #Stage two: convert shift events to play-by-play structure created with json_parsing
811
+ def parse_shifts_html(info,home):
812
+ #Parsing of shifts data for a single team in a provided game
306
813
  #Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package
307
814
 
815
+ #Roster info prep
816
+ roster = info['HTML_rosters']
308
817
 
818
+ rosters = roster['home' if home else 'away']
819
+
309
820
  all_shifts = []
310
- columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
821
+ #columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
311
822
 
312
- td, teams = get_soup(html)
823
+ #Retreive HTML
824
+ game_id = info['game_id']
825
+ season = info['season']
826
+ link = f"https://www.nhl.com/scores/htmlreports/{season}/T{'H' if home else 'V'}{game_id[-6:]}.HTM"
827
+ doc = rs.get(link).content
828
+ td, teams = get_soup(doc)
313
829
 
314
830
  team = teams[0]
315
- home_team = teams[1]
316
831
  players = dict()
317
832
 
318
833
  # Iterates through each player shifts table with the following data:
@@ -321,37 +836,55 @@ def parse_shifts(html, player_ids, game_id):
321
836
  t = t.get_text()
322
837
  if ',' in t: # If a comma exists it is a player
323
838
  name = t
839
+
324
840
  name = name.split(',')
325
- name = ' '.join([name[1].strip(' '), name[0][2:].strip(' ')])
326
- #name = shared.fix_name(name)
327
- #This has been excluded as means to control the differences in names between the JSON and HTML documents
328
- players[name] = dict()
329
- players[name]['number'] = name[0][:2].strip()
330
- players[name]['shifts'] = []
841
+ number = int(name[0][:2].strip())
842
+ id = rosters[str(number)][4]
843
+ players[id] = dict()
844
+
845
+ #HTML shift functions assess one team at a time, which simplifies the lookup process with number to name and id
846
+
847
+ players[id]['name'] = rosters[str(number)][2]
848
+ players[id]['pos'] = rosters[str(number)][1]
849
+
850
+ players[id]['shifts'] = []
331
851
  else:
332
- players[name]['shifts'].extend([t])
852
+ players[id]['shifts'].extend([t])
333
853
 
334
854
  for key in players.keys():
335
855
  # Create lists of shifts-table columns for analysis
336
856
  players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]
337
857
 
858
+ name = players[key]['name']
859
+ pos = players[key]['pos']
860
+
338
861
  # Parsing
339
- shifts = [analyze_shifts(shift, key, team, home_team, player_ids) for shift in players[key]['shifts']]
862
+ shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
340
863
  all_shifts.extend(shifts)
341
864
 
342
865
  df = pd.DataFrame(all_shifts)
343
- df['game_id'] = str(game_id)
344
866
 
345
- shifts_raw = df[columns]
867
+ shifts_raw = df[df['duration'] > 0]
346
868
 
347
- shifts_raw = shifts_raw[shifts_raw['duration'] > 0]
869
+ #Return: single-team individual shifts by player
870
+ return shifts_raw
871
+
872
+ def parse_shift_events(info,home):
873
+ #Given game info and home team conditional, parse and convert document to shift events congruent to html play-by-play
874
+
875
+ #Determine whether to use JSON shifts or HTML shifts
876
+ if len(info['json_shifts']) == 0:
877
+ shift = parse_shifts_html(info,home)
878
+ else:
879
+ shift = parse_shifts_json(info)['home' if home else 'away']
880
+
881
+ rosters = info['rosters']
348
882
 
349
- # Second-stage beginds here
350
883
  # Identify shift starts for each shift event
351
- shifts_on = shifts_raw.groupby(['team_abbr', 'period', 'start']).agg(
884
+ shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
352
885
  num_on=('player_name', 'size'),
353
886
  players_on=('player_name', lambda x: ', '.join(x)),
354
- ids_on=('player_id', lambda x: ', '.join(map(str, x)))
887
+ ids_on=('player_id', lambda x: ', '.join(map(str,x))),
355
888
  ).reset_index()
356
889
 
357
890
  shifts_on = shifts_on.rename(columns={
@@ -359,10 +892,10 @@ def parse_shifts(html, player_ids, game_id):
359
892
  })
360
893
 
361
894
  # Identify shift stops for each shift event
362
- shifts_off = shifts_raw.groupby(['team_abbr', 'period', 'end']).agg(
895
+ shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
363
896
  num_off=('player_name', 'size'),
364
897
  players_off=('player_name', lambda x: ', '.join(x)),
365
- ids_off=('player_id', lambda x: ', '.join(map(str, x)))
898
+ ids_off=('player_id', lambda x: ', '.join(map(str,x))),
366
899
  ).reset_index()
367
900
 
368
901
  shifts_off = shifts_off.rename(columns={
@@ -370,57 +903,29 @@ def parse_shifts(html, player_ids, game_id):
370
903
  })
371
904
 
372
905
  # Merge and sort by time in game
373
- shifts = pd.merge(shifts_on, shifts_off, on=['team_abbr', 'period', 'seconds_elapsed'], how='outer')
906
+ shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')
374
907
 
375
- shifts = shifts.sort_values('seconds_elapsed')
376
-
377
- #Modify columns of new total shifts dataframe
378
- shifts['period'] = shifts['period'].astype(int)
908
+ shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
379
909
  shifts['event_type'] = 'change'
380
- shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200 * (shifts['period']-1))
381
- shifts['game_seconds_remaining'] = 3600 - shifts['seconds_elapsed']
382
-
383
- # Handle missing values at the start and end of periods
384
- shifts['players_on'] = shifts['players_on'].fillna('None')
385
- shifts['players_off'] = shifts['players_off'].fillna('None')
386
- shifts['ids_on'] = shifts['ids_on'].fillna('0')
387
- shifts['ids_off'] = shifts['ids_off'].fillna('0')
388
- shifts['num_on'] = shifts['num_on'].fillna(0).astype(int)
389
- shifts['num_off'] = shifts['num_off'].fillna(0).astype(int)
390
-
391
- #Manual Team Rename
392
- shifts['team_abbr'] = shifts['team_abbr'].replace({
393
- "L.A":"LAK",
394
- "N.J":"NJD",
395
- "S.J":"SJS",
396
- "T.B":"TBL"
397
- })
398
910
 
399
- #Return: shift events formatted similarly to json pbp: shootout changes are discluded
400
- return shifts.loc[shifts['period']<5].rename(columns={'team_abbr':'event_team_abbr'})
911
+ #Shift events similar to html (remove shootout shifts)
912
+ shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])
913
+
914
+ #Generate on-ice columns
915
+ skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
916
+ goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
917
+ team = list(shift['event_team_abbr'])[0]
401
918
 
402
- def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
403
- #Given roster info (from the retreive_players function), shifts df, and team, generate on_ice columns for shift events
404
- #These on-ice columns configure the on-ice players for events in the json play by play as well
405
919
  skaters = pd.DataFrame()
406
920
  goalies = pd.DataFrame()
407
- if home:
408
- team = {key:value for key, value in rosters['home'].items() if value['pos'] != "G"}
409
- else:
410
- team = {key:value for key, value in rosters['away'].items() if value['pos'] != "G"}
411
-
412
- names = list(team.keys())
413
- try: names.remove("")
414
- except ValueError: ""
415
-
416
- for player in names:
921
+ for player in skater_names:
417
922
  #For each player in the game, determine when they began and ended shifts.
418
923
  #With player names as columns, 1 represents a shift event a player was on the ice for while 0 represents off the ice
419
924
  on_ice = (np.cumsum(
420
- shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_on']
925
+ shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
421
926
  .apply(str)
422
927
  .apply(lambda x: int(bool(re.search(player, x)))) -
423
- shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_off']
928
+ shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
424
929
  .apply(str)
425
930
  .apply(lambda x: int(bool(re.search(player, x))))
426
931
  ))
@@ -428,32 +933,22 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
428
933
 
429
934
  skaters = skaters.fillna(0).astype(int)
430
935
 
431
-
432
936
  on_skaters = (skaters == 1).stack().reset_index()
433
937
  on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()
434
938
 
435
939
  max_players = 6
436
940
  for i in range(max_players):
437
- on_skaters[f"{'home' if home else 'away'}_on_{i+1}"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
941
+ on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
438
942
 
439
943
  on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
440
944
 
441
- #Repeat above process with goaltenders
442
- if home:
443
- team = {key:value for key, value in rosters['home'].items() if value['pos'] == "G"}
444
- else:
445
- team = {key:value for key, value in rosters['away'].items() if value['pos'] == "G"}
446
-
447
- names = list(team.keys())
448
- try: names.remove("")
449
- except ValueError: ""
450
-
451
- for player in names:
945
+ #Repeat this process with goaltenders
946
+ for player in goalie_names:
452
947
  on_ice = (np.cumsum(
453
- shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_on']
948
+ shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
454
949
  .apply(str)
455
950
  .apply(lambda x: int(bool(re.search(player, x)))) -
456
- shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_off']
951
+ shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
457
952
  .apply(str)
458
953
  .apply(lambda x: int(bool(re.search(player, x))))
459
954
  ))
@@ -466,7 +961,7 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
466
961
 
467
962
  max_players = 1
468
963
  for i in range(max_players):
469
- on_goalies[f"{'home' if home else 'away'}_goalie"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
964
+ on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
470
965
 
471
966
  on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
472
967
 
@@ -474,104 +969,100 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
474
969
  on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])
475
970
 
476
971
  shifts['row'] = shifts.index
477
-
972
+
973
+ if home:
974
+ shifts['home_team_abbr'] = team
975
+ else:
976
+ shifts['away_team_abbr'] = team
478
977
  #Return: shift events with newly added on-ice columns. NAN values are replaced with string "REMOVE" as means to create proper on-ice columns for json pbp
479
- return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"REMOVE")
978
+ return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")
480
979
 
481
- def combine_shifts(home_shift,away_shift,json,game_id):
482
- #Given shifts html documents for home and away team, return shift events complete with both teams' changes in the provided game
483
- data = retreive_players(json,result="pos")
484
- data_id = retreive_players(json)
980
+ ## FINALIZE PBP FUNCTIONS ##
981
def combine_shifts(info):
    """Return both teams' shift events for a game in play-by-play format.

    Calls ``parse_shift_events`` for the away team and then the home team,
    concatenates the results, attaches game information from ``info``, maps
    on-ice player ids to names using ``info['rosters']`` ('playerId' ->
    'full_name'), forward-fills the on-ice columns and derives skater counts
    and 'strength_state'.

    Side effect: the 'playerId' column of ``info['rosters']`` is converted to
    str in place.

    Returns the combined frame restricted to the columns named by
    ``get_col()`` that are present.
    """
    #JSON Prep
    roster = info['rosters']

    #Parse each team's shift events (away first, then home) and stack them
    shift_frames = [parse_shift_events(info,is_home) for is_home in (False,True)]
    sort_keys = ['period','seconds_elapsed']
    data = pd.concat(shift_frames).sort_values(sort_keys)

    #Add game info
    for field in ('season','season_type','game_id','game_date',"venue","venue_location",
                  'away_team_abbr','home_team_abbr'):
        data[field] = info[field]

    #Create player id -> full name lookup to create on-ice names
    roster['playerId'] = roster['playerId'].astype(str)
    players = roster.set_index("playerId")['full_name'].to_dict()

    #Six skater slots followed by the goalie slot, away then home for each slot
    slots = [f'on_{n}' for n in range(1,7)] + ['goalie']
    for slot in slots:
        for venue in ('away','home'):
            data[f'{venue}_{slot}'] = data[f'{venue}_{slot}_id'].replace(players)

    data = data.sort_values(sort_keys)

    #Fill on-ice columns down
    on_ice_col = []
    for venue in ('away','home'):
        on_ice_col.extend(f'{venue}_on_{n}' for n in range(1,7))
        on_ice_col.extend(f'{venue}_on_{n}_id' for n in range(1,7))
    on_ice_col.extend(['away_goalie','home_goalie','away_goalie_id','home_goalie_id'])

    for name in on_ice_col:
        data[name] = data[name].ffill()

    #Create strength state information: count skater id slots that are neither blank nor missing
    for venue in ('away','home'):
        id_cols = [f'{venue}_on_{n}_id' for n in range(1,7)]
        filled = data[id_cols].replace(r'^\s*$', np.nan, regex=True).notna()
        data[f'{venue}_skaters'] = filled.sum(axis=1)

    away_count = data['away_skaters'].astype(str)
    home_count = data['home_skaters'].astype(str)
    #Strength is written from the event team's point of view
    data['strength_state'] = np.where(data['event_team_abbr']==data['away_team_abbr'],
                                      away_count+"v"+home_count,
                                      home_count+"v"+away_count)

    #Return: full shifts data converted to play-by-play format
    available = data.columns.to_list()
    selected = [name for name in get_col() if name in available]
    return data[selected]
523
1034
 
524
- replace = {}
525
- for default, alt in zip(names_df['default'],names_df['alt']):
526
- if alt == np.nan or alt == "" or str(alt) == 'nan':
527
- continue
528
- else:
529
- replace.update({alt:default})
530
-
531
- return shifts_df.replace(replace,regex=True)
1035
+ def combine_data(info):
1036
+ #Given game info, return complete play-by-play data
532
1037
 
533
- def get_col():
534
- return [
535
- 'season','season_type','game_id','game_date',"start_time","venue","venue_location",
536
- 'away_team_abbr','home_team_abbr','event_num','period','period_type',
537
- 'seconds_elapsed', "situation_code","strength_state","home_team_defending_side","shift_type",
538
- "event_type_code","event_type","description","reason","penalty_duration","penalty_description",
539
- "event_team_abbr",'num_on', 'players_on', 'ids_on', 'num_off', 'players_off', 'ids_off',
540
- "event_team_status","event_player_1_id","event_player_2_id","event_player_3_id",
541
- "event_player_1_name","event_player_2_name","event_player_3_name","event_player_1_pos","event_player_2_pos",
542
- "event_player_3_pos","event_goalie_id",
543
- "event_goalie_name","shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
544
- "event_skaters","away_skaters","home_skaters",
545
- "event_distance","event_angle","away_score","home_score", "away_fenwick", "home_fenwick",
546
- "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
547
- "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie"
548
- ]
1038
+ game_id = info['game_id']
549
1039
 
550
- def combine_data(json,html):
551
- #Given json pbp and html shifts, total game play-by-play data is provided with additional and corrected details
552
- df = pd.concat([json,html])
1040
+ pbp = combine_pbp(info)
1041
+ shifts = combine_shifts(info)
553
1042
 
554
- #Fill period_type column and assign shifts a sub-500 event code
555
- df['period_type'] = np.where(df['period']<4,"REG",np.where(df['period']==4,"OT","SO"))
556
- df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
1043
+ #Combine data
1044
+ df = pd.concat([pbp,shifts])
557
1045
 
558
1046
  #Create priority columns designed to order events that occur at the same time in a game
559
- start_pri = ['period-start','game-start']
560
1047
  even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
561
- df['priority'] = np.where(df['event_type'].isin(start_pri),0,
562
- np.where(df['event_type'].isin(even_pri),1,
1048
+ df['priority'] = np.where(df['event_type'].isin(even_pri),1,
563
1049
  np.where(df['event_type']=='goal',2,
564
1050
  np.where(df['event_type']=='stoppage',3,
565
- np.where(df['event_type']=='penalty',4,
566
- np.where(df['event_type']=='change',5,
1051
+ np.where(df['event_type']=='delayed-penalty',4,
1052
+ np.where(df['event_type']=='penalty',5,
567
1053
  np.where(df['event_type']=='period-end',6,
568
- np.where(df['event_type']=='game-end',7,
569
- np.where(df['event_type']=='faceoff',8,9)))))))))
1054
+ np.where(df['event_type']=='change',7,
1055
+ np.where(df['event_type']=='game-end',8,
1056
+ np.where(df['event_type']=='period-start',9,
1057
+ np.where(df['event_type']=='faceoff',10,0))))))))))
1058
+
1059
+ df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
1060
+ df = df.sort_values(['period','seconds_elapsed','priority'])
570
1061
 
571
- df = df.sort_values(by=['period','seconds_elapsed','priority']).reset_index()
572
- #Recreate event_num column to accurately depict the order of all events, including changes
1062
+ #Recalibrate event_num column to accurately depict the order of all events, including changes
1063
+ df.reset_index(inplace=True,drop=True)
573
1064
  df['event_num'] = df.index+1
574
- df['event_team_status'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
1065
+ df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
575
1066
  df['event_type_last'] = df['event_type'].shift(1)
576
1067
  df['event_type_last_2'] = df['event_type_last'].shift(1)
577
1068
  df['event_type_next'] = df['event_type'].shift(-1)
@@ -580,60 +1071,36 @@ def combine_data(json,html):
580
1071
  period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
581
1072
  #Define shifts by "line-change" or "on-the-fly"
582
1073
  df['shift_type'] = np.where(df['event_type']=='change',np.where(np.logical_or(np.logical_or(df['event_type_last'].isin(lag_events),df['event_type_last_2'].isin(lag_events),df['event_type_next'].isin(lead_events)),df['seconds_elapsed'].isin(period_end_secs)),"line-change","on-the-fly"),"")
583
-
584
- #Descrpitions:
585
- #HTML pbp includes descriptions for each event; without the HTML pbp, play descriptions must be generated
586
- #Different, more originally formatting is employed with these descriptions in comparison to that provided in the HTML pbp
587
- df['start_end_desc'] = np.where(df['event_type'].isin(['period-start','period-end']),df['away_team_abbr'] + "vs" + df['home_team_abbr'] + ": Period " + df['period'].astype(str) + " " + df['event_type'].str.replace("period-","",regex=True).str.capitalize(),np.nan)
588
- df['take_give_desc'] = np.where(df['event_type'].isin(['takeaway','giveaway']),df['event_team_abbr'] + " " + df['event_type'].str.upper() + " by " + df['event_player_1_name'],np.nan)
589
- df['stoppage_desc'] = np.where(df['event_type']=='stoppage',"STOPPAGE: " + df['reason'].str.replace("-"," ",regex=True).str.capitalize(),np.nan)
590
- df['blocked_desc'] = np.where(df['event_type']=='blocked-shot',df['event_team_abbr'] + " SHOT from " + df['event_player_1_name'] + " BLOCKED by " + df['event_player_2_name'],np.nan)
591
- df['missed_desc'] = np.where(df['event_type']=='missed-shot',df['event_team_abbr'] + " SHOT by " + df['event_player_1_name'] + " MISSED: " + df['reason'].astype(str).str.replace("-"," ",regex=True),np.nan)
592
- df['sog_desc'] = np.where(df['event_type']=='shot-on-goal',df['event_team_abbr'] + " SHOT by " + df['event_player_1_name'] + " SAVED by " + df['event_goalie_name'],np.nan)
593
- df['goal_desc'] = np.where(df['event_type']=='goal',df['event_team_abbr'] + " GOAL SCORED by " + df['event_player_1_name'],np.nan)
594
- df['assist_desc'] = np.where(np.logical_and(df['event_type']=='goal',df['event_player_2_name'].notna())," ASSISTED by " + df['event_player_2_name'],"")
595
- df['assist2_desc'] = np.where(np.logical_and(df['event_type']=='goal',df['event_player_3_name'].notna())," and ASSISTED by " + df['event_player_3_name'],"")
596
- df['goal_desc_complete'] = df['goal_desc'] + df['assist_desc'] + df['assist2_desc']
597
- df['hit_desc'] = np.where(df['event_type']=='hit',df['event_team_abbr'] + " HIT by " + df['event_player_1_name'] + " on " + df['event_player_2_name'],np.nan)
598
- df['faceoff_desc'] = np.where(df['event_type']=='faceoff',"FACEOFF WON by " + df['event_player_1_name'] + " AGAINST " + df['event_player_2_name'],np.nan)
599
- df['penalty_desc'] = np.where(df['event_type']=='penalty',df['event_team_abbr'] + " PENALTY on " + df['event_player_1_name'] + ": " + df['penalty_duration'].astype(str).str.replace(".0","",regex=True) + " minutes for " + df['penalty_description'].astype(str).str.replace("-"," ",regex=True).str.upper(),np.nan)
600
-
601
- df['description'] = df['start_end_desc'].combine_first(df['take_give_desc'])\
602
- .combine_first(df['stoppage_desc'])\
603
- .combine_first(df['blocked_desc'])\
604
- .combine_first(df['missed_desc'])\
605
- .combine_first(df['sog_desc'])\
606
- .combine_first(df['goal_desc_complete'])\
607
- .combine_first(df['hit_desc'])\
608
- .combine_first(df['faceoff_desc'])\
609
- .combine_first(df['penalty_desc'])
610
- ffill_col = ['season','season_type','game_id','game_date',
611
- "start_time","venue","venue_location",
612
- 'away_team_abbr','home_team_abbr','home_team_defending_side',
613
- 'away_score','away_fenwick',
614
- 'home_score','home_fenwick',
615
- 'away_goalie','home_goalie']
616
- away_on = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6']
617
- home_on = ['home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6']
618
-
619
- #Forward fill appropriate columns
620
- for col in ffill_col+away_on+home_on:
1074
+ df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
1075
+ try:
1076
+ df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
1077
+ except:
1078
+ ""
1079
+
1080
+ #Add time since last event and overall event length
1081
+ df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
1082
+ df['event_length'] = df['seconds_since_last'].shift(-1)
1083
+
1084
+ #Add fixed strength state column
1085
+ df['strength_state_venue'] = df['away_skaters'].astype(str)+'v'+df['home_skaters'].astype(str)
1086
+
1087
+ #Retrieve coaches
1088
+ coaches = info['coaches']
1089
+ if not coaches:
1090
+ df['away_coach'] = ""
1091
+ df['home_coach'] = ""
1092
+ df['event_coach'] = ""
1093
+ else:
1094
+ df['away_coach'] = coaches['away']
1095
+ df['home_coach'] = coaches['home']
1096
+ df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))
1097
+
1098
+ #Forward fill as necessary
1099
+ cols = ['period_type','home_team_defending_side','away_score','away_fenwick','home_score','home_fenwick','away_coach','home_coach']
1100
+ for col in cols:
1101
+ try: df[col]
1102
+ except: df[col] = ""
621
1103
  df[col] = df[col].ffill()
622
1104
 
623
- #Now that forward fill is complete, replace "REMOVE" with nan
624
- df.replace("REMOVE",np.nan,inplace=True)
625
-
626
- #Reconfigure strength state and sitution codes
627
- df['away_skaters'] = df[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
628
- df['home_skaters'] = df[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
629
- df['away_goalie_in'] = np.where(df['away_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
630
- df['home_goalie_in'] = np.where(df['home_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
631
-
632
- df['event_skaters'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['home_skaters'],df['away_skaters'])
633
- df['event_skaters_against'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['away_skaters'],df['home_skaters'])
634
-
635
- df['strength_state'] = df['event_skaters'].astype(str) + "v" + df['event_skaters_against'].astype(str)
636
- df['situation_code'] = np.where(df['situation_code'].isna(),df['away_goalie_in'].astype(str) + df['away_skaters'].astype(str) + df['home_skaters'].astype(str) + df['home_goalie_in'].astype(str),df['situation_code'])
637
-
638
1105
  #Return: complete play-by-play with all important data for each event in a provided game
639
- return df[get_col()].replace(r'^\s*$', np.nan, regex=True)
1106
+ return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)