wsba-hockey 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,105 +1,166 @@
  import re
- from bs4 import BeautifulSoup, SoupStrainer
- import hockey_scraper.utils.shared as shared
- import hockey_scraper.nhl.pbp.html_pbp as html
- import hockey_scraper.nhl.game_scraper as gs
+ from bs4 import BeautifulSoup
+ import requests as rs
+ import json as json_lib
+ from tools.utils.shared import *
  import numpy as np
  import pandas as pd
  import warnings
- import requests as rs
- from zipfile import ZipFile
  warnings.filterwarnings('ignore')

  ### SCRAPING FUNCTIONS ###
  # Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #

- ## JSON DATA ##
- def retreive_players(json,result = "id"):
-     #Given json data from an NHL API call, return dictionary with home and away players and either their id or their position.
+ ## ORDER OF OPERATIONS ##
+ # Create game information to use with all functions
+ # Retrieve JSON data
+ # Parse JSON data
+ # Retrieve and clean HTML pbp with player information
+ # Parse HTML pbp, return parsed HTML
+ # Combine pbp data
+ # Retrieve and analyze HTML shifts with player information for home and away teams
+ # Parse shift events
+ # Combine all data, return complete play-by-play
+
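These steps are driven by the functions added below; a minimal sketch of the intended call order (the game id is illustrative, in the API's YYYY-02-NNNN regular-season format):

    info = get_game_info("2024020001")   # game information, rosters, JSON events and shifts
    pbp = combine_data(info)             # parse, merge, and finalize the play-by-play
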
+ ## UTILITY FUNCTIONS ##
+ def get_col():
+     return [
+         'season','season_type','game_id','game_date',"start_time","venue","venue_location",
+         'away_team_abbr','home_team_abbr','event_num','period','period_type',
+         'seconds_elapsed',"situation_code","strength_state","strength_state_venue","home_team_defending_side",
+         "event_type_code","event_type","description","penalty_duration",
+         "event_team_abbr","event_team_venue",
+         'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
+         "event_player_1_name","event_player_2_name","event_player_3_name",
+         "event_player_1_id","event_player_2_id","event_player_3_id",
+         "event_player_1_pos","event_player_2_pos","event_player_3_pos",
+         "event_goalie_name","event_goalie_id",
+         "shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
+         "event_skaters","away_skaters","home_skaters",
+         "event_distance","event_angle","event_length","seconds_since_last",
+         "away_score","home_score", "away_fenwick", "home_fenwick","away_sog","home_sog",
+         "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
+         "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
+         "away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
+         "home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
+         "event_coach","away_coach","home_coach"
+     ]
+
+
+ ## JSON FUNCTIONS ##
+ def get_game_roster(json):
+     #Given raw json data, return game rosters
      roster = pd.json_normalize(json['rosterSpots'])
-     info = pd.json_normalize(json)
-     home = info['homeTeam.id'][0]
-     away = info['awayTeam.id'][0]
-
-     #Add up to four alternative names for each player in the game
-     roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
-     try: roster['playerName_2'] = roster['firstName.cs']+" "+roster['lastName.default']
-     except: roster['playerName_2'] = ""
-     try: roster['playerName_3'] = roster['firstName.de']+" "+roster['lastName.default']
-     except: roster['playerName_3'] = ""
-     try: roster['playerName_4'] = roster['firstName.es']+" "+roster['lastName.default']
-     except: roster['playerName_4'] = ""
-
-     #For each home/away player their name is included as a key and their id or position is the value
-     home_players = {}
-     home_id = roster.loc[roster['teamId']==home]
-     hid = list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])+list(home_id['playerId'])
-     hpos = list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])+list(home_id['positionCode'])
-     hp = list(home_id['playerName'])+list(home_id['playerName_2'])+list(home_id['playerName_3'])+list(home_id['playerName_4'])
-
-     for id, pos, player in zip(hid,hpos,hp):
-         try: home_players.update({player.upper():
-                                   {result:id if result == 'id' else pos}})
-         except:
-             continue
+     roster['full_name'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()

-     away_players = {}
-     away_id = roster.loc[roster['teamId']==away]
-     aid = list(away_id['playerId'])+list(away_id['playerId'])+list(away_id['playerId'])+list(away_id['playerId'])
-     apos = list(away_id['positionCode'])+list(away_id['positionCode'])+list(away_id['positionCode'])+list(away_id['positionCode'])
-     ap = list(away_id['playerName'])+list(away_id['playerName_2'])+list(away_id['playerName_3'])+list(away_id['playerName_4'])
-
-     for id, pos, player in zip(aid,apos,ap):
-         try: away_players.update({player.upper():
-                                   {result:id if result == 'id' else pos}})
-         except:
-             continue
-
-     #Return: Dict of away and home players keyed with id or position as value
-     return {
-         'home':home_players,
-         'away':away_players
-     }
+     #Return: roster information
+     return roster

- def parse_json(json):
-     #Given json data from an NHL API call, return play-by-play data.
+ def get_game_coaches(game_id):
+     #Given game id, return head coaches for away and home team
+
+     #Retrieve data
+     json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
+     data = json['gameInfo']

-     events = pd.json_normalize(json['plays']).reset_index(drop=True)
-     info = pd.json_normalize(json)
-     roster = pd.json_normalize(json['rosterSpots'])
+     #Add coaches
+     try:
+         away = data['awayTeam']['headCoach']['default'].upper()
+         home = data['homeTeam']['headCoach']['default'].upper()
+
+         coaches = {'away':away,
+                    'home':home}
+     except KeyError:
+         return {}

-     #Game information
-     events['game_id'] = info['id'][0]
-     events['season'] = info['season'][0]
-     events['season_type'] = info['gameType'][0]
-     events['game_date'] = info['gameDate'][0]
-     events['start_time'] = info['startTimeUTC'][0]
-     events['venue'] = info['venue.default'][0]
-     events['venue_location'] = info['venueLocation.default'][0]
-     events['away_team_id'] = info['awayTeam.id'][0]
-     events['away_team_abbr'] = info['awayTeam.abbrev'][0]
-     events['home_team_id'] = info['homeTeam.id'][0]
-     events['home_team_abbr'] = info['homeTeam.abbrev'][0]
-
-     teams = {
-         info['awayTeam.id'][0]:info['awayTeam.abbrev'][0],
-         info['homeTeam.id'][0]:info['homeTeam.abbrev'][0]
-     }
-
-     #Create player information dicts used to create event_player columns
-     roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
-     players = {}
-     players_pos = {}
-     ids = {}
-     for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
-         players.update({id:player.upper()})
-     for id, pos in zip(list(roster['playerId']),list(roster['positionCode'])):
-         players_pos.update({id:pos.upper()})
-     for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
-         ids.update({player.upper():id})
+     #Return: dict with coaches
+     return coaches
+
+ def get_game_info(game_id):
+     #Given game_id, return game information
+
+     #Retrieve data
+     api = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
+     json = rs.get(api).json()
+
+     #Games don't always have JSON shifts, for whatever reason
+     shifts = f"https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={game_id}"
+     shifts = rs.get(shifts).json()
+     json_shifts = pd.json_normalize(shifts['data'])
+
+     if shifts['total'] == 0:
+         json_shifts = pd.DataFrame()
+
+     #Split information
+     base = pd.json_normalize(json)
+     game_id = base['id'][0]
+     season = base['season'][0]
+     season_type = base['gameType'][0]
+     game_date = base['gameDate'][0]
+     game_state = base['gameState'][0]
+     start_time = base['startTimeUTC'][0]
+     venue = base['venue.default'][0]
+     venue_location = base['venueLocation.default'][0]
+     away_team_id = base['awayTeam.id'][0]
+     away_team_abbr = base['awayTeam.abbrev'][0]
+     home_team_id = base['homeTeam.id'][0]
+     home_team_abbr = base['homeTeam.abbrev'][0]
+
+     #Add roster
+     roster = get_game_roster(json)
+     #In the HTML parsing process, players are identified by a regex pattern (ABB #00, such as BOS #37) or by number and name in the format #00 NAME (i.e. #37 BERGERON), so these are added as IDs of sorts.
+     roster['descID'] = '#'+roster['sweaterNumber'].astype(str)+" "+roster['lastName.default'].str.upper()
+     roster['team_abbr'] = roster['teamId'].replace({
+         away_team_id:[away_team_abbr],
+         home_team_id:[home_team_abbr]
+     })
+     roster['key'] = roster['team_abbr'] + " #" + roster['sweaterNumber'].astype(str)

+     #Create an additional roster dictionary for use with HTML parsing
+     #Roster dict
+     roster_dict = {'away':{},
+                    'home':{}}
+
+     #Evaluate and add players by team
+     for team in ['away','home']:
+         abbr = (away_team_abbr if team == 'away' else home_team_abbr)
+         rost = roster.loc[roster['team_abbr']==abbr]
+
+         #Now iterate through team players
+         for player,id,num,pos,team_abbr,key in zip(rost['full_name'],rost['playerId'],rost['sweaterNumber'],rost['positionCode'],rost['team_abbr'],rost['key']):
+             roster_dict[team].update({str(num):[key, pos, player, team_abbr, id]})
+
+     #Return: game information
+     return {"game_id":str(game_id),
+             "season":season,
+             "season_type":season_type,
+             "game_date":game_date,
+             "game_state":game_state,
+             "start_time":start_time,
+             'venue':venue,
+             'venue_location':venue_location,
+             'away_team_id':away_team_id,
+             'away_team_abbr':away_team_abbr,
+             'home_team_id':home_team_id,
+             'home_team_abbr':home_team_abbr,
+             'events':pd.json_normalize(json['plays']).reset_index(drop=True),
+             'rosters':roster,
+             'HTML_rosters':roster_dict,
+             'coaches':get_game_coaches(game_id),
+             'json_shifts':json_shifts}
+
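A sketch of the returned structure (keys as defined above; values illustrative):

    info = get_game_info("2024020001")
    info['game_state']       # e.g. 'OFF' for a finished game, 'FUT' for a future one
    info['events']           # DataFrame of raw JSON plays
    info['HTML_rosters']     # {'away': {...}, 'home': {...}}, keyed by sweater number
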
+ def parse_json(info):
+     #Given game info, return parsed JSON play-by-play data
+
+     #Retrieve data
+     events = info['events']
+
+     #Raise an error if the game is set in the future
+     if info['game_state'] == 'FUT':
+         raise ValueError(f"Game {info['game_id']} has not occurred yet.")
+
      #Test columns
-     cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date', 'away_team_id', 'away_team_abbr', 'home_team_id', 'home_team_abbr']
+     cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']

      for col in cols:
          try:events[col]
@@ -120,46 +181,38 @@ def parse_json(json):

      events['event_player_3_id'] = events['details.assist2PlayerId']

-     events['event_team_status'] = np.where(events['home_team_id']==events['details.eventOwnerTeamId'],"home","away")
+     events['event_team_venue'] = np.where(events['details.eventOwnerTeamId']==info['home_team_id'],"home","away")

      #Coordinate adjustments:
      #The WSBA NHL Scraper includes three sets of coordinates per event:
      # x, y - Raw coordinates from JSON pbp
      # x_fixed, y_fixed - Coordinates fixed to the right side of the ice (x is always greater than 0)
      # x_adj, y_adj - Adjusted coordinates configuring away events with negative x values while home events are always positive
-     events['x_fixed'] = abs(events['details.xCoord'])
-     events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
-     events['x_adj'] = np.where(events['event_team_status']=="home",events['x_fixed'],-events['x_fixed'])
-     events['y_adj'] = np.where(events['event_team_status']=="home",events['y_fixed'],-events['y_fixed'])
-     events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
-     events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
-
-     events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(teams)
-
-     #Event player information includes ids (included in the JSON events), names (from "rosterSpots"), and positions (also from "rosterSpots")
-     events['event_player_1_name'] = events['event_player_1_id'].replace(players)
-     events['event_player_2_name'] = events['event_player_2_id'].replace(players)
-     events['event_player_3_name'] = events['event_player_3_id'].replace(players)
-
-     events['event_player_1_pos'] = events['event_player_1_id'].replace(players_pos)
-     events['event_player_2_pos'] = events['event_player_2_id'].replace(players_pos)
-     events['event_player_3_pos'] = events['event_player_3_id'].replace(players_pos)
-
-     events['event_goalie_name'] = events['details.goalieInNetId'].replace(players)
-
-     #Create situations given situation code (this is reconfigured with on ice skaters when provided shifts data)
-     events['away_skaters'] = events['situationCode'].astype(str).str.slice(start=1,stop=2)
-     events['home_skaters'] = events['situationCode'].astype(str).str.slice(start=2,stop=3)
-     events['event_skaters'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['home_skaters'],events['away_skaters'])
-     events['event_skaters_against'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['away_skaters'],events['home_skaters'])
-
-     events['strength_state'] = events['event_skaters']+"v"+events['event_skaters_against']
-     events['strength'] = np.where(events['event_skaters']==events['event_skaters_against'],
-                                   "EV",np.where(
-                                   events['event_skaters']>events['event_skaters_against'],
-                                   "PP","SH"
-                                   ))

+     #Some games (mostly preseason and all star games) do not include coordinates.
+     try:
+         events['x_fixed'] = abs(events['details.xCoord'])
+         events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
+         events['x_adj'] = np.where(events['event_team_venue']=="home",events['x_fixed'],-events['x_fixed'])
+         events['y_adj'] = np.where(events['event_team_venue']=="home",events['y_fixed'],-events['y_fixed'])
+         events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
+         events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
+     except TypeError:
+         print(f"No coordinates found for game {info['game_id']}...")
+
+         events['x_fixed'] = np.nan
+         events['y_fixed'] = np.nan
+         events['x_adj'] = np.nan
+         events['y_adj'] = np.nan
+         events['event_distance'] = np.nan
+         events['event_angle'] = np.nan
+
+
+     events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace({
+         info['away_team_id']:[info['away_team_abbr']],
+         info['home_team_id']:[info['home_team_abbr']]
+     })
+
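A worked example of the three coordinate sets described above, for an away-team shot with raw coordinates (-60, 20):

    x, y = -60, 20
    x_fixed, y_fixed = abs(x), -y if x < 0 else y          # (60, -20): flipped to the right half
    x_adj, y_adj = -x_fixed, -y_fixed                      # (-60, 20): away events made negative
    distance = ((89 - x_fixed)**2 + y_fixed**2) ** 0.5     # ~35.2 ft from the attacking net
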
      #Rename columns to follow WSBA naming conventions
      events = events.rename(columns={
          "eventId":"event_id",
@@ -184,14 +237,12 @@ def parse_json(json):
      })

      #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
-     events['period_time_simple'] = events['period_time_elasped'].str.replace(":","",regex=True)
-     events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
-                                                 ((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
-                                                 ((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
-     events['period_seconds_remaining'] = 1200-events['period_seconds_elapsed']
+     events['period_seconds_elapsed'] = events['period_time_elasped'].apply(convert_to_seconds)
      events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']
+
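convert_to_seconds comes from the star import of tools.utils.shared and is not shown in this diff; an equivalent helper, assuming "MM:SS" clock strings, would be:

    def convert_to_seconds(period_time):
        minutes, seconds = period_time.split(':')
        return int(minutes)*60 + int(seconds)

    convert_to_seconds("17:23")   # -> 1043
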
+     events = events.loc[(events['event_type']!="")]

-     #The following code is utilized to generate score and fenwick columns for each event
+     #Assign score and fenwick for each event
      fenwick_events = ['missed-shot','shot-on-goal','goal']
      ag = 0
      ags = []
@@ -202,16 +253,16 @@ def parse_json(json):
      afs = []
      hf = 0
      hfs = []
-     for event,team in zip(list(events['event_type']),list(events['event_team_status'])):
+     for event,team in zip(list(events['event_type']),list(events['event_team_venue'])):
          if event in fenwick_events:
              if team == "home":
-                 hf = hf+1
+                 hf += 1
                  if event == 'goal':
-                     hg = hg+1
+                     hg += 1
              else:
-                 af = af+1
+                 af += 1
                  if event == 'goal':
-                     ag = ag+1
+                     ag += 1

          ags.append(ag)
          hgs.append(hg)
@@ -222,84 +273,561 @@ def parse_json(json):
      events['home_score'] = hgs
      events['away_fenwick'] = afs
      events['home_fenwick'] = hfs
-
-     events = events.loc[(events['event_type']!="")&(events['event_type']!="game-end")]

-     #Return: dataframe with parsed games in event
+     #Return: dataframe with parsed game
      return events
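Note that the counters in the loop above are incremented before being appended, so each row carries the score and fenwick totals including that event; a toy trace for three home-team events:

    # event:         shot-on-goal   goal   hit
    # home_fenwick:             1      2     2
    # home_score:               0      1     1
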

+ ### ESPN SCRAPING FUNCTIONS ###
+ def espn_game_id(date,away,home):
+     #Given a date formatted as YYYY-MM-DD and teams, return game id from ESPN schedule
+     date = date.replace("-","")
+
+     #Retrieve data
+     api = f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date}"
+     schedule = pd.json_normalize(rs.get(api).json()['events'])
+
+     #Create team abbreviation columns
+     schedule['away_team_abbr'] = schedule['shortName'].str[:3].str.strip(" ")
+     schedule['home_team_abbr'] = schedule['shortName'].str[-3:].str.strip(" ")
+
+     #Modify team abbreviations as necessary
+     schedule = schedule.replace({
+         "LA":"LAK",
+         "NJ":"NJD",
+         "SJ":"SJS",
+         "TB":"TBL",
+     })
+
+     #Retrieve game id
+     game_id = schedule.loc[(schedule['away_team_abbr']==away)&
+                            (schedule['home_team_abbr']==home),'id'].tolist()[0]

+     #Return: ESPN game id
+     return game_id
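Illustrative usage (date and matchup hypothetical):

    espn_game_id("2009-10-01", "PIT", "NYR")   # -> ESPN event id for that day's PIT @ NYR game
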

- ## HTML DATA ##
- def get_soup(shifts_html):
-     #Parses provided shifts html with BeautifulSoup
-     #Utilizes method from Harry Shomer's hockey_scraper package
-     parsers = ["lxml", "html.parser", "html5lib"]
+ def parse_espn(date,away,home):
+     #Given a date formatted as YYYY-MM-DD and teams, return game events
+     game_id = espn_game_id(date,away,home)
+     url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'
+
+     #Code modified from Patrick Bacon

-     for parser in parsers:
-         soup = BeautifulSoup(shifts_html, parser)
-         td = soup.findAll(True, {'class': ['playerHeading + border', 'lborder + bborder']})
+     #Retrieve game events as json
+     page = rs.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
+     soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
+     json = json_lib.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])

-         if len(td) > 0:
-             break
-     return td, get_teams(soup)
+     #DataFrame of time-related info for events
+     clock_df = pd.DataFrame()

+     for period in range(0, len(json)):
+         clock_df = clock_df._append(pd.DataFrame(json[period]))

- def get_teams(soup):
-     #Collects teams in given shifts html (parsed by Beautiful Soup)
-     #Utilizes method from Harry Shomer's hockey_scraper package
-     team = soup.find('td', class_='teamHeading + border')   # Team for shifts
-     team = team.get_text()
+     clock_df = clock_df[~pd.isna(clock_df.clock)]
+
+     # Needed to add .split(',"st":3')[0] for playoffs
+
+     #DataFrame of coordinates for events
+     coords_df = pd.DataFrame(json_lib.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))
+
+     clock_df = clock_df.assign(
+         clock = clock_df.clock.apply(lambda x: x['displayValue'])
+     )
+
+     coords_df = coords_df.assign(
+         coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
+         coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
+         event_player_1_name = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
+     )
+
+     #Combine
+     espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])
+
+     espn_events = espn_events.assign(
+         period = espn_events['period'].apply(lambda x: x['number']),
+         minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
+         seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
+         event_type = espn_events['type'].apply(lambda x: x['txt'])
+     )
+
+     espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
+                                                          (espn_events.event_type=='Face Off'), 0, espn_events.coords_x
+                                      ),
+                                      coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
+                                                          (espn_events.event_type=='Face Off'), 0, espn_events.coords_y))
+
+     espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1_name))]
+
+     espn_events = espn_events.assign(
+         coords_x = espn_events.coords_x.astype(int),
+         coords_y = espn_events.coords_y.astype(int)
+     )
+
+     #Rename events
+     #The turnover event includes just one player in the event information, meaning takeaways will have no coordinates for play-by-plays created by ESPN scraping
+     espn_events['event_type'] = espn_events['event_type'].replace({
+         "Face Off":'faceoff',
+         "Hit":'hit',
+         "Shot":'shot-on-goal',
+         "Missed":'missed-shot',
+         "Blocked":'blocked-shot',
+         "Goal":'goal',
+         "Turnover":'giveaway',
+         "Delayed Penalty":'delayed-penalty',
+         "Penalty":'penalty',
+     })
+
+     #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
+     espn_events['period_time_simple'] = espn_events['clock'].str.replace(":","",regex=True)
+     espn_events['period_seconds_elapsed'] = np.where(espn_events['period_time_simple'].str.len()==3,
+                                                      ((espn_events['period_time_simple'].str[0].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)),
+                                                      ((espn_events['period_time_simple'].str[0:2].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)))
+     espn_events['seconds_elapsed'] = ((espn_events['period']-1)*1200)+espn_events['period_seconds_elapsed']
+
+     espn_events = espn_events.rename(columns = {'text':'description'})
+
+     #Add event team
+     espn_events['event_team_abbr'] = espn_events['homeAway'].replace({
+         "away":away,
+         "home":home
+     })
+
+     #Some games (mostly preseason and all star games) do not include coordinates.
+     try:
+         espn_events['x_fixed'] = abs(espn_events['coords_x'])
+         espn_events['y_fixed'] = np.where(espn_events['coords_x']<0,-espn_events['coords_y'],espn_events['coords_y'])
+         espn_events['x_adj'] = np.where(espn_events['homeAway']=="home",espn_events['x_fixed'],-espn_events['x_fixed'])
+         espn_events['y_adj'] = np.where(espn_events['homeAway']=="home",espn_events['y_fixed'],-espn_events['y_fixed'])
+         espn_events['event_distance'] = np.sqrt(((89 - espn_events['x_fixed'])**2) + (espn_events['y_fixed']**2))
+         espn_events['event_angle'] = np.degrees(np.arctan2(abs(espn_events['y_fixed']), abs(89 - espn_events['x_fixed'])))
+     except TypeError:
+         print("No coordinates found for ESPN game...")
+
+         espn_events['x_fixed'] = np.nan
+         espn_events['y_fixed'] = np.nan
+         espn_events['x_adj'] = np.nan
+         espn_events['y_adj'] = np.nan
+         espn_events['event_distance'] = np.nan
+         espn_events['event_angle'] = np.nan
+
+     #Assign score and fenwick for each event
+     fenwick_events = ['missed-shot','shot-on-goal','goal']
+     ag = 0
+     ags = []
+     hg = 0
+     hgs = []
+
+     af = 0
+     afs = []
+     hf = 0
+     hfs = []
+     for event,team in zip(list(espn_events['event_type']),list(espn_events['homeAway'])):
+         if event in fenwick_events:
+             if team == "home":
+                 hf += 1
+                 if event == 'goal':
+                     hg += 1
+             else:
+                 af += 1
+                 if event == 'goal':
+                     ag += 1
+
+         ags.append(ag)
+         hgs.append(hg)
+         afs.append(af)
+         hfs.append(hf)
+
+     espn_events['away_score'] = ags
+     espn_events['home_score'] = hgs
+     espn_events['away_fenwick'] = afs
+     espn_events['home_fenwick'] = hfs
+     #Return: play-by-play events in supplied game from ESPN
+     return espn_events
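The string splits above rely on ESPN embedding its play-by-play JSON directly in the page HTML; schematically:

    page_text = '..."playGrps":[[{"id":"1"}]],"tms":...'
    json_lib.loads(page_text.split('"playGrps":')[1].split(',"tms"')[0])   # -> [[{'id': '1'}]]
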
+
+ ## HTML PBP FUNCTIONS ##
+ def strip_html_pbp(td,rosters):
+     #Given html row, parse data from HTML pbp
+     #Harry Shomer's Code (modified)
+
+     #HTML Parsing
+     for y in range(len(td)):
+         # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
+         if y == 3:
+             td[y] = td[y].get_text()   # This gets us elapsed and remaining combined -> 3:0017:00
+             index = td[y].find(':')
+             td[y] = td[y][:index+3]
+         elif (y == 6 or y == 7) and td[0] != '#':
+             # 6 & 7 -> These are the on-ice player cells
+             # The second statement controls for when it's just a header
+             baz = td[y].find_all('td')
+             bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]   # Because of previous step we get repeats...delete some
+
+             # The setup in the list is now: Name/Number->Position->Blank...and repeat
+             # Now strip all the html
+             players = []
+             for i in range(len(bar)):
+                 if i % 3 == 0:
+                     try:
+                         #Using the supplied json we can bind player name and id to number and team
+                         #Find number and team of player then lookup roster dictionary
+
+                         number = bar[i].get_text().strip('\n')   # Get number and strip leading/trailing newlines
+                         if y == 6:
+                             team = 'away'
+                         else:
+                             team = 'home'
+
+                         id = rosters[team][str(number)][4]
+                         name = rosters[team][str(number)][2]
+                         position = rosters[team][str(number)][1]
+
+                     except KeyError:
+                         name = ''
+                         number = ''
+                         id = ''
+                 elif i % 3 == 1:
+                     if name != '':
+                         players.append([name, number, position, id])
+
+             td[y] = players
+         else:
+             td[y] = td[y].get_text()
+
+     return td
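The rosters argument is the HTML_rosters dictionary built in get_game_info, keyed by sweater number with values [key, pos, name, team_abbr, id]; an illustrative lookup:

    rosters = {'away': {'37': ['BOS #37', 'C', 'PATRICE BERGERON', 'BOS', 8470638]}}
    rosters['away']['37'][4]   # player id   -> 8470638
    rosters['away']['37'][2]   # player name -> 'PATRICE BERGERON'
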
+
+
+ def clean_html_pbp(info):
+     #Harry Shomer's Code (modified)

- # Get Home Team
- teams = soup.find_all('td', {'align': 'center', 'style': 'font-size: 10px;font-weight:bold'})
- regex = re.compile(r'>(.*)<br/?>')
- home_team = regex.findall(str(teams[7]))
+     game_id = info['game_id']
+     #Retrieve data
+     season = info['season']
+     doc = f"https://www.nhl.com/scores/htmlreports/{season}/PL{game_id[-6:]}.HTM"
+     html = rs.get(doc).content
+     soup = get_contents(html)

- return [team, home_team[0]]
+     #Rosters
+     rosters = info['HTML_rosters']

- #PARSE FUNCTIONS
- def analyze_shifts(shift, name, team, home_team, player_ids):
+     # Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
+     td = [soup[i:i + 8] for i in range(0, len(soup), 8)]
+
+     cleaned_html = [strip_html_pbp(x,rosters) for x in td]
+
+     return cleaned_html
+
+ def parse_html(info):
+     #Given game info, return HTML event data
+
+     #Retrieve game information and html events
+     rosters = info['HTML_rosters']
+     events = clean_html_pbp(info)
+
+     teams = {info['away_team_abbr']:['away'],
+              info['home_team_abbr']:['home']}
+
+     #Parsing
+     event_log = []
+     for event in events:
+         events_dict = {}
+         if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM','SPC','PBOX','SOC'] or event[3]=='-16:0-':
+             continue
+         else:
+             #Event info
+             events_dict['event_num'] = int(event[0])
+             events_dict['period'] = int(event[1])
+             events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
+             events_dict['period_time_elapsed'] = event[3]
+             events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
+             events_dict['event_type'] = event[4]
+
+             desc = re.sub(u'\xa0'," ",event[5])
+             events_dict['description'] = desc
+
+             events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
+             zone = [x for x in desc.split(',') if 'Zone' in x]
+             if not zone:
+                 events_dict['zone_code'] = None
+             elif zone[0].find("Off") != -1:
+                 events_dict['zone_code'] = 'O'
+             elif zone[0].find("Neu") != -1:
+                 events_dict['zone_code'] = 'N'
+             elif zone[0].find("Def") != -1:
+                 events_dict['zone_code'] = 'D'
+
+             #Convert team names for compatibility
+             replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
+             for name, repl in replace:
+                 desc = desc.replace(repl,name)
+
+             event_team = desc[0:3] if desc[0:3] in teams.keys() else ""
+             events_dict['event_team_abbr'] = event_team
+
+             events_dict['away_team_abbr'] = info['away_team_abbr']
+             events_dict['home_team_abbr'] = info['home_team_abbr']
+
+             away_skaters = 0
+             away_goalie = 0
+             #Away on-ice
+             for i in range(len(event[6])):
+                 player = event[6][i][0]
+                 pos = event[6][i][2]
+                 id = event[6][i][3]
+
+                 if pos == 'G':
+                     events_dict['away_goalie'] = player
+                     events_dict['away_goalie_id'] = id
+                     away_goalie += 1
+                 else:
+                     events_dict[f'away_on_{i+1}'] = player
+                     events_dict[f'away_on_{i+1}_id'] = id
+                     away_skaters += 1
+
+             home_skaters = 0
+             home_goalie = 0
+             #Home on-ice
+             for i in range(len(event[7])):
+                 player = event[7][i][0]
+                 pos = event[7][i][2]
+                 id = event[7][i][3]
+
+                 if pos == 'G':
+                     events_dict['home_goalie'] = player
+                     events_dict['home_goalie_id'] = id
+                     home_goalie += 1
+                 else:
+                     events_dict[f'home_on_{i+1}'] = player
+                     events_dict[f'home_on_{i+1}_id'] = id
+                     home_skaters += 1
+
+             event_players = []
+             #Determine parsing route based on event
+             if event[4] in ['FAC','HIT','BLOCK','PENL']:
+                 #Regex to find team and player number involved (finds all for each event)
+                 #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
+                 regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
+                 fac = regex.findall(desc)
+                 #Filter incorrectly parsed teams
+                 repl = []
+                 for team, num in fac:
+                     if team in teams.keys():
+                         repl.append((team,num))
+                 fac = repl
+
+                 #Find first event player
+                 ep1_num = ''
+                 for i in range(len(fac)):
+                     team, num = fac[i]
+                     if team == event_team:
+                         ep1_num = num
+                         event_players.append(fac[i])
+                     else:
+                         continue
+
+                 #Find other players
+                 for i in range(len(fac)):
+                     team, num = fac[i]
+                     if num == ep1_num:
+                         continue
+                     else:
+                         event_players.append(fac[i])
+             elif event[4]=='GOAL':
+                 #Parse goal
+                 regex = re.compile(r'#(\d+)\s+')
+                 goal = regex.findall(desc)
+
+                 #Add all involved players
+                 for point in goal:
+                     #In this loop, point is a player number. We can assign event_team to all players in a goal
+                     event_players.append((event_team,str(point)))
+             elif event[4]=='DELPEN':
+                 #Don't parse DELPEN events
+                 #These events typically have no text, but when they do it is often erroneous or otherwise problematic
+                 pass
+             else:
+                 #Parse single or no player events
+                 regex = re.compile(r'#\d+')
+                 fac = regex.findall(desc)
+
+                 for i in range(len(fac)):
+                     num = fac[i].replace("#","")
+                     event_players.append((event_team,str(num)))
+
+             for i in range(len(event_players)):
+                 #For each player, evaluate their event data, then retrieve information from rosters
+                 team, num = event_players[i]
+
+                 status = teams[team]
+                 data = rosters[status[0]]
+
+                 events_dict[f'event_player_{i+1}_name'] = data[str(num)][2]
+                 events_dict[f'event_player_{i+1}_id'] = data[str(num)][4]
+                 events_dict[f'event_player_{i+1}_pos'] = data[str(num)][1]
+
+             events_dict['away_skaters'] = away_skaters
+             events_dict['home_skaters'] = home_skaters
+             events_dict['away_goalie_in'] = away_goalie
+             events_dict['home_goalie_in'] = home_goalie
+
+             event_skaters = away_skaters if info['away_team_abbr'] == event_team else home_skaters
+             event_skaters_against = away_skaters if info['home_team_abbr'] == event_team else home_skaters
+             events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
+             events_dict['event_skaters'] = np.where(event_team == info['home_team_abbr'],home_skaters,away_skaters)
+
+             event_log.append(pd.DataFrame([events_dict]))
+
+     data = pd.concat(event_log)
+     data['event_type'] = data['event_type'].replace({
+         "PGSTR": "pre-game-start",
+         "PGEND": "pre-game-end",
+         'GSTR':"game-start",
+         "ANTHEM":"anthem",
+         "PSTR":"period-start",
+         'FAC':"faceoff",
+         "SHOT":"shot-on-goal",
+         "BLOCK":"blocked-shot",
+         "STOP":"stoppage",
+         "MISS":"missed-shot",
+         "HIT":"hit",
+         "GOAL":"goal",
+         "GIVE":"giveaway",
+         "TAKE":"takeaway",
+         "DELPEN":"delayed-penalty",
+         "PENL":"penalty",
+         "CHL":"challenge",
+         "PEND":"period-end",
+         "GEND":"game-end"
+     })
+
+     #Return: parsed HTML pbp
+     return data
706
+ def combine_pbp(info):
707
+ #Given game info, return complete play-by-play data for provided game
708
+
709
+ html_pbp = parse_html(info)
710
+
711
+ #Route data combining - json if season is after 2009-2010:
712
+ if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
713
+ #ESPN x HTML
714
+ espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'}).drop(columns=['event_player_1_name'])
715
+ merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']
716
+
717
+ df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)
718
+
719
+ else:
720
+ #JSON x HTML
721
+ json_pbp = parse_json(info)
722
+ #Modify merge conditions and merge pbps
723
+ merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
724
+ html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore')
725
+
726
+ #While rare sometimes column 'event_player_1_id' is interpreted differently between the two dataframes.
727
+ html_pbp['event_player_1_id'] = html_pbp['event_player_1_id'].astype(object)
728
+ json_pbp['event_player_1_id'] = json_pbp['event_player_1_id'].astype(object)
729
+
730
+ df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col)
731
+
732
+ #Add game info
733
+ info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
734
+ 'away_team_abbr','home_team_abbr']
735
+
736
+ for col in info_col:
737
+ df[col] = info[col]
738
+
739
+ #Fill period_type column and assign shifts a sub-500 event code
740
+ df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
741
+ try: df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
742
+ except:
743
+ ""
744
+ df = df.sort_values(['period','seconds_elapsed']).reset_index()
745
+
746
+ df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
747
+
748
+ #Correct strength state for penalty shots and shootouts - most games dont have shifts in shootout and are disculuded otherwise
749
+ df['strength_state'] = np.where(np.logical_and(df['period'].astype(str)=='5',df['event_type'].isin(['missed-shot','shot-on-goal','goal'])),"1v0",df['strength_state'])
750
+ df['strength_state'] = np.where(df['description'].str.contains('Penalty Shot',case=False),"1v0",df['strength_state'])
751
+
752
+ col = [col for col in get_col() if col in df.columns.to_list()]
753
+ #Return: complete play-by-play information for provided game
754
+ return df[col]
755
+
756
+ ## SHIFT SCRAPING FUNCTIONS ##
757
+ def parse_shifts_json(info):
758
+ #Given game info, return json shift chart
759
+
760
+ log = info['json_shifts']
761
+ #Filter non-shift events and duplicate events
762
+ log = log.loc[log['detailCode']==0].drop_duplicates(subset=['playerId','shiftNumber'])
763
+
764
+ #Add full name columns
765
+ log['player_name'] = (log['firstName'] + " " + log['lastName']).str.upper()
766
+
767
+ log = log.rename(columns={
768
+ 'playerId':'player_id',
769
+ 'teamAbbrev':'event_team_abbr',
770
+ 'startTime':'start',
771
+ 'endTime':'end'
772
+ })
773
+
774
+ #Convert time columns
775
+ log['start'] = log['start'].astype(str).apply(convert_to_seconds)
776
+ log['end'] = log['end'].astype(str).apply(convert_to_seconds)
777
+ log = log[['player_name','player_id',
778
+ 'period','event_team_abbr',
779
+ 'start','duration','end']]
780
+
781
+ #Recalibrate duration
782
+ log['duration'] = log['end'] - log['start']
783
+
784
+ #Return: JSON shifts (seperated by team)
785
+ away = log.loc[log['event_team_abbr']==info['away_team_abbr']]
786
+ home = log.loc[log['event_team_abbr']==info['home_team_abbr']]
787
+
788
+ return {'away':away,
789
+ 'home':home}
790
+
791
+ def analyze_shifts(shift, id, name, pos, team):
263
792
  #Collects teams in given shifts html (parsed by Beautiful Soup)
264
793
  #Modified version of Harry Shomer's analyze_shifts function in the hockey_scraper package
265
794
  shifts = dict()
266
795
 
267
796
  shifts['player_name'] = name.upper()
797
+ shifts['player_id'] = id
798
+ shifts['player_pos'] = pos
268
799
  shifts['period'] = '4' if shift[1] == 'OT' else '5' if shift[1] == 'SO' else shift[1]
269
- shifts['team_abbr'] = shared.get_team(team.strip(' '))
270
- shifts['start'] = shared.convert_to_seconds(shift[2].split('/')[0])
271
- shifts['duration'] = shared.convert_to_seconds(shift[4].split('/')[0])
800
+ shifts['event_team_abbr'] = get_team(team.strip(' '))
801
+ shifts['start'] = convert_to_seconds(shift[2].split('/')[0])
802
+ shifts['duration'] = convert_to_seconds(shift[4].split('/')[0])
272
803
 
273
- # I've had problems with this one...if there are no digits the time is fucked up
804
+ #Sometimes there are no digits
274
805
  if re.compile(r'\d+').findall(shift[3].split('/')[0]):
275
- shifts['end'] = shared.convert_to_seconds(shift[3].split('/')[0])
806
+ shifts['end'] = convert_to_seconds(shift[3].split('/')[0])
276
807
  else:
277
808
  shifts['end'] = shifts['start'] + shifts['duration']
278
-
279
- try:
280
- if home_team == team:
281
- shifts['player_id'] = player_ids['home'][name.upper()]['id']
282
- else:
283
- shifts['player_id'] = player_ids['away'][name.upper()]['id']
284
- except KeyError:
285
- shifts['player_id'] = None
286
-
287
809
  return shifts
288
810
 
289
- def parse_shifts(html, player_ids, game_id):
290
- #Two-stage parsing of shifts data for a single team in a provided game
291
- #Stage one: create dataframe with raw individual shifts
292
- #Stage two: convert shift events to play-by-play structure created with json_parsing
811
+ def parse_shifts_html(info,home):
812
+ #Parsing of shifts data for a single team in a provided game
293
813
  #Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package
294
814
 
815
+ #Roster info prep
816
+ roster = info['HTML_rosters']
295
817
 
818
+ rosters = roster['home' if home else 'away']
819
+
296
820
  all_shifts = []
297
- columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
821
+ #columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
298
822
 
299
- td, teams = get_soup(html)
823
+ #Retreive HTML
824
+ game_id = info['game_id']
825
+ season = info['season']
826
+ link = f"https://www.nhl.com/scores/htmlreports/{season}/T{'H' if home else 'V'}{game_id[-6:]}.HTM"
827
+ doc = rs.get(link).content
828
+ td, teams = get_soup(doc)
300
829
 
301
830
  team = teams[0]
302
- home_team = teams[1]
303
831
  players = dict()
304
832
 
305
833
  # Iterates through each player shifts table with the following data:
@@ -308,37 +836,55 @@ def parse_shifts(html, player_ids, game_id):
308
836
          t = t.get_text()
          if ',' in t:   # If a comma exists it is a player
              name = t
+
              name = name.split(',')
-             name = ' '.join([name[1].strip(' '), name[0][2:].strip(' ')])
-             #name = shared.fix_name(name)
-             #This has been excluded as means to control the differences in names between the JSON and HTML documents
-             players[name] = dict()
-             players[name]['number'] = name[0][:2].strip()
-             players[name]['shifts'] = []
+             number = int(name[0][:2].strip())
+             id = rosters[str(number)][4]
+             players[id] = dict()
+
+             #HTML shift functions assess one team at a time, which simplifies the lookup process with number to name and id
+
+             players[id]['name'] = rosters[str(number)][2]
+             players[id]['pos'] = rosters[str(number)][1]
+
+             players[id]['shifts'] = []
          else:
-             players[name]['shifts'].extend([t])
+             players[id]['shifts'].extend([t])

      for key in players.keys():
          # Create lists of shifts-table columns for analysis
          players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]

+         name = players[key]['name']
+         pos = players[key]['pos']
+
          # Parsing
-         shifts = [analyze_shifts(shift, key, team, home_team, player_ids) for shift in players[key]['shifts']]
+         shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
          all_shifts.extend(shifts)

      df = pd.DataFrame(all_shifts)
-     df['game_id'] = str(game_id)

-     shifts_raw = df[columns]
+     shifts_raw = df[df['duration'] > 0]

-     shifts_raw = shifts_raw[shifts_raw['duration'] > 0]
+     #Return: single-team individual shifts by player
+     return shifts_raw
+
+ def parse_shift_events(info,home):
+     #Given game info and home team conditional, parse and convert document to shift events congruent to html play-by-play
+
+     #Determine whether to use JSON shifts or HTML shifts
+     if len(info['json_shifts']) == 0:
+         shift = parse_shifts_html(info,home)
+     else:
+         shift = parse_shifts_json(info)['home' if home else 'away']
+
+     rosters = info['rosters']

-     # Second-stage beginds here
      # Identify shift starts for each shift event
-     shifts_on = shifts_raw.groupby(['team_abbr', 'period', 'start']).agg(
+     shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
          num_on=('player_name', 'size'),
          players_on=('player_name', lambda x: ', '.join(x)),
-         ids_on=('player_id', lambda x: ', '.join(map(str, x)))
+         ids_on=('player_id', lambda x: ', '.join(map(str,x))),
      ).reset_index()

      shifts_on = shifts_on.rename(columns={
@@ -346,10 +892,10 @@ def parse_shifts(html, player_ids, game_id):
      })

      # Identify shift stops for each shift event
-     shifts_off = shifts_raw.groupby(['team_abbr', 'period', 'end']).agg(
+     shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
          num_off=('player_name', 'size'),
          players_off=('player_name', lambda x: ', '.join(x)),
-         ids_off=('player_id', lambda x: ', '.join(map(str, x)))
+         ids_off=('player_id', lambda x: ', '.join(map(str,x))),
      ).reset_index()

      shifts_off = shifts_off.rename(columns={
@@ -357,57 +903,29 @@ def parse_shifts(html, player_ids, game_id):
      })

      # Merge and sort by time in game
-     shifts = pd.merge(shifts_on, shifts_off, on=['team_abbr', 'period', 'seconds_elapsed'], how='outer')
+     shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')

-     shifts = shifts.sort_values('seconds_elapsed')
-
-     #Modify columns of new total shifts dataframe
-     shifts['period'] = shifts['period'].astype(int)
+     shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
      shifts['event_type'] = 'change'
-     shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200 * (shifts['period']-1))
-     shifts['game_seconds_remaining'] = 3600 - shifts['seconds_elapsed']
-
-     # Handle missing values at the start and end of periods
-     shifts['players_on'] = shifts['players_on'].fillna('None')
-     shifts['players_off'] = shifts['players_off'].fillna('None')
-     shifts['ids_on'] = shifts['ids_on'].fillna('0')
-     shifts['ids_off'] = shifts['ids_off'].fillna('0')
-     shifts['num_on'] = shifts['num_on'].fillna(0).astype(int)
-     shifts['num_off'] = shifts['num_off'].fillna(0).astype(int)
-
-     #Manual Team Rename
-     shifts['team_abbr'] = shifts['team_abbr'].replace({
-         "L.A":"LAK",
-         "N.J":"NJD",
-         "S.J":"SJS",
-         "T.B":"TBL"
-     })

-     #Return: shift events formatted similarly to json pbp: shootout changes are discluded
-     return shifts.loc[shifts['period']<5].rename(columns={'team_abbr':'event_team_abbr'})
+     #Shift events similar to html (remove shootout shifts)
+     shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])
+
+     #Generate on-ice columns
+     skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
+     goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
+     team = list(shift['event_team_abbr'])[0]

- def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
-     #Given roster info (from the retreive_players function), shifts df, and team, generate on_ice columns for shift events
-     #These on-ice columns configure the on-ice players for events in the json play by play as well
      skaters = pd.DataFrame()
      goalies = pd.DataFrame()
-     if home:
-         team = {key:value for key, value in rosters['home'].items() if value['pos'] != "G"}
-     else:
-         team = {key:value for key, value in rosters['away'].items() if value['pos'] != "G"}
-
-     names = list(team.keys())
-     try: names.remove("")
-     except ValueError: ""
-
-     for player in names:
+     for player in skater_names:
          #For each player in the game, determine when they began and ended shifts.
          #With player names as columns, 1 represents a shift event a player was on the ice for while 0 represents off the ice
          on_ice = (np.cumsum(
-             shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_on']
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
              .apply(str)
              .apply(lambda x: int(bool(re.search(player, x)))) -
-             shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_off']
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
              .apply(str)
              .apply(lambda x: int(bool(re.search(player, x))))
          ))
@@ -415,32 +933,22 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):

      skaters = skaters.fillna(0).astype(int)

-
      on_skaters = (skaters == 1).stack().reset_index()
      on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()

      max_players = 6
      for i in range(max_players):
-         on_skaters[f"{'home' if home else 'away'}_on_{i+1}"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
+         on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")

      on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

-     #Repeat above process with goaltenders
-     if home:
-         team = {key:value for key, value in rosters['home'].items() if value['pos'] == "G"}
-     else:
-         team = {key:value for key, value in rosters['away'].items() if value['pos'] == "G"}
-
-     names = list(team.keys())
-     try: names.remove("")
-     except ValueError: ""
-
-     for player in names:
+     #Repeat this process with goaltenders
+     for player in goalie_names:
          on_ice = (np.cumsum(
-             shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_on']
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
              .apply(str)
              .apply(lambda x: int(bool(re.search(player, x)))) -
-             shifts.loc[(shifts['event_team_abbr'] == team_abbr), 'players_off']
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
              .apply(str)
              .apply(lambda x: int(bool(re.search(player, x))))
          ))
@@ -453,7 +961,7 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):

      max_players = 1
      for i in range(max_players):
-         on_goalies[f"{'home' if home else 'away'}_goalie"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
+         on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")

      on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

@@ -461,87 +969,100 @@ def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
      on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])

      shifts['row'] = shifts.index
-
+
+     if home:
+         shifts['home_team_abbr'] = team
+     else:
+         shifts['away_team_abbr'] = team
      #Return: shift events with newly added on-ice columns. NAN values are replaced with string "REMOVE" as means to create proper on-ice columns for json pbp
-     return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"REMOVE")
+     return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")
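The on-ice bookkeeping above reduces to a cumulative sum: +1 when a player's id appears in ids_on, -1 when it appears in ids_off, so the running total is 1 exactly while the player is on the ice. A toy version:

    on  = np.array([1, 0, 0, 0])   # player changes on at the first shift event
    off = np.array([0, 0, 1, 0])   # and off at the third
    np.cumsum(on - off)            # -> array([1, 1, 0, 0])
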
467
979
 
468
- def combine_shifts(home_shift,away_shift,json,game_id):
469
- #Given shifts html documents for home and away team, return shift events complete with both teams' changes in the provided game
470
- data = retreive_players(json,result="pos")
471
- data_id = retreive_players(json)
980
+ ## FINALIZE PBP FUNCTIONS ##
981
+ def combine_shifts(info):
982
+ #Given game info, return complete shift events
472
983
 
473
- away = parse_shifts(away_shift,data_id,game_id).sort_values(by=['period','seconds_elapsed'])
474
- home = parse_shifts(home_shift,data_id,game_id).sort_values(by=['period','seconds_elapsed'])
984
+ #JSON Prep
985
+ roster = info['rosters']
475
986
 
476
- away['row'] = away.index
477
- home['row'] = home.index
478
-
479
- away_shifts = construct_skaters_matrix(data,away,pd.json_normalize(json)['awayTeam.abbrev'][0],False).fillna("REMOVE")
480
- home_shifts = construct_skaters_matrix(data,home,pd.json_normalize(json)['homeTeam.abbrev'][0],True).fillna("REMOVE")
987
+ #Quickly combine shifts data
988
+ away = parse_shift_events(info,False)
989
+ home = parse_shift_events(info,True)
481
990
 
482
- shifts = pd.concat([away_shifts,home_shifts]).sort_values(by=['period','seconds_elapsed'])
991
+ #Combine shifts
992
+ data = pd.concat([away,home]).sort_values(['period','seconds_elapsed'])
993
+
994
+ #Add game info
995
+ info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
996
+ 'away_team_abbr','home_team_abbr']
483
997
 
484
- #Return: shifts dataframe with both teams' changes
485
- return shifts.drop(columns=['row'])
998
+ for col in info_col:
999
+ data[col] = info[col]
1000
+
1001
+ #Create player information dicts to create on-ice names
1002
+ roster['playerId'] = roster['playerId'].astype(str)
1003
+ players = roster.set_index("playerId")['full_name'].to_dict()
486
1004
 
487
- def fix_names(shifts_df,json):
488
- #Uses alternative names provided in the json to search shifts and ensure both shifts and json dataframes use the same name for each player
489
- data = pd.json_normalize(json['rosterSpots'])
490
- data['fullName'] = (data['firstName.default']+" "+data['lastName.default']).str.upper()
1005
+ for i in range(0,7):
1006
+ if i == 6:
1007
+ data['away_goalie'] = data['away_goalie_id'].replace(players)
1008
+ data['home_goalie'] = data['home_goalie_id'].replace(players)
1009
+ else:
1010
+ data[f'away_on_{i+1}'] = data[f'away_on_{i+1}_id'].replace(players)
1011
+ data[f'home_on_{i+1}'] = data[f'home_on_{i+1}_id'].replace(players)
491
1012
 
-    alt_name_col = ['firstName.cs', 'firstName.de', 'firstName.es', 'firstName.fi', 'firstName.sk', 'firstName.sv']
-    for i in range(len(alt_name_col)):
-        try: data['fullName.'+str(i+1)] = np.where(data[alt_name_col[i]].notna(),(data[alt_name_col[i]].astype(str)+" "+data['lastName.default'].astype(str)).str.upper(),np.nan)
-        except: continue
+    data = data.sort_values(['period','seconds_elapsed'])
+    #Fill on-ice columns down
+    on_ice_col = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
+                  'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
+                  'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
+                  'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',
+                  'away_goalie','home_goalie','away_goalie_id','home_goalie_id']

-    name_col = ['fullName', 'fullName.1', 'fullName.2', 'fullName.3', 'fullName.4', 'fullName.5', 'fullName.6']
+    for col in on_ice_col:
+        data[col] = data[col].ffill()

-    for name in name_col:
-        try: data[name]
-        except:
-            data[name] = np.nan
+    #Create strength state information
+    away_on = ['away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id']
+    home_on = ['home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id']
+    data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
+    data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
+    data['strength_state'] = np.where(data['event_team_abbr']==data['away_team_abbr'],data['away_skaters'].astype(str)+"v"+data['home_skaters'].astype(str),data['home_skaters'].astype(str)+"v"+data['away_skaters'].astype(str))

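To make the skater-count logic concrete: blank slots are first coerced to NaN so that notna().sum(axis=1) counts only occupied slots, and the "XvY" string is oriented from the event team's perspective. A toy two-row example (ids and team abbreviations are made up):

```python
import numpy as np
import pandas as pd

data = pd.DataFrame({
    'away_on_1_id': ['8478402', '8478402'],
    'away_on_2_id': ['8477934', ''],        # blank slot: away is down a skater in row 2
    'home_on_1_id': ['8478010', '8478010'],
    'home_on_2_id': ['8479323', '8479323'],
    'event_team_abbr': ['EDM', 'NYR'],
    'away_team_abbr': ['EDM', 'EDM'],
})

away_on = ['away_on_1_id', 'away_on_2_id']
home_on = ['home_on_1_id', 'home_on_2_id']
# Blank strings become NaN, so only filled slots are counted
data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)

# "XvY" where X is the event team's skater count
data['strength_state'] = np.where(data['event_team_abbr'] == data['away_team_abbr'],
                                  data['away_skaters'].astype(str) + 'v' + data['home_skaters'].astype(str),
                                  data['home_skaters'].astype(str) + 'v' + data['away_skaters'].astype(str))
print(data[['away_skaters', 'home_skaters', 'strength_state']])
```

Row 1 (an EDM event at full strength) reads "2v2"; row 2, where NYR has the event and EDM is short a skater, reads "2v1".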
-    names_dfs = []
-    for name in name_col[1:len(name_col)]:
-        names_dfs.append(data[[name,'fullName']].rename(columns={name:"alt",
-                                                                 "fullName":'default'}))
+    #Return: full shifts data converted to play-by-play format
+    col = [col for col in get_col() if col in data.columns.to_list()]
+    return data[col]

-    names_df = pd.concat(names_dfs)
+def combine_data(info):
+    #Given game info, return complete play-by-play data

-    replace = {}
-    for default, alt in zip(names_df['default'],names_df['alt']):
-        if alt == np.nan or alt == "" or str(alt) == 'nan':
-            continue
-        else:
-            replace.update({alt:default})
-
-    return shifts_df.replace(replace,regex=True)
+    game_id = info['game_id']

-def combine_data(json,html):
-    #Given json pbp and html shifts, return total game play-by-play data with additional and corrected details
-    df = pd.concat([json,html])
+    pbp = combine_pbp(info)
+    shifts = combine_shifts(info)

-    #Fill period_type column and assign shifts a sub-500 event code
-    df['period_type'] = np.where(df['period']<4,"REG",np.where(df['period']==4,"OT","SO"))
-    df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
+    #Combine data
+    df = pd.concat([pbp,shifts])

     #Create priority columns designed to order events that occur at the same time in a game
-    start_pri = ['period-start','game-start']
     even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
-    df['priority'] = np.where(df['event_type'].isin(start_pri),0,
-                     np.where(df['event_type'].isin(even_pri),1,
+    df['priority'] = np.where(df['event_type'].isin(even_pri),1,
                      np.where(df['event_type']=='goal',2,
                      np.where(df['event_type']=='stoppage',3,
-                     np.where(df['event_type']=='penalty',4,
-                     np.where(df['event_type']=='change',5,
+                     np.where(df['event_type']=='delayed-penalty',4,
+                     np.where(df['event_type']=='penalty',5,
                      np.where(df['event_type']=='period-end',6,
-                     np.where(df['event_type']=='game-end',7,
-                     np.where(df['event_type']=='faceoff',8,9)))))))))
+                     np.where(df['event_type']=='change',7,
+                     np.where(df['event_type']=='game-end',8,
+                     np.where(df['event_type']=='period-start',9,
+                     np.where(df['event_type']=='faceoff',10,0))))))))))
+
+    df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
+    df = df.sort_values(['period','seconds_elapsed','priority'])

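The nested np.where above is simply a priority ladder for breaking ties between events logged at the same second. A trimmed, runnable version of the same idea, showing that a goal recorded at the same second as a stoppage and the ensuing faceoff sorts first:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'period': [1, 1, 1],
    'seconds_elapsed': [300, 300, 300],
    'event_type': ['faceoff', 'stoppage', 'goal'],  # deliberately shuffled
})

# Subset of the ladder above: even-strength plays (1) before goals (2),
# goals before stoppages (3), faceoffs (10) last
even_pri = ['takeaway', 'giveaway', 'missed-shot', 'hit', 'shot-on-goal', 'blocked-shot']
df['priority'] = np.where(df['event_type'].isin(even_pri), 1,
               np.where(df['event_type'] == 'goal', 2,
               np.where(df['event_type'] == 'stoppage', 3,
               np.where(df['event_type'] == 'faceoff', 10, 0))))

# Same period and second everywhere, so priority alone decides the order:
# goal -> stoppage -> faceoff
print(df.sort_values(['period', 'seconds_elapsed', 'priority']))
```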
-    df = df.sort_values(by=['period','seconds_elapsed','priority']).reset_index()
-    #Recreate event_num column to accurately depict the order of all events, including changes
+    #Recalibrate the event_num column to accurately depict the order of all events, including changes
+    df.reset_index(inplace=True,drop=True)
     df['event_num'] = df.index+1
-    df['event_team_status'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
+    df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
     df['event_type_last'] = df['event_type'].shift(1)
     df['event_type_last_2'] = df['event_type_last'].shift(1)
     df['event_type_next'] = df['event_type'].shift(-1)
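These shift(1)/shift(-1) columns give every event its neighbors' types, which the shift-classification step in the next hunk consumes. In isolation:

```python
import pandas as pd

df = pd.DataFrame({'event_type': ['faceoff', 'shot-on-goal', 'stoppage', 'change', 'faceoff']})

# Previous, second-previous, and next event types for each row
df['event_type_last'] = df['event_type'].shift(1)
df['event_type_last_2'] = df['event_type_last'].shift(1)
df['event_type_next'] = df['event_type'].shift(-1)
print(df)
```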
@@ -550,76 +1071,36 @@ def combine_data(json,html):
     period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
     #Define shifts by "line-change" or "on-the-fly"
     df['shift_type'] = np.where(df['event_type']=='change',np.where(np.logical_or(np.logical_or(df['event_type_last'].isin(lag_events),df['event_type_last_2'].isin(lag_events),df['event_type_next'].isin(lead_events)),df['seconds_elapsed'].isin(period_end_secs)),"line-change","on-the-fly"),"")
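In plain terms: a change is a "line-change" when it borders a stoppage-like event or falls exactly on a period boundary, and "on-the-fly" otherwise. One caution for readers of the line above: np.logical_or combines only two arrays (a third positional argument is its out parameter), so the sketch below, which uses hypothetical stand-ins for the lag_events and lead_events lists defined just before this hunk, combines the three conditions with | instead:

```python
import numpy as np
import pandas as pd

# Hypothetical stand-ins for the lists defined above this hunk
lag_events = ['stoppage', 'goal', 'period-start']
lead_events = ['faceoff']
period_end_secs = [0, 1200, 2400, 3600]

df = pd.DataFrame({
    'event_type': ['change', 'change'],
    'event_type_last': ['stoppage', 'shot-on-goal'],
    'event_type_last_2': ['goal', 'hit'],
    'event_type_next': ['faceoff', 'hit'],
    'seconds_elapsed': [1200, 847],
})

# A change bordered by stoppage-like events (or sitting on a period boundary)
# is a line change; anything else happens on the fly
borders_stoppage = (df['event_type_last'].isin(lag_events)
                    | df['event_type_last_2'].isin(lag_events)
                    | df['event_type_next'].isin(lead_events))
df['shift_type'] = np.where(df['event_type'] == 'change',
                            np.where(borders_stoppage | df['seconds_elapsed'].isin(period_end_secs),
                                     'line-change', 'on-the-fly'),
                            '')
print(df[['seconds_elapsed', 'shift_type']])
```

The first change touches a stoppage and a faceoff, so it is a line change; the second occurs mid-play at 847 seconds and is on-the-fly.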
-
-    #Descriptions:
-    #HTML pbp includes descriptions for each event; without the HTML pbp, play descriptions must be generated
-    #These generated descriptions use a different, more original format than the ones provided in the HTML pbp
-    df['start_end_desc'] = np.where(df['event_type'].isin(['period-start','period-end']),df['away_team_abbr'] + "vs" + df['home_team_abbr'] + ": Period " + df['period'].astype(str) + " " + df['event_type'].str.replace("period-","",regex=True).str.capitalize(),np.nan)
-    df['take_give_desc'] = np.where(df['event_type'].isin(['takeaway','giveaway']),df['event_team_abbr'] + " " + df['event_type'].str.upper() + " by " + df['event_player_1_name'],np.nan)
-    df['stoppage_desc'] = np.where(df['event_type']=='stoppage',"STOPPAGE: " + df['reason'].str.replace("-"," ",regex=True).str.capitalize(),np.nan)
-    df['blocked_desc'] = np.where(df['event_type']=='blocked-shot',df['event_team_abbr'] + " SHOT from " + df['event_player_1_name'] + " BLOCKED by " + df['event_player_2_name'],np.nan)
-    df['missed_desc'] = np.where(df['event_type']=='missed-shot',df['event_team_abbr'] + " SHOT by " + df['event_player_1_name'] + " MISSED: " + df['reason'].astype(str).str.replace("-"," ",regex=True),np.nan)
-    df['sog_desc'] = np.where(df['event_type']=='shot-on-goal',df['event_team_abbr'] + " SHOT by " + df['event_player_1_name'] + " SAVED by " + df['event_goalie_name'],np.nan)
-    df['goal_desc'] = np.where(df['event_type']=='goal',df['event_team_abbr'] + " GOAL SCORED by " + df['event_player_1_name'],np.nan)
-    df['assist_desc'] = np.where(np.logical_and(df['event_type']=='goal',df['event_player_2_name'].notna())," ASSISTED by " + df['event_player_2_name'],"")
-    df['assist2_desc'] = np.where(np.logical_and(df['event_type']=='goal',df['event_player_3_name'].notna())," and ASSISTED by " + df['event_player_3_name'],"")
-    df['goal_desc_complete'] = df['goal_desc'] + df['assist_desc'] + df['assist2_desc']
-    df['hit_desc'] = np.where(df['event_type']=='hit',df['event_team_abbr'] + " HIT by " + df['event_player_1_name'] + " on " + df['event_player_2_name'],np.nan)
-    df['faceoff_desc'] = np.where(df['event_type']=='faceoff',"FACEOFF WON by " + df['event_player_1_name'] + " AGAINST " + df['event_player_2_name'],np.nan)
-    df['penalty_desc'] = np.where(df['event_type']=='penalty',df['event_team_abbr'] + " PENALTY on " + df['event_player_1_name'] + ": " + df['penalty_duration'].astype(str).str.replace(".0","",regex=True) + " minutes for " + df['penalty_description'].astype(str).str.replace("-"," ",regex=True).str.upper(),np.nan)
-
-    df['description'] = df['start_end_desc'].combine_first(df['take_give_desc'])\
-        .combine_first(df['stoppage_desc'])\
-        .combine_first(df['blocked_desc'])\
-        .combine_first(df['missed_desc'])\
-        .combine_first(df['sog_desc'])\
-        .combine_first(df['goal_desc_complete'])\
-        .combine_first(df['hit_desc'])\
-        .combine_first(df['faceoff_desc'])\
-        .combine_first(df['penalty_desc'])
-    ffill_col = ['season','season_type','game_id','game_date',
-                 "start_time","venue","venue_location",
-                 'away_team_abbr','home_team_abbr','home_team_defending_side',
-                 'away_score','away_fenwick',
-                 'home_score','home_fenwick',
-                 'away_goalie','home_goalie']
-    away_on = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6']
-    home_on = ['home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6']
-
-    #Forward fill appropriate columns
-    for col in ffill_col+away_on+home_on:
+    df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
+    try:
+        df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
+    except:
+        pass
+
+    #Add time since last event and overall event length
+    df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
+    df['event_length'] = df['seconds_since_last'].shift(-1)
+
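seconds_since_last is a backward difference, and event_length is that same difference shifted up one row, so each event's length is the gap to the next event. A quick check with toy timestamps:

```python
import pandas as pd

df = pd.DataFrame({'seconds_elapsed': [0, 12, 12, 45]})
# Gap back to the previous event...
df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
# ...shifted up one row becomes each event's own duration
df['event_length'] = df['seconds_since_last'].shift(-1)
print(df)
# The event at 0 lasts 12s; the two events at 12s have lengths 0 and 33
```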
+    #Add venue-fixed strength state column (always away skaters v home skaters)
+    df['strength_state_venue'] = df['away_skaters'].astype(str)+'v'+df['home_skaters'].astype(str)
+
+    #Retrieve coaches
+    coaches = info['coaches']
+    if not coaches:
+        df['away_coach'] = ""
+        df['home_coach'] = ""
+        df['event_coach'] = ""
+    else:
+        df['away_coach'] = coaches['away']
+        df['home_coach'] = coaches['home']
+        df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))
+
+    #Forward fill as necessary
+    cols = ['period_type','home_team_defending_side','away_score','away_fenwick','home_score','home_fenwick','away_coach','home_coach']
+    for col in cols:
+        try: df[col]
+        except: df[col] = ""
         df[col] = df[col].ffill()

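The try/except above acts as a guard: any column missing from a particular feed is created as empty strings before the shared forward-fill runs, so nothing raises. The same guard in isolation (written here with an explicit KeyError rather than a bare except):

```python
import pandas as pd

df = pd.DataFrame({'away_score': [0, None, 1]})

for col in ['away_score', 'home_coach']:    # 'home_coach' is absent here
    try:
        df[col]
    except KeyError:
        df[col] = ''                        # create the missing column as blanks
    df[col] = df[col].ffill()               # then carry known values downward
print(df)
```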
-    #Now that forward fill is complete, replace "REMOVE" with NaN
-    df.replace("REMOVE",np.nan,inplace=True)
-
-    #Reconfigure strength state and situation codes
-    df['away_skaters'] = df[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
-    df['home_skaters'] = df[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
-    df['away_goalie_in'] = np.where(df['away_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
-    df['home_goalie_in'] = np.where(df['home_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
-
-    df['event_skaters'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['home_skaters'],df['away_skaters'])
-    df['event_skaters_against'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['away_skaters'],df['home_skaters'])
-
-    df['strength_state'] = df['event_skaters'].astype(str) + "v" + df['event_skaters_against'].astype(str)
-    df['situation_code'] = np.where(df['situation_code'].isna(),df['away_goalie_in'].astype(str) + df['away_skaters'].astype(str) + df['home_skaters'].astype(str) + df['home_goalie_in'].astype(str),df['situation_code'])
-
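For context on the removed block: as this code interprets it, an NHL situation code is four digits, namely away goalie in net (0/1), away skaters, home skaters, home goalie in net, so 5v5 with both goalies is "1551" and a 6v5 away empty-net attack is "0651". A sketch of the removed reconstruction for rows where the feed omitted the code (toy values):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'situation_code': [np.nan, '1551'],
    'away_goalie_in': [0, 1],
    'away_skaters': [6, 5],
    'home_skaters': [5, 5],
    'home_goalie_in': [1, 1],
})

# Rebuild the four-digit code only where the feed left it missing
df['situation_code'] = np.where(df['situation_code'].isna(),
                                df['away_goalie_in'].astype(str) + df['away_skaters'].astype(str)
                                + df['home_skaters'].astype(str) + df['home_goalie_in'].astype(str),
                                df['situation_code'])
print(df['situation_code'].tolist())  # ['0651', '1551']
```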
-    col = [
-        'season','season_type','game_id','game_date',"start_time","venue","venue_location",
-        'away_team_abbr','home_team_abbr','event_num','period','period_type',
-        'seconds_elapsed',"situation_code","strength_state","home_team_defending_side","shift_type",
-        "event_type_code","event_type","description","reason","penalty_duration","penalty_description",
-        "event_team_abbr",'num_on','players_on','ids_on','num_off','players_off','ids_off',
-        "event_team_status","event_player_1_id","event_player_2_id","event_player_3_id",
-        "event_player_1_name","event_player_2_name","event_player_3_name","event_player_1_pos","event_player_2_pos",
-        "event_player_3_pos","event_goalie_id",
-        "event_goalie_name","shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
-        "event_skaters","away_skaters","home_skaters",
-        "event_distance","event_angle","away_score","home_score","away_fenwick","home_fenwick",
-        "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
-        "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie"
-    ]
-
     #Return: complete play-by-play with all important data for each event in a provided game
-    return df[col].replace(r'^\s*$', np.nan, regex=True)
+    return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)
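The closing return keeps only those columns from the shared get_col() master list that actually materialized, preserving the master order, and converts whitespace-only strings back to NaN. The idiom in isolation, with a tiny stand-in for get_col():

```python
import numpy as np
import pandas as pd

def get_col():
    # Tiny stand-in for the package's master column list
    return ['game_id', 'event_type', 'x', 'y']

df = pd.DataFrame({'event_type': ['goal'], 'game_id': [2024020001], 'extra': ['drop me']})

# Keep only known columns that exist, in master order,
# then turn whitespace-only strings back into NaN
cols = [col for col in get_col() if col in df.columns.to_list()]
out = df[cols].replace(r'^\s*$', np.nan, regex=True)
print(out.columns.tolist())  # ['game_id', 'event_type']
```

This tolerates feeds where optional columns (coaches, coordinates, and the like) never appear, at the cost of output frames whose column sets can vary from game to game.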