wsba-hockey 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1104 @@
1
+ import re
2
+ from bs4 import BeautifulSoup
3
+ import requests as rs
4
+ import json as json_lib
5
+ from tools.utils.shared import *
6
+ import numpy as np
7
+ import pandas as pd
8
+ import warnings
9
+ warnings.filterwarnings('ignore')
10
+
11
+ ### SCRAPING FUNCTIONS ###
12
+ # Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #
13
+
14
+ ### JSON GAME INFO ###
15
def get_game_roster(json):
    """Given raw NHL API json data, return the game roster as a DataFrame.

    Adds upper-cased 'fullName.*' columns — the default spelling plus any
    available alternate-language first names — which are later used to match
    player names appearing in the shift HTML.
    """
    roster = pd.json_normalize(json['rosterSpots'])

    alt_name_col = ['firstName.cs','firstName.de','firstName.es','firstName.fi','firstName.sk','firstName.sv']

    #Add alt-names pattern (appropriately replaces names in shift html)
    roster['fullName.default'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()
    for col in alt_name_col:
        #Alternate-language name columns only exist for some games; skip absent ones
        #explicitly instead of relying on a bare except around the lookup.
        if col not in roster.columns:
            continue
        #str.removeprefix avoids the unescaped-dot regex of re.sub('firstName.', ...)
        #and the nested same-quote f-string, which is a SyntaxError before Python 3.12.
        suffix = col.removeprefix('firstName.')
        roster[f'fullName.{suffix}'] = (roster[col] + " " + roster['lastName.default']).str.upper()
    return roster
29
+
30
def get_game_info(json):
    """Given raw json data, return a dict of game metadata plus the roster."""

    base = pd.json_normalize(json)

    #Output key -> flattened json column it is sourced from.
    field_map = {
        'game_id': 'id',
        'season': 'season',
        'season_type': 'gameType',
        'game_date': 'gameDate',
        'start_time': 'startTimeUTC',
        'venue': 'venue.default',
        'venue_location': 'venueLocation.default',
        'away_team_id': 'awayTeam.id',
        'away_team_abbr': 'awayTeam.abbrev',
        'home_team_id': 'homeTeam.id',
        'home_team_abbr': 'homeTeam.abbrev',
    }
    info = {key: base[col][0] for key, col in field_map.items()}

    #Attach the parsed roster DataFrame under the same key as before.
    info['rosters'] = get_game_roster(json)
    return info
58
+
59
+ ## HTML PBP DATA ###
60
def strip_html_pbp(td,json):
    """Strip one 8-cell HTML pbp row down to plain values.

    Harry Shomer's Code (modified).

    td: list of 8 BeautifulSoup cells for a single event row.
    json: raw NHL API game json, used to map sweater numbers to player
          ids/names via the roster.
    Returns td mutated in place: text for most cells, a trimmed time string
    for cell 3, and for cells 6/7 a list of [name, number, position, id]
    per on-ice player.
    """

    #JSON Prep
    info = get_game_info(json)
    roster = info['rosters']

    away = info['away_team_id'] #Away team in the HTML is the seventh column
    home = info['home_team_id'] #Home team in the HTML is the eighth column
    away_players = roster.loc[roster['teamId']==away][['fullName.default','playerId','sweaterNumber']]
    home_players = roster.loc[roster['teamId']==home][['fullName.default','playerId','sweaterNumber']]

    #Sweater number -> playerId / fullName lookup, per side.
    rosters = {"away":away_players.set_index("sweaterNumber")[['playerId','fullName.default']].to_dict(),
               "home":home_players.set_index("sweaterNumber")[['playerId','fullName.default']].to_dict()
               }

    #HTML Parsing
    for y in range(len(td)):
        # Get the 'br' tag for the time column...this get's us time remaining instead of elapsed and remaining combined
        if y == 3:
            td[y] = td[y].get_text()   # This gets us elapsed and remaining combined-< 3:0017:00
            index = td[y].find(':')
            #Keep only 'M:SS' / 'MM:SS' elapsed time (up to 2 chars past the colon).
            td[y] = td[y][:index+3]
        elif (y == 6 or y == 7) and td[0] != '#':
            # 6 & 7-> These are the player 1 ice one's
            # The second statement controls for when it's just a header
            baz = td[y].find_all('td')
            bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]  # Because of previous step we get repeats...delete some

            # The setup in the list is now: Name/Number->Position->Blank...and repeat
            # Now strip all the html
            players = []
            for i in range(len(bar)):
                if i % 3 == 0:
                    try:
                        #Using the supplied json we can bind player name and id to number and team
                        #Find number and team of player then lookup roster dictionary

                        number = bar[i].get_text().strip('\n')  # Get number and strip leading/trailing newlines
                        if y == 6:
                            team = 'away'
                        else:
                            team = 'home'

                        id = rosters[team]['playerId'][int(number)]
                        name = rosters[team]['fullName.default'][int(number)]

                    except KeyError:
                        #Number not found on the roster; blanked entries are
                        #skipped when appending below.
                        name = ''
                        number = ''
                        id = ''
                elif i % 3 == 1:
                    if name != '':
                        position = bar[i].get_text()
                        players.append([name, number, position, id])

            td[y] = players
        else:
            td[y] = td[y].get_text()

    return td
121
+
122
+
123
def clean_html_pbp(html,json):
    """Parse raw html pbp into per-event rows of stripped values.

    Harry Shomer's Code (modified). Each row in the html table spans 8 cells;
    the flat soup cell list is chunked accordingly, then each chunk is
    reduced to plain values via strip_html_pbp.
    """
    soup = get_contents(html)

    #Group the flat cell list into consecutive 8-cell event rows.
    rows = []
    start = 0
    while start < len(soup):
        rows.append(soup[start:start + 8])
        start += 8

    return [strip_html_pbp(row, json) for row in rows]
133
+
134
def get_html_roster(html,json,teams):
    """Given raw html, game json, and a teams dict, return available roster data.

    Returns {team_abbr: {desc_id: [number, position, name, team, id]}} where
    desc_id is the '#<num> <LAST>' form used in event description strings.
    """
    events = clean_html_pbp(html,json)

    #Roster dict
    roster_dict = {teams['away']:{},
                   teams['home']:{}}

    for event in events:
        #Header rows carry no on-ice player data.
        if event[0] == "#":
            continue
        #Column 6 holds away on-ice players, column 7 home on-ice players.
        #The two passes were previously duplicated inline; a single helper
        #now handles both sides.
        _add_html_roster_players(roster_dict, event[6], teams['away'])
        _add_html_roster_players(roster_dict, event[7], teams['home'])

    return roster_dict

def _add_html_roster_players(roster_dict, on_ice, team):
    #Add each on-ice player ([name, number, position, id]) to roster_dict under
    #the '#<num> <LAST>' key that appears in html event descriptions.
    for player, num, pos, pid in on_ice:
        parts = player.split()

        #Accounting for players with three or more parts in their name
        if len(parts) > 2:
            last = " ".join(parts[1:])
        else:
            last = parts[-1]

        desc_id = f'#{num} {last}'
        roster_dict[team][desc_id] = [num, pos, player, team, pid]
183
+
184
def get_json_coaches(game_id):
    """Given game id, return head coaches for away and home team.

    Returns {'away': NAME, 'home': NAME} (upper-cased) or {} when the
    right-rail endpoint has no coach information for the game.
    """
    #Fetch the right-rail endpoint for this game
    response = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
    data = response['gameInfo']

    #Some games lack coach data entirely; treat a missing key as "no coaches"
    try:
        coaches = {
            'away': data['awayTeam']['headCoach']['default'].upper(),
            'home': data['homeTeam']['headCoach']['default'].upper(),
        }
    except KeyError:
        return {}

    #Return: dict with coaches
    return coaches
203
+
204
def parse_html_event(event,roster,teams):
    """Given one event row from the html events list and the game roster,
    return the event as a single-row DataFrame.

    event: 8-element list produced by clean_html_pbp.
    roster: dict from get_html_roster ('#<num> <LAST>' -> player info).
    teams: {'away': abbr, 'home': abbr}. NOTE: mutated in place by the
           team-abbreviation normalization below.
    Returns an empty DataFrame for header rows and excluded event types.
    """

    events_dict = dict()
    if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM','SPC','PBOX']:
        return pd.DataFrame()
    else:
        #Event info
        events_dict['event_num'] = int(event[0])
        events_dict['period'] = int(event[1])
        #Replace non-breaking spaces coming from the html.
        events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
        events_dict['period_time_elapsed'] = event[3]
        #Assumes 20-minute (1200s) periods — TODO confirm for shootouts.
        events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
        events_dict['event_type'] = event[4]
        desc = re.sub(u'\xa0'," ",event[5])
        events_dict['description'] = desc

        #Shot type is the second comma-separated token for shot-like events.
        events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
        zone = [x for x in desc.split(',') if 'Zone' in x]
        if not zone:
            events_dict['zone_code'] = None
        elif zone[0].find("Off") != -1:
            events_dict['zone_code'] = 'O'
        elif zone[0].find("Neu") != -1:
            events_dict['zone_code'] = 'N'
        elif zone[0].find("Def") != -1:
            events_dict['zone_code'] = 'D'

        #Convert team names for compatibility (html uses 'L.A' style, json 'LAK')
        replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
        for name, repl in replace:
            teams['away'] = teams['away'].replace(repl,name)
            teams['home'] = teams['home'].replace(repl,name)
            desc = desc.replace(repl,name)

        #First three description chars are the acting team's abbreviation.
        event_team = desc[0:3] if desc[0:3] in [teams['away'],teams['home']] else ""
        events_dict['event_team_abbr'] = event_team


        events_dict['away_team_abbr'] = teams['away']
        events_dict['home_team_abbr'] = teams['home']
        event_skaters = []

        away_skaters = 0
        away_goalie = 0
        #Away on-ice
        for i in range(len(event[6])):
            player = event[6][i][0]
            num = event[6][i][1]
            pos = event[6][i][2]
            id = event[6][i][3]

            if pos == 'G':
                events_dict['away_goalie'] = player
                events_dict['away_goalie_id'] = id
                away_goalie += 1
            else:
                events_dict[f'away_on_{i+1}'] = player
                events_dict[f'away_on_{i+1}_id'] = id
                away_skaters += 1

        home_skaters = 0
        home_goalie = 0
        #Home on-ice
        for i in range(len(event[7])):
            player = event[7][i][0]
            num = event[7][i][1]
            pos = event[7][i][2]
            id = event[7][i][3]

            if pos == 'G':
                events_dict['home_goalie'] = player
                events_dict['home_goalie_id'] = id
                home_goalie += 1
            else:
                events_dict[f'home_on_{i+1}'] = player
                events_dict[f'home_on_{i+1}_id'] = id
                home_skaters += 1

        #Determine parsing route based on event (single player events are left)
        if event[4] in ['FAC','HIT','BLOCK','PENL']:
            #Regex to find team and player number involved (finds all for each event)
            #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
            regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
            fac = regex.findall(desc)

            #First/second '(team, number)' match, if present.
            try: team_1,num_1 = fac[0]
            except: team_1 = ''
            try: team_2,num_2 = fac[1]
            except: team_2 = ''

            try: rost_1 = roster[team_1]
            except: rost_1 = {}
            try: rost_2 = roster[team_2]
            except: rost_2 = {}

            #Filter incorrectly parsed teams
            repl = []
            for team, num in fac:
                if team in [teams['home'],teams['away']]:
                    repl.append((team,num))

            fac = repl

            #Determine append order (really only applies to faceoffs)
            if len(fac) == 0:
                #No data
                ""
            else:
                if len(fac) == 1:
                    #Find event players using given roster
                    for desc_id,info in rost_1.items():
                        if desc_id in desc:
                            event_skaters.append([info[2],info[1],info[4]])
                else:
                    #Event team's players are listed first.
                    if team_1 == event_team:
                        for desc_id,info in rost_1.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
                        for desc_id,info in rost_2.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
                    else:
                        for desc_id,info in rost_2.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
                        for desc_id,info in rost_1.items():
                            if desc_id in desc:
                                event_skaters.append([info[2],info[1],info[4]])
        else:
            #Parse goal
            if event[4] == 'GOAL':
                regex = re.compile(r'#(\d+)\s+')
                goal = regex.findall(desc)

                goal_team = roster[event_team]
                #Search through individual element in goal (adds skaters in order from goal, first assist, second assist)
                for point in goal:
                    for info in goal_team.values():
                        if info[0] == point:
                            event_skaters.append([info[2],info[1],info[4]])
                            break
            else:
                #Parse single player or no player events
                combined = roster[teams['away']] | roster[teams['home']]
                for desc_id,info in combined.items():
                    if desc_id in desc:
                        event_skaters.append([info[2],info[1],info[4]])

        for i in range(len(event_skaters)):
            events_dict[f'event_player_{i+1}_name'] = event_skaters[i][0]
            events_dict[f'event_player_{i+1}_id'] = event_skaters[i][2]
            events_dict[f'event_player_{i+1}_pos'] = event_skaters[i][1]

        events_dict['away_skaters'] = away_skaters
        events_dict['home_skaters'] = home_skaters
        events_dict['away_goalie_in'] = away_goalie
        events_dict['home_goalie_in'] = home_goalie

        #Strength state from the event team's perspective (e.g. '5v4').
        event_skaters = away_skaters if teams['away'] == event_team else home_skaters
        event_skaters_against = away_skaters if teams['home'] == event_team else home_skaters
        events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
        events_dict['event_skaters'] = np.where(event_team == teams['home'],home_skaters,away_skaters)

        #Return: dataframe of event in a single row
        return (pd.DataFrame([events_dict]))
370
+
371
def parse_html(game_id,html,json):
    """Given the game id, raw html document for a provided game, and json data,
    return the parsed HTML play-by-play as a DataFrame."""

    #Retrieve cleaned html data (from Harry Shomer's hockey_scraper package)
    events = clean_html_pbp(html,json)

    json_info = pd.json_normalize(json)
    teams = {
        'away':json_info['awayTeam.abbrev'][0],
        'home':json_info['homeTeam.abbrev'][0]
    }

    roster = get_html_roster(html,json,teams)

    #Parse every event row (header rows / excluded types yield empty frames)
    event_log = [parse_html_event(event,roster,teams) for event in events]

    data = pd.concat(event_log)
    #Translate HTML event codes to the NHL API naming convention
    data['event_type'] = data['event_type'].replace({
        "PGSTR": "pre-game-start",
        "PGEND": "pre-game-end",
        'GSTR':"game-start",
        "ANTHEM":"anthem",
        "PSTR":"period-start",
        'FAC':"faceoff",
        "SHOT":"shot-on-goal",
        "BLOCK":"blocked-shot",
        "STOP":"stoppage",
        "MISS":"missed-shot",
        "HIT":"hit",
        "GOAL":"goal",
        "GIVE":"giveaway",
        "TAKE":"takeaway",
        "DELPEN":"delayed-penalty",
        "PENL":"penalty",
        "CHL":"challenge",
        "PEND":"period-end",
        "GEND":"game-end"
    })

    #Columns expected downstream; not every game populates all of them
    check_col = ['event_player_1_id','event_player_2_id','event_player_3_id',
                 'away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
                 'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
                 'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
                 'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id']

    #Explicit membership test replaces the former try/bare-except column probing
    for col in check_col:
        if col not in data.columns:
            data[col] = ""

    #Return: HTML play-by-play
    return data
424
+
425
+ ### JSON PBP DATA ###
426
def parse_json(json):
    """Given json data from an NHL API call, return play-by-play data.

    Raises ValueError when the game has not been played yet ('FUT' state).
    Returns a DataFrame with one row per play, renamed to WSBA conventions,
    with fixed/adjusted coordinates and running score/fenwick columns.
    """

    events = pd.json_normalize(json['plays']).reset_index(drop=True)
    info = pd.json_normalize(json)
    roster =get_game_roster(json)

    #Return error if game is set in the future
    if info['gameState'][0] == 'FUT':
        raise ValueError(f"Game {info['id'][0]} has not occured yet.")

    away = info['awayTeam.id'][0]
    home = info['homeTeam.id'][0]
    #teamId -> abbreviation, used to map event owner ids below.
    teams = {
        away:info['awayTeam.abbrev'][0],
        home:info['homeTeam.abbrev'][0]
    }

    #Create player information dicts used to create event_player columns
    players = {}
    for id, player in zip(list(roster['playerId']),list(roster['fullName.default'])):
        players.update({id:player.upper()})

    #Test columns: ensure every column referenced below exists even when this
    #game's feed omits it.
    cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']

    for col in cols:
        try:events[col]
        except:
            events[col]=""

    #Event_player_columns include players in a given set of events; the higher the number, the greater the importance the event player was to the play
    events['event_player_1_id'] = events['details.winningPlayerId'].combine_first(events['details.scoringPlayerId'])\
                                 .combine_first(events['details.shootingPlayerId'])\
                                 .combine_first(events['details.playerId'])\
                                 .combine_first(events['details.hittingPlayerId'])\
                                 .combine_first(events['details.committedByPlayerId'])

    events['event_player_2_id'] = events['details.losingPlayerId'].combine_first(events['details.assist1PlayerId'])\
                                 .combine_first(events['details.hitteePlayerId'])\
                                 .combine_first(events['details.drawnByPlayerId'])\
                                 .combine_first(events['details.blockingPlayerId'])

    events['event_player_3_id'] = events['details.assist2PlayerId']

    events['event_team_status'] = np.where(events['details.eventOwnerTeamId']==home,"home","away")

    #Coordinate adjustments:
    #The WSBA NHL Scraper includes three sets of coordinates per event:
    # x, y - Raw coordinates from JSON pbp
    # x_fixed, y_fixed - Coordinates fixed to the right side of the ice (x is always greater than 0)
    # x_adj, y_adj - Adjusted coordinates configuring away events with negative x values while home events are always positive

    #Some games (mostly preseason and all star games) do not include coordinates.
    try:
        events['x_fixed'] = abs(events['details.xCoord'])
        events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
        events['x_adj'] = np.where(events['event_team_status']=="home",events['x_fixed'],-events['x_fixed'])
        events['y_adj'] = np.where(events['event_team_status']=="home",events['y_fixed'],-events['y_fixed'])
        #Distance/angle measured from the right goal line (x = 89).
        events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
        events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))
    except TypeError:
        print(f"No coordinates found for game {info['id'][0]}...")

        events['x_fixed'] = np.nan
        events['y_fixed'] = np.nan
        events['x_adj'] = np.nan
        events['y_adj'] = np.nan
        events['event_distance'] = np.nan
        events['event_angle'] = np.nan


    events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(teams)
    events['event_goalie'] = events['details.goalieInNetId'].replace(players)

    #Rename columns to follow WSBA naming conventions
    events = events.rename(columns={
        "eventId":"event_id",
        "periodDescriptor.number":"period",
        "periodDescriptor.periodType":"period_type",
        "timeInPeriod":"period_time_elasped",
        "timeRemaining":"period_time_remaining",
        "situationCode":"situation_code",
        "homeTeamDefendingSide":"home_team_defending_side",
        "typeCode":"event_type_code",
        "typeDescKey":"event_type",
        "details.shotType":"shot_type",
        "details.duration":"penalty_duration",
        "details.descKey":"penalty_description",
        "details.reason":"reason",
        "details.zoneCode":"zone_code",
        "details.xCoord":"x",
        "details.yCoord":"y",
        "details.goalieInNetId": "event_goalie_id",
        "details.awaySOG":"away_SOG",
        "details.homeSOG":"home_SOG"
    })

    #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
    events['period_time_simple'] = events['period_time_elasped'].str.replace(":","",regex=True)
    #Handles both 'M:SS' (3 digits) and 'MM:SS' (4 digits) clock strings.
    events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
                                                ((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
                                                ((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
    events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']

    events = events.loc[(events['event_type']!="")]

    #Assign score and fenwick for each event (running totals up to and
    #including the current event)
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    ag = 0
    ags = []
    hg = 0
    hgs = []

    af = 0
    afs = []
    hf = 0
    hfs = []
    for event,team in zip(list(events['event_type']),list(events['event_team_status'])):
        if event in fenwick_events:
            if team == "home":
                hf += 1
                if event == 'goal':
                    hg += 1
            else:
                af += 1
                if event == 'goal':
                    ag += 1

        ags.append(ag)
        hgs.append(hg)
        afs.append(af)
        hfs.append(hf)

    events['away_score'] = ags
    events['home_score'] = hgs
    events['away_fenwick'] = afs
    events['home_fenwick'] = hfs

    #Return: dataframe with parsed game
    return events
567
+
568
def combine_pbp(game_id,html,json):
    """Given game id, html data, and json data, return complete play-by-play
    data for the provided game.

    Early seasons (2005-06 through 2009-10) lack json coordinates, so HTML
    events are merged with ESPN data instead of the NHL json feed.
    """

    html_pbp = parse_html(game_id,html,json)
    info = get_game_info(json)

    #Route data combining - json if season is after 2009-2010:
    if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
        #ESPN x HTML
        espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'})
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']

        df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)

    else:
        #JSON x HTML
        json_pbp = parse_json(json)
        #Modify merge conditions and merge pbps; these columns come from the
        #json side after merging, so drop the html versions first.
        merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
        html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'])

        df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col)

    #Add game info
    info_col = ['season','season_type','game_id','game_date',"start_time","venue","venue_location",
                'away_team_abbr','home_team_abbr']

    for col in info_col:
        df[col] = info[col]

    #Fill period_type column and assign shifts a sub-500 event code
    #(period 5 of a regular-season game — season_type 2 — is a shootout)
    df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
    #event_type_code is only present on the json-merge path; best-effort.
    try: df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
    except:
        ""
    df = df.sort_values(['period','seconds_elapsed']).reset_index()

    df['event_team_status'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))

    #Keep only the standard WSBA columns that exist in this frame.
    col = [col for col in get_col() if col in df.columns.to_list()]
    #Return: complete play-by-play information for provided game
    return df[col]
610
+
611
+ ### ESPN SCRAPING FUNCTIONS ###
612
def espn_game_id(date,away,home):
    """Given a date formatted as YYYY-MM-DD and team abbreviations,
    return the matching game id from the ESPN schedule."""
    date = date.replace("-","")

    #Fetch the ESPN scoreboard for the given date
    api = f"https://site.api.espn.com/apis/site/v2/sports/hockey/nhl/scoreboard?dates={date}"
    schedule = pd.json_normalize(rs.get(api).json()['events'])

    #Derive team abbreviations from the 'AAA @ BBB' short name
    schedule['away_team_abbr'] = schedule['shortName'].str.slice(stop=3).str.strip(" ")
    schedule['home_team_abbr'] = schedule['shortName'].str.slice(start=-3).str.strip(" ")

    #Normalize ESPN's two-letter abbreviations to NHL three-letter codes
    schedule = schedule.replace({
        "LA":"LAK",
        "NJ":"NJD",
        "SJ":"SJS",
        "TB":"TBL",
    })

    #Locate the requested matchup and pull its id
    matchup = (schedule['away_team_abbr']==away) & (schedule['home_team_abbr']==home)
    game_id = schedule.loc[matchup,'id'].tolist()[0]

    #Return: ESPN game id
    return game_id
638
+
639
def parse_espn(date,away,home):
    """Given a date formatted as YYYY-MM-DD and team abbreviations,
    return game events scraped from the ESPN play-by-play page.

    Code modified from Patrick Bacon. The page embeds json blobs
    ('playGrps' for clocks, 'plays' for coordinates) which are extracted
    by string-splitting the rendered html.
    """
    game_id = espn_game_id(date,away,home)
    url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{game_id}'

    #Retrieve game events as json
    page = rs.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
    soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
    json = json_lib.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])

    #DataFrame of time-related info for events (one playGrps entry per period)
    clock_df = pd.DataFrame()

    for period in range(0, len(json)):
        clock_df = clock_df._append(pd.DataFrame(json[period]))

    clock_df = clock_df[~pd.isna(clock_df.clock)]

    # Needed to add .split(',"st":3')[0] for playoffs

    #DataFrame of coordinates for events
    coords_df = pd.DataFrame(json_lib.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))

    clock_df = clock_df.assign(
        clock = clock_df.clock.apply(lambda x: x['displayValue'])
    )

    coords_df = coords_df.assign(
        coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
        coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
        event_player_1_name = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
    )

    #Combine on the shared event 'id'
    espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])

    espn_events = espn_events.assign(
        period = espn_events['period'].apply(lambda x: x['number']),
        minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
        seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
        event_type = espn_events['type'].apply(lambda x: x['txt'])
    )

    #Faceoffs with no coordinates default to center ice (0, 0).
    espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
                                                         (espn_events.event_type=='Face Off'), 0, espn_events.coords_x
                                                         ),
                                     coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
                                                         (espn_events.event_type=='Face Off'), 0, espn_events.coords_y))

    #Drop events lacking coordinates or an identified primary player.
    espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1_name))]

    espn_events = espn_events.assign(
        coords_x = espn_events.coords_x.astype(int),
        coords_y = espn_events.coords_y.astype(int)
    )

    #Rename events
    #The turnover event includes just one player in the event information, meaning takeaways will have no coordinates for play-by-plays created by ESPN scraping
    espn_events['event_type'] = espn_events['event_type'].replace({
        "Face Off":'faceoff',
        "Hit":'hit',
        "Shot":'shot-on-goal',
        "Missed":'missed-shot',
        "Blocked":'blocked-shot',
        "Goal":'goal',
        "Turnover":'giveaway',
        "Delayed Penalty":'delayed-penalty',
        "Penalty":'penalty',
    })

    #Period time adjustments (only 'seconds_elapsed' is included in the resulting data);
    #handles 'M:SS' (3 digits) and 'MM:SS' (4 digits) clock strings
    espn_events['period_time_simple'] = espn_events['clock'].str.replace(":","",regex=True)
    espn_events['period_seconds_elapsed'] = np.where(espn_events['period_time_simple'].str.len()==3,
                                                     ((espn_events['period_time_simple'].str[0].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)),
                                                     ((espn_events['period_time_simple'].str[0:2].astype(int)*60)+espn_events['period_time_simple'].str[-2:].astype(int)))
    espn_events['seconds_elapsed'] = ((espn_events['period']-1)*1200)+espn_events['period_seconds_elapsed']

    espn_events = espn_events.rename(columns = {'text':'description'})

    #Add event team
    espn_events['event_team_abbr'] = espn_events['homeAway'].replace({
        "away":away,
        "home":home
    })

    #Some games (mostly preseason and all star games) do not include coordinates.
    try:
        espn_events['x_fixed'] = abs(espn_events['coords_x'])
        espn_events['y_fixed'] = np.where(espn_events['coords_x']<0,-espn_events['coords_y'],espn_events['coords_y'])
        espn_events['x_adj'] = np.where(espn_events['homeAway']=="home",espn_events['x_fixed'],-espn_events['x_fixed'])
        espn_events['y_adj'] = np.where(espn_events['homeAway']=="home",espn_events['y_fixed'],-espn_events['y_fixed'])
        #Distance/angle measured from the right goal line (x = 89).
        espn_events['event_distance'] = np.sqrt(((89 - espn_events['x_fixed'])**2) + (espn_events['y_fixed']**2))
        espn_events['event_angle'] = np.degrees(np.arctan2(abs(espn_events['y_fixed']), abs(89 - espn_events['x_fixed'])))
    except TypeError:
        print(f"No coordinates found for ESPN game...")

        espn_events['x_fixed'] = np.nan
        espn_events['y_fixed'] = np.nan
        espn_events['x_adj'] = np.nan
        espn_events['y_adj'] = np.nan
        espn_events['event_distance'] = np.nan
        espn_events['event_angle'] = np.nan

    #Assign score and fenwick for each event (running totals)
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    ag = 0
    ags = []
    hg = 0
    hgs = []

    af = 0
    afs = []
    hf = 0
    hfs = []
    for event,team in zip(list(espn_events['event_type']),list(espn_events['homeAway'])):
        if event in fenwick_events:
            if team == "home":
                hf += 1
                if event == 'goal':
                    hg += 1
            else:
                af += 1
                if event == 'goal':
                    ag += 1

        ags.append(ag)
        hgs.append(hg)
        afs.append(af)
        hfs.append(hf)

    espn_events['away_score'] = ags
    espn_events['home_score'] = hgs
    espn_events['away_fenwick'] = afs
    espn_events['home_fenwick'] = hfs
    #Return: play-by-play events in supplied game from ESPN
    return espn_events
777
+
778
+ ### SHIFT SCRAPING FUNCTIONS ###
779
def analyze_shifts(shift, id, name, pos, team):
    """Build a single shift record from one row of the shifts html.

    Modified version of Harry Shomer's analyze_shifts function in the
    hockey_scraper package.

    shift: list of cell strings for one shift row (shift #, period, start,
           end, duration — presumably in that order; confirm against caller).
    id/name/pos/team: the player the row belongs to.
    Returns a dict with player info, period, team, and start/end/duration
    in seconds.
    """
    shifts = dict()

    shifts['player_name'] = name.upper()
    shifts['player_id'] = id
    #OT/SO labels are mapped to numeric periods 4 and 5.
    shifts['period'] = '4' if shift[1] == 'OT' else '5' if shift[1] == 'SO' else shift[1]
    shifts['player_pos'] = pos
    shifts['event_team_abbr'] = get_team(team.strip(' '))
    #Cells hold 'elapsed / remaining'; keep the elapsed part.
    shifts['start'] = convert_to_seconds(shift[2].split('/')[0])
    shifts['duration'] = convert_to_seconds(shift[4].split('/')[0])

    #The end-time cell is sometimes non-numeric; in that case derive the end
    #from start + duration instead.
    if re.compile(r'\d+').findall(shift[3].split('/')[0]):
        shifts['end'] = convert_to_seconds(shift[3].split('/')[0])
    else:
        shifts['end'] = shifts['start'] + shifts['duration']
    return shifts
798
+
799
def parse_shifts_html(html, json):
    """Parse a single team's shifts HTML document into per-player shift rows.

    Modified version of Harry Shomer's parse_shifts function in the
    hockey_scraper package.

    Returns a DataFrame of individual player shifts with zero-duration
    shifts removed.
    """
    # JSON prep: roster lookup keyed by sweater number for each club
    info = get_game_info(json)
    roster = info['rosters']

    away = info['away_team_id']  # Away team in the HTML is the seventh column
    home = info['home_team_id']  # Home team in the HTML is the eighth column
    lookup_cols = ['playerId', 'fullName.default', 'positionCode', 'sweaterNumber']
    away_players = roster.loc[roster['teamId'] == away][lookup_cols]
    home_players = roster.loc[roster['teamId'] == home][lookup_cols]

    # Create roster dict: sweater number -> id / name / position per side
    rosters = {
        'away': away_players.set_index('sweaterNumber')[['playerId', 'fullName.default', 'positionCode']].to_dict(),
        'home': home_players.set_index('sweaterNumber')[['playerId', 'fullName.default', 'positionCode']].to_dict(),
    }

    all_shifts = []
    td, teams = get_soup(html)

    team = teams[0]
    home_team = teams[1]
    status = 'home' if team == home_team else 'away'

    # Walk the table cells: a cell containing a comma is a "number, name"
    # header that starts a new player; every other cell is shift data
    # (Shift #, Period, Start, End, Duration) for the most recent player.
    players = dict()
    for cell in td:
        text = cell.get_text()
        if ',' not in text:
            players[pid]['shifts'].append(text)
            continue

        number = int(text.split(',')[0][:2].strip())
        # HTML shift functions assess one team at a time, which simplifies
        # the lookup process from number to name and id
        pid = rosters[status]['playerId'][number]
        players[pid] = {
            'name': rosters[status]['fullName.default'][number],
            'pos': rosters[status]['positionCode'][number],
            'shifts': [],
        }

    for pid, pdata in players.items():
        # Group the flat cell list into rows of 5 table columns for analysis
        rows = [pdata['shifts'][i:i + 5] for i in range(0, len(pdata['shifts']), 5)]
        all_shifts.extend(
            analyze_shifts(row, pid, pdata['name'], pdata['pos'], team) for row in rows
        )

    df = pd.DataFrame(all_shifts)

    # Return: single-team individual shifts by player (drop zero-duration rows)
    return df[df['duration'] > 0]
864
+
865
def _cumulative_presence(shifts, team, player_ids):
    """Build a 0/1 presence matrix: rows are *team*'s shift events, columns
    are player ids; a cell is 1 while that player is on the ice.

    Presence is a running sum: +1 when the id appears in 'ids_on',
    -1 when it appears in 'ids_off'.
    """
    team_rows = shifts['event_team_abbr'] == team
    ids_on = shifts.loc[team_rows, 'ids_on'].apply(str)
    ids_off = shifts.loc[team_rows, 'ids_off'].apply(str)

    presence = pd.DataFrame()
    for player in player_ids:
        on = ids_on.apply(lambda x, p=player: int(bool(re.search(p, x))))
        off = ids_off.apply(lambda x, p=player: int(bool(re.search(p, x))))
        presence[player] = np.cumsum(on - off)
    return presence.fillna(0).astype(int)


def _on_ice_frame(presence, col_names):
    """Collapse a presence matrix into one row per shift event, spreading the
    on-ice player ids across *col_names* (slots past the roster size get " ")."""
    stacked = (presence == 1).stack().reset_index()
    stacked = stacked[stacked[0]].groupby('level_0')['level_1'].apply(list).reset_index()

    for i, col in enumerate(col_names):
        stacked[col] = stacked['level_1'].apply(lambda x, i=i: x[i] if i < len(x) else " ")

    return stacked.drop(columns=['level_1']).rename(columns={'level_0': 'row'})


def parse_shift_events(html, json, home):
    """Given a shift document and home-team conditional, parse and convert the
    document into shift ("change") events congruent to the html play-by-play.

    The duplicated skater/goalie on-ice bookkeeping from the original is
    factored into _cumulative_presence / _on_ice_frame; behavior is unchanged.
    """
    shift = parse_shifts_html(html, json)
    rosters = get_game_roster(json)

    # Identify shift starts for each shift event
    shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
        num_on=('player_name', 'size'),
        players_on=('player_name', lambda x: ', '.join(x)),
        ids_on=('player_id', lambda x: ', '.join(map(str, x))),
    ).reset_index().rename(columns={'start': 'seconds_elapsed'})

    # Identify shift stops for each shift event
    shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
        num_off=('player_name', 'size'),
        players_off=('player_name', lambda x: ', '.join(x)),
        ids_off=('player_id', lambda x: ', '.join(map(str, x))),
    ).reset_index().rename(columns={'end': 'seconds_elapsed'})

    # Merge and sort by time in game; convert period-relative seconds to game seconds
    shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')
    shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200 * (shifts['period'].astype(int) - 1))
    shifts['event_type'] = 'change'

    # Shift events similar to html (remove shootout shifts)
    shifts = shifts.loc[shifts['period'].astype(int) < 5].sort_values(['period', 'seconds_elapsed'])

    # Generate on-ice columns for skaters (6 slots) and the goaltender (1 slot)
    skater_ids = list(rosters.loc[rosters['positionCode'] != "G", 'playerId'].astype(str))
    goalie_ids = list(rosters.loc[rosters['positionCode'] == "G", 'playerId'].astype(str))
    team = list(shift['event_team_abbr'])[0]
    side = 'home' if home else 'away'

    on_skaters = _on_ice_frame(_cumulative_presence(shifts, team, skater_ids),
                               [f'{side}_on_{i + 1}_id' for i in range(6)])
    on_goalies = _on_ice_frame(_cumulative_presence(shifts, team, goalie_ids),
                               [f'{side}_goalie_id'])

    # Combine on-ice skaters and goaltenders for each shift event
    on_players = pd.merge(on_skaters, on_goalies, how='outer', on=['row'])

    shifts['row'] = shifts.index

    if home:
        shifts['home_team_abbr'] = team
    else:
        shifts['away_team_abbr'] = team

    # Return: shift events with newly added on-ice columns; NaN values are
    # blanked so on-ice columns merge cleanly with the json pbp
    return pd.merge(shifts, on_players, how="outer", on=['row']).replace(np.nan, "")
966
+
967
def combine_shifts(away_html, home_html, json):
    """Combine both teams' shift events into play-by-play-formatted rows."""
    # JSON prep: keep only game-level info columns (rosters handled separately)
    info = get_game_info(json)
    del info['rosters']

    roster = get_game_roster(json)

    # Quickly combine shifts data from both documents
    away = parse_shift_events(away_html, json, False)
    home = parse_shift_events(home_html, json, True)
    data = pd.concat([away, home]).sort_values(['period', 'seconds_elapsed'])

    # Create info columns
    for key in info.keys():
        data[key] = info[key]

    # Player id -> upper-cased full name, used to derive on-ice name columns
    players = {
        str(pid): full_name.upper()
        for pid, full_name in zip(list(roster['playerId']), list(roster['fullName.default']))
    }

    for side in ('away', 'home'):
        for slot in range(1, 7):
            data[f'{side}_on_{slot}'] = data[f'{side}_on_{slot}_id'].replace(players)
        data[f'{side}_goalie'] = data[f'{side}_goalie_id'].replace(players)

    data = data.sort_values(['period', 'seconds_elapsed'])

    # Fill on-ice columns down
    on_ice_col = (
        [f'away_on_{i}' for i in range(1, 7)] +
        [f'away_on_{i}_id' for i in range(1, 7)] +
        [f'home_on_{i}' for i in range(1, 7)] +
        [f'home_on_{i}_id' for i in range(1, 7)] +
        ['away_goalie', 'home_goalie', 'away_goalie_id', 'home_goalie_id']
    )
    for col in on_ice_col:
        data[col] = data[col].ffill()

    # Create strength state information: count non-blank on-ice id slots
    away_on = [f'away_on_{i}_id' for i in range(1, 7)]
    home_on = [f'home_on_{i}_id' for i in range(1, 7)]
    data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    data['strength_state'] = np.where(
        data['event_team_abbr'] == data['away_team_abbr'],
        data['away_skaters'].astype(str) + "v" + data['home_skaters'].astype(str),
        data['home_skaters'].astype(str) + "v" + data['away_skaters'].astype(str),
    )

    # Return: full shifts data converted to play-by-play format
    keep = [col for col in get_col() if col in data.columns.to_list()]
    return data[keep]
1018
+
1019
+ ### FINALIZE PBP ###
1020
def get_col():
    """Return the canonical column order for finalized WSBA play-by-play data."""
    # Game/event header columns
    header = [
        'season', 'season_type', 'game_id', 'game_date', 'start_time', 'venue', 'venue_location',
        'away_team_abbr', 'home_team_abbr', 'event_num', 'period', 'period_type',
        'seconds_elapsed', 'situation_code', 'strength_state', 'home_team_defending_side',
        'event_type_code', 'event_type', 'description', 'penalty_duration',
    ]
    # Shift/change bookkeeping columns
    shift_cols = [
        'event_team_abbr', 'num_on', 'players_on', 'ids_on',
        'num_off', 'players_off', 'ids_off', 'shift_type',
        'event_team_status',
    ]
    # Players directly involved in the event
    event_players = [
        'event_player_1_name', 'event_player_2_name', 'event_player_3_name',
        'event_player_1_id', 'event_player_2_id', 'event_player_3_id',
        'event_player_1_pos', 'event_player_2_pos', 'event_player_3_pos',
        'event_goalie', 'event_goalie_id',
    ]
    # Shot location/geometry columns
    location = ['shot_type', 'zone_code', 'x', 'y', 'x_fixed', 'y_fixed', 'x_adj', 'y_adj']
    # Game-state counters
    state = [
        'event_skaters', 'away_skaters', 'home_skaters',
        'event_distance', 'event_angle',
        'away_score', 'home_score', 'away_fenwick', 'home_fenwick',
    ]
    # On-ice name and id columns (6 skater slots + goalie per side)
    away_names = [f'away_on_{i}' for i in range(1, 7)] + ['away_goalie']
    home_names = [f'home_on_{i}' for i in range(1, 7)] + ['home_goalie']
    away_ids = [f'away_on_{i}_id' for i in range(1, 7)] + ['away_goalie_id']
    home_ids = [f'home_on_{i}_id' for i in range(1, 7)] + ['home_goalie_id']
    coaches = ['event_coach', 'away_coach', 'home_coach']

    return header + shift_cols + event_players + location + state + away_names + home_names + away_ids + home_ids + coaches
1041
+
1042
def combine_data(game_id, html_pbp, away_shifts, home_shifts, json):
    """Given game_id, html pbp, away and home shifts, and json pbp, return the
    total game play-by-play with shift events merged in, ordered, and enriched
    with coach and strength information.
    """
    # Create dfs
    pbp = combine_pbp(game_id, html_pbp, json)
    shifts = combine_shifts(away_shifts, home_shifts, json)

    # Combine data
    df = pd.concat([pbp, shifts])

    # Priority orders events that occur at the same second of the game.
    # A dict map replaces the original 10-deep nested np.where pyramid;
    # unknown event types fall through to 0, as before.
    even_pri = ['takeaway', 'giveaway', 'missed-shot', 'hit', 'shot-on-goal', 'blocked-shot']
    priority_map = {event: 1 for event in even_pri}
    priority_map.update({
        'goal': 2, 'stoppage': 3, 'delayed-penalty': 4, 'penalty': 5,
        'period-end': 6, 'change': 7, 'game-end': 8, 'period-start': 9, 'faceoff': 10,
    })
    df['priority'] = df['event_type'].map(priority_map).fillna(0).astype(int)

    df[['period', 'seconds_elapsed']] = df[['period', 'seconds_elapsed']].astype(int)
    df = df.sort_values(['period', 'seconds_elapsed', 'priority'])

    # Recalibrate event_num column to accurately depict the order of all
    # events, including changes
    df.reset_index(inplace=True, drop=True)
    df['event_num'] = df.index + 1
    df['event_team_status'] = np.where(df['event_team_abbr'].isna(), "",
                                       np.where(df['home_team_abbr'] == df['event_team_abbr'], "home", "away"))

    # Define shifts by "line-change" or "on-the-fly"
    last_1 = df['event_type'].shift(1)
    last_2 = df['event_type'].shift(2)
    next_1 = df['event_type'].shift(-1)
    lag_events = ['stoppage', 'goal', 'period-end']
    lead_events = ['faceoff', 'period-end']
    period_end_secs = [0, 1200, 2400, 3600, 4800, 6000, 7200, 8400, 9600, 10800]
    # BUG FIX: the original passed three positional args to np.logical_or,
    # whose third parameter is `out` — the lead-event condition was used as an
    # output buffer and silently dropped from the OR. Chaining `|` includes all
    # four conditions as intended.
    is_line_change = (last_1.isin(lag_events)
                      | last_2.isin(lag_events)
                      | next_1.isin(lead_events)
                      | df['seconds_elapsed'].isin(period_end_secs))
    df['shift_type'] = np.where(df['event_type'] == 'change',
                                np.where(is_line_change, "line-change", "on-the-fly"), "")
    df['description'] = df['description'].combine_first(df['event_team_abbr'] + " CHANGE: " + df['shift_type'])

    # Change events get a synthetic type code; json pbp may not carry the
    # column, in which case it is left absent (was a bare except: "" before)
    if 'event_type_code' in df.columns:
        df['event_type_code'] = np.where(df['event_type'] == 'change', 499, df['event_type_code'])

    # Retrieve coaches
    coaches = get_json_coaches(game_id)
    if not coaches:
        df['away_coach'] = ""
        df['home_coach'] = ""
        df['event_coach'] = ""
    else:
        df['away_coach'] = coaches['away']
        df['home_coach'] = coaches['home']
        df['event_coach'] = np.where(df['event_team_abbr'] == df['home_team_abbr'], coaches['home'],
                                     np.where(df['event_team_abbr'] == df['away_team_abbr'], coaches['away'], ""))

    # Forward fill as necessary, creating empty columns where missing
    fill_cols = ['period_type', 'home_team_defending_side', 'away_score', 'away_fenwick',
                 'home_score', 'home_fenwick', 'away_coach', 'home_coach']
    for col in fill_cols:
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].ffill()

    # Return: complete play-by-play with all important data for each event in
    # the provided game; blank strings normalized to NaN
    return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)