wsba-hockey 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. wsba_hockey/api/api/index.py +129 -0
  2. wsba_hockey/api/api/main.py +4 -0
  3. wsba_hockey/api/api/tools/__init__.py +0 -0
  4. wsba_hockey/api/api/tools/agg.py +374 -0
  5. wsba_hockey/api/api/tools/archive/old_scraping.py +1104 -0
  6. wsba_hockey/api/api/tools/plotting.py +144 -0
  7. wsba_hockey/api/api/tools/scraping.py +1000 -0
  8. wsba_hockey/api/api/tools/utils/__init__.py +1 -0
  9. wsba_hockey/api/api/tools/utils/config.py +14 -0
  10. wsba_hockey/api/api/tools/utils/save_pages.py +133 -0
  11. wsba_hockey/api/api/tools/utils/shared.py +450 -0
  12. wsba_hockey/api/api/tools/xg_model.py +455 -0
  13. wsba_hockey/api/api/wsba_main.py +1213 -0
  14. wsba_hockey/data_pipelines.py +71 -8
  15. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/app.py +6 -5
  16. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/goalie/app.py +101 -0
  17. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/goalie/plot.py +71 -0
  18. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/goalie/rink_plot.py +245 -0
  19. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/app.py +1 -1
  20. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/plot.py +2 -0
  21. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/rink_plot.py +1 -1
  22. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/app.py +3 -3
  23. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/plot.py +2 -0
  24. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/rink_plot.py +1 -1
  25. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/app.py +44 -28
  26. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/plot.py +12 -3
  27. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/rink_plot.py +1 -1
  28. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/app.py +1 -1
  29. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/plot.py +5 -4
  30. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/rink_plot.py +1 -1
  31. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/team_heatmaps/app.py +103 -0
  32. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/team_heatmaps/plot.py +95 -0
  33. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/team_heatmaps/rink_plot.py +245 -0
  34. wsba_hockey/flask/app.py +77 -0
  35. wsba_hockey/tools/plotting.py +2 -1
  36. wsba_hockey/tools/scraping.py +7 -3
  37. wsba_hockey/tools/xg_model.py +3 -3
  38. wsba_hockey/workspace.py +28 -12
  39. wsba_hockey/wsba_main.py +10 -17
  40. {wsba_hockey-1.1.0.dist-info → wsba_hockey-1.1.1.dist-info}/METADATA +1 -1
  41. {wsba_hockey-1.1.0.dist-info → wsba_hockey-1.1.1.dist-info}/RECORD +44 -24
  42. {wsba_hockey-1.1.0.dist-info → wsba_hockey-1.1.1.dist-info}/WHEEL +0 -0
  43. {wsba_hockey-1.1.0.dist-info → wsba_hockey-1.1.1.dist-info}/licenses/LICENSE +0 -0
  44. {wsba_hockey-1.1.0.dist-info → wsba_hockey-1.1.1.dist-info}/top_level.txt +0 -0
wsba_hockey/api/api/tools/scraping.py
@@ -0,0 +1,1000 @@
+ import re
+ import warnings
+ import os
+ import numpy as np
+ import pandas as pd
+ import requests as rs
+ import json as json_lib
+ from bs4 import BeautifulSoup
+ from tools.utils.shared import *
+ warnings.filterwarnings('ignore')
+
+ ### SCRAPING FUNCTIONS ###
+ # This file provides the functions vital to scraping in the WSBA Hockey Python package. #
+
+ ## ORDER OF OPERATIONS ##
+ # Create game information to use with all functions
+ # Retrieve JSON data
+ # Parse JSON data
+ # Retrieve and clean HTML pbp with player information
+ # Parse HTML pbp, return parsed HTML
+ # Combine pbp data
+ # Retrieve and analyze HTML shifts with player information for home and away teams
+ # Parse shift events
+ # Combine all data, return complete play-by-play
+
+ ## UTILITY FUNCTIONS ##
+ def get_col():
+     return [
+         'season','season_type','game_id','game_date',"start_time","venue","venue_location",
+         'away_team_abbr','home_team_abbr','event_num','period','period_type',
+         'seconds_elapsed','period_time','game_time',"strength_state","strength_state_venue","home_team_defending_side",
+         "event_type_code","event_type","description","event_reason",
+         "penalty_type","penalty_duration","penalty_attribution",
+         "event_team_abbr","event_team_venue",
+         'num_on', 'players_on','ids_on','num_off','players_off','ids_off','shift_type',
+         "event_player_1_name","event_player_2_name","event_player_3_name",
+         "event_player_1_id","event_player_2_id","event_player_3_id",
+         "event_player_1_pos","event_player_2_pos","event_player_3_pos",
+         "event_goalie_name","event_goalie_id",
+         "shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
+         "event_skaters","away_skaters","home_skaters",
+         "event_distance","event_angle","event_length","seconds_since_last",
+         "away_score","home_score", "away_fenwick", "home_fenwick",
+         "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
+         "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie",
+         "away_on_1_id","away_on_2_id","away_on_3_id","away_on_4_id","away_on_5_id","away_on_6_id","away_goalie_id",
+         "home_on_1_id","home_on_2_id","home_on_3_id","home_on_4_id","home_on_5_id","home_on_6_id","home_goalie_id",
+         "event_coach","away_coach","home_coach"
+     ]
+
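Since every downstream frame is trimmed against this master list, here is a quick sketch of the pattern used at the end of combine_pbp() and combine_data() below (the toy frame is illustrative; get_col is the function above):

    import pandas as pd

    # Keep only the master columns a frame actually carries, in master order.
    df = pd.DataFrame(columns=['season', 'game_id', 'not_a_wsba_column'])
    keep = [c for c in get_col() if c in df.columns.to_list()]
    print(keep)  # ['season', 'game_id']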
+ def med_x_coord(group):
+     #Calculate the median x coordinate of a team's corsi shots in a period to determine which direction they are shooting in that period (for coordinate adjustments and geometric calculations)
+     med_x = group.loc[group['event_type'].isin(['blocked-shot','missed-shot','shot-on-goal','goal']),'x'].median(skipna=True)
+     group['med_x'] = med_x
+
+     return group
+
+ def adjust_coords(pbp):
+     #Given JSON or ESPN pbp data, return pbp with adjusted coordinates
+
+     #Recalibrate coordinates
+     #Determine the direction teams are shooting in a given period
+     pbp = pbp.groupby(['event_team_venue','period','game_id'],group_keys=False).apply(med_x_coord)
+
+     pbp = pbp.reset_index(drop=True)
+
+     #Adjust coordinates
+     pbp['x_adj'] = np.where((((pbp['event_team_venue']=='home')&(pbp['med_x'] < 0))|((pbp['event_team_venue']=='away')&(pbp['med_x'] > 0))),-pbp['x'],pbp['x'])
+
+     #Adjust y if necessary
+     pbp['y_adj'] = np.where((pbp['x']==pbp['x_adj']),pbp['y'],-pbp['y'])
+
+     #Calculate event distance and angle relative to venue location
+     pbp['event_distance'] = np.where(pbp['event_team_venue']=='home',np.sqrt(((89 - pbp['x_adj'])**2) + (pbp['y_adj']**2)),np.sqrt((((-89) - pbp['x_adj'])**2) + (pbp['y_adj']**2)))
+     pbp['event_angle'] = np.where(pbp['event_team_venue']=='away',np.degrees(np.arctan2(abs(pbp['y_adj']), abs(89 - pbp['x_adj']))),np.degrees(np.arctan2(abs(pbp['y_adj']), abs((-89) - pbp['x_adj']))))
+
+     #Return: pbp with adjusted coordinates
+     return pbp
+
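A minimal sketch of the median-flip idea with made-up coordinates (the column names match this file; the numbers are illustrative):

    import numpy as np
    import pandas as pd

    # Two home-team shots whose median x is negative, so adjust_coords()
    # would mirror them to put the home offensive zone at x > 0.
    shots = pd.DataFrame({'event_team_venue': ['home', 'home'],
                          'x': [-70.0, -60.0], 'y': [10.0, -5.0]})
    med_x = shots['x'].median()  # -65.0
    flip = (shots['event_team_venue'] == 'home') & (med_x < 0)
    shots['x_adj'] = np.where(flip, -shots['x'], shots['x'])
    shots['y_adj'] = np.where(shots['x'] == shots['x_adj'], shots['y'], -shots['y'])
    print(shots[['x_adj', 'y_adj']].values.tolist())  # [[70.0, -10.0], [60.0, 5.0]]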
+ ## JSON FUNCTIONS ##
+ def get_game_roster(json):
+     #Given raw json data, return game rosters
+     roster = pd.json_normalize(json['rosterSpots'])
+     roster['full_name'] = (roster['firstName.default'] + " " + roster['lastName.default']).str.upper()
+
+     #Return: roster information
+     return roster
+
+ def get_game_coaches(game_id):
+     #Given game info, return head coaches for away and home team
+
+     #Retrieve data
+     json = rs.get(f'https://api-web.nhle.com/v1/gamecenter/{game_id}/right-rail').json()
+     data = json['gameInfo']
+
+     #Add coaches
+     try:
+         away = data['awayTeam']['headCoach']['default'].upper()
+         home = data['homeTeam']['headCoach']['default'].upper()
+
+         coaches = {'away':away,
+                    'home':home}
+     except KeyError:
+         return {}
+
+     #Return: dict with coaches
+     return coaches
+
+ def get_game_info(game_id):
+     #Given game_id, return game information
+
+     #Retrieve data
+     api = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
+     json = rs.get(api).json()
+
+     #Games don't always have JSON shifts, for whatever reason
+     shifts = f"https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={game_id}"
+     shifts = rs.get(shifts).json()
+     json_shifts = pd.json_normalize(shifts['data'])
+
+     if shifts['total'] == 0:
+         json_shifts = pd.DataFrame()
+
+     #Split information
+     base = pd.json_normalize(json)
+     game_id = base['id'][0]
+     season = base['season'][0]
+     season_type = base['gameType'][0]
+     game_date = base['gameDate'][0]
+     game_state = base['gameState'][0]
+     start_time = base['startTimeUTC'][0]
+     venue = base['venue.default'][0]
+     venue_location = base['venueLocation.default'][0]
+     away_team_id = base['awayTeam.id'][0]
+     away_team_abbr = base['awayTeam.abbrev'][0]
+     home_team_id = base['homeTeam.id'][0]
+     home_team_abbr = base['homeTeam.abbrev'][0]
+
+     #Add roster
+     roster = get_game_roster(json)
+     #In the HTML parsing process, players are identified by a regex pattern (ABB #00 such as BOS #37) or by number and name in the following format: #00 NAME (i.e. #37 BERGERON), so these are added as IDs of sorts.
+     roster['descID'] = '#'+roster['sweaterNumber'].astype(str)+" "+roster['lastName.default'].str.upper()
+     roster['team_abbr'] = roster['teamId'].replace({
+         away_team_id:away_team_abbr,
+         home_team_id:home_team_abbr
+     })
+     roster['key'] = roster['team_abbr'] + " #" + roster['sweaterNumber'].astype(str)
+
+     #Create an additional roster dictionary for use with HTML parsing
+     roster_dict = {'away':{},
+                    'home':{}}
+
+     #Evaluate and add players by team
+     for team in ['away','home']:
+         abbr = (away_team_abbr if team == 'away' else home_team_abbr)
+         rost = roster.loc[roster['team_abbr']==abbr]
+
+         #Now iterate through team players
+         for player,id,num,pos,team_abbr,key in zip(rost['full_name'],rost['playerId'],rost['sweaterNumber'],rost['positionCode'],rost['team_abbr'],rost['key']):
+             roster_dict[team].update({str(num):[key, pos, player, team_abbr, id]})
+
+     #Return: game information
+     return {"game_id":str(game_id),
+             "season":season,
+             "season_type":season_type,
+             "game_date":game_date,
+             "game_state":game_state,
+             "start_time":start_time,
+             'venue':venue,
+             'venue_location':venue_location,
+             'away_team_id':away_team_id,
+             'away_team_abbr':away_team_abbr,
+             'home_team_id':home_team_id,
+             'home_team_abbr':home_team_abbr,
+             'events':pd.json_normalize(json['plays']).reset_index(drop=True),
+             'rosters':roster,
+             'HTML_rosters':roster_dict,
+             'coaches':get_game_coaches(game_id),
+             'json_shifts':json_shifts}
+
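A hedged usage sketch of get_game_info(); the id below is only an example of the season/type/number format, and the call requires network access to api-web.nhle.com:

    # Hypothetical game id; any completed game in the JSON API should work.
    info = get_game_info('2023020001')
    print(info['away_team_abbr'], '@', info['home_team_abbr'])
    print(len(info['events']), 'JSON events,', len(info['rosters']), 'rostered players')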
+ def parse_json(info):
+     #Given game info, return JSON document
+
+     #Retrieve data
+     events = info['events']
+
+     #Return error if game is set in the future
+     if info['game_state'] == 'FUT':
+         raise ValueError(f"Game {info['game_id']} has not occurred yet.")
+
+     #Test columns
+     cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date']
+
+     for col in cols:
+         if col not in events.columns:
+             events[col] = ""
+
+     #Event player columns identify the players involved in an event; event_player_1 is the player most central to the play
+     events['event_player_1_id'] = events['details.winningPlayerId'].combine_first(events['details.scoringPlayerId'])\
+                                       .combine_first(events['details.shootingPlayerId'])\
+                                       .combine_first(events['details.playerId'])\
+                                       .combine_first(events['details.hittingPlayerId'])\
+                                       .combine_first(events['details.committedByPlayerId'])
+
+     events['event_player_2_id'] = events['details.losingPlayerId'].combine_first(events['details.assist1PlayerId'])\
+                                       .combine_first(events['details.hitteePlayerId'])\
+                                       .combine_first(events['details.drawnByPlayerId'])\
+                                       .combine_first(events['details.blockingPlayerId'])
+
+     events['event_player_3_id'] = events['details.assist2PlayerId']
+
+     events['event_team_venue'] = np.where(events['details.eventOwnerTeamId']==info['home_team_id'],"home","away")
+
+     events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace({
+         info['away_team_id']:info['away_team_abbr'],
+         info['home_team_id']:info['home_team_abbr']
+     })
+
+     #Rename columns to follow WSBA naming conventions
+     events = events.rename(columns={
+         "eventId":"event_id",
+         "periodDescriptor.number":"period",
+         "periodDescriptor.periodType":"period_type",
+         "timeInPeriod":"period_time_elapsed",
+         "timeRemaining":"period_time_remaining",
+         "situationCode":"situation_code",
+         "homeTeamDefendingSide":"home_team_defending_side",
+         "typeCode":"event_type_code",
+         "typeDescKey":"event_type",
+         "details.shotType":"shot_type",
+         "details.duration":"penalty_duration",
+         "details.descKey":"penalty_type",
+         "details.typeCode":'penalty_attribution',
+         "details.reason":"event_reason",
+         "details.zoneCode":"zone_code",
+         "details.xCoord":"x",
+         "details.yCoord":"y",
+         "details.goalieInNetId": "event_goalie_id",
+         "details.awaySOG":"away_sog",
+         "details.homeSOG":"home_sog"
+     })
+
+     #Coordinate adjustments:
+     # x, y - Raw coordinates from JSON pbp
+     # x_adj, y_adj - Adjusted coordinates configuring the away offensive zone to the left and the home offensive zone to the right
+     #Some games (mostly preseason and all-star games) do not include coordinates.
+
+     try:
+         events = adjust_coords(events)
+
+     except KeyError:
+         print(f"No coordinates found for game {info['game_id']}...")
+
+         events['x_adj'] = np.nan
+         events['y_adj'] = np.nan
+         events['event_distance'] = np.nan
+         events['event_angle'] = np.nan
+
+     #Period time adjustments (only 'seconds_elapsed' is included in the resulting data)
+     events['period_seconds_elapsed'] = events['period_time_elapsed'].apply(convert_to_seconds)
+     events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']
+
+     events = events.loc[(events['event_type']!="")]
+
+     #Return: dataframe with parsed game
+     return events
+
+
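The combine_first() chain above is a priority fallback: for each row it keeps the first non-null id across the candidate columns, which is how a single event_player_1_id can serve faceoffs, goals, shots, hits, and penalties alike. A toy illustration (player ids made up):

    import pandas as pd

    winner  = pd.Series([8478402, None,    None])     # set on faceoffs
    scorer  = pd.Series([None,    8477934, None])     # set on goals
    shooter = pd.Series([None,    None,    8471214])  # set on shots
    print(winner.combine_first(scorer).combine_first(shooter).tolist())
    # [8478402.0, 8477934.0, 8471214.0]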
+ ## HTML PBP FUNCTIONS ##
+ def strip_html_pbp(td,rosters):
+     #Given html row, parse data from HTML pbp
+     #Harry Shomer's Code (modified)
+
+     #HTML Parsing
+     for y in range(len(td)):
+         # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
+         if y == 3:
+             td[y] = td[y].get_text()   # This gets us elapsed and remaining combined -> 3:0017:00
+             index = td[y].find(':')
+             td[y] = td[y][:index+3]
+         elif (y == 6 or y == 7) and td[0] != '#':
+             # 6 & 7 -> the on-ice players for the away and home teams
+             # The second statement controls for when it's just a header
+             baz = td[y].find_all('td')
+             bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]  # Because of previous step we get repeats...delete some
+
+             # The setup in the list is now: Name/Number->Position->Blank...and repeat
+             # Now strip all the html
+             players = []
+             for i in range(len(bar)):
+                 if i % 3 == 0:
+                     try:
+                         #Using the supplied json we can bind player name and id to number and team
+                         #Find number and team of player then lookup roster dictionary
+
+                         number = bar[i].get_text().strip('\n')  # Get number and strip leading/trailing newlines
+                         if y == 6:
+                             team = 'away'
+                         else:
+                             team = 'home'
+
+                         id = rosters[team][str(number)][4]
+                         name = rosters[team][str(number)][2]
+                         position = rosters[team][str(number)][1]
+
+                     except KeyError:
+                         name = ''
+                         number = ''
+                         id = ''
+                 elif i % 3 == 1:
+                     if name != '':
+                         players.append([name, number, position, id])
+
+             td[y] = players
+         else:
+             td[y] = td[y].get_text()
+
+     return td
+
+
+ def clean_html_pbp(info):
+     #Harry Shomer's Code (modified)
+
+     #Retrieve data
+     game_id = info['game_id']
+     season = info['season']
+     doc = f"https://www.nhl.com/scores/htmlreports/{season}/PL{game_id[-6:]}.HTM"
+     html = rs.get(doc).content
+     soup = get_contents(html)
+
+     #Rosters
+     rosters = info['HTML_rosters']
+
+     # Create a list of lists (each length 8)...corresponds to 8 columns in html pbp
+     td = [soup[i:i + 8] for i in range(0, len(soup), 8)]
+
+     cleaned_html = [strip_html_pbp(x,rosters) for x in td]
+
+     return cleaned_html
+
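The slicing in clean_html_pbp() rebuilds rows from a flat cell stream; a small sketch of that chunking with dummy cells:

    # Every 8 consecutive cells in the HTML report form one pbp row.
    cells = ['1', '1', 'EV', '0:00', 'FAC', 'desc', 'away', 'home',
             '2', '1', 'EV', '0:05', 'HIT', 'desc', 'away', 'home']
    rows = [cells[i:i + 8] for i in range(0, len(cells), 8)]
    print(len(rows), rows[0][4], rows[1][4])  # 2 FAC HIT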
+ def parse_html(info):
+     #Given game info, return HTML event data
+
+     #Retrieve game information and html events
+     rosters = info['HTML_rosters']
+     events = clean_html_pbp(info)
+
+     teams = {info['away_team_abbr']:['away'],
+              info['home_team_abbr']:['home']}
+
+     #Parsing
+     event_log = []
+     for event in events:
+         events_dict = {}
+         if event[0] == "#" or event[4] in ['GOFF', 'EGT', 'PGSTR', 'PGEND', 'ANTHEM', 'SPC', 'PBOX', 'EISTR', 'EIEND','EGPID'] or event[3]=='-16:0-':
+             continue
+         else:
+             #Event info
+             events_dict['event_num'] = int(event[0])
+             events_dict['period'] = int(event[1])
+             events_dict['strength'] = re.sub(u'\xa0'," ",event[2])
+             events_dict['period_time_elapsed'] = event[3]
+             events_dict['seconds_elapsed'] = convert_to_seconds(event[3]) + (1200*(int(event[1])-1))
+             events_dict['event_type'] = event[4]
+
+             desc = re.sub(u'\xa0'," ",event[5])
+             events_dict['description'] = desc
+
+             events_dict['shot_type'] = desc.split(",")[1].lower().strip(" ") if event[4] in ['BLOCK','MISS','SHOT','GOAL'] else ""
+             zone = [x for x in desc.split(',') if 'Zone' in x]
+             if not zone:
+                 events_dict['zone_code'] = None
+             elif zone[0].find("Off") != -1:
+                 events_dict['zone_code'] = 'O'
+             elif zone[0].find("Neu") != -1:
+                 events_dict['zone_code'] = 'N'
+             elif zone[0].find("Def") != -1:
+                 events_dict['zone_code'] = 'D'
+
+             #Convert team names for compatibility
+             replace = [('LAK',"L.A"),('NJD',"N.J"),('SJS',"S.J"),('TBL',"T.B")]
+             for name, repl in replace:
+                 desc = desc.replace(repl,name)
+
+             event_team = desc[0:3] if desc[0:3] in teams.keys() else ""
+             events_dict['event_team_abbr'] = event_team
+
+             events_dict['away_team_abbr'] = info['away_team_abbr']
+             events_dict['home_team_abbr'] = info['home_team_abbr']
+
+             away_skaters = 0
+             away_goalie = 0
+             #Away on-ice
+             for i in range(len(event[6])):
+                 player = event[6][i][0]
+                 pos = event[6][i][2]
+                 id = event[6][i][3]
+
+                 if pos == 'G':
+                     events_dict['away_goalie'] = player
+                     events_dict['away_goalie_id'] = id
+                     away_goalie += 1
+                 else:
+                     events_dict[f'away_on_{i+1}'] = player
+                     events_dict[f'away_on_{i+1}_id'] = id
+                     away_skaters += 1
+
+             home_skaters = 0
+             home_goalie = 0
+             #Home on-ice
+             for i in range(len(event[7])):
+                 player = event[7][i][0]
+                 pos = event[7][i][2]
+                 id = event[7][i][3]
+
+                 if pos == 'G':
+                     events_dict['home_goalie'] = player
+                     events_dict['home_goalie_id'] = id
+                     home_goalie += 1
+                 else:
+                     events_dict[f'home_on_{i+1}'] = player
+                     events_dict[f'home_on_{i+1}_id'] = id
+                     home_skaters += 1
+
+             event_players = []
+             #Determine parsing route based on event
+             if event[4] in ['FAC','HIT','BLOCK','PENL']:
+                 #Regex to find team and player number involved (finds all for each event)
+                 #Code is modified from Harry Shomer in order to account for periods in a team abbreviation
+                 regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
+                 fac = regex.findall(desc)
+                 #Filter incorrectly parsed teams
+                 repl = []
+                 for team, num in fac:
+                     if team in teams.keys():
+                         repl.append((team,num))
+                 fac = repl
+
+                 #Find first event player
+                 ep1_num = ''
+                 for i in range(len(fac)):
+                     team, num = fac[i]
+                     if team == event_team:
+                         ep1_num = num
+                         event_players.append(fac[i])
+                     else:
+                         continue
+
+                 #Find other players
+                 for i in range(len(fac)):
+                     team, num = fac[i]
+                     if num == ep1_num:
+                         continue
+                     else:
+                         event_players.append(fac[i])
+             elif event[4]=='GOAL':
+                 #Parse goal
+                 regex = re.compile(r'#(\d+)\s+')
+                 goal = regex.findall(desc)
+
+                 #Add all involved players
+                 for point in goal:
+                     #In this loop, point is a player number. We can assign event_team to all players in a goal
+                     event_players.append((event_team,str(point)))
+             elif event[4]=='DELPEN':
+                 #Don't parse DELPEN events
+                 #These events typically have no text but when they do it is often erroneous or otherwise problematic
+                 pass
+             else:
+                 #Parse single or no player events
+                 regex = re.compile(r'#\d+')
+                 fac = regex.findall(desc)
+
+                 for i in range(len(fac)):
+                     num = fac[i].replace("#","")
+                     event_players.append((event_team,str(num)))
+
+             for i in range(len(event_players)):
+                 #For each player, evaluate their event data, then retrieve information from rosters
+                 team, num = event_players[i]
+
+                 status = teams[team]
+                 data = rosters[status[0]]
+
+                 #In rare instances the event player is not on the event team (i.e. "WSH TAKEAWAY - #71 CIRELLI, Off. Zone" when #71 CIRELLI is on TBL)
+                 try:
+                     events_dict[f'event_player_{i+1}_name'] = data[str(num)][2]
+                     events_dict[f'event_player_{i+1}_id'] = data[str(num)][4]
+                     events_dict[f'event_player_{i+1}_pos'] = data[str(num)][1]
+                 except KeyError:
+                     pass
+
+             #Event skaters and strength-state information
+             events_dict['away_skaters'] = away_skaters
+             events_dict['home_skaters'] = home_skaters
+             events_dict['away_goalie_in'] = away_goalie
+             events_dict['home_goalie_in'] = home_goalie
+
+             event_skaters = away_skaters if info['away_team_abbr'] == event_team else home_skaters
+             event_skaters_against = away_skaters if info['home_team_abbr'] == event_team else home_skaters
+             events_dict['strength_state'] = f'{event_skaters}v{event_skaters_against}'
+             events_dict['event_skaters'] = np.where(event_team == info['home_team_abbr'],home_skaters,away_skaters)
+
+             event_log.append(pd.DataFrame([events_dict]))
+
+     data = pd.concat(event_log)
+     data['event_type'] = data['event_type'].replace({
+         "PGSTR": "pre-game-start",
+         "PGEND": "pre-game-end",
+         'GSTR':"game-start",
+         "ANTHEM":"anthem",
+         "PSTR":"period-start",
+         "FAC":"faceoff",
+         "SHOT":"shot-on-goal",
+         "BLOCK":"blocked-shot",
+         "STOP":"stoppage",
+         "MISS":"missed-shot",
+         "HIT":"hit",
+         "GOAL":"goal",
+         "GIVE":"giveaway",
+         "TAKE":"takeaway",
+         "DELPEN":"delayed-penalty",
+         "PENL":"penalty",
+         "CHL":"challenge",
+         "SOC":'shootout-complete',
+         "PEND":"period-end",
+         "GEND":"game-end"
+     })
+
+     #Return: parsed HTML pbp
+     return data
+
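The team-and-number regex in parse_html() is worth seeing on its own; the description string below is made up, and note the parser normalizes dotted abbreviations like T.B to TBL before this pattern runs:

    import re

    regex = re.compile(r'([A-Z]{2,3}|\b[A-Z]\.[A-Z])\s+#(\d+)')
    desc = 'BOS #37 vs T.B #21'  # illustrative faceoff-style description
    print(regex.findall(desc))   # [('BOS', '37'), ('T.B', '21')]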
+ def assign_target(data):
+     #Assign target number to plays to assist with merging
+
+     #New sort
+     data = data.sort_values(['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id','event_player_2_id'])
+
+     #Target number distinguishes events that occur in the same second to assist in merging the JSON and HTML
+     #Sometimes the target number may not reflect the same order as the event number in either document (especially in earlier seasons where the events are out of order in the HTML or JSON)
+     data['target_num'] = np.where(data['event_type'].isin(['penalty','blocked-shot','missed-shot','shot-on-goal','goal']),data['event_type'].isin(['penalty','blocked-shot','missed-shot','shot-on-goal','goal']).cumsum(),0)
+
+     #Revert sort and return dataframe
+     return data.reset_index()
+
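A small sketch of the cumulative target numbering (event types taken from this file's vocabulary):

    import numpy as np
    import pandas as pd

    # Mergeable events get a running count; everything else gets 0, so two
    # shots in the same second still pair up across the HTML and JSON pbps.
    events = pd.Series(['faceoff', 'shot-on-goal', 'stoppage', 'goal'])
    mask = events.isin(['penalty', 'blocked-shot', 'missed-shot', 'shot-on-goal', 'goal'])
    print(np.where(mask, mask.cumsum(), 0))  # [0 1 0 2]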
+ def combine_pbp(info,sources):
+     #Given game info, return complete play-by-play data for provided game
+
+     html_pbp = parse_html(info)
+
+     #Route data combining - use JSON if the season is after 2009-10:
+     if str(info['season']) in ['20052006','20062007','20072008','20082009','20092010']:
+         #ESPN x HTML
+         #espn_pbp = parse_espn(str(info['game_date']),info['away_team_abbr'],info['home_team_abbr']).rename(columns={'coords_x':'x',"coords_y":'y'}).sort_values(['period','seconds_elapsed']).reset_index()
+         #merge_col = ['period','seconds_elapsed','event_type','event_team_abbr']
+
+         #Merge pbp
+         #df = pd.merge(html_pbp,espn_pbp,how='left',on=merge_col)
+         print('In repair, please try again later...')
+
+     else:
+         #JSON x HTML
+         json_pbp = parse_json(info)
+
+         if sources:
+             dirs_html = f"sources/{info['season']}/HTML/"
+             dirs_json = f"sources/{info['season']}/JSON/"
+
+             if not os.path.exists(dirs_html):
+                 os.makedirs(dirs_html)
+             if not os.path.exists(dirs_json):
+                 os.makedirs(dirs_json)
+
+             html_pbp.to_csv(f"{dirs_html}{info['game_id']}_HTML.csv",index=False)
+             json_pbp.to_csv(f"{dirs_json}{info['game_id']}_JSON.csv",index=False)
+
+         #Assign target numbers
+         html_pbp = assign_target(html_pbp)
+         json_pbp = assign_target(json_pbp)
+
+         #Merge on index if the df lengths are the same and the events are in the same general order; merge on columns otherwise
+         if (len(html_pbp) == len(json_pbp)) and (html_pbp['event_type'].equals(json_pbp['event_type'])) and (html_pbp['seconds_elapsed'].equals(json_pbp['seconds_elapsed'])):
+             html_pbp = html_pbp.drop(columns=['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id','event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore').reset_index()
+             df = pd.merge(html_pbp,json_pbp,how='left',left_index=True,right_index=True).sort_values(['event_num'])
+         else:
+             print(' merging on columns...',end="")
+             #Modify merge conditions and merge pbps
+             merge_col = ['period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id','target_num']
+             html_pbp = html_pbp.drop(columns=['event_player_2_id','event_player_3_id','shot_type','zone_code'],errors='ignore')
+
+             #While rare, sometimes column 'event_player_1_id' is interpreted differently between the two dataframes.
+             html_pbp['event_player_1_id'] = html_pbp['event_player_1_id'].astype(object)
+             json_pbp['event_player_1_id'] = json_pbp['event_player_1_id'].astype(object)
+
+             #Merge pbp
+             df = pd.merge(html_pbp,json_pbp,how='left',on=merge_col).sort_values(['event_num'])
+
+     #Add game info
+     info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
+                 'away_team_abbr','home_team_abbr']
+
+     for col in info_col:
+         df[col] = info[col]
+
+     #Fill period_type column and assign shifts a sub-500 event code
+     df['period_type'] = np.where(df['period']<4,"REG",np.where(np.logical_and(df['period']==5,df['season_type']==2),"SO","OT"))
+     if 'event_type_code' in df.columns:
+         df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)
+     df = df.sort_values(['period','seconds_elapsed']).reset_index()
+
+     df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
+
+     #Correct strength state for penalty shots and shootouts - most games don't have shifts in the shootout and would otherwise be excluded
+     df['strength_state'] = np.where((df['period'].astype(str)=='5')&(df['event_type'].isin(['missed-shot','shot-on-goal','goal']))&(df['season_type']==2),"1v0",df['strength_state'])
+     df['strength_state'] = np.where(df['description'].str.contains('Penalty Shot',case=False),"1v0",df['strength_state'])
+
+     col = [col for col in get_col() if col in df.columns.to_list()]
+     #Return: complete play-by-play information for provided game
+     return df[col]
+
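The index-merge fast path assumes the two documents line up row-for-row; a toy contrast of the two merge modes (frames invented for illustration):

    import pandas as pd

    # When lengths and event order match, an index merge is lossless ...
    html = pd.DataFrame({'event_num': [1, 2], 'description': ['FAC won', 'SHOT wrist']})
    json_ = pd.DataFrame({'x': [0, -55], 'y': [0, 10]})
    print(pd.merge(html, json_, left_index=True, right_index=True, how='left'))
    # ... otherwise the fallback joins on shared keys such as period,
    # seconds_elapsed, event_type, event_team_abbr, and target_num.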
+ ## SHIFT SCRAPING FUNCTIONS ##
+ def parse_shifts_json(info):
+     #Given game info, return json shift chart
+
+     log = info['json_shifts']
+     #Filter non-shift events and duplicate events
+     log = log.loc[log['detailCode']==0].drop_duplicates(subset=['playerId','shiftNumber'])
+
+     #Add full name columns
+     log['player_name'] = (log['firstName'] + " " + log['lastName']).str.upper()
+
+     log = log.rename(columns={
+         'playerId':'player_id',
+         'teamAbbrev':'event_team_abbr',
+         'startTime':'start',
+         'endTime':'end'
+     })
+
+     #Convert time columns
+     log['start'] = log['start'].astype(str).apply(convert_to_seconds)
+     log['end'] = log['end'].astype(str).apply(convert_to_seconds)
+     log = log[['player_name','player_id',
+                'period','event_team_abbr',
+                'start','duration','end']]
+
+     #Recalibrate duration
+     log['duration'] = log['end'] - log['start']
+
+     #Return: JSON shifts (separated by team)
+     away = log.loc[log['event_team_abbr']==info['away_team_abbr']]
+     home = log.loc[log['event_team_abbr']==info['home_team_abbr']]
+
+     return {'away':away,
+             'home':home}
+
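convert_to_seconds() is imported from tools.utils.shared and is not shown in this diff; a plausible stand-in, assuming 'MM:SS' input, is sketched below purely for orientation:

    def convert_to_seconds(time_str):
        # Hypothetical stand-in for tools.utils.shared.convert_to_seconds:
        # turn an 'MM:SS' string such as '12:34' into total seconds.
        minutes, seconds = time_str.split(':')
        return (int(minutes) * 60) + int(seconds)

    print(convert_to_seconds('12:34'))  # 754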
+ def analyze_shifts(shift, id, name, pos, team):
+     #Collects shifts from the given shifts html (parsed by Beautiful Soup)
+     #Modified version of Harry Shomer's analyze_shifts function in the hockey_scraper package
+     shifts = dict()
+
+     shifts['player_name'] = name.upper()
+     shifts['player_id'] = id
+     shifts['player_pos'] = pos
+     shifts['period'] = '4' if shift[1] == 'OT' else '5' if shift[1] == 'SO' else shift[1]
+     shifts['event_team_abbr'] = get_team(team.strip(' '))
+     shifts['start'] = convert_to_seconds(shift[2].split('/')[0])
+     shifts['duration'] = convert_to_seconds(shift[4].split('/')[0])
+
+     #Sometimes there are no digits
+     if re.compile(r'\d+').findall(shift[3].split('/')[0]):
+         shifts['end'] = convert_to_seconds(shift[3].split('/')[0])
+     else:
+         shifts['end'] = shifts['start'] + shifts['duration']
+     return shifts
+
+ def parse_shifts_html(info,home):
+     #Parsing of shifts data for a single team in a provided game
+     #Modified version of Harry Shomer's parse_shifts function in the hockey_scraper package
+
+     #Roster info prep
+     roster = info['HTML_rosters']
+
+     rosters = roster['home' if home else 'away']
+
+     all_shifts = []
+     #columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']
+
+     #Retrieve HTML
+     game_id = info['game_id']
+     season = info['season']
+     link = f"https://www.nhl.com/scores/htmlreports/{season}/T{'H' if home else 'V'}{game_id[-6:]}.HTM"
+     doc = rs.get(link).content
+     td, teams = get_soup(doc)
+
+     team = teams[0]
+     players = dict()
+
+     # Iterates through each player's shifts table with the following data:
+     # Shift #, Period, Start, End, and Duration.
+     for t in td:
+         t = t.get_text()
+         if ',' in t:  # If a comma exists it is a player
+             name = t
+
+             name = name.split(',')
+             number = int(name[0][:2].strip())
+             id = rosters[str(number)][4]
+             players[id] = dict()
+
+             #HTML shift functions assess one team at a time, which simplifies the lookup process with number to name and id
+
+             players[id]['name'] = rosters[str(number)][2]
+             players[id]['pos'] = rosters[str(number)][1]
+
+             players[id]['shifts'] = []
+         else:
+             players[id]['shifts'].extend([t])
+
+     for key in players.keys():
+         # Create lists of shifts-table columns for analysis
+         players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]
+
+         name = players[key]['name']
+         pos = players[key]['pos']
+
+         # Parsing
+         shifts = [analyze_shifts(shift, key, name, pos, team) for shift in players[key]['shifts']]
+         all_shifts.extend(shifts)
+
+     df = pd.DataFrame(all_shifts)
+
+     shifts_raw = df[df['duration'] > 0]
+
+     #Return: single-team individual shifts by player
+     return shifts_raw
+
+ def parse_shift_events(info,home):
+     #Given game info and home team conditional, parse and convert document to shift events congruent to html play-by-play
+
+     #Determine whether to use JSON shifts or HTML shifts
+     if len(info['json_shifts']) == 0:
+         shift = parse_shifts_html(info,home)
+     else:
+         shift = parse_shifts_json(info)['home' if home else 'away']
+
+     rosters = info['rosters']
+
+     # Identify shift starts for each shift event
+     shifts_on = shift.groupby(['event_team_abbr', 'period', 'start']).agg(
+         num_on=('player_name', 'size'),
+         players_on=('player_name', lambda x: ', '.join(x)),
+         ids_on=('player_id', lambda x: ', '.join(map(str,x))),
+     ).reset_index()
+
+     shifts_on = shifts_on.rename(columns={
+         'start':"seconds_elapsed"
+     })
+
+     # Identify shift stops for each shift event
+     shifts_off = shift.groupby(['event_team_abbr', 'period', 'end']).agg(
+         num_off=('player_name', 'size'),
+         players_off=('player_name', lambda x: ', '.join(x)),
+         ids_off=('player_id', lambda x: ', '.join(map(str,x))),
+     ).reset_index()
+
+     shifts_off = shifts_off.rename(columns={
+         'end':"seconds_elapsed"
+     })
+
+     # Merge and sort by time in game
+     shifts = pd.merge(shifts_on, shifts_off, on=['event_team_abbr', 'period', 'seconds_elapsed'], how='outer')
+
+     shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200*(shifts['period'].astype(int)-1))
+     shifts['event_type'] = 'change'
+
+     #Shift events similar to html (remove shootout shifts)
+     shifts = shifts.loc[shifts['period'].astype(int)<5].sort_values(['period','seconds_elapsed'])
+
+     #Generate on-ice columns
+     skater_names = list(rosters.loc[rosters['positionCode']!="G",'playerId'].astype(str))
+     goalie_names = list(rosters.loc[rosters['positionCode']=="G",'playerId'].astype(str))
+     team = list(shift['event_team_abbr'])[0]
+
+     skaters = pd.DataFrame()
+     goalies = pd.DataFrame()
+     for player in skater_names:
+         #For each player in the game, determine when they began and ended shifts.
+         #With player ids as columns, 1 represents a shift event a player was on the ice for while 0 represents off the ice
+         on_ice = (np.cumsum(
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
+             .apply(str)
+             .apply(lambda x: int(bool(re.search(player, x)))) -
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
+             .apply(str)
+             .apply(lambda x: int(bool(re.search(player, x))))
+         ))
+         skaters[player] = on_ice
+
+     skaters = skaters.fillna(0).astype(int)
+
+     on_skaters = (skaters == 1).stack().reset_index()
+     on_skaters = on_skaters[on_skaters[0]].groupby("level_0")["level_1"].apply(list).reset_index()
+
+     max_players = 6
+     for i in range(max_players):
+         on_skaters[f"{'home' if home else 'away'}_on_{i+1}_id"] = on_skaters["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
+
+     on_skaters = on_skaters.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
+
+     #Repeat this process with goaltenders
+     for player in goalie_names:
+         on_ice = (np.cumsum(
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_on']
+             .apply(str)
+             .apply(lambda x: int(bool(re.search(player, x)))) -
+             shifts.loc[(shifts['event_team_abbr'] == team), 'ids_off']
+             .apply(str)
+             .apply(lambda x: int(bool(re.search(player, x))))
+         ))
+         goalies[player] = on_ice
+
+     goalies = goalies.fillna(0).astype(int)
+
+     on_goalies = (goalies == 1).stack().reset_index()
+     on_goalies = on_goalies[on_goalies[0]].groupby("level_0")["level_1"].apply(list).reset_index()
+
+     max_players = 1
+     for i in range(max_players):
+         on_goalies[f"{'home' if home else 'away'}_goalie_id"] = on_goalies["level_1"].apply(lambda x: x[i] if i < len(x) else " ")
+
+     on_goalies = on_goalies.drop(columns=["level_1"]).rename(columns={"level_0": "row"})
+
+     #Combine on-ice skaters and goaltenders for each shift event
+     on_players = pd.merge(on_skaters,on_goalies,how='outer',on=['row'])
+
+     shifts['row'] = shifts.index
+
+     if home:
+         shifts['home_team_abbr'] = team
+     else:
+         shifts['away_team_abbr'] = team
+     #Return: shift events with newly added on-ice columns. NaN values are replaced with empty strings to create proper on-ice columns for the json pbp
+     return pd.merge(shifts,on_players,how="outer",on=['row']).replace(np.nan,"")
+
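The on-ice indicator in parse_shift_events() is a running +1/-1 sum per player across shift events; a compact sketch with one invented player id:

    import numpy as np
    import pandas as pd

    # +1 when the id appears in ids_on, -1 when it appears in ids_off;
    # the cumulative sum is 1 while the player is on the ice, 0 otherwise.
    ids_on  = pd.Series(['8478402', '', ''])
    ids_off = pd.Series(['', '', '8478402'])
    player = '8478402'
    on_ice = np.cumsum(ids_on.str.contains(player).astype(int)
                       - ids_off.str.contains(player).astype(int))
    print(on_ice.tolist())  # [1, 1, 0]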
+ ## FINALIZE PBP FUNCTIONS ##
+ def combine_shifts(info,sources):
+     #Given game info, return complete shift events
+
+     #JSON Prep
+     roster = info['rosters']
+
+     #Quickly combine shifts data
+     away = parse_shift_events(info,False)
+     home = parse_shift_events(info,True)
+
+     #Combine shifts
+     data = pd.concat([away,home]).sort_values(['period','seconds_elapsed'])
+
+     #Add game info
+     info_col = ['season','season_type','game_id','game_date',"venue","venue_location",
+                 'away_team_abbr','home_team_abbr']
+
+     for col in info_col:
+         data[col] = info[col]
+
+     #Create player information dicts to create on-ice names
+     roster['playerId'] = roster['playerId'].astype(str)
+     players = roster.set_index("playerId")['full_name'].to_dict()
+
+     for i in range(0,7):
+         if i == 6:
+             data['away_goalie'] = data['away_goalie_id'].replace(players)
+             data['home_goalie'] = data['home_goalie_id'].replace(players)
+         else:
+             data[f'away_on_{i+1}'] = data[f'away_on_{i+1}_id'].replace(players)
+             data[f'home_on_{i+1}'] = data[f'home_on_{i+1}_id'].replace(players)
+
+     data = data.sort_values(['period','seconds_elapsed'])
+     #Fill on-ice columns down
+     on_ice_col = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6',
+                   'away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',
+                   'home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6',
+                   'home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',
+                   'away_goalie','home_goalie','away_goalie_id','home_goalie_id']
+
+     for col in on_ice_col:
+         data[col] = data[col].ffill()
+
+     #Create strength state information
+     away_on = ['away_on_1_id','away_on_2_id','away_on_3_id','away_on_4_id','away_on_5_id','away_on_6_id',]
+     home_on = ['home_on_1_id','home_on_2_id','home_on_3_id','home_on_4_id','home_on_5_id','home_on_6_id',]
+     data['away_skaters'] = data[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
+     data['home_skaters'] = data[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
+     data['strength_state'] = np.where(data['event_team_abbr']==data['away_team_abbr'],data['away_skaters'].astype(str)+"v"+data['home_skaters'].astype(str),data['home_skaters'].astype(str)+"v"+data['away_skaters'].astype(str))
+
+     #Create final shifts df
+     col = [col for col in get_col() if col in data.columns.to_list()]
+     full_shifts = data[col]
+
+     #Export sources if true
+     if sources:
+         dirs = f"sources/{info['season']}/SHIFTS/"
+
+         if not os.path.exists(dirs):
+             os.makedirs(dirs)
+
+         full_shifts.to_csv(f"{dirs}{info['game_id']}_SHIFTS.csv",index=False)
+
+     #Return: full shifts data converted to play-by-play format
+     return full_shifts
+
+ def combine_data(info,sources):
+     #Given game info, return complete play-by-play data
+
+     game_id = info['game_id']
+
+     pbp = combine_pbp(info,sources)
+     shifts = combine_shifts(info,sources)
+
+     #Combine data
+     df = pd.concat([pbp,shifts])
+
+     df['event_num'] = df['event_num'].replace(np.nan,0)
+
+     #Create priority columns designed to order events that occur at the same time in a game
+     even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
+     df['priority'] = np.where(df['event_type'].isin(even_pri),1,
+                      np.where(df['event_type']=='goal',2,
+                      np.where(df['event_type']=='stoppage',3,
+                      np.where(df['event_type']=='delayed-penalty',4,
+                      np.where(df['event_type']=='penalty',5,
+                      np.where(df['event_type']=='period-end',6,
+                      np.where(df['event_type']=='change',7,
+                      np.where(df['event_type']=='game-end',8,
+                      np.where(df['event_type']=='period-start',9,
+                      np.where(df['event_type']=='faceoff',10,0))))))))))
+
+     df[['period','seconds_elapsed']] = df[['period','seconds_elapsed']].astype(int)
+     df = df.sort_values(['period','seconds_elapsed','event_num','priority'])
+
+     #Recalibrate event_num column to accurately depict the order of all events, including changes
+     df.reset_index(inplace=True,drop=True)
+     df['event_num'] = df.index+1
+     df['event_team_venue'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
+     df['event_type_last'] = df['event_type'].shift(1)
+     df['event_type_last_2'] = df['event_type_last'].shift(1)
+     df['event_type_next'] = df['event_type'].shift(-1)
+     lag_events = ['stoppage','goal','period-end']
+     lead_events = ['faceoff','period-end']
+     period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]
+     #Define shifts by "line-change" or "on-the-fly"
+     df['shift_type'] = np.where(df['event_type']=='change',np.where((df['event_type_last'].isin(lag_events))|(df['event_type_last_2'].isin(lag_events))|(df['event_type_next'].isin(lead_events))|(df['seconds_elapsed'].isin(period_end_secs)),"line-change","on-the-fly"),"")
+     df['description'] = df['description'].combine_first(df['event_team_abbr']+" CHANGE: "+df['shift_type'])
+     if 'event_type_code' in df.columns:
+         df['event_type_code'] = np.where(df['event_type']=='change',499,df['event_type_code'])
+
+     #Add time since last event and overall event length
+     df['seconds_since_last'] = df['seconds_elapsed'] - df['seconds_elapsed'].shift(1)
+     df['event_length'] = df['seconds_since_last'].shift(-1)
+
+     #Add fixed strength state column
+     df['strength_state_venue'] = df['away_skaters'].astype(str)+'v'+df['home_skaters'].astype(str)
+
+     #Retrieve coaches
+     coaches = info['coaches']
+     if not coaches:
+         df['away_coach'] = ""
+         df['home_coach'] = ""
+         df['event_coach'] = ""
+     else:
+         df['away_coach'] = coaches['away']
+         df['home_coach'] = coaches['home']
+         df['event_coach'] = np.where(df['event_team_abbr']==df['home_team_abbr'],coaches['home'],np.where(df['event_team_abbr']==df['away_team_abbr'],coaches['away'],""))
+
+     #Assign score, corsi, fenwick, and penalties for each event
+     for venue in ['away','home']:
+         df[f'{venue}_score'] = ((df['event_team_venue']==venue)&(df['event_type']=='goal')).cumsum()
+         df[f'{venue}_corsi'] = ((df['event_team_venue']==venue)&(df['event_type'].isin(['blocked-shot','missed-shot','shot-on-goal','goal']))).cumsum()
+         df[f'{venue}_fenwick'] = ((df['event_team_venue']==venue)&(df['event_type'].isin(['missed-shot','shot-on-goal','goal']))).cumsum()
+         df[f'{venue}_penalties'] = ((df['event_team_venue']==venue)&(df['event_type']=='penalty')).cumsum()
+
+     #Add time adjustments
+     df['period_time'] = np.trunc((df['seconds_elapsed']-((df['period']-1)*1200))/60).astype(str).str.replace('.0','')+":"+(df['seconds_elapsed'] % 60).astype(str).str.pad(2,'left','0')
+     df['game_time'] = np.trunc(df['seconds_elapsed']/60).astype(str).str.replace('.0','')+":"+(df['seconds_elapsed'] % 60).astype(str).str.pad(2,'left','0')
+
+     #Forward fill as necessary
+     cols = ['period_type','home_team_defending_side','away_coach','home_coach']
+     for col in cols:
+         if col not in df.columns:
+             df[col] = ""
+         df[col] = df[col].ffill()
+
+     #Return: complete play-by-play with all important data for each event in a provided game
+     return df[[col for col in get_col() if col in df.columns.to_list()]].replace(r'^\s*$', np.nan, regex=True)
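A hedged end-to-end sketch tying the order of operations together (hypothetical game id; requires network access and the helpers above on the import path):

    info = get_game_info('2023020001')       # example id, 2023-24 regular season
    pbp = combine_data(info, sources=False)  # full pbp with shifts interleaved
    print(pbp[['event_num', 'event_type', 'strength_state']].head())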