TopDownHockey_Scraper-6.1.30-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3224 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from bs4 import BeautifulSoup # Keep for fallback/compatibility
4
+ from lxml import html, etree
5
+ import requests
6
+ import time
7
+ from datetime import datetime, timedelta
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+ import sys
11
+ import json
12
+ from json import loads, dumps
13
+ import lxml
14
+ from requests import ConnectionError, ReadTimeout, ConnectTimeout, HTTPError, Timeout
15
+ import xml
16
+ import re
17
+ from natsort import natsorted
18
+ import xml.etree.ElementTree as ET
19
+ import xmltodict
20
+ from xml.parsers.expat import ExpatError
21
+ from requests.exceptions import ChunkedEncodingError
22
+ import traceback
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from TopDownHockey_Scraper.scrape_nhl_api_events import scrape_api_events
25
+
26
+ print('Successfully loaded TopDownHockey_Scraper - optimized version (batched appends, dictionary-based name corrections, vectorized operations, parallel network requests)')
27
+
28
+ # ========== OPTIMIZATIONS ==========
29
+ # Create a persistent session with connection pooling
30
+ _session = requests.Session()
31
+ _adapter = requests.adapters.HTTPAdapter(
32
+ pool_connections=10,
33
+ pool_maxsize=20,
34
+ max_retries=2
35
+ )
36
+ _session.mount('http://', _adapter)
37
+ _session.mount('https://', _adapter)
38
+
39
+ # Compile regex patterns once for reuse
40
+ _BBORDER_PATTERN = re.compile('.*bborder.*')
41
+ _ZONE_PATTERN = re.compile(r'(\S+?) Zone')
42
+ _PLAYER_NUM_PATTERN = re.compile(r'[#-]\s*(\d+)')
43
+ _MATCH_GAME_PATTERN = re.compile(r'Match|Game')
44
+ _PARENTHESIS_PATTERN = re.compile(r'\((.*?)\)')
45
+ _MULTI_SPACE_PATTERN = re.compile(r' +')
46
+ _CAPTAIN_A_PATTERN = re.compile(r' \(A\)$')
47
+ _CAPTAIN_C_PATTERN = re.compile(r' \(C\)$')
48
+
49
+ # ========== PARALLEL FETCHING HELPERS ==========
50
+ def _fetch_url(url, **kwargs):
51
+ """Helper function to fetch URL with session for use in ThreadPoolExecutor"""
52
+ return _session.get(url, **kwargs)
53
+ # ===============================================
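+ # Illustrative sketch (not called in this module): _fetch_url is meant to be fanned
+ # out across a ThreadPoolExecutor so several report pages download concurrently over
+ # the pooled session. The URLs below are placeholders.
+ #
+ # urls = ['http://www.nhl.com/scores/htmlreports/20242025/PL020333.HTM',
+ #         'http://www.nhl.com/scores/htmlreports/20242025/RO020333.HTM']
+ # with ThreadPoolExecutor(max_workers=4) as pool:
+ #     futures = {pool.submit(_fetch_url, url, timeout=10): url for url in urls}
+ #     pages = {futures[f]: f.result() for f in as_completed(futures)}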
54
+
55
+ team_names = ['ANAHEIM DUCKS',
56
+ 'ARIZONA COYOTES',
57
+ 'ATLANTA THRASHERS',
58
+ 'BOSTON BRUINS',
59
+ 'BUFFALO SABRES',
60
+ 'CALGARY FLAMES',
61
+ 'CHICAGO BLACKHAWKS',
62
+ 'COLORADO AVALANCHE',
63
+ 'COLUMBUS BLUE JACKETS',
64
+ 'DALLAS STARS',
65
+ 'DETROIT RED WINGS',
66
+ 'EDMONTON OILERS',
67
+ 'FLORIDA PANTHERS',
68
+ 'LOS ANGELES KINGS',
69
+ 'MINNESOTA WILD',
70
+ 'MONTRÉAL CANADIENS',
71
+ 'MONTREAL CANADIENS',
72
+ 'NASHVILLE PREDATORS',
73
+ 'NEW JERSEY DEVILS',
74
+ 'NEW YORK ISLANDERS',
75
+ 'NEW YORK RANGERS',
76
+ 'OTTAWA SENATORS',
77
+ 'PHILADELPHIA FLYERS',
78
+ 'PITTSBURGH PENGUINS',
79
+ 'PHOENIX COYOTES',
80
+ 'CAROLINA HURRICANES',
81
+ 'SAN JOSE SHARKS',
82
+ 'ST. LOUIS BLUES',
83
+ 'TAMPA BAY LIGHTNING',
84
+ 'TORONTO MAPLE LEAFS',
85
+ 'UTAH MAMMOTH',
86
+ 'VANCOUVER CANUCKS',
87
+ 'VEGAS GOLDEN KNIGHTS',
88
+ 'WASHINGTON CAPITALS',
89
+ 'WINNIPEG JETS',
90
+ 'SEATTLE KRAKEN']
91
+
92
+ # ewc stands for "Events we care about."
93
+
94
+ ewc = ['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL']
95
+
96
+ # ========== OPTIMIZATION: Name Correction Dictionaries ==========
97
+ # Convert nested np.where() chains to fast dictionary lookups
98
+ # This provides 50-90% speedup on name correction operations
99
+
100
+ # ==================================
101
+
102
+ from TopDownHockey_Scraper.name_corrections import NAME_CORRECTIONS, normalize_player_name
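+ # Illustrative sketch of the pattern, assuming NAME_CORRECTIONS maps raw report
+ # spellings to corrected ones:
+ #
+ # roster_df['Name'] = roster_df['Name'].replace(NAME_CORRECTIONS) # one vectorized dict lookup instead of a nested np.where() chain per misspelling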
103
+
105
+
106
+ def subtract_from_twenty_minutes(time_string):
107
+ # Parse the input time string
108
+ minutes, seconds = map(int, time_string.split(':'))
109
+
110
+ # Convert to total seconds
111
+ input_seconds = minutes * 60 + seconds
112
+ twenty_minutes_seconds = 20 * 60 # 1200 seconds
113
+
114
+ # Calculate the difference
115
+ difference_seconds = twenty_minutes_seconds - input_seconds
116
+
117
+ # Convert back to MM:SS format
118
+ result_minutes = difference_seconds // 60
119
+ result_seconds = difference_seconds % 60
120
+
121
+ # Format the result
122
+ return f"{result_minutes}:{result_seconds:02d}"
123
+
124
+ def convert_clock_to_seconds(clock):
125
+ minutes = int(clock.split(':')[0])
126
+ secs = int(clock.split(':')[1])
127
+ seconds = minutes * 60 + secs
128
+ return seconds
129
+
130
+ def convert_seconds_to_clock(seconds):
131
+
132
+ minutes = int(seconds / 60)
133
+ remainder = int(seconds - (60 * minutes))
134
+ return f"{minutes:02d}:{remainder:02d}"
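+ # Example round trip: convert_clock_to_seconds('12:34') -> 754; convert_seconds_to_clock(754) -> '12:34'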
145
+
146
+ def scrape_schedule(start_date, end_date):
147
+
148
+ """
149
+ Scrape the NHL's API and get a schedule back.
150
+ """
151
+
152
+ url = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=' + start_date + '&endDate=' + end_date
153
+ page = _session.get(url, timeout=30)
154
+ loaddict = json.loads(page.content)
155
+ date_list = (loaddict['dates'])
156
+ date_df = pd.DataFrame(date_list)
157
+
158
+ # OPTIMIZED: Use list + concat instead of repeated _append()
159
+ gamedf_list = []
160
+ for i in range(len(date_df)):
161
+ datedf = pd.DataFrame(date_df.games.iloc[i])
162
+ gamedf_list.append(datedf)
163
+ gamedf = pd.concat(gamedf_list, ignore_index=True) if gamedf_list else pd.DataFrame()
164
+ global team_df
165
+ team_df = pd.DataFrame(gamedf['teams'].values.tolist(), index = gamedf.index)
166
+ away_df = pd.DataFrame(team_df['away'].values.tolist(), index = team_df.index)
167
+ home_df = pd.DataFrame(team_df['home'].values.tolist(), index = team_df.index)
168
+ away_team_df = pd.DataFrame(away_df['team'].values.tolist(), index = away_df.index)
169
+ home_team_df = pd.DataFrame(home_df['team'].values.tolist(), index = home_df.index)
170
+
171
+ gamedf = gamedf.assign(
172
+ state = pd.DataFrame(gamedf['status'].values.tolist(), index = gamedf.index)['detailedState'],
173
+ homename = home_team_df['name'],
174
+ homeid = home_team_df['id'],
175
+ homescore = home_df['score'],
176
+ awayname = away_team_df['name'],
177
+ awayid = away_team_df['id'],
178
+ awayscore = away_df['score'],
179
+ venue = pd.DataFrame(gamedf['venue'].values.tolist(), index = gamedf.index)['name'],
180
+ gameDate = pd.to_datetime(gamedf['gameDate']).dt.tz_convert('EST')
181
+ )
182
+
183
+ gamedf = gamedf.loc[:, ['gamePk', 'link', 'gameType', 'season', 'gameDate','homeid', 'homename', 'homescore','awayid', 'awayname', 'awayscore', 'state', 'venue']].rename(
184
+ columns = {'gamePk':'ID', 'gameType':'type', 'gameDate':'date'})
185
+
188
+ return gamedf
189
+
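+ def return_name_html(info):
+ """
+ Minimal reconstruction of a helper hs_strip_html() depends on but which was missing
+ here; assumed to follow Harry Shomer's hockey_scraper, where the title attribute
+ reads like 'Center - MIKE RICHARDS' and the name is everything after the first hyphen.
+ """
+ s = info.index('-')
+ return info[s + 1:].strip(' ')
+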
190
+ def hs_strip_html(td):
191
+ """
192
+ Function from Harry Shomer's Github
193
+
194
+ Strip html tags and such
195
+
196
+ :param td: pbp (list of lxml elements)
197
+
198
+ :return: list of plays (which contain a list of info) stripped of html
199
+ """
200
+ for y in range(len(td)):
201
+ # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
202
+ if y == 3:
203
+ td[y] = td[y].text_content() # This gets us elapsed and remaining combined -> e.g. '3:0017:00'
204
+ index = td[y].find(':')
205
+ td[y] = td[y][:index+3]
206
+ elif (y == 6 or y == 7) and td[0] != '#':
207
+ # 6 & 7-> These are the player 1 ice one's
208
+ # The second statement controls for when it's just a header
209
+ baz = td[y].xpath('.//td')
210
+ bar = [baz[z] for z in range(len(baz)) if z % 4 != 0] # Because of previous step we get repeats...delete some
211
+
212
+ # The setup in the list is now: Name/Number->Position->Blank...and repeat
213
+ # Now strip all the html
214
+ players = []
215
+ for i in range(len(bar)):
216
+ if i % 3 == 0:
217
+ try:
218
+ font_elem = bar[i].xpath('.//font')
219
+ if font_elem:
220
+ name = return_name_html(font_elem[0].get('title', ''))
221
+ else:
222
+ name = ''
223
+ number = bar[i].text_content().strip('\n') # Get number and strip leading/trailing newlines
224
+ except (KeyError, IndexError):
225
+ name = ''
226
+ number = ''
227
+ elif i % 3 == 1:
228
+ if name != '':
229
+ position = bar[i].text_content()
230
+ players.append([name, number, position])
231
+
232
+ td[y] = players
233
+ else:
234
+ td[y] = td[y].text_content()
235
+
236
+ return td
237
+
238
+ def parse_goaltender_summary(goalie_table):
239
+ """Parse the goaltender summary table into a DataFrame."""
240
+
241
+ rows = goalie_table.find_all('tr')
242
+
243
+ goalie_data = []
244
+ current_team = None
245
+
246
+ for row in rows:
247
+ cells = row.find_all('td')
248
+ if not cells:
249
+ continue
250
+
251
+ # Check if this is a team header row (contains team name)
252
+ first_cell_text = cells[0].get_text(strip=True)
253
+
254
+ # Team header row - look for visitorsectionheading or homesectionheading
255
+ if 'visitorsectionheading' in str(cells[0].get('class', [])) or \
256
+ 'homesectionheading' in str(cells[0].get('class', [])):
257
+ # Extract team name
258
+ if first_cell_text and first_cell_text not in ['TOI', 'GOALS-SHOTS AGAINST', 'EV', 'PP', 'SH', 'TOT', '1', '2', '3']:
259
+ current_team = first_cell_text
260
+ continue
261
+
262
+ # Skip subheader rows (EV, PP, SH, etc.)
263
+ if first_cell_text in ['EV', 'PP', 'SH', 'TOT', '']:
264
+ continue
265
+
266
+ # Skip TEAM TOTALS and spacer rows
267
+ if 'TEAM TOTALS' in first_cell_text or first_cell_text == '\xa0':
268
+ continue
269
+
270
+ # This should be a goaltender data row
271
+ # Check if it has position "G" in the second cell
272
+ if len(cells) >= 11:
273
+ cell_texts = [c.get_text(strip=True) for c in cells]
274
+
275
+ # Goalie rows have: Number, "G", Name, EV, PP, SH, TOT, P1, P2, P3, TOT
276
+ if len(cell_texts) >= 2 and cell_texts[1] == 'G':
277
+ goalie_data.append({
278
+ 'team': current_team,
279
+ 'number': cell_texts[0],
280
+ 'name': cell_texts[2],
281
+ 'EV Total': cell_texts[3] if cell_texts[3] else None,
282
+ 'PP Total': cell_texts[4] if cell_texts[4] else None,
283
+ 'TOI': cell_texts[6] if cell_texts[6] else None,
284
+ })
285
+
286
+ return pd.DataFrame(goalie_data)
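+ # The result has one row per goalie with columns: team, number, name, 'EV Total', 'PP Total', 'TOI'.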
287
+
288
+ def group_if_not_none(result):
289
+ if result is not None:
290
+ result = result.group()
291
+ return result
292
+
293
+ def scrape_html_roster(season, game_id, page=None):
294
+ """
295
+ Scrape HTML roster page.
296
+
297
+ Args:
298
+ season: Season string (e.g., '20242025')
299
+ game_id: Game ID string (e.g., '020333')
300
+ page: Optional pre-fetched requests.Response object. If None, will fetch the page.
301
+
302
+ Returns:
303
+ DataFrame with roster information
304
+ """
305
+ if page is None:
306
+ url = 'http://www.nhl.com/scores/htmlreports/' + season + '/RO0' + game_id + '.HTM'
307
+
308
+ # TIME: Roster network request
309
+ net_start = time.time()
310
+ page = _session.get(url, timeout=10)
311
+ net_duration = time.time() - net_start
312
+ try:
313
+ print(f' ⏱️ Roster network request: {net_duration:.2f}s')
314
+ except Exception:
315
+ pass
316
+
317
+ # OPTIMIZED: Use lxml directly instead of BeautifulSoup for faster parsing
318
+ if isinstance(page, str):
319
+ doc = html.fromstring(page)
320
+ else:
321
+ doc = html.fromstring(page.content.decode('ISO-8859-1'))
322
+
323
+ # XPath to find td elements with align='center', class containing 'teamHeading' and 'border', width='50%'
324
+ teamsoup = doc.xpath("//td[@align='center' and @width='50%' and contains(@class, 'teamHeading') and contains(@class, 'border')]")
325
+ away_team = teamsoup[0].text_content() if len(teamsoup) > 0 else ''
326
+ home_team = teamsoup[1].text_content() if len(teamsoup) > 1 else ''
327
+
328
+ # XPath to find tables with specific attributes, then get td elements from the 3rd table (index 2)
329
+ tables = doc.xpath("//table[@align='center' and @border='0' and @cellpadding='0' and @cellspacing='0' and @width='100%']")
330
+ home_player_soup = tables[2].xpath(".//td") if len(tables) > 2 else []
331
+ # Convert lxml elements to text content
332
+ home_player_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in home_player_soup]
333
+
334
+ length = int(len(home_player_soup)/3)
335
+
336
+ home_player_df = pd.DataFrame(np.array(home_player_soup).reshape(length, 3))
337
+
338
+ home_player_df.columns = home_player_df.iloc[0]
339
+
340
+ home_player_df = home_player_df.drop(0).assign(team = 'home', team_name = home_team)
341
+
342
+ # Get away player data from 2nd table (index 1)
343
+ away_player_soup = tables[1].xpath(".//td") if len(tables) > 1 else []
344
+ # Convert lxml elements to text content
345
+ away_player_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in away_player_soup]
346
+
347
+ length = int(len(away_player_soup)/3)
348
+
349
+ away_player_df = pd.DataFrame(np.array(away_player_soup).reshape(length, 3))
350
+
351
+ away_player_df.columns = away_player_df.iloc[0]
352
+
353
+ away_player_df = away_player_df.drop(0).assign(team = 'away', team_name = away_team)
354
+
355
+ #global home_scratch_soup
356
+
357
+ if len(tables) > 3:
358
+
359
+ try:
360
+
361
+ home_scratch_soup = tables[4].xpath(".//td") if len(tables) > 4 else []
362
+ # Convert lxml elements to text content
363
+ home_scratch_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in home_scratch_soup]
364
+
365
+ if len(home_scratch_soup)>1:
366
+
367
+ length = int(len(home_scratch_soup)/3)
368
+
369
+ home_scratch_df = pd.DataFrame(np.array(home_scratch_soup).reshape(length, 3))
370
+
371
+ home_scratch_df.columns = home_scratch_df.iloc[0]
372
+
373
+ home_scratch_df = home_scratch_df.drop(0).assign(team = 'home', team_name = home_team)
374
+
375
+ except Exception as e:
376
+ print(e)
377
+ print('No home scratch soup')
378
+ home_scratch_df = pd.DataFrame()
379
+
380
+ if 'home_scratch_df' not in locals():
381
+
382
+ home_scratch_df = pd.DataFrame()
383
+
384
+ if len(tables) > 2:
385
+
386
+ try:
387
+
388
+ away_scratch_soup = tables[3].xpath(".//td") if len(tables) > 3 else []
389
+ # Convert lxml elements to text content
390
+ away_scratch_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in away_scratch_soup]
391
+
392
+ if len(away_scratch_soup)>1:
393
+
394
+ length = int(len(away_scratch_soup)/3)
395
+
396
+ away_scratch_df = pd.DataFrame(np.array(away_scratch_soup).reshape(length, 3))
397
+
398
+ away_scratch_df.columns = away_scratch_df.iloc[0]
399
+
400
+ away_scratch_df = away_scratch_df.drop(0).assign(team = 'away', team_name = away_team)
401
+
402
+ except Exception as e:
403
+ print(e)
404
+ print('No away scratch soup')
405
+ away_scratch_df = pd.DataFrame()
406
+
407
+ if 'away_scratch_df' not in locals():
408
+
409
+ away_scratch_df = pd.DataFrame()
410
+
411
+ player_df = pd.concat([home_player_df, away_player_df]).assign(status = 'player')
412
+ scratch_df = pd.concat([home_scratch_df, away_scratch_df]).assign(status = 'scratch')
413
+ roster_df = pd.concat([player_df, scratch_df])
414
+
415
+ roster_df = roster_df.assign(team_name = np.where(roster_df.team_name=='CANADIENS MONTREAL', 'MONTREAL CANADIENS', roster_df.team_name))
416
+
417
+ roster_df = roster_df.assign(team_name = np.where(roster_df.team_name=='MONTRÉAL CANADIENS', 'MONTREAL CANADIENS', roster_df.team_name))
418
+
419
+ # FIX NAMES
420
+
421
+ roster_df = roster_df.rename(columns = {'Nom/Name':'Name'})
422
+
423
+ roster_df.Name = roster_df.Name.apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
424
+ roster_df.Name = roster_df.Name.apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())
425
+
426
+ # OPTIMIZED: Batch string replacements instead of conditional np.where()
427
+ # Max Pacioretty doesn't exist in ESPN in 2009-2010, sadly.
428
+ roster_df['Name'] = (roster_df['Name']
429
+ .str.replace('ALEXANDRE ', 'ALEX ', regex=False)
430
+ .str.replace('ALEXANDER ', 'ALEX ', regex=False)
431
+ .str.replace('CHRISTOPHER ', 'CHRIS ', regex=False))
432
+
433
+ # OPTIMIZED: Use dictionary lookup instead of nested np.where() chains
434
+ # This provides 50-90% speedup on name corrections
435
+
436
+ # OPTIMIZED: Already handled by dictionary lookup above
437
+ # (These names are already in NAME_CORRECTIONS)
438
+
439
+ roster_df['Name'] = np.where((roster_df['Name']=="SEBASTIAN AHO") & (roster_df['Pos']=='D'), 'SEBASTIAN AHO SWE', roster_df['Name'])
440
+ roster_df['Name'] = np.where((roster_df['Name']=="ELIAS PETTERSSON") & (roster_df['Pos']=='D'), 'ELIAS PETTERSSON(D)', roster_df['Name'])
441
+ roster_df['Name'] = np.where((roster_df['Name']=="COLIN WHITE") & (roster_df['Pos']=='D'), 'COLIN WHITE CAN', roster_df['Name'])
442
+ roster_df['Name'] = np.where((roster_df['Name']=="SEAN COLLINS") & (roster_df['Pos']=='D'), 'SEAN COLLINS CAN', roster_df['Name'])
443
+ roster_df['Name'] = np.where((roster_df['Name']=="ALEX PICARD") & (roster_df['Pos']!='D'), 'ALEX PICARD F', roster_df['Name'])
444
+ roster_df['Name'] = np.where((roster_df['Name']=="ERIK GUSTAFSSON") & (int(season)<20132014), 'ERIK GUSTAFSSON 88', roster_df['Name'])
445
+ roster_df['Name'] = np.where((roster_df['Name']=="MIKKO LEHTONEN") & (int(season)<20202021), 'MIKKO LEHTONEN F', roster_df['Name'])
446
+ roster_df['Name'] = np.where(roster_df['Name']=='ALEX BARRÉ-BOULET', 'ALEX BARRE-BOULET', roster_df['Name'])
447
+ roster_df['Name'] = np.where(roster_df['Name']=='COLIN', 'COLIN WHITE CAN', roster_df['Name'])
448
+
449
+ # OPTIMIZED: Already handled by dictionary lookup above
450
+ # (These names are already in NAME_CORRECTIONS)
451
+
452
+ roster_df['Name'] = roster_df['Name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
453
+
454
+ roster_df['Name'] = np.where(roster_df['Name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", roster_df['Name']) # Need to do this after normalization, only then he becomes Slafkovska?
455
+ roster_df['Name'] = np.where(roster_df['Name']== "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", roster_df['Name'])
456
+ roster_df['Name'] = np.where(roster_df['Name']== "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", roster_df['Name'])
457
+
458
+ roster_df['Name'] = np.where(roster_df['Name']== "MARTIN FEHARVARY" , "MARTIN FEHERVARY", roster_df['Name'])
459
+
460
+ roster_df['Name'] = np.where(roster_df['Name']== "MATAJ BLAMEL" , "MATAJ BLAMEL", roster_df['Name'])
461
+
462
+ roster_df['Name'] = roster_df['Name'].str.replace(_MULTI_SPACE_PATTERN, ' ', regex = True) # Collapse repeated whitespace
463
+
464
+ roster_df['Name'] = roster_df['Name'].apply(lambda x: normalize_player_name(x))
465
+
466
+ return roster_df
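+ # Example (IDs from the docstring): scrape_html_roster('20242025', '020333') returns one row per
+ # player and scratch, with the report's '#', 'Pos' and 'Name' columns plus team, team_name and status.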
467
+
468
+ def scrape_html_shifts(season, game_id, live = True, home_page=None, away_page=None, summary = None, roster_cache = None):
469
+ """
470
+ Scrape HTML shifts pages.
471
+
472
+ Args:
473
+ season: Season string (e.g., '20242025')
474
+ game_id: Game ID string (e.g., '020333')
475
+ live: Boolean flag for live games
476
+ home_page: Optional pre-fetched requests.Response object for home shifts page. If None, will fetch.
477
+ away_page: Optional pre-fetched requests.Response object for away shifts page. If None, will fetch.
478
+
479
+ Returns:
480
+ DataFrame with shift information
481
+ """
482
+ goalie_names = roster_cache[roster_cache.Pos=='G'].Name.unique().tolist()
483
+ home_goalie_names = roster_cache[(roster_cache.Pos=='G') & (roster_cache.team=='home')].Name.unique().tolist()
484
+ away_goalie_names = roster_cache[(roster_cache.Pos=='G') & (roster_cache.team=='away')].Name.unique().tolist()
485
+
486
+ if home_page is None:
487
+ url = 'http://www.nhl.com/scores/htmlreports/' + season + '/TH0' + game_id + '.HTM'
488
+
489
+ # TIME: Home shifts network request
490
+ net_start = time.time()
491
+ home_page = _session.get(url, timeout=10)
492
+ net_duration = time.time() - net_start
493
+ try:
494
+ print(f' ⏱️ Home shifts network request: {net_duration:.2f}s')
495
+ except Exception:
496
+ pass
497
+
498
+ # NOTE: Keeping BeautifulSoup for shifts parsing for now due to complex class matching
499
+ # lxml optimization applied to events parsing (major speedup achieved there)
500
+ if isinstance(home_page, str):
501
+ home_soup = BeautifulSoup(home_page, 'lxml')
502
+ else:
503
+ home_soup = BeautifulSoup(home_page.content, 'lxml')
504
+ found = home_soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
505
+ if len(found)==0:
506
+ raise IndexError('This game has no shift data.')
507
+ thisteam = home_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
508
+
509
+ players = dict()
510
+
511
+ # OPTIMIZED: Reduce repeated string operations
512
+ for i in range(len(found)):
513
+ line = found[i].get_text()
514
+ if line == '25 PETTERSSON, ELIAS':
515
+ line = '25 PETTERSSON(D), ELIAS'
516
+ if ', ' in line:
517
+ # OPTIMIZED: Split once and reuse
518
+ name_parts = line.split(',')
519
+ if len(name_parts) >= 2:
520
+ number_last = name_parts[0].split(' ', 1) # Split only once
521
+ number = number_last[0].strip()
522
+ last_name = number_last[1].strip() if len(number_last) > 1 else ''
523
+ first_name = name_parts[1].strip()
524
+ full_name = first_name + " " + last_name
525
+ players[full_name] = {
526
+ 'number': number,
527
+ 'name': full_name,
528
+ 'shifts': []
529
+ }
530
+ else:
531
+ players[full_name]['shifts'].append(line) # Use append instead of extend([line])
532
+
533
+ # OPTIMIZED: Use list + concat instead of repeated _append()
534
+ alldf_list = []
535
+ for key in players.keys():
536
+ shifts_array = np.array(players[key]['shifts'])
537
+ length = (len(shifts_array) // 5) * 5
538
+ shifts_array = shifts_array[:length]
539
+ df = pd.DataFrame(shifts_array.reshape(-1, 5)).rename(
540
+ columns = {0:'shift_number', 1:'period', 2:'shift_start', 3:'shift_end', 4:'duration'})
541
+ df = df.assign(name = players[key]['name'],
542
+ number = players[key]['number'],
543
+ team = thisteam,
544
+ venue = "home")
545
+ alldf_list.append(df)
546
+
547
+ home_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()
548
+
549
+ if live == True:
550
+
551
+ home_shifts = home_shifts.assign(shift_number = home_shifts.shift_number.astype(int))
552
+ home_shifts = home_shifts.assign(number = home_shifts.number.astype(int))
+
+ # Default the live-clock markers so the checks at the end of this function don't
+ # raise a NameError when the goalie-summary fallback below never runs.
+ home_clock_period = None
+ home_clock_time_now = None
553
+
554
+ found = home_soup.find_all('td', {'class':['playerHeading + border', 'bborder + lborder +']})
555
+ if len(found)==0:
556
+ raise IndexError('This game has no shift data.')
557
+ thisteam = home_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
558
+
559
+ players = dict()
560
+
561
+ for i in range(len(found)):
562
+ line = found[i].get_text()
563
+ if line == '25 PETTERSSON, ELIAS':
564
+ line = '25 PETTERSSON(D), ELIAS'
565
+ if ', ' in line:
566
+ # OPTIMIZED: Split once and reuse
567
+ name_parts = line.split(',')
568
+ if len(name_parts) >= 2:
569
+ number_last = name_parts[0].split(' ', 1) # Split only once
570
+ number = number_last[0].strip()
571
+ last_name = number_last[1].strip() if len(number_last) > 1 else ''
572
+ first_name = name_parts[1].strip()
573
+ full_name = first_name + " " + last_name
574
+ players[full_name] = {
575
+ 'number': number,
576
+ 'name': full_name,
577
+ 'shifts': []
578
+ }
579
+ else:
580
+ players[full_name]['shifts'].append(line) # Use append instead of extend([line])
581
+
582
+ # OPTIMIZED: Use list + concat instead of repeated _append()
583
+ alldf_list = []
584
+ for key in players.keys():
585
+ length = int(len(players[key]['shifts'])/6)
586
+ df = pd.DataFrame(np.array((players[key]['shifts'])).reshape(length, 6)).rename(
587
+ columns = {0:'period', 1:'shifts', 2:'avg', 3:'TOI', 4:'EV Total', 5:'PP Total'})
588
+ df = df.assign(name = players[key]['name'],
589
+ number = players[key]['number'],
590
+ team = thisteam,
591
+ venue = "home")
592
+ alldf_list.append(df)
593
+
594
+ home_extra_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()
595
+
596
+ # Trigger: There is no home goalie for this period and we're not about to pull one from the extra shifts.
597
+
598
+ if len(home_shifts[(home_shifts.period==max(home_shifts.period)) & (home_shifts.name.isin(home_goalie_names))]) == 0 and len(home_extra_shifts[home_extra_shifts.name.isin(home_goalie_names)]) == 0:
599
+
600
+ if isinstance(summary, str):
601
+ summary_soup = BeautifulSoup(summary, 'lxml')
602
+ else:
603
+ summary_soup = BeautifulSoup(summary.content.decode('ISO-8859-1'))
604
+
605
+ sections = summary_soup.find_all('td', class_='sectionheading')
606
+ for section in sections:
607
+ if 'GOALTENDER SUMMARY' in section.get_text():
608
+ goalie_table = section.find_parent('tr').find_next_sibling('tr').find('table')
609
+ break
610
+
611
+ goalie_summary = parse_goaltender_summary(goalie_table)
612
+
613
+ goalie_summary = goalie_summary[(goalie_summary.team==thisteam) & ~(pd.isna(goalie_summary['TOI']))]
614
+
615
+ goalie_summary = goalie_summary.assign(name =
616
+ goalie_summary.name.str.split(', ').str[-1] + ' ' + goalie_summary.name.str.split(', ').str[0]
617
+ )
618
+
619
+ goalie_summary.name = goalie_summary.name.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
620
+
621
+ goalie_summary = goalie_summary.assign(period = max(home_shifts.period), shifts = '1', avg = goalie_summary.TOI, venue = 'home').loc[:, home_extra_shifts.columns]
622
+
623
+ home_extra_shifts = pd.concat([home_extra_shifts, goalie_summary])
624
+
625
+ home_extra_shifts = home_extra_shifts.assign(TOI_seconds_summary = home_extra_shifts.TOI.apply(lambda x: convert_clock_to_seconds(x)))
626
+
627
+ home_extra_shifts = home_extra_shifts.merge(
628
+ home_shifts.assign(toi_secs = home_shifts.duration.apply(lambda x: convert_clock_to_seconds(x))
629
+ ).groupby(['name', 'period'])['toi_secs'].sum().reset_index(),
630
+ how = 'left'
631
+ ).fillna(0)
632
+
633
+ home_extra_shifts['toi_secs'] = home_extra_shifts['toi_secs'].astype(int)
634
+
635
+ home_extra_shifts = home_extra_shifts.assign(toi_diff = abs(home_extra_shifts.toi_secs - home_extra_shifts.TOI_seconds_summary))
636
+
637
+ shifts_needing_to_be_added = home_extra_shifts[home_extra_shifts.toi_diff!=0]
638
+
639
+ if len(shifts_needing_to_be_added) > 0:
640
+
641
+ latest_shift_end = home_shifts.assign(period_secs = home_shifts.shift_end.str.split(' / ').str[0].apply(lambda x: convert_clock_to_seconds(x)))[
642
+ home_shifts.period==max(home_shifts.period)
643
+ ].sort_values(by = 'period_secs', ascending = False).period_secs.iloc[0]
644
+
645
+ max_toi = shifts_needing_to_be_added.TOI.apply(lambda x: convert_clock_to_seconds(x)).max()
646
+
647
+ overage = max_toi - latest_shift_end
648
+
649
+ if overage > 0:
650
+ shifts_needing_to_be_added.toi_diff = shifts_needing_to_be_added.toi_diff - overage
651
+
652
+ home_clock_time_now = convert_seconds_to_clock(latest_shift_end)
653
+
654
+ home_clock_period = max(home_shifts.period.astype(int))
655
+
660
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
661
+ shift_start = ((convert_clock_to_seconds(home_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str)
662
+ + ' / ' + (convert_clock_to_seconds(home_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str).apply(lambda x: subtract_from_twenty_minutes(x))),
663
+ shift_end = home_clock_time_now + ' / ' + subtract_from_twenty_minutes(home_clock_time_now),
664
+ duration = shifts_needing_to_be_added.toi_diff
665
+ )
666
+
667
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
668
+ duration = shifts_needing_to_be_added.toi_diff.apply(lambda x: convert_seconds_to_clock(x))
669
+ )
670
+
671
+ shifts_needing_to_be_added = shifts_needing_to_be_added.merge(
672
+ home_shifts.assign(shift_number = home_shifts.shift_number.astype(int)).groupby('name')['shift_number'].max().reset_index().rename(columns = {'shift_number':'prior_max_shift'}),
673
+ how = 'left'
674
+ ).fillna(0)
675
+
676
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_number = shifts_needing_to_be_added.prior_max_shift + 1)
677
+
678
+ shifts_needing_to_be_added.shift_number = shifts_needing_to_be_added.shift_number.astype(int)
679
+
680
+ shifts_needing_to_be_added = shifts_needing_to_be_added.loc[:, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]
681
+
682
+ shifts_needing_to_be_added['number'] = shifts_needing_to_be_added['number'].astype(int)
683
+
684
+ home_shifts = pd.concat([home_shifts, shifts_needing_to_be_added]).sort_values(by = ['number', 'period', 'shift_number'])
685
+
686
+ elif len(shifts_needing_to_be_added) == 0:
687
+ home_clock_period = None
688
+ home_clock_time_now = None
689
+
690
+ if away_page is None:
691
+ url = 'http://www.nhl.com/scores/htmlreports/' + season + '/TV0' + game_id + '.HTM'
692
+
693
+ # TIME: away shifts network request
694
+ net_start = time.time()
695
+ away_page = _session.get(url, timeout=10)
696
+ net_duration = time.time() - net_start
697
+ try:
698
+ print(f' ⏱️ away shifts network request: {net_duration:.2f}s')
699
+ except Exception:
700
+ pass
701
+
702
+ # NOTE: Keeping BeautifulSoup for shifts parsing for now due to complex class matching
703
+ # lxml optimization applied to events parsing (major speedup achieved there)
704
+ if isinstance(away_page, str):
705
+ away_soup = BeautifulSoup(away_page, 'lxml')
706
+ else:
707
+ away_soup = BeautifulSoup(away_page.content, 'lxml')
708
+ found = away_soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
709
+ if len(found)==0:
710
+ raise IndexError('This game has no shift data.')
711
+ thisteam = away_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
712
+
713
+ players = dict()
714
+
715
+ # OPTIMIZED: Reduce repeated string operations
716
+ for i in range(len(found)):
717
+ line = found[i].get_text()
718
+ if line == '25 PETTERSSON, ELIAS':
719
+ line = '25 PETTERSSON(D), ELIAS'
720
+ if ', ' in line:
721
+ # OPTIMIZED: Split once and reuse
722
+ name_parts = line.split(',')
723
+ if len(name_parts) >= 2:
724
+ number_last = name_parts[0].split(' ', 1) # Split only once
725
+ number = number_last[0].strip()
726
+ last_name = number_last[1].strip() if len(number_last) > 1 else ''
727
+ first_name = name_parts[1].strip()
728
+ full_name = first_name + " " + last_name
729
+ players[full_name] = {
730
+ 'number': number,
731
+ 'name': full_name,
732
+ 'shifts': []
733
+ }
734
+ else:
735
+ players[full_name]['shifts'].append(line) # Use append instead of extend([line])
736
+
737
+ # OPTIMIZED: Use list + concat instead of repeated _append()
738
+ alldf_list = []
739
+ for key in players.keys():
740
+ shifts_array = np.array(players[key]['shifts'])
741
+ length = (len(shifts_array) // 5) * 5
742
+ shifts_array = shifts_array[:length]
743
+ df = pd.DataFrame(shifts_array.reshape(-1, 5)).rename(
744
+ columns = {0:'shift_number', 1:'period', 2:'shift_start', 3:'shift_end', 4:'duration'})
745
+ df = df.assign(name = players[key]['name'],
746
+ number = players[key]['number'],
747
+ team = thisteam,
748
+ venue = "away")
749
+ alldf_list.append(df)
750
+
751
+ away_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()
752
+
753
+ if live == True:
754
+
755
+ away_shifts = away_shifts.assign(shift_number = away_shifts.shift_number.astype(int))
756
+ away_shifts = away_shifts.assign(number = away_shifts.number.astype(int))
+
+ # Same live-clock guard as on the home side.
+ away_clock_period = None
+ away_clock_time_now = None
757
+
758
+ found = away_soup.find_all('td', {'class':['playerHeading + border', 'bborder + lborder +']})
759
+ if len(found)==0:
760
+ raise IndexError('This game has no shift data.')
761
+ thisteam = away_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()
762
+
763
+ players = dict()
764
+
765
+ for i in range(len(found)):
766
+ line = found[i].get_text()
767
+ if line == '25 PETTERSSON, ELIAS':
768
+ line = '25 PETTERSSON(D), ELIAS'
769
+ if ', ' in line:
770
+ # OPTIMIZED: Split once and reuse
771
+ name_parts = line.split(',')
772
+ if len(name_parts) >= 2:
773
+ number_last = name_parts[0].split(' ', 1) # Split only once
774
+ number = number_last[0].strip()
775
+ last_name = number_last[1].strip() if len(number_last) > 1 else ''
776
+ first_name = name_parts[1].strip()
777
+ full_name = first_name + " " + last_name
778
+ players[full_name] = {
779
+ 'number': number,
780
+ 'name': full_name,
781
+ 'shifts': []
782
+ }
783
+ else:
784
+ players[full_name]['shifts'].append(line) # Use append instead of extend([line])
785
+
786
+ # OPTIMIZED: Use list + concat instead of repeated _append()
787
+ alldf_list = []
788
+ for key in players.keys():
789
+ length = int(len(players[key]['shifts'])/6)
790
+ df = pd.DataFrame(np.array((players[key]['shifts'])).reshape(length, 6)).rename(
791
+ columns = {0:'period', 1:'shifts', 2:'avg', 3:'TOI', 4:'EV Total', 5:'PP Total'})
792
+ df = df.assign(name = players[key]['name'],
793
+ number = players[key]['number'],
794
+ team = thisteam,
795
+ venue = "away")
796
+ alldf_list.append(df)
797
+
798
+ away_extra_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()
799
+
800
+ # Trigger: There is no away goalie for this period and we're not about to pull one from the extra shifts.
801
+
802
+ if len(away_shifts[(away_shifts.period==max(away_shifts.period)) & (away_shifts.name.isin(away_goalie_names))]) == 0 and len(away_extra_shifts[away_extra_shifts.name.isin(away_goalie_names)]) == 0:
803
+
804
+ if isinstance(summary, str):
805
+ summary_soup = BeautifulSoup(summary, 'lxml')
806
+ else:
807
+ summary_soup = BeautifulSoup(summary.content.decode('ISO-8859-1'))
808
+
809
+ sections = summary_soup.find_all('td', class_='sectionheading')
810
+ for section in sections:
811
+ if 'GOALTENDER SUMMARY' in section.get_text():
812
+ goalie_table = section.find_parent('tr').find_next_sibling('tr').find('table')
813
+ break
814
+
815
+ goalie_summary = parse_goaltender_summary(goalie_table)
816
+
817
+ goalie_summary = goalie_summary[(goalie_summary.team==thisteam) & ~(pd.isna(goalie_summary['TOI']))]
818
+
819
+ goalie_summary = goalie_summary.assign(name =
820
+ goalie_summary.name.str.split(', ').str[-1] + ' ' + goalie_summary.name.str.split(', ').str[0]
821
+ )
822
+
823
+ goalie_summary.name = goalie_summary.name.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
824
+
825
+ goalie_summary = goalie_summary.assign(period = max(away_shifts.period), shifts = '1', avg = goalie_summary.TOI, venue = 'away').loc[:, away_extra_shifts.columns]
826
+
827
+ away_extra_shifts = pd.concat([away_extra_shifts, goalie_summary])
828
+
829
+ away_extra_shifts = away_extra_shifts.assign(TOI_seconds_summary = away_extra_shifts.TOI.apply(lambda x: convert_clock_to_seconds(x)))
830
+
831
+ away_extra_shifts = away_extra_shifts.merge(
832
+ away_shifts.assign(toi_secs = away_shifts.duration.apply(lambda x: convert_clock_to_seconds(x))
833
+ ).groupby(['name', 'period'])['toi_secs'].sum().reset_index(),
834
+ how = 'left'
835
+ ).fillna(0)
836
+
837
+ away_extra_shifts['toi_secs'] = away_extra_shifts['toi_secs'].astype(int)
838
+
839
+ away_extra_shifts = away_extra_shifts.assign(toi_diff = abs(away_extra_shifts.toi_secs - away_extra_shifts.TOI_seconds_summary))
840
+
841
+ shifts_needing_to_be_added = away_extra_shifts[away_extra_shifts.toi_diff!=0]
842
+
843
+ if len(shifts_needing_to_be_added) > 0:
844
+
845
+ latest_shift_end = away_shifts.assign(period_secs = away_shifts.shift_end.str.split(' / ').str[0].apply(lambda x: convert_clock_to_seconds(x)))[
846
+ away_shifts.period==max(away_shifts.period)
847
+ ].sort_values(by = 'period_secs', ascending = False).period_secs.iloc[0]
848
+
849
+ max_toi = shifts_needing_to_be_added.TOI.apply(lambda x: convert_clock_to_seconds(x)).max()
850
+
851
+ overage = max_toi - latest_shift_end
852
+
853
+ if overage > 0:
854
+ shifts_needing_to_be_added.toi_diff = shifts_needing_to_be_added.toi_diff - overage
855
+
856
+ away_clock_time_now = convert_seconds_to_clock(latest_shift_end)
857
+
858
+ away_clock_period = max(away_shifts.period.astype(int))
859
+
864
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
865
+ shift_start = ((convert_clock_to_seconds(away_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str)
866
+ + ' / ' + (convert_clock_to_seconds(away_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str).apply(lambda x: subtract_from_twenty_minutes(x))),
867
+ shift_end = away_clock_time_now + ' / ' + subtract_from_twenty_minutes(away_clock_time_now),
868
+ duration = shifts_needing_to_be_added.toi_diff
869
+ )
870
+
871
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
872
+ duration = shifts_needing_to_be_added.toi_diff.apply(lambda x: convert_seconds_to_clock(x))
873
+ )
874
+
875
+ shifts_needing_to_be_added = shifts_needing_to_be_added.merge(
876
+ away_shifts.assign(shift_number = away_shifts.shift_number.astype(int)).groupby('name')['shift_number'].max().reset_index().rename(columns = {'shift_number':'prior_max_shift'}),
877
+ how = 'left'
878
+ ).fillna(0)
879
+
880
+ shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_number = shifts_needing_to_be_added.prior_max_shift + 1)
881
+
882
+ shifts_needing_to_be_added.shift_number = shifts_needing_to_be_added.shift_number.astype(int)
883
+
884
+ shifts_needing_to_be_added = shifts_needing_to_be_added.loc[:, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]
885
+
886
+ shifts_needing_to_be_added['number'] = shifts_needing_to_be_added['number'].astype(int)
887
+
888
+ away_shifts = pd.concat([away_shifts, shifts_needing_to_be_added]).sort_values(by = ['number', 'period', 'shift_number'])
889
+
890
+ elif len(shifts_needing_to_be_added) == 0:
891
+ away_clock_period = None
892
+ away_clock_time_now = None
893
+
894
+ global all_shifts
895
+
896
+ all_shifts = pd.concat([home_shifts, away_shifts])
897
+
898
+ all_shifts.name = all_shifts.name.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
899
+
900
+ all_shifts = all_shifts.assign(start_time = all_shifts.shift_start.str.split('/').str[0])
901
+
902
+ all_shifts = all_shifts.assign(end_time = all_shifts.shift_end.str.split('/').str[0])
903
+
904
+ #all_shifts = all_shifts[~all_shifts.end_time.str.contains('\xa0')]
905
+
906
+ # Filter out summary rows (GP, G, A, etc.) that might have been included
907
+ # Period should be numeric (1-4) or 'OT', so filter out anything else
908
+ if len(all_shifts) > 0:
909
+ period_str = all_shifts.period.astype(str).str.strip()
910
+ # Only keep rows where period is a valid period value
911
+ valid_mask = period_str.isin(['1', '2', '3', '4', 'OT'])
912
+ all_shifts = all_shifts[valid_mask].copy()
913
+
914
+ if len(all_shifts) > 0:
915
+ all_shifts.period = (np.where(all_shifts.period=='OT', 4, all_shifts.period)).astype(int)
916
+
917
+ # Fill missing end times (shift_end cells containing '\xa0') with start_time + duration,
918
+ # formatted like the reports (a leading zero on the minutes is stripped).
919
+ _end_secs = (60 * all_shifts.start_time.str.split(':').str[0].astype(int)
920
+ + all_shifts.start_time.str.split(':').str[1].astype(int)
921
+ + 60 * all_shifts.duration.str.split(':').str[0].astype(int)
922
+ + all_shifts.duration.str.split(':').str[1].astype(int))
923
+ _computed_end = pd.to_datetime(_end_secs, unit = 's').dt.time.astype(str).str[3:]
924
+ _computed_end = _computed_end.str.replace(r'^0', '', regex = True)
925
+ all_shifts = all_shifts.assign(end_time = np.where(~all_shifts.shift_end.str.contains('\xa0'), all_shifts.end_time, _computed_end))
931
+
932
+ # OPTIMIZED: Batch string replacements instead of conditional np.where()
933
+ all_shifts['name'] = (all_shifts['name']
934
+ .str.replace('ALEXANDRE ', 'ALEX ', regex=False)
935
+ .str.replace('ALEXANDER ', 'ALEX ', regex=False)
936
+ .str.replace('CHRISTOPHER ', 'CHRIS ', regex=False))
937
+
938
+ # OPTIMIZED: Use dictionary lookup instead of nested np.where() chains
939
+ all_shifts['name'] = all_shifts['name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
940
+ all_shifts['name'] = all_shifts['name'].apply(lambda x: normalize_player_name(x))
941
+
942
+ # OPTIMIZED: Already handled by dictionary lookup above
943
+ # Old nested np.where() chains removed; the NAME_CORRECTIONS dictionary lookup
945
+ # (applied via normalize_player_name above) replaces them.
945
+
946
+ # Apply regex to remove (A) and (C) designations at end of names
947
+ all_shifts['name'] = all_shifts['name'].apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
948
+ all_shifts['name'] = all_shifts['name'].apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())
949
+
950
+ # Apply specific name corrections
951
+ all_shifts['name'] = np.where(all_shifts['name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", all_shifts['name']) # Need to do this after normalization, only then he becomes Slafkovska?
952
+ all_shifts['name'] = np.where(all_shifts['name']== "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", all_shifts['name'])
953
+ all_shifts['name'] = np.where(all_shifts['name']== "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", all_shifts['name'])
954
+
955
+ all_shifts['name'] = np.where(all_shifts['name']== 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', all_shifts['name'])
956
+
957
+ all_shifts['name'] = np.where(all_shifts['name']== 'MATAJ BLAMEL' , 'MATEJ BLUMEL', all_shifts['name']) # Mis-encoded report spelling of MATEJ BLUMEL
958
+
959
+ all_shifts['name'] = all_shifts['name'].str.replace(_MULTI_SPACE_PATTERN, ' ', regex = True) # Collapse repeated whitespace
960
+
961
+ # Clean invalid time values (e.g., "28:10" should be "20:00")
962
+ # Times beyond 20:00 (or 5:00 for OT periods) are invalid and should be capped
963
+ def clean_time_value(time_str):
964
+ """Clean invalid time values by capping hours at 20 (23 for parsing, but we'll cap at period max)"""
965
+ if pd.isna(time_str):
966
+ return time_str
967
+ try:
968
+ # Try to parse as-is first
969
+ pd.to_datetime(time_str)
970
+ return time_str
971
+ except Exception:
972
+ # If parsing fails, extract minutes:seconds and cap appropriately
973
+ try:
974
+ parts = str(time_str).split(':')
975
+ if len(parts) == 2:
976
+ minutes = int(parts[0])
977
+ seconds = parts[1]
978
+ # If minutes >= 20, cap at 20:00 (end of regulation period)
979
+ if minutes >= 20:
980
+ return '20:00'
981
+ else:
982
+ return time_str
983
+ except Exception:
984
+ pass
985
+ # If all else fails, return 20:00 as safe default
986
+ return '20:00'
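+ # Examples: clean_time_value('19:42') -> '19:42'; clean_time_value('28:10') -> '20:00'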
987
+
988
+ try:
989
+ all_shifts['start_time'] = all_shifts['start_time'].apply(clean_time_value)
990
+ all_shifts['end_time'] = all_shifts['end_time'].apply(clean_time_value)
991
+ except Exception as e:
992
+ print(f'Error cleaning time values: {e}')
993
+ traceback.print_exc()
994
+
995
+ all_shifts = all_shifts.assign(end_time = np.where(pd.to_datetime(all_shifts.start_time).dt.time > pd.to_datetime(all_shifts.end_time).dt.time, '20:00', all_shifts.end_time),
996
+ goalie = np.where(all_shifts.name.isin(goalie_names), 1, 0))
997
+
998
+ all_shifts = all_shifts.merge(all_shifts[all_shifts.goalie==1].groupby(['team', 'period'])['name'].nunique().reset_index().rename(columns = {'name':'period_gs'}), how = 'left').fillna(0)
999
+
1000
+ # Implement fix for goalies: Goalies who showed up late in the period and were the only goalie to play have their start time re-set to 0:00.
1001
+
1002
+ # The per-period shift number guards against a goalie who was pulled mid-period (e.g. on a delayed penalty) and returned; without it, the start time of their second shift was also pushed to 0:00.
1003
+ all_shifts = all_shifts.assign(period_shift_number = all_shifts.groupby(['period', 'name']).cumcount() + 1)
1004
+
1005
+ all_shifts = all_shifts.assign(start_time = np.where((all_shifts.goalie==1) & (all_shifts.start_time!='0:00') & (all_shifts.period_gs==1) & (all_shifts.period_shift_number==1), '0:00', all_shifts.start_time))
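+ # Example: a lone goalie whose first recorded shift of a period starts at 0:42 is treated
+ # as on-ice from 0:00; any later shift of theirs in the same period is left untouched.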
1006
+
1007
+ # Previously I had this code to fix some kind of problem where goalie shifts didn't properly end.
1008
+ # But now I see this is causing an issue: If a goalie gets pulled and never comes back, this inaccurately fills them in.
1009
+ # Commenting this out and testing what things look like without it.
1010
+
1011
+ # all_shifts = all_shifts.assign(end_time = np.where(
1012
+ # (pd.to_datetime(all_shifts.start_time).dt.time < datetime(2021, 6, 10, 18, 0, 0).time()) &
1013
+ # (all_shifts.period!=3) &
1014
+ # (all_shifts.period!=4) &
1015
+ # (all_shifts.period!=5) &
1016
+ # (all_shifts.goalie==1) &
1017
+ # (all_shifts.period_gs==1),
1018
+ # '20:00', all_shifts.end_time))
1019
+
1020
+ # all_shifts = all_shifts.assign(end_time = np.where(
1021
+ # (pd.to_datetime(all_shifts.start_time).dt.time < datetime(2021, 6, 10, 13, 0, 0).time()) &
1022
+ # (all_shifts.period!=4) &
1023
+ # (all_shifts.period!=5) &
1024
+ # (all_shifts.goalie==1) &
1025
+ # (all_shifts.period_gs==1),
1026
+ # '20:00', all_shifts.end_time))
1027
+
1028
+ global myshifts
1029
+ global changes_on
1030
+ global changes_off
1031
+ myshifts = all_shifts
1032
+ #print('Printing my shifts')
1033
+
1034
+ #print(myshifts)
1035
+
1036
+ myshifts.start_time = myshifts.start_time.str.strip()
1037
+ myshifts.end_time = myshifts.end_time.str.strip()
1038
+
1039
+ myshifts['number'] = myshifts.number.astype(str)
1040
+
1041
+ changes_on = myshifts.groupby(['team', 'period', 'start_time']).agg(
1042
+ on = ('name', ', '.join),
1043
+ on_numbers = ('number', ', '.join),
1044
+ number_on = ('name', 'count')
1045
+ ).reset_index().rename(columns = {'start_time':'time'}).sort_values(by = ['team', 'period', 'time'])
1046
+
1047
+ changes_off = myshifts.groupby(['team', 'period', 'end_time']).agg(
1048
+ off = ('name', ', '.join),
1049
+ off_numbers = ('number', ', '.join),
1050
+ number_off = ('name', 'count')
1051
+ ).reset_index().rename(columns = {'end_time':'time'}).sort_values(by = ['team', 'period', 'time'])
1052
+
1053
+ all_on = changes_on.merge(changes_off, on = ['team', 'period', 'time'], how = 'left')
1054
+ _off_merge = changes_off.merge(changes_on, on = ['team', 'period', 'time'], how = 'left', indicator = True)
1055
+ off_only = _off_merge[_off_merge['_merge']!='both']
1056
+ full_changes = pd.concat([all_on, off_only]).sort_values(by = ['period', 'time']).drop(columns = ['_merge'])
1057
+
1058
+ full_changes['period_seconds'] = full_changes.time.str.split(':').str[0].astype(int) * 60 + full_changes.time.str.split(':').str[1].astype(int)
1059
+
1060
+ # Playoff games (game_id beginning with 3) keep the real clock in overtime; regular-season shootouts are capped at 3900.
+ full_changes['game_seconds'] = (np.where((full_changes.period<5) | (int(game_id[0])==3),
1061
+ (((full_changes.period - 1) * 1200) + full_changes.period_seconds),
1062
+ 3900))
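+ # Example: a change at 4:15 of period 3 -> (3 - 1) * 1200 + 255 = 2655 game seconds.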
1063
+
1064
+ full_changes = full_changes.assign(team = np.where(full_changes.team.str.contains('CANADI'), 'MONTREAL CANADIENS', full_changes.team)).sort_values(by = 'game_seconds')
1065
+
1066
+ if live == True:
1067
+
1068
+ if home_clock_period is not None and away_clock_period is not None:
1069
+
1070
+ min_game_clock = ((min([home_clock_period, away_clock_period]) - 1) * 1200) + min([convert_clock_to_seconds(home_clock_time_now), convert_clock_to_seconds(away_clock_time_now)])
1071
+
1072
+ elif home_clock_period is not None and away_clock_period is None:
1073
+
1074
+ min_game_clock = ((min([home_clock_period]) - 1) * 1200) + min([convert_clock_to_seconds(home_clock_time_now)])
1075
+
1076
+ elif away_clock_period is not None and home_clock_period is None:
1077
+
1078
+ min_game_clock = ((min([away_clock_period]) - 1) * 1200) + min([convert_clock_to_seconds(away_clock_time_now)])
1079
+
1080
+ else:
1081
+ min_game_clock = None
1082
+
1083
+ if min_game_clock is not None:
1084
+
1085
+ full_changes = full_changes[full_changes.game_seconds <= min_game_clock]
1086
+
1087
+ # Live mode always returns the (min_game_clock, changes) pair, even when no clock could be derived.
+ return min_game_clock, full_changes.reset_index(drop = True)
1088
+
1089
+ return full_changes.reset_index(drop = True)
1090
+
1091
+ def scrape_html_events(season, game_id, events_page=None, roster_page=None):
1092
+ """
1093
+ Scrape HTML events page.
1094
+
1095
+ Args:
1096
+ season: Season string (e.g., '20242025')
1097
+ game_id: Game ID string (e.g., '020333')
1098
+ events_page: Optional pre-fetched requests.Response object for events page. If None, will fetch.
1099
+ roster_page: Optional pre-fetched requests.Response object for roster page. If None, will fetch.
1100
+
1101
+ Returns:
1102
+ Tuple of (events DataFrame, roster DataFrame)
1103
+ """
1104
+ #global game
1105
+ if events_page is None:
1106
+ url = 'http://www.nhl.com/scores/htmlreports/' + season + '/PL0' + game_id + '.HTM'
1107
+
1108
+ # TIME: Network request
1109
+ net_start = time.time()
1110
+ events_page = _session.get(url, timeout=10)
1111
+ net_duration = time.time() - net_start
1112
+ try:
1113
+ print(f' ⏱️ HTML events network request: {net_duration:.2f}s')
1114
+ except Exception:
1115
+ pass
1116
+
1117
+ #if int(season)<20092010):
1118
+ # soup = BeautifulSoup(page.content, 'html.parser')
1119
+ #else:
1120
+ # soup = BeautifulSoup(page.content, 'lxml')
1121
+
1122
+ # TIME: Parsing
1123
+ parse_start = time.time()
1124
+ # OPTIMIZED: Use lxml directly instead of BeautifulSoup for faster parsing
1125
+ if isinstance(events_page, str):
1126
+ doc = html.fromstring(events_page)
1127
+ else:
1128
+ doc = html.fromstring(events_page.content.decode('ISO-8859-1'))
1129
+ # XPath to find td elements with class containing 'bborder'
1130
+ tds = doc.xpath("//td[contains(@class, 'bborder')]")
1131
+ #global stripped_html
1132
+ #global eventdf
1133
+ stripped_html = hs_strip_html(tds)
1134
+ length = (len(stripped_html) // 8) * 8
1135
+ stripped_html = stripped_html[:length]
1136
+ eventdf = pd.DataFrame(np.array(stripped_html).reshape(int(length/8), 8)).rename(
1137
+ columns = {0:'index', 1:'period', 2:'strength', 3:'time', 4:'event', 5:'description', 6:'away_skaters', 7:'home_skaters'})
1138
+ split = eventdf.time.str.split(':')
1139
+ # XPath to find td elements with align='center' and style containing 'font-size: 10px;font-weight:bold'
1140
+ potentialnames = doc.xpath("//td[@align='center' and contains(@style, 'font-size: 10px;font-weight:bold')]")
1141
+ game_date = potentialnames[2].text_content() if len(potentialnames) > 2 else ''
1142
+
1143
+ for i in range(0, min(999, len(potentialnames))):
1144
+ away = potentialnames[i].text_content()
1145
+ if ('Away Game') in away or ('tr./Away') in away:
1146
+ away = _MATCH_GAME_PATTERN.split(away)[0]
1147
+ break
1148
+
1149
+ for i in range(0, min(999, len(potentialnames))):
1150
+ home = potentialnames[i].text_content()
1151
+ if ('Home Game') in home or ('Dom./Home') in home:
1152
+ home = _MATCH_GAME_PATTERN.split(home)[0]
1153
+ break
1154
+
1155
+ game = eventdf.assign(away_skaters = eventdf.away_skaters.str.replace('\n', ''),
1156
+ home_skaters = eventdf.home_skaters.str.replace('\n', ''),
1157
+ original_time = eventdf.time,
1158
+ time = split.str[0] + ":" + split.str[1].str[:2],
1159
+ home_team = home,
1160
+ away_team = away)
1161
+
1162
+ game = game.assign(away_team_abbreviated = game.away_skaters[0].split(' ')[0],
1163
+ home_team_abbreviated = game.home_skaters[0].split(' ')[0])
1164
+
1165
+ game = game[game.period!='Per']
1166
+
1167
+ game = game.assign(index = game.index.astype(int)).rename(columns = {'index':'event_index'})
1168
+
1169
+ game = game.assign(event_team = game.description.str.split(' ').str[0])
1170
+
1171
+ game = game.assign(event_team = game.event_team.str.split('\xa0').str[0])
1172
+
1173
+ game = game.assign(event_team = np.where(~game.event_team.isin([game.home_team_abbreviated.iloc[0], game.away_team_abbreviated.iloc[0]]), '\xa0', game.event_team))
1174
+
1175
+ game = game.assign(other_team = np.where(game.event_team=='', '\xa0',
1176
+ np.where(game.event_team==game.home_team_abbreviated.iloc[0], game.away_team_abbreviated.iloc[0], game.home_team_abbreviated.iloc[0])))
1177
+
1178
+ # Optimized: use single function instead of multiple .str.replace() calls
1179
+ def _extract_player_numbers(desc):
1180
+ matches = _PLAYER_NUM_PATTERN.findall(str(desc)) # reuse the module-level compiled pattern
1181
+ return ' '.join(matches)
1182
+ game['event_player_str'] = game.description.apply(_extract_player_numbers)
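+ # Example: a faceoff description like 'TOR #34 MATTHEWS vs BOS #37 BERGERON' yields '34 37'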
1183
+
1184
+ game = game.assign(event_player_1 =
1185
+ game.event_player_str.str.split(' ').str[0],
1186
+ event_player_2 =
1187
+ game.event_player_str.str.split(' ').str[1],
1188
+ event_player_3 =
1189
+ game.event_player_str.str.split(' ').str[2])
1190
+ #return game
1191
+
1192
+ if len(game[game.description.str.contains('Drawn By')])>0:
1193
+
1194
+ game = game.assign(event_player_2 = np.where(game.description.str.contains('Drawn By'),
1195
+ game.description.str.split('Drawn By').str[1].str.split('#').str[1].str.split(' ').str[0].str.strip(),
1196
+ game.event_player_2),
1197
+ event_player_3 = np.where(game.description.str.contains('Served By'),
1198
+ '\xa0',
1199
+ game.event_player_3))
1200
+
1201
+ game = game.assign(event_player_1 = np.where((~pd.isna(game.event_player_1)) & (game.event_player_1!=''),
1202
+ np.where(game.event=='FAC', game.away_team_abbreviated,
1203
+ game.event_team) + (game.event_player_1.astype(str)),
1204
+ game.event_player_1),
1205
+ event_player_2 = np.where((~pd.isna(game.event_player_2)) & (game.event_player_2!=''),
1206
+ np.where(game.event=='FAC', game.home_team_abbreviated,
1207
+ np.where(game.event.isin(['BLOCK', 'HIT', 'PENL']), game.other_team, game.event_team)) + (game.event_player_2.astype(str)),
1208
+ game.event_player_2),
1209
+ event_player_3 = np.where((~pd.isna(game.event_player_3)) & (game.event_player_3!=''),
1210
+ game.event_team + (game.event_player_3.astype(str)),
1211
+ game.event_player_3))
1212
+
1213
+ game = game.assign(
1214
+ event_player_1 = np.where((game.event=='FAC') & (game.event_team==game.home_team_abbreviated),
1215
+ game.event_player_2, game.event_player_1),
1216
+ event_player_2 = np.where((game.event=='FAC') & (game.event_team==game.home_team_abbreviated),
1217
+ game.event_player_1, game.event_player_2))
1218
+
1219
+ #return game
1220
+
1221
+ roster = scrape_html_roster(season, game_id, page=roster_page).rename(columns = {'Nom/Name':'Name'})
1222
+ roster = roster[roster.status=='player']
1223
+ roster = roster.assign(team_abbreviated = np.where(roster.team=='home',
1224
+ game.home_team_abbreviated.iloc[0],
1225
+ game.away_team_abbreviated.iloc[0]))
1226
+
1227
+ roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'])
1228
+ roster['Name'] = roster.Name.apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
1229
+ roster['Name'] = roster.Name.apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())
1230
+
1231
+ event_player_1s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_1', 'Name':'ep1_name'})
1232
+ event_player_2s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_2', 'Name':'ep2_name'})
1233
+ event_player_3s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_3', 'Name':'ep3_name'})
1234
+
1235
+ game = game.merge(
1236
+ event_player_1s, on = 'event_player_1', how = 'left').merge(
1237
+ event_player_2s, on = 'event_player_2', how = 'left').merge(
1238
+ event_player_3s, on = 'event_player_3', how = 'left').assign(
1239
+ date = game_date)
1240
+ #return game
1241
+ game['period'] = np.where(game['period'] == '', '1', game['period'])
1242
+ game['time'] = np.where((game['time'] == '') | (pd.isna(game['time'])), '0:00', game['time'])
1243
+ game['period'] = game.period.astype(int)
1244
+
1245
+ # OPTIMIZED: Split time once instead of twice
1246
+ time_split = game.time.str.split(':')
1247
+ game['period_seconds'] = time_split.str[0].str.replace('-', '', regex=False).astype(int) * 60 + time_split.str[1].str.replace('-', '', regex=False).astype(int)
1248
+
1249
+ # Fixed: '&' binds tighter than '!=' in the original condition, which made it always true.
+ # Playoff games (small game_id starting with '3') keep the real clock past 3900 (multi-OT);
+ # regular-season period 5 (shootout) is pinned to 3900, matching the <4000 filter below.
+ game['game_seconds'] = (np.where((game.period<5) | (int(game_id[0])==3),
1250
+ (((game.period - 1) * 1200) + game.period_seconds),
1251
+ 3900))
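+ # Worked example: period 2 at an elapsed clock of 05:30 gives
+ # (2 - 1) * 1200 + 5 * 60 + 30 = 1530 game seconds.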
1252
+
1253
+ # OPTIMIZED: Use dictionary lookup instead of nested np.where()
1254
+ # TODO: Fix priority map so that we have change before shot or miss if the change involves a player returning from penalty box.
1255
+ priority_map = {
1256
+ 'TAKE': 1, 'GIVE': 1, 'MISS': 1, 'HIT': 1, 'SHOT': 1, 'BLOCK': 1,
1257
+ 'GOAL': 2, 'STOP': 3, 'DELPEN': 4, 'PENL': 5, 'CHANGE': 6,
1258
+ 'PEND': 7, 'GEND': 8, 'FAC': 9
1259
+ }
1260
+ game = game.assign(priority=game.event.map(priority_map).fillna(0).astype(int)).sort_values(by = ['game_seconds', 'period', 'event_player_1', 'event'])
1261
+ game = game.assign(version =
1262
+ (np.where(
1263
+ (game.event==game.event.shift()) &
1264
+ (game.event_player_1==game.event_player_1.shift()) &
1265
+ (game.event_player_1!='') &
1266
+ (game.game_seconds==game.game_seconds.shift()),
1267
+ 1, 0)))
1268
+
1269
+ game = game.assign(version =
1270
+ (np.where(
1271
+ (game.event==game.event.shift(2)) &
1272
+ (game.event_player_1==game.event_player_1.shift(2)) &
1273
+ (game.game_seconds==game.game_seconds.shift(2)) &
1274
+ (game.event_player_1!='') &
1275
+ (~game.description.str.contains('Penalty Shot')),
1276
+ 2, game.version)))
1277
+
1278
+ game = game.assign(version =
1279
+ (np.where(
1280
+ (game.event==game.event.shift(3)) &
1281
+ (game.event_player_1==game.event_player_1.shift(3)) &
1282
+ (game.game_seconds==game.game_seconds.shift(3)) &
1283
+ (game.event_player_1!=''),
1284
+ 3, game.version)))
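+ # The 'version' column disambiguates repeated identical events (same type, same
+ # player, same second) so the later merge with coordinate sources can attach x/y
+ # to the correct duplicate instead of cross-joining them.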
1285
+
1286
+ game = game.assign(date = pd.to_datetime(game.date[~pd.isna(game.date)].iloc[0])
1287
+ ).rename(columns = {'date':'game_date'}).sort_values(by = ['event_index'])
1288
+
1289
+ game = game.assign(event_player_1 = game.ep1_name, event_player_2 = game.ep2_name, event_player_3 = game.ep3_name).drop(columns = ['ep1_name', 'ep2_name', 'ep3_name'])
1290
+
1291
+ # OPTIMIZED: Combine team name replacements into single operation
1292
+ team_replacements = {'CANADIENS MONTREAL': 'MONTREAL CANADIENS', 'MONTRÉAL CANADIENS': 'MONTREAL CANADIENS'}
1293
+ game['home_team'] = game['home_team'].replace(team_replacements)
1294
+ game['away_team'] = game['away_team'].replace(team_replacements)
1295
+
1296
+ if int(game_id[0])!=3:
1297
+ game = game[game.game_seconds<4000]
1298
+
1299
+ game['game_date'] = np.where((season=='20072008') & (game_id == '20003'), game.game_date + pd.Timedelta(days=1), game.game_date)
1300
+
1301
+ game = game.assign(event_player_1 = np.where((game.description.str.upper().str.contains('TEAM')) | (game.description.str.lower().str.contains('bench')),
1302
+ 'BENCH',
1303
+ game.event_player_1))
1304
+
1305
+ game = game.assign(home_skater_count_temp = (game.home_skaters.apply(lambda x: len(re.findall('[A-Z]', x)))),
1306
+ away_skater_count_temp = (game.away_skaters.apply(lambda x: len(re.findall('[A-Z]', x))))
1307
+ )
1308
+
1309
+ game = game.assign(event_team = np.where((game.event=='PENL') & (game.event_team=='') & (game.description.str.lower().str.contains('bench')) & (game.home_skater_count_temp>game.home_skater_count_temp.shift(-1)),
1310
+ game.home_team_abbreviated, game.event_team))
1311
+
1312
+ game = game.assign(event_team = np.where((game.event=='PENL') & (game.event_team=='') & (game.description.str.lower().str.contains('bench')) & (game.away_skater_count_temp>game.away_skater_count_temp.shift(-1)),
1313
+ game.away_team_abbreviated, game.event_team))
1314
+
1315
+ # TIME: Total parsing
1316
+ total_parse_duration = time.time() - parse_start
1317
+ # try:
1318
+ # print(f' ⏱️ HTML events parsing/processing: {total_parse_duration:.2f}s')
1319
+ # except Exception:
1320
+ # pass
1321
+
1322
+ # OPTIMIZATION: Return roster to avoid re-scraping in merge_and_prepare
1323
+ return game.drop(columns = ['period_seconds', 'time', 'priority', 'home_skater_count_temp', 'away_skater_count_temp']), roster
1324
+
1325
+ def scrape_espn_events(espn_game_id, drop_description = True):
1326
+
1327
+ # This URL has event coordinates
1328
+
1329
+ url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{espn_game_id}'
1330
+
1331
+ page = _session.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
1332
+
1333
+ soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
1334
+
1335
+ period_jsons = json.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])
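+ # ESPN embeds the play-by-play JSON directly in the page source, so slicing the
+ # raw HTML between '"playGrps":' and ',"tms"' recovers the per-period event
+ # arrays without needing a JavaScript-capable client.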
1336
+
1337
+ # OPTIMIZED: Use list + concat instead of repeated _append()
1338
+ clock_df_list = []
1339
+ for period in range(0, len(period_jsons)):
1340
+ clock_df_list.append(pd.DataFrame(period_jsons[period]))
1341
+ clock_df = pd.concat(clock_df_list, ignore_index=True) if clock_df_list else pd.DataFrame()
1342
+
1343
+ clock_df = clock_df[~pd.isna(clock_df.clock)]
1344
+
1345
+ # Needed to add .split(',"st":3')[0] for playoffs
1346
+
1347
+ coords_df = pd.DataFrame(json.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))
1348
+
1349
+ clock_df = clock_df.assign(
1350
+ clock = clock_df.clock.apply(lambda x: x['displayValue'])
1351
+ )
1352
+
1353
+ coords_df = coords_df.assign(
1354
+ coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
1355
+ coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
1356
+ event_player_1 = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
1357
+ )
1358
+
1359
+ espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])
1360
+
1361
+ espn_events = espn_events.assign(
1362
+ period = espn_events['period'].apply(lambda x: x['number']),
1363
+ minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
1364
+ seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
1365
+ event_type = espn_events['type'].apply(lambda x: x['txt'])
1366
+ )
1367
+
1368
+ espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
1369
+ (espn_events.event_type=='Face Off'), 0, espn_events.coords_x
1370
+ ),
1371
+ coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
1372
+ (espn_events.event_type=='Face Off'), 0, espn_events.coords_y))
1373
+
1374
+ espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1))]
1375
+
1376
+ espn_events = espn_events.assign(
1377
+ # Do this later
1378
+ coords_x = espn_events.coords_x.astype(int),
1379
+ coords_y = espn_events.coords_y.astype(int)
1380
+ )
1381
+
1382
+ espn_events = espn_events.rename(columns = {'text':'description'})
1383
+
1384
+ # OPTIMIZED: map ESPN event names to report codes with a dict instead of nested np.where()
+ _espn_event_map = {'Face Off': 'FAC', 'Goal': 'GOAL', 'Giveaway': 'GIVE',
+                    'Penalty': 'PENL', 'Missed': 'MISS', 'Shot': 'SHOT',
+                    'Takeaway': 'TAKE', 'Blocked': 'BLOCK', 'Hit': 'HIT'}
+ espn_events = espn_events.assign(event_type = espn_events.event_type.replace(_espn_event_map))
1395
+
1396
+ # OPTIMIZED: dict lookup for priority instead of nested np.where(), mirroring priority_map above
+ _espn_priority_map = {'TAKE': 1, 'GIVE': 1, 'MISS': 1, 'HIT': 1, 'SHOT': 1, 'BLOCK': 1,
+                       'GOAL': 2, 'STOP': 3, 'DELPEN': 4, 'PENL': 5, 'CHANGE': 6,
+                       'PEND': 7, 'GEND': 8, 'FAC': 9}
+ espn_events = espn_events.assign(priority = espn_events.event_type.map(_espn_priority_map).fillna(0).astype(int),
+                                  event_player_1 = espn_events.event_player_1.str.upper(),
+                                  game_seconds = np.where(espn_events.period<5,
+                                                          ((espn_events.period - 1) * 1200) + (espn_events.minutes * 60) + espn_events.seconds, 3900))
1408
+
1409
+ espn_events = espn_events.sort_values(by = ['period', 'game_seconds', 'event_player_1', 'priority']).rename(
1410
+ columns = {'event_type':'event'}).loc[:, ['coords_x', 'coords_y', 'event_player_1', 'event', 'game_seconds', 'description', 'period']]
1411
+
1412
+ # OPTIMIZED: plain str.replace() covers these; the np.where()/str.contains() wrappers were redundant
+ espn_events['event_player_1'] = (espn_events['event_player_1']
+                                  .str.replace('ALEXANDRE ', 'ALEX ', regex=False)
+                                  .str.replace('ALEXANDER ', 'ALEX ', regex=False)
+                                  .str.replace('CHRISTOPHER ', 'CHRIS ', regex=False))
1423
+
1424
+ # OPTIMIZED: one mapping dict replaces the 40-plus-deep nested np.where() chain.
+ # Stripping now happens before the lookup (the original stripped only unmatched
+ # names), the duplicated STEVE KAMPFER entry is collapsed, and the VINNY/VACLAV
+ # PROSPAL entries intentionally swap both ways, which a dict preserves since each
+ # value matches at most one key.
+ _espn_name_corrections = {
+     'PATRICK MAROON': 'PAT MAROON',
+     'J T COMPHER': 'J.T. COMPHER',
+     'J T MILLER': 'J.T. MILLER',
+     'T J OSHIE': 'T.J. OSHIE',
+     'ALEXIS LAFRENIERE': 'ALEXIS LAFRENIÈRE',
+     'ALEXIS LAFRENI RE': 'ALEXIS LAFRENIÈRE',
+     'TIM STUTZLE': 'TIM STÜTZLE',
+     'TIM ST TZLE': 'TIM STÜTZLE',
+     'T.J. BRODIE': 'TJ BRODIE',
+     'MATTHEW IRWIN': 'MATT IRWIN',
+     'STEVE KAMPFER': 'STEVEN KAMPFER',
+     'JEFFREY TRUCHON-VIEL': 'JEFFREY VIEL',
+     'ZACHARY JONES': 'ZAC JONES',
+     'MITCH MARNER': 'MITCHELL MARNER',
+     'MATHEW DUMBA': 'MATT DUMBA',
+     'JOSHUA MORRISSEY': 'JOSH MORRISSEY',
+     'P K SUBBAN': 'P.K. SUBBAN',
+     'EGOR SHARANGOVICH': 'YEGOR SHARANGOVICH',
+     'MAXIME COMTOIS': 'MAX COMTOIS',
+     'NICHOLAS CAAMANO': 'NICK CAAMANO',
+     'DANIEL CARCILLO': 'DAN CARCILLO',
+     'ALEXANDER OVECHKIN': 'ALEX OVECHKIN',
+     'MICHAEL CAMMALLERI': 'MIKE CAMMALLERI',
+     'DAVE STECKEL': 'DAVID STECKEL',
+     'JIM DOWD': 'JAMES DOWD',
+     'MAXIME TALBOT': 'MAX TALBOT',
+     'MIKE ZIGOMANIS': 'MICHAEL ZIGOMANIS',
+     'VINNY PROSPAL': 'VACLAV PROSPAL',
+     'MIKE YORK': 'MICHAEL YORK',
+     'JACOB DOWELL': 'JAKE DOWELL',
+     'MICHAEL RUPP': 'MIKE RUPP',
+     'ALEXEI KOVALEV': 'ALEX KOVALEV',
+     'SLAVA KOZLOV': 'VYACHESLAV KOZLOV',
+     'JEFF HAMILTON': 'JEFFREY HAMILTON',
+     'JOHNNY POHL': 'JOHN POHL',
+     'DANIEL GIRARDI': 'DAN GIRARDI',
+     'NIKOLAI ZHERDEV': 'NIKOLAY ZHERDEV',
+     'J.P. DUMONT': 'J-P DUMONT',
+     'DWAYNE KING': 'DJ KING',
+     'JOHN ODUYA': 'JOHNNY ODUYA',
+     'ROBERT SCUDERI': 'ROB SCUDERI',
+     'DOUG MURRAY': 'DOUGLAS MURRAY',
+     'VACLAV PROSPAL': 'VINNY PROSPAL',
+     'RICH PEVERLY': 'RICH PEVERLEY',
+ }
+ espn_events = espn_events.assign(event_player_1 = espn_events.event_player_1.str.strip().replace(_espn_name_corrections))
1471
+
1472
+ espn_events['event_player_1'] = (np.where(espn_events['event_player_1']== "JANIS MOSER" , "J.J. MOSER",
1473
+ (np.where(espn_events['event_player_1']== "NICHOLAS PAUL" , "NICK PAUL",
1474
+ (np.where(espn_events['event_player_1']== "JACOB MIDDLETON" , "JAKE MIDDLETON",
1475
+ (np.where(espn_events['event_player_1']== "TOMMY NOVAK" , "THOMAS NOVAK",
1476
+ espn_events['event_player_1']))))))))
1477
+
1478
+ espn_events['event_player_1'] = (np.where(espn_events['event_player_1']== "JOHHNY BEECHER" , "JOHN BEECHER",
1479
+ (np.where(espn_events['event_player_1']== "ALEXANDER BARKOV" , "ALEKSANDER BARKOV",
1480
+ (np.where(espn_events['event_player_1']== "TOMMY NOVAK" , "THOMAS NOVAK",
1481
+ espn_events['event_player_1']))))))
1482
+
1483
+ espn_events['event_player_1'] = (np.where(espn_events['event_player_1']== "JANIS MOSER" , "J.J. MOSER",
1484
+ (np.where(espn_events['event_player_1']== "NICHOLAS PAUL" , "NICK PAUL",
1485
+ (np.where(espn_events['event_player_1']== "JACOB MIDDLETON" , "JAKE MIDDLETON",
1486
+ (np.where(espn_events['event_player_1']== "TOMMY NOVAK" , "THOMAS NOVAK",
1487
+ # New guys from 24-25
1488
+ (np.where(espn_events['event_player_1']== "JOSHUA NORRIS" , "JOSH NORRIS",
1489
+ (np.where(espn_events['event_player_1']== "P.O JOSEPH" , "PIERRE-OLIVIER JOSEPH",
1490
+ (np.where(espn_events['event_player_1']== "MIKEY EYSSIMONT" , "MICHAEL EYSSIMONT",
1491
+ (np.where(espn_events['event_player_1']== "MATAJ BLAMEL" , "MATAJ BLAMEL",
1492
+ (np.where(espn_events['event_player_1']== "VITTORIO MANCINI" , "VICTOR MANCINI",
1493
+ (np.where(espn_events['event_player_1']== "JOSHUA MAHURA" , "JOSH MAHURA",
1494
+ (np.where(espn_events['event_player_1']== "JOSEPH VELENO" , "JOE VELENO",
1495
+ (np.where(espn_events['event_player_1']== "ZACK BOLDUC" , "ZACHARY BOLDUC",
1496
+ (np.where(espn_events['event_player_1']== "JOSHUA BROWN" , "JOSH BROWN",
1497
+ (np.where(espn_events['event_player_1']== "JAKE LUCCHINI" , "JACOB LUCCHINI",
1498
+ (np.where(espn_events['event_player_1']== "EMIL LILLEBERG" , "EMIL MARTINSEN LILLEBERG",
1499
+ (np.where(espn_events['event_player_1']== "CAMERON ATKINSON" , "CAM ATKINSON",
1500
+ (np.where(espn_events['event_player_1']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY",
1501
+ (np.where(espn_events['event_player_1']== "MARTIN FEHARVARY" , "MARTIN FEHERVARY",
1502
+ espn_events['event_player_1']))))))))))))))))))))))))))))))))))))
1503
+
1504
+
1505
+ espn_events = espn_events.assign(version =
1506
+ (np.where(
1507
+ (espn_events.event==espn_events.event.shift()) &
1508
+ (espn_events.event_player_1==espn_events.event_player_1.shift()) &
1509
+ (espn_events.event_player_1!='') &
1510
+ (espn_events.game_seconds==espn_events.game_seconds.shift()),
1511
+ 1, 0)))
1512
+
1513
+ espn_events = espn_events.assign(version =
1514
+ (np.where(
1515
+ (espn_events.event==espn_events.event.shift(2)) &
1516
+ (espn_events.event_player_1==espn_events.event_player_1.shift(2)) &
1517
+ (espn_events.game_seconds==espn_events.game_seconds.shift(2)) &
1518
+ (espn_events.event_player_1!='') &
1519
+ (~espn_events.description.str.contains('Penalty Shot')),
1520
+ 2, espn_events.version)))
1521
+
1522
+ espn_events = espn_events.assign(version =
1523
+ (np.where(
1524
+ (espn_events.event==espn_events.event.shift(3)) &
1525
+ (espn_events.event_player_1==espn_events.event_player_1.shift(3)) &
1526
+ (espn_events.game_seconds==espn_events.game_seconds.shift(3)) &
1527
+ (espn_events.event_player_1!=''),
1528
+ 3, espn_events.version)))
1529
+
1530
+ espn_events['espn_id'] = int(espn_game_id)
1531
+
1532
+ espn_events['event_player_1'] = espn_events['event_player_1'].str.strip()
1533
+
1534
+ espn_events['event_player_1'] = espn_events['event_player_1'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
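+ # e.g. 'ALEXIS LAFRENIÈRE' -> 'ALEXIS LAFRENIERE': NFKD decomposes accented
+ # characters and the ascii encode/decode round trip drops the combining marks.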
1535
+
1536
+ # Apply regex to remove (A) and (C) designations at end of names
1537
+ espn_events['event_player_1'] = espn_events['event_player_1'].apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
1538
+ espn_events['event_player_1'] = espn_events['event_player_1'].apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())
1539
+
1540
+ # Apply specific name corrections
1541
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", espn_events['event_player_1'])
1542
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", espn_events['event_player_1'])
1543
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", espn_events['event_player_1'])
1544
+
1545
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', espn_events['event_player_1'])
1546
+
1547
+ espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == 'MATAJ BLAMEL' , 'MATEJ BLUMEL', espn_events['event_player_1'])  # assumed fix: the original mapped the garbled name to itself
1548
+
1549
+ espn_events['event_player_1'] = espn_events['event_player_1'].str.replace(_MULTI_SPACE_PATTERN, ' ', regex=True)  # collapse repeated spaces (the original replaced a space with a space, a no-op)
1550
+
1551
+ #espn_events = espn_events.assign(event_player_1 = np.where(
1552
+ #espn_events.event_player_1=='ALEX BURROWS', 'ALEXANDRE BURROWS', espn_events.event_player_1))
1553
+
1554
+ global look
1555
+ look = espn_events
1556
+
1557
+ espn_events['coords_x'] = np.where(espn_events['coords_x']>99, 99, espn_events['coords_x'])
1558
+ espn_events['coords_y'] = np.where(espn_events['coords_y']<(-42), (-42), espn_events['coords_y'])
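+ # NHL coordinates span roughly x in [-99, 99] and y in [-42, 42] (a 200ft x 85ft
+ # rink centered at ice center); ESPN occasionally reports values just outside
+ # that range, and the two overflowing sides seen in practice are clamped here.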
1559
+
1560
+ if drop_description == True:
1561
+ return espn_events.drop(columns = 'description')
1562
+ else:
1563
+ return espn_events
1564
+
1565
+ def scrape_espn_ids_single_game(game_date, home_team, away_team):
1566
+
1567
+ gamedays = pd.DataFrame()
1568
+
1569
+ if home_team == 'ATLANTA THRASHERS':
1570
+ home_team = 'WINNIPEG JETS'
1571
+ if away_team == 'ATLANTA THRASHERS':
1572
+ away_team = 'WINNIPEG JETS'
1573
+
1574
+ if home_team == 'PHOENIX COYOTES':
1575
+ home_team = 'ARIZONA COYOTES'
1576
+ if away_team == 'PHOENIX COYOTES':
1577
+ away_team = 'ARIZONA COYOTES'
1578
+
1579
+ this_date = game_date
1580
+ url = 'http://www.espn.com/nhl/scoreboard?date=' + this_date.replace("-", "")
1581
+ page = _session.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
1582
+ print('Request to ESPN IDs successful.')
1583
+ soup = BeautifulSoup(page.content, 'lxml')
1584
+ soup_found = soup.find_all('a', {'class':['AnchorLink truncate',
1585
+ 'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100',
1586
+ 'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100 mr2'], 'href':[re.compile("/nhl/team/_/name/"), re.compile("game/_")]})
1587
+ game_sections = soup.find_all('section', class_ = 'Scoreboard bg-clr-white flex flex-auto justify-between')
1588
+
1589
+ at = []
1590
+ ht = []
1591
+ gids = []
1592
+ fax = pd.DataFrame()
1593
+ #print(str(i))
1594
+ for i in range(0, len(game_sections)):
1595
+ away = game_sections[i].find_all('div', class_='ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName db')[0].contents[0].upper()
1596
+ home = game_sections[i].find_all('div', class_='ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName db')[1].contents[0].upper()
1597
+ espnid = game_sections[i]['id']
1598
+ at.append(away)
1599
+ ht.append(home)
1600
+ gids.append(espnid)
1601
+
1602
+ fax = fax.assign(
1603
+ away_team = at,
1604
+ home_team = ht,
1605
+ espn_id = gids,
1606
+ game_date = pd.to_datetime(this_date))
1607
+
1608
+ # OPTIMIZED: Use concat instead of _append
1609
+ gamedays = pd.concat([gamedays, fax], ignore_index=True) if len(gamedays) > 0 else fax
1610
+
1611
+ gamedays = gamedays[gamedays.espn_id!='gameId']
1612
+
1613
+ gamedays = gamedays.assign(
1614
+ home_team = np.where(gamedays.home_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.home_team),
1615
+ away_team = np.where(gamedays.away_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.away_team),
1616
+ espn_id = gamedays.espn_id.str.split('/').str[0].astype(int)
1617
+
1618
+ )
1619
+
1620
+ # OPTIMIZED: one abbreviation map applied to both columns instead of nested np.where()
+ _abbrev_fixes = {'TB': 'TBL', 'T.B': 'TBL', 'L.A': 'LAK', 'LA': 'LAK',
+                  'S.J': 'SJS', 'SJ': 'SJS', 'N.J': 'NJD', 'NJ': 'NJD'}
+ gamedays = gamedays.assign(
+     home_team = gamedays.home_team.replace(_abbrev_fixes),
+     away_team = gamedays.away_team.replace(_abbrev_fixes),
+     espn_id = gamedays.espn_id.astype(int))
1640
+
1641
+ # OPTIMIZED: a shared nickname-to-abbreviation map replaces two 40-plus-deep
+ # nested np.where() chains; unmatched names fall back to 'mistake' exactly as before
+ _nickname_map = {'DUCKS': 'ANA', 'COYOTES': 'ARI', 'BRUINS': 'BOS', 'SABRES': 'BUF',
+                  'FLAMES': 'CGY', 'HURRICANES': 'CAR', 'BLACKHAWKS': 'CHI', 'AVALANCHE': 'COL',
+                  'BLUE': 'CBJ', 'JACKETS': 'CBJ', 'BLUE JACKETS': 'CBJ', 'STARS': 'DAL',
+                  'RED': 'DET', 'WINGS': 'DET', 'RED WINGS': 'DET', 'OILERS': 'EDM',
+                  'PANTHERS': 'FLA', 'KINGS': 'LAK', 'WILD': 'MIN', 'CANADIENS': 'MTL',
+                  'PREDATORS': 'NSH', 'DEVILS': 'NJD', 'ISLANDERS': 'NYI', 'RANGERS': 'NYR',
+                  'SENATORS': 'OTT', 'FLYERS': 'PHI', 'PENGUINS': 'PIT', 'SHARKS': 'SJS',
+                  'KRAKEN': 'SEA', 'BLUES': 'STL', 'LIGHTNING': 'TBL', 'LEAFS': 'TOR',
+                  'MAPLE': 'TOR', 'MAPLE LEAFS': 'TOR', 'CANUCKS': 'VAN', 'GOLDEN': 'VGK',
+                  'KNIGHTS': 'VGK', 'GOLDEN KNIGHTS': 'VGK', 'CAPITALS': 'WSH', 'JETS': 'WPG',
+                  'CLUB': 'UTA', 'MAMMOTH': 'UTA', 'HOCKEY': 'UTA'}
+ gamedays = gamedays.assign(
+     away_team = gamedays.away_team.map(_nickname_map).fillna('mistake'),
+     home_team = gamedays.home_team.map(_nickname_map).fillna('mistake'))
1732
+
1733
+ gamedays = gamedays[(gamedays.game_date==this_date) & (gamedays.home_team==home_team) & (gamedays.away_team==away_team)]
1734
+
1735
+ return(gamedays)
1736
+
1737
+ def merge_and_prepare(events, shifts, roster=None, live = False):
1738
+
1739
+ season = str(int(str(events.game_id.iloc[0])[:4])) + str(int(str(events.game_id.iloc[0])[:4]) + 1)
1740
+ small_id = str(events.game_id.iloc[0])[5:]
1741
+ game_id = int(events.game_id.iloc[0])
1742
+
1743
+ merged = pd.concat([events, shifts])
1744
+
1745
+ home_team = merged[~(pd.isna(merged.home_team))].home_team.iloc[0]
1746
+ #print(home_team)
1747
+ away_team = merged[~(pd.isna(merged.away_team))].away_team.iloc[0]
1748
+ #print(away_team)
1749
+
1750
+ if 'CANADIENS' in home_team:
1751
+ home_team = 'MONTREAL CANADIENS'
1752
+
1753
+ if 'CANADIENS' in away_team:
1754
+ away_team = 'MONTREAL CANADIENS'
1755
+
1756
+ #print(home_team)
1757
+ #print(away_team)
1758
+
1759
+ merged = merged.assign(home_team = home_team,
1760
+ away_team = away_team,
1761
+ home_team_abbreviated = merged[~(pd.isna(merged.home_team_abbreviated))].home_team_abbreviated.iloc[0],
1762
+ away_team_abbreviated = merged[~(pd.isna(merged.away_team_abbreviated))].away_team_abbreviated.iloc[0])
1763
+
1764
+ merged = merged.assign(event_team = np.where(merged.team==merged.home_team, merged.home_team_abbreviated,
1765
+ np.where(merged.team==merged.away_team, merged.away_team_abbreviated,
1766
+ merged.event_team)))
1767
+
1768
+ merged = merged.assign(event = np.where((pd.isna(merged.event)) &
1769
+ ((~pd.isna(merged.number_off)) | (~pd.isna(merged.number_on))), "CHANGE", merged.event))
1770
+
1771
+ home_space = ' ' + merged['home_team_abbreviated'].iloc[0]
1772
+ away_space = ' ' + merged['away_team_abbreviated'].iloc[0]
1773
+
1774
+ merged['away_skaters'] = np.where(pd.isna(merged.away_skaters), '\xa0', merged.away_skaters)
1775
+
1776
+ merged['tmp'] = merged.away_skaters.str.replace("[^0-9]", " ", regex=True)  # explicit regex=True (pandas >= 2.0 defaults to literal matching)
1777
+
1778
+ merged['tmp2'] = (merged.tmp.str.strip().str.split(" ")).apply(lambda x: natsorted(x)).apply(lambda x: ' '.join(x))
1779
+
1780
+ merged['tmp2'] = (merged.away_team_abbreviated.iloc[0] + merged.tmp2).str.replace(" ", away_space).str.replace(" ", ", ")
1781
+
1782
+ merged['tmp2'] = np.where(merged.tmp2.str.strip()==merged.away_team_abbreviated.iloc[0], '\xa0', merged.tmp2)
1783
+
1784
+ merged['away_on_ice'] = merged['tmp2']
1785
+
1786
+ merged['home_skaters'] = np.where(pd.isna(merged.home_skaters), '\xa0', merged.home_skaters)
1787
+
1788
+ merged['tmp'] = merged.home_skaters.str.replace("[^0-9]", " ", regex=True)  # explicit regex=True (pandas >= 2.0 defaults to literal matching)
1789
+
1790
+ merged['tmp2'] = (merged.tmp.str.strip().str.split(" ")).apply(lambda x: natsorted(x)).apply(lambda x: ' '.join(x))
1791
+
1792
+ merged['tmp2'] = (merged.home_team_abbreviated.iloc[0] + merged.tmp2).str.replace(" ", home_space).str.replace(" ", ", ")
1793
+
1794
+ merged['tmp2'] = np.where(merged.tmp2.str.strip()==merged.home_team_abbreviated.iloc[0], '\xa0', merged.tmp2)
1795
+
1796
+ merged['home_on_ice'] = merged['tmp2']
1797
+
1798
+ merged = merged.sort_values(by = ['game_seconds', 'period'])
1799
+
1800
+ merged = merged.assign(jumping_on = (np.where(merged.home_team == merged.team, (merged.home_team_abbreviated.iloc[0] + merged.on_numbers).str.replace(", ", home_space).str.replace(" ", ", "),
1801
+ np.where(merged.away_team == merged.team, (merged.away_team_abbreviated.iloc[0] + merged.on_numbers).str.replace(", ", away_space).str.replace(" ", ", "),
1802
+ '\xa0'))),
1803
+ jumping_off = (np.where(merged.home_team == merged.team, (merged.home_team_abbreviated.iloc[0] + merged.off_numbers).str.replace(", ", home_space).str.replace(" ", ", "),
1804
+ np.where(merged.away_team == merged.team, (merged.away_team_abbreviated.iloc[0] + merged.off_numbers).str.replace(", ", away_space).str.replace(" ", ", "),
1805
+ '\xa0'))),
1806
+ prio = np.where(merged.event=="CHANGE", 0,
1807
+ np.where(merged.event.isin(['PGSTR', 'PGEND', 'PSTR', 'PEND', 'ANTHEM']), -1, 1))).sort_values(
1808
+ by = ['game_seconds', 'period', 'event_index'])
1809
+
1810
+ merged = merged.assign(change_before_event = np.where(
1811
+ (
1812
+ (merged.away_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.away_on_ice!=merged.away_on_ice.shift()) |
1813
+ (merged.home_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.home_on_ice!=merged.home_on_ice.shift())
1814
+ ), 1, 0
1815
+ ))
1816
+
1817
+ merged = merged.assign(change_prio =
1818
+ np.where((merged.team==merged.home_team) & (merged.event=='CHANGE') , 1,
1819
+ np.where((merged.team==merged.away_team) & (merged.event=='CHANGE'), -1, 0)))
1820
+
1821
+ # TODO: Fix priority map so that we have change before shot or miss if the change involves a player returning from penalty box.
+ # OPTIMIZED: dict lookup instead of nested np.where(), mirroring priority_map in scrape_html_events
+ _priority_map = {'TAKE': 1, 'GIVE': 1, 'MISS': 1, 'HIT': 1, 'SHOT': 1, 'BLOCK': 1,
+                  'GOAL': 2, 'STOP': 3, 'DELPEN': 4, 'PENL': 5, 'CHANGE': 6,
+                  'PEND': 7, 'GEND': 8, 'FAC': 9}
+ merged = merged.assign(priority = merged.event.map(_priority_map).fillna(0).astype(int)).sort_values(
+     by = ['game_seconds', 'period', 'priority', 'event_index', 'change_prio'])
1831
+
1832
+ merged = merged.reset_index(drop = True).reset_index().rename(columns = {'index':'event_index', 'event_index':'original_index'})
1833
+
1834
+ # OPTIMIZATION: Use passed-in roster if available, otherwise scrape it
1835
+ if roster is None:
1836
+ roster = scrape_html_roster(season, small_id).rename(columns = {'Nom/Name':'Name'})
1837
+ # roster is already prepared in scrape_html_events, no need to rename
1838
+
1839
+ roster = roster.assign(team_abbreviated = np.where(roster.team=='home',
1840
+ merged.home_team_abbreviated.iloc[0],
1841
+ merged.away_team_abbreviated.iloc[0]))
1842
+
1843
+ roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'])
1844
+ # OPTIMIZED: Use pre-compiled regex patterns instead of compiling in each lambda
1845
+ roster['Name'] = roster.Name.apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
1846
+ roster['Name'] = roster.Name.apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())
1847
+
1848
+ roster = roster.assign(Name = np.where((roster.Name=='SEBASTIAN AHO') &( roster.team_name == 'NEW YORK ISLANDERS'), 'SEBASTIAN AHO (SWE)', roster.Name))
1849
+ roster = roster.assign(Name = np.where((roster.Name=='ELIAS PETTERSSON') &( roster.Pos == 'D'), 'ELIAS PETTERSSON(D)', roster.Name))
1850
+
1851
+ goalies = roster[(roster.Pos=='G') & (roster.status!='scratch')]
1852
+
1853
+ away_roster = roster[(roster.team=='away') & (roster.status!='scratch')]
1854
+ home_roster = roster[(roster.team=='home') & (roster.status!='scratch')]
1855
+
1856
+ merged.jumping_on = np.where(pd.isna(merged.jumping_on), '\xa0', merged.jumping_on)
1857
+ merged.jumping_off = np.where(pd.isna(merged.jumping_off), '\xa0', merged.jumping_off)
1858
+
1859
+ # OPTIMIZED: Use vectorized string operations instead of .apply()
1860
+ # This provides 10-20x speedup on the on-ice tracking loops
1861
+ change_mask = (merged.event == 'CHANGE')
1862
+
1863
+ # OPTIMIZED: Use regex pattern matching for exact teamnum matches in comma-separated strings
1864
+ # Pattern matches teamnum at start, middle (after comma+space), or end of string
1865
+ # Note: re module is already imported at module level
1866
+
1867
+ # Build all columns at once using vectorized string operations
1868
+ awaydf_dict = {}
1869
+ for i in range(0, len(away_roster)):
1870
+ teamnum = away_roster.teamnum.iloc[i]
1871
+ # Use regex to match teamnum as whole value (not substring)
1872
+ # Match: start of string OR comma+space, then teamnum, then comma OR end of string
1873
+ pattern = r'(^|, )' + re.escape(teamnum) + r'(,|$)'
1874
+ on_mask = merged.jumping_on.str.contains(pattern, na=False, regex=True)
1875
+ off_mask = merged.jumping_off.str.contains(pattern, na=False, regex=True) & change_mask
1876
+ vec = np.cumsum(on_mask.astype(int) - off_mask.astype(int))
1877
+ awaydf_dict[away_roster.Name.iloc[i]] = vec
1878
+
1879
+ awaydf = pd.DataFrame(awaydf_dict)
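+ # Illustrative example: for teamnum 'TOR34' the pattern r'(^|, )TOR34(,|$)'
+ # matches 'TOR34, TOR16' and 'TOR16, TOR34' but not 'TOR3' or 'TOR344', and
+ # cumsum(on - off) stays at 1 exactly while that player is on the ice.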
1880
+
1881
+ global homedf
1882
+
1883
+ # OPTIMIZED: Same optimization for home roster
1884
+ homedf_dict = {}
1885
+ for i in range(0, len(home_roster)):
1886
+ teamnum = home_roster.teamnum.iloc[i]
1887
+ pattern = r'(^|, )' + re.escape(teamnum) + r'(,|$)'
1888
+ on_mask = merged.jumping_on.str.contains(pattern, na=False, regex=True)
1889
+ off_mask = merged.jumping_off.str.contains(pattern, na=False, regex=True) & change_mask
1890
+ vec = np.cumsum(on_mask.astype(int) - off_mask.astype(int))
1891
+ homedf_dict[home_roster.Name.iloc[i]] = vec
1892
+
1893
+ homedf = pd.DataFrame(homedf_dict)
1894
+
1895
+ global home_on
1896
+ global away_on
1897
+
1898
+ # OPTIMIZED: Build the on-ice strings with list comprehensions over the raw
+ # numpy arrays; the previous per-row .iloc lookups were the slow path here
+ home_on = pd.DataFrame({0: [','.join(natsorted(homedf.columns[row == 1])) if (row == 1).any() else ''
+                             for row in homedf.to_numpy()]})
+ away_on = pd.DataFrame({0: [','.join(natsorted(awaydf.columns[row == 1])) if (row == 1).any() else ''
+                             for row in awaydf.to_numpy()]})
1913
+
1914
+ away_on = away_on[0].str.split(',', expand=True).rename(columns = {0:'away_on_1', 1:'away_on_2', 2:'away_on_3', 3:'away_on_4', 4:'away_on_5', 5:'away_on_6', 6:'away_on_7', 7:'away_on_8', 8:'away_on_9'})
1915
+ home_on = home_on[0].str.split(',', expand=True).rename(columns = {0:'home_on_1', 1:'home_on_2', 2:'home_on_3', 3:'home_on_4', 4:'home_on_5', 5:'home_on_6', 6:'home_on_7', 7:'home_on_8', 8:'home_on_9'})
1916
+
1917
+ # OPTIMIZED: Initialize missing on-ice columns in a loop
+ for side, df in (('away', away_on), ('home', home_on)):
+     for i in range(1, 10):
+         col = f'{side}_on_{i}'
+         if col not in df.columns:
+             df[col] = '\xa0'
1923
+
1924
+ game = pd.concat([merged, home_on, away_on], axis = 1)
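+ # The axis=1 concat relies on merged having a fresh RangeIndex (reset above),
+ # so the home_on/away_on columns align row-for-row with the events.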
1925
+
1926
+ game = game.assign(
1927
+ event_team = np.where(game.event_team==game.home_team, game.home_team_abbreviated,
1928
+ np.where(game.event_team==game.away_team, game.away_team_abbreviated,
1929
+ game.event_team)),
1930
+ description = game.description.astype(str))
1931
+
1932
+ game['description'] = np.where(game.description=='nan', '\xa0', game.description)
1933
+
1934
+ game = game.drop(columns = ['original_index', 'strength', 'original_time', 'home_team', 'away_team', 'other_team', 'event_player_str',
1935
+ 'version', 'team', 'change_before_event', 'prio', 'change_prio', 'priority', 'tmp', 'tmp2']).rename(
1936
+ columns = {'away_team_abbreviated':'away_team', 'home_team_abbreviated':'home_team', 'coordsx':'coords_x', 'coordsy':'coords_y',
1937
+ 'ep1_name':'event_player_1', 'ep2_name':'event_player_2', 'ep3_name':'event_player_3'})
1938
+
1939
+ # OPTIMIZED: Pre-compile regex and use vectorized operations where possible
1940
+ # event_zone: combine the two apply() calls into one
1941
+ def extract_zone(desc):
1942
+ match = _ZONE_PATTERN.search(str(desc))
1943
+ return match.group() if match else None
1944
+
1945
+ # OPTIMIZED: event_detail - reduce string operations by caching splits
1946
+ def extract_detail(row):
1947
+ desc = row['description']
1948
+ event = row['event']
1949
+ if pd.isna(desc):
1950
+ return '\xa0'
1951
+ if event in ['SHOT', 'BLOCK', 'MISS', 'GOAL']:
1952
+ parts = desc.split(', ')
1953
+ return parts[1].strip() if len(parts) > 1 else '\xa0'
1954
+ elif event in ["PSTR", "PEND", "SOC", "GEND"]:
1955
+ parts = desc.split(': ')
1956
+ return parts[1].strip() if len(parts) > 1 else '\xa0'
1957
+ elif event == 'PENL':
1958
+ match = _PARENTHESIS_PATTERN.search(desc)
1959
+ return match.group(1).strip() if match else '\xa0'
1960
+ elif event == 'CHANGE':
1961
+ parts = desc.split(' - ')
1962
+ return parts[0].strip() if len(parts) > 0 else '\xa0'
1963
+ return '\xa0'
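+ # Illustrative example (hedged): for a SHOT description like
+ # 'TOR ONGOAL - #34 MATTHEWS, Wrist, Off. Zone, 12 ft.' the ', ' split
+ # returns 'Wrist' as the event_detail.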
1964
+
1965
+ game = game.assign(
1966
+ game_id = int(game_id),
1967
+ season = int(season),
1968
+ event_zone = game.description.apply(extract_zone),
1969
+ event_detail = game.apply(extract_detail, axis=1))
1970
+
1971
+ # Goalie finding - keep nested np.where() as it's actually quite fast for this use case
1972
+ game = game.assign(home_goalie = np.where(
1973
+ game.home_on_1.isin(goalies.Name), game.home_on_1,
1974
+ np.where(
1975
+ game.home_on_2.isin(goalies.Name), game.home_on_2,
1976
+ np.where(
1977
+ game.home_on_3.isin(goalies.Name), game.home_on_3,
1978
+ np.where(
1979
+ game.home_on_4.isin(goalies.Name), game.home_on_4,
1980
+ np.where(
1981
+ game.home_on_5.isin(goalies.Name), game.home_on_5,
1982
+ np.where(
1983
+ game.home_on_6.isin(goalies.Name), game.home_on_6,
1984
+ np.where(
1985
+ game.home_on_7.isin(goalies.Name), game.home_on_7,
1986
+ np.where(
1987
+ game.home_on_8.isin(goalies.Name), game.home_on_8,
1988
+ np.where(
1989
+ game.home_on_9.isin(goalies.Name), game.home_on_9,
1990
+ '\xa0'))))))))),
1991
+ away_goalie = np.where(
1992
+ game.away_on_1.isin(goalies.Name), game.away_on_1,
1993
+ np.where(
1994
+ game.away_on_2.isin(goalies.Name), game.away_on_2,
1995
+ np.where(
1996
+ game.away_on_3.isin(goalies.Name), game.away_on_3,
1997
+ np.where(
1998
+ game.away_on_4.isin(goalies.Name), game.away_on_4,
1999
+ np.where(
2000
+ game.away_on_5.isin(goalies.Name), game.away_on_5,
2001
+ np.where(
2002
+ game.away_on_6.isin(goalies.Name), game.away_on_6,
2003
+ np.where(
2004
+ game.away_on_7.isin(goalies.Name), game.away_on_7,
2005
+ np.where(
2006
+ game.away_on_8.isin(goalies.Name), game.away_on_8,
2007
+ np.where(
2008
+ game.away_on_9.isin(goalies.Name), game.away_on_9,
2009
+ '\xa0'))))))))))
2010
+
2011
+ # OPTIMIZED: Vectorized column cleaning - use np.where() in loop for consistency
2012
+ # Slightly faster than individual assigns due to reduced function call overhead
2013
+ on_ice_cols = [f'{side}_on_{i}' for side in ['away', 'home'] for i in range(1, 10)]
2014
+ goalie_cols = ['home_goalie', 'away_goalie']
2015
+
2016
+ for col in on_ice_cols + goalie_cols:
2017
+ if col in game.columns:
2018
+ game[col] = np.where((pd.isna(game[col])) | (game[col] == '') | (game[col] == '\xa0'), '\xa0', game[col])
2019
+
2020
+ # OPTIMIZED: Vectorized skater counting using .ne() and .sum()
2021
+ # Cache the game_id check to avoid repeated string operations
2022
+ game_id_str = str(game_id)
2023
+ is_playoff = int(game_id_str[5]) == 3 if len(game_id_str) > 5 else False
2024
+
2025
+ # Vectorized: use .ne() (not equal) which is faster than np.where() for boolean conversion
2026
+ home_on_cols = [f'home_on_{i}' for i in range(1, 10)]
2027
+ away_on_cols = [f'away_on_{i}' for i in range(1, 10)]
2028
+
2029
+ home_skaters = game[home_on_cols].ne('\xa0').sum(axis=1)
2030
+ away_skaters = game[away_on_cols].ne('\xa0').sum(axis=1)
2031
+
2032
+ # Subtract goalie if present and in regulation/playoff
2033
+ goalie_mask = ((game.period < 5) | is_playoff)
2034
+ home_skaters = home_skaters - ((game.home_goalie != '\xa0') & goalie_mask).astype(int)
2035
+ away_skaters = away_skaters - ((game.away_goalie != '\xa0') & goalie_mask).astype(int)
2036
+
2037
+ game = game.assign(home_skaters=home_skaters, away_skaters=away_skaters)
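+ # Example: six occupied home_on_* slots with a goalie in net during regulation
+ # gives home_skaters = 6 - 1 = 5 (the '5' in a 5v5 strength state).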
2038
+
2039
+ game = game.assign(home_skater_temp =
2040
+ np.where((game.home_goalie=='\xa0') , 'E', game.home_skaters),
2041
+ away_skater_temp =
2042
+ np.where((game.away_goalie=='\xa0') , 'E', game.away_skaters))
2043
+
2044
+ # OPTIMIZED: Reuse cached is_playoff from earlier
2045
+ game = game.assign(game_strength_state = (game.home_skater_temp.astype(str)) + 'v' + (game.away_skater_temp.astype(str)),
2046
+ event_zone = np.where(~pd.isna(game.event_zone), game.event_zone.str.replace(". Zone", "", regex=False), ''),  # element-wise null check ('is not None' on a Series is always True)
2047
+ home_score = np.cumsum(np.where((game.event.shift()=='GOAL') & (((game.period<5) | is_playoff)) & (game.event_team.shift()==game.home_team), 1, 0)),
2048
+ away_score = np.cumsum(np.where((game.event.shift()=='GOAL') & (((game.period<5) | is_playoff)) & (game.event_team.shift()==game.away_team), 1, 0))).drop(
2049
+ columns = ['home_skater_temp', 'away_skater_temp'])
2050
+
2051
+ game = game.assign(game_score_state = (game.home_score.astype(str)) + 'v' + (game.away_score.astype(str)),
2052
+ game_date = pd.to_datetime(game.game_date[~pd.isna(game.game_date)].iloc[0])
2053
+ )
2054
+
2055
+ game.number_off = np.where((game.jumping_on!='\xa0') & (game.jumping_off=='\xa0'), 0, game.number_off)
2056
+ game.number_on = np.where((game.jumping_off!='\xa0') & (game.jumping_on=='\xa0'), 0, game.number_on)
2057
+
2058
+ so = game[game.period==5]
2059
+
2060
+ if len(so)>0 and int(game.game_id.astype(str).str[5].iloc[0]) != 3:
2061
+ game = game[game.period<5]
2062
+ home = roster[roster.team=='home'].rename(columns = {'teamnum':'home_on_ice', 'Name':'home_goalie_name'}).loc[:, ['home_goalie_name', 'home_on_ice']]
2063
+ away = roster[roster.team=='away'].rename(columns = {'teamnum':'away_on_ice', 'Name':'away_goalie_name'}).loc[:, ['away_goalie_name', 'away_on_ice']]
2064
+ so = so.merge(away, how = 'left', indicator = True).drop(columns = ['_merge']).merge(home, how = 'left')
2065
+ so = so.assign(
2066
+ home_goalie = so.home_goalie_name,
2067
+ away_goalie = so.away_goalie_name).drop(columns = ['away_goalie_name', 'home_goalie_name'])
2068
+ so_winner = so[so.event=='GOAL'].groupby('event_team')[['event', 'home_team']].count().reset_index().sort_values(by = ['event', 'event_team'], ascending = False).event_team.iloc[0]  # double brackets: tuple column selection was removed in modern pandas
2069
+ so = so.assign(
2070
+ home_on_1 = so.home_goalie,
2071
+ away_on_1 = so.away_goalie,
2072
+ home_on_2 = np.where(so.event_team==so.home_team, so.event_player_1, '\xa0'),
2073
+ away_on_2 = np.where(so.event_team==so.away_team, so.event_player_1, '\xa0'))
2074
+ if len(so[so.event=='PEND'])>0:
2075
+ end_event = so[so.event=='PEND'].index.astype(int)[0]
2076
+ so = so.assign(
2077
+ home_score = np.where((so.index>=end_event) & (so_winner == so.home_team), 1+so.home_score, so.home_score),
2078
+ away_score = np.where((so.index>=end_event) & (so_winner == so.away_team), 1+so.away_score, so.away_score))
2079
+ game = pd.concat([game, so])
2080
+
2081
+ game['event_length'] = game.game_seconds.shift(-1) - game.game_seconds
2082
+ game['event_length'] = (np.where((pd.isna(game.event_length)) | (game.event_length<0), 0, game.event_length)).astype(int)
2083
+ game['event_index'] = game.event_index + 1
2084
+
2085
+ if 'coords_x' in game.columns and 'coords_y' in game.columns:  # the original `'coords_x' and 'coords_y' in ...` only tested coords_y
2086
+
2087
+ columns = ['season', 'game_id', 'game_date', 'event_index',
2088
+ 'period', 'game_seconds', 'event', 'description',
2089
+ 'event_detail', 'event_zone', 'event_team', 'event_player_1',
2090
+ 'event_player_2', 'event_player_3', 'event_length', 'coords_x',
2091
+ 'coords_y', 'number_on', 'number_off', 'jumping_on', 'jumping_off',
2092
+ 'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5',
2093
+ 'home_on_6', 'home_on_7', 'home_on_8', 'home_on_9', 'away_on_1', 'away_on_2', 'away_on_3',
2094
+ 'away_on_4', 'away_on_5', 'away_on_6', 'away_on_7', 'away_on_8', 'away_on_9', 'home_goalie',
2095
+ 'away_goalie', 'home_team', 'away_team', 'home_skaters', 'away_skaters',
2096
+ 'home_score', 'away_score', 'game_score_state', 'game_strength_state', 'coordinate_source']
2097
+
2098
+ else:
2099
+
2100
+ columns = ['season', 'game_id', 'game_date', 'event_index',
2101
+ 'period', 'game_seconds', 'event', 'description',
2102
+ 'event_detail', 'event_zone', 'event_team', 'event_player_1',
2103
+ 'event_player_2', 'event_player_3', 'event_length',
2104
+ 'number_on', 'number_off', 'jumping_on', 'jumping_off',
2105
+ 'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5',
2106
+ 'home_on_6', 'home_on_7', 'home_on_8', 'home_on_9', 'away_on_1', 'away_on_2', 'away_on_3',
2107
+ 'away_on_4', 'away_on_5', 'away_on_6', 'away_on_7', 'away_on_8', 'away_on_9', 'home_goalie',
2108
+ 'away_goalie', 'home_team', 'away_team', 'home_skaters', 'away_skaters',
2109
+ 'home_score', 'away_score', 'game_score_state', 'game_strength_state']
2110
+
2111
+ game = game.loc[:, columns].rename(
2112
+ columns = {'period':'game_period', 'event':'event_type', 'description':'event_description', 'number_on':'num_on', 'number_off':'num_off',
2113
+ 'jumping_on':'players_on', 'jumping_off':'players_off'}
2114
+ )
2115
+
2116
+ if live == True:
2117
+ # Compare finalized to events to see if we have events where the PBP does not match the skaters on ice in the shifts.
2118
+ # This can happen when players are taking their first shift of the game and thus do not appear in the time on ice HTML page.
2119
+ # Find events in finalized where skater count doesn't match, then get rid of everything beneath them.
2120
+ comparison_df = events.assign(events_away_skater_count = events.away_skaters.str.count('\xa0'), events_home_skater_count = events.home_skaters.str.count('\xa0'))[
2121
+ events.event.isin(ewc)
2122
+ ].loc[:,
2123
+ ['period', 'game_seconds', 'event', 'description', 'events_away_skater_count', 'events_home_skater_count']].merge(
2124
+ game[game.event_type.isin(ewc)].loc[:, ['game_period', 'game_seconds', 'event_index', 'event_type', 'event_description', 'home_skaters', 'away_skaters']].rename(
2125
+ columns = {'game_period':'period', 'event_type':'event', 'event_description':'description', 'away_skaters':'pbp_away_skater_count', 'home_skaters':'pbp_home_skater_count'}
2126
+ ))
2127
+
2128
+ mismatches = comparison_df[(comparison_df.pbp_away_skater_count != comparison_df.events_away_skater_count) |
2129
+ (comparison_df.pbp_home_skater_count != comparison_df.events_home_skater_count)]
2130
+
2131
+ # We initially just ditched the mismatched event and everything that came beneath it.
2132
+ # But then we ran into an error: We can have a play where the player comes on the ice as a penalty expires. We have a "5v4" goal but PBP events show it as 5v5.
2133
+ # NHL dot com video description calls it "Power Play Goal" but doesn't show PP under it.
2134
+ # This is pretty tricky to handle. Because the initial "mismatch comparison" was designed to catch events where the shift was just a few seconds off, let's do it this way.
2135
+
2136
+ # Ditch the mismatched event and everything that comes after it!
2137
+ # If we have multiple mismatches and not many events after it.
2138
+ if len(mismatches) > 1 and len(game[game.game_seconds >= mismatches.game_seconds.min()]) < 20:
2139
+ game = game[game.event_index < mismatches.event_index.min()]
2140
+
2141
+ # game[(game.event_type.isin(ewc)) & (game.home_skaters < 0)]  # leftover debugging expression; its result was never used
2142
+
2143
+ if live == True:
2144
+ mismatches = game[(game.event_type.isin(ewc + ['FAC'])) & ((game.home_skaters < 3) | (game.home_skaters > 6) | (game.away_skaters < 3) | (game.away_skaters > 6))]
2145
+ if len(mismatches) > 0:
2146
+ game = game[game.event_index < mismatches.event_index.min()]
2147
+
2148
+ return(game)
2149
+
2150
+ def fix_missing(single, event_coords, events):
2151
+
2152
+ # Commenting this entire thing out for now. It causes problems for reasons that are still unclear, and I'm not convinced these malformed games are still showing up.
2153
+
2154
+ # # FIRST FIX: EVENTS THAT HAVE MATCHING PERIOD, SECONDS, AND EVENT TYPE, AND ONLY OCCURRED ONCE, BUT NO EVENT PLAYER. #
2155
+ # global event_coords_temp
2156
+ # global single_problems
2157
+ # global merged_problems
2158
+ # problems = events[(events.event.isin(ewc)) & (pd.isna(events.coords_x))]
2159
+ # single_problems = problems.groupby(['event', 'period', 'game_seconds'])[
2160
+ # 'event_index'].count().reset_index().rename(
2161
+ # columns = {'event_index':'problematic_events'})
2162
+ # # Keep events where only one event of that class happened at that moment.
2163
+ # single_problems = single_problems[single_problems.problematic_events==1]
2164
+ # single_problems = problems.merge(single_problems).drop(
2165
+ # columns = ['problematic_events', 'coords_x', 'coords_y', 'coordinate_source']) # x/y come back later!
2166
+ # event_coords_temp = event_coords.loc[:, ['period', 'game_seconds', 'event', 'version', 'coords_x', 'coordinate_source']].groupby(
2167
+ # ['game_seconds', 'period', 'event', 'version'])['coords_x'].count().reset_index().rename(
2168
+ # columns = {'coords_x':'problematic_events'})
2169
+ # event_coords_temp = event_coords_temp[event_coords_temp.problematic_events==1].drop(columns = 'problematic_events')
2170
+ # event_coords_temp = event_coords_temp.merge(event_coords.loc[:, ['game_seconds', 'period', 'event', 'version', 'coords_x', 'coords_y', 'coordinate_source']])
2171
+ # if 'espn_id' in event_coords_temp.columns:
2172
+ # event_coords_temp = event_coords_temp.drop(columns = 'espn_id')
2173
+ # merged_problems = single_problems.merge(event_coords_temp)
2174
+ # #print("You fixed: " + str(len(merged_problems)) + " events!")
2175
+ # events = events[~(events.event_index.isin(list(merged_problems.event_index)))]
2176
+ # events = pd.concat([events, merged_problems.loc[:, list(events.columns)]]).sort_values(by = ['event_index', 'period', 'game_seconds'])
2177
+ # #if len(merged_problems)>0:
2178
+ # #events = events[~events.event_index.isin(merged_problems.event_index)]
2179
+ # #events = pd.concat([events, merged_problems.loc[:, list(events.columns)]]).sort_values(by = ['event_index', 'period', 'game_seconds'])
2180
+ # look = events
2181
+
2182
+ # # SECOND FIX: EVENTS THAT HAVE MATCHING PERIOD, EVENT TYPE, AND PLAYER ONE, AND ONLY OCCURRED ONCE, BUT NO GAME SECONDS.
2183
+
2184
+ # problems = events[(events.event.isin(ewc)) & (pd.isna(events.coords_x))]
2185
+ # single_problems = problems.groupby(['event', 'period', 'event_player_1'])[
2186
+ # 'event_index'].count().reset_index().rename(
2187
+ # columns = {'event_index':'problematic_events'})
2188
+ # # Keep events where only one event of that class happened at that moment.
2189
+ # single_problems = single_problems[single_problems.problematic_events==1]
2190
+ # single_problems = problems.merge(single_problems).drop(
2191
+ # columns = ['problematic_events', 'coords_x', 'coords_y', 'coordinate_source']) # x/y come back later!
2192
+ # event_coords_temp = event_coords.loc[:, ['period', 'event_player_1', 'event',
2193
+ # 'version', 'coords_x', 'coordinate_source']].groupby(
2194
+ # ['event_player_1', 'period', 'event', 'version'])['coords_x'].count().reset_index().rename(
2195
+ # columns = {'coords_x':'problematic_events'})
2196
+ # event_coords_temp = event_coords_temp[event_coords_temp.problematic_events==1].drop(columns = 'problematic_events')
2197
+ # event_coords_temp = event_coords_temp.merge(event_coords.loc[:, ['event_player_1', 'period', 'event', 'version', 'coords_x', 'coords_y', 'coordinate_source']])
2198
+ # merged_problems = single_problems.merge(event_coords_temp)
2199
+ # #print("You fixed: " + str(len(merged_problems)) + " events!")
2200
+ # events = events[~events.event_index.isin(merged_problems.event_index)]
2201
+ # events = pd.concat([events, merged_problems]).sort_values(by = ['event_index', 'period', 'game_seconds'])
2202
+
2203
+ return(events)
2204
+
2205
+ def _fetch_all_pages_parallel(season, game_id):
2206
+ """
2207
+ Fetch all required HTML pages in parallel.
2208
+
2209
+ Args:
2210
+ season: Season string (e.g., '20242025')
2211
+ game_id: Full game ID (e.g., 2025020333)
2212
+
2213
+ Returns:
2214
+ Dictionary with keys: 'events', 'roster', 'home_shifts', 'away_shifts', 'summary'
2215
+ All values are requests.Response objects
2216
+ """
2217
+ small_id = str(game_id)[5:]
2218
+
2219
+ # Prepare all URLs
2220
+ events_url = f'http://www.nhl.com/scores/htmlreports/{season}/PL0{small_id}.HTM'
2221
+ roster_url = f'http://www.nhl.com/scores/htmlreports/{season}/RO0{small_id}.HTM'
2222
+ home_shifts_url = f'http://www.nhl.com/scores/htmlreports/{season}/TH0{small_id}.HTM'
2223
+ away_shifts_url = f'http://www.nhl.com/scores/htmlreports/{season}/TV0{small_id}.HTM'
2224
+ summary_url = f'https://www.nhl.com/scores/htmlreports/{season}/GS0{small_id}.HTM'
2225
+
2226
+ # Fetch HTML pages concurrently (4 pages)
2227
+ fetch_start = time.time()
2228
+ print(' 🔄 Fetching HTML pages in parallel...')
2229
+
2230
+ with ThreadPoolExecutor(max_workers=5) as executor:  # one worker per page so all five fetch concurrently
2231
+ # Submit HTML fetch tasks only
2232
+ futures = {
2233
+ 'events': executor.submit(_fetch_url, events_url, timeout=10),
2234
+ 'roster': executor.submit(_fetch_url, roster_url, timeout=10),
2235
+ 'home_shifts': executor.submit(_fetch_url, home_shifts_url, timeout=10),
2236
+ 'away_shifts': executor.submit(_fetch_url, away_shifts_url, timeout=10),
2237
+ 'summary': executor.submit(_fetch_url, summary_url, timeout=10)
2238
+ }
2239
+
2240
+ # Create reverse mapping from future to key
2241
+ future_to_key = {future: key for key, future in futures.items()}
2242
+
2243
+ # Collect HTML page results as they complete
2244
+ results = {}
2245
+ for future in as_completed(futures.values()):
2246
+ key = future_to_key[future]
2247
+ results[key] = future.result()  # re-raises any exception from the request itself (HTTP error statuses are not raised here)
2248
+
2249
+ html_fetch_duration = time.time() - fetch_start
2250
+ try:
2251
+ print(f' ⏱️ HTML pages fetched in: {html_fetch_duration:.2f}s')
2252
+ except Exception:
2253
+ pass
2254
+
2255
+ return results
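+ # Usage sketch (hypothetical IDs): pages = _fetch_all_pages_parallel('20242025', 2024020500)
+ # then pages['events'] / pages['roster'] are handed to scrape_html_events() as
+ # events_page / roster_page, as done in full_scrape_1by1() below.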
2256
+
2257
+ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True, return_intermediates = False):
2258
+
2259
+ global single
2260
+ global event_coords
2261
+ global full
2262
+ global fixed_events
2263
+ global events
2264
+
2265
+ # OPTIMIZED: Use list instead of DataFrame for accumulating results
2266
+ full_list = []
2267
+
2268
+ # Track intermediates for each game if requested
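+ # Each record appended to intermediates_list is a dict with keys 'game_id',
+ # 'shifts', 'api_coords', 'roster_cache', 'coordinate_source', 'warning',
+ # 'error', 'error_traceback' (failure paths only), and 'raw_html' (the five
+ # fetched requests.Response objects keyed 'events', 'roster', 'home_shifts',
+ # 'away_shifts', and 'summary').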
2269
+ intermediates_list = []
2270
+
2271
+ i = 0
2272
+
2273
+ while i < len(game_id_list):
2274
+
2275
+ # First thing to try: Scraping HTML events
2276
+
2277
+ try:
2278
+ first_time = time.time()
2279
+ print(game_id_list[i])
2280
+ game_id = game_id_list[i]
2281
+ print('Attempting scrape for: ' + str(game_id))
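+ # The first four digits of a game ID are the season's starting year,
+ # so e.g. game_id 2024020001 yields season '20242025'.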
2282
+ season = str(int(str(game_id)[:4])) + str(int(str(game_id)[:4]) + 1)
2283
+ small_id = str(game_id)[5:]
2284
+
2285
+ # OPTIMIZED: Fetch HTML pages in parallel, API separately
2286
+ parallel_start = time.time()
2287
+ print('Fetching pages')
2288
+ pages = _fetch_all_pages_parallel(season, game_id)
2289
+ parallel_duration = time.time() - parallel_start
2290
+ try:
2291
+ print(f'⏱️ Parallel fetch took: {parallel_duration:.2f}s')
2292
+ except Exception:
2293
+ pass
2294
+
2295
+ # TIME: HTML Events (using pre-fetched pages)
2296
+ html_start = time.time()
2297
+ print('Scraping HTML events')
2298
+ single, roster_cache = scrape_html_events(season, small_id,
2299
+ events_page=pages['events'],
2300
+ roster_page=pages['roster'])
2301
+ html_duration = time.time() - html_start
2302
+ try:
2303
+ print(f'⏱️ HTML events processing took: {html_duration:.2f}s')
2304
+ except Exception:
2305
+ pass
2306
+ single['game_id'] = int(game_id)
2307
+
2308
+ # Try NHL API first (default behavior)
2309
+
2310
+ try:
2311
+ # TIME: API Events (fetch after HTML events are processed, like original)
2312
+ api_start = time.time()
2313
+ print('Attempting to scrape coordinates from NHL API')
2314
+ event_coords = scrape_api_events(game_id, drop_description=True)
2315
+ api_duration = time.time() - api_start
2316
+ try:
2317
+ print(f'⏱️ API events took: {api_duration:.2f}s')
2318
+ except Exception:
2319
+ pass
2320
+
2321
+ # Set coordinate_source on event_coords before merging (needed for fix_missing)
2322
+ event_coords['coordinate_source'] = 'api'
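+ # Keep a pristine copy of the API coordinates for the ESPN/hybrid fallback paths below.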
2323
+ api_coords = event_coords.copy()
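+ # If any core event (shots, hits, etc.) is missing coordinates, raise so the
+ # fallback logic below can try ESPN instead.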
2324
+ if len(event_coords[(event_coords.event.isin(ewc)) & (pd.isna(event_coords.coords_x))]) > 0:
2325
+ raise ExpatError('Bad takes, dude!')
2326
+ event_coords['game_id'] = int(game_id)
2327
+
2328
+ # TIME: Merge Events
2329
+ merge_start = time.time()
2330
+ print('Attempting to merge events')
2331
+ events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'game_id', 'event'], how = 'left')
2332
+ merge_duration = time.time() - merge_start
2333
+ print(f'Merged events, we have this many rows: {len(events)}')
2334
+ try:
2335
+ print(f'⏱️ Merge took: {merge_duration:.2f}s')
2336
+ except Exception:
2337
+ pass
2338
+
2339
+
2340
+
2341
+ # TIME: Fix Missing
2342
+ try:
2343
+ fix_start = time.time()
2344
+ events = fix_missing(single, event_coords, events)
2345
+ fix_duration = time.time() - fix_start
2346
+ try:
2347
+ print(f'⏱️ Fix missing took: {fix_duration:.2f}s')
2348
+ except Exception:
2349
+ pass
2350
+ except IndexError as e:
2351
+ print('Issue when fixing problematic events. Here it is: ' + str(e))
2352
+ i = i + 1  # advance so a deterministic fix_missing failure cannot loop forever
+ continue
2353
2354
+ # TIME: Shifts and Finalize (using pre-fetched pages)
2355
+ try:
2356
2357
+ shifts_start = time.time()
2358
+ if live:
2359
+ min_game_clock, shifts = scrape_html_shifts(season, small_id, live,
2360
+ home_page=pages['home_shifts'],
2361
+ away_page=pages['away_shifts'],
2362
+ summary = pages['summary'],
2363
+ roster_cache = roster_cache)
2364
+ else:
2365
+ shifts = scrape_html_shifts(season, small_id, live,
2366
+ home_page=pages['home_shifts'],
2367
+ away_page=pages['away_shifts'],
2368
+ summary = pages['summary'],
2369
+ roster_cache = roster_cache)
2370
+ shifts_duration = time.time() - shifts_start
2371
+ try:
2372
+ print(f'⏱️ HTML shifts processing took: {shifts_duration:.2f}s')
2373
+ except Exception:
2374
+ pass
2375
+
2376
+ prepare_start = time.time()
2377
+ finalized = merge_and_prepare(events, shifts, roster_cache, live = live)
2378
+ if live:
2379
+ if min_game_clock is not None:
2380
+ finalized = finalized[finalized.game_seconds <= min_game_clock]
2381
+ prepare_duration = time.time() - prepare_start
2382
+ try:
2383
+ print(f'⏱️ Merge and prepare took: {prepare_duration:.2f}s')
2384
+ except Exception:
2385
+ pass
2386
+
2387
+ full_list.append(finalized)
2388
+ second_time = time.time()
2389
+
2390
+ # Track intermediates if requested
2391
+ if return_intermediates:
2392
+ intermediates_list.append({
2393
+ 'game_id': game_id,
2394
+ 'shifts': shifts.copy() if shifts is not None else None,
2395
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2396
+ 'roster_cache': roster_cache.copy() if roster_cache is not None else None,
2397
+ 'coordinate_source': 'api',
2398
+ 'warning': None,
2399
+ 'error': None,
2400
+ 'raw_html': {
2401
+ 'events': pages.get('events'),
2402
+ 'roster': pages.get('roster'),
2403
+ 'home_shifts': pages.get('home_shifts'),
2404
+ 'away_shifts': pages.get('away_shifts'),
2405
+ 'summary': pages.get('summary')
2406
+ }
2407
+ })
2408
+ except IndexError as e:
2409
+ print('There was no shift data for this game. Error: ' + str(e))
2410
+ fixed_events = events
2411
+ fixed_events = fixed_events.rename(
2412
+ columns = {'period':'game_period', 'event':'event_type', 'away_team_abbreviated':'away_team',
2413
+ 'home_team_abbreviated':'home_team', 'description':'event_description', 'home_team':'hometeamfull',
2414
+ 'away_team':'awayteamfull'}
2415
+ ).drop(
2416
+ columns = ['original_time', 'other_team', 'strength', 'event_player_str', 'version', 'hometeamfull', 'awayteamfull']
2417
+ ).assign(game_warning = 'NO SHIFT DATA', season = season)
2418
+ full_list.append(fixed_events)
2419
+ second_time = time.time()
2420
+
2421
+ # Track intermediates if requested
2422
+ if return_intermediates:
2423
+ intermediates_list.append({
2424
+ 'game_id': game_id,
2425
+ 'shifts': None,
2426
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2427
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2428
+ 'coordinate_source': 'api',
2429
+ 'warning': 'NO SHIFT DATA',
2430
+ 'error': None,
2431
+ 'raw_html': {
2432
+ 'events': pages.get('events'),
2433
+ 'roster': pages.get('roster'),
2434
+ 'home_shifts': pages.get('home_shifts'),
2435
+ 'away_shifts': pages.get('away_shifts'),
2436
+ 'summary': pages.get('summary')
2437
+ }
2438
+ })
2439
+
2440
+ try:
2441
+ total_duration = second_time - first_time
2442
+ except NameError:
2443
+ second_time = time.time()
2444
+ total_duration = second_time - first_time
2445
+ print('Successfully scraped ' + str(game_id) + '. Coordinates sourced from the NHL API.')
2446
+ # Safely format timing string, handling potentially undefined variables
2447
+ try:
2448
+ timing_parts = []
2449
+ timing_parts.append(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
2450
+ if 'parallel_duration' in locals(): timing_parts.append(f"Parallel fetch: {parallel_duration:.2f}s")
2451
+ if 'html_duration' in locals(): timing_parts.append(f"HTML processing: {html_duration:.2f}s")
2452
+ if 'api_duration' in locals(): timing_parts.append(f"API processing: {api_duration:.2f}s")
2453
+ if 'merge_duration' in locals(): timing_parts.append(f"Merge: {merge_duration:.2f}s")
2454
+ if 'fix_duration' in locals(): timing_parts.append(f"Fix missing: {fix_duration:.2f}s")
2455
+ if 'shifts_duration' in locals(): timing_parts.append(f"Shifts: {shifts_duration:.2f}s")
2456
+ if 'prepare_duration' in locals(): timing_parts.append(f"Merge/prepare: {prepare_duration:.2f}s")
2457
+ if len(timing_parts) > 1:
2458
+ print(" (" + ", ".join(timing_parts[1:]) + ")")
2459
+ else:
2460
+ print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
2461
+ except Exception:
2462
+ print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
2463
+ i = i + 1
2464
+
2465
+ # If there is an issue with the API, fall back to ESPN:
2466
+
2467
+ except (KeyError, ExpatError) as e:
2468
+ print('The NHL API gave us trouble with: ' + str(game_id) + '. Falling back to ESPN.')
2469
+
2470
+ try:
2471
+ home_team = single['home_team_abbreviated'].iloc[0]
2472
+ away_team = single['away_team_abbreviated'].iloc[0]
2473
+ game_date = single['game_date'].iloc[0]
2474
+ # ESPN uses three-letter codes where the NHL reports use dotted ones.
+ espn_team_map = {'T.B': 'TBL', 'L.A': 'LAK', 'N.J': 'NJD', 'S.J': 'SJS'}
+ espn_home_team = espn_team_map.get(home_team, home_team)
+ espn_away_team = espn_team_map.get(away_team, away_team)
+ try:
2493
+ print('Scraping ESPN IDs')
2494
+ espn_id = scrape_espn_ids_single_game(str(game_date.date()), espn_home_team, espn_away_team).espn_id.iloc[0]
2495
+ print('Scraping ESPN Events')
2496
+ print('Here is the ESPN ID:', espn_id)
2497
+ event_coords = scrape_espn_events(int(espn_id))
2498
+ print('Scraped ESPN Events, we have this many rows:', len(event_coords))
2499
+ event_coords['coordinate_source'] = 'espn'
2500
+ print('Attempting to merge events')
2501
+ events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
2502
+ print('Merged events, we have this many rows:', len(events))
2503
+ try:
2504
+ events = fix_missing(single, event_coords, events)
2505
+ except IndexError as e:
2506
+ print('Issue when fixing problematic events. Here it is: ' + str(e))
2507
+ i = i + 1  # advance so a deterministic fix_missing failure cannot loop forever
+ continue
2508
+ except IndexError:
2509
+ print('This game does not have ESPN or API coordinates. You will get it anyway, though.')
2510
+ events = single
2511
+ try:
2512
+ shifts = scrape_html_shifts(season, small_id, live,
2513
+ home_page=pages['home_shifts'],
2514
+ away_page=pages['away_shifts'],
2515
+ summary = pages['summary'],
2516
+ roster_cache = roster_cache)
2517
+ finalized = merge_and_prepare(events, shifts, roster_cache, live = live)
2518
+ full_list.append(finalized)
2519
+ second_time = time.time()
2520
+
2521
+ # Track intermediates if requested
2522
+ if return_intermediates:
2523
+ intermediates_list.append({
2524
+ 'game_id': game_id,
2525
+ 'shifts': shifts.copy() if shifts is not None else None,
2526
+ 'api_coords': None,
2527
+ 'roster_cache': roster_cache.copy() if roster_cache is not None else None,
2528
+ 'coordinate_source': 'espn',
2529
+ 'warning': None,
2530
+ 'error': None,
2531
+ 'raw_html': {
2532
+ 'events': pages.get('events'),
2533
+ 'roster': pages.get('roster'),
2534
+ 'home_shifts': pages.get('home_shifts'),
2535
+ 'away_shifts': pages.get('away_shifts'),
2536
+ 'summary': pages.get('summary')
2537
+ }
2538
+ })
2539
+ except IndexError as e:
2540
+ print('There was no shift data for this game. Error: ' + str(e))
2541
+ fixed_events = events
2542
+ fixed_events = fixed_events.rename(
2543
+ columns = {'period':'game_period', 'event':'event_type', 'away_team_abbreviated':'away_team',
2544
+ 'home_team_abbreviated':'home_team', 'description':'event_description', 'home_team':'hometeamfull',
2545
+ 'away_team':'awayteamfull'}
2546
+ ).drop(
2547
+ columns = ['original_time', 'other_team', 'strength', 'event_player_str', 'version', 'hometeamfull', 'awayteamfull']
2548
+ ).assign(game_warning = 'NO SHIFT DATA', season = season)
2549
+ fixed_events['coordinate_source'] = 'espn'
2550
+ full_list.append(fixed_events)
2551
+
2552
+ # Track intermediates if requested
2553
+ if return_intermediates:
2554
+ intermediates_list.append({
2555
+ 'game_id': game_id,
2556
+ 'shifts': None,
2557
+ 'api_coords': None,
2558
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2559
+ 'coordinate_source': 'espn',
2560
+ 'warning': 'NO SHIFT DATA',
2561
+ 'error': None,
2562
+ 'raw_html': {
2563
+ 'events': pages.get('events'),
2564
+ 'roster': pages.get('roster'),
2565
+ 'home_shifts': pages.get('home_shifts'),
2566
+ 'away_shifts': pages.get('away_shifts'),
2567
+ 'summary': pages.get('summary')
2568
+ }
2569
+ })
2570
+ second_time = time.time()
2571
+ # Fix this so it doesn't say sourced from ESPN if no coords.
2572
+ if single.equals(events):
2573
+ print("This game took " + str(round(second_time - first_time, 2)) + " seconds.")
2574
+ i = i + 1
2575
+ else:
2576
+ print('Successfully scraped ' + str(game_id) + '. Coordinates sourced from ESPN.')
2577
+ print("This game took " + str(round(second_time - first_time, 2)) + " seconds.")
2578
+ i = i + 1
2579
+
2580
+ # If there are issues with ESPN
2581
+
2582
+ except KeyError as e:
2583
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2584
+ print('KeyError: ' + str(e))
2585
+ print(traceback.format_exc())
2586
+ if return_intermediates:
2587
+ intermediates_list.append({
2588
+ 'game_id': game_id,
2589
+ 'shifts': None,
2590
+ 'api_coords': None,
2591
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2592
+ 'coordinate_source': None,
2593
+ 'warning': None,
2594
+ 'error': f'ESPN KeyError: {str(e)}',
2595
+ 'error_traceback': traceback.format_exc(),
2596
+ 'raw_html': {
2597
+ 'events': pages.get('events') if 'pages' in locals() else None,
2598
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2599
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2600
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2601
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2602
+ }
2603
+ })
2604
+ i = i + 1
2605
+ continue
2606
+ except IndexError as e:
2607
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2608
+ print('IndexError: ' + str(e))
2609
+ if return_intermediates:
2610
+ intermediates_list.append({
2611
+ 'game_id': game_id,
2612
+ 'shifts': None,
2613
+ 'api_coords': None,
2614
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2615
+ 'coordinate_source': None,
2616
+ 'warning': None,
2617
+ 'error': f'ESPN IndexError: {str(e)}',
2618
+ 'error_traceback': traceback.format_exc(),
2619
+ 'raw_html': {
2620
+ 'events': pages.get('events') if 'pages' in locals() else None,
2621
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2622
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2623
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2624
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2625
+ }
2626
+ })
2627
+ i = i + 1
2628
+ continue
2629
+ except TypeError as e:
2630
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2631
+ print('TypeError: ' + str(e))
2632
+ if return_intermediates:
2633
+ intermediates_list.append({
2634
+ 'game_id': game_id,
2635
+ 'shifts': None,
2636
+ 'api_coords': None,
2637
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2638
+ 'coordinate_source': None,
2639
+ 'warning': None,
2640
+ 'error': f'ESPN TypeError: {str(e)}',
2641
+ 'error_traceback': traceback.format_exc(),
2642
+ 'raw_html': {
2643
+ 'events': pages.get('events') if 'pages' in locals() else None,
2644
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2645
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2646
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2647
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2648
+ }
2649
+ })
2650
+ i = i + 1
2651
+ continue
2652
+ except ExpatError as e:
2653
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2654
+ print('ExpatError: ' + str(e))
2655
+ if return_intermediates:
2656
+ intermediates_list.append({
2657
+ 'game_id': game_id,
2658
+ 'shifts': None,
2659
+ 'api_coords': None,
2660
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2661
+ 'coordinate_source': None,
2662
+ 'warning': None,
2663
+ 'error': f'ESPN ExpatError: {str(e)}',
2664
+ 'error_traceback': traceback.format_exc(),
2665
+ 'raw_html': {
2666
+ 'events': pages.get('events') if 'pages' in locals() else None,
2667
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2668
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2669
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2670
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2671
+ }
2672
+ })
2673
+ i = i + 1
2674
+ continue
2675
+
2676
+ except ExpatError:
2677
+ print('There was a rare error with the API; numerous takeaways did not have location coordinates for: ' + str(game_id) + '. Let us try ESPN.')
2678
+
2679
+ try:
2680
+ home_team = single['home_team'].iloc[0]
2681
+ away_team = single['away_team'].iloc[0]
2682
+ game_date = single['game_date'].iloc[0]
2683
+ try:
2684
+ espn_id = scrape_espn_ids_single_game(str(game_date.date()), home_team, away_team).espn_id.iloc[0]
2685
+ event_coords = scrape_espn_events(int(espn_id))
2686
+ duped_coords = api_coords.assign(source = 'api').merge(event_coords.drop(columns = 'espn_id'), on = ['game_seconds', 'event', 'period', 'version', 'event_player_1'], how = 'outer', indicator = True)
2687
+ # Coordinates are flipped in some games.
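+ # If any event's API x-coordinate mirrors its ESPN x-coordinate (e.g. API
+ # coords_x = 63 against ESPN coords_x = -63), negate the ESPN values so both
+ # sources share one rink orientation. (Example values are illustrative.)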
2688
+ if len(duped_coords[duped_coords.coords_x_x * -1 == duped_coords.coords_x_y]) > 0:
2689
+ duped_coords['coords_x_y'] = duped_coords['coords_x_y'] * (-1)
2690
+ if len(duped_coords[duped_coords.coords_y_x * -1 == duped_coords.coords_y_y]) > 0:
2691
+ duped_coords['coords_y_y'] = duped_coords['coords_y_y'] * (-1)
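+ # Prefer the API coordinates; fall back to ESPN values only where the API row is missing.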
2692
+ duped_coords['source'] = np.where((pd.isna(duped_coords.source)) | ((pd.isna(duped_coords.coords_x_x)) & ~pd.isna(duped_coords.coords_x_y)), 'espn', duped_coords.source)
2693
+ duped_coords = duped_coords.assign(coords_x = np.where(pd.isna(duped_coords.coords_x_x), duped_coords.coords_x_y, duped_coords.coords_x_x),
2694
+ coords_y = np.where(pd.isna(duped_coords.coords_y_x), duped_coords.coords_y_y, duped_coords.coords_y_x))
2695
+ col_list = list(api_coords.columns)
2696
+ col_list.append('source')  # plain list.append; _append is a pandas method, not a list method
2697
+ duped_coords = duped_coords.loc[:, col_list]
2698
+ duped_coords = duped_coords[duped_coords.event.isin(['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL', 'PENL', 'FAC'])]
2699
+ duped_coords = duped_coords[~duped_coords.duplicated()]
2700
+ event_coords = duped_coords
2701
+ events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left')#.drop(columns = ['espn_id'])
2702
+ try:
2703
+ events = fix_missing(single, event_coords, events)
2704
+ events['coordinate_source'] = events['source']
2705
+ except IndexError as e:
2706
+ print('Issue when fixing problematic events. Here it is: ' + str(e))
2707
+ except IndexError as e:
2708
+ if event_coords is not None:
2709
+ print('Okay, ESPN had issues. We will go back to the API for this one. Issue: ' + str(e))
2710
+ events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'event'], how = 'left')
2711
+ try:
2712
+ events = fix_missing(single, event_coords, events)
2713
+ except IndexError as e:
2714
+ print('Issue when fixing problematic events. Here it is: ' + str(e))
2715
+ else:
2716
+ print('This game does not have ESPN or API coordinates. You will get it anyway, though. Issue: ' + str(e))
2717
+ events = single
2718
+ events['coordinate_source'] = 'none'
2719
+ # Determine coordinate source for this path
2720
+ coord_source_for_intermediates = 'espn'
2721
+ if 'source' in events.columns:
2722
+ coord_source_for_intermediates = 'api_espn_hybrid'
2723
+ elif 'coordinate_source' not in events.columns or events.get('coordinate_source', pd.Series(['none'])).iloc[0] == 'none':
2724
+ coord_source_for_intermediates = 'none'
2725
+
2726
+ try:
2727
+ shifts = scrape_html_shifts(season, small_id, live,
2728
+ home_page=pages['home_shifts'],
2729
+ away_page=pages['away_shifts'],
2730
+ summary = pages['summary'],
2731
+ roster_cache = roster_cache)
2732
+ finalized = merge_and_prepare(events, shifts, roster_cache, live = live)
2733
+ full_list.append(finalized)
2734
+ second_time = time.time()
2735
+
2736
+ # Track intermediates if requested
2737
+ if return_intermediates:
2738
+ intermediates_list.append({
2739
+ 'game_id': game_id,
2740
+ 'shifts': shifts.copy() if shifts is not None else None,
2741
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2742
+ 'roster_cache': roster_cache.copy() if roster_cache is not None else None,
2743
+ 'coordinate_source': coord_source_for_intermediates,
2744
+ 'warning': None,
2745
+ 'error': None,
2746
+ 'raw_html': {
2747
+ 'events': pages.get('events'),
2748
+ 'roster': pages.get('roster'),
2749
+ 'home_shifts': pages.get('home_shifts'),
2750
+ 'away_shifts': pages.get('away_shifts'),
2751
+ 'summary': pages.get('summary')
2752
+ }
2753
+ })
2754
+ except IndexError as e:
2755
+ print('There was no shift data for this game. Error: ' + str(e))
2756
+ fixed_events = events
2757
+ fixed_events = fixed_events.rename(
2758
+ columns = {'period':'game_period', 'event':'event_type', 'away_team_abbreviated':'away_team',
2759
+ 'home_team_abbreviated':'home_team', 'description':'event_description', 'home_team':'hometeamfull',
2760
+ 'away_team':'awayteamfull'}
2761
+ ).drop(
2762
+ columns = ['original_time', 'other_team', 'strength', 'event_player_str', 'version', 'hometeamfull', 'awayteamfull']
2763
+ ).assign(game_warning = 'NO SHIFT DATA', season = season)
2764
+ full_list.append(fixed_events)
2765
+
2766
+ # Track intermediates if requested
2767
+ if return_intermediates:
2768
+ intermediates_list.append({
2769
+ 'game_id': game_id,
2770
+ 'shifts': None,
2771
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2772
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2773
+ 'coordinate_source': coord_source_for_intermediates,
2774
+ 'warning': 'NO SHIFT DATA',
2775
+ 'error': None,
2776
+ 'raw_html': {
2777
+ 'events': pages.get('events'),
2778
+ 'roster': pages.get('roster'),
2779
+ 'home_shifts': pages.get('home_shifts'),
2780
+ 'away_shifts': pages.get('away_shifts'),
2781
+ 'summary': pages.get('summary')
2782
+ }
2783
+ })
2784
+ second_time = time.time()
2785
+ print('Successfully scraped ' + str(game_id) + '. Coordinate source: ' + coord_source_for_intermediates + '.')
2787
+ print("This game took " + str(round(second_time - first_time, 2)) + " seconds.")
2788
+ i = i + 1
2789
+
2790
+ # If there are issues with ESPN
2791
+
2792
+ except KeyError as e:
2793
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2794
+ print('KeyError: ' + str(e))
2795
+ if return_intermediates:
2796
+ intermediates_list.append({
2797
+ 'game_id': game_id,
2798
+ 'shifts': None,
2799
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2800
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2801
+ 'coordinate_source': None,
2802
+ 'warning': None,
2803
+ 'error': f'ESPN Hybrid KeyError: {str(e)}',
2804
+ 'error_traceback': traceback.format_exc(),
2805
+ 'raw_html': {
2806
+ 'events': pages.get('events') if 'pages' in locals() else None,
2807
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2808
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2809
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2810
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2811
+ }
2812
+ })
2813
+ i = i + 1
2814
+ continue
2815
+ except IndexError as e:
2816
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2817
+ print('IndexError: ' + str(e))
2818
+ if return_intermediates:
2819
+ intermediates_list.append({
2820
+ 'game_id': game_id,
2821
+ 'shifts': None,
2822
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2823
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2824
+ 'coordinate_source': None,
2825
+ 'warning': None,
2826
+ 'error': f'ESPN Hybrid IndexError: {str(e)}',
2827
+ 'error_traceback': traceback.format_exc(),
2828
+ 'raw_html': {
2829
+ 'events': pages.get('events') if 'pages' in locals() else None,
2830
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2831
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2832
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2833
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2834
+ }
2835
+ })
2836
+ i = i + 1
2837
+ continue
2838
+ except TypeError as e:
2839
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2840
+ print('TypeError: ' + str(e))
2841
+ if return_intermediates:
2842
+ intermediates_list.append({
2843
+ 'game_id': game_id,
2844
+ 'shifts': None,
2845
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2846
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2847
+ 'coordinate_source': None,
2848
+ 'warning': None,
2849
+ 'error': f'ESPN Hybrid TypeError: {str(e)}',
2850
+ 'error_traceback': traceback.format_exc(),
2851
+ 'raw_html': {
2852
+ 'events': pages.get('events') if 'pages' in locals() else None,
2853
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2854
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2855
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2856
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2857
+ }
2858
+ })
2859
+ i = i + 1
2860
+ continue
2861
+ except ExpatError as e:
2862
+ print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
2863
+ print('ExpatError: ' + str(e))
2864
+ if return_intermediates:
2865
+ intermediates_list.append({
2866
+ 'game_id': game_id,
2867
+ 'shifts': None,
2868
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2869
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2870
+ 'coordinate_source': None,
2871
+ 'warning': None,
2872
+ 'error': f'ESPN Hybrid ExpatError: {str(e)}',
2873
+ 'error_traceback': traceback.format_exc(),
2874
+ 'raw_html': {
2875
+ 'events': pages.get('events') if 'pages' in locals() else None,
2876
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2877
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2878
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2879
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2880
+ }
2881
+ })
2882
+ i = i + 1
2883
+ continue
2884
+
2885
+ except ConnectionError as e:
2886
+ print('Got a Connection Error, time to sleep.')
2887
+ if return_intermediates:
2888
+ intermediates_list.append({
2889
+ 'game_id': game_id if 'game_id' in locals() else game_id_list[i],
2890
+ 'shifts': None,
2891
+ 'api_coords': None,
2892
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2893
+ 'coordinate_source': None,
2894
+ 'warning': None,
2895
+ 'error': f'ConnectionError: {str(e)}',
2896
+ 'error_traceback': traceback.format_exc(),
2897
+ 'raw_html': {
2898
+ 'events': pages.get('events') if 'pages' in locals() else None,
2899
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2900
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2901
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2902
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2903
+ }
2904
+ })
2905
+ time.sleep(10)
2906
+ continue
2907
+
2908
+ except ChunkedEncodingError as e:
2909
+ print('Got a Chunked Encoding Error, time to sleep.')
2910
+ if return_intermediates:
2911
+ intermediates_list.append({
2912
+ 'game_id': game_id if 'game_id' in locals() else game_id_list[i],
2913
+ 'shifts': None,
2914
+ 'api_coords': None,
2915
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2916
+ 'coordinate_source': None,
2917
+ 'warning': None,
2918
+ 'error': f'ChunkedEncodingError: {str(e)}',
2919
+ 'error_traceback': traceback.format_exc(),
2920
+ 'raw_html': {
2921
+ 'events': pages.get('events') if 'pages' in locals() else None,
2922
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2923
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2924
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2925
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2926
+ }
2927
+ })
2928
+ time.sleep(10)
2929
+ continue
2930
+
2931
+ except AttributeError as e:
2932
+ print(str(game_id) + ' does not have an HTML report. Here is the error: ' + str(e))
2933
+ print(traceback.format_exc())
2934
+ if return_intermediates:
2935
+ intermediates_list.append({
2936
+ 'game_id': game_id if 'game_id' in locals() else game_id_list[i],
2937
+ 'shifts': None,
2938
+ 'api_coords': None,
2939
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2940
+ 'coordinate_source': None,
2941
+ 'warning': None,
2942
+ 'error': f'AttributeError: {str(e)}',
2943
+ 'error_traceback': traceback.format_exc(),
2944
+ 'raw_html': {
2945
+ 'events': pages.get('events') if 'pages' in locals() else None,
2946
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2947
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2948
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2949
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2950
+ }
2951
+ })
2952
+ i = i + 1
2953
+ continue
2954
+
2955
+ except IndexError as e:
2956
+ print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
2957
+ print(traceback.format_exc())
2958
+ if return_intermediates:
2959
+ intermediates_list.append({
2960
+ 'game_id': game_id if 'game_id' in locals() else game_id_list[i],
2961
+ 'shifts': shifts.copy() if 'shifts' in locals() and shifts is not None else None,
2962
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2963
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2964
+ 'coordinate_source': None,
2965
+ 'warning': None,
2966
+ 'error': f'IndexError: {str(e)}',
2967
+ 'error_traceback': traceback.format_exc(),
2968
+ 'raw_html': {
2969
+ 'events': pages.get('events') if 'pages' in locals() else None,
2970
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2971
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2972
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2973
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2974
+ }
2975
+ })
2976
+ i = i + 1
2977
+ continue
2978
+
2979
+ except ValueError as e:
2980
+ print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
2981
+ print(traceback.format_exc())
2982
+ if return_intermediates:
2983
+ intermediates_list.append({
2984
+ 'game_id': game_id if 'game_id' in locals() else game_id_list[i],
2985
+ 'shifts': shifts.copy() if 'shifts' in locals() and shifts is not None else None,
2986
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
2987
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
2988
+ 'coordinate_source': None,
2989
+ 'warning': None,
2990
+ 'error': f'ValueError: {str(e)}',
2991
+ 'error_traceback': traceback.format_exc(),
2992
+ 'raw_html': {
2993
+ 'events': pages.get('events') if 'pages' in locals() else None,
2994
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
2995
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
2996
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
2997
+ 'summary': pages.get('summary') if 'pages' in locals() else None
2998
+ }
2999
+ })
3000
+ i = i + 1
3001
+ continue
3002
+
3003
+ except KeyError as k:
3004
+ print(str(game_id) + ' gave some kind of Key Error. Here is the error: ' + str(k))
3005
+ if return_intermediates:
3006
+ intermediates_list.append({
3007
+ 'game_id': game_id if 'game_id' in locals() else game_id_list[i],
3008
+ 'shifts': shifts.copy() if 'shifts' in locals() and shifts is not None else None,
3009
+ 'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
3010
+ 'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
3011
+ 'coordinate_source': None,
3012
+ 'warning': None,
3013
+ 'error': f'KeyError: {str(k)}',
3014
+ 'error_traceback': traceback.format_exc(),
3015
+ 'raw_html': {
3016
+ 'events': pages.get('events') if 'pages' in locals() else None,
3017
+ 'roster': pages.get('roster') if 'pages' in locals() else None,
3018
+ 'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
3019
+ 'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
3020
+ 'summary': pages.get('summary') if 'pages' in locals() else None
3021
+ }
3022
+ })
3023
+ i = i + 1
3024
+ continue
3025
+
3026
+ except KeyboardInterrupt:
3027
+ print('You manually interrupted the scrape. You will get to keep every game you have already completed scraping after just a bit of post-processing. Good bye.')
3028
+ global hidden_patrick
3029
+ hidden_patrick = 1
3030
+ # OPTIMIZED: Concat list to DataFrame
3031
+ full = pd.concat(full_list, ignore_index=True) if full_list else pd.DataFrame()
3032
+ if len(full) > 0:
3033
+
3034
+ full = full.assign(home_skaters = np.where(~full.home_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
3035
+ (full.home_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
3036
+ full.home_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
3037
+ full.home_skaters))
3038
+
3039
+ full = full.assign(away_skaters = np.where(~full.away_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
3040
+ (full.away_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
3041
+ full.away_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
3042
+ full.away_skaters))
3043
+
3044
+ if 'away_on_1' in full.columns:
3045
+
3046
+ full = full.assign(
3047
+ away_on_1 = np.where((pd.isna(full.away_on_1)) | (full.away_on_1 is None) | (full.away_on_1=='') | (full.away_on_1=='\xa0'), '\xa0', full.away_on_1),
3048
+ away_on_2 = np.where((pd.isna(full.away_on_2)) | (full.away_on_2 is None) | (full.away_on_2=='') | (full.away_on_2=='\xa0'), '\xa0', full.away_on_2),
3049
+ away_on_3 = np.where((pd.isna(full.away_on_3)) | (full.away_on_3 is None) | (full.away_on_3=='') | (full.away_on_3=='\xa0'), '\xa0', full.away_on_3),
3050
+ away_on_4 = np.where((pd.isna(full.away_on_4)) | (full.away_on_4 is None) | (full.away_on_4=='') | (full.away_on_4=='\xa0'), '\xa0', full.away_on_4),
3051
+ away_on_5 = np.where((pd.isna(full.away_on_5)) | (full.away_on_5 is None) | (full.away_on_5=='') | (full.away_on_5=='\xa0'), '\xa0', full.away_on_5),
3052
+ away_on_6 = np.where((pd.isna(full.away_on_6)) | (full.away_on_6 is None) | (full.away_on_6=='') | (full.away_on_6=='\xa0'), '\xa0', full.away_on_6),
3053
+ away_on_7 = np.where((pd.isna(full.away_on_7)) | (full.away_on_7 is None) | (full.away_on_7=='') | (full.away_on_7=='\xa0'), '\xa0', full.away_on_7),
3054
+ away_on_8 = np.where((pd.isna(full.away_on_8)) | (full.away_on_8 is None) | (full.away_on_8=='') | (full.away_on_8=='\xa0'), '\xa0', full.away_on_8),
3055
+ away_on_9 = np.where((pd.isna(full.away_on_9)) | (full.away_on_9 is None) | (full.away_on_9=='') | (full.away_on_9=='\xa0'), '\xa0', full.away_on_9),
3056
+ home_on_1 = np.where((pd.isna(full.home_on_1)) | (full.home_on_1 is None) | (full.home_on_1=='') | (full.home_on_1=='\xa0'), '\xa0', full.home_on_1),
3057
+ home_on_2 = np.where((pd.isna(full.home_on_2)) | (full.home_on_2 is None) | (full.home_on_2=='') | (full.home_on_2=='\xa0'), '\xa0', full.home_on_2),
3058
+ home_on_3 = np.where((pd.isna(full.home_on_3)) | (full.home_on_3 is None) | (full.home_on_3=='') | (full.home_on_3=='\xa0'), '\xa0', full.home_on_3),
3059
+ home_on_4 = np.where((pd.isna(full.home_on_4)) | (full.home_on_4 is None) | (full.home_on_4=='') | (full.home_on_4=='\xa0'), '\xa0', full.home_on_4),
3060
+ home_on_5 = np.where((pd.isna(full.home_on_5)) | (full.home_on_5 is None) | (full.home_on_5=='') | (full.home_on_5=='\xa0'), '\xa0', full.home_on_5),
3061
+ home_on_6 = np.where((pd.isna(full.home_on_6)) | (full.home_on_6 is None) | (full.home_on_6=='') | (full.home_on_6=='\xa0'), '\xa0', full.home_on_6),
3062
+ home_on_7 = np.where((pd.isna(full.home_on_7)) | (full.home_on_7 is None) | (full.home_on_7=='') | (full.home_on_7=='\xa0'), '\xa0', full.home_on_7),
3063
+ home_on_8 = np.where((pd.isna(full.home_on_8)) | (full.home_on_8 is None) | (full.home_on_8=='') | (full.home_on_8=='\xa0'), '\xa0', full.home_on_8),
3064
+ home_on_9 = np.where((pd.isna(full.home_on_9)) | (full.home_on_9 is None) | (full.home_on_9=='') | (full.home_on_9=='\xa0'), '\xa0', full.home_on_9),
3065
+ home_goalie = np.where((pd.isna(full.home_goalie)) | (full.home_goalie is None) | (full.home_goalie=='') | (full.home_goalie=='\xa0'), '\xa0', full.home_goalie),
3066
+ away_goalie = np.where((pd.isna(full.away_goalie)) | (full.away_goalie is None) | (full.away_goalie=='') | (full.away_goalie=='\xa0'), '\xa0', full.away_goalie)
3067
+ )
3068
+
3069
3071
+ if return_intermediates:
3072
+ return {'final': full, 'intermediates': intermediates_list}
3073
+ return full
3074
+
3075
+ # OPTIMIZED: Concat list to DataFrame before final processing
3076
+ full = pd.concat(full_list, ignore_index=True) if full_list else pd.DataFrame()
3077
+
3078
+ if len(full) > 0:
3079
+
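+ # Where home_skaters/away_skaters hold a string of on-ice player codes rather
+ # than a count, derive the count as capital letters minus goalies: e.g. an
+ # illustrative string 'CLRDDG' gives 6 capitals - 1 'G' = 5 skaters.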
3080
+ full = full.assign(home_skaters = np.where(~full.home_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
3081
+ (full.home_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
3082
+ full.home_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
3083
+ full.home_skaters))
3084
+
3085
+ full = full.assign(away_skaters = np.where(~full.away_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
3086
+ (full.away_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
3087
+ full.away_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
3088
+ full.away_skaters))
3089
+
3090
+ if 'away_on_1' in full.columns:
3091
+
3092
+ full = full.assign(
3093
+ away_on_1 = np.where((pd.isna(full.away_on_1)) | (full.away_on_1 is None) | (full.away_on_1=='') | (full.away_on_1=='\xa0'), '\xa0', full.away_on_1),
3094
+ away_on_2 = np.where((pd.isna(full.away_on_2)) | (full.away_on_2 is None) | (full.away_on_2=='') | (full.away_on_2=='\xa0'), '\xa0', full.away_on_2),
3095
+ away_on_3 = np.where((pd.isna(full.away_on_3)) | (full.away_on_3 is None) | (full.away_on_3=='') | (full.away_on_3=='\xa0'), '\xa0', full.away_on_3),
3096
+ away_on_4 = np.where((pd.isna(full.away_on_4)) | (full.away_on_4 is None) | (full.away_on_4=='') | (full.away_on_4=='\xa0'), '\xa0', full.away_on_4),
3097
+ away_on_5 = np.where((pd.isna(full.away_on_5)) | (full.away_on_5 is None) | (full.away_on_5=='') | (full.away_on_5=='\xa0'), '\xa0', full.away_on_5),
3098
+ away_on_6 = np.where((pd.isna(full.away_on_6)) | (full.away_on_6 is None) | (full.away_on_6=='') | (full.away_on_6=='\xa0'), '\xa0', full.away_on_6),
3099
+ away_on_7 = np.where((pd.isna(full.away_on_7)) | (full.away_on_7 is None) | (full.away_on_7=='') | (full.away_on_7=='\xa0'), '\xa0', full.away_on_7),
3100
+ away_on_8 = np.where((pd.isna(full.away_on_8)) | (full.away_on_8 is None) | (full.away_on_8=='') | (full.away_on_8=='\xa0'), '\xa0', full.away_on_8),
3101
+ away_on_9 = np.where((pd.isna(full.away_on_9)) | (full.away_on_9 is None) | (full.away_on_9=='') | (full.away_on_9=='\xa0'), '\xa0', full.away_on_9),
3102
+ home_on_1 = np.where((pd.isna(full.home_on_1)) | (full.home_on_1 is None) | (full.home_on_1=='') | (full.home_on_1=='\xa0'), '\xa0', full.home_on_1),
3103
+ home_on_2 = np.where((pd.isna(full.home_on_2)) | (full.home_on_2 is None) | (full.home_on_2=='') | (full.home_on_2=='\xa0'), '\xa0', full.home_on_2),
3104
+ home_on_3 = np.where((pd.isna(full.home_on_3)) | (full.home_on_3 is None) | (full.home_on_3=='') | (full.home_on_3=='\xa0'), '\xa0', full.home_on_3),
3105
+ home_on_4 = np.where((pd.isna(full.home_on_4)) | (full.home_on_4 is None) | (full.home_on_4=='') | (full.home_on_4=='\xa0'), '\xa0', full.home_on_4),
3106
+ home_on_5 = np.where((pd.isna(full.home_on_5)) | (full.home_on_5 is None) | (full.home_on_5=='') | (full.home_on_5=='\xa0'), '\xa0', full.home_on_5),
3107
+ home_on_6 = np.where((pd.isna(full.home_on_6)) | (full.home_on_6 is None) | (full.home_on_6=='') | (full.home_on_6=='\xa0'), '\xa0', full.home_on_6),
3108
+ home_on_7 = np.where((pd.isna(full.home_on_7)) | (full.home_on_7 is None) | (full.home_on_7=='') | (full.home_on_7=='\xa0'), '\xa0', full.home_on_7),
3109
+ home_on_8 = np.where((pd.isna(full.home_on_8)) | (full.home_on_8 is None) | (full.home_on_8=='') | (full.home_on_8=='\xa0'), '\xa0', full.home_on_8),
3110
+ home_on_9 = np.where((pd.isna(full.home_on_9)) | (full.home_on_9 is None) | (full.home_on_9=='') | (full.home_on_9=='\xa0'), '\xa0', full.home_on_9),
3111
+ home_goalie = np.where((pd.isna(full.home_goalie)) | (full.home_goalie is None) | (full.home_goalie=='') | (full.home_goalie=='\xa0'), '\xa0', full.home_goalie),
3112
+ away_goalie = np.where((pd.isna(full.away_goalie)) | (full.away_goalie is None) | (full.away_goalie=='') | (full.away_goalie=='\xa0'), '\xa0', full.away_goalie)
3113
+ )
3114
+
3115
+ if live and 'game_strength_state' in full.columns:
3116
+
3117
+ # Find the point in time where everybody jumps off (i.e., the synthetic shifts end) and get rid of that and everything after.
3118
+ # (IF we have such a time)
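+ # (A strength state containing 'E' appears to denote a side with no on-ice
+ # data, so e.g. a '5vE' row followed by 'EvE' rows would mark that moment.)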
3119
+
3120
+ if len(
3121
+ full[(full.game_strength_state.str.contains('E')) &
3122
+ ((full.game_strength_state != 'EvE')) &
3123
+ (full.game_strength_state.shift(-1) == 'EvE') &
3124
+ (full.game_period == max(full.game_period))]) > 0:
3125
+
3126
+ full = full[full.event_index <=
3127
+ full[(full.game_strength_state.str.contains('E')) &
3128
+ ((full.game_strength_state != 'EvE')) &
3129
+ (full.game_strength_state.shift(-1) == 'EvE') &
3130
+ (full.game_period == max(full.game_period))].event_index.iloc[-1] - 1]
3131
+
3132
+ # If we don't have such a point in time (which can happen when home clock and away clock are misaligned, for example):
3133
+ # Then we find the final change and ditch everything beneath it
3134
+
3135
+ elif full[full.event_type=='CHANGE'].iloc[-1].game_strength_state in ['5vE', 'Ev5']:
3136
+
3137
+ full = full[full.event_index <= full[full.event_type=='CHANGE'].iloc[-1].event_index]
3138
+
3139
+ if return_intermediates:
3140
+ return {'final': full, 'intermediates': intermediates_list}
3141
+ return full
3142
+
3143
+ def full_scrape(game_id_list, live = True, shift = False, return_intermediates = False):
3144
+
3145
+ global hidden_patrick
3146
+ hidden_patrick = 0
3147
+
3148
+ result = full_scrape_1by1(game_id_list, live, shift_to_espn = shift, return_intermediates = return_intermediates)
3149
+
3150
+ # Handle return_intermediates case
3151
+ if return_intermediates:
3152
+ df = result['final']
3153
+ intermediates_list = result['intermediates']
3154
+ else:
3155
+ df = result
3156
+ intermediates_list = None
3157
+
3158
+ print('Full scrape complete, we have this many rows:', len(df))
3159
+
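+ # Disambiguate Vancouver's two Elias Petterssons by the jersey number parsed
+ # out of the event description: #25 is the defenseman, so a description such
+ # as 'VAN #25 PETTERSSON ...' (illustrative) relabels him 'ELIAS PETTERSSON(D)'.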
3160
+ try:
3161
+ df = df.assign(
3162
+ event_player_1 = np.where(
3163
+ (df.event_player_1 == 'ELIAS PETTERSSON') &
3164
+ (df.event_description.str.contains('#', na=False)) &
3165
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3166
+ (df.event_description.str.extract(r'#(\d+) PETTERSSON', expand=False) == '25'),
3167
+ 'ELIAS PETTERSSON(D)', df.event_player_1),
3168
+ event_player_2 = np.where(
3169
+ (df.event_player_2 == 'ELIAS PETTERSSON') &
3170
+ (
3171
+ # Goal and Petey got A1
3172
+ ((df.event_type == 'GOAL') &
3173
+ (df.event_description.str.contains(': #', na=False)) &
3174
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3175
+ (df.event_description.str.extract(r': #(\d+) PETTERSSON', expand=False) == '25')) |
3176
+ # Not a goal, Petey was EP2
3177
+ ((df.event_type != 'GOAL') &
3178
+ (df.event_description.str.contains('VAN #', na=False)) &
3179
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3180
+ (df.event_description.str.extract(r'VAN #(\d+) PETTERSSON', expand=False) == '25'))
3181
+ ),
3182
+ 'ELIAS PETTERSSON(D)', df.event_player_2),
3183
+ event_player_3 = np.where(
3184
+ (df.event_player_3=='ELIAS PETTERSSON') &
3185
+ (df.event_description.str.contains('#', na=False)) &
3186
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
3187
+ (df.event_description.str.extract(r'#(\d+) PETTERSSON(?:\s|$)', expand=False) == '25'),
3188
+ 'ELIAS PETTERSSON(D)', df.event_player_3)
3189
+ )
3190
+ except Exception as e:
3191
+ print(e)
3192
+
3193
+ # Don't even need this, we've had this problem with Stutzle for years, just let it be.
3194
+ # df.event_description = df.event_description.str.replace('FEHÃ\x89RVÃ\x81RY', 'FEHERVARY').str.replace('BLÜMEL', 'BLAMEL')
3195
+
3196
+ if (hidden_patrick==0) and (len(df)>0):
3197
+
3198
+ gids = list(set(df.game_id))
3199
+ missing = [x for x in game_id_list if x not in gids]
3200
+ if len(missing)>0:
3201
+ print('You missed the following games: ' + str(missing))
3202
+ print('Let us try scraping each of them one more time.')
3203
+ retry_result = full_scrape_1by1(missing, live, shift_to_espn = shift, return_intermediates = return_intermediates)
3204
+ if return_intermediates:
3205
+ retry_df = retry_result['final']
3206
+ retry_intermediates = retry_result['intermediates']
3207
+ df = pd.concat([df, retry_df], ignore_index=True)
3208
+ intermediates_list.extend(retry_intermediates)
3209
+ return {'final': df, 'intermediates': intermediates_list}
3210
+ else:
3211
+ df = pd.concat([df, retry_result], ignore_index=True)
3212
+ return df
3213
+ else:
3214
+ if return_intermediates:
3215
+ return {'final': df, 'intermediates': intermediates_list}
3216
+ return df
3217
+
3218
+ else:
3219
+ if return_intermediates:
3220
+ return {'final': df, 'intermediates': intermediates_list}
3221
+ return df
3222
+
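+ # Example usage (a sketch; the game IDs are illustrative):
+ #
+ # pbp = full_scrape([2023020001, 2023020002], live = False)
+ # bundle = full_scrape([2023020001], live = False, return_intermediates = True)
+ # pbp, extras = bundle['final'], bundle['intermediates']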
3223
+ print("Welcome to the TopDownHockey NHL Scraper, built by Patrick Bacon.")
3224
+ print("If you enjoy the scraper and would like to support my work, or you have any comments, questions, or concerns, feel free to follow me on Twitter @TopDownHockey or reach out to me via email at patrick.s.bacon@gmail.com. Have fun!")