TopDownHockey-Scraper 6.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +820 -0
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +3285 -0
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper_OG.py +3224 -0
- TopDownHockey_Scraper/__init__.py +26 -0
- TopDownHockey_Scraper/data/handedness.csv +1276 -0
- TopDownHockey_Scraper/name_corrections.py +302 -0
- TopDownHockey_Scraper/portrait_links.csv +2445 -0
- TopDownHockey_Scraper/scrape_nhl_api_events.py +438 -0
- topdownhockey_scraper-6.1.30.dist-info/METADATA +169 -0
- topdownhockey_scraper-6.1.30.dist-info/RECORD +13 -0
- topdownhockey_scraper-6.1.30.dist-info/WHEEL +5 -0
- topdownhockey_scraper-6.1.30.dist-info/licenses/LICENSE +19 -0
- topdownhockey_scraper-6.1.30.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3285 @@
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup  # Keep for fallback/compatibility
from lxml import html, etree
import requests
import time
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")
import sys
import json
from json import loads, dumps
import lxml
from requests import ConnectionError, ReadTimeout, ConnectTimeout, HTTPError, Timeout
import xml
import re
from natsort import natsorted
import xml.etree.ElementTree as ET
import xmltodict
from xml.parsers.expat import ExpatError
from requests.exceptions import ChunkedEncodingError
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from TopDownHockey_Scraper.scrape_nhl_api_events import scrape_api_events

print('Successfully did local install plus update - OPTIMIZED VERSION (Round 1: _append(), Round 2: name corrections, Round 3: vectorization, Round 4: parallel network requests)')

# ========== OPTIMIZATIONS ==========
# Create a persistent session with connection pooling
_session = requests.Session()
_adapter = requests.adapters.HTTPAdapter(
    pool_connections=10,
    pool_maxsize=20,
    max_retries=2
)
_session.mount('http://', _adapter)
_session.mount('https://', _adapter)

# Compile regex patterns once for reuse
_BBORDER_PATTERN = re.compile('.*bborder.*')
_ZONE_PATTERN = re.compile(r'(\S+?) Zone')
_PLAYER_NUM_PATTERN = re.compile(r'[#-]\s*(\d+)')
_MATCH_GAME_PATTERN = re.compile(r'Match|Game')
_PARENTHESIS_PATTERN = re.compile(r'\((.*?)\)')
_MULTI_SPACE_PATTERN = re.compile(r' +')
_CAPTAIN_A_PATTERN = re.compile(r' \(A\)$')
_CAPTAIN_C_PATTERN = re.compile(r' \(C\)$')

# ========== PARALLEL FETCHING HELPERS ==========
def _fetch_url(url, **kwargs):
    """Helper function to fetch URL with session for use in ThreadPoolExecutor"""
    return _session.get(url, **kwargs)
# ===============================================
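
# Illustrative sketch (editor's addition, not called anywhere in this module):
# how _fetch_url and the ThreadPoolExecutor import above can be combined to pull
# several report pages in parallel. The URL list is whatever the caller builds.
def _fetch_many(urls, timeout=10):
    """Fetch several URLs concurrently over the shared session; returns {url: Response}."""
    results = {}
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = {pool.submit(_fetch_url, u, timeout=timeout): u for u in urls}
        for fut in as_completed(futures):
            results[futures[fut]] = fut.result()
    return results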

team_names = ['ANAHEIM DUCKS',
              'ARIZONA COYOTES',
              'ATLANTA THRASHERS',
              'BOSTON BRUINS',
              'BUFFALO SABRES',
              'CALGARY FLAMES',
              'CHICAGO BLACKHAWKS',
              'COLORADO AVALANCHE',
              'COLUMBUS BLUE JACKETS',
              'DALLAS STARS',
              'DETROIT RED WINGS',
              'EDMONTON OILERS',
              'FLORIDA PANTHERS',
              'LOS ANGELES KINGS',
              'MINNESOTA WILD',
              'MONTRÉAL CANADIENS',
              'MONTREAL CANADIENS',
              'NASHVILLE PREDATORS',
              'NEW JERSEY DEVILS',
              'NEW YORK ISLANDERS',
              'NEW YORK RANGERS',
              'OTTAWA SENATORS',
              'PHILADELPHIA FLYERS',
              'PITTSBURGH PENGUINS',
              'PHOENIX COYOTES',
              'CAROLINA HURRICANES',
              'SAN JOSE SHARKS',
              'ST. LOUIS BLUES',
              'TAMPA BAY LIGHTNING',
              'TORONTO MAPLE LEAFS',
              'UTAH MAMMOTH',
              'VANCOUVER CANUCKS',
              'VEGAS GOLDEN KNIGHTS',
              'WASHINGTON CAPITALS',
              'WINNIPEG JETS',
              'SEATTLE KRAKEN']

# ewc stands for "Events we care about."

ewc = ['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL']

# ========== OPTIMIZATION: Name Correction Dictionaries ==========
# Convert nested np.where() chains to fast dictionary lookups
# This provides 50-90% speedup on name correction operations

# ==================================

from TopDownHockey_Scraper.name_corrections import NAME_CORRECTIONS, normalize_player_name
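
# Illustrative sketch (editor's addition): the dictionary-lookup approach the
# comments above describe. It assumes NAME_CORRECTIONS maps raw report names to
# corrected ones, so a single vectorized Series.replace() stands in for a long
# chain of nested np.where() calls.
def _correct_names_sketch(names: pd.Series) -> pd.Series:
    """Apply the whole correction table in one pass instead of one np.where() per player."""
    return names.replace(NAME_CORRECTIONS)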

def subtract_from_twenty_minutes(time_string):
    # Parse the input time string
    minutes, seconds = map(int, time_string.split(':'))

    # Convert to total seconds
    input_seconds = minutes * 60 + seconds
    twenty_minutes_seconds = 20 * 60  # 1200 seconds

    # Calculate the difference
    difference_seconds = twenty_minutes_seconds - input_seconds

    # Convert back to MM:SS format
    result_minutes = difference_seconds // 60
    result_seconds = difference_seconds % 60

    # Format the result
    return f"{result_minutes}:{result_seconds:02d}"

def convert_clock_to_seconds(clock):
    min = int(clock.split(':')[0])
    sec = int(clock.split(':')[1])
    seconds = min * 60 + sec
    return seconds

def convert_seconds_to_clock(seconds):

    if len(str(int(seconds/60))) == 1:
        minutes_string = '0' + str(int(seconds/60))
    else:
        minutes_string = str(int(seconds/60))

    remainder = seconds - (60 * int(seconds/60))

    if len(str(remainder)) == 1:
        remainder_string = '0' + str(remainder)
    else:
        remainder_string = str(remainder)

    return minutes_string + ':' + remainder_string
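
# Quick illustration of the three clock helpers above (editor's example values):
#   convert_clock_to_seconds('12:30')      -> 750
#   convert_seconds_to_clock(750)          -> '12:30'
#   subtract_from_twenty_minutes('12:30')  -> '7:30'  (time left in a 20:00 period)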

def scrape_schedule(start_date, end_date):

    """
    Scrape the NHL's API and get a schedule back.
    """

    url = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=' + start_date + '&endDate=' + end_date
    page = _session.get(url, timeout=30)
    loaddict = json.loads(page.content)
    date_list = (loaddict['dates'])
    date_df = pd.DataFrame(date_list)

    # OPTIMIZED: Use list + concat instead of repeated _append()
    gamedf_list = []
    for i in range(0, len(date_df)):
        datedf = pd.DataFrame(date_df.games.iloc[i])
        gamedf_list.append(datedf)
    gamedf = pd.concat(gamedf_list, ignore_index=True) if gamedf_list else pd.DataFrame()
    global team_df
    team_df = pd.DataFrame(gamedf['teams'].values.tolist(), index = gamedf.index)
    away_df = pd.DataFrame(team_df['away'].values.tolist(), index = team_df.index)
    home_df = pd.DataFrame(team_df['home'].values.tolist(), index = team_df.index)
    away_team_df = pd.DataFrame(away_df['team'].values.tolist(), index = away_df.index)
    home_team_df = pd.DataFrame(home_df['team'].values.tolist(), index = home_df.index)

    gamedf = gamedf.assign(
        state = pd.DataFrame(gamedf['status'].values.tolist(), index = gamedf.index)['detailedState'],
        homename = home_team_df['name'],
        homeid = home_team_df['id'],
        homescore = home_df['score'],
        awayname = away_team_df['name'],
        awayid = away_team_df['id'],
        awayscore = away_df['score'],
        venue = pd.DataFrame(gamedf['venue'].values.tolist(), index = gamedf.index)['name'],
        gameDate = pd.to_datetime(gamedf['gameDate']).dt.tz_convert('EST')
    )

    gamedf = gamedf.loc[:, ['gamePk', 'link', 'gameType', 'season', 'gameDate', 'homeid', 'homename', 'homescore', 'awayid', 'awayname', 'awayscore', 'state', 'venue']].rename(
        columns = {'gamePk':'ID', 'gameType':'type', 'gameDate':'date'})

    gamedf['type']

    return(gamedf)
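
# Usage sketch (editor's example): dates are 'YYYY-MM-DD' strings and the result
# has one row per game with ID / date / team names, ids, scores / state / venue.
# Note that statsapi.web.nhl.com is the legacy NHL stats API endpoint.
#   sched = scrape_schedule('2021-01-13', '2021-01-14')
#   finals = sched[sched.state == 'Final'][['ID', 'date', 'awayname', 'homename', 'awayscore', 'homescore']]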

def hs_strip_html(td):
    """
    Function from Harry Shomer's Github

    Strip html tags and such

    :param td: pbp (list of lxml elements)

    :return: list of plays (which contain a list of info) stripped of html
    """
    for y in range(len(td)):
        # Get the 'br' tag for the time column...this gets us time remaining instead of elapsed and remaining combined
        if y == 3:
            td[y] = td[y].text_content()  # This gets us elapsed and remaining combined - 3:0017:00
            index = td[y].find(':')
            td[y] = td[y][:index+3]
        elif (y == 6 or y == 7) and td[0] != '#':
            # 6 & 7 -> These are the players on ice
            # The second statement controls for when it's just a header
            baz = td[y].xpath('.//td')
            bar = [baz[z] for z in range(len(baz)) if z % 4 != 0]  # Because of previous step we get repeats...delete some

            # The setup in the list is now: Name/Number -> Position -> Blank...and repeat
            # Now strip all the html
            players = []
            for i in range(len(bar)):
                if i % 3 == 0:
                    try:
                        font_elem = bar[i].xpath('.//font')
                        if font_elem:
                            name = return_name_html(font_elem[0].get('title', ''))
                        else:
                            name = ''
                        number = bar[i].text_content().strip('\n')  # Get number and strip leading/trailing newlines
                    except (KeyError, IndexError):
                        name = ''
                        number = ''
                elif i % 3 == 1:
                    if name != '':
                        position = bar[i].text_content()
                        players.append([name, number, position])

            td[y] = players
        else:
            td[y] = td[y].text_content()

    return td
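
# Usage sketch (editor's note): later in this module the play-by-play report is
# parsed by selecting the 'bborder' cells, passing them through hs_strip_html,
# and reshaping the flat list into 8-column event rows, roughly:
#   tds = doc.xpath("//td[contains(@class, 'bborder')]")
#   cells = hs_strip_html(tds)
#   length = (len(cells) // 8) * 8
#   events = pd.DataFrame(np.array(cells[:length]).reshape(int(length/8), 8))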

def parse_goaltender_summary(goalie_table):
    """Parse the goaltender summary table into a DataFrame."""

    rows = goalie_table.find_all('tr')

    goalie_data = []
    current_team = None

    for row in rows:
        cells = row.find_all('td')
        if not cells:
            continue

        # Check if this is a team header row (contains team name)
        first_cell_text = cells[0].get_text(strip=True)

        # Team header row - look for visitorsectionheading or homesectionheading
        if 'visitorsectionheading' in str(cells[0].get('class', [])) or \
           'homesectionheading' in str(cells[0].get('class', [])):
            # Extract team name
            if first_cell_text and first_cell_text not in ['TOI', 'GOALS-SHOTS AGAINST', 'EV', 'PP', 'SH', 'TOT', '1', '2', '3']:
                current_team = first_cell_text
            continue

        # Skip subheader rows (EV, PP, SH, etc.)
        if first_cell_text in ['EV', 'PP', 'SH', 'TOT', '']:
            continue

        # Skip TEAM TOTALS and spacer rows
        if 'TEAM TOTALS' in first_cell_text or first_cell_text == '\xa0':
            continue

        # This should be a goaltender data row
        # Check if it has position "G" in the second cell
        if len(cells) >= 11:
            cell_texts = [c.get_text(strip=True) for c in cells]

            # Goalie rows have: Number, "G", Name, EV, PP, SH, TOT, P1, P2, P3, TOT
            if len(cell_texts) >= 2 and cell_texts[1] == 'G':
                goalie_data.append({
                    'team': current_team,
                    'number': cell_texts[0],
                    'name': cell_texts[2],
                    'EV Total': cell_texts[3] if cell_texts[3] else None,
                    'PP Total': cell_texts[4] if cell_texts[4] else None,
                    'TOI': cell_texts[6] if cell_texts[6] else None,
                })

    return pd.DataFrame(goalie_data)
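
# Usage sketch (editor's example, mirroring how this parser is called below in
# scrape_html_shifts): the goaltender table is located in the game summary report
# by its section heading and then handed to parse_goaltender_summary.
#   summary_soup = BeautifulSoup(summary_page.content.decode('ISO-8859-1'), 'lxml')
#   for section in summary_soup.find_all('td', class_='sectionheading'):
#       if 'GOALTENDER SUMMARY' in section.get_text():
#           goalie_table = section.find_parent('tr').find_next_sibling('tr').find('table')
#           break
#   goalies = parse_goaltender_summary(goalie_table)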

def group_if_not_none(result):
    if result is not None:
        result = result.group()
    return(result)

def scrape_html_roster(season, game_id, page=None, verbose=False):
    """
    Scrape HTML roster page.

    Args:
        season: Season string (e.g., '20242025')
        game_id: Game ID string (e.g., '020333')
        page: Optional pre-fetched requests.Response object. If None, will fetch the page.
        verbose: If True, print detailed timing information

    Returns:
        DataFrame with roster information
    """
    if page is None:
        url = 'http://www.nhl.com/scores/htmlreports/' + season + '/RO0' + game_id + '.HTM'

        # TIME: Roster network request
        net_start = time.time()
        page = _session.get(url, timeout=10)
        net_duration = time.time() - net_start
        if verbose:
            try:
                print(f' ⏱️ Roster network request: {net_duration:.2f}s')
            except Exception:
                pass

    # OPTIMIZED: Use lxml directly instead of BeautifulSoup for faster parsing
    if type(page) == str:
        doc = html.fromstring(page)
    else:
        doc = html.fromstring(page.content.decode('ISO-8859-1'))

    # XPath to find td elements with align='center', class containing 'teamHeading' and 'border', width='50%'
    teamsoup = doc.xpath("//td[@align='center' and @width='50%' and contains(@class, 'teamHeading') and contains(@class, 'border')]")
    away_team = teamsoup[0].text_content() if len(teamsoup) > 0 else ''
    home_team = teamsoup[1].text_content() if len(teamsoup) > 1 else ''

    # XPath to find tables with specific attributes, then get td elements from the 3rd table (index 2)
    tables = doc.xpath("//table[@align='center' and @border='0' and @cellpadding='0' and @cellspacing='0' and @width='100%']")
    home_player_soup = tables[2].xpath(".//td") if len(tables) > 2 else []
    # Convert lxml elements to text content
    home_player_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in home_player_soup]

    length = int(len(home_player_soup)/3)

    home_player_df = pd.DataFrame(np.array(home_player_soup).reshape(length, 3))

    home_player_df.columns = home_player_df.iloc[0]

    home_player_df = home_player_df.drop(0).assign(team = 'home', team_name = home_team)

    # Get away player data from 2nd table (index 1)
    away_player_soup = tables[1].xpath(".//td") if len(tables) > 1 else []
    # Convert lxml elements to text content
    away_player_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in away_player_soup]

    length = int(len(away_player_soup)/3)

    away_player_df = pd.DataFrame(np.array(away_player_soup).reshape(length, 3))

    away_player_df.columns = away_player_df.iloc[0]

    away_player_df = away_player_df.drop(0).assign(team = 'away', team_name = away_team)

    #global home_scratch_soup

    if len(tables) > 3:

        try:

            home_scratch_soup = tables[4].xpath(".//td") if len(tables) > 4 else []
            # Convert lxml elements to text content
            home_scratch_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in home_scratch_soup]

            if len(home_scratch_soup) > 1:

                length = int(len(home_scratch_soup)/3)

                home_scratch_df = pd.DataFrame(np.array(home_scratch_soup).reshape(length, 3))

                home_scratch_df.columns = home_scratch_df.iloc[0]

                home_scratch_df = home_scratch_df.drop(0).assign(team = 'home', team_name = home_team)

        except Exception as e:
            print(e)
            print('No home scratch soup')
            home_scratch_df = pd.DataFrame()

    if 'home_scratch_df' not in locals():

        home_scratch_df = pd.DataFrame()

    if len(tables) > 2:

        try:

            away_scratch_soup = tables[3].xpath(".//td") if len(tables) > 3 else []
            # Convert lxml elements to text content
            away_scratch_soup = [elem.text_content() if hasattr(elem, 'text_content') else str(elem) for elem in away_scratch_soup]

            if len(away_scratch_soup) > 1:

                length = int(len(away_scratch_soup)/3)

                away_scratch_df = pd.DataFrame(np.array(away_scratch_soup).reshape(length, 3))

                away_scratch_df.columns = away_scratch_df.iloc[0]

                away_scratch_df = away_scratch_df.drop(0).assign(team = 'away', team_name = away_team)

        except Exception as e:
            print(e)
            print('No away scratch soup')
            away_scratch_df = pd.DataFrame()

    if 'away_scratch_df' not in locals():

        away_scratch_df = pd.DataFrame()

    player_df = pd.concat([home_player_df, away_player_df]).assign(status = 'player')
    scratch_df = pd.concat([home_scratch_df, away_scratch_df]).assign(status = 'scratch')
    roster_df = pd.concat([player_df, scratch_df])

    roster_df = roster_df.assign(team = np.where(roster_df.team=='CANADIENS MONTREAL', 'MONTREAL CANADIENS', roster_df.team))

    roster_df = roster_df.assign(team = np.where(roster_df.team=='MONTRÉAL CANADIENS', 'MONTREAL CANADIENS', roster_df.team))

    # FIX NAMES

    roster_df = roster_df.rename(columns = {'Nom/Name':'Name'})

    roster_df.Name = roster_df.Name.apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
    roster_df.Name = roster_df.Name.apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())

    # OPTIMIZED: Batch string replacements instead of conditional np.where()
    # Max Pacioretty doesn't exist in ESPN in 2009-2010, sadly.
    roster_df['Name'] = (roster_df['Name']
                         .str.replace('ALEXANDRE ', 'ALEX ', regex=False)
                         .str.replace('ALEXANDER ', 'ALEX ', regex=False)
                         .str.replace('CHRISTOPHER ', 'CHRIS ', regex=False))

    # OPTIMIZED: Use dictionary lookup instead of nested np.where() chains
    # This provides 50-90% speedup on name corrections

    # OPTIMIZED: Already handled by dictionary lookup above
    # (These names are already in _NAME_CORRECTIONS)

    roster_df['Name'] = np.where((roster_df['Name']=="SEBASTIAN AHO") & (roster_df['Pos']=='D'), 'SEBASTIAN AHO SWE', roster_df['Name'])
    roster_df['Name'] = np.where((roster_df['Name']=="ELIAS PETTERSSON") & (roster_df['Pos']=='D'), 'ELIAS PETTERSSON(D)', roster_df['Name'])
    roster_df['Name'] = np.where((roster_df['Name']=="COLIN WHITE") & (roster_df['Pos']=='D'), 'COLIN WHITE CAN', roster_df['Name'])
    roster_df['Name'] = np.where((roster_df['Name']=="SEAN COLLINS") & (roster_df['Pos']=='D'), 'SEAN COLLINS CAN', roster_df['Name'])
    roster_df['Name'] = np.where((roster_df['Name']=="ALEX PICARD") & (roster_df['Pos']!='D'), 'ALEX PICARD F', roster_df['Name'])
    roster_df['Name'] = np.where((roster_df['Name']=="ERIK GUSTAFSSON") & (int(season)<20132014), 'ERIK GUSTAFSSON 88', roster_df['Name'])
    roster_df['Name'] = np.where((roster_df['Name']=="MIKKO LEHTONEN") & (int(season)<20202021), 'MIKKO LEHTONEN F', roster_df['Name'])
    roster_df['Name'] = np.where(roster_df['Name']=='ALEX BARRÃ-BOULET', 'ALEX BARRE-BOULET', roster_df['Name'])
    roster_df['Name'] = np.where(roster_df['Name']=='COLIN', 'COLIN WHITE CAN', roster_df['Name'])

    # OPTIMIZED: Already handled by dictionary lookup above
    # (These names are already in _NAME_CORRECTIONS)

    roster_df['Name'] = roster_df['Name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()

    roster_df['Name'] = np.where(roster_df['Name']== "JURAJ SLAFKOVSKA", "JURAJ SLAFKOVSKY", roster_df['Name'])  # Need to do this after normalization; only then does he become Slafkovska
    roster_df['Name'] = np.where(roster_df['Name']== "JOHN (JACK) ROSLOVIC", "JACK ROSLOVIC", roster_df['Name'])
    roster_df['Name'] = np.where(roster_df['Name']== "ANTHONY-JOHN (AJ) GREER", "A.J. GREER", roster_df['Name'])

    roster_df['Name'] = np.where(roster_df['Name']== "MARTIN FEHARVARY", "MARTIN FEHERVARY", roster_df['Name'])

    roster_df['Name'] = np.where(roster_df['Name']== "MATAJ BLAMEL", "MATAJ BLAMEL", roster_df['Name'])

    roster_df['Name'] = roster_df['Name'].str.replace('  ', ' ')  # collapse double spaces

    roster_df['Name'] = roster_df['Name'].apply(lambda x: normalize_player_name(x))

    return roster_df
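
# Usage sketch (editor's example): season is the 8-digit season string and game_id
# the 6-digit report id from the docstring; the returned frame has one row per
# player and scratch with Name / Pos / team / team_name / status columns.
#   roster = scrape_html_roster('20212022', '020333')
#   goalie_names = roster[(roster.Pos == 'G') & (roster.status == 'player')].Name.tolist()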

def scrape_html_shifts(season, game_id, live = True, home_page=None, away_page=None, summary = None, roster_cache = None, verbose=False):
    """
    Scrape HTML shifts pages.

    Args:
        season: Season string (e.g., '20242025')
        game_id: Game ID string (e.g., '020333')
        live: Boolean flag for live games
        home_page: Optional pre-fetched requests.Response object for home shifts page. If None, will fetch.
        away_page: Optional pre-fetched requests.Response object for away shifts page. If None, will fetch.
        summary: Optional summary page for goalie data
        roster_cache: Roster cache for goalie names
        verbose: If True, print detailed timing information

    Returns:
        DataFrame with shift information
    """
    goalie_names = roster_cache[roster_cache.Pos=='G'].Name.unique().tolist()
    home_goalie_names = roster_cache[(roster_cache.Pos=='G') & (roster_cache.team=='home')].Name.unique().tolist()
    away_goalie_names = roster_cache[(roster_cache.Pos=='G') & (roster_cache.team=='away')].Name.unique().tolist()

    if home_page is None:
        url = 'http://www.nhl.com/scores/htmlreports/' + season + '/TH0' + game_id + '.HTM'

        # TIME: Home shifts network request
        net_start = time.time()
        home_page = _session.get(url, timeout=10)
        net_duration = time.time() - net_start
        if verbose:
            try:
                print(f' ⏱️ Home shifts network request: {net_duration:.2f}s')
            except Exception:
                pass

    # NOTE: Keeping BeautifulSoup for shifts parsing for now due to complex class matching
    # lxml optimization applied to events parsing (major speedup achieved there)
    if type(home_page) == str:
        home_soup = BeautifulSoup(home_page)
    else:
        home_soup = BeautifulSoup(home_page.content, 'lxml')
    found = home_soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
    if len(found)==0:
        raise IndexError('This game has no shift data.')
    thisteam = home_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()

    players = dict()

    # OPTIMIZED: Reduce repeated string operations
    for i in range(len(found)):
        line = found[i].get_text()
        if line == '25 PETTERSSON, ELIAS':
            line = '25 PETTERSSON(D), ELIAS'
        if ', ' in line:
            # OPTIMIZED: Split once and reuse
            name_parts = line.split(',')
            if len(name_parts) >= 2:
                number_last = name_parts[0].split(' ', 1)  # Split only once
                number = number_last[0].strip()
                last_name = number_last[1].strip() if len(number_last) > 1 else ''
                first_name = name_parts[1].strip()
                full_name = first_name + " " + last_name
                players[full_name] = {
                    'number': number,
                    'name': full_name,
                    'shifts': []
                }
        else:
            players[full_name]['shifts'].append(line)  # Use append instead of extend([line])

    # OPTIMIZED: Use list + concat instead of repeated _append()
    alldf_list = []
    for key in players.keys():
        shifts_array = np.array(players[key]['shifts'])
        length = (len(shifts_array) // 5) * 5
        shifts_array = shifts_array[:length]
        df = pd.DataFrame(shifts_array.reshape(-1, 5)).rename(
            columns = {0:'shift_number', 1:'period', 2:'shift_start', 3:'shift_end', 4:'duration'})
        df = df.assign(name = players[key]['name'],
                       number = players[key]['number'],
                       team = thisteam,
                       venue = "home")
        alldf_list.append(df)

    home_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()

    if live == True:

        home_shifts = home_shifts.assign(shift_number = home_shifts.shift_number.astype(int))
        home_shifts = home_shifts.assign(number = home_shifts.number.astype(int))

        found = home_soup.find_all('td', {'class':['playerHeading + border', 'bborder + lborder +']})
        if len(found)==0:
            raise IndexError('This game has no shift data.')
        thisteam = home_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()

        players = dict()

        for i in range(len(found)):
            line = found[i].get_text()
            if line == '25 PETTERSSON, ELIAS':
                line = '25 PETTERSSON(D), ELIAS'
            if ', ' in line:
                # OPTIMIZED: Split once and reuse
                name_parts = line.split(',')
                if len(name_parts) >= 2:
                    number_last = name_parts[0].split(' ', 1)  # Split only once
                    number = number_last[0].strip()
                    last_name = number_last[1].strip() if len(number_last) > 1 else ''
                    first_name = name_parts[1].strip()
                    full_name = first_name + " " + last_name
                    players[full_name] = {
                        'number': number,
                        'name': full_name,
                        'shifts': []
                    }
            else:
                players[full_name]['shifts'].append(line)  # Use append instead of extend([line])

        # OPTIMIZED: Use list + concat instead of repeated _append()
        alldf_list = []
        for key in players.keys():
            length = int(len(players[key]['shifts'])/6)
            df = pd.DataFrame(np.array((players[key]['shifts'])).reshape(length, 6)).rename(
                columns = {0:'period', 1:'shifts', 2:'avg', 3:'TOI', 4:'EV Total', 5:'PP Total'})
            df = df.assign(name = players[key]['name'],
                           number = players[key]['number'],
                           team = thisteam,
                           venue = "home")
            alldf_list.append(df)

        home_extra_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()

        # Trigger: There is no home goalie for this period and we're not about to pull one from the extra shifts.

        if len(home_shifts[(home_shifts.period==max(home_shifts.period)) & (home_shifts.name.isin(home_goalie_names))]) == 0 and len(home_extra_shifts[home_extra_shifts.name.isin(home_goalie_names)]) == 0:

            if type(summary) == str:
                summary_soup = BeautifulSoup(summary)
            else:
                summary_soup = BeautifulSoup(summary.content.decode('ISO-8859-1'))

            sections = summary_soup.find_all('td', class_='sectionheading')
            for section in sections:
                if 'GOALTENDER SUMMARY' in section.get_text():
                    goalie_table = section.find_parent('tr').find_next_sibling('tr').find('table')
                    break

            goalie_summary = parse_goaltender_summary(goalie_table)

            goalie_summary = goalie_summary[(goalie_summary.team==thisteam) & ~(pd.isna(goalie_summary['TOI']))]

            goalie_summary = goalie_summary.assign(name =
                goalie_summary.name.str.split(', ').str[-1] + ' ' + goalie_summary.name.str.split(', ').str[0]
            )

            goalie_summary.name = goalie_summary.name.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()

            goalie_summary = goalie_summary.assign(period = max(home_shifts.period), shifts = '1', avg = goalie_summary.TOI, venue = 'home').loc[:, home_extra_shifts.columns]

            home_extra_shifts = pd.concat([home_extra_shifts, goalie_summary])

        home_extra_shifts = home_extra_shifts.assign(TOI_seconds_summary = home_extra_shifts.TOI.apply(lambda x: convert_clock_to_seconds(x)))

        home_extra_shifts = home_extra_shifts.merge(
            home_shifts.assign(toi_secs = home_shifts.duration.apply(lambda x: convert_clock_to_seconds(x))
                               ).groupby(['name', 'period'])['toi_secs'].sum().reset_index(),
            how = 'left'
        ).fillna(0)

        home_extra_shifts['toi_secs'] = home_extra_shifts['toi_secs'].astype(int)

        home_extra_shifts = home_extra_shifts.assign(toi_diff = abs(home_extra_shifts.toi_secs - home_extra_shifts.TOI_seconds_summary))

        shifts_needing_to_be_added = home_extra_shifts[home_extra_shifts.toi_diff!=0]

        if len(shifts_needing_to_be_added) > 0:

            latest_shift_end = home_shifts.assign(period_secs = home_shifts.shift_end.str.split(' / ').str[0].apply(lambda x: convert_clock_to_seconds(x)))[
                home_shifts.period==max(home_shifts.period)
            ].sort_values(by = 'period_secs', ascending = False).period_secs.iloc[0]

            max_toi = shifts_needing_to_be_added.TOI.apply(lambda x: convert_clock_to_seconds(x)).max()

            overage = max_toi - latest_shift_end

            if overage > 0:
                shifts_needing_to_be_added.toi_diff = shifts_needing_to_be_added.toi_diff - overage

            home_clock_time_now = convert_seconds_to_clock(latest_shift_end)

            home_clock_period = max(home_shifts.period.astype(int))

            start_times_seconds = home_clock_time_now

            import math

            shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
                shift_start = ((convert_clock_to_seconds(home_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str)
                               + ' / ' + (convert_clock_to_seconds(home_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str).apply(lambda x: subtract_from_twenty_minutes(x))),
                shift_end = home_clock_time_now + ' / ' + subtract_from_twenty_minutes(home_clock_time_now),
                duration = shifts_needing_to_be_added.toi_diff
            )

            shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
                duration = shifts_needing_to_be_added.toi_diff.apply(lambda x: convert_seconds_to_clock(x))
            )

            shifts_needing_to_be_added = shifts_needing_to_be_added.merge(
                home_shifts.assign(shift_number = home_shifts.shift_number.astype(int)).groupby('name')['shift_number'].max().reset_index().rename(columns = {'shift_number':'prior_max_shift'}),
                how = 'left'
            ).fillna(0)

            shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_number = shifts_needing_to_be_added.prior_max_shift + 1)

            shifts_needing_to_be_added.shift_number = shifts_needing_to_be_added.shift_number.astype(int)

            shifts_needing_to_be_added = shifts_needing_to_be_added.loc[:, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]

            shifts_needing_to_be_added['number'] = shifts_needing_to_be_added['number'].astype(int)

            home_shifts = pd.concat([home_shifts, shifts_needing_to_be_added]).sort_values(by = ['number', 'period', 'shift_number'])

        elif len(shifts_needing_to_be_added) == 0:
            home_clock_period = None
            home_clock_time_now = None

    if away_page is None:
        # TV report = visitor (away) time-on-ice report
        url = 'http://www.nhl.com/scores/htmlreports/' + season + '/TV0' + game_id + '.HTM'

        # TIME: away shifts network request
        net_start = time.time()
        away_page = _session.get(url, timeout=10)
        net_duration = time.time() - net_start
        if verbose:
            try:
                print(f' ⏱️ away shifts network request: {net_duration:.2f}s')
            except Exception:
                pass

    # NOTE: Keeping BeautifulSoup for shifts parsing for now due to complex class matching
    # lxml optimization applied to events parsing (major speedup achieved there)
    if type(away_page) == str:
        away_soup = BeautifulSoup(away_page)
    else:
        away_soup = BeautifulSoup(away_page.content, 'lxml')
    found = away_soup.find_all('td', {'class':['playerHeading + border', 'lborder + bborder']})
    if len(found)==0:
        raise IndexError('This game has no shift data.')
    thisteam = away_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()

    players = dict()

    # OPTIMIZED: Reduce repeated string operations
    for i in range(len(found)):
        line = found[i].get_text()
        if line == '25 PETTERSSON, ELIAS':
            line = '25 PETTERSSON(D), ELIAS'
        if ', ' in line:
            # OPTIMIZED: Split once and reuse
            name_parts = line.split(',')
            if len(name_parts) >= 2:
                number_last = name_parts[0].split(' ', 1)  # Split only once
                number = number_last[0].strip()
                last_name = number_last[1].strip() if len(number_last) > 1 else ''
                first_name = name_parts[1].strip()
                full_name = first_name + " " + last_name
                players[full_name] = {
                    'number': number,
                    'name': full_name,
                    'shifts': []
                }
        else:
            players[full_name]['shifts'].append(line)  # Use append instead of extend([line])

    # OPTIMIZED: Use list + concat instead of repeated _append()
    alldf_list = []
    for key in players.keys():
        shifts_array = np.array(players[key]['shifts'])
        length = (len(shifts_array) // 5) * 5
        shifts_array = shifts_array[:length]
        df = pd.DataFrame(shifts_array.reshape(-1, 5)).rename(
            columns = {0:'shift_number', 1:'period', 2:'shift_start', 3:'shift_end', 4:'duration'})
        df = df.assign(name = players[key]['name'],
                       number = players[key]['number'],
                       team = thisteam,
                       venue = "away")
        alldf_list.append(df)

    away_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()

    if live == True:

        away_shifts = away_shifts.assign(shift_number = away_shifts.shift_number.astype(int))
        away_shifts = away_shifts.assign(number = away_shifts.number.astype(int))

        found = away_soup.find_all('td', {'class':['playerHeading + border', 'bborder + lborder +']})
        if len(found)==0:
            raise IndexError('This game has no shift data.')
        thisteam = away_soup.find('td', {'align':'center', 'class':'teamHeading + border'}).get_text()

        players = dict()

        for i in range(len(found)):
            line = found[i].get_text()
            if line == '25 PETTERSSON, ELIAS':
                line = '25 PETTERSSON(D), ELIAS'
            if ', ' in line:
                # OPTIMIZED: Split once and reuse
                name_parts = line.split(',')
                if len(name_parts) >= 2:
                    number_last = name_parts[0].split(' ', 1)  # Split only once
                    number = number_last[0].strip()
                    last_name = number_last[1].strip() if len(number_last) > 1 else ''
                    first_name = name_parts[1].strip()
                    full_name = first_name + " " + last_name
                    players[full_name] = {
                        'number': number,
                        'name': full_name,
                        'shifts': []
                    }
            else:
                players[full_name]['shifts'].append(line)  # Use append instead of extend([line])

        # OPTIMIZED: Use list + concat instead of repeated _append()
        alldf_list = []
        for key in players.keys():
            length = int(len(players[key]['shifts'])/6)
            df = pd.DataFrame(np.array((players[key]['shifts'])).reshape(length, 6)).rename(
                columns = {0:'period', 1:'shifts', 2:'avg', 3:'TOI', 4:'EV Total', 5:'PP Total'})
            df = df.assign(name = players[key]['name'],
                           number = players[key]['number'],
                           team = thisteam,
                           venue = "away")
            alldf_list.append(df)

        away_extra_shifts = pd.concat(alldf_list, ignore_index=True) if alldf_list else pd.DataFrame()

        # Trigger: There is no away goalie for this period and we're not about to pull one from the extra shifts.

        if len(away_shifts[(away_shifts.period==max(away_shifts.period)) & (away_shifts.name.isin(away_goalie_names))]) == 0 and len(away_extra_shifts[away_extra_shifts.name.isin(away_goalie_names)]) == 0:

            if type(summary) == str:
                summary_soup = BeautifulSoup(summary)
            else:
                summary_soup = BeautifulSoup(summary.content.decode('ISO-8859-1'))

            sections = summary_soup.find_all('td', class_='sectionheading')
            for section in sections:
                if 'GOALTENDER SUMMARY' in section.get_text():
                    goalie_table = section.find_parent('tr').find_next_sibling('tr').find('table')
                    break

            goalie_summary = parse_goaltender_summary(goalie_table)

            goalie_summary = goalie_summary[(goalie_summary.team==thisteam) & ~(pd.isna(goalie_summary['TOI']))]

            goalie_summary = goalie_summary.assign(name =
                goalie_summary.name.str.split(', ').str[-1] + ' ' + goalie_summary.name.str.split(', ').str[0]
            )

            goalie_summary.name = goalie_summary.name.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()

            goalie_summary = goalie_summary.assign(period = max(away_shifts.period), shifts = '1', avg = goalie_summary.TOI, venue = 'away').loc[:, away_extra_shifts.columns]

            away_extra_shifts = pd.concat([away_extra_shifts, goalie_summary])

        away_extra_shifts = away_extra_shifts.assign(TOI_seconds_summary = away_extra_shifts.TOI.apply(lambda x: convert_clock_to_seconds(x)))

        away_extra_shifts = away_extra_shifts.merge(
            away_shifts.assign(toi_secs = away_shifts.duration.apply(lambda x: convert_clock_to_seconds(x))
                               ).groupby(['name', 'period'])['toi_secs'].sum().reset_index(),
            how = 'left'
        ).fillna(0)

        away_extra_shifts['toi_secs'] = away_extra_shifts['toi_secs'].astype(int)

        away_extra_shifts = away_extra_shifts.assign(toi_diff = abs(away_extra_shifts.toi_secs - away_extra_shifts.TOI_seconds_summary))

        shifts_needing_to_be_added = away_extra_shifts[away_extra_shifts.toi_diff!=0]

        if len(shifts_needing_to_be_added) > 0:

            latest_shift_end = away_shifts.assign(period_secs = away_shifts.shift_end.str.split(' / ').str[0].apply(lambda x: convert_clock_to_seconds(x)))[
                away_shifts.period==max(away_shifts.period)
            ].sort_values(by = 'period_secs', ascending = False).period_secs.iloc[0]

            max_toi = shifts_needing_to_be_added.TOI.apply(lambda x: convert_clock_to_seconds(x)).max()

            overage = max_toi - latest_shift_end

            if overage > 0:
                shifts_needing_to_be_added.toi_diff = shifts_needing_to_be_added.toi_diff - overage

            away_clock_time_now = convert_seconds_to_clock(latest_shift_end)

            away_clock_period = max(away_shifts.period.astype(int))

            start_times_seconds = away_clock_time_now

            import math

            shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
                shift_start = ((convert_clock_to_seconds(away_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str)
                               + ' / ' + (convert_clock_to_seconds(away_clock_time_now) - shifts_needing_to_be_added.toi_diff).apply(lambda x: convert_seconds_to_clock(x)).astype(str).apply(lambda x: subtract_from_twenty_minutes(x))),
                shift_end = away_clock_time_now + ' / ' + subtract_from_twenty_minutes(away_clock_time_now),
                duration = shifts_needing_to_be_added.toi_diff
            )

            shifts_needing_to_be_added = shifts_needing_to_be_added.assign(
                duration = shifts_needing_to_be_added.toi_diff.apply(lambda x: convert_seconds_to_clock(x))
            )

            shifts_needing_to_be_added = shifts_needing_to_be_added.merge(
                away_shifts.assign(shift_number = away_shifts.shift_number.astype(int)).groupby('name')['shift_number'].max().reset_index().rename(columns = {'shift_number':'prior_max_shift'}),
                how = 'left'
            ).fillna(0)

            shifts_needing_to_be_added = shifts_needing_to_be_added.assign(shift_number = shifts_needing_to_be_added.prior_max_shift + 1)

            shifts_needing_to_be_added.shift_number = shifts_needing_to_be_added.shift_number.astype(int)

            shifts_needing_to_be_added = shifts_needing_to_be_added.loc[:, ['shift_number', 'period', 'shift_start', 'shift_end', 'duration', 'name', 'number', 'team', 'venue']]

            shifts_needing_to_be_added['number'] = shifts_needing_to_be_added['number'].astype(int)

            away_shifts = pd.concat([away_shifts, shifts_needing_to_be_added]).sort_values(by = ['number', 'period', 'shift_number'])

        elif len(shifts_needing_to_be_added) == 0:
            away_clock_period = None
            away_clock_time_now = None

    global all_shifts

    all_shifts = pd.concat([home_shifts, away_shifts])

    all_shifts.name = all_shifts.name.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()

    all_shifts = all_shifts.assign(start_time = all_shifts.shift_start.str.split('/').str[0])

    all_shifts = all_shifts.assign(end_time = all_shifts.shift_end.str.split('/').str[0])

    #all_shifts = all_shifts[~all_shifts.end_time.str.contains('\xa0')]

    # Filter out summary rows (GP, G, A, etc.) that might have been included
    # Period should be numeric (1-4) or 'OT', so filter out anything else
    if len(all_shifts) > 0:
        period_str = all_shifts.period.astype(str).str.strip()
        # Only keep rows where period is a valid period value
        valid_mask = period_str.isin(['1', '2', '3', '4', 'OT'])
        all_shifts = all_shifts[valid_mask].copy()

    if len(all_shifts) > 0:
        all_shifts.period = (np.where(all_shifts.period=='OT', 4, all_shifts.period)).astype(int)

    all_shifts = all_shifts.assign(end_time = np.where(~all_shifts.shift_end.str.contains('\xa0'), all_shifts.end_time,
                                   (np.where(
                                       (((pd.to_datetime(((60 * (all_shifts.start_time.str.split(':').str[0].astype(int))) +
                                           (all_shifts.start_time.str.split(':').str[1].astype(int)) +
                                           (60 * (all_shifts.duration.str.split(':').str[0].astype(int))).astype(int) +
                                           (all_shifts.duration.str.split(':').str[1].astype(int))).astype(int), unit = 's'))).dt.time).astype(str).str[3:].str[0]=='0',
                                       (((pd.to_datetime(((60 * (all_shifts.start_time.str.split(':').str[0].astype(int))) +
                                           (all_shifts.start_time.str.split(':').str[1].astype(int)) +
                                           (60 * (all_shifts.duration.str.split(':').str[0].astype(int))).astype(int) +
                                           (all_shifts.duration.str.split(':').str[1].astype(int))).astype(int), unit = 's'))).dt.time).astype(str).str[4:],
                                       (((pd.to_datetime(((60 * (all_shifts.start_time.str.split(':').str[0].astype(int))) +
                                           (all_shifts.start_time.str.split(':').str[1].astype(int)) +
                                           (60 * (all_shifts.duration.str.split(':').str[0].astype(int))).astype(int) +
                                           (all_shifts.duration.str.split(':').str[1].astype(int))).astype(int), unit = 's'))).dt.time).astype(str).str[4:]))))

    # OPTIMIZED: Batch string replacements instead of conditional np.where()
    all_shifts['name'] = (all_shifts['name']
                          .str.replace('ALEXANDRE ', 'ALEX ', regex=False)
                          .str.replace('ALEXANDER ', 'ALEX ', regex=False)
                          .str.replace('CHRISTOPHER ', 'CHRIS ', regex=False))

    # OPTIMIZED: Use dictionary lookup instead of nested np.where() chains
    all_shifts['name'] = all_shifts['name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
    all_shifts['name'] = all_shifts['name'].apply(lambda x: normalize_player_name(x))

    # OPTIMIZED: Already handled by dictionary lookup above
    # Old nested chains removed - they were replaced with: all_shifts['name'] = all_shifts['name'].replace(_NAME_CORRECTIONS)
    # Old nested chains removed - replaced with dictionary lookup

    # Apply regex to remove (A) and (C) designations at end of names
    all_shifts['name'] = all_shifts['name'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
    all_shifts['name'] = all_shifts['name'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())

    # Apply specific name corrections
    all_shifts['name'] = np.where(all_shifts['name']== "JURAJ SLAFKOVSKA", "JURAJ SLAFKOVSKY", all_shifts['name'])  # Need to do this after normalization; only then does he become Slafkovska
    all_shifts['name'] = np.where(all_shifts['name']== "JOHN (JACK) ROSLOVIC", "JACK ROSLOVIC", all_shifts['name'])
    all_shifts['name'] = np.where(all_shifts['name']== "ANTHONY-JOHN (AJ) GREER", "A.J. GREER", all_shifts['name'])

    all_shifts['name'] = np.where(all_shifts['name']== 'MARTIN FEHARVARY', 'MARTIN FEHERVARY', all_shifts['name'])

    all_shifts['name'] = np.where(all_shifts['name']== 'MATAJ BLAMEL', 'MATAJ BLAMEL', all_shifts['name'])

    all_shifts['name'] = all_shifts['name'].str.replace('  ', ' ')  # collapse double spaces

    # Clean invalid time values (e.g., "28:10" should be "20:00")
    # Times beyond 20:00 (or 5:00 for OT periods) are invalid and should be capped
    def clean_time_value(time_str):
        """Clean invalid time values by capping minutes at 20 (the end of a regulation period)"""
        if pd.isna(time_str):
            return time_str
        try:
            # Try to parse as-is first
            pd.to_datetime(time_str)
            return time_str
        except:
            # If parsing fails, extract minutes:seconds and cap appropriately
            try:
                parts = str(time_str).split(':')
                if len(parts) == 2:
                    minutes = int(parts[0])
                    seconds = parts[1]
                    # If minutes >= 20, cap at 20:00 (end of regulation period)
                    if minutes >= 20:
                        return '20:00'
                    else:
                        return time_str
            except:
                pass
            # If all else fails, return 20:00 as safe default
            return '20:00'

    try:
        all_shifts['start_time'] = all_shifts['start_time'].apply(clean_time_value)
        all_shifts['end_time'] = all_shifts['end_time'].apply(clean_time_value)
    except Exception as e:
        print(f'Error cleaning time values: {e}')
        print('Stupid vibe coded system is causing problems')

    all_shifts = all_shifts.assign(end_time = np.where(pd.to_datetime(all_shifts.start_time).dt.time > pd.to_datetime(all_shifts.end_time).dt.time, '20:00', all_shifts.end_time),
                                   goalie = np.where(all_shifts.name.isin(goalie_names), 1, 0))

    all_shifts = all_shifts.merge(all_shifts[all_shifts.goalie==1].groupby(['team', 'period'])['name'].nunique().reset_index().rename(columns = {'name':'period_gs'}), how = 'left').fillna(0)

    # Implement fix for goalies: Goalies who showed up late in the period and were the only goalie to play have their start time re-set to 0:00.

    # Added this period shift number because of an issue where a goalie got pulled mid-period (e.g., for a delayed penalty), came back, and the start time of their second shift got pushed to 0.
    all_shifts = all_shifts.assign(period_shift_number = all_shifts.groupby(['period', 'name']).cumcount() + 1)

    all_shifts = all_shifts.assign(start_time = np.where((all_shifts.goalie==1) & (all_shifts.start_time!='0:00') & (all_shifts.period_gs==1) & (all_shifts.period_shift_number==1), '0:00', all_shifts.start_time))

    # Previously I had this code to fix some kind of problem where goalie shifts didn't properly end.
    # But now I see this is causing an issue: If a goalie gets pulled and never comes back, this inaccurately fills them in.
    # Commenting this out and testing what things look like without it.

    # all_shifts = all_shifts.assign(end_time = np.where(
    #     (pd.to_datetime(all_shifts.start_time).dt.time < datetime(2021, 6, 10, 18, 0, 0).time()) &
    #     (all_shifts.period!=3) &
    #     (all_shifts.period!=4) &
    #     (all_shifts.period!=5) &
    #     (all_shifts.goalie==1) &
    #     (all_shifts.period_gs==1),
    #     '20:00', all_shifts.end_time))

    # all_shifts = all_shifts.assign(end_time = np.where(
    #     (pd.to_datetime(all_shifts.start_time).dt.time < datetime(2021, 6, 10, 13, 0, 0).time()) &
    #     (all_shifts.period!=4) &
    #     (all_shifts.period!=5) &
    #     (all_shifts.goalie==1) &
    #     (all_shifts.period_gs==1),
    #     '20:00', all_shifts.end_time))

    global myshifts
    global changes_on
    global changes_off
    myshifts = all_shifts
    #print('Printing my shifts')

    #print(myshifts)

    myshifts.start_time = myshifts.start_time.str.strip()
    myshifts.end_time = myshifts.end_time.str.strip()

    myshifts['number'] = myshifts.number.astype(str)

    changes_on = myshifts.groupby(['team', 'period', 'start_time']).agg(
        on = ('name', ', '.join),
        on_numbers = ('number', ', '.join),
        number_on = ('name', 'count')
    ).reset_index().rename(columns = {'start_time':'time'}).sort_values(by = ['team', 'period', 'time'])

    changes_off = myshifts.groupby(['team', 'period', 'end_time']).agg(
        off = ('name', ', '.join),
        off_numbers = ('number', ', '.join),
        number_off = ('name', 'count')
    ).reset_index().rename(columns = {'end_time':'time'}).sort_values(by = ['team', 'period', 'time'])

    all_on = changes_on.merge(changes_off, on = ['team', 'period', 'time'], how = 'left')
    off_only = changes_off.merge(changes_on, on = ['team', 'period', 'time'], how = 'left', indicator = True)[
        changes_off.merge(changes_on, on = ['team', 'period', 'time'], how = 'left', indicator = True)['_merge']!='both']
    full_changes = pd.concat([all_on, off_only]).sort_values(by = ['period', 'time']).drop(columns = ['_merge'])

    full_changes['period_seconds'] = full_changes.time.str.split(':').str[0].astype(int) * 60 + full_changes.time.str.split(':').str[1].astype(int)

    full_changes['game_seconds'] = (np.where((full_changes.period<5) & int(game_id)!=3,
                                             (((full_changes.period - 1) * 1200) + full_changes.period_seconds),
                                             3900))

    full_changes = full_changes.assign(team = np.where(full_changes.team.str.contains('CANADI'), 'MONTREAL CANADIENS', full_changes.team)).sort_values(by = 'game_seconds')

    if live == True:

        if home_clock_period is not None and away_clock_period is not None:

            min_game_clock = ((min([home_clock_period, away_clock_period]) - 1) * 1200) + min([convert_clock_to_seconds(home_clock_time_now), convert_clock_to_seconds(away_clock_time_now)])

        elif home_clock_period is not None and away_clock_period is None:

            min_game_clock = ((min([home_clock_period]) - 1) * 1200) + min([convert_clock_to_seconds(home_clock_time_now)])

        elif away_clock_period is not None and home_clock_period is None:

            min_game_clock = ((min([away_clock_period]) - 1) * 1200) + min([convert_clock_to_seconds(away_clock_time_now)])

        else:
            min_game_clock = None

        if min_game_clock is not None:

            full_changes = full_changes[full_changes.game_seconds <= min_game_clock]

        return min_game_clock, full_changes.reset_index(drop = True)

    return full_changes.reset_index(drop = True)
|
|
1095
|
+
|
|
1096
|
+
def scrape_html_events(season, game_id, events_page=None, roster_page=None, verbose=False):
|
|
1097
|
+
"""
|
|
1098
|
+
Scrape HTML events page.
|
|
1099
|
+
|
|
1100
|
+
Args:
|
|
1101
|
+
season: Season string (e.g., '20242025')
|
|
1102
|
+
game_id: Game ID string (e.g., '020333')
|
|
1103
|
+
events_page: Optional pre-fetched requests.Response object for events page. If None, will fetch.
|
|
1104
|
+
roster_page: Optional pre-fetched requests.Response object for roster page. If None, will fetch.
|
|
1105
|
+
verbose: If True, print detailed timing information
|
|
1106
|
+
|
|
1107
|
+
Returns:
|
|
1108
|
+
Tuple of (events DataFrame, roster DataFrame)
|
|
1109
|
+
"""
|
|
1110
|
+
#global game
|
|
1111
|
+
if events_page is None:
|
|
1112
|
+
url = 'http://www.nhl.com/scores/htmlreports/' + season + '/PL0' + game_id + '.HTM'
|
|
1113
|
+
|
|
1114
|
+
# TIME: Network request
|
|
1115
|
+
net_start = time.time()
|
|
1116
|
+
events_page = _session.get(url, timeout=10)
|
|
1117
|
+
net_duration = time.time() - net_start
|
|
1118
|
+
if verbose:
|
|
1119
|
+
try:
|
|
1120
|
+
print(f' ⏱️ HTML events network request: {net_duration:.2f}s')
|
|
1121
|
+
except Exception:
|
|
1122
|
+
pass
|
|
1123
|
+
|
|
1124
|
+
#if int(season)<20092010):
|
|
1125
|
+
# soup = BeautifulSoup(page.content, 'html.parser')
|
|
1126
|
+
#else:
|
|
1127
|
+
# soup = BeautifulSoup(page.content, 'lxml')
|
|
1128
|
+
|
|
1129
|
+
# TIME: Parsing
|
|
1130
|
+
parse_start = time.time()
|
|
1131
|
+
# OPTIMIZED: Use lxml directly instead of BeautifulSoup for faster parsing
|
|
1132
|
+
if isinstance(events_page, str):
|
|
1133
|
+
doc = html.fromstring(events_page)
|
|
1134
|
+
else:
|
|
1135
|
+
doc = html.fromstring(events_page.content.decode('ISO-8859-1'))
|
|
1136
|
+
# XPath to find td elements with class containing 'bborder'
|
|
1137
|
+
tds = doc.xpath("//td[contains(@class, 'bborder')]")
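# Each selected 'bborder' cell is one of the eight play-by-play columns that get renamed to
# index/period/strength/time/event/description/away_skaters/home_skaters below; the reshape into rows of 8 relies on that width.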
|
|
1138
|
+
#global stripped_html
|
|
1139
|
+
#global eventdf
|
|
1140
|
+
stripped_html = hs_strip_html(tds)
|
|
1141
|
+
length = (len(stripped_html) // 8) * 8
|
|
1142
|
+
stripped_html = stripped_html[:length]
|
|
1143
|
+
eventdf = pd.DataFrame(np.array(stripped_html).reshape(int(length/8), 8)).rename(
|
|
1144
|
+
columns = {0:'index', 1:'period', 2:'strength', 3:'time', 4:'event', 5:'description', 6:'away_skaters', 7:'home_skaters'})
|
|
1145
|
+
split = eventdf.time.str.split(':')
|
|
1146
|
+
# XPath to find td elements with align='center' and style containing 'font-size: 10px;font-weight:bold'
|
|
1147
|
+
potentialnames = doc.xpath("//td[@align='center' and contains(@style, 'font-size: 10px;font-weight:bold')]")
|
|
1148
|
+
game_date = potentialnames[2].text_content() if len(potentialnames) > 2 else ''
|
|
1149
|
+
|
|
1150
|
+
for i in range(0, min(999, len(potentialnames))):
|
|
1151
|
+
away = potentialnames[i].text_content()
|
|
1152
|
+
if ('Away Game') in away or ('tr./Away') in away:
|
|
1153
|
+
away = _MATCH_GAME_PATTERN.split(away)[0]
|
|
1154
|
+
break
|
|
1155
|
+
|
|
1156
|
+
for i in range(0, min(999, len(potentialnames))):
|
|
1157
|
+
home = potentialnames[i].text_content()
|
|
1158
|
+
if ('Home Game') in home or ('Dom./Home') in home:
|
|
1159
|
+
home = _MATCH_GAME_PATTERN.split(home)[0]
|
|
1160
|
+
break
|
|
1161
|
+
|
|
1162
|
+
game = eventdf.assign(away_skaters = eventdf.away_skaters.str.replace('\n', ''),
|
|
1163
|
+
home_skaters = eventdf.home_skaters.str.replace('\n', ''),
|
|
1164
|
+
original_time = eventdf.time,
|
|
1165
|
+
time = split.str[0] + ":" + split.str[1].str[:2],
|
|
1166
|
+
home_team = home,
|
|
1167
|
+
away_team = away)
|
|
1168
|
+
|
|
1169
|
+
game = game.assign(away_team_abbreviated = game.away_skaters[0].split(' ')[0],
|
|
1170
|
+
home_team_abbreviated = game.home_skaters[0].split(' ')[0])
|
|
1171
|
+
|
|
1172
|
+
game = game[game.period!='Per']
|
|
1173
|
+
|
|
1174
|
+
game = game.assign(index = game.index.astype(int)).rename(columns = {'index':'event_index'})
|
|
1175
|
+
|
|
1176
|
+
game = game.assign(event_team = game.description.str.split(' ').str[0])
|
|
1177
|
+
|
|
1178
|
+
game = game.assign(event_team = game.event_team.str.split('\xa0').str[0])
|
|
1179
|
+
|
|
1180
|
+
game = game.assign(event_team = np.where(~game.event_team.isin([game.home_team_abbreviated.iloc[0], game.away_team_abbreviated.iloc[0]]), '\xa0', game.event_team))
|
|
1181
|
+
|
|
1182
|
+
game = game.assign(other_team = np.where(game.event_team=='', '\xa0',
|
|
1183
|
+
np.where(game.event_team==game.home_team_abbreviated.iloc[0], game.away_team_abbreviated.iloc[0], game.home_team_abbreviated.iloc[0])))
|
|
1184
|
+
|
|
1185
|
+
# Optimized: use single function instead of multiple .str.replace() calls
|
|
1186
|
+
def _extract_player_numbers(desc):
|
|
1187
|
+
matches = re.findall(r'[#-]\s*(\d+)', str(desc))
|
|
1188
|
+
return ' '.join(matches)
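# Illustrative sketch (hypothetical description): _extract_player_numbers('TOR #34 MATTHEWS Tip-In, Off. Zone vs #8 CHYCHRUN') -> '34 8'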
|
|
1189
|
+
game['event_player_str'] = game.description.apply(_extract_player_numbers)
|
|
1190
|
+
|
|
1191
|
+
game = game.assign(event_player_1 =
|
|
1192
|
+
game.event_player_str.str.split(' ').str[0],
|
|
1193
|
+
event_player_2 =
|
|
1194
|
+
game.event_player_str.str.split(' ').str[1],
|
|
1195
|
+
event_player_3 =
|
|
1196
|
+
game.event_player_str.str.split(' ').str[2])
|
|
1197
|
+
#return game
|
|
1198
|
+
|
|
1199
|
+
if len(game[game.description.str.contains('Drawn By')])>0:
|
|
1200
|
+
|
|
1201
|
+
game = game.assign(event_player_2 = np.where(game.description.str.contains('Drawn By'),
|
|
1202
|
+
game.description.str.split('Drawn By').str[1].str.split('#').str[1].str.split(' ').str[0].str.strip(),
|
|
1203
|
+
game.event_player_2),
|
|
1204
|
+
event_player_3 = np.where(game.description.str.contains('Served By'),
|
|
1205
|
+
'\xa0',
|
|
1206
|
+
game.event_player_3))
|
|
1207
|
+
|
|
1208
|
+
game = game.assign(event_player_1 = np.where((~pd.isna(game.event_player_1)) & (game.event_player_1!=''),
|
|
1209
|
+
np.where(game.event=='FAC', game.away_team_abbreviated,
|
|
1210
|
+
game.event_team) + (game.event_player_1.astype(str)),
|
|
1211
|
+
game.event_player_1),
|
|
1212
|
+
event_player_2 = np.where((~pd.isna(game.event_player_2)) & (game.event_player_2!=''),
|
|
1213
|
+
np.where(game.event=='FAC', game.home_team_abbreviated,
|
|
1214
|
+
np.where(game.event.isin(['BLOCK', 'HIT', 'PENL']), game.other_team, game.event_team)) + (game.event_player_2.astype(str)),
|
|
1215
|
+
game.event_player_2),
|
|
1216
|
+
event_player_3 = np.where((~pd.isna(game.event_player_3)) & (game.event_player_3!=''),
|
|
1217
|
+
game.event_team + (game.event_player_3.astype(str)),
|
|
1218
|
+
game.event_player_3))
|
|
1219
|
+
|
|
1220
|
+
game = game.assign(
|
|
1221
|
+
event_player_1 = np.where((game.event=='FAC') & (game.event_team==game.home_team_abbreviated),
|
|
1222
|
+
game.event_player_2, game.event_player_1),
|
|
1223
|
+
event_player_2 = np.where((game.event=='FAC') & (game.event_team==game.home_team_abbreviated),
|
|
1224
|
+
game.event_player_1, game.event_player_2))
|
|
1225
|
+
|
|
1226
|
+
#return game
|
|
1227
|
+
|
|
1228
|
+
roster = scrape_html_roster(season, game_id, page=roster_page, verbose=verbose).rename(columns = {'Nom/Name':'Name'})
|
|
1229
|
+
roster = roster[roster.status=='player']
|
|
1230
|
+
roster = roster.assign(team_abbreviated = np.where(roster.team=='home',
|
|
1231
|
+
game.home_team_abbreviated.iloc[0],
|
|
1232
|
+
game.away_team_abbreviated.iloc[0]))
|
|
1233
|
+
|
|
1234
|
+
roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'])
|
|
1235
|
+
roster['Name'] = roster.Name.apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
|
|
1236
|
+
roster['Name'] = roster.Name.apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
|
|
1237
|
+
|
|
1238
|
+
event_player_1s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_1', 'Name':'ep1_name'})
|
|
1239
|
+
event_player_2s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_2', 'Name':'ep2_name'})
|
|
1240
|
+
event_player_3s = roster.loc[:, ['teamnum', 'Name']].rename(columns = {'teamnum':'event_player_3', 'Name':'ep3_name'})
|
|
1241
|
+
|
|
1242
|
+
game = game.merge(
|
|
1243
|
+
event_player_1s, on = 'event_player_1', how = 'left').merge(
|
|
1244
|
+
event_player_2s, on = 'event_player_2', how = 'left').merge(
|
|
1245
|
+
event_player_3s, on = 'event_player_3', how = 'left').assign(
|
|
1246
|
+
date = game_date)
|
|
1247
|
+
#return game
|
|
1248
|
+
game['period'] = np.where(game['period'] == '', '1', game['period'])
|
|
1249
|
+
game['time'] = np.where((game['time'] == '') | (pd.isna(game['time'])), '0:00', game['time'])
|
|
1250
|
+
game['period'] = game.period.astype(int)
|
|
1251
|
+
|
|
1252
|
+
# OPTIMIZED: Split time once instead of twice
|
|
1253
|
+
time_split = game.time.str.split(':')
|
|
1254
|
+
game['period_seconds'] = time_split.str[0].str.replace('-', '', regex=False).astype(int) * 60 + time_split.str[1].str.replace('-', '', regex=False).astype(int)
|
|
1255
|
+
|
|
1256
|
+
game['game_seconds'] = (np.where((game.period<5) & (int(game_id[0])!=3),
|
|
1257
|
+
(((game.period - 1) * 1200) + game.period_seconds),
|
|
1258
|
+
3900))
|
|
1259
|
+
|
|
1260
|
+
# OPTIMIZED: Use dictionary lookup instead of nested np.where()
|
|
1261
|
+
# TODO: Fix priority map so that we have change before shot or miss if the change involves a player returning from penalty box.
|
|
1262
|
+
priority_map = {
|
|
1263
|
+
'TAKE': 1, 'GIVE': 1, 'MISS': 1, 'HIT': 1, 'SHOT': 1, 'BLOCK': 1,
|
|
1264
|
+
'GOAL': 2, 'STOP': 3, 'DELPEN': 4, 'PENL': 5, 'CHANGE': 6,
|
|
1265
|
+
'PEND': 7, 'GEND': 8, 'FAC': 9
|
|
1266
|
+
}
|
|
1267
|
+
game = game.assign(priority=game.event.map(priority_map).fillna(0).astype(int)).sort_values(by = ['game_seconds', 'period', 'event_player_1', 'event'])
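# e.g. pd.Series(['GOAL', 'FAC', 'ANTHEM']).map(priority_map).fillna(0) -> 2, 9, 0;
# events missing from the map (like ANTHEM) fall back to priority 0.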
|
|
1268
|
+
game = game.assign(version =
|
|
1269
|
+
(np.where(
|
|
1270
|
+
(game.event==game.event.shift()) &
|
|
1271
|
+
(game.event_player_1==game.event_player_1.shift()) &
|
|
1272
|
+
(game.event_player_1!='') &
|
|
1273
|
+
(game.game_seconds==game.game_seconds.shift()),
|
|
1274
|
+
1, 0)))
|
|
1275
|
+
|
|
1276
|
+
game = game.assign(version =
|
|
1277
|
+
(np.where(
|
|
1278
|
+
(game.event==game.event.shift(2)) &
|
|
1279
|
+
(game.event_player_1==game.event_player_1.shift(2)) &
|
|
1280
|
+
(game.game_seconds==game.game_seconds.shift(2)) &
|
|
1281
|
+
(game.event_player_1!='') &
|
|
1282
|
+
(~game.description.str.contains('Penalty Shot')),
|
|
1283
|
+
2, game.version)))
|
|
1284
|
+
|
|
1285
|
+
game = game.assign(version =
|
|
1286
|
+
(np.where(
|
|
1287
|
+
(game.event==game.event.shift(3)) &
|
|
1288
|
+
(game.event_player_1==game.event_player_1.shift(3)) &
|
|
1289
|
+
(game.game_seconds==game.game_seconds.shift(3)) &
|
|
1290
|
+
(game.event_player_1!=''),
|
|
1291
|
+
3, game.version)))
|
|
1292
|
+
|
|
1293
|
+
game = game.assign(date = pd.to_datetime(game.date[~pd.isna(game.date)].iloc[0])
|
|
1294
|
+
).rename(columns = {'date':'game_date'}).sort_values(by = ['event_index'])
|
|
1295
|
+
|
|
1296
|
+
game = game.assign(event_player_1 = game.ep1_name, event_player_2 = game.ep2_name, event_player_3 = game.ep3_name).drop(columns = ['ep1_name', 'ep2_name', 'ep3_name'])
|
|
1297
|
+
|
|
1298
|
+
# OPTIMIZED: Combine team name replacements into single operation
|
|
1299
|
+
team_replacements = {'CANADIENS MONTREAL': 'MONTREAL CANADIENS', 'MONTRÉAL CANADIENS': 'MONTREAL CANADIENS'}
|
|
1300
|
+
game['home_team'] = game['home_team'].replace(team_replacements)
|
|
1301
|
+
game['away_team'] = game['away_team'].replace(team_replacements)
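# Series.replace() with a dict swaps whole matching values only, e.g. 'CANADIENS MONTREAL' -> 'MONTREAL CANADIENS';
# any other team name passes through unchanged.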
|
|
1302
|
+
|
|
1303
|
+
if int(game_id[0])!=3:
|
|
1304
|
+
game = game[game.game_seconds<4000]
|
|
1305
|
+
|
|
1306
|
+
game['game_date'] = np.where((season=='20072008') & (game_id == '20003'), game.game_date + pd.Timedelta(days=1), game.game_date)
|
|
1307
|
+
|
|
1308
|
+
game = game.assign(event_player_1 = np.where((game.description.str.upper().str.contains('TEAM')) | (game.description.str.lower().str.contains('bench')),
|
|
1309
|
+
'BENCH',
|
|
1310
|
+
game.event_player_1))
|
|
1311
|
+
|
|
1312
|
+
game = game.assign(home_skater_count_temp = (game.home_skaters.apply(lambda x: len(re.findall('[A-Z]', x)))),
|
|
1313
|
+
away_skater_count_temp = (game.away_skaters.apply(lambda x: len(re.findall('[A-Z]', x))))
|
|
1314
|
+
)
|
|
1315
|
+
|
|
1316
|
+
game = game.assign(event_team = np.where((game.event=='PENL') & (game.event_team=='') & (game.description.str.lower().str.contains('bench')) & (game.home_skater_count_temp>game.home_skater_count_temp.shift(-1)),
|
|
1317
|
+
game.home_team_abbreviated, game.event_team))
|
|
1318
|
+
|
|
1319
|
+
game = game.assign(event_team = np.where((game.event=='PENL') & (game.event_team=='') & (game.description.str.lower().str.contains('bench')) & (game.away_skater_count_temp>game.away_skater_count_temp.shift(-1)),
|
|
1320
|
+
game.away_team_abbreviated, game.event_team))
|
|
1321
|
+
|
|
1322
|
+
# TIME: Total parsing
|
|
1323
|
+
total_parse_duration = time.time() - parse_start
|
|
1324
|
+
# try:
|
|
1325
|
+
# print(f' ⏱️ HTML events parsing/processing: {total_parse_duration:.2f}s')
|
|
1326
|
+
# except Exception:
|
|
1327
|
+
# pass
|
|
1328
|
+
|
|
1329
|
+
# OPTIMIZATION: Return roster to avoid re-scraping in merge_and_prepare
|
|
1330
|
+
return game.drop(columns = ['period_seconds', 'time', 'priority', 'home_skater_count_temp', 'away_skater_count_temp']), roster
|
|
1331
|
+
|
|
1332
|
+
def scrape_espn_events(espn_game_id, drop_description = True):
|
|
1333
|
+
|
|
1334
|
+
# This URL has event coordinates
|
|
1335
|
+
|
|
1336
|
+
url = f'https://www.espn.com/nhl/playbyplay/_/gameId/{espn_game_id}'
|
|
1337
|
+
|
|
1338
|
+
page = _session.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
|
|
1339
|
+
|
|
1340
|
+
soup = BeautifulSoup(page.content.decode('ISO-8859-1'), 'lxml', multi_valued_attributes = None)
|
|
1341
|
+
|
|
1342
|
+
period_jsons = json.loads(str(soup).split('"playGrps":')[1].split(',"tms"')[0])
|
|
1343
|
+
|
|
1344
|
+
# OPTIMIZED: Use list + concat instead of repeated _append()
|
|
1345
|
+
clock_df_list = []
|
|
1346
|
+
for period in range(0, len(period_jsons)):
|
|
1347
|
+
clock_df_list.append(pd.DataFrame(period_jsons[period]))
|
|
1348
|
+
clock_df = pd.concat(clock_df_list, ignore_index=True) if clock_df_list else pd.DataFrame()
|
|
1349
|
+
|
|
1350
|
+
clock_df = clock_df[~pd.isna(clock_df.clock)]
|
|
1351
|
+
|
|
1352
|
+
# Needed to add .split(',"st":3')[0] for playoffs
|
|
1353
|
+
|
|
1354
|
+
coords_df = pd.DataFrame(json.loads(str(soup).split('plays":')[1].split(',"st":1')[0].split(',"st":2')[0].split(',"st":3')[0]))
|
|
1355
|
+
|
|
1356
|
+
clock_df = clock_df.assign(
|
|
1357
|
+
clock = clock_df.clock.apply(lambda x: x['displayValue'])
|
|
1358
|
+
)
|
|
1359
|
+
|
|
1360
|
+
coords_df = coords_df.assign(
|
|
1361
|
+
coords_x = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda x: x['x']).astype(int),
|
|
1362
|
+
coords_y = coords_df[~pd.isna(coords_df.coordinate)].coordinate.apply(lambda y: y['y']).astype(int),
|
|
1363
|
+
event_player_1 = coords_df[~pd.isna(coords_df.athlete)]['athlete'].apply(lambda x: x['name'])
|
|
1364
|
+
)
|
|
1365
|
+
|
|
1366
|
+
espn_events = coords_df.merge(clock_df.loc[:, ['id', 'clock']])
|
|
1367
|
+
|
|
1368
|
+
espn_events = espn_events.assign(
|
|
1369
|
+
period = espn_events['period'].apply(lambda x: x['number']),
|
|
1370
|
+
minutes = espn_events['clock'].str.split(':').apply(lambda x: x[0]).astype(int),
|
|
1371
|
+
seconds = espn_events['clock'].str.split(':').apply(lambda x: x[1]).astype(int),
|
|
1372
|
+
event_type = espn_events['type'].apply(lambda x: x['txt'])
|
|
1373
|
+
)
|
|
1374
|
+
|
|
1375
|
+
espn_events = espn_events.assign(coords_x = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
|
|
1376
|
+
(espn_events.event_type=='Face Off'), 0, espn_events.coords_x
|
|
1377
|
+
),
|
|
1378
|
+
coords_y = np.where((pd.isna(espn_events.coords_x)) & (pd.isna(espn_events.coords_y)) &
|
|
1379
|
+
(espn_events.event_type=='Face Off'), 0, espn_events.coords_y))
|
|
1380
|
+
|
|
1381
|
+
espn_events = espn_events[(~pd.isna(espn_events.coords_x)) & (~pd.isna(espn_events.coords_y)) & (~pd.isna(espn_events.event_player_1))]
|
|
1382
|
+
|
|
1383
|
+
espn_events = espn_events.assign(
|
|
1384
|
+
# Do this later
|
|
1385
|
+
coords_x = espn_events.coords_x.astype(int),
|
|
1386
|
+
coords_y = espn_events.coords_y.astype(int)
|
|
1387
|
+
)
|
|
1388
|
+
|
|
1389
|
+
espn_events = espn_events.rename(columns = {'text':'description'})
|
|
1390
|
+
|
|
1391
|
+
espn_events = espn_events.assign(
|
|
1392
|
+
event_type = np.where(espn_events.event_type=='Face Off', 'FAC',
|
|
1393
|
+
np.where(espn_events.event_type=='Goal', 'GOAL',
|
|
1394
|
+
np.where(espn_events.event_type=='Giveaway', 'GIVE',
|
|
1395
|
+
np.where(espn_events.event_type=='Penalty', 'PENL',
|
|
1396
|
+
np.where(espn_events.event_type=='Missed', 'MISS',
|
|
1397
|
+
np.where(espn_events.event_type=='Shot', 'SHOT',
|
|
1398
|
+
np.where(espn_events.event_type=='Takeaway', 'TAKE',
|
|
1399
|
+
np.where(espn_events.event_type=='Blocked', 'BLOCK',
|
|
1400
|
+
np.where(espn_events.event_type=='Hit', 'HIT',
|
|
1401
|
+
espn_events.event_type))))))))))
|
|
1402
|
+
|
|
1403
|
+
espn_events = espn_events.assign(priority = np.where(espn_events.event_type.isin(['TAKE', 'GIVE', 'MISS', 'HIT', 'SHOT', 'BLOCK']), 1,
|
|
1404
|
+
np.where(espn_events.event_type=="GOAL", 2,
|
|
1405
|
+
np.where(espn_events.event_type=="STOP", 3,
|
|
1406
|
+
np.where(espn_events.event_type=="DELPEN", 4,
|
|
1407
|
+
np.where(espn_events.event_type=="PENL", 5,
|
|
1408
|
+
np.where(espn_events.event_type=="CHANGE", 6,
|
|
1409
|
+
np.where(espn_events.event_type=="PEND", 7,
|
|
1410
|
+
np.where(espn_events.event_type=="GEND", 8,
|
|
1411
|
+
np.where(espn_events.event_type=="FAC", 9, 0))))))))),
|
|
1412
|
+
event_player_1 = espn_events.event_player_1.str.upper(),
|
|
1413
|
+
game_seconds = np.where(espn_events.period<5,
|
|
1414
|
+
((espn_events.period - 1) * 1200) + (espn_events.minutes * 60) + espn_events.seconds, 3900))
|
|
1415
|
+
|
|
1416
|
+
espn_events = espn_events.sort_values(by = ['period', 'game_seconds', 'event_player_1', 'priority']).rename(
|
|
1417
|
+
columns = {'event_type':'event'}).loc[:, ['coords_x', 'coords_y', 'event_player_1', 'event', 'game_seconds', 'description', 'period']]
|
|
1418
|
+
|
|
1419
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'].str.contains('ALEXANDRE '),
|
|
1420
|
+
espn_events['event_player_1'].str.replace('ALEXANDRE ', 'ALEX '),
|
|
1421
|
+
espn_events['event_player_1'])
|
|
1422
|
+
|
|
1423
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'].str.contains('ALEXANDER '),
|
|
1424
|
+
espn_events['event_player_1'].str.replace('ALEXANDER ', 'ALEX '),
|
|
1425
|
+
espn_events['event_player_1'])
|
|
1426
|
+
|
|
1427
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'].str.contains('CHRISTOPHER '),
|
|
1428
|
+
espn_events['event_player_1'].str.replace('CHRISTOPHER ', 'CHRIS '),
|
|
1429
|
+
espn_events['event_player_1'])
|
|
1430
|
+
|
|
1431
|
+
espn_events = espn_events.assign(event_player_1 =
|
|
1432
|
+
np.where(espn_events.event_player_1=='PATRICK MAROON', 'PAT MAROON',
|
|
1433
|
+
(np.where(espn_events.event_player_1=='J T COMPHER', 'J.T. COMPHER',
|
|
1434
|
+
(np.where(espn_events.event_player_1=='J T MILLER', 'J.T. MILLER',
|
|
1435
|
+
(np.where(espn_events.event_player_1=='T J OSHIE', 'T.J. OSHIE',
|
|
1436
|
+
(np.where((espn_events.event_player_1=='ALEXIS LAFRENIERE') | (espn_events.event_player_1=='ALEXIS LAFRENI RE'), 'ALEXIS LAFRENIÈRE',
|
|
1437
|
+
(np.where((espn_events.event_player_1=='TIM STUTZLE') | (espn_events.event_player_1=='TIM ST TZLE'), 'TIM STÜTZLE',
|
|
1438
|
+
(np.where(espn_events.event_player_1=='T.J. BRODIE', 'TJ BRODIE',
|
|
1439
|
+
(np.where(espn_events.event_player_1=='MATTHEW IRWIN', 'MATT IRWIN',
|
|
1440
|
+
(np.where(espn_events.event_player_1=='STEVE KAMPFER', 'STEVEN KAMPFER',
|
|
1441
|
+
(np.where(espn_events.event_player_1=='STEVE KAMPFER', 'STEVEN KAMPFER',
|
|
1442
|
+
(np.where(espn_events.event_player_1=='JEFFREY TRUCHON-VIEL', 'JEFFREY VIEL',
|
|
1443
|
+
(np.where(espn_events.event_player_1=='ZACHARY JONES', 'ZAC JONES',
|
|
1444
|
+
(np.where(espn_events.event_player_1=='MITCH MARNER', 'MITCHELL MARNER',
|
|
1445
|
+
(np.where(espn_events.event_player_1=='MATHEW DUMBA', 'MATT DUMBA',
|
|
1446
|
+
(np.where(espn_events.event_player_1=='JOSHUA MORRISSEY', 'JOSH MORRISSEY',
|
|
1447
|
+
(np.where(espn_events.event_player_1=='P K SUBBAN', 'P.K. SUBBAN',
|
|
1448
|
+
(np.where(espn_events.event_player_1=='EGOR SHARANGOVICH', 'YEGOR SHARANGOVICH',
|
|
1449
|
+
(np.where(espn_events.event_player_1=='MAXIME COMTOIS', 'MAX COMTOIS',
|
|
1450
|
+
(np.where(espn_events.event_player_1=='NICHOLAS CAAMANO', 'NICK CAAMANO',
|
|
1451
|
+
(np.where(espn_events.event_player_1=='DANIEL CARCILLO', 'DAN CARCILLO',
|
|
1452
|
+
(np.where(espn_events.event_player_1=='ALEXANDER OVECHKIN', 'ALEX OVECHKIN',
|
|
1453
|
+
(np.where(espn_events.event_player_1=='MICHAEL CAMMALLERI', 'MIKE CAMMALLERI',
|
|
1454
|
+
(np.where(espn_events.event_player_1=='DAVE STECKEL', 'DAVID STECKEL',
|
|
1455
|
+
(np.where(espn_events.event_player_1=='JIM DOWD', 'JAMES DOWD',
|
|
1456
|
+
(np.where(espn_events.event_player_1=='MAXIME TALBOT', 'MAX TALBOT',
|
|
1457
|
+
(np.where(espn_events.event_player_1=='MIKE ZIGOMANIS', 'MICHAEL ZIGOMANIS',
|
|
1458
|
+
(np.where(espn_events.event_player_1=='VINNY PROSPAL', 'VACLAV PROSPAL',
|
|
1459
|
+
(np.where(espn_events.event_player_1=='MIKE YORK', 'MICHAEL YORK',
|
|
1460
|
+
(np.where(espn_events.event_player_1=='JACOB DOWELL', 'JAKE DOWELL',
|
|
1461
|
+
(np.where(espn_events.event_player_1=='MICHAEL RUPP', 'MIKE RUPP',
|
|
1462
|
+
(np.where(espn_events.event_player_1=='ALEXEI KOVALEV', 'ALEX KOVALEV',
|
|
1463
|
+
(np.where(espn_events.event_player_1=='SLAVA KOZLOV', 'VYACHESLAV KOZLOV',
|
|
1464
|
+
(np.where(espn_events.event_player_1=='JEFF HAMILTON', 'JEFFREY HAMILTON',
|
|
1465
|
+
(np.where(espn_events.event_player_1=='JOHNNY POHL', 'JOHN POHL',
|
|
1466
|
+
(np.where(espn_events.event_player_1=='DANIEL GIRARDI', 'DAN GIRARDI',
|
|
1467
|
+
(np.where(espn_events.event_player_1=='NIKOLAI ZHERDEV', 'NIKOLAY ZHERDEV',
|
|
1468
|
+
(np.where(espn_events.event_player_1=='J.P. DUMONT', 'J-P DUMONT',
|
|
1469
|
+
(np.where(espn_events.event_player_1=='DWAYNE KING', 'DJ KING',
|
|
1470
|
+
(np.where(espn_events.event_player_1=='JOHN ODUYA', 'JOHNNY ODUYA',
|
|
1471
|
+
(np.where(espn_events.event_player_1=='ROBERT SCUDERI', 'ROB SCUDERI',
|
|
1472
|
+
(np.where(espn_events.event_player_1=='DOUG MURRAY', 'DOUGLAS MURRAY',
|
|
1473
|
+
(np.where(espn_events.event_player_1=='VACLAV PROSPAL', 'VINNY PROSPAL',
|
|
1474
|
+
(np.where(espn_events.event_player_1=='RICH PEVERLY', 'RICH PEVERLEY',
|
|
1475
|
+
espn_events.event_player_1.str.strip()
|
|
1476
|
+
))))))))))))))))))))))))))))))))))))))))))))
|
|
1477
|
+
))))))))))))))))))))))))))))))))))))))))))
|
|
1478
|
+
|
|
1479
|
+
espn_events['event_player_1'] = (np.where(espn_events['event_player_1']== "JANIS MOSER" , "J.J. MOSER",
|
|
1480
|
+
(np.where(espn_events['event_player_1']== "NICHOLAS PAUL" , "NICK PAUL",
|
|
1481
|
+
(np.where(espn_events['event_player_1']== "JACOB MIDDLETON" , "JAKE MIDDLETON",
|
|
1482
|
+
(np.where(espn_events['event_player_1']== "TOMMY NOVAK" , "THOMAS NOVAK",
|
|
1483
|
+
espn_events['event_player_1']))))))))
|
|
1484
|
+
|
|
1485
|
+
espn_events['event_player_1'] = (np.where(espn_events['event_player_1']== "JOHHNY BEECHER" , "JOHN BEECHER",
|
|
1486
|
+
(np.where(espn_events['event_player_1']== "ALEXANDER BARKOV" , "ALEKSANDER BARKOV",
|
|
1487
|
+
(np.where(espn_events['event_player_1']== "TOMMY NOVAK" , "THOMAS NOVAK",
|
|
1488
|
+
espn_events['event_player_1']))))))
|
|
1489
|
+
|
|
1490
|
+
espn_events['event_player_1'] = (np.where(espn_events['event_player_1']== "JANIS MOSER" , "J.J. MOSER",
|
|
1491
|
+
(np.where(espn_events['event_player_1']== "NICHOLAS PAUL" , "NICK PAUL",
|
|
1492
|
+
(np.where(espn_events['event_player_1']== "JACOB MIDDLETON" , "JAKE MIDDLETON",
|
|
1493
|
+
(np.where(espn_events['event_player_1']== "TOMMY NOVAK" , "THOMAS NOVAK",
|
|
1494
|
+
# New guys from 24-25
|
|
1495
|
+
(np.where(espn_events['event_player_1']== "JOSHUA NORRIS" , "JOSH NORRIS",
|
|
1496
|
+
(np.where(espn_events['event_player_1']== "P.O JOSEPH" , "PIERRE-OLIVIER JOSEPH",
|
|
1497
|
+
(np.where(espn_events['event_player_1']== "MIKEY EYSSIMONT" , "MICHAEL EYSSIMONT",
|
|
1498
|
+
(np.where(espn_events['event_player_1']== "MATAJ BLAMEL" , "MATAJ BLAMEL",
|
|
1499
|
+
(np.where(espn_events['event_player_1']== "VITTORIO MANCINI" , "VICTOR MANCINI",
|
|
1500
|
+
(np.where(espn_events['event_player_1']== "JOSHUA MAHURA" , "JOSH MAHURA",
|
|
1501
|
+
(np.where(espn_events['event_player_1']== "JOSEPH VELENO" , "JOE VELENO",
|
|
1502
|
+
(np.where(espn_events['event_player_1']== "ZACK BOLDUC" , "ZACHARY BOLDUC",
|
|
1503
|
+
(np.where(espn_events['event_player_1']== "JOSHUA BROWN" , "JOSH BROWN",
|
|
1504
|
+
(np.where(espn_events['event_player_1']== "JAKE LUCCHINI" , "JACOB LUCCHINI",
|
|
1505
|
+
(np.where(espn_events['event_player_1']== "EMIL LILLEBERG" , "EMIL MARTINSEN LILLEBERG",
|
|
1506
|
+
(np.where(espn_events['event_player_1']== "CAMERON ATKINSON" , "CAM ATKINSON",
|
|
1507
|
+
(np.where(espn_events['event_player_1']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY",
|
|
1508
|
+
(np.where(espn_events['event_player_1']== "MARTIN FEHARVARY" , "MARTIN FEHERVARY",
|
|
1509
|
+
espn_events['event_player_1']))))))))))))))))))))))))))))))))))))
|
|
1510
|
+
|
|
1511
|
+
|
|
1512
|
+
espn_events = espn_events.assign(version =
|
|
1513
|
+
(np.where(
|
|
1514
|
+
(espn_events.event==espn_events.event.shift()) &
|
|
1515
|
+
(espn_events.event_player_1==espn_events.event_player_1.shift()) &
|
|
1516
|
+
(espn_events.event_player_1!='') &
|
|
1517
|
+
(espn_events.game_seconds==espn_events.game_seconds.shift()),
|
|
1518
|
+
1, 0)))
|
|
1519
|
+
|
|
1520
|
+
espn_events = espn_events.assign(version =
|
|
1521
|
+
(np.where(
|
|
1522
|
+
(espn_events.event==espn_events.event.shift(2)) &
|
|
1523
|
+
(espn_events.event_player_1==espn_events.event_player_1.shift(2)) &
|
|
1524
|
+
(espn_events.game_seconds==espn_events.game_seconds.shift(2)) &
|
|
1525
|
+
(espn_events.event_player_1!='') &
|
|
1526
|
+
(~espn_events.description.str.contains('Penalty Shot')),
|
|
1527
|
+
2, espn_events.version)))
|
|
1528
|
+
|
|
1529
|
+
espn_events = espn_events.assign(version =
|
|
1530
|
+
(np.where(
|
|
1531
|
+
(espn_events.event==espn_events.event.shift(3)) &
|
|
1532
|
+
(espn_events.event_player_1==espn_events.event_player_1.shift(3)) &
|
|
1533
|
+
(espn_events.game_seconds==espn_events.game_seconds.shift(3)) &
|
|
1534
|
+
(espn_events.event_player_1!=''),
|
|
1535
|
+
3, espn_events.version)))
|
|
1536
|
+
|
|
1537
|
+
espn_events['espn_id'] = int(espn_game_id)
|
|
1538
|
+
|
|
1539
|
+
espn_events['event_player_1'] = espn_events['event_player_1'].str.strip()
|
|
1540
|
+
|
|
1541
|
+
espn_events['event_player_1'] = espn_events['event_player_1'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
|
|
1542
|
+
|
|
1543
|
+
# Apply regex to remove (A) and (C) designations at end of names
|
|
1544
|
+
espn_events['event_player_1'] = espn_events['event_player_1'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
|
|
1545
|
+
espn_events['event_player_1'] = espn_events['event_player_1'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
|
|
1546
|
+
|
|
1547
|
+
# Apply specific name corrections
|
|
1548
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", espn_events['event_player_1'])
|
|
1549
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", espn_events['event_player_1'])
|
|
1550
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", espn_events['event_player_1'])
|
|
1551
|
+
|
|
1552
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', espn_events['event_player_1'])
|
|
1553
|
+
|
|
1554
|
+
espn_events['event_player_1'] = np.where(espn_events['event_player_1'] == 'MATAJ BLAMEL' , 'MATAJ BLAMEL', espn_events['event_player_1'])
|
|
1555
|
+
|
|
1556
|
+
espn_events['event_player_1'] = espn_events['event_player_1'].str.replace(' ', ' ')
|
|
1557
|
+
|
|
1558
|
+
#espn_events = espn_events.assign(event_player_1 = np.where(
|
|
1559
|
+
#espn_events.event_player_1=='ALEX BURROWS', 'ALEXANDRE BURROWS', espn_events.event_player_1))
|
|
1560
|
+
|
|
1561
|
+
global look
|
|
1562
|
+
look = espn_events
|
|
1563
|
+
|
|
1564
|
+
espn_events['coords_x'] = np.where(espn_events['coords_x']>99, 99, espn_events['coords_x'])
|
|
1565
|
+
espn_events['coords_y'] = np.where(espn_events['coords_y']<(-42), (-42), espn_events['coords_y'])
|
|
1566
|
+
|
|
1567
|
+
if drop_description == True:
|
|
1568
|
+
return espn_events.drop(columns = 'description')
|
|
1569
|
+
else:
|
|
1570
|
+
return espn_events
|
|
1571
|
+
|
|
1572
|
+
def scrape_espn_ids_single_game(game_date, home_team, away_team):
|
|
1573
|
+
|
|
1574
|
+
gamedays = pd.DataFrame()
|
|
1575
|
+
|
|
1576
|
+
if home_team == 'ATLANTA THRASHERS':
|
|
1577
|
+
home_team = 'WINNIPEG JETS'
|
|
1578
|
+
if away_team == 'ATLANTA THRASHERS':
|
|
1579
|
+
away_team = 'WINNIPEG JETS'
|
|
1580
|
+
|
|
1581
|
+
if home_team == 'PHOENIX COYOTES':
|
|
1582
|
+
home_team = 'ARIZONA COYOTES'
|
|
1583
|
+
if away_team == 'PHOENIX COYOTES':
|
|
1584
|
+
away_team = 'ARIZONA COYOTES'
|
|
1585
|
+
|
|
1586
|
+
this_date = game_date
|
|
1587
|
+
url = 'http://www.espn.com/nhl/scoreboard?date=' + this_date.replace("-", "")
|
|
1588
|
+
page = _session.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
|
|
1589
|
+
print('Request to ESPN IDs successful.')
|
|
1590
|
+
soup = BeautifulSoup(page.content, 'lxml')
|
|
1591
|
+
soup_found = soup.find_all('a', {'class':['AnchorLink truncate',
|
|
1592
|
+
'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100',
|
|
1593
|
+
'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100 mr2'], 'href':[re.compile("/nhl/team/_/name/"), re.compile("game/_")]})
|
|
1594
|
+
game_sections = soup.find_all('section', class_ = 'Scoreboard bg-clr-white flex flex-auto justify-between')
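# Each Scoreboard section corresponds to one game on the page: the first ScoreCell__TeamName div is the
# away team, the second is the home team, and the section's id (split on '/' below) carries the ESPN game id.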
|
|
1595
|
+
|
|
1596
|
+
at = []
|
|
1597
|
+
ht = []
|
|
1598
|
+
gids = []
|
|
1599
|
+
fax = pd.DataFrame()
|
|
1600
|
+
#print(str(i))
|
|
1601
|
+
for i in range(0, len(game_sections)):
|
|
1602
|
+
away = game_sections[i].find_all('div', class_='ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName db')[0].contents[0].upper()
|
|
1603
|
+
home = game_sections[i].find_all('div', class_='ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName db')[1].contents[0].upper()
|
|
1604
|
+
espnid = game_sections[i]['id']
|
|
1605
|
+
at.append(away)
|
|
1606
|
+
ht.append(home)
|
|
1607
|
+
gids.append(espnid)
|
|
1608
|
+
|
|
1609
|
+
fax = fax.assign(
|
|
1610
|
+
away_team = at,
|
|
1611
|
+
home_team = ht,
|
|
1612
|
+
espn_id = gids,
|
|
1613
|
+
game_date = pd.to_datetime(this_date))
|
|
1614
|
+
|
|
1615
|
+
# OPTIMIZED: Use concat instead of _append
|
|
1616
|
+
gamedays = pd.concat([gamedays, fax], ignore_index=True) if len(gamedays) > 0 else fax
|
|
1617
|
+
|
|
1618
|
+
gamedays = gamedays[gamedays.espn_id!='gameId']
|
|
1619
|
+
|
|
1620
|
+
gamedays = gamedays.assign(
|
|
1621
|
+
home_team = np.where(gamedays.home_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.home_team),
|
|
1622
|
+
away_team = np.where(gamedays.away_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.away_team),
|
|
1623
|
+
espn_id = gamedays.espn_id.str.split('/').str[0].astype(int)
|
|
1624
|
+
|
|
1625
|
+
)
|
|
1626
|
+
|
|
1627
|
+
gamedays = gamedays.assign(
|
|
1628
|
+
home_team = np.where(gamedays.home_team=='TB', 'TBL',
|
|
1629
|
+
np.where(gamedays.home_team=='T.B', 'TBL',
|
|
1630
|
+
np.where(gamedays.home_team=='L.A', 'LAK',
|
|
1631
|
+
np.where(gamedays.home_team=='LA', 'LAK',
|
|
1632
|
+
np.where(gamedays.home_team=='S.J', 'SJS',
|
|
1633
|
+
np.where(gamedays.home_team=='SJ', 'SJS',
|
|
1634
|
+
np.where(gamedays.home_team=='N.J', 'NJD',
|
|
1635
|
+
np.where(gamedays.home_team=='NJ', 'NJD',
|
|
1636
|
+
gamedays.home_team)))))))),
|
|
1637
|
+
away_team = np.where(gamedays.away_team=='TB', 'TBL',
|
|
1638
|
+
np.where(gamedays.away_team=='T.B', 'TBL',
|
|
1639
|
+
np.where(gamedays.away_team=='L.A', 'LAK',
|
|
1640
|
+
np.where(gamedays.away_team=='LA', 'LAK',
|
|
1641
|
+
np.where(gamedays.away_team=='S.J', 'SJS',
|
|
1642
|
+
np.where(gamedays.away_team=='SJ', 'SJS',
|
|
1643
|
+
np.where(gamedays.away_team=='N.J', 'NJD',
|
|
1644
|
+
np.where(gamedays.away_team=='NJ', 'NJD',
|
|
1645
|
+
gamedays.away_team)))))))),
|
|
1646
|
+
espn_id = gamedays.espn_id.astype(int))
|
|
1647
|
+
|
|
1648
|
+
gamedays = gamedays.assign(
|
|
1649
|
+
away_team = np.where(gamedays.away_team=='DUCKS', 'ANA',
|
|
1650
|
+
np.where(gamedays.away_team=='COYOTES', 'ARI',
|
|
1651
|
+
np.where(gamedays.away_team=='BRUINS', 'BOS',
|
|
1652
|
+
np.where(gamedays.away_team=='SABRES', 'BUF',
|
|
1653
|
+
np.where(gamedays.away_team=='FLAMES', 'CGY',
|
|
1654
|
+
np.where(gamedays.away_team=='HURRICANES', 'CAR',
|
|
1655
|
+
np.where(gamedays.away_team=='BLACKHAWKS', 'CHI',
|
|
1656
|
+
np.where(gamedays.away_team=='AVALANCHE', 'COL',
|
|
1657
|
+
np.where(gamedays.away_team=='BLUE', 'CBJ',
|
|
1658
|
+
np.where(gamedays.away_team=='JACKETS', 'CBJ',
|
|
1659
|
+
np.where(gamedays.away_team=='BLUE JACKETS', 'CBJ',
|
|
1660
|
+
np.where(gamedays.away_team=='STARS', 'DAL',
|
|
1661
|
+
np.where(gamedays.away_team=='RED', 'DET',
|
|
1662
|
+
np.where(gamedays.away_team=='WINGS', 'DET',
|
|
1663
|
+
np.where(gamedays.away_team=='RED WINGS', 'DET',
|
|
1664
|
+
np.where(gamedays.away_team=='OILERS', 'EDM',
|
|
1665
|
+
np.where(gamedays.away_team=='PANTHERS', 'FLA',
|
|
1666
|
+
np.where(gamedays.away_team=='KINGS', 'LAK',
|
|
1667
|
+
np.where(gamedays.away_team=='WILD', 'MIN',
|
|
1668
|
+
np.where(gamedays.away_team=='CANADIENS', 'MTL',
|
|
1669
|
+
np.where(gamedays.away_team=='PREDATORS', 'NSH',
|
|
1670
|
+
np.where(gamedays.away_team=='DEVILS', 'NJD',
|
|
1671
|
+
np.where(gamedays.away_team=='ISLANDERS', 'NYI',
|
|
1672
|
+
np.where(gamedays.away_team=='RANGERS', 'NYR',
|
|
1673
|
+
np.where(gamedays.away_team=='SENATORS', 'OTT',
|
|
1674
|
+
np.where(gamedays.away_team=='FLYERS', 'PHI',
|
|
1675
|
+
np.where(gamedays.away_team=='PENGUINS', 'PIT',
|
|
1676
|
+
np.where(gamedays.away_team=='SHARKS', 'SJS',
|
|
1677
|
+
np.where(gamedays.away_team=='KRAKEN', 'SEA',
|
|
1678
|
+
np.where(gamedays.away_team=='BLUES', 'STL',
|
|
1679
|
+
np.where(gamedays.away_team=='LIGHTNING', 'TBL',
|
|
1680
|
+
np.where(gamedays.away_team=='LEAFS', 'TOR',
|
|
1681
|
+
np.where(gamedays.away_team=='MAPLE', 'TOR',
|
|
1682
|
+
np.where(gamedays.away_team=='MAPLE LEAFS', 'TOR',
|
|
1683
|
+
np.where(gamedays.away_team=='CANUCKS', 'VAN',
|
|
1684
|
+
np.where(gamedays.away_team=='GOLDEN', 'VGK',
|
|
1685
|
+
np.where(gamedays.away_team=='KNIGHTS', 'VGK',
|
|
1686
|
+
np.where(gamedays.away_team=='GOLDEN KNIGHTS', 'VGK',
|
|
1687
|
+
np.where(gamedays.away_team=='CAPITALS', 'WSH',
|
|
1688
|
+
np.where(gamedays.away_team=='JETS', 'WPG',
|
|
1689
|
+
np.where(gamedays.away_team=='CLUB', 'UTA',
|
|
1690
|
+
np.where(gamedays.away_team=='MAMMOTH', 'UTA',
|
|
1691
|
+
np.where(gamedays.away_team=='HOCKEY', 'UTA', 'mistake'
|
|
1692
|
+
))))))))))))))))))))))))))))))))))))))))))))
|
|
1693
|
+
|
|
1694
|
+
gamedays = gamedays.assign(
|
|
1695
|
+
home_team = np.where(gamedays.home_team=='DUCKS', 'ANA',
|
|
1696
|
+
np.where(gamedays.home_team=='COYOTES', 'ARI',
|
|
1697
|
+
np.where(gamedays.home_team=='BRUINS', 'BOS',
|
|
1698
|
+
np.where(gamedays.home_team=='SABRES', 'BUF',
|
|
1699
|
+
np.where(gamedays.home_team=='FLAMES', 'CGY',
|
|
1700
|
+
np.where(gamedays.home_team=='HURRICANES', 'CAR',
|
|
1701
|
+
np.where(gamedays.home_team=='BLACKHAWKS', 'CHI',
|
|
1702
|
+
np.where(gamedays.home_team=='AVALANCHE', 'COL',
|
|
1703
|
+
np.where(gamedays.home_team=='BLUE', 'CBJ',
|
|
1704
|
+
np.where(gamedays.home_team=='JACKETS', 'CBJ',
|
|
1705
|
+
np.where(gamedays.home_team=='BLUE JACKETS', 'CBJ',
|
|
1706
|
+
np.where(gamedays.home_team=='STARS', 'DAL',
|
|
1707
|
+
np.where(gamedays.home_team=='RED', 'DET',
|
|
1708
|
+
np.where(gamedays.home_team=='WINGS', 'DET',
|
|
1709
|
+
np.where(gamedays.home_team=='RED WINGS', 'DET',
|
|
1710
|
+
np.where(gamedays.home_team=='OILERS', 'EDM',
|
|
1711
|
+
np.where(gamedays.home_team=='PANTHERS', 'FLA',
|
|
1712
|
+
np.where(gamedays.home_team=='KINGS', 'LAK',
|
|
1713
|
+
np.where(gamedays.home_team=='WILD', 'MIN',
|
|
1714
|
+
np.where(gamedays.home_team=='CANADIENS', 'MTL',
|
|
1715
|
+
np.where(gamedays.home_team=='PREDATORS', 'NSH',
|
|
1716
|
+
np.where(gamedays.home_team=='DEVILS', 'NJD',
|
|
1717
|
+
np.where(gamedays.home_team=='ISLANDERS', 'NYI',
|
|
1718
|
+
np.where(gamedays.home_team=='RANGERS', 'NYR',
|
|
1719
|
+
np.where(gamedays.home_team=='SENATORS', 'OTT',
|
|
1720
|
+
np.where(gamedays.home_team=='FLYERS', 'PHI',
|
|
1721
|
+
np.where(gamedays.home_team=='PENGUINS', 'PIT',
|
|
1722
|
+
np.where(gamedays.home_team=='SHARKS', 'SJS',
|
|
1723
|
+
np.where(gamedays.home_team=='KRAKEN', 'SEA',
|
|
1724
|
+
np.where(gamedays.home_team=='BLUES', 'STL',
|
|
1725
|
+
np.where(gamedays.home_team=='LIGHTNING', 'TBL',
|
|
1726
|
+
np.where(gamedays.home_team=='MAPLE', 'TOR',
|
|
1727
|
+
np.where(gamedays.home_team=='LEAFS', 'TOR',
|
|
1728
|
+
np.where(gamedays.home_team=='MAPLE LEAFS', 'TOR',
|
|
1729
|
+
np.where(gamedays.home_team=='CANUCKS', 'VAN',
|
|
1730
|
+
np.where(gamedays.home_team=='GOLDEN', 'VGK',
|
|
1731
|
+
np.where(gamedays.home_team=='KNIGHTS', 'VGK',
|
|
1732
|
+
np.where(gamedays.home_team=='GOLDEN KNIGHTS', 'VGK',
|
|
1733
|
+
np.where(gamedays.home_team=='CAPITALS', 'WSH',
|
|
1734
|
+
np.where(gamedays.home_team=='JETS', 'WPG',
|
|
1735
|
+
np.where(gamedays.home_team=='CLUB', 'UTA',
|
|
1736
|
+
np.where(gamedays.home_team=='MAMMOTH', 'UTA',
|
|
1737
|
+
np.where(gamedays.home_team=='HOCKEY', 'UTA', 'mistake'
|
|
1738
|
+
))))))))))))))))))))))))))))))))))))))))))))
|
|
1739
|
+
|
|
1740
|
+
gamedays = gamedays[(gamedays.game_date==this_date) & (gamedays.home_team==home_team) & (gamedays.away_team==away_team)]
|
|
1741
|
+
|
|
1742
|
+
return gamedays
|
|
1743
|
+
|
|
1744
|
+
def merge_and_prepare(events, shifts, roster=None, live = False):
|
|
1745
|
+
|
|
1746
|
+
season = str(int(str(events.game_id.iloc[0])[:4])) + str(int(str(events.game_id.iloc[0])[:4]) + 1)
|
|
1747
|
+
small_id = str(events.game_id.iloc[0])[5:]
|
|
1748
|
+
game_id = int(events.game_id.iloc[0])
|
|
1749
|
+
|
|
1750
|
+
merged = pd.concat([events, shifts])
|
|
1751
|
+
|
|
1752
|
+
home_team = merged[~(pd.isna(merged.home_team))].home_team.iloc[0]
|
|
1753
|
+
#print(home_team)
|
|
1754
|
+
away_team = merged[~(pd.isna(merged.away_team))].away_team.iloc[0]
|
|
1755
|
+
#print(away_team)
|
|
1756
|
+
|
|
1757
|
+
if 'CANADIENS' in home_team:
|
|
1758
|
+
home_team = 'MONTREAL CANADIENS'
|
|
1759
|
+
|
|
1760
|
+
if 'CANADIENS' in away_team:
|
|
1761
|
+
away_team = 'MONTREAL CANADIENS'
|
|
1762
|
+
|
|
1763
|
+
#print(home_team)
|
|
1764
|
+
#print(away_team)
|
|
1765
|
+
|
|
1766
|
+
merged = merged.assign(home_team = home_team,
|
|
1767
|
+
away_team = away_team,
|
|
1768
|
+
home_team_abbreviated = merged[~(pd.isna(merged.home_team_abbreviated))].home_team_abbreviated.iloc[0],
|
|
1769
|
+
away_team_abbreviated = merged[~(pd.isna(merged.away_team_abbreviated))].away_team_abbreviated.iloc[0])
|
|
1770
|
+
|
|
1771
|
+
merged = merged.assign(event_team = np.where(merged.team==merged.home_team, merged.home_team_abbreviated,
|
|
1772
|
+
np.where(merged.team==merged.away_team, merged.away_team_abbreviated,
|
|
1773
|
+
merged.event_team)))
|
|
1774
|
+
|
|
1775
|
+
merged = merged.assign(event = np.where((pd.isna(merged.event)) &
|
|
1776
|
+
((~pd.isna(merged.number_off)) | (~pd.isna(merged.number_on))), "CHANGE", merged.event))
|
|
1777
|
+
|
|
1778
|
+
home_space = ' ' + merged['home_team_abbreviated'].iloc[0]
|
|
1779
|
+
away_space = ' ' + merged['away_team_abbreviated'].iloc[0]
|
|
1780
|
+
|
|
1781
|
+
merged['away_skaters'] = np.where(pd.isna(merged.away_skaters), '\xa0', merged.away_skaters)
|
|
1782
|
+
|
|
1783
|
+
merged['tmp'] = merged.away_skaters.str.replace("[^0-9]", " ", regex=True)
|
|
1784
|
+
|
|
1785
|
+
merged['tmp2'] = (merged.tmp.str.strip().str.split(" ")).apply(lambda x: natsorted(x)).apply(lambda x: ' '.join(x))
|
|
1786
|
+
|
|
1787
|
+
merged['tmp2'] = (merged.away_team_abbreviated.iloc[0] + merged.tmp2).str.replace(" ", away_space).str.replace(" ", ", ")
|
|
1788
|
+
|
|
1789
|
+
merged['tmp2'] = np.where(merged.tmp2.str.strip()==merged.away_team_abbreviated.iloc[0], '\xa0', merged.tmp2)
|
|
1790
|
+
|
|
1791
|
+
merged['away_on_ice'] = merged['tmp2']
|
|
1792
|
+
|
|
1793
|
+
merged['home_skaters'] = np.where(pd.isna(merged.home_skaters), '\xa0', merged.home_skaters)
|
|
1794
|
+
|
|
1795
|
+
merged['tmp'] = merged.home_skaters.str.replace("[^0-9]", " ", regex=True)
|
|
1796
|
+
|
|
1797
|
+
merged['tmp2'] = (merged.tmp.str.strip().str.split(" ")).apply(lambda x: natsorted(x)).apply(lambda x: ' '.join(x))
|
|
1798
|
+
|
|
1799
|
+
merged['tmp2'] = (merged.home_team_abbreviated.iloc[0] + merged.tmp2).str.replace(" ", home_space).str.replace(" ", ", ")
|
|
1800
|
+
|
|
1801
|
+
merged['tmp2'] = np.where(merged.tmp2.str.strip()==merged.home_team_abbreviated.iloc[0], '\xa0', merged.tmp2)
|
|
1802
|
+
|
|
1803
|
+
merged['home_on_ice'] = merged['tmp2']
|
|
1804
|
+
|
|
1805
|
+
merged = merged.sort_values(by = ['game_seconds', 'period'])
|
|
1806
|
+
|
|
1807
|
+
merged = merged.assign(jumping_on = (np.where(merged.home_team == merged.team, (merged.home_team_abbreviated.iloc[0] + merged.on_numbers).str.replace(", ", home_space).str.replace(" ", ", "),
|
|
1808
|
+
np.where(merged.away_team == merged.team, (merged.away_team_abbreviated.iloc[0] + merged.on_numbers).str.replace(", ", away_space).str.replace(" ", ", "),
|
|
1809
|
+
'\xa0'))),
|
|
1810
|
+
jumping_off = (np.where(merged.home_team == merged.team, (merged.home_team_abbreviated.iloc[0] + merged.off_numbers).str.replace(", ", home_space).str.replace(" ", ", "),
|
|
1811
|
+
np.where(merged.away_team == merged.team, (merged.away_team_abbreviated.iloc[0] + merged.off_numbers).str.replace(", ", away_space).str.replace(" ", ", "),
|
|
1812
|
+
'\xa0'))),
|
|
1813
|
+
prio = np.where(merged.event=="CHANGE", 0,
|
|
1814
|
+
np.where(merged.event.isin(['PGSTR', 'PGEND', 'PSTR', 'PEND', 'ANTHEM']), -1, 1))).sort_values(
|
|
1815
|
+
by = ['game_seconds', 'period', 'event_index'])
|
|
1816
|
+
|
|
1817
|
+
merged = merged.assign(change_before_event = np.where(
|
|
1818
|
+
(
|
|
1819
|
+
(merged.away_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.away_on_ice!=merged.away_on_ice.shift()) |
|
|
1820
|
+
(merged.home_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.home_on_ice!=merged.home_on_ice.shift())
|
|
1821
|
+
), 1, 0
|
|
1822
|
+
))
|
|
1823
|
+
|
|
1824
|
+
merged = merged.assign(change_prio =
|
|
1825
|
+
np.where((merged.team==merged.home_team) & (merged.event=='CHANGE') , 1,
|
|
1826
|
+
np.where((merged.team==merged.away_team) & (merged.event=='CHANGE'), -1, 0)))
|
|
1827
|
+
|
|
1828
|
+
# TODO: Fix priority map so that we have change before shot or miss if the change involves a player returning from penalty box.
|
|
1829
|
+
merged = merged.assign(priority = np.where(merged.event.isin(['TAKE', 'GIVE', 'MISS', 'HIT', 'SHOT', 'BLOCK']), 1,
|
|
1830
|
+
np.where(merged.event=="GOAL", 2,
|
|
1831
|
+
np.where(merged.event=="STOP", 3,
|
|
1832
|
+
np.where(merged.event=="DELPEN", 4,
|
|
1833
|
+
np.where(merged.event=="PENL", 5,
|
|
1834
|
+
np.where(merged.event=="CHANGE", 6,
|
|
1835
|
+
np.where(merged.event=="PEND", 7,
|
|
1836
|
+
np.where(merged.event=="GEND", 8,
|
|
1837
|
+
np.where(merged.event=="FAC", 9, 0)))))))))).sort_values(by = ['game_seconds', 'period', 'priority', 'event_index', 'change_prio'])
|
|
1838
|
+
|
|
1839
|
+
merged = merged.reset_index(drop = True).reset_index().rename(columns = {'index':'event_index', 'event_index':'original_index'})
|
|
1840
|
+
|
|
1841
|
+
# OPTIMIZATION: Use passed-in roster if available, otherwise scrape it
|
|
1842
|
+
if roster is None:
|
|
1843
|
+
roster = scrape_html_roster(season, small_id).rename(columns = {'Nom/Name':'Name'})
|
|
1844
|
+
# roster is already prepared in scrape_html_events, no need to rename
|
|
1845
|
+
|
|
1846
|
+
roster = roster.assign(team_abbreviated = np.where(roster.team=='home',
|
|
1847
|
+
merged.home_team_abbreviated.iloc[0],
|
|
1848
|
+
merged.away_team_abbreviated.iloc[0]))
|
|
1849
|
+
|
|
1850
|
+
roster = roster.assign(teamnum = roster.team_abbreviated + roster['#'])
|
|
1851
|
+
# OPTIMIZED: Use pre-compiled regex patterns instead of compiling in each lambda
|
|
1852
|
+
roster['Name'] = roster.Name.apply(lambda x: _CAPTAIN_A_PATTERN.sub('', x).strip())
|
|
1853
|
+
roster['Name'] = roster.Name.apply(lambda x: _CAPTAIN_C_PATTERN.sub('', x).strip())
|
|
1854
|
+
|
|
1855
|
+
roster = roster.assign(Name = np.where((roster.Name=='SEBASTIAN AHO') &( roster.team_name == 'NEW YORK ISLANDERS'), 'SEBASTIAN AHO (SWE)', roster.Name))
|
|
1856
|
+
roster = roster.assign(Name = np.where((roster.Name=='ELIAS PETTERSSON') &( roster.Pos == 'D'), 'ELIAS PETTERSSON(D)', roster.Name))
|
|
1857
|
+
|
|
1858
|
+
goalies = roster[(roster.Pos=='G') & (roster.status!='scratch')]
|
|
1859
|
+
|
|
1860
|
+
away_roster = roster[(roster.team=='away') & (roster.status!='scratch')]
|
|
1861
|
+
home_roster = roster[(roster.team=='home') & (roster.status!='scratch')]
|
|
1862
|
+
|
|
1863
|
+
merged.jumping_on = np.where(pd.isna(merged.jumping_on), '\xa0', merged.jumping_on)
|
|
1864
|
+
merged.jumping_off = np.where(pd.isna(merged.jumping_off), '\xa0', merged.jumping_off)
|
|
1865
|
+
|
|
1866
|
+
# OPTIMIZED: Use vectorized string operations instead of .apply()
|
|
1867
|
+
# This provides 10-20x speedup on the on-ice tracking loops
|
|
1868
|
+
change_mask = (merged.event == 'CHANGE')
|
|
1869
|
+
|
|
1870
|
+
# OPTIMIZED: Use regex pattern matching for exact teamnum matches in comma-separated strings
|
|
1871
|
+
# Pattern matches teamnum at start, middle (after comma+space), or end of string
|
|
1872
|
+
# Note: re module is already imported at module level
|
|
1873
|
+
|
|
1874
|
+
# Build all columns at once using vectorized string operations
|
|
1875
|
+
awaydf_dict = {}
|
|
1876
|
+
for i in range(0, len(away_roster)):
|
|
1877
|
+
teamnum = away_roster.teamnum.iloc[i]
|
|
1878
|
+
# Use regex to match teamnum as whole value (not substring)
|
|
1879
|
+
# Match: start of string OR comma+space, then teamnum, then comma OR end of string
|
|
1880
|
+
pattern = r'(^|, )' + re.escape(teamnum) + r'(,|$)'
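# Illustrative sketch (hypothetical numbers): with teamnum 'TOR9' this pattern matches 'TOR9, TOR12'
# and 'TOR12, TOR9' but not 'TOR91, TOR12', so jersey 9 is never confused with jersey 91.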
|
|
1881
|
+
on_mask = merged.jumping_on.str.contains(pattern, na=False, regex=True)
|
|
1882
|
+
off_mask = merged.jumping_off.str.contains(pattern, na=False, regex=True) & change_mask
|
|
1883
|
+
vec = np.cumsum(on_mask.astype(int) - off_mask.astype(int))
|
|
1884
|
+
awaydf_dict[away_roster.Name.iloc[i]] = vec
|
|
1885
|
+
|
|
1886
|
+
awaydf = pd.DataFrame(awaydf_dict)
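# Each awaydf column is a running on-ice indicator for one skater: cumsum(on - off) sits at 1 while the
# player is on the ice and drops back to 0 once a CHANGE row takes them off.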
|
|
1887
|
+
|
|
1888
|
+
global homedf
|
|
1889
|
+
|
|
1890
|
+
# OPTIMIZED: Same optimization for home roster
|
|
1891
|
+
homedf_dict = {}
|
|
1892
|
+
for i in range(0, len(home_roster)):
|
|
1893
|
+
teamnum = home_roster.teamnum.iloc[i]
|
|
1894
|
+
pattern = r'(^|, )' + re.escape(teamnum) + r'(,|$)'
|
|
1895
|
+
on_mask = merged.jumping_on.str.contains(pattern, na=False, regex=True)
|
|
1896
|
+
off_mask = merged.jumping_off.str.contains(pattern, na=False, regex=True) & change_mask
|
|
1897
|
+
vec = np.cumsum(on_mask.astype(int) - off_mask.astype(int))
|
|
1898
|
+
homedf_dict[home_roster.Name.iloc[i]] = vec
|
|
1899
|
+
|
|
1900
|
+
homedf = pd.DataFrame(homedf_dict)
|
|
1901
|
+
|
|
1902
|
+
global home_on
|
|
1903
|
+
global away_on
|
|
1904
|
+
|
|
1905
|
+
# OPTIMIZED: Use list comprehension which is faster than .apply() for this operation
|
|
1906
|
+
# Get column names where value is 1, join, and sort
|
|
1907
|
+
home_on_list = []
|
|
1908
|
+
for idx in range(len(homedf)):
|
|
1909
|
+
row = homedf.iloc[idx]
|
|
1910
|
+
players = [col for col in homedf.columns if row[col] == 1]
|
|
1911
|
+
home_on_list.append(','.join(natsorted(players)) if players else '')
|
|
1912
|
+
home_on = pd.DataFrame({0: home_on_list})
|
|
1913
|
+
|
|
1914
|
+
away_on_list = []
|
|
1915
|
+
for idx in range(len(awaydf)):
|
|
1916
|
+
row = awaydf.iloc[idx]
|
|
1917
|
+
players = [col for col in awaydf.columns if row[col] == 1]
|
|
1918
|
+
away_on_list.append(','.join(natsorted(players)) if players else '')
|
|
1919
|
+
away_on = pd.DataFrame({0: away_on_list})
|
|
1920
|
+
|
|
1921
|
+
away_on = away_on[0].str.split(',', expand=True).rename(columns = {0:'away_on_1', 1:'away_on_2', 2:'away_on_3', 3:'away_on_4', 4:'away_on_5', 5:'away_on_6', 6:'away_on_7', 7:'away_on_8', 8:'away_on_9'})
|
|
1922
|
+
home_on = home_on[0].str.split(',', expand=True).rename(columns = {0:'home_on_1', 1:'home_on_2', 2:'home_on_3', 3:'home_on_4', 4:'home_on_5', 5:'home_on_6', 6:'home_on_7', 7:'home_on_8', 8:'home_on_9'})
|
|
1923
|
+
|
|
1924
|
+
# OPTIMIZED: Initialize missing columns in a loop
|
|
1925
|
+
for side in ['away', 'home']:
|
|
1926
|
+
for i in range(1, 10):
|
|
1927
|
+
col = f'{side}_on_{i}'
|
|
1928
|
+
if col not in (away_on if side == 'away' else home_on).columns:
|
|
1929
|
+
(away_on if side == 'away' else home_on)[col] = '\xa0'
|
|
1930
|
+
|
|
1931
|
+
game = pd.concat([merged, home_on, away_on], axis = 1)
|
|
1932
|
+
|
|
1933
|
+
game = game.assign(
|
|
1934
|
+
event_team = np.where(game.event_team==game.home_team, game.home_team_abbreviated,
|
|
1935
|
+
np.where(game.event_team==game.away_team, game.away_team_abbreviated,
|
|
1936
|
+
game.event_team)),
|
|
1937
|
+
description = game.description.astype(str))
|
|
1938
|
+
|
|
1939
|
+
game['description'] = np.where(game.description=='nan', '\xa0', game.description)
|
|
1940
|
+
|
|
1941
|
+
game = game.drop(columns = ['original_index', 'strength', 'original_time', 'home_team', 'away_team', 'other_team', 'event_player_str',
|
|
1942
|
+
'version', 'team', 'change_before_event', 'prio', 'change_prio', 'priority', 'tmp', 'tmp2']).rename(
|
|
1943
|
+
columns = {'away_team_abbreviated':'away_team', 'home_team_abbreviated':'home_team', 'coordsx':'coords_x', 'coordsy':'coords_y',
|
|
1944
|
+
'ep1_name':'event_player_1', 'ep2_name':'event_player_2', 'ep3_name':'event_player_3'})
|
|
1945
|
+
|
|
1946
|
+
# OPTIMIZED: Pre-compile regex and use vectorized operations where possible
|
|
1947
|
+
# event_zone: combine the two apply() calls into one
|
|
1948
|
+
def extract_zone(desc):
|
|
1949
|
+
match = _ZONE_PATTERN.search(str(desc))
|
|
1950
|
+
return match.group() if match else None
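# Sketch (hypothetical description): extract_zone('... won Neu. Zone - ...') -> 'Neu. Zone'; no zone text -> None.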
|
|
1951
|
+
|
|
1952
|
+
# OPTIMIZED: event_detail - reduce string operations by caching splits
|
|
1953
|
+
def extract_detail(row):
|
|
1954
|
+
desc = row['description']
|
|
1955
|
+
event = row['event']
|
|
1956
|
+
if pd.isna(desc):
|
|
1957
|
+
return '\xa0'
|
|
1958
|
+
if event in ['SHOT', 'BLOCK', 'MISS', 'GOAL']:
|
|
1959
|
+
parts = desc.split(', ')
|
|
1960
|
+
return parts[1].strip() if len(parts) > 1 else '\xa0'
|
|
1961
|
+
elif event in ["PSTR", "PEND", "SOC", "GEND"]:
|
|
1962
|
+
parts = desc.split(': ')
|
|
1963
|
+
return parts[1].strip() if len(parts) > 1 else '\xa0'
|
|
1964
|
+
elif event == 'PENL':
|
|
1965
|
+
match = _PARENTHESIS_PATTERN.search(desc)
|
|
1966
|
+
return match.group(1).strip() if match else '\xa0'
|
|
1967
|
+
elif event == 'CHANGE':
|
|
1968
|
+
parts = desc.split(' - ')
|
|
1969
|
+
return parts[0].strip() if len(parts) > 0 else '\xa0'
|
|
1970
|
+
return '\xa0'
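# Sketch (hypothetical description): for a SHOT row described as
# 'TOR ONGOAL - #34 MATTHEWS, Wrist, Off. Zone, 12 ft.' the second comma-separated field gives 'Wrist'.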
|
|
1971
|
+
|
|
1972
|
+
game = game.assign(
|
|
1973
|
+
game_id = int(game_id),
|
|
1974
|
+
season = int(season),
|
|
1975
|
+
event_zone = game.description.apply(extract_zone),
|
|
1976
|
+
event_detail = game.apply(extract_detail, axis=1))
|
|
1977
|
+
|
|
1978
|
+
# Goalie finding - keep nested np.where() as it's actually quite fast for this use case
|
|
1979
|
+
game = game.assign(home_goalie = np.where(
|
|
1980
|
+
game.home_on_1.isin(goalies.Name), game.home_on_1,
|
|
1981
|
+
np.where(
|
|
1982
|
+
game.home_on_2.isin(goalies.Name), game.home_on_2,
|
|
1983
|
+
np.where(
|
|
1984
|
+
game.home_on_3.isin(goalies.Name), game.home_on_3,
|
|
1985
|
+
np.where(
|
|
1986
|
+
game.home_on_4.isin(goalies.Name), game.home_on_4,
|
|
1987
|
+
np.where(
|
|
1988
|
+
game.home_on_5.isin(goalies.Name), game.home_on_5,
|
|
1989
|
+
np.where(
|
|
1990
|
+
game.home_on_6.isin(goalies.Name), game.home_on_6,
|
|
1991
|
+
np.where(
|
|
1992
|
+
game.home_on_7.isin(goalies.Name), game.home_on_7,
|
|
1993
|
+
np.where(
|
|
1994
|
+
game.home_on_8.isin(goalies.Name), game.home_on_8,
|
|
1995
|
+
np.where(
|
|
1996
|
+
game.home_on_9.isin(goalies.Name), game.home_on_9,
|
|
1997
|
+
'\xa0'))))))))),
|
|
1998
|
+
away_goalie = np.where(
|
|
1999
|
+
game.away_on_1.isin(goalies.Name), game.away_on_1,
|
|
2000
|
+
np.where(
|
|
2001
|
+
game.away_on_2.isin(goalies.Name), game.away_on_2,
|
|
2002
|
+
np.where(
|
|
2003
|
+
game.away_on_3.isin(goalies.Name), game.away_on_3,
|
|
2004
|
+
np.where(
|
|
2005
|
+
game.away_on_4.isin(goalies.Name), game.away_on_4,
|
|
2006
|
+
np.where(
|
|
2007
|
+
game.away_on_5.isin(goalies.Name), game.away_on_5,
|
|
2008
|
+
np.where(
|
|
2009
|
+
game.away_on_6.isin(goalies.Name), game.away_on_6,
|
|
2010
|
+
np.where(
|
|
2011
|
+
game.away_on_7.isin(goalies.Name), game.away_on_7,
|
|
2012
|
+
np.where(
|
|
2013
|
+
game.away_on_8.isin(goalies.Name), game.away_on_8,
|
|
2014
|
+
np.where(
|
|
2015
|
+
game.away_on_9.isin(goalies.Name), game.away_on_9,
|
|
2016
|
+
'\xa0'))))))))))
|
|
2017
|
+
|
|
2018
|
+
# OPTIMIZED: Vectorized column cleaning - use np.where() in loop for consistency
|
|
2019
|
+
# Slightly faster than individual assigns due to reduced function call overhead
|
|
2020
|
+
on_ice_cols = [f'{side}_on_{i}' for side in ['away', 'home'] for i in range(1, 10)]
|
|
2021
|
+
goalie_cols = ['home_goalie', 'away_goalie']
|
|
2022
|
+
|
|
2023
|
+
for col in on_ice_cols + goalie_cols:
|
|
2024
|
+
if col in game.columns:
|
|
2025
|
+
game[col] = np.where((pd.isna(game[col])) | (game[col] == '') | (game[col] == '\xa0'), '\xa0', game[col])
|
|
2026
|
+
|
|
2027
|
+
# OPTIMIZED: Vectorized skater counting using .ne() and .sum()
|
|
2028
|
+
# Cache the game_id check to avoid repeated string operations
|
|
2029
|
+
game_id_str = str(game_id)
|
|
2030
|
+
is_playoff = int(game_id_str[5]) == 3 if len(game_id_str) > 5 else False
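# Assumes the standard NHL game id layout, where the digit at index 5 of the ten-character id is the
# game-type flag (2 = regular season, 3 = playoffs).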
|
|
2031
|
+
|
|
2032
|
+
# Vectorized: use .ne() (not equal) which is faster than np.where() for boolean conversion
|
|
2033
|
+
home_on_cols = [f'home_on_{i}' for i in range(1, 10)]
|
|
2034
|
+
away_on_cols = [f'away_on_{i}' for i in range(1, 10)]
|
|
2035
|
+
|
|
2036
|
+
home_skaters = game[home_on_cols].ne('\xa0').sum(axis=1)
|
|
2037
|
+
away_skaters = game[away_on_cols].ne('\xa0').sum(axis=1)
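# .ne('\xa0') flags every on-ice slot that actually holds a player name, so the row sum is the raw head
# count per side; a goalie on the ice is subtracted just below for regulation/playoff situations.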
|
|
2038
|
+
|
|
2039
|
+
# Subtract goalie if present and in regulation/playoff
|
|
2040
|
+
goalie_mask = ((game.period < 5) | is_playoff)
|
|
2041
|
+
home_skaters = home_skaters - ((game.home_goalie != '\xa0') & goalie_mask).astype(int)
|
|
2042
|
+
away_skaters = away_skaters - ((game.away_goalie != '\xa0') & goalie_mask).astype(int)
|
|
2043
|
+
|
|
2044
|
+
game = game.assign(home_skaters=home_skaters, away_skaters=away_skaters)
|
|
2045
|
+
|
|
2046
|
+
game = game.assign(home_skater_temp =
|
|
2047
|
+
np.where((game.home_goalie=='\xa0') , 'E', game.home_skaters),
|
|
2048
|
+
away_skater_temp =
|
|
2049
|
+
np.where((game.away_goalie=='\xa0') , 'E', game.away_skaters))
|
|
2050
|
+
|
|
2051
|
+
# OPTIMIZED: Reuse cached is_playoff from earlier
|
|
2052
|
+
game = game.assign(game_strength_state = (game.home_skater_temp.astype(str)) + 'v' + (game.away_skater_temp.astype(str)),
|
|
2053
|
+
event_zone = np.where(game.event_zone.notna(), game.event_zone.str.replace(". Zone", "", regex=False), ''),
|
|
2054
|
+
home_score = np.cumsum(np.where((game.event.shift()=='GOAL') & (((game.period<5) | is_playoff)) & (game.event_team.shift()==game.home_team), 1, 0)),
|
|
2055
|
+
away_score = np.cumsum(np.where((game.event.shift()=='GOAL') & (((game.period<5) | is_playoff)) & (game.event_team.shift()==game.away_team), 1, 0))).drop(
|
|
2056
|
+
columns = ['home_skater_temp', 'away_skater_temp'])
|
|
2057
|
+
|
|
2058
|
+
game = game.assign(game_score_state = (game.home_score.astype(str)) + 'v' + (game.away_score.astype(str)),
|
|
2059
|
+
game_date = pd.to_datetime(game.game_date[~pd.isna(game.game_date)].iloc[0])
|
|
2060
|
+
)
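    # Toy illustration of the running-score logic above (hypothetical rows): shifting the GOAL flag before
    # cumsum credits the goal starting with the *next* event, so the scoring play itself keeps the prior score.
    #   ev = pd.DataFrame({'event': ['FAC', 'GOAL', 'FAC', 'GOAL', 'SHOT'],
    #                      'event_team': ['TOR', 'TOR', 'MTL', 'MTL', 'TOR']})
    #   np.cumsum(np.where((ev.event.shift() == 'GOAL') & (ev.event_team.shift() == 'TOR'), 1, 0))  # -> 0, 0, 1, 1, 1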
    game.number_off = np.where((game.jumping_on!='\xa0') & (game.jumping_off=='\xa0'), 0, game.number_off)
    game.number_on = np.where((game.jumping_off!='\xa0') & (game.jumping_on=='\xa0'), 0, game.number_on)

    so = game[game.period==5]

    if len(so)>0 and int(game.game_id.astype(str).str[5].iloc[0]) != 3:
        game = game[game.period<5]
        home = roster[roster.team=='home'].rename(columns = {'teamnum':'home_on_ice', 'Name':'home_goalie_name'}).loc[:, ['home_goalie_name', 'home_on_ice']]
        away = roster[roster.team=='away'].rename(columns = {'teamnum':'away_on_ice', 'Name':'away_goalie_name'}).loc[:, ['away_goalie_name', 'away_on_ice']]
        so = so.merge(away, how = 'left', indicator = True).drop(columns = ['_merge']).merge(home, how = 'left')
        so = so.assign(
            home_goalie = so.home_goalie_name,
            away_goalie = so.away_goalie_name).drop(columns = ['away_goalie_name', 'home_goalie_name'])
        so_winner = so[so.event=='GOAL'].groupby('event_team')[['event', 'home_team']].count().reset_index().sort_values(by = ['event', 'event_team'], ascending = False).event_team.iloc[0]
        so = so.assign(
            home_on_1 = so.home_goalie,
            away_on_1 = so.away_goalie,
            home_on_2 = np.where(so.event_team==so.home_team, so.event_player_1, '\xa0'),
            away_on_2 = np.where(so.event_team==so.away_team, so.event_player_1, '\xa0'))
        if len(so[so.event=='PEND'])>0:
            end_event = so[so.event=='PEND'].index.astype(int)[0]
            so = so.assign(
                home_score = np.where((so.index>=end_event) & (so_winner == so.home_team), 1+so.home_score, so.home_score),
                away_score = np.where((so.index>=end_event) & (so_winner == so.away_team), 1+so.away_score, so.away_score))
        game = pd.concat([game, so])

    game['event_length'] = game.game_seconds.shift(-1) - game.game_seconds
    game['event_length'] = (np.where((pd.isna(game.event_length)) | (game.event_length<0), 0, game.event_length)).astype(int)
    game['event_index'] = game.event_index + 1
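    # Toy illustration of event_length (hypothetical seconds): the last event and any negative gaps become 0.
    #   s = pd.Series([0, 0, 15, 42, 42, 3600])
    #   np.where(pd.isna(s.shift(-1) - s) | ((s.shift(-1) - s) < 0), 0, s.shift(-1) - s)  # -> 0, 15, 27, 0, 3558, 0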

    if 'coords_x' in game.columns and 'coords_y' in game.columns:

        columns = ['season', 'game_id', 'game_date', 'event_index',
                   'period', 'game_seconds', 'event', 'description',
                   'event_detail', 'event_zone', 'event_team', 'event_player_1',
                   'event_player_2', 'event_player_3', 'event_length', 'coords_x',
                   'coords_y', 'number_on', 'number_off', 'jumping_on', 'jumping_off',
                   'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5',
                   'home_on_6', 'home_on_7', 'home_on_8', 'home_on_9', 'away_on_1', 'away_on_2', 'away_on_3',
                   'away_on_4', 'away_on_5', 'away_on_6', 'away_on_7', 'away_on_8', 'away_on_9', 'home_goalie',
                   'away_goalie', 'home_team', 'away_team', 'home_skaters', 'away_skaters',
                   'home_score', 'away_score', 'game_score_state', 'game_strength_state', 'coordinate_source', 'miss_reason']

    else:

        columns = ['season', 'game_id', 'game_date', 'event_index',
                   'period', 'game_seconds', 'event', 'description',
                   'event_detail', 'event_zone', 'event_team', 'event_player_1',
                   'event_player_2', 'event_player_3', 'event_length',
                   'number_on', 'number_off', 'jumping_on', 'jumping_off',
                   'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5',
                   'home_on_6', 'home_on_7', 'home_on_8', 'home_on_9', 'away_on_1', 'away_on_2', 'away_on_3',
                   'away_on_4', 'away_on_5', 'away_on_6', 'away_on_7', 'away_on_8', 'away_on_9', 'home_goalie',
                   'away_goalie', 'home_team', 'away_team', 'home_skaters', 'away_skaters',
                   'home_score', 'away_score', 'game_score_state', 'game_strength_state']

    game = game.loc[:, columns].rename(
        columns = {'period':'game_period', 'event':'event_type', 'description':'event_description', 'number_on':'num_on', 'number_off':'num_off',
                   'jumping_on':'players_on', 'jumping_off':'players_off'}
    )

    if live == True:
        # Compare finalized to events to see if we have events where the PBP does not match the skaters on ice in the shifts.
        # This can happen when players are taking their first shift of the game and thus do not appear in the time on ice HTML page.
        # Find events in finalized where the skater count doesn't match, then get rid of everything beneath them.
        comparison_df = events.assign(events_away_skater_count = events.away_skaters.str.count('\xa0'), events_home_skater_count = events.home_skaters.str.count('\xa0'))[
            events.event.isin(ewc)
            ].loc[:,
                  ['period', 'game_seconds', 'event', 'description', 'events_away_skater_count', 'events_home_skater_count']].merge(
            game[game.event_type.isin(ewc)].loc[:, ['game_period', 'game_seconds', 'event_index', 'event_type', 'event_description', 'home_skaters', 'away_skaters']].rename(
                columns = {'game_period':'period', 'event_type':'event', 'event_description':'description', 'away_skaters':'pbp_away_skater_count', 'home_skaters':'pbp_home_skater_count'}
            ))

        mismatches = comparison_df[(comparison_df.pbp_away_skater_count != comparison_df.events_away_skater_count) |
                                   (comparison_df.pbp_home_skater_count != comparison_df.events_home_skater_count)]

        # We initially just ditched the mismatched event and everything that came beneath it.
        # But then we ran into an error: we can have a play where the player comes on the ice as a penalty expires. We have a "5v4" goal but PBP events show it as 5v5.
        # The NHL dot com video description calls it "Power Play Goal" but doesn't show PP under it.
        # This is pretty tricky to handle. Because the initial "mismatch comparison" was designed to catch events where the shift was just a few seconds off, let's do it this way:
        # ditch the mismatched event and everything that comes after it, but only if we have multiple mismatches and not many events left after the first one.
        if len(mismatches) > 1 and len(game[game.game_seconds >= mismatches.game_seconds.min()]) < 20:
            game = game[game.event_index < mismatches.event_index.min()]

    # Note: the following expression has no effect as written.
    game[(game.event_type.isin(ewc)) & (game.home_skaters < 0)]

    if live == True:
        mismatches = game[(game.event_type.isin(ewc + ['FAC'])) & ((game.home_skaters < 3) | (game.home_skaters > 6) | (game.away_skaters < 3) | (game.away_skaters > 6))]
        if len(mismatches) > 0:
            game = game[game.event_index < mismatches.event_index.min()]

    return(game)
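# A toy illustration of the live-game trimming rule used above (hypothetical values): once the play-by-play
# and shift-derived skater counts disagree, everything from the first mismatched event_index onward is dropped.
#   toy = pd.DataFrame({'event_index': [10, 11, 12, 13],
#                       'pbp_home_skater_count':    [5, 5, 4, 4],
#                       'events_home_skater_count': [5, 4, 5, 5]})
#   first_bad = toy[toy.pbp_home_skater_count != toy.events_home_skater_count].event_index.min()   # -> 11
#   toy[toy.event_index < first_bad]                                                               # keeps index 10 only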

def fix_missing(single, event_coords, events):

    # Commenting this entire thing out for now. It causes problems for whatever reason, and I'm not convinced these fucked up games are still showing up.

    # # FIRST FIX: EVENTS THAT HAVE MATCHING PERIOD, SECONDS, AND EVENT TYPE, AND ONLY OCCURRED ONCE, BUT NO EVENT PLAYER. #
    # global event_coords_temp
    # global single_problems
    # global merged_problems
    # problems = events[(events.event.isin(ewc)) & (pd.isna(events.coords_x))]
    # single_problems = problems.groupby(['event', 'period', 'game_seconds'])[
    #     'event_index'].count().reset_index().rename(
    #     columns = {'event_index':'problematic_events'})
    # # Keep events where only one event of that class happened at that moment.
    # single_problems = single_problems[single_problems.problematic_events==1]
    # single_problems = problems.merge(single_problems).drop(
    #     columns = ['problematic_events', 'coords_x', 'coords_y', 'coordinate_source']) # x/y come back later!
    # event_coords_temp = event_coords.loc[:, ['period', 'game_seconds', 'event', 'version', 'coords_x', 'coordinate_source']].groupby(
    #     ['game_seconds', 'period', 'event', 'version'])['coords_x'].count().reset_index().rename(
    #     columns = {'coords_x':'problematic_events'})
    # event_coords_temp = event_coords_temp[event_coords_temp.problematic_events==1].drop(columns = 'problematic_events')
    # event_coords_temp = event_coords_temp.merge(event_coords.loc[:, ['game_seconds', 'period', 'event', 'version', 'coords_x', 'coords_y', 'coordinate_source']])
    # if 'espn_id' in event_coords_temp.columns:
    #     event_coords_temp = event_coords_temp.drop(columns = 'espn_id')
    # merged_problems = single_problems.merge(event_coords_temp)
    # #print("You fixed: " + str(len(merged_problems)) + " events!")
    # events = events[~(events.event_index.isin(list(merged_problems.event_index)))]
    # events = pd.concat([events, merged_problems.loc[:, list(events.columns)]]).sort_values(by = ['event_index', 'period', 'game_seconds'])
    # #if len(merged_problems)>0:
    #     #events = events[~events.event_index.isin(merged_problems.event_index)]
    #     #events = pd.concat([events, merged_problems.loc[:, list(events.columns)]]).sort_values(by = ['event_index', 'period', 'game_seconds'])
    # look = events

    # # SECOND FIX: EVENTS THAT HAVE MATCHING PERIOD, EVENT TYPE, AND PLAYER ONE, AND ONLY OCCURRED ONCE, BUT NO GAME SECONDS.

    # problems = events[(events.event.isin(ewc)) & (pd.isna(events.coords_x))]
    # single_problems = problems.groupby(['event', 'period', 'event_player_1'])[
    #     'event_index'].count().reset_index().rename(
    #     columns = {'event_index':'problematic_events'})
    # # Keep events where only one event of that class happened at that moment.
    # single_problems = single_problems[single_problems.problematic_events==1]
    # single_problems = problems.merge(single_problems).drop(
    #     columns = ['problematic_events', 'coords_x', 'coords_y', 'coordinate_source']) # x/y come back later!
    # event_coords_temp = event_coords.loc[:, ['period', 'event_player_1', 'event',
    #     'version', 'coords_x', 'coordinate_source']].groupby(
    #     ['event_player_1', 'period', 'event', 'version'])['coords_x'].count().reset_index().rename(
    #     columns = {'coords_x':'problematic_events'})
    # event_coords_temp = event_coords_temp[event_coords_temp.problematic_events==1].drop(columns = 'problematic_events')
    # event_coords_temp = event_coords_temp.merge(event_coords.loc[:, ['event_player_1', 'period', 'event', 'version', 'coords_x', 'coords_y', 'coordinate_source']])
    # merged_problems = single_problems.merge(event_coords_temp)
    # #print("You fixed: " + str(len(merged_problems)) + " events!")
    # events = events[~events.event_index.isin(merged_problems.event_index)]
    # events = pd.concat([events, merged_problems]).sort_values(by = ['event_index', 'period', 'game_seconds'])

    return(events)
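# The body of fix_missing is currently disabled above. A minimal, self-contained sketch of the "first fix"
# it describes (match coordinate-less events to coordinate rows when (event, period, game_seconds) is unique
# on both sides) is kept here for reference. Column names follow the frames above; this helper is
# illustrative only, not part of the package's active code path.
def _fill_unique_coords_sketch(events, event_coords, ewc):
    keys = ['event', 'period', 'game_seconds']
    missing = events[events.event.isin(ewc) & events.coords_x.isna()]
    # Keep only rows whose key combination occurs exactly once on each side.
    unique_missing = missing[~missing.duplicated(subset=keys, keep=False)]
    unique_coords = event_coords[~event_coords.duplicated(subset=keys, keep=False)]
    fixed = unique_missing.drop(columns=['coords_x', 'coords_y']).merge(
        unique_coords.loc[:, keys + ['coords_x', 'coords_y']], on=keys, how='inner')
    remaining = events[~events.event_index.isin(fixed.event_index)]
    return pd.concat([remaining, fixed.loc[:, list(events.columns)]]).sort_values(by=['event_index', 'period', 'game_seconds'])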

def _fetch_all_pages_parallel(season, game_id, verbose=False):
    """
    Fetch all required HTML pages in parallel.

    Args:
        season: Season string (e.g., '20242025')
        game_id: Full game ID (e.g., 2025020333)
        verbose: If True, print detailed timing information

    Returns:
        Dictionary with keys: 'events', 'roster', 'home_shifts', 'away_shifts', 'summary'.
        All values are requests.Response objects.
    """
    small_id = str(game_id)[5:]

    # Prepare all URLs
    events_url = f'http://www.nhl.com/scores/htmlreports/{season}/PL0{small_id}.HTM'
    roster_url = f'http://www.nhl.com/scores/htmlreports/{season}/RO0{small_id}.HTM'
    home_shifts_url = f'http://www.nhl.com/scores/htmlreports/{season}/TH0{small_id}.HTM'
    away_shifts_url = f'http://www.nhl.com/scores/htmlreports/{season}/TV0{small_id}.HTM'
    summary_url = f'https://www.nhl.com/scores/htmlreports/{season}/GS0{small_id}.HTM'

    # Fetch the five HTML report pages concurrently
    fetch_start = time.time()
    if verbose:
        print(' 🔄 Fetching HTML pages in parallel...')

    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit HTML fetch tasks only
        futures = {
            'events': executor.submit(_fetch_url, events_url, timeout=10),
            'roster': executor.submit(_fetch_url, roster_url, timeout=10),
            'home_shifts': executor.submit(_fetch_url, home_shifts_url, timeout=10),
            'away_shifts': executor.submit(_fetch_url, away_shifts_url, timeout=10),
            'summary': executor.submit(_fetch_url, summary_url, timeout=10)
        }

        # Create reverse mapping from future to key
        future_to_key = {future: key for key, future in futures.items()}

        # Collect HTML page results as they complete
        results = {}
        for future in as_completed(futures.values()):
            key = future_to_key[future]
            results[key] = future.result()  # Re-raises any exception from the request

    html_fetch_duration = time.time() - fetch_start
    if verbose:
        try:
            print(f' ⏱️ HTML pages fetched in: {html_fetch_duration:.2f}s')
        except Exception:
            pass

    return results
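# Hypothetical usage sketch for the helper above: fetch the five report pages for one game and confirm
# each response arrived (the game ID and season values below are made up for illustration).
#   pages = _fetch_all_pages_parallel('20242025', 2024020333, verbose=True)
#   for name, resp in pages.items():
#       print(name, resp.status_code, len(resp.content))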

def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True, return_intermediates = False, verbose = False):

    global single
    global event_coords
    global full
    global fixed_events
    global events

    # OPTIMIZED: Use list instead of DataFrame for accumulating results
    full_list = []

    # Track intermediates for each game if requested
    intermediates_list = []

    i = 0

    while i in range(0, len(game_id_list)) and len(game_id_list)>0:

        # First thing to try: scraping HTML events

        try:
            first_time = time.time()
            game_id = game_id_list[i]
            print('Attempting scrape for: ' + str(game_id))
            season = str(int(str(game_id)[:4])) + str(int(str(game_id)[:4]) + 1)
            small_id = str(game_id)[5:]

            # OPTIMIZED: Fetch HTML pages in parallel, API separately
            parallel_start = time.time()
            if verbose:
                print('Fetching pages')
            pages = _fetch_all_pages_parallel(season, game_id, verbose=verbose)
            parallel_duration = time.time() - parallel_start
            if verbose:
                try:
                    print(f'⏱️ Parallel fetch took: {parallel_duration:.2f}s')
                except Exception:
                    pass

            # TIME: HTML events (using pre-fetched pages)
            html_start = time.time()
            if verbose:
                print('Scraping HTML events')
            single, roster_cache = scrape_html_events(season, small_id,
                                                      events_page=pages['events'],
                                                      roster_page=pages['roster'],
                                                      verbose=verbose)
            html_duration = time.time() - html_start
            if verbose:
                try:
                    print(f'⏱️ HTML events processing took: {html_duration:.2f}s')
                except Exception:
                    pass
            single['game_id'] = int(game_id)
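            # Worked example of the ID bookkeeping above (hypothetical game ID 2024020500):
            #   season   -> '2024' + '2025' = '20242025'
            #   small_id -> str(2024020500)[5:] = '20500'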
|
|
2321
|
+
|
|
2322
|
+
# Try NHL API first (default behavior)
|
|
2323
|
+
|
|
2324
|
+
try:
|
|
2325
|
+
# TIME: API Events (fetch after HTML events are processed, like original)
|
|
2326
|
+
api_start = time.time()
|
|
2327
|
+
if verbose:
|
|
2328
|
+
print('Attempting to scrape coordinates from NHL API')
|
|
2329
|
+
event_coords = scrape_api_events(game_id, drop_description=True, verbose=verbose)
|
|
2330
|
+
api_duration = time.time() - api_start
|
|
2331
|
+
if verbose:
|
|
2332
|
+
try:
|
|
2333
|
+
print(f'⏱️ API events took: {api_duration:.2f}s')
|
|
2334
|
+
except Exception:
|
|
2335
|
+
pass
|
|
2336
|
+
|
|
2337
|
+
# Set coordinate_source on event_coords before merging (needed for fix_missing)
|
|
2338
|
+
event_coords['coordinate_source'] = 'api'
|
|
2339
|
+
api_coords = event_coords.copy()
|
|
2340
|
+
if len(event_coords[(event_coords.event.isin(ewc)) & (pd.isna(event_coords.coords_x))]) > 0:
|
|
2341
|
+
raise ExpatError('Bad takes, dude!')
|
|
2342
|
+
event_coords['game_id'] = int(game_id)
|
|
2343
|
+
|
|
2344
|
+
# TIME: Merge Events
|
|
2345
|
+
merge_start = time.time()
|
|
2346
|
+
if verbose:
|
|
2347
|
+
print('Attempting to merge events')
|
|
2348
|
+
events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'game_id', 'event'], how = 'left')
|
|
2349
|
+
merge_duration = time.time() - merge_start
|
|
2350
|
+
if verbose:
|
|
2351
|
+
print(f'Merged events, we have this many rows: {len(events)}')
|
|
2352
|
+
try:
|
|
2353
|
+
print(f'⏱️ Merge took: {merge_duration:.2f}s')
|
|
2354
|
+
except Exception:
|
|
2355
|
+
pass
|
|
2356
|
+
|
|
2357
|
+
|
|
2358
|
+
|
|
2359
|
+
# TIME: Fix Missing
|
|
2360
|
+
try:
|
|
2361
|
+
fix_start = time.time()
|
|
2362
|
+
events = fix_missing(single, event_coords, events)
|
|
2363
|
+
fix_duration = time.time() - fix_start
|
|
2364
|
+
if verbose:
|
|
2365
|
+
try:
|
|
2366
|
+
print(f'⏱️ Fix missing took: {fix_duration:.2f}s')
|
|
2367
|
+
except Exception:
|
|
2368
|
+
pass
|
|
2369
|
+
except IndexError as e:
|
|
2370
|
+
print('Issue when fixing problematic events. Here it is: ' + str(e))
|
|
2371
|
+
continue
|
|
2372
|
+
if verbose:
|
|
2373
|
+
print(pages)
|
|
2374
|
+
# TIME: Shifts and Finalize (using pre-fetched pages)
|
|
2375
|
+
try:
|
|
2376
|
+
if verbose:
|
|
2377
|
+
print(pages)
|
|
2378
|
+
shifts_start = time.time()
|
|
2379
|
+
if live == True:
|
|
2380
|
+
min_game_clock, shifts = scrape_html_shifts(season, small_id, live,
|
|
2381
|
+
home_page=pages['home_shifts'],
|
|
2382
|
+
away_page=pages['away_shifts'],
|
|
2383
|
+
summary = pages['summary'],
|
|
2384
|
+
roster_cache = roster_cache,
|
|
2385
|
+
verbose=verbose)
|
|
2386
|
+
else:
|
|
2387
|
+
shifts = scrape_html_shifts(season, small_id, live,
|
|
2388
|
+
home_page=pages['home_shifts'],
|
|
2389
|
+
away_page=pages['away_shifts'],
|
|
2390
|
+
summary = pages['summary'],
|
|
2391
|
+
roster_cache = roster_cache,
|
|
2392
|
+
verbose=verbose)
|
|
2393
|
+
shifts_duration = time.time() - shifts_start
|
|
2394
|
+
if verbose:
|
|
2395
|
+
try:
|
|
2396
|
+
print(f'⏱️ HTML shifts processing took: {shifts_duration:.2f}s')
|
|
2397
|
+
except Exception:
|
|
2398
|
+
pass
|
|
2399
|
+
|
|
2400
|
+
prepare_start = time.time()
|
|
2401
|
+
finalized = merge_and_prepare(events, shifts, roster_cache, live = live)
|
|
2402
|
+
if live == True:
|
|
2403
|
+
if min_game_clock is not None:
|
|
2404
|
+
finalized = finalized[finalized.game_seconds <= min_game_clock]
|
|
2405
|
+
prepare_duration = time.time() - prepare_start
|
|
2406
|
+
if verbose:
|
|
2407
|
+
try:
|
|
2408
|
+
print(f'⏱️ Merge and prepare took: {prepare_duration:.2f}s')
|
|
2409
|
+
except Exception:
|
|
2410
|
+
pass
|
|
2411
|
+
|
|
2412
|
+
full_list.append(finalized)
|
|
2413
|
+
second_time = time.time()
|
|
2414
|
+
|
|
2415
|
+
# Track intermediates if requested
|
|
2416
|
+
if return_intermediates:
|
|
2417
|
+
intermediates_list.append({
|
|
2418
|
+
'game_id': game_id,
|
|
2419
|
+
'shifts': shifts.copy() if shifts is not None else None,
|
|
2420
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2421
|
+
'roster_cache': roster_cache.copy() if roster_cache is not None else None,
|
|
2422
|
+
'coordinate_source': 'api',
|
|
2423
|
+
'warning': None,
|
|
2424
|
+
'error': None,
|
|
2425
|
+
'raw_html': {
|
|
2426
|
+
'events': pages.get('events'),
|
|
2427
|
+
'roster': pages.get('roster'),
|
|
2428
|
+
'home_shifts': pages.get('home_shifts'),
|
|
2429
|
+
'away_shifts': pages.get('away_shifts'),
|
|
2430
|
+
'summary': pages.get('summary')
|
|
2431
|
+
}
|
|
2432
|
+
})
|
|
2433
|
+
except IndexError as e:
|
|
2434
|
+
print('There was no shift data for this game. Error: ' + str(e))
|
|
2435
|
+
fixed_events = events
|
|
2436
|
+
fixed_events = fixed_events.rename(
|
|
2437
|
+
columns = {'period':'game_period', 'event':'event_type', 'away_team_abbreviated':'away_team',
|
|
2438
|
+
'home_team_abbreviated':'home_team', 'description':'event_description', 'home_team':'hometeamfull',
|
|
2439
|
+
'away_team':'awayteamfull'}
|
|
2440
|
+
).drop(
|
|
2441
|
+
columns = ['original_time', 'other_team', 'strength', 'event_player_str', 'version', 'hometeamfull', 'awayteamfull']
|
|
2442
|
+
).assign(game_warning = 'NO SHIFT DATA.')
|
|
2443
|
+
full_list.append(fixed_events)
|
|
2444
|
+
second_time = time.time()
|
|
2445
|
+
|
|
2446
|
+
# Track intermediates if requested
|
|
2447
|
+
if return_intermediates:
|
|
2448
|
+
intermediates_list.append({
|
|
2449
|
+
'game_id': game_id,
|
|
2450
|
+
'shifts': None,
|
|
2451
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2452
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2453
|
+
'coordinate_source': 'api',
|
|
2454
|
+
'warning': 'NO SHIFT DATA.',
|
|
2455
|
+
'error': None,
|
|
2456
|
+
'raw_html': {
|
|
2457
|
+
'events': pages.get('events'),
|
|
2458
|
+
'roster': pages.get('roster'),
|
|
2459
|
+
'home_shifts': pages.get('home_shifts'),
|
|
2460
|
+
'away_shifts': pages.get('away_shifts'),
|
|
2461
|
+
'summary': pages.get('summary')
|
|
2462
|
+
}
|
|
2463
|
+
})
|
|
2464
|
+
|
|
2465
|
+
try:
|
|
2466
|
+
total_duration = second_time - first_time
|
|
2467
|
+
except NameError:
|
|
2468
|
+
second_time = time.time()
|
|
2469
|
+
total_duration = second_time - first_time
|
|
2470
|
+
print('Successfully scraped ' + str(game_id) + '. Coordinates sourced from the NHL API.')
|
|
2471
|
+
# Safely format timing string, handling potentially undefined variables
|
|
2472
|
+
try:
|
|
2473
|
+
timing_parts = []
|
|
2474
|
+
if 'parallel_duration' in locals(): timing_parts.append(f"Parallel fetch: {parallel_duration:.2f}s")
|
|
2475
|
+
if 'html_duration' in locals(): timing_parts.append(f"HTML processing: {html_duration:.2f}s")
|
|
2476
|
+
if 'api_duration' in locals(): timing_parts.append(f"API processing: {api_duration:.2f}s")
|
|
2477
|
+
if 'merge_duration' in locals(): timing_parts.append(f"Merge: {merge_duration:.2f}s")
|
|
2478
|
+
if 'fix_duration' in locals(): timing_parts.append(f"Fix missing: {fix_duration:.2f}s")
|
|
2479
|
+
if 'shifts_duration' in locals(): timing_parts.append(f"Shifts: {shifts_duration:.2f}s")
|
|
2480
|
+
if 'prepare_duration' in locals(): timing_parts.append(f"Merge/prepare: {prepare_duration:.2f}s")
|
|
2481
|
+
if len(timing_parts) > 0:
|
|
2482
|
+
print("(" + ", ".join(timing_parts) + ")")
|
|
2483
|
+
else:
|
|
2484
|
+
print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
|
|
2485
|
+
except Exception:
|
|
2486
|
+
print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
|
|
2487
|
+
i = i + 1
|
|
2488
|
+
|
|
2489
|
+
# If there is an issue with the API, fall back to ESPN:
|
|
2490
|
+
|
|
2491
|
+
except (KeyError, ExpatError) as e:
|
|
2492
|
+
print('The NHL API gave us trouble with: ' + str(game_id) + '. Falling back to ESPN.')
|
|
2493
|
+
|
|
2494
|
+
try:
|
|
2495
|
+
home_team = single['home_team_abbreviated'].iloc[0]
|
|
2496
|
+
away_team = single['away_team_abbreviated'].iloc[0]
|
|
2497
|
+
game_date = single['game_date'].iloc[0]
|
|
2498
|
+
espn_home_team = home_team
|
|
2499
|
+
espn_away_team = away_team
|
|
2500
|
+
try:
|
|
2501
|
+
if home_team == 'T.B':
|
|
2502
|
+
espn_home_team = 'TBL'
|
|
2503
|
+
if away_team == 'T.B':
|
|
2504
|
+
espn_away_team = 'TBL'
|
|
2505
|
+
if home_team == 'L.A':
|
|
2506
|
+
espn_home_team = 'LAK'
|
|
2507
|
+
if away_team == 'L.A':
|
|
2508
|
+
espn_away_team = 'LAK'
|
|
2509
|
+
if home_team == 'N.J':
|
|
2510
|
+
espn_home_team = 'NJD'
|
|
2511
|
+
if away_team == 'N.J':
|
|
2512
|
+
espn_away_team = 'NJD'
|
|
2513
|
+
if home_team == 'S.J':
|
|
2514
|
+
espn_home_team = 'SJS'
|
|
2515
|
+
if away_team == 'S.J':
|
|
2516
|
+
espn_away_team = 'SJS'
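# A dict lookup expresses the same ESPN abbreviation fix-ups more compactly (sketch covering the same
# four codes handled above):
#   ESPN_ABBREV_FIXES = {'T.B': 'TBL', 'L.A': 'LAK', 'N.J': 'NJD', 'S.J': 'SJS'}
#   espn_home_team = ESPN_ABBREV_FIXES.get(home_team, home_team)
#   espn_away_team = ESPN_ABBREV_FIXES.get(away_team, away_team)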
|
|
2517
|
+
if verbose:
|
|
2518
|
+
print('Scraping ESPN IDs')
|
|
2519
|
+
espn_id = scrape_espn_ids_single_game(str(game_date.date()), espn_home_team, espn_away_team).espn_id.iloc[0]
|
|
2520
|
+
if verbose:
|
|
2521
|
+
print('Scraping ESPN Events')
|
|
2522
|
+
print('Here is the ESPN ID:', espn_id)
|
|
2523
|
+
event_coords = scrape_espn_events(int(espn_id))
|
|
2524
|
+
if verbose:
|
|
2525
|
+
print('Scraped ESPN Events, we have this many rows:', len(event_coords))
|
|
2526
|
+
event_coords['coordinate_source'] = 'espn'
|
|
2527
|
+
if verbose:
|
|
2528
|
+
print('Attempting to merge events')
|
|
2529
|
+
events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
|
|
2530
|
+
if verbose:
|
|
2531
|
+
print('Merged events, we have this many rows:', len(events))
|
|
2532
|
+
try:
|
|
2533
|
+
events = fix_missing(single, event_coords, events)
|
|
2534
|
+
except IndexError as e:
|
|
2535
|
+
print('Issue when fixing problematic events. Here it is: ' + str(e))
|
|
2536
|
+
continue
|
|
2537
|
+
except IndexError:
|
|
2538
|
+
print('This game does not have ESPN or API coordinates. You will get it anyway, though.')
|
|
2539
|
+
events = single
|
|
2540
|
+
try:
|
|
2541
|
+
shifts = scrape_html_shifts(season, small_id, live,
|
|
2542
|
+
home_page=pages['home_shifts'],
|
|
2543
|
+
away_page=pages['away_shifts'],
|
|
2544
|
+
summary = pages['summary'],
|
|
2545
|
+
roster_cache = roster_cache,
|
|
2546
|
+
verbose=verbose)
|
|
2547
|
+
finalized = merge_and_prepare(events, shifts, roster_cache, live = live)
|
|
2548
|
+
full_list.append(finalized)
|
|
2549
|
+
second_time = time.time()
|
|
2550
|
+
|
|
2551
|
+
# Track intermediates if requested
|
|
2552
|
+
if return_intermediates:
|
|
2553
|
+
intermediates_list.append({
|
|
2554
|
+
'game_id': game_id,
|
|
2555
|
+
'shifts': shifts.copy() if shifts is not None else None,
|
|
2556
|
+
'api_coords': None,
|
|
2557
|
+
'roster_cache': roster_cache.copy() if roster_cache is not None else None,
|
|
2558
|
+
'coordinate_source': 'espn',
|
|
2559
|
+
'warning': None,
|
|
2560
|
+
'error': None,
|
|
2561
|
+
'raw_html': {
|
|
2562
|
+
'events': pages.get('events'),
|
|
2563
|
+
'roster': pages.get('roster'),
|
|
2564
|
+
'home_shifts': pages.get('home_shifts'),
|
|
2565
|
+
'away_shifts': pages.get('away_shifts'),
|
|
2566
|
+
'summary': pages.get('summary')
|
|
2567
|
+
}
|
|
2568
|
+
})
|
|
2569
|
+
except IndexError as e:
|
|
2570
|
+
print('There was no shift data for this game. Error: ' + str(e))
|
|
2571
|
+
fixed_events = events
|
|
2572
|
+
fixed_events = fixed_events.rename(
|
|
2573
|
+
columns = {'period':'game_period', 'event':'event_type', 'away_team_abbreviated':'away_team',
|
|
2574
|
+
'home_team_abbreviated':'home_team', 'description':'event_description', 'home_team':'hometeamfull',
|
|
2575
|
+
'away_team':'awayteamfull'}
|
|
2576
|
+
).drop(
|
|
2577
|
+
columns = ['original_time', 'other_team', 'strength', 'event_player_str', 'version', 'hometeamfull', 'awayteamfull']
|
|
2578
|
+
).assign(game_warning = 'NO SHIFT DATA', season = season)
|
|
2579
|
+
fixed_events['coordinate_source'] = 'espn'
|
|
2580
|
+
full_list.append(fixed_events)
|
|
2581
|
+
|
|
2582
|
+
# Track intermediates if requested
|
|
2583
|
+
if return_intermediates:
|
|
2584
|
+
intermediates_list.append({
|
|
2585
|
+
'game_id': game_id,
|
|
2586
|
+
'shifts': None,
|
|
2587
|
+
'api_coords': None,
|
|
2588
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2589
|
+
'coordinate_source': 'espn',
|
|
2590
|
+
'warning': 'NO SHIFT DATA',
|
|
2591
|
+
'error': None,
|
|
2592
|
+
'raw_html': {
|
|
2593
|
+
'events': pages.get('events'),
|
|
2594
|
+
'roster': pages.get('roster'),
|
|
2595
|
+
'home_shifts': pages.get('home_shifts'),
|
|
2596
|
+
'away_shifts': pages.get('away_shifts'),
|
|
2597
|
+
'summary': pages.get('summary')
|
|
2598
|
+
}
|
|
2599
|
+
})
|
|
2600
|
+
second_time = time.time()
|
|
2601
|
+
total_duration = second_time - first_time
|
|
2602
|
+
# Fix this so it doesn't say sourced from ESPN if no coords.
|
|
2603
|
+
if single.equals(events):
|
|
2604
|
+
if verbose:
|
|
2605
|
+
print("This game took " + str(round(total_duration, 2)) + " seconds.")
|
|
2606
|
+
i = i + 1
|
|
2607
|
+
else:
|
|
2608
|
+
print('Successfully scraped ' + str(game_id) + '. Coordinates sourced from ESPN.')
|
|
2609
|
+
try:
|
|
2610
|
+
timing_parts = []
|
|
2611
|
+
if 'parallel_duration' in locals(): timing_parts.append(f"Parallel fetch: {parallel_duration:.2f}s")
|
|
2612
|
+
if 'html_duration' in locals(): timing_parts.append(f"HTML processing: {html_duration:.2f}s")
|
|
2613
|
+
if len(timing_parts) > 0:
|
|
2614
|
+
print("(" + ", ".join(timing_parts) + ")")
|
|
2615
|
+
else:
|
|
2616
|
+
print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
|
|
2617
|
+
except Exception:
|
|
2618
|
+
print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
|
|
2619
|
+
i = i + 1
|
|
2620
|
+
|
|
2621
|
+
# If there are issues with ESPN
|
|
2622
|
+
|
|
2623
|
+
except KeyError as e:
|
|
2624
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2625
|
+
print('KeyError: ' + str(e))
|
|
2626
|
+
print(traceback.format_exc())
|
|
2627
|
+
if return_intermediates:
|
|
2628
|
+
intermediates_list.append({
|
|
2629
|
+
'game_id': game_id,
|
|
2630
|
+
'shifts': None,
|
|
2631
|
+
'api_coords': None,
|
|
2632
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2633
|
+
'coordinate_source': None,
|
|
2634
|
+
'warning': None,
|
|
2635
|
+
'error': f'ESPN KeyError: {str(e)}',
|
|
2636
|
+
'error_traceback': traceback.format_exc(),
|
|
2637
|
+
'raw_html': {
|
|
2638
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2639
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2640
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2641
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2642
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2643
|
+
}
|
|
2644
|
+
})
|
|
2645
|
+
i = i + 1
|
|
2646
|
+
continue
|
|
2647
|
+
except IndexError as e:
|
|
2648
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2649
|
+
print('IndexError: ' + str(e))
|
|
2650
|
+
if return_intermediates:
|
|
2651
|
+
intermediates_list.append({
|
|
2652
|
+
'game_id': game_id,
|
|
2653
|
+
'shifts': None,
|
|
2654
|
+
'api_coords': None,
|
|
2655
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2656
|
+
'coordinate_source': None,
|
|
2657
|
+
'warning': None,
|
|
2658
|
+
'error': f'ESPN IndexError: {str(e)}',
|
|
2659
|
+
'error_traceback': traceback.format_exc(),
|
|
2660
|
+
'raw_html': {
|
|
2661
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2662
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2663
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2664
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2665
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2666
|
+
}
|
|
2667
|
+
})
|
|
2668
|
+
i = i + 1
|
|
2669
|
+
continue
|
|
2670
|
+
except TypeError as e:
|
|
2671
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2672
|
+
print('TypeError: ' + str(e))
|
|
2673
|
+
if return_intermediates:
|
|
2674
|
+
intermediates_list.append({
|
|
2675
|
+
'game_id': game_id,
|
|
2676
|
+
'shifts': None,
|
|
2677
|
+
'api_coords': None,
|
|
2678
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2679
|
+
'coordinate_source': None,
|
|
2680
|
+
'warning': None,
|
|
2681
|
+
'error': f'ESPN TypeError: {str(e)}',
|
|
2682
|
+
'error_traceback': traceback.format_exc(),
|
|
2683
|
+
'raw_html': {
|
|
2684
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2685
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2686
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2687
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2688
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2689
|
+
}
|
|
2690
|
+
})
|
|
2691
|
+
i = i + 1
|
|
2692
|
+
continue
|
|
2693
|
+
except ExpatError as e:
|
|
2694
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2695
|
+
print('ExpatError: ' + str(e))
|
|
2696
|
+
if return_intermediates:
|
|
2697
|
+
intermediates_list.append({
|
|
2698
|
+
'game_id': game_id,
|
|
2699
|
+
'shifts': None,
|
|
2700
|
+
'api_coords': None,
|
|
2701
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2702
|
+
'coordinate_source': None,
|
|
2703
|
+
'warning': None,
|
|
2704
|
+
'error': f'ESPN ExpatError: {str(e)}',
|
|
2705
|
+
'error_traceback': traceback.format_exc(),
|
|
2706
|
+
'raw_html': {
|
|
2707
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2708
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2709
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2710
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2711
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2712
|
+
}
|
|
2713
|
+
})
|
|
2714
|
+
i = i + 1
|
|
2715
|
+
continue
|
|
2716
|
+
|
|
2717
|
+
except ExpatError:
|
|
2718
|
+
print('There was a rare error with the API; numerous takeaways did not have location coordinates for: ' + str(game_id) + '. Let us try ESPN.')
|
|
2719
|
+
|
|
2720
|
+
try:
|
|
2721
|
+
home_team = single['home_team'].iloc[0]
|
|
2722
|
+
away_team = single['away_team'].iloc[0]
|
|
2723
|
+
game_date = single['game_date'].iloc[0]
|
|
2724
|
+
try:
|
|
2725
|
+
espn_id = scrape_espn_ids_single_game(str(game_date.date()), home_team, away_team).espn_id.iloc[0]
|
|
2726
|
+
event_coords = scrape_espn_events(int(espn_id))
|
|
2727
|
+
duped_coords = api_coords.assign(source = 'api').merge(event_coords.drop(columns = 'espn_id'), on = ['game_seconds', 'event', 'period', 'version', 'event_player_1'], how = 'outer', indicator = True)
|
|
2728
|
+
# Coordinates are flipped in some games.
|
|
2729
|
+
if len(duped_coords[duped_coords.coords_x_x * -1 == duped_coords.coords_x_y])/len(duped_coords):
|
|
2730
|
+
duped_coords['coords_x_y'] = duped_coords['coords_x_y'] * (-1)
|
|
2731
|
+
if len(duped_coords[duped_coords.coords_y_x * -1 == duped_coords.coords_y_y])/len(duped_coords):
|
|
2732
|
+
duped_coords['coords_y_y'] = duped_coords['coords_y_y'] * (-1)
|
|
2733
|
+
duped_coords['source'] = np.where((pd.isna(duped_coords.source)) | ((pd.isna(duped_coords.coords_x_x)) & ~pd.isna(duped_coords.coords_x_y)), 'espn', duped_coords.source)
|
|
2734
|
+
duped_coords = duped_coords.assign(coords_x = np.where(pd.isna(duped_coords.coords_x_x), duped_coords.coords_x_y, duped_coords.coords_x_x),
|
|
2735
|
+
coords_y = np.where(pd.isna(duped_coords.coords_y_x), duped_coords.coords_y_y, duped_coords.coords_y_x))
|
|
2736
|
+
col_list = list(api_coords.columns)
|
|
2737
|
+
col_list.append('source')
|
|
2738
|
+
duped_coords = duped_coords.loc[:, col_list]
|
|
2739
|
+
duped_coords = duped_coords[duped_coords.event.isin(['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL', 'PENL', 'FAC'])]
|
|
2740
|
+
duped_coords = duped_coords[~duped_coords.duplicated()]
|
|
2741
|
+
event_coords = duped_coords
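# Sketch of the mirror check above: API and ESPN report the same events, but some games have the x (or y)
# axis flipped between sources. The share of exactly mirrored values can be measured explicitly; note the
# original condition is truthy whenever that share is non-zero, whereas a majority threshold (an assumption
# here) would be stricter:
#   mirrored_share = (duped_coords.coords_x_x * -1 == duped_coords.coords_x_y).mean()
#   if mirrored_share > 0.5:
#       duped_coords['coords_x_y'] = duped_coords['coords_x_y'] * -1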
|
|
2742
|
+
events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left')#.drop(columns = ['espn_id'])
|
|
2743
|
+
try:
|
|
2744
|
+
events = fix_missing(single, event_coords, events)
|
|
2745
|
+
events['coordinate_source'] = events['source']
|
|
2746
|
+
except IndexError as e:
|
|
2747
|
+
print('Issue when fixing problematic events. Here it is: ' + str(e))
|
|
2748
|
+
except IndexError as e:
|
|
2749
|
+
if event_coords is not None:
|
|
2750
|
+
print('Okay, ESPN had issues. We will go back to the API for this one. Issue: ' + str(e))
|
|
2751
|
+
events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'event'], how = 'left')
|
|
2752
|
+
try:
|
|
2753
|
+
events = fix_missing(single, event_coords, events)
|
|
2754
|
+
except IndexError as e:
|
|
2755
|
+
print('Issue when fixing problematic events. Here it is: ' + str(e))
|
|
2756
|
+
else:
|
|
2757
|
+
print('This game does not have ESPN or API coordinates. You will get it anyway, though. Issue: ' + str(e))
|
|
2758
|
+
events = single
|
|
2759
|
+
events['coordinate_source'] = 'none'
|
|
2760
|
+
# Determine coordinate source for this path
|
|
2761
|
+
coord_source_for_intermediates = 'espn'
|
|
2762
|
+
if 'source' in events.columns:
|
|
2763
|
+
coord_source_for_intermediates = 'api_espn_hybrid'
|
|
2764
|
+
elif 'coordinate_source' not in events.columns or events.get('coordinate_source', pd.Series(['none'])).iloc[0] == 'none':
|
|
2765
|
+
coord_source_for_intermediates = 'none'
|
|
2766
|
+
|
|
2767
|
+
try:
|
|
2768
|
+
shifts = scrape_html_shifts(season, small_id, live,
|
|
2769
|
+
home_page=pages['home_shifts'],
|
|
2770
|
+
away_page=pages['away_shifts'],
|
|
2771
|
+
summary = pages['summary'],
|
|
2772
|
+
roster_cache = roster_cache,
|
|
2773
|
+
verbose=verbose)
|
|
2774
|
+
finalized = merge_and_prepare(events, shifts, roster_cache, live = live)
|
|
2775
|
+
full_list.append(finalized)
|
|
2776
|
+
second_time = time.time()
|
|
2777
|
+
|
|
2778
|
+
# Track intermediates if requested
|
|
2779
|
+
if return_intermediates:
|
|
2780
|
+
intermediates_list.append({
|
|
2781
|
+
'game_id': game_id,
|
|
2782
|
+
'shifts': shifts.copy() if shifts is not None else None,
|
|
2783
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2784
|
+
'roster_cache': roster_cache.copy() if roster_cache is not None else None,
|
|
2785
|
+
'coordinate_source': coord_source_for_intermediates,
|
|
2786
|
+
'warning': None,
|
|
2787
|
+
'error': None,
|
|
2788
|
+
'raw_html': {
|
|
2789
|
+
'events': pages.get('events'),
|
|
2790
|
+
'roster': pages.get('roster'),
|
|
2791
|
+
'home_shifts': pages.get('home_shifts'),
|
|
2792
|
+
'away_shifts': pages.get('away_shifts'),
|
|
2793
|
+
'summary': pages.get('summary')
|
|
2794
|
+
}
|
|
2795
|
+
})
|
|
2796
|
+
except IndexError as e:
|
|
2797
|
+
print('There was no shift data for this game. Error: ' + str(e))
|
|
2798
|
+
fixed_events = events
|
|
2799
|
+
fixed_events = fixed_events.rename(
|
|
2800
|
+
columns = {'period':'game_period', 'event':'event_type', 'away_team_abbreviated':'away_team',
|
|
2801
|
+
'home_team_abbreviated':'home_team', 'description':'event_description', 'home_team':'hometeamfull',
|
|
2802
|
+
'away_team':'awayteamfull'}
|
|
2803
|
+
).drop(
|
|
2804
|
+
columns = ['original_time', 'other_team', 'strength', 'event_player_str', 'version', 'hometeamfull', 'awayteamfull']
|
|
2805
|
+
).assign(game_warning = 'NO SHIFT DATA', season = season)
|
|
2806
|
+
full_list.append(fixed_events)
|
|
2807
|
+
|
|
2808
|
+
# Track intermediates if requested
|
|
2809
|
+
if return_intermediates:
|
|
2810
|
+
intermediates_list.append({
|
|
2811
|
+
'game_id': game_id,
|
|
2812
|
+
'shifts': None,
|
|
2813
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2814
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2815
|
+
'coordinate_source': coord_source_for_intermediates,
|
|
2816
|
+
'warning': 'NO SHIFT DATA',
|
|
2817
|
+
'error': None,
|
|
2818
|
+
'raw_html': {
|
|
2819
|
+
'events': pages.get('events'),
|
|
2820
|
+
'roster': pages.get('roster'),
|
|
2821
|
+
'home_shifts': pages.get('home_shifts'),
|
|
2822
|
+
'away_shifts': pages.get('away_shifts'),
|
|
2823
|
+
'summary': pages.get('summary')
|
|
2824
|
+
}
|
|
2825
|
+
})
|
|
2826
|
+
second_time = time.time()
|
|
2827
|
+
total_duration = second_time - first_time
|
|
2828
|
+
# Fix this so it doesn't say sourced from ESPN if no coords.
|
|
2829
|
+
print('Successfully scraped ' + str(game_id) + '. Coordinates sourced from ESPN.')
|
|
2830
|
+
try:
|
|
2831
|
+
timing_parts = []
|
|
2832
|
+
if 'parallel_duration' in locals(): timing_parts.append(f"Parallel fetch: {parallel_duration:.2f}s")
|
|
2833
|
+
if 'html_duration' in locals(): timing_parts.append(f"HTML processing: {html_duration:.2f}s")
|
|
2834
|
+
if len(timing_parts) > 0:
|
|
2835
|
+
print("(" + ", ".join(timing_parts) + ")")
|
|
2836
|
+
else:
|
|
2837
|
+
print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
|
|
2838
|
+
except Exception:
|
|
2839
|
+
print(f"⏱️ TOTAL game scrape: {total_duration:.2f}s")
|
|
2840
|
+
i = i + 1
|
|
2841
|
+
|
|
2842
|
+
# If there are issues with ESPN
|
|
2843
|
+
|
|
2844
|
+
except KeyError as e:
|
|
2845
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2846
|
+
print('KeyError: ' + str(e))
|
|
2847
|
+
if return_intermediates:
|
|
2848
|
+
intermediates_list.append({
|
|
2849
|
+
'game_id': game_id,
|
|
2850
|
+
'shifts': None,
|
|
2851
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2852
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2853
|
+
'coordinate_source': None,
|
|
2854
|
+
'warning': None,
|
|
2855
|
+
'error': f'ESPN Hybrid KeyError: {str(e)}',
|
|
2856
|
+
'error_traceback': traceback.format_exc(),
|
|
2857
|
+
'raw_html': {
|
|
2858
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2859
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2860
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2861
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2862
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2863
|
+
}
|
|
2864
|
+
})
|
|
2865
|
+
i = i + 1
|
|
2866
|
+
continue
|
|
2867
|
+
except IndexError as e:
|
|
2868
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2869
|
+
print('IndexError: ' + str(e))
|
|
2870
|
+
if return_intermediates:
|
|
2871
|
+
intermediates_list.append({
|
|
2872
|
+
'game_id': game_id,
|
|
2873
|
+
'shifts': None,
|
|
2874
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2875
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2876
|
+
'coordinate_source': None,
|
|
2877
|
+
'warning': None,
|
|
2878
|
+
'error': f'ESPN Hybrid IndexError: {str(e)}',
|
|
2879
|
+
'error_traceback': traceback.format_exc(),
|
|
2880
|
+
'raw_html': {
|
|
2881
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2882
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2883
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2884
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2885
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2886
|
+
}
|
|
2887
|
+
})
|
|
2888
|
+
i = i + 1
|
|
2889
|
+
continue
|
|
2890
|
+
except TypeError as e:
|
|
2891
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2892
|
+
print('TypeError: ' + str(e))
|
|
2893
|
+
if return_intermediates:
|
|
2894
|
+
intermediates_list.append({
|
|
2895
|
+
'game_id': game_id,
|
|
2896
|
+
'shifts': None,
|
|
2897
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2898
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2899
|
+
'coordinate_source': None,
|
|
2900
|
+
'warning': None,
|
|
2901
|
+
'error': f'ESPN Hybrid TypeError: {str(e)}',
|
|
2902
|
+
'error_traceback': traceback.format_exc(),
|
|
2903
|
+
'raw_html': {
|
|
2904
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2905
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2906
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2907
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2908
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2909
|
+
}
|
|
2910
|
+
})
|
|
2911
|
+
i = i + 1
|
|
2912
|
+
continue
|
|
2913
|
+
except ExpatError as e:
|
|
2914
|
+
print('ESPN also had trouble scraping coordinates for: ' + str(game_id) + '. Looks like we will need to punt this one, unfortunately.')
|
|
2915
|
+
print('ExpatError: ' + str(e))
|
|
2916
|
+
if return_intermediates:
|
|
2917
|
+
intermediates_list.append({
|
|
2918
|
+
'game_id': game_id,
|
|
2919
|
+
'shifts': None,
|
|
2920
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
2921
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2922
|
+
'coordinate_source': None,
|
|
2923
|
+
'warning': None,
|
|
2924
|
+
'error': f'ESPN Hybrid ExpatError: {str(e)}',
|
|
2925
|
+
'error_traceback': traceback.format_exc(),
|
|
2926
|
+
'raw_html': {
|
|
2927
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2928
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2929
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2930
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2931
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2932
|
+
}
|
|
2933
|
+
})
|
|
2934
|
+
i = i + 1
|
|
2935
|
+
continue
|
|
2936
|
+
|
|
2937
|
+
except ConnectionError as e:
|
|
2938
|
+
print('Got a Connection Error, time to sleep.')
|
|
2939
|
+
if return_intermediates:
|
|
2940
|
+
intermediates_list.append({
|
|
2941
|
+
'game_id': game_id if 'game_id' in locals() else game_id_list[i],
|
|
2942
|
+
'shifts': None,
|
|
2943
|
+
'api_coords': None,
|
|
2944
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2945
|
+
'coordinate_source': None,
|
|
2946
|
+
'warning': None,
|
|
2947
|
+
'error': f'ConnectionError: {str(e)}',
|
|
2948
|
+
'error_traceback': traceback.format_exc(),
|
|
2949
|
+
'raw_html': {
|
|
2950
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2951
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2952
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2953
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2954
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2955
|
+
}
|
|
2956
|
+
})
|
|
2957
|
+
time.sleep(10)
|
|
2958
|
+
continue
|
|
2959
|
+
|
|
2960
|
+
except ChunkedEncodingError as e:
|
|
2961
|
+
print('Got a Chunked Encoding Error, time to sleep.')
|
|
2962
|
+
if return_intermediates:
|
|
2963
|
+
intermediates_list.append({
|
|
2964
|
+
'game_id': game_id if 'game_id' in locals() else game_id_list[i],
|
|
2965
|
+
'shifts': None,
|
|
2966
|
+
'api_coords': None,
|
|
2967
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2968
|
+
'coordinate_source': None,
|
|
2969
|
+
'warning': None,
|
|
2970
|
+
'error': f'ChunkedEncodingError: {str(e)}',
|
|
2971
|
+
'error_traceback': traceback.format_exc(),
|
|
2972
|
+
'raw_html': {
|
|
2973
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2974
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2975
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
2976
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
2977
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
2978
|
+
}
|
|
2979
|
+
})
|
|
2980
|
+
time.sleep(10)
|
|
2981
|
+
continue
|
|
2982
|
+
|
|
2983
|
+
except AttributeError as e:
|
|
2984
|
+
print(str(game_id) + ' does not have an HTML report. Here is the error: ' + str(e))
|
|
2985
|
+
print(traceback.format_exc())
|
|
2986
|
+
if return_intermediates:
|
|
2987
|
+
intermediates_list.append({
|
|
2988
|
+
'game_id': game_id if 'game_id' in locals() else game_id_list[i],
|
|
2989
|
+
'shifts': None,
|
|
2990
|
+
'api_coords': None,
|
|
2991
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
2992
|
+
'coordinate_source': None,
|
|
2993
|
+
'warning': None,
|
|
2994
|
+
'error': f'AttributeError: {str(e)}',
|
|
2995
|
+
'error_traceback': traceback.format_exc(),
|
|
2996
|
+
'raw_html': {
|
|
2997
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
2998
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
2999
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
3000
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
3001
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
3002
|
+
}
|
|
3003
|
+
})
|
|
3004
|
+
i = i + 1
|
|
3005
|
+
continue
|
|
3006
|
+
|
|
3007
|
+
except IndexError as e:
|
|
3008
|
+
print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
|
|
3009
|
+
print(traceback.format_exc())
|
|
3010
|
+
if return_intermediates:
|
|
3011
|
+
intermediates_list.append({
|
|
3012
|
+
'game_id': game_id if 'game_id' in locals() else game_id_list[i],
|
|
3013
|
+
'shifts': shifts.copy() if 'shifts' in locals() and shifts is not None else None,
|
|
3014
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
3015
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
3016
|
+
'coordinate_source': None,
|
|
3017
|
+
'warning': None,
|
|
3018
|
+
'error': f'IndexError: {str(e)}',
|
|
3019
|
+
'error_traceback': traceback.format_exc(),
|
|
3020
|
+
'raw_html': {
|
|
3021
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
3022
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
3023
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
3024
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
3025
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
3026
|
+
}
|
|
3027
|
+
})
|
|
3028
|
+
i = i + 1
|
|
3029
|
+
continue
|
|
3030
|
+
|
|
3031
|
+
except ValueError as e:
|
|
3032
|
+
print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
|
|
3033
|
+
print(traceback.format_exc())
|
|
3034
|
+
if return_intermediates:
|
|
3035
|
+
intermediates_list.append({
|
|
3036
|
+
'game_id': game_id if 'game_id' in locals() else game_id_list[i],
|
|
3037
|
+
'shifts': shifts.copy() if 'shifts' in locals() and shifts is not None else None,
|
|
3038
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
3039
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
3040
|
+
'coordinate_source': None,
|
|
3041
|
+
'warning': None,
|
|
3042
|
+
'error': f'ValueError: {str(e)}',
|
|
3043
|
+
'error_traceback': traceback.format_exc(),
|
|
3044
|
+
'raw_html': {
|
|
3045
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
3046
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
3047
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
3048
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
3049
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
3050
|
+
}
|
|
3051
|
+
})
|
|
3052
|
+
i = i + 1
|
|
3053
|
+
continue
|
|
3054
|
+
|
|
3055
|
+
except KeyError as k:
|
|
3056
|
+
print(str(game_id) + ' gave some kind of Key Error. Here is the error: ' + str(k))
|
|
3057
|
+
if return_intermediates:
|
|
3058
|
+
intermediates_list.append({
|
|
3059
|
+
'game_id': game_id if 'game_id' in locals() else game_id_list[i],
|
|
3060
|
+
'shifts': shifts.copy() if 'shifts' in locals() and shifts is not None else None,
|
|
3061
|
+
'api_coords': api_coords.copy() if 'api_coords' in locals() else None,
|
|
3062
|
+
'roster_cache': roster_cache.copy() if 'roster_cache' in locals() and roster_cache is not None else None,
|
|
3063
|
+
'coordinate_source': None,
|
|
3064
|
+
'warning': None,
|
|
3065
|
+
'error': f'KeyError: {str(k)}',
|
|
3066
|
+
'error_traceback': traceback.format_exc(),
|
|
3067
|
+
'raw_html': {
|
|
3068
|
+
'events': pages.get('events') if 'pages' in locals() else None,
|
|
3069
|
+
'roster': pages.get('roster') if 'pages' in locals() else None,
|
|
3070
|
+
'home_shifts': pages.get('home_shifts') if 'pages' in locals() else None,
|
|
3071
|
+
'away_shifts': pages.get('away_shifts') if 'pages' in locals() else None,
|
|
3072
|
+
'summary': pages.get('summary') if 'pages' in locals() else None
|
|
3073
|
+
}
|
|
3074
|
+
})
|
|
3075
|
+
i = i + 1
|
|
3076
|
+
continue
|
|
3077
|
+
|
|
3078
|
+
        except KeyboardInterrupt:
            print('You manually interrupted the scrape. You will get to keep every game you have already completed scraping after just a bit of post-processing. Good bye.')
            global hidden_patrick
            hidden_patrick = 1
            # OPTIMIZED: Concat list to DataFrame
            full = pd.concat(full_list, ignore_index=True) if full_list else pd.DataFrame()
            if len(full) > 0:

                full = full.assign(home_skaters = np.where(~full.home_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                                           (full.home_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
                                                            full.home_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
                                                           full.home_skaters))

                full = full.assign(away_skaters = np.where(~full.away_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                                           (full.away_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
                                                            full.away_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
                                                           full.away_skaters))

                if 'away_on_1' in full.columns:

                    full = full.assign(
                        away_on_1 = np.where((pd.isna(full.away_on_1)) | (full.away_on_1 is None) | (full.away_on_1=='') | (full.away_on_1=='\xa0'), '\xa0', full.away_on_1),
                        away_on_2 = np.where((pd.isna(full.away_on_2)) | (full.away_on_2 is None) | (full.away_on_2=='') | (full.away_on_2=='\xa0'), '\xa0', full.away_on_2),
                        away_on_3 = np.where((pd.isna(full.away_on_3)) | (full.away_on_3 is None) | (full.away_on_3=='') | (full.away_on_3=='\xa0'), '\xa0', full.away_on_3),
                        away_on_4 = np.where((pd.isna(full.away_on_4)) | (full.away_on_4 is None) | (full.away_on_4=='') | (full.away_on_4=='\xa0'), '\xa0', full.away_on_4),
                        away_on_5 = np.where((pd.isna(full.away_on_5)) | (full.away_on_5 is None) | (full.away_on_5=='') | (full.away_on_5=='\xa0'), '\xa0', full.away_on_5),
                        away_on_6 = np.where((pd.isna(full.away_on_6)) | (full.away_on_6 is None) | (full.away_on_6=='') | (full.away_on_6=='\xa0'), '\xa0', full.away_on_6),
                        away_on_7 = np.where((pd.isna(full.away_on_7)) | (full.away_on_7 is None) | (full.away_on_7=='') | (full.away_on_7=='\xa0'), '\xa0', full.away_on_7),
                        away_on_8 = np.where((pd.isna(full.away_on_8)) | (full.away_on_8 is None) | (full.away_on_8=='') | (full.away_on_8=='\xa0'), '\xa0', full.away_on_8),
                        away_on_9 = np.where((pd.isna(full.away_on_9)) | (full.away_on_9 is None) | (full.away_on_9=='') | (full.away_on_9=='\xa0'), '\xa0', full.away_on_9),
                        home_on_1 = np.where((pd.isna(full.home_on_1)) | (full.home_on_1 is None) | (full.home_on_1=='') | (full.home_on_1=='\xa0'), '\xa0', full.home_on_1),
                        home_on_2 = np.where((pd.isna(full.home_on_2)) | (full.home_on_2 is None) | (full.home_on_2=='') | (full.home_on_2=='\xa0'), '\xa0', full.home_on_2),
                        home_on_3 = np.where((pd.isna(full.home_on_3)) | (full.home_on_3 is None) | (full.home_on_3=='') | (full.home_on_3=='\xa0'), '\xa0', full.home_on_3),
                        home_on_4 = np.where((pd.isna(full.home_on_4)) | (full.home_on_4 is None) | (full.home_on_4=='') | (full.home_on_4=='\xa0'), '\xa0', full.home_on_4),
                        home_on_5 = np.where((pd.isna(full.home_on_5)) | (full.home_on_5 is None) | (full.home_on_5=='') | (full.home_on_5=='\xa0'), '\xa0', full.home_on_5),
                        home_on_6 = np.where((pd.isna(full.home_on_6)) | (full.home_on_6 is None) | (full.home_on_6=='') | (full.home_on_6=='\xa0'), '\xa0', full.home_on_6),
                        home_on_7 = np.where((pd.isna(full.home_on_7)) | (full.home_on_7 is None) | (full.home_on_7=='') | (full.home_on_7=='\xa0'), '\xa0', full.home_on_7),
                        home_on_8 = np.where((pd.isna(full.home_on_8)) | (full.home_on_8 is None) | (full.home_on_8=='') | (full.home_on_8=='\xa0'), '\xa0', full.home_on_8),
                        home_on_9 = np.where((pd.isna(full.home_on_9)) | (full.home_on_9 is None) | (full.home_on_9=='') | (full.home_on_9=='\xa0'), '\xa0', full.home_on_9),
                        home_goalie = np.where((pd.isna(full.home_goalie)) | (full.home_goalie is None) | (full.home_goalie=='') | (full.home_goalie=='\xa0'), '\xa0', full.home_goalie),
                        away_goalie = np.where((pd.isna(full.away_goalie)) | (full.away_goalie is None) | (full.away_goalie=='') | (full.away_goalie=='\xa0'), '\xa0', full.away_goalie)
                    )
|
|
3120
|
+
|
|
3121
|
+
# OPTIMIZED: Concat list to DataFrame before return
|
|
3122
|
+
full = pd.concat(full_list, ignore_index=True) if full_list else pd.DataFrame()
|
|
3123
|
+
if return_intermediates:
|
|
3124
|
+
return {'final': full, 'intermediates': intermediates_list}
|
|
3125
|
+
return full
|
|
3126
|
+
|
|
3127
|
+
# OPTIMIZED: Concat list to DataFrame before final processing
|
|
3128
|
+
full = pd.concat(full_list, ignore_index=True) if full_list else pd.DataFrame()
|
|
3129
|
+
|
|
3130
|
+
if len(full) > 0:
|
|
3131
|
+
|
|
3132
|
+
full = full.assign(home_skaters = np.where(~full.home_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
|
|
3133
|
+
(full.home_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
|
|
3134
|
+
full.home_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
|
|
3135
|
+
full.home_skaters))
|
|
3136
|
+
|
|
3137
|
+
full = full.assign(away_skaters = np.where(~full.away_skaters.isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
|
|
3138
|
+
(full.away_skaters.apply(lambda x: len(re.findall('[A-Z]', str(x)))) -
|
|
3139
|
+
full.away_skaters.apply(lambda x: len(re.findall('[G]', str(x))))),
|
|
3140
|
+
full.away_skaters))
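        # Explanatory note (added, not in the original source): when the skater-count field is not
        # already a plain 0-9 integer, it is assumed to hold a string describing the players on ice,
        # so the count is recovered by counting capital letters and subtracting any goalie 'G'.
        # For example, a hypothetical value like 'CRLDDG' gives 6 capital letters minus 1 'G' = 5 skaters.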

        if 'away_on_1' in full.columns:

            full = full.assign(
                away_on_1 = np.where((pd.isna(full.away_on_1)) | (full.away_on_1 is None) | (full.away_on_1=='') | (full.away_on_1=='\xa0'), '\xa0', full.away_on_1),
                away_on_2 = np.where((pd.isna(full.away_on_2)) | (full.away_on_2 is None) | (full.away_on_2=='') | (full.away_on_2=='\xa0'), '\xa0', full.away_on_2),
                away_on_3 = np.where((pd.isna(full.away_on_3)) | (full.away_on_3 is None) | (full.away_on_3=='') | (full.away_on_3=='\xa0'), '\xa0', full.away_on_3),
                away_on_4 = np.where((pd.isna(full.away_on_4)) | (full.away_on_4 is None) | (full.away_on_4=='') | (full.away_on_4=='\xa0'), '\xa0', full.away_on_4),
                away_on_5 = np.where((pd.isna(full.away_on_5)) | (full.away_on_5 is None) | (full.away_on_5=='') | (full.away_on_5=='\xa0'), '\xa0', full.away_on_5),
                away_on_6 = np.where((pd.isna(full.away_on_6)) | (full.away_on_6 is None) | (full.away_on_6=='') | (full.away_on_6=='\xa0'), '\xa0', full.away_on_6),
                away_on_7 = np.where((pd.isna(full.away_on_7)) | (full.away_on_7 is None) | (full.away_on_7=='') | (full.away_on_7=='\xa0'), '\xa0', full.away_on_7),
                away_on_8 = np.where((pd.isna(full.away_on_8)) | (full.away_on_8 is None) | (full.away_on_8=='') | (full.away_on_8=='\xa0'), '\xa0', full.away_on_8),
                away_on_9 = np.where((pd.isna(full.away_on_9)) | (full.away_on_9 is None) | (full.away_on_9=='') | (full.away_on_9=='\xa0'), '\xa0', full.away_on_9),
                home_on_1 = np.where((pd.isna(full.home_on_1)) | (full.home_on_1 is None) | (full.home_on_1=='') | (full.home_on_1=='\xa0'), '\xa0', full.home_on_1),
                home_on_2 = np.where((pd.isna(full.home_on_2)) | (full.home_on_2 is None) | (full.home_on_2=='') | (full.home_on_2=='\xa0'), '\xa0', full.home_on_2),
                home_on_3 = np.where((pd.isna(full.home_on_3)) | (full.home_on_3 is None) | (full.home_on_3=='') | (full.home_on_3=='\xa0'), '\xa0', full.home_on_3),
                home_on_4 = np.where((pd.isna(full.home_on_4)) | (full.home_on_4 is None) | (full.home_on_4=='') | (full.home_on_4=='\xa0'), '\xa0', full.home_on_4),
                home_on_5 = np.where((pd.isna(full.home_on_5)) | (full.home_on_5 is None) | (full.home_on_5=='') | (full.home_on_5=='\xa0'), '\xa0', full.home_on_5),
                home_on_6 = np.where((pd.isna(full.home_on_6)) | (full.home_on_6 is None) | (full.home_on_6=='') | (full.home_on_6=='\xa0'), '\xa0', full.home_on_6),
                home_on_7 = np.where((pd.isna(full.home_on_7)) | (full.home_on_7 is None) | (full.home_on_7=='') | (full.home_on_7=='\xa0'), '\xa0', full.home_on_7),
                home_on_8 = np.where((pd.isna(full.home_on_8)) | (full.home_on_8 is None) | (full.home_on_8=='') | (full.home_on_8=='\xa0'), '\xa0', full.home_on_8),
                home_on_9 = np.where((pd.isna(full.home_on_9)) | (full.home_on_9 is None) | (full.home_on_9=='') | (full.home_on_9=='\xa0'), '\xa0', full.home_on_9),
                home_goalie = np.where((pd.isna(full.home_goalie)) | (full.home_goalie is None) | (full.home_goalie=='') | (full.home_goalie=='\xa0'), '\xa0', full.home_goalie),
                away_goalie = np.where((pd.isna(full.away_goalie)) | (full.away_goalie is None) | (full.away_goalie=='') | (full.away_goalie=='\xa0'), '\xa0', full.away_goalie)
            )
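            # Explanatory note (added, not in the original source): '\xa0' (a non-breaking space) is the
            # scraper's placeholder for "nobody in this on-ice slot", so the assign above normalizes
            # NaN, None and empty strings to that same placeholder. A quick sketch of the idea on a
            # single hypothetical column:
            #     pd.Series([np.nan, '', 'J.DOE']).pipe(
            #         lambda s: np.where(pd.isna(s) | (s == ''), '\xa0', s))
            #     # -> ['\xa0', '\xa0', 'J.DOE']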

        if live == True and 'game_strength_state' in full.columns:

            # Find the point in time where everybody jumps off (i.e., the synthetic shifts end) and get rid of that and everything after.
            # (IF we have such a time)

            if len(
                full[(full.game_strength_state.str.contains('E')) &
                     ((full.game_strength_state != 'EvE')) &
                     (full.game_strength_state.shift(-1) == 'EvE') &
                     (full.game_period == max(full.game_period))]) > 0:

                full = full[full.event_index <=
                            full[(full.game_strength_state.str.contains('E')) &
                                 ((full.game_strength_state != 'EvE')) &
                                 (full.game_strength_state.shift(-1) == 'EvE') &
                                 (full.game_period == max(full.game_period))].event_index.iloc[-1] - 1]

            # If we don't have such a point in time (which can happen when home clock and away clock are misaligned, for example):
            # Then we find the final change and ditch everything beneath it

            elif full[full.event_type=='CHANGE'].iloc[-1].game_strength_state in ['5vE', 'Ev5']:

                full = full[full.event_index <= full[full.event_type=='CHANGE'].iloc[-1].event_index]

            if len(full[(full.event_type == 'SHOT') &
                        (((full.event_team==full.home_team) & (full.away_goalie=='\xa0')) | ((full.event_team==full.away_team) & (full.home_goalie=='\xa0')))
                        ]) > 0:
                latest_bad_event = full[(full.event_type == 'SHOT') &
                                        (((full.event_team==full.home_team) & (full.away_goalie=='\xa0')) | ((full.event_team==full.away_team) & (full.home_goalie=='\xa0')))
                                        ].event_index.min()
                full = full[full.event_index < latest_bad_event]
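            # Added note: 'latest_bad_event' above is actually the *earliest* shot recorded while the
            # opposing goalie slot is still the '\xa0' placeholder; in a live scrape that presumably
            # means the shift data has not caught up to the event feed yet, so everything from that
            # event onward is dropped rather than reported against a phantom empty net.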

    if return_intermediates:
        return {'final': full, 'intermediates': intermediates_list}
    return full
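
# Illustrative sketch (added; not part of the original scraper). When full_scrape_1by1 is called
# with return_intermediates=True, games that raise an error are still captured in the
# 'intermediates' list with keys such as 'game_id', 'error', 'error_traceback' and 'raw_html'
# (see the except blocks above). A hypothetical helper for pulling those failures out of the
# returned payload might look like this:
def _example_failed_games(intermediates):
    """Return {game_id: error string} for every intermediate entry that recorded an error."""
    return {entry['game_id']: entry['error'] for entry in intermediates if entry.get('error')}
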

def full_scrape(game_id_list, live = True, shift = False, return_intermediates = False, verbose = False):

    global hidden_patrick
    hidden_patrick = 0

    result = full_scrape_1by1(game_id_list, live, shift_to_espn = shift, return_intermediates = return_intermediates, verbose = verbose)

    # Handle return_intermediates case
    if return_intermediates:
        df = result['final']
        intermediates_list = result['intermediates']
    else:
        df = result
        intermediates_list = None

    if verbose:
        print('Full scrape complete, we have this many rows:', len(df))

    try:
        df = df.assign(
            event_player_1 = np.where(
                (df.event_player_1 == 'ELIAS PETTERSSON') &
                (df.event_description.str.contains('#', na=False)) &
                (df.event_description.str.contains(' PETTERSSON', na=False)) &
                (df.event_description.str.extract(r'#(\d+) PETTERSSON', expand=False) == '25'),
                'ELIAS PETTERSSON(D)', df.event_player_1),
            event_player_2 = np.where(
                (df.event_player_2 == 'ELIAS PETTERSSON') &
                (
                    # Goal and Petey got A1
                    ((df.event_type == 'GOAL') &
                     (df.event_description.str.contains(': #', na=False)) &
                     (df.event_description.str.contains(' PETTERSSON', na=False)) &
                     (df.event_description.str.extract(r': #(\d+) PETTERSSON', expand=False) == '25')) |
                    # Not a goal, Petey was EP2
                    ((df.event_type != 'GOAL') &
                     (df.event_description.str.contains('VAN #', na=False)) &
                     (df.event_description.str.contains(' PETTERSSON', na=False)) &
                     (df.event_description.str.extract(r'VAN #(\d+) PETTERSSON', expand=False) == '25'))
                ),
                'ELIAS PETTERSSON(D)', df.event_player_2),
            event_player_3 = np.where(
                (df.event_player_3=='ELIAS PETTERSSON') &
                (df.event_description.str.contains('#', na=False)) &
                (df.event_description.str.contains(' PETTERSSON', na=False)) &
                (df.event_description.str.extract(r'#(\d+) PETTERSSON(?:\s|$)', expand=False) == '25'),
                'ELIAS PETTERSSON(D)', df.event_player_3)
        )
    except Exception as e:
        print(e)

    # Don't even need this, we've had this problem with Stutzle for years, just let it be.
    # df.event_description = df.event_description.str.replace('FEHÃ\x89RVÃ\x81RY', 'FEHERVARY').str.replace('BLÃMEL', 'BLAMEL')
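
    # Added clarification: the try block above disambiguates the two Vancouver skaters both listed
    # as ELIAS PETTERSSON. The HTML event descriptions carry jersey numbers, so when the regex pulls
    # '#25 PETTERSSON' out of the description the row is re-labelled 'ELIAS PETTERSSON(D)' to mark
    # the defenceman, while the other PETTERSSON rows are left untouched.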

    if (hidden_patrick==0) and (len(df)>0):

        gids = list(set(df.game_id))
        missing = [x for x in game_id_list if x not in gids]
        if len(missing)>0:
            print('You missed the following games: ' + str(missing))
            print('Let us try scraping each of them one more time.')
            retry_result = full_scrape_1by1(missing, return_intermediates = return_intermediates, verbose = verbose)
            if return_intermediates:
                retry_df = retry_result['final']
                retry_intermediates = retry_result['intermediates']
                df = pd.concat([df, retry_df], ignore_index=True)
                intermediates_list.extend(retry_intermediates)
                return {'final': df, 'intermediates': intermediates_list}
            else:
                df = pd.concat([df, retry_result], ignore_index=True)
                return df
        else:
            if return_intermediates:
                return {'final': df, 'intermediates': intermediates_list}
            return df

    else:
        if return_intermediates:
            return {'final': df, 'intermediates': intermediates_list}
        return df

print("Welcome to the TopDownHockey NHL Scraper, built by Patrick Bacon.")
print("If you enjoy the scraper and would like to support my work, or you have any comments, questions, or concerns, feel free to follow me on Twitter @TopDownHockey or reach out to me via email at patrick.s.bacon@gmail.com. Have fun!")
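
# Hedged usage sketch (added for illustration; the game ID below is hypothetical and the call
# performs real network requests against NHL report/API endpoints):
if __name__ == '__main__':
    # Scrape one completed game and report how many play-by-play rows came back.
    example_pbp = full_scrape([2023020001], live=False, verbose=True)
    print('Example scrape returned', len(example_pbp), 'rows.')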