TopDownHockey-Scraper 6.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +820 -0
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +3285 -0
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper_OG.py +3224 -0
- TopDownHockey_Scraper/__init__.py +26 -0
- TopDownHockey_Scraper/data/handedness.csv +1276 -0
- TopDownHockey_Scraper/name_corrections.py +302 -0
- TopDownHockey_Scraper/portrait_links.csv +2445 -0
- TopDownHockey_Scraper/scrape_nhl_api_events.py +438 -0
- topdownhockey_scraper-6.1.30.dist-info/METADATA +169 -0
- topdownhockey_scraper-6.1.30.dist-info/RECORD +13 -0
- topdownhockey_scraper-6.1.30.dist-info/WHEEL +5 -0
- topdownhockey_scraper-6.1.30.dist-info/licenses/LICENSE +19 -0
- topdownhockey_scraper-6.1.30.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NHL API Events Scraper
|
|
3
|
+
This module implements scrape_api_events() using the NHL API play-by-play endpoint
|
|
4
|
+
to replace ESPN scraping functionality.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import requests
|
|
10
|
+
import json
|
|
11
|
+
import re
|
|
12
|
+
import unicodedata
|
|
13
|
+
import time
|
|
14
|
+
import os
|
|
15
|
+
|
|
16
|
+
# Use the same session pattern as the main scraper
|
|
17
|
+
_session = requests.Session()
|
|
18
|
+
|
|
19
|
+
from TopDownHockey_Scraper.name_corrections import NAME_CORRECTIONS, normalize_player_name
|
|
20
|
+
|
|
21
|
+
# Load packaged handedness data
|
|
22
|
+
_handedness_dict = {}
|
|
23
|
+
_handedness_api_cache = {} # Cache for API lookups during session
|
|
24
|
+
|
|
25
|
+
def _load_handedness_data():
|
|
26
|
+
"""Load handedness data from packaged CSV file"""
|
|
27
|
+
global _handedness_dict
|
|
28
|
+
try:
|
|
29
|
+
# Try importlib.resources first (Python 3.9+)
|
|
30
|
+
try:
|
|
31
|
+
from importlib.resources import files
|
|
32
|
+
data_path = files('TopDownHockey_Scraper').joinpath('data', 'handedness.csv')
|
|
33
|
+
with data_path.open('r') as f:
|
|
34
|
+
df = pd.read_csv(f)
|
|
35
|
+
except (ImportError, TypeError):
|
|
36
|
+
# Fallback for older Python versions
|
|
37
|
+
import pkg_resources
|
|
38
|
+
data_path = pkg_resources.resource_filename('TopDownHockey_Scraper', 'data/handedness.csv')
|
|
39
|
+
df = pd.read_csv(data_path)
|
|
40
|
+
|
|
41
|
+
_handedness_dict = dict(zip(df['player'], df['handedness']))
|
|
42
|
+
except Exception as e:
|
|
43
|
+
# If data file not found, continue without it (API fallback will be used)
|
|
44
|
+
_handedness_dict = {}
|
|
45
|
+
|
|
46
|
+
# Load on module import
|
|
47
|
+
_load_handedness_data()
|
|
48
|
+
|
|
49
|
+
def _get_handedness_from_api(player_id):
    """Return a player's shooting/catching hand via the NHL API.

    Results — including failed lookups, stored as None — are memoized in
    ``_handedness_api_cache`` so each player is fetched at most once per
    session. Returns None when ``player_id`` is None or the request fails.
    """
    if player_id is None:
        return None

    cache_key = str(int(player_id))

    # Serve from the session cache when possible (avoids a network round-trip).
    if cache_key in _handedness_api_cache:
        return _handedness_api_cache[cache_key]

    try:
        resp = _session.get(
            f"https://api-web.nhle.com/v1/player/{cache_key}/landing",
            timeout=10,
        )
        resp.raise_for_status()
        result = resp.json().get('shootsCatches')
    except Exception:
        # Negative results are cached too, so a bad ID isn't retried.
        result = None

    _handedness_api_cache[cache_key] = result
    return result
|
|
71
|
+
|
|
72
|
+
def _get_player_name(player_id, player_mapping_dict):
|
|
73
|
+
"""Get player name from ID using player mapping dictionary from API"""
|
|
74
|
+
if player_id is None:
|
|
75
|
+
return None
|
|
76
|
+
|
|
77
|
+
player_id_str = str(int(player_id)) if pd.notna(player_id) else None
|
|
78
|
+
return player_mapping_dict.get(player_id_str, None)
|
|
79
|
+
|
|
80
|
+
def _map_event_type(type_desc_key, type_code=None):
|
|
81
|
+
"""Map NHL API event types to ESPN-style event codes"""
|
|
82
|
+
# NHL API uses typeDescKey for event descriptions
|
|
83
|
+
event_mapping = {
|
|
84
|
+
'shot-on-goal': 'SHOT',
|
|
85
|
+
'shot-blocked': 'BLOCK',
|
|
86
|
+
'shot-missed': 'MISS',
|
|
87
|
+
'goal': 'GOAL',
|
|
88
|
+
'hit': 'HIT',
|
|
89
|
+
'giveaway': 'GIVE',
|
|
90
|
+
'takeaway': 'TAKE',
|
|
91
|
+
'faceoff': 'FAC',
|
|
92
|
+
'penalty': 'PENL',
|
|
93
|
+
'stoppage': 'STOP',
|
|
94
|
+
'period-start': 'PSTR',
|
|
95
|
+
'period-end': 'PEND',
|
|
96
|
+
'game-end': 'GEND',
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
# Handle typeDescKey (string)
|
|
100
|
+
if isinstance(type_desc_key, str):
|
|
101
|
+
type_lower = type_desc_key.lower()
|
|
102
|
+
# Try exact match first
|
|
103
|
+
if type_lower in event_mapping:
|
|
104
|
+
return event_mapping[type_lower]
|
|
105
|
+
# Try partial matches
|
|
106
|
+
if 'shot' in type_lower and 'goal' in type_lower:
|
|
107
|
+
return 'SHOT'
|
|
108
|
+
elif 'shot' in type_lower and 'block' in type_lower:
|
|
109
|
+
return 'BLOCK'
|
|
110
|
+
elif 'shot' in type_lower and 'miss' in type_lower:
|
|
111
|
+
return 'MISS'
|
|
112
|
+
elif 'goal' in type_lower:
|
|
113
|
+
return 'GOAL'
|
|
114
|
+
elif 'hit' in type_lower:
|
|
115
|
+
return 'HIT'
|
|
116
|
+
elif 'giveaway' in type_lower or 'give' in type_lower:
|
|
117
|
+
return 'GIVE'
|
|
118
|
+
elif 'takeaway' in type_lower or 'take' in type_lower:
|
|
119
|
+
return 'TAKE'
|
|
120
|
+
elif 'faceoff' in type_lower or 'face-off' in type_lower:
|
|
121
|
+
return 'FAC'
|
|
122
|
+
elif 'penalty' in type_lower:
|
|
123
|
+
return 'PENL'
|
|
124
|
+
elif 'stop' in type_lower:
|
|
125
|
+
return 'STOP'
|
|
126
|
+
|
|
127
|
+
# Handle typeCode (numeric) as fallback
|
|
128
|
+
if isinstance(type_code, int):
|
|
129
|
+
# Common NHL API event type codes
|
|
130
|
+
type_id_mapping = {
|
|
131
|
+
502: 'GOAL',
|
|
132
|
+
503: 'HIT',
|
|
133
|
+
504: 'GIVE', # Note: giveaway and takeaway may share codes, check details
|
|
134
|
+
505: 'SHOT',
|
|
135
|
+
506: 'BLOCK',
|
|
136
|
+
507: 'MISS',
|
|
137
|
+
508: 'TAKE',
|
|
138
|
+
}
|
|
139
|
+
if type_code in type_id_mapping:
|
|
140
|
+
return type_id_mapping[type_code]
|
|
141
|
+
|
|
142
|
+
return 'UNKNOWN'
|
|
143
|
+
|
|
144
|
+
def _extract_player_id_from_event(event_details, event_type):
|
|
145
|
+
"""Extract the primary player ID from event details based on event type"""
|
|
146
|
+
if not event_details:
|
|
147
|
+
return None
|
|
148
|
+
|
|
149
|
+
# Map event types to their corresponding player ID fields (NHL API field names)
|
|
150
|
+
# Note: For BLOCK events, ESPN shows the shooting player (whose shot was blocked), not the blocker
|
|
151
|
+
player_id_fields = {
|
|
152
|
+
'SHOT': ['shootingPlayerId', 'scoringPlayerId', 'playerId'],
|
|
153
|
+
'GOAL': ['scoringPlayerId', 'shootingPlayerId', 'playerId'],
|
|
154
|
+
'HIT': ['hittingPlayerId', 'playerId'],
|
|
155
|
+
'BLOCK': ['shootingPlayerId', 'playerId'], # ESPN shows shooter, not blocker
|
|
156
|
+
'GIVE': ['playerId', 'committedByPlayerId'],
|
|
157
|
+
'TAKE': ['playerId', 'takingPlayerId'],
|
|
158
|
+
'MISS': ['shootingPlayerId', 'playerId'],
|
|
159
|
+
'FAC': ['winningPlayerId', 'playerId'],
|
|
160
|
+
'PENL': ['committedByPlayerId', 'playerId'],
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
# Try to find the appropriate player ID field
|
|
164
|
+
fields_to_try = player_id_fields.get(event_type, ['playerId'])
|
|
165
|
+
|
|
166
|
+
for field in fields_to_try:
|
|
167
|
+
if field in event_details and event_details[field] is not None:
|
|
168
|
+
player_id = event_details[field]
|
|
169
|
+
# Ensure it's a valid ID (not 0 or empty)
|
|
170
|
+
if player_id and player_id != 0:
|
|
171
|
+
return player_id
|
|
172
|
+
|
|
173
|
+
# Fallback: try common field names
|
|
174
|
+
for common_field in ['playerId', 'player', 'id']:
|
|
175
|
+
if common_field in event_details and event_details[common_field] is not None:
|
|
176
|
+
player_id = event_details[common_field]
|
|
177
|
+
if player_id and player_id != 0:
|
|
178
|
+
return player_id
|
|
179
|
+
|
|
180
|
+
return None
|
|
181
|
+
|
|
182
|
+
def scrape_api_events(game_id, drop_description=True, shift_to_espn=False, verbose=False):
    """
    Scrape event coordinates and data from NHL API play-by-play endpoint.

    This function replaces scrape_espn_events() by using the official NHL API.

    Parameters:
    -----------
    game_id : int
        NHL game ID (e.g., 2025020331)
    drop_description : bool, default True
        Whether to drop the description column from the output
    shift_to_espn : bool, default False
        If True, raises KeyError to trigger ESPN fallback (for compatibility)
    verbose : bool, default False
        If True, print detailed timing information

    Returns:
    --------
    pd.DataFrame
        DataFrame with columns: coords_x, coords_y, event_player_1, event,
        game_seconds, period, version, goalie_id, goalie_name, miss_reason,
        shooter_handedness (and optionally description)

    Raises:
    -------
    KeyError
        When shift_to_espn=True, when the API request fails, or when the
        response has no 'plays' key — callers use this to fall back to ESPN.
    """

    if shift_to_espn:
        raise KeyError("shift_to_espn=True requested, triggering ESPN fallback")

    # Fetch play-by-play data from NHL API
    api_url = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"

    try:
        # TIME: Network request
        net_start = time.time()
        response = _session.get(api_url, timeout=30)
        net_duration = time.time() - net_start
        if verbose:
            print(f' ⏱️ API events network request: {net_duration:.2f}s')

        response.raise_for_status()

        # TIME: JSON parsing
        parse_start = time.time()
        api_data = json.loads(response.content)
        # Build a roster lookup: "FIRST LAST" (uppercased) keyed later by the
        # stringified player ID; 'link' is the NHL headshot URL.
        player_mapping_df = pd.DataFrame(api_data['rosterSpots'])
        player_mapping_df = player_mapping_df.assign(player = (player_mapping_df['firstName'].apply(lambda x: x['default']) + ' ' + player_mapping_df['lastName'].apply(lambda x: x['default'])).str.upper(),
                                                     link = 'https://assets.nhle.com/mugs/nhl/latest/' + player_mapping_df['playerId'].astype(str) + '.png',
                                                     id = player_mapping_df['playerId']).loc[:, ['player', 'link', 'id']]

        # Disambiguate players with identical names using their NHL API IDs
        # ELIAS PETTERSSON (D) - defenseman, ID 8483678 - vs forward ELIAS PETTERSSON
        player_mapping_df['player'] = np.where(
            player_mapping_df['id'] == 8483678,
            'ELIAS PETTERSSON(D)',
            player_mapping_df['player']
        )

        # Create dictionary mapping player ID to name for fast lookup
        player_mapping_dict = dict(zip(player_mapping_df['id'].astype(str), player_mapping_df['player']))
        parse_duration = time.time() - parse_start
        if verbose:
            print(f' ⏱️ API JSON parsing: {parse_duration:.2f}s')
    except Exception as e:
        # Re-raised as KeyError so existing callers' ESPN-fallback logic fires.
        raise KeyError(f"Failed to fetch NHL API data for game {game_id}: {e}")

    # Extract plays array
    if 'plays' not in api_data:
        raise KeyError(f"No 'plays' key found in NHL API response for game {game_id}")

    plays = api_data['plays']

    if not plays:
        # Return empty DataFrame with correct columns
        columns = ['coords_x', 'coords_y', 'event_player_1', 'event', 'game_seconds', 'period', 'version', 'goalie_id', 'goalie_name', 'miss_reason', 'shooter_handedness']
        if not drop_description:
            columns.append('description')
        return pd.DataFrame(columns=columns)

    # Parse plays into list of dictionaries
    events_list = []

    for play in plays:
        # Extract period information
        period_desc = play.get('periodDescriptor', {})
        period = period_desc.get('number', 1)

        # Extract time information
        time_in_period = play.get('timeInPeriod', '')
        time_remaining = play.get('timeRemaining', '')  # NOTE(review): currently unused

        # Parse time string (format: "MM:SS")
        if time_in_period:
            try:
                time_parts = time_in_period.split(':')
                minutes = int(time_parts[0])
                seconds = int(time_parts[1])
            except (ValueError, IndexError):
                minutes, seconds = 0, 0
        else:
            minutes, seconds = 0, 0

        # Calculate game_seconds (elapsed seconds from game start; periods are
        # 20 minutes = 1200s)
        if period < 5:
            game_seconds = ((period - 1) * 1200) + (minutes * 60) + seconds
        else:
            game_seconds = 3900 # Period 5+ (shootout): fixed sentinel, past regulation + 5-min OT

        # Extract event type
        event_type_code = play.get('typeCode')
        event_type_desc = play.get('typeDescKey', '')

        # Map to ESPN-style event code
        event_code = _map_event_type(event_type_desc, event_type_code)

        # Extract coordinates from details
        details = play.get('details', {})
        coords_x = details.get('xCoord')
        coords_y = details.get('yCoord')

        # Handle None coordinates
        if coords_x is None or coords_y is None:
            # Skip events without coordinates (except faceoffs which can be at 0,0)
            if event_code != 'FAC':
                continue
            # Set faceoff coordinates to 0 if missing
            coords_x = coords_x if coords_x is not None else 0
            coords_y = coords_y if coords_y is not None else 0

        # Extract player ID and map to name
        player_id = _extract_player_id_from_event(details, event_code)
        player_name = _get_player_name(player_id, player_mapping_dict) if player_id else None

        # Extract goalie ID and map to name
        goalie_id = details.get('goalieInNetId')
        goalie_name = _get_player_name(goalie_id, player_mapping_dict) if goalie_id else None

        # Extract miss reason (only present for missed shots)
        miss_reason = details.get('reason')

        # Extract description (API may return a localized dict or a string)
        description = play.get('description', {})
        if isinstance(description, dict):
            description = description.get('default', '')
        description = str(description) if description else ''

        # Only include events with coordinates (matching ESPN behavior)
        # For faceoffs, allow missing player names (they'll be handled in merge)
        if coords_x is not None and coords_y is not None:

            events_list.append({
                'coords_x': int(coords_x),
                'coords_y': int(coords_y),
                'event_player_1': player_name,
                'event': event_code,
                'game_seconds': game_seconds,
                'period': period,
                'description': description,
                'time_in_period': time_in_period,
                'player_id': player_id,
                'goalie_id': goalie_id,
                'goalie_name': goalie_name,
                'miss_reason': miss_reason,
            })

    if not events_list:
        # Return empty DataFrame with correct columns
        columns = ['coords_x', 'coords_y', 'event_player_1', 'event', 'game_seconds', 'period', 'version', 'goalie_id', 'goalie_name', 'miss_reason', 'shooter_handedness']
        if not drop_description:
            columns.append('description')
        return pd.DataFrame(columns=columns)

    # Convert to DataFrame
    events_df = pd.DataFrame(events_list)

    # Filter out events without player names (matching ESPN behavior)
    # ESPN filters: events must have coords AND player names
    events_df = events_df[events_df['event_player_1'].notna()]

    # Normalize player names
    # NOTE(review): goalie_name can be None for empty-net events — assumes
    # normalize_player_name handles None; confirm in name_corrections.py
    events_df['event_player_1'] = events_df['event_player_1'].apply(normalize_player_name)
    events_df['goalie_name'] = events_df['goalie_name'].apply(normalize_player_name)

    # Filter again after normalization (in case normalization resulted in empty strings)
    events_df = events_df[events_df['event_player_1'] != '']

    # Add shooter handedness from packaged data, with API fallback for unknowns
    def get_handedness(row):
        # Try packaged data first (fast)
        player_name = row['event_player_1']
        if player_name in _handedness_dict:
            return _handedness_dict[player_name]
        # Fall back to NHL API for unknown players (slow, but cached)
        return _get_handedness_from_api(row.get('player_id'))

    events_df['shooter_handedness'] = events_df.apply(get_handedness, axis=1)

    # Calculate priority for sorting (matching ESPN function)
    # NOTE(review): 'DELPEN' and 'CHANGE' are never produced by
    # _map_event_type — kept here to mirror the ESPN priority scheme.
    events_df['priority'] = np.where(
        events_df['event'].isin(['TAKE', 'GIVE', 'MISS', 'HIT', 'SHOT', 'BLOCK']), 1,
        np.where(events_df['event'] == 'GOAL', 2,
        np.where(events_df['event'] == 'STOP', 3,
        np.where(events_df['event'] == 'DELPEN', 4,
        np.where(events_df['event'] == 'PENL', 5,
        np.where(events_df['event'] == 'CHANGE', 6,
        np.where(events_df['event'] == 'PEND', 7,
        np.where(events_df['event'] == 'GEND', 8,
        np.where(events_df['event'] == 'FAC', 9, 0)))))))))

    # Sort by period, game_seconds, event_player_1, priority
    events_df = events_df.sort_values(
        by=['period', 'game_seconds', 'event_player_1', 'priority']
    ).reset_index(drop=True)

    # Calculate version numbers for duplicate events (matching ESPN logic):
    # version distinguishes repeated (event, player, second) rows after sorting.
    events_df['version'] = 0

    # Version 1: same event, player, and time as previous
    events_df['version'] = np.where(
        (events_df['event'] == events_df['event'].shift()) &
        (events_df['event_player_1'] == events_df['event_player_1'].shift()) &
        (events_df['event_player_1'] != '') &
        (events_df['game_seconds'] == events_df['game_seconds'].shift()),
        1, events_df['version']
    )

    # Version 2: same event, player, and time as 2 rows ago
    # (Penalty Shot rows are excluded so a PS attempt isn't versioned as a dup)
    events_df['version'] = np.where(
        (events_df['event'] == events_df['event'].shift(2)) &
        (events_df['event_player_1'] == events_df['event_player_1'].shift(2)) &
        (events_df['game_seconds'] == events_df['game_seconds'].shift(2)) &
        (events_df['event_player_1'] != '') &
        (~events_df['description'].str.contains('Penalty Shot', na=False)),
        2, events_df['version']
    )

    # Version 3: same event, player, and time as 3 rows ago
    events_df['version'] = np.where(
        (events_df['event'] == events_df['event'].shift(3)) &
        (events_df['event_player_1'] == events_df['event_player_1'].shift(3)) &
        (events_df['game_seconds'] == events_df['game_seconds'].shift(3)) &
        (events_df['event_player_1'] != ''),
        3, events_df['version']
    )

    # Clip coordinates to valid ranges (matching ESPN function)
    # NOTE(review): only x > 99 and y < -42 are clamped; x < -99 and y > 42
    # pass through unchanged — confirm this asymmetry matches the ESPN scraper.
    events_df['coords_x'] = np.where(events_df['coords_x'] > 99, 99, events_df['coords_x'])
    events_df['coords_y'] = np.where(events_df['coords_y'] < -42, -42, events_df['coords_y'])

    # Select final columns (matching ESPN column order)
    final_columns = ['coords_x', 'coords_y', 'event_player_1', 'event', 'game_seconds', 'period', 'version', 'goalie_id', 'goalie_name', 'miss_reason', 'shooter_handedness']
    if not drop_description:
        final_columns.append('description')

    events_df = events_df[final_columns]

    return events_df
|
|
438
|
+
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: TopDownHockey_Scraper
|
|
3
|
+
Version: 6.1.30
|
|
4
|
+
Summary: The TopDownHockey Scraper
|
|
5
|
+
Home-page: https://github.com/TopDownHockey/TopDownHockey_Scraper
|
|
6
|
+
Author: Patrick Bacon
|
|
7
|
+
Author-email: patrick.s.bacon@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/TopDownHockey/TopDownHockey_Scraper/issues
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.6
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: pandas
|
|
18
|
+
Requires-Dist: bs4
|
|
19
|
+
Requires-Dist: requests
|
|
20
|
+
Requires-Dist: xmltodict
|
|
21
|
+
Requires-Dist: lxml
|
|
22
|
+
Requires-Dist: natsort
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: author-email
|
|
25
|
+
Dynamic: classifier
|
|
26
|
+
Dynamic: description
|
|
27
|
+
Dynamic: description-content-type
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: license
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
Dynamic: project-url
|
|
32
|
+
Dynamic: requires-dist
|
|
33
|
+
Dynamic: requires-python
|
|
34
|
+
Dynamic: summary
|
|
35
|
+
|
|
36
|
+
# TopDownHockey EliteProspects Scraper
|
|
37
|
+
|
|
38
|
+
## By Patrick Bacon, made possible by the work of Marcus Sjölin and Harry Shomer.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
This is a package built for scraping two data sources:
|
|
43
|
+
|
|
44
|
+
1. The NHL's Play-by-Play Reports, which come in the form of HTML/API reports from the NHL and JSON reports from ESPN.
|
|
45
|
+
|
|
46
|
+
2. Elite Prospects, an extremely valuable website which makes hockey data for thousands of leagues available to the public.
|
|
47
|
+
|
|
48
|
+
This package is strictly built for end users who wish to scrape data for personal use. If you are interested in using Elite Prospects data for professional purposes, I recommend you look into the <a href="https://www.eliteprospects.com/api" >Elite Prospects API</a>.
|
|
49
|
+
|
|
50
|
+
While using the scraper, please be mindful of EliteProspects servers.
|
|
51
|
+
|
|
52
|
+
# Installation
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
You can install the package by entering the following command in terminal:
|
|
57
|
+
|
|
58
|
+
<code>pip install TopDownHockey_Scraper</code>
|
|
59
|
+
|
|
60
|
+
If you're interested in using the NHL Play-By-Play scraper, import that module using this function in Python:
|
|
61
|
+
|
|
62
|
+
<code>import TopDownHockey_Scraper.TopDownHockey_NHL_Scraper as tdhnhlscrape</code>
|
|
63
|
+
|
|
64
|
+
If you're interested in using the Elite Prospects scraper, import that module using this function in Python:
|
|
65
|
+
|
|
66
|
+
<code>import TopDownHockey_Scraper.TopDownHockey_EliteProspects_Scraper as tdhepscrape</code>
|
|
67
|
+
|
|
68
|
+
# User-End Functions (NHL Scraper)
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
### scrape_full_schedule(start_date, end_date)
|
|
73
|
+
|
|
74
|
+
Returns the NHL's schedule from the API for all games in the given date range.
|
|
75
|
+
|
|
76
|
+
Example:
|
|
77
|
+
|
|
78
|
+
<code>tdhnhlscrape.scrape_full_schedule()</code>
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
### full_scrape(game_id_list, shift = True)
|
|
83
|
+
|
|
84
|
+
Returns a dataframe containing play-by-play data for a list of game ids.
|
|
85
|
+
|
|
86
|
+
<ul>
|
|
87
|
+
<li>game_id_list: A list of NHL game ids.</li>
|
|
88
|
+
</ul>
|
|
89
|
+
|
|
90
|
+
Example:
|
|
91
|
+
|
|
92
|
+
<code>tdhnhlscrape.full_scrape([2023020179, 2023020180, 2023020181])</code>
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# User-End Functions (Elite Prospects Scraper)
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### get_skaters(leagues, seasons)
|
|
100
|
+
|
|
101
|
+
Returns a dataframe containing statistics for all skaters in a target set of league(s) and season(s).
|
|
102
|
+
|
|
103
|
+
<ul>
|
|
104
|
+
<li>leagues: One or multiple leagues. If one league, enter as a string i.e; "nhl". If multiple leagues, enter as a tuple or list i.e; ("nhl", "ahl").</li>
|
|
105
|
+
<li>seasons: One or multiple seasons. If one season, enter as a string i.e; "2018-2019". If multiple seasons, enter as a tuple or list i.e; ("2018-2019", "2019-2020").</li>
|
|
106
|
+
</ul>
|
|
107
|
+
|
|
108
|
+
Example:
|
|
109
|
+
|
|
110
|
+
<code>tdhepscrape.get_skaters(("nhl", "ahl"), ("2018-2019", "2019-2020"))</code>
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
### get_goalies(leagues, seasons)
|
|
115
|
+
|
|
116
|
+
Returns a dataframe containing statistics for all goalies in a target set of league(s) and season(s).
|
|
117
|
+
|
|
118
|
+
<ul>
|
|
119
|
+
<li>leagues: One or multiple leagues. If one league, enter as a string i.e; "nhl". If multiple leagues, enter as a tuple or list i.e; ("nhl", "ahl").</li>
|
|
120
|
+
<li>seasons: One or multiple seasons. If one season, enter as a string i.e; "2018-2019". If multiple seasons, enter as a tuple or list i.e; ("2018-2019", "2019-2020").</li>
|
|
121
|
+
</ul>
|
|
122
|
+
|
|
123
|
+
Example:
|
|
124
|
+
|
|
125
|
+
<code>tdhepscrape.get_goalies("khl", "2015-2016")</code>
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
### get_player_information(dataframe)
|
|
130
|
+
|
|
131
|
+
Returns a dataframe containing bio information for all skaters or goalies (or both) within a target dataframe.
|
|
132
|
+
|
|
133
|
+
<ul>
|
|
134
|
+
<li>dataframe: The dataframe returned by one of the previous two commands.</li>
|
|
135
|
+
</ul>
|
|
136
|
+
|
|
137
|
+
Example:
|
|
138
|
+
|
|
139
|
+
Say you obtain skater data for the KHL in 2020-2021 and store that as a dataframe called <code>output</code>. You can run this function to get bio information for every player in that league's scrape.
|
|
140
|
+
|
|
141
|
+
<code>output = tdhepscrape.get_skaters("khl", "2020-2021")</code>
|
|
142
|
+
|
|
143
|
+
<code>tdhepscrape.get_player_information(output)</code>
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### add_player_information(dataframe)
|
|
148
|
+
|
|
149
|
+
Returns a dataframe containing bio information for all skaters or goalies (or both) within a target dataframe as well as the statistics from the original dataframe.
|
|
150
|
+
|
|
151
|
+
<ul>
|
|
152
|
+
<li>dataframe: The dataframe returned by one of the previous two commands.</li>
|
|
153
|
+
</ul>
|
|
154
|
+
|
|
155
|
+
Example:
|
|
156
|
+
|
|
157
|
+
Say you obtain skater data for the KHL in 2020-2021 and store that as a dataframe called <code>output</code>. You can run this function to get bio information for every player in that league's scrape.
|
|
158
|
+
|
|
159
|
+
<code>output = tdhepscrape.get_skaters("khl", "2020-2021")</code>
|
|
160
|
+
|
|
161
|
+
<code>tdhepscrape.add_player_information(output)</code>
|
|
162
|
+
|
|
163
|
+
# Comments, Questions, and Concerns.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
My goal was to make this package as error-proof as possible. I believe I've accounted for every issue that could potentially throw off a scrape, but it's possible I've missed something.
|
|
168
|
+
|
|
169
|
+
If any issues arise, or you have any questions about the package, please do not hesitate to contact me on Twitter at @TopDownHockey or email me directly at patrick.s.bacon@gmail.com.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
|
|
2
|
+
TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=1BqhU6-YZfrryXizCM-bE0axljham7YwWASRkonD6nE,184040
|
|
3
|
+
TopDownHockey_Scraper/TopDownHockey_NHL_Scraper_OG.py,sha256=NaNuLqeVVWuJHjM-hsgBiv8dvh7vkImuL5uB5bQXwyc,180528
|
|
4
|
+
TopDownHockey_Scraper/__init__.py,sha256=xojegoBkA9Tcz-nnKENGT9pVF1oxBJaU2x8y8KpEj7w,83
|
|
5
|
+
TopDownHockey_Scraper/name_corrections.py,sha256=fewjGSliIY7JDhreEYj5Ml45B2b71BJseoXskE7n4k4,11154
|
|
6
|
+
TopDownHockey_Scraper/portrait_links.csv,sha256=O47YWe3wEpTUDXPpx2-t3VTRD90lntNrNT6IYST5x28,181831
|
|
7
|
+
TopDownHockey_Scraper/scrape_nhl_api_events.py,sha256=TZLQHewxUx8GHadRompWYtoe3Wcv7Z_OGs6vQUksj3w,17450
|
|
8
|
+
TopDownHockey_Scraper/data/handedness.csv,sha256=rA4p7YnBrEMfhmtX4nRO787UJwpXM2KZgWKPgOiNSac,20823
|
|
9
|
+
topdownhockey_scraper-6.1.30.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
|
|
10
|
+
topdownhockey_scraper-6.1.30.dist-info/METADATA,sha256=RacztwBVUvVT8PpCEyOEqCP8-U3fstbf8OBm8iyyTIs,5671
|
|
11
|
+
topdownhockey_scraper-6.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
topdownhockey_scraper-6.1.30.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
|
|
13
|
+
topdownhockey_scraper-6.1.30.dist-info/RECORD,,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2018 The Python Packaging Authority
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
TopDownHockey_Scraper
|