wsba-hockey 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey-0.1.0/LICENSE +21 -0
- wsba_hockey-0.1.0/PKG-INFO +72 -0
- wsba_hockey-0.1.0/README.md +57 -0
- wsba_hockey-0.1.0/pyproject.toml +18 -0
- wsba_hockey-0.1.0/setup.cfg +4 -0
- wsba_hockey-0.1.0/src/wsba_hockey/tools/scraping.py +625 -0
- wsba_hockey-0.1.0/src/wsba_hockey/wsba_main.py +282 -0
- wsba_hockey-0.1.0/src/wsba_hockey.egg-info/PKG-INFO +72 -0
- wsba_hockey-0.1.0/src/wsba_hockey.egg-info/SOURCES.txt +9 -0
- wsba_hockey-0.1.0/src/wsba_hockey.egg-info/dependency_links.txt +1 -0
- wsba_hockey-0.1.0/src/wsba_hockey.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Owen Singh, WeakSide Breakout Analysis
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,72 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: wsba_hockey
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: WeakSide Breakout's complete Python package of access to hockey data, primarily including the scraping of National Hockey League schedule, play-by-play, and shifts information.
|
5
|
+
Author-email: Owen Singh <owenbksingh@gmail.com>
|
6
|
+
Project-URL: Homepage, https://github.com/owensingh38/wsba_hockey/
|
7
|
+
Keywords: wsba,hockey,scraping,NHL,analytics
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Requires-Python: >=3.8
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
License-File: LICENSE
|
14
|
+
Dynamic: license-file
|
15
|
+
|
16
|
+
# WSBA HOCKEY
|
17
|
+

|
18
|
+
|
19
|
+
### A Python package for scraping and analyzing hockey data under the motto: ***Evaluating, analyzing, and understanding the game of hockey through the lens of different analytical methods, including incorporation of analytics.***
|
20
|
+
|
21
|
+
## INSTALLATION AND USAGE
|
22
|
+
```bash
|
23
|
+
pip install wsba_hockey
|
24
|
+
```
|
25
|
+
|
26
|
+
```python
|
27
|
+
import wsba_hockey as wsba
|
28
|
+
```
|
29
|
+
|
30
|
+
## ALL FEATURES
|
31
|
+
### Note: Features not yet included are marked with *
|
32
|
+
|
33
|
+
|
34
|
+
## SCRAPING
|
35
|
+
### NHL Play-by-Play (of any game frame up to a full season)
|
36
|
+
#### Functions:
|
37
|
+
|
38
|
+
```python
|
39
|
+
wsba.nhl_scrape_game(['2024020918'],split_shifts=False,remove=['game-end'])
|
40
|
+
wsba.nhl_scrape_season('20242025',split_shifts=False,remove=['game-end'],local=True)
|
41
|
+
```
|
42
|
+
|
43
|
+
### NHL Season Information
|
44
|
+
|
45
|
+
```python
|
46
|
+
wsba.nhl_scrape_schedule('20242025')
|
47
|
+
wsba.nhl_scrape_seasons_info(seasons=['20212022','20222023','20232024','20242025'])
|
48
|
+
wsba.nhl_scrape_standings(arg = '2024-03-20')
|
49
|
+
```
|
50
|
+
|
51
|
+
### NHL Rosters and Player Information
|
52
|
+
|
53
|
+
```python
|
54
|
+
wsba.nhl_scrape_player_info(wsba.nhl_scrape_roster('20242025'))
|
55
|
+
```
|
56
|
+
|
57
|
+
## DATA ANALYTICS
|
58
|
+
### Expected Goals (WeakSide Breakout and MoneyPuck models)*
|
59
|
+
### Goal Impacts and Shot Analysis*
|
60
|
+
### Stat Aggregation*
|
61
|
+
### Shot Plotting (Plots, Heatmaps, etc.)*
|
62
|
+
|
63
|
+
## REPOSITORY
|
64
|
+
### Past Season Play-by-Play*
|
65
|
+
### Team Information*
|
66
|
+
### Schedule*
|
67
|
+
|
68
|
+
## ACKNOWLEDGEMENTS AND CREDITS
|
69
|
+
### Huge thanks to the following:
|
70
|
+
Harry Shomer - Creator of the hockey_scraper package, which contains select utils functions utilized in this package and otherwise inspires the creation of this package.
|
71
|
+
|
72
|
+
Dan Morse - Creator of the hockeyR package; another important inspiration and model for developing an NHL scraper.
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# WSBA HOCKEY
|
2
|
+

|
3
|
+
|
4
|
+
### A Python package for scraping and analyzing hockey data under the motto: ***Evaluating, analyzing, and understanding the game of hockey through the lens of different analytical methods, including incorporation of analytics.***
|
5
|
+
|
6
|
+
## INSTALLATION AND USAGE
|
7
|
+
```bash
|
8
|
+
pip install wsba_hockey
|
9
|
+
```
|
10
|
+
|
11
|
+
```python
|
12
|
+
import wsba_hockey as wsba
|
13
|
+
```
|
14
|
+
|
15
|
+
## ALL FEATURES
|
16
|
+
### Note: Features not yet included are marked with *
|
17
|
+
|
18
|
+
|
19
|
+
## SCRAPING
|
20
|
+
### NHL Play-by-Play (of any game frame up to a full season)
|
21
|
+
#### Functions:
|
22
|
+
|
23
|
+
```python
|
24
|
+
wsba.nhl_scrape_game(['2024020918'],split_shifts=False,remove=['game-end'])
|
25
|
+
wsba.nhl_scrape_season('20242025',split_shifts=False,remove=['game-end'],local=True)
|
26
|
+
```
|
27
|
+
|
28
|
+
### NHL Season Information
|
29
|
+
|
30
|
+
```python
|
31
|
+
wsba.nhl_scrape_schedule('20242025')
|
32
|
+
wsba.nhl_scrape_seasons_info(seasons=['20212022','20222023','20232024','20242025'])
|
33
|
+
wsba.nhl_scrape_standings(arg = '2024-03-20')
|
34
|
+
```
|
35
|
+
|
36
|
+
### NHL Rosters and Player Information
|
37
|
+
|
38
|
+
```python
|
39
|
+
wsba.nhl_scrape_player_info(wsba.nhl_scrape_roster('20242025'))
|
40
|
+
```
|
41
|
+
|
42
|
+
## DATA ANALYTICS
|
43
|
+
### Expected Goals (WeakSide Breakout and MoneyPuck models)*
|
44
|
+
### Goal Impacts and Shot Analysis*
|
45
|
+
### Stat Aggregation*
|
46
|
+
### Shot Plotting (Plots, Heatmaps, etc.)*
|
47
|
+
|
48
|
+
## REPOSITORY
|
49
|
+
### Past Season Play-by-Play*
|
50
|
+
### Team Information*
|
51
|
+
### Schedule*
|
52
|
+
|
53
|
+
## ACKNOWLEDGEMENTS AND CREDITS
|
54
|
+
### Huge thanks to the following:
|
55
|
+
Harry Shomer - Creator of the hockey_scraper package, which contains select utils functions utilized in this package and otherwise inspires the creation of this package.
|
56
|
+
|
57
|
+
Dan Morse - Creator of the hockeyR package; another important inspiration and model for developing an NHL scraper.
|
@@ -0,0 +1,18 @@
|
|
1
|
+
[project]
|
2
|
+
name = "wsba_hockey"
|
3
|
+
version = "0.1.0"
|
4
|
+
authors = [
|
5
|
+
{ name="Owen Singh", email="owenbksingh@gmail.com" },
|
6
|
+
]
|
7
|
+
description = "WeakSide Breakout's complete Python package of access to hockey data, primarily including the scraping of National Hockey League schedule, play-by-play, and shifts information."
|
8
|
+
readme = "README.md"
|
9
|
+
requires-python = ">=3.8"
|
10
|
+
classifiers = [
|
11
|
+
"Programming Language :: Python :: 3",
|
12
|
+
"License :: OSI Approved :: MIT License",
|
13
|
+
"Operating System :: OS Independent",
|
14
|
+
]
|
15
|
+
keywords = ["wsba","hockey","scraping","NHL","analytics"]
|
16
|
+
|
17
|
+
[project.urls]
|
18
|
+
Homepage = "https://github.com/owensingh38/wsba_hockey/"
|
@@ -0,0 +1,625 @@
|
|
1
|
+
import re
|
2
|
+
from bs4 import BeautifulSoup, SoupStrainer
|
3
|
+
import hockey_scraper.utils.shared as shared
|
4
|
+
import hockey_scraper.nhl.pbp.html_pbp as html
|
5
|
+
import hockey_scraper.nhl.game_scraper as gs
|
6
|
+
import numpy as np
|
7
|
+
import pandas as pd
|
8
|
+
import warnings
|
9
|
+
import requests as rs
|
10
|
+
from zipfile import ZipFile
|
11
|
+
warnings.filterwarnings('ignore')
|
12
|
+
|
13
|
+
### SCRAPING FUNCTIONS ###
|
14
|
+
# Provided in this file are functions vital to the scraping functions in the WSBA Hockey Python package. #
|
15
|
+
|
16
|
+
## JSON DATA ##
|
17
|
+
def _team_player_map(roster, team_id, result):
    #Build the name -> {result: value} mapping for one team.
    #Each of the up-to-four name spellings must map to the same player, so the
    #id/position lists are repeated once per name column before zipping.
    team = roster.loc[roster['teamId'] == team_id]
    player_ids = list(team['playerId']) * 4
    positions = list(team['positionCode']) * 4
    names = (list(team['playerName']) + list(team['playerName_2'])
             + list(team['playerName_3']) + list(team['playerName_4']))

    mapping = {}
    for pid, pos, player in zip(player_ids, positions, names):
        try:
            mapping[player.upper()] = {result: pid if result == 'id' else pos}
        except Exception:
            #Non-string name cells (e.g. NaN) cannot be uppercased; skip them.
            continue
    return mapping

def retreive_players(json,result = "id"):
    """Given json data from an NHL API call, return a dict with home and away
    players keyed by (upper-cased) name, mapping to either their id or their
    position.

    Parameters:
        json (dict): Decoded game-center response containing 'rosterSpots',
            'homeTeam' and 'awayTeam'.
        result (str): 'id' to map names to player ids; any other value maps
            names to position codes (keyed under that value).

    Returns:
        dict: {'home': {NAME: {result: id_or_pos}}, 'away': {...}}
    """
    roster = pd.json_normalize(json['rosterSpots'])
    info = pd.json_normalize(json)
    home = info['homeTeam.id'][0]
    away = info['awayTeam.id'][0]

    #Add up to four alternative names for each player in the game.
    #The alt first-name columns only exist for some games/locales; a failed
    #concatenation (missing column or NaN values) blanks the whole column.
    roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
    for alt_col, first_col in (('playerName_2', 'firstName.cs'),
                               ('playerName_3', 'firstName.de'),
                               ('playerName_4', 'firstName.es')):
        try:
            roster[alt_col] = roster[first_col]+" "+roster['lastName.default']
        except Exception:
            roster[alt_col] = ""

    #Return: Dict of away and home players keyed with id or position as value
    return {
        'home': _team_player_map(roster, home, result),
        'away': _team_player_map(roster, away, result)
    }
|
63
|
+
|
64
|
+
def parse_json(json):
    """Given json data from an NHL API call, return play-by-play data.

    Parameters:
        json (dict): Decoded game-center response containing 'plays',
            'rosterSpots', and game/team metadata.

    Returns:
        pd.DataFrame: One row per play with WSBA-named columns, event-player
        ids/names/positions, fixed and adjusted coordinates, running scores
        and fenwick counts. Untyped and 'game-end' events are dropped.
    """
    events = pd.json_normalize(json['plays']).reset_index(drop=True)
    info = pd.json_normalize(json)
    roster = pd.json_normalize(json['rosterSpots'])

    #Game information (broadcast from the single metadata row onto every event)
    events['game_id'] = info['id'][0]
    events['season'] = info['season'][0]
    events['season_type'] = info['gameType'][0]
    events['game_date'] = info['gameDate'][0]
    events['start_time'] = info['startTimeUTC'][0]
    events['venue'] = info['venue.default'][0]
    events['venue_location'] = info['venueLocation.default'][0]
    events['away_team_id'] = info['awayTeam.id'][0]
    events['away_team_abbr'] = info['awayTeam.abbrev'][0]
    events['home_team_id'] = info['homeTeam.id'][0]
    events['home_team_abbr'] = info['homeTeam.abbrev'][0]

    #Team-id -> abbreviation lookup used for event_team_abbr below
    teams = {
        info['awayTeam.id'][0]:info['awayTeam.abbrev'][0],
        info['homeTeam.id'][0]:info['homeTeam.abbrev'][0]
    }

    #Create player information dicts used to create event_player columns
    roster['playerName'] = roster['firstName.default']+" "+roster['lastName.default']
    players = {}      # id -> NAME
    players_pos = {}  # id -> position code
    ids = {}          # NAME -> id
    for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
        players.update({id:player.upper()})
    for id, pos in zip(list(roster['playerId']),list(roster['positionCode'])):
        players_pos.update({id:pos.upper()})
    for id, player in zip(list(roster['playerId']),list(roster['playerName'])):
        ids.update({player.upper():id})

    #Test columns: the API omits detail columns that never occur in a game, so
    #every expected column is backfilled with "" to keep later code total.
    cols = ['eventId', 'timeInPeriod', 'timeRemaining', 'situationCode', 'homeTeamDefendingSide', 'typeCode', 'typeDescKey', 'sortOrder', 'periodDescriptor.number', 'periodDescriptor.periodType', 'periodDescriptor.maxRegulationPeriods', 'details.eventOwnerTeamId', 'details.losingPlayerId', 'details.winningPlayerId', 'details.xCoord', 'details.yCoord', 'details.zoneCode', 'pptReplayUrl', 'details.shotType', 'details.scoringPlayerId', 'details.scoringPlayerTotal', 'details.assist1PlayerId', 'details.assist1PlayerTotal', 'details.assist2PlayerId', 'details.assist2PlayerTotal', 'details.goalieInNetId', 'details.awayScore', 'details.homeScore', 'details.highlightClipSharingUrl', 'details.highlightClipSharingUrlFr', 'details.highlightClip', 'details.highlightClipFr', 'details.discreteClip', 'details.discreteClipFr', 'details.shootingPlayerId', 'details.awaySOG', 'details.homeSOG', 'details.playerId', 'details.hittingPlayerId', 'details.hitteePlayerId', 'details.reason', 'details.typeCode', 'details.descKey', 'details.duration', 'details.servedByPlayerId', 'details.secondaryReason', 'details.blockingPlayerId', 'details.committedByPlayerId', 'details.drawnByPlayerId', 'game_id', 'season', 'season_type', 'game_date', 'away_team_id', 'away_team_abbr', 'home_team_id', 'home_team_abbr']

    for col in cols:
        try:events[col]
        except:
            events[col]=""

    #Event_player_columns include players in a given set of events; the higher
    #the number, the greater the importance the event player was to the play.
    #combine_first picks the first non-null id across the event-type-specific
    #columns (each event populates at most one of them).
    events['event_player_1_id'] = events['details.winningPlayerId'].combine_first(events['details.scoringPlayerId'])\
        .combine_first(events['details.shootingPlayerId'])\
        .combine_first(events['details.playerId'])\
        .combine_first(events['details.hittingPlayerId'])\
        .combine_first(events['details.committedByPlayerId'])

    events['event_player_2_id'] = events['details.losingPlayerId'].combine_first(events['details.assist1PlayerId'])\
        .combine_first(events['details.hitteePlayerId'])\
        .combine_first(events['details.drawnByPlayerId'])\
        .combine_first(events['details.blockingPlayerId'])

    events['event_player_3_id'] = events['details.assist2PlayerId']

    events['event_team_status'] = np.where(events['home_team_id']==events['details.eventOwnerTeamId'],"home","away")

    #Coordinate adjustments:
    #The WSBA NHL Scraper includes three sets of coordinates per event:
    #  x, y             - Raw coordinates from the JSON pbp
    #  x_fixed, y_fixed - Coordinates fixed to the right side of the ice
    #                     (x is always greater than 0; y is mirrored with x)
    #  x_adj, y_adj     - Adjusted coordinates configuring away events with
    #                     negative x values while home events are always positive
    events['x_fixed'] = abs(events['details.xCoord'])
    events['y_fixed'] = np.where(events['details.xCoord']<0,-events['details.yCoord'],events['details.yCoord'])
    events['x_adj'] = np.where(events['event_team_status']=="home",events['x_fixed'],-events['x_fixed'])
    events['y_adj'] = np.where(events['event_team_status']=="home",events['y_fixed'],-events['y_fixed'])
    #Distance/angle measured to the net at x=89 on the fixed (right) side
    events['event_distance'] = np.sqrt(((89 - events['x_fixed'])**2) + (events['y_fixed']**2))
    events['event_angle'] = np.degrees(np.arctan2(abs(events['y_fixed']), abs(89 - events['x_fixed'])))

    events['event_team_abbr'] = events['details.eventOwnerTeamId'].replace(teams)

    #Event player information includes ids (included in the JSON events),
    #names (from "rosterSpots"), and positions (also from "rosterSpots")
    events['event_player_1_name'] = events['event_player_1_id'].replace(players)
    events['event_player_2_name'] = events['event_player_2_id'].replace(players)
    events['event_player_3_name'] = events['event_player_3_id'].replace(players)

    events['event_player_1_pos'] = events['event_player_1_id'].replace(players_pos)
    events['event_player_2_pos'] = events['event_player_2_id'].replace(players_pos)
    events['event_player_3_pos'] = events['event_player_3_id'].replace(players_pos)

    events['event_goalie_name'] = events['details.goalieInNetId'].replace(players)

    #Create situations given situation code (this is reconfigured with on-ice
    #skaters when provided shifts data).
    #Digits 2 and 3 of the code are read as away/home skater counts —
    #TODO confirm against NHL API situationCode documentation.
    events['away_skaters'] = events['situationCode'].astype(str).str.slice(start=1,stop=2)
    events['home_skaters'] = events['situationCode'].astype(str).str.slice(start=2,stop=3)
    events['event_skaters'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['home_skaters'],events['away_skaters'])
    events['event_skaters_against'] = np.where(events['event_team_abbr']==events['home_team_abbr'],events['away_skaters'],events['home_skaters'])

    events['strength_state'] = events['event_skaters']+"v"+events['event_skaters_against']
    events['strength'] = np.where(events['event_skaters']==events['event_skaters_against'],
                                  "EV",np.where(
                                      events['event_skaters']>events['event_skaters_against'],
                                      "PP","SH"
                                  ))

    #Rename columns to follow WSBA naming conventions
    #(note: 'period_time_elasped' spelling is an established column name)
    events = events.rename(columns={
        "eventId":"event_id",
        "periodDescriptor.number":"period",
        "periodDescriptor.periodType":"period_type",
        "timeInPeriod":"period_time_elasped",
        "timeRemaining":"period_time_remaining",
        "situationCode":"situation_code",
        "homeTeamDefendingSide":"home_team_defending_side",
        "typeCode":"event_type_code",
        "typeDescKey":"event_type",
        "details.shotType":"shot_type",
        "details.duration":"penalty_duration",
        "details.descKey":"penalty_description",
        "details.reason":"reason",
        "details.zoneCode":"zone_code",
        "details.xCoord":"x",
        "details.yCoord":"y",
        "details.goalieInNetId": "event_goalie_id",
        "details.awaySOG":"away_SOG",
        "details.homeSOG":"home_SOG"
    })

    #Period time adjustments (only 'seconds_elapsed' is included in the
    #resulting data). "M:SS"/"MM:SS" clock strings are converted to seconds;
    #periods are treated as 1200 s — TODO confirm overtime handling.
    events['period_time_simple'] = events['period_time_elasped'].str.replace(":","",regex=True)
    events['period_seconds_elapsed'] = np.where(events['period_time_simple'].str.len()==3,
                                                ((events['period_time_simple'].str[0].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)),
                                                ((events['period_time_simple'].str[0:2].astype(int)*60)+events['period_time_simple'].str[-2:].astype(int)))
    events['period_seconds_remaining'] = 1200-events['period_seconds_elapsed']
    events['seconds_elapsed'] = ((events['period']-1)*1200)+events['period_seconds_elapsed']

    #The following code is utilized to generate score and fenwick columns for
    #each event: a single pass accumulates goals and unblocked shot attempts.
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    ag = 0   # away goals
    ags = []
    hg = 0   # home goals
    hgs = []

    af = 0   # away fenwick (unblocked attempts)
    afs = []
    hf = 0   # home fenwick
    hfs = []
    for event,team in zip(list(events['event_type']),list(events['event_team_status'])):
        if event in fenwick_events:
            if team == "home":
                hf = hf+1
                if event == 'goal':
                    hg = hg+1
            else:
                af = af+1
                if event == 'goal':
                    ag = ag+1

        ags.append(ag)
        hgs.append(hg)
        afs.append(af)
        hfs.append(hf)

    events['away_score'] = ags
    events['home_score'] = hgs
    events['away_fenwick'] = afs
    events['home_fenwick'] = hfs

    #Drop untyped placeholder rows and the game-end marker
    events = events.loc[(events['event_type']!="")&(events['event_type']!="game-end")]

    #Return: dataframe with parsed games in event
    return events
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
## HTML DATA ##
|
234
|
+
def get_soup(shifts_html):
    """Parse a shifts HTML report, trying several BeautifulSoup backends.

    Utilizes method from Harry Shomer's hockey_scraper package. Each backend
    is tried in turn until one yields player-heading/shift cells; the soup
    from the last backend tried is used for team extraction.

    Returns:
        tuple: (list of shift-table cells, [shift team, home team]).
    """
    backends = ("lxml", "html.parser", "html5lib")

    cells = []
    for backend in backends:
        soup = BeautifulSoup(shifts_html, backend)
        cells = soup.findAll(True, {'class': ['playerHeading + border', 'lborder + bborder']})
        # A non-empty result means this backend parsed the report usably.
        if cells:
            break
    return cells, get_teams(soup)
|
246
|
+
|
247
|
+
|
248
|
+
def get_teams(soup):
    """Collect the shift-report team and the home team from parsed shifts HTML.

    Utilizes method from Harry Shomer's hockey_scraper package.

    Returns:
        list: [team the report belongs to, home team name].
    """
    # The heading cell names the team whose shifts this document lists.
    shift_team = soup.find('td', class_='teamHeading + border').get_text()

    # The home team is read from the 8th centered bold header cell —
    # presumably the report's fixed page-header layout; verify against a
    # sample NHL shifts report if the layout changes.
    header_cells = soup.find_all('td', {'align': 'center', 'style': 'font-size: 10px;font-weight:bold'})
    home_matches = re.compile(r'>(.*)<br/?>').findall(str(header_cells[7]))

    return [shift_team, home_matches[0]]
|
260
|
+
|
261
|
+
#PARSE FUNCTIONS
|
262
|
+
def analyze_shifts(shift, name, team, home_team, player_ids):
    """Convert one raw shift row (list of table-cell strings) into a dict.

    Modified version of Harry Shomer's analyze_shifts function in the
    hockey_scraper package.

    Parameters:
        shift (list): [shift #, period, start, end, duration] cell texts.
        name (str): Player the shift belongs to.
        team (str): Team of the shifts report.
        home_team (str): Home team of the game.
        player_ids (dict): Output of retreive_players(json) keyed 'home'/'away'.

    Returns:
        dict: One shift with name, period, team, start/end/duration seconds,
        and player id (None when the name cannot be resolved).
    """
    period_label = shift[1]
    if period_label == 'OT':
        period = '4'
    elif period_label == 'SO':
        period = '5'
    else:
        period = period_label

    start = shared.convert_to_seconds(shift[2].split('/')[0])
    duration = shared.convert_to_seconds(shift[4].split('/')[0])

    # The end cell is occasionally non-numeric; fall back to start + duration.
    end_clock = shift[3].split('/')[0]
    if re.compile(r'\d+').findall(end_clock):
        end = shared.convert_to_seconds(end_clock)
    else:
        end = start + duration

    upper_name = name.upper()
    side = 'home' if home_team == team else 'away'
    try:
        player_id = player_ids[side][upper_name]['id']
    except KeyError:
        player_id = None

    return {
        'player_name': upper_name,
        'period': period,
        'team_abbr': shared.get_team(team.strip(' ')),
        'start': start,
        'duration': duration,
        'end': end,
        'player_id': player_id,
    }
|
288
|
+
|
289
|
+
def parse_shifts(html, player_ids, game_id):
    """Parse one team's shifts HTML report into play-by-play-style change events.

    Two-stage parsing of shifts data for a single team in a provided game:
      Stage one: create a dataframe with raw individual shifts.
      Stage two: convert shift events to the play-by-play structure created
      with json parsing.
    Modified version of Harry Shomer's parse_shifts function in the
    hockey_scraper package.

    Parameters:
        html: Raw shifts-report HTML for one team.
        player_ids (dict): Output of retreive_players(json) — 'home'/'away'
            name -> {'id': ...} mappings.
        game_id: Game identifier stamped onto every row (stored as str).

    Returns:
        pd.DataFrame: Change events sorted by time, shootout (period 5)
        changes excluded, with team column renamed 'event_team_abbr'.
    """


    all_shifts = []
    columns = ['game_id', 'player_name', 'player_id', 'period', 'team_abbr', 'start', 'end', 'duration']

    td, teams = get_soup(html)

    team = teams[0]
    home_team = teams[1]
    players = dict()

    # Iterates through each player shifts table with the following data:
    # Shift #, Period, Start, End, and Duration.
    # NOTE(review): a data cell before the first player heading would hit
    # 'name' unassigned — assumes headings always precede their shift rows.
    for t in td:
        t = t.get_text()
        if ',' in t: # If a comma exists it is a player heading ("## LAST, FIRST")
            name = t
            name = name.split(',')
            name = ' '.join([name[1].strip(' '), name[0][2:].strip(' ')])
            #name = shared.fix_name(name)
            #This has been excluded as means to control the differences in names between the JSON and HTML documents
            players[name] = dict()
            # NOTE(review): at this point 'name' is the joined full-name
            # string, so name[0][:2] yields its first character only — looks
            # like this was meant to read the jersey number before the join.
            # 'number' is not used downstream in this file; verify before use.
            players[name]['number'] = name[0][:2].strip()
            players[name]['shifts'] = []
        else:
            players[name]['shifts'].extend([t])

    for key in players.keys():
        # Create lists of shifts-table columns for analysis (5 cells per shift)
        players[key]['shifts'] = [players[key]['shifts'][i:i + 5] for i in range(0, len(players[key]['shifts']), 5)]

        # Parsing
        shifts = [analyze_shifts(shift, key, team, home_team, player_ids) for shift in players[key]['shifts']]
        all_shifts.extend(shifts)

    df = pd.DataFrame(all_shifts)
    df['game_id'] = str(game_id)

    shifts_raw = df[columns]

    # Zero-length shifts carry no on-ice information
    shifts_raw = shifts_raw[shifts_raw['duration'] > 0]

    # Second stage begins here
    # Identify shift starts for each shift event: all players starting at the
    # same second for the same team collapse into one 'on' record.
    shifts_on = shifts_raw.groupby(['team_abbr', 'period', 'start']).agg(
        num_on=('player_name', 'size'),
        players_on=('player_name', lambda x: ', '.join(x)),
        ids_on=('player_id', lambda x: ', '.join(map(str, x)))
    ).reset_index()

    shifts_on = shifts_on.rename(columns={
        'start':"seconds_elapsed"
    })

    # Identify shift stops for each shift event
    shifts_off = shifts_raw.groupby(['team_abbr', 'period', 'end']).agg(
        num_off=('player_name', 'size'),
        players_off=('player_name', lambda x: ', '.join(x)),
        ids_off=('player_id', lambda x: ', '.join(map(str, x)))
    ).reset_index()

    shifts_off = shifts_off.rename(columns={
        'end':"seconds_elapsed"
    })

    # Merge and sort by time in game; an outer join keeps moments where only
    # ons or only offs occurred.
    shifts = pd.merge(shifts_on, shifts_off, on=['team_abbr', 'period', 'seconds_elapsed'], how='outer')

    shifts = shifts.sort_values('seconds_elapsed')

    #Modify columns of new total shifts dataframe
    shifts['period'] = shifts['period'].astype(int)
    shifts['event_type'] = 'change'
    # Within-period clock -> game clock; assumes 1200-second periods and a
    # 3600-second game — TODO confirm overtime handling.
    shifts['seconds_elapsed'] = shifts['seconds_elapsed'] + (1200 * (shifts['period']-1))
    shifts['game_seconds_remaining'] = 3600 - shifts['seconds_elapsed']

    # Handle missing values at the start and end of periods
    shifts['players_on'] = shifts['players_on'].fillna('None')
    shifts['players_off'] = shifts['players_off'].fillna('None')
    shifts['ids_on'] = shifts['ids_on'].fillna('0')
    shifts['ids_off'] = shifts['ids_off'].fillna('0')
    shifts['num_on'] = shifts['num_on'].fillna(0).astype(int)
    shifts['num_off'] = shifts['num_off'].fillna(0).astype(int)

    #Manual Team Rename: report abbreviations that differ from WSBA convention
    shifts['team_abbr'] = shifts['team_abbr'].replace({
        "L.A":"LAK",
        "N.J":"NJD",
        "S.J":"SJS",
        "T.B":"TBL"
    })

    #Return: shift events formatted similarly to json pbp; shootout changes are excluded
    return shifts.loc[shifts['period']<5].rename(columns={'team_abbr':'event_team_abbr'})
|
388
|
+
|
389
|
+
def _on_ice_columns(roster_side, shifts, team_abbr, skaters, out_cols):
    #Build a table keyed by shift-event row listing who is on the ice.
    #roster_side: one side of retreive_players(json, result='pos');
    #skaters=True selects non-goalies, False selects goalies;
    #out_cols: output column names (padded with " " when fewer players are on).

    #Select the requested position group, dropping the empty alt-name key.
    names = [name for name, info in roster_side.items()
             if name != "" and (info['pos'] != "G") == skaters]

    #Hoisted loop-invariants: the team's on/off name strings per shift event.
    team_on = shifts.loc[shifts['event_team_abbr'] == team_abbr, 'players_on'].apply(str)
    team_off = shifts.loc[shifts['event_team_abbr'] == team_abbr, 'players_off'].apply(str)

    presence = pd.DataFrame()
    for player in names:
        #Running on/off tally: 1 while the player is on the ice, 0 otherwise.
        #NOTE(review): the name is used as a regex pattern, matching the
        #original behavior — names containing regex metacharacters would
        #misbehave in both versions.
        went_on = team_on.apply(lambda x: int(bool(re.search(player, x))))
        went_off = team_off.apply(lambda x: int(bool(re.search(player, x))))
        presence[player] = np.cumsum(went_on - went_off)
    presence = presence.fillna(0).astype(int)

    #Collect, per shift-event row, the list of players currently on the ice.
    stacked = (presence == 1).stack().reset_index()
    stacked = stacked[stacked[0]].groupby("level_0")["level_1"].apply(list).reset_index()

    for i, col in enumerate(out_cols):
        stacked[col] = stacked["level_1"].apply(lambda on, i=i: on[i] if i < len(on) else " ")

    return stacked.drop(columns=["level_1"]).rename(columns={"level_0": "row"})

def construct_skaters_matrix(rosters, shifts, team_abbr, home=True):
    """Given roster info (from the retreive_players function), shifts df, and
    team, generate on_ice columns for shift events.

    These on-ice columns configure the on-ice players for events in the json
    play-by-play as well.

    Parameters:
        rosters (dict): retreive_players(json, result='pos') output.
        shifts (pd.DataFrame): Change events from parse_shifts (one team).
        team_abbr (str): Team whose events are annotated.
        home (bool): True for the home side of `rosters`/column naming.

    Returns:
        pd.DataFrame: Shift events with {side}_on_1..6 and {side}_goalie
        columns; NaN values are replaced with "REMOVE" as means to create
        proper on-ice columns for the json pbp.
    """
    side = 'home' if home else 'away'

    #Skaters and goaltenders share the same extraction; only the position
    #filter and output column names differ (up to 6 skaters, 1 goalie).
    on_skaters = _on_ice_columns(rosters[side], shifts, team_abbr, True,
                                 [f"{side}_on_{i}" for i in range(1, 7)])
    on_goalies = _on_ice_columns(rosters[side], shifts, team_abbr, False,
                                 [f"{side}_goalie"])

    #Combine on-ice skaters and goaltenders for each shift event
    on_players = pd.merge(on_skaters, on_goalies, how='outer', on=['row'])

    shifts['row'] = shifts.index

    #Return: shift events with newly added on-ice columns. NaN values are
    #replaced with string "REMOVE" as means to create proper on-ice columns
    #for the json pbp.
    return pd.merge(shifts, on_players, how="outer", on=['row']).replace(np.nan, "REMOVE")
|
467
|
+
|
468
|
+
def combine_shifts(home_shift,away_shift,json,game_id):
    """Given shifts html documents for home and away team, return shift events
    complete with both teams' changes in the provided game.

    Parameters:
        home_shift: Raw home-team shifts-report HTML.
        away_shift: Raw away-team shifts-report HTML.
        json (dict): Game-center JSON for the same game.
        game_id: Game identifier.

    Returns:
        pd.DataFrame: Both teams' change events, time-sorted, with on-ice
        columns attached and the helper 'row' column dropped.
    """
    positions = retreive_players(json, result="pos")
    id_map = retreive_players(json)

    info = pd.json_normalize(json)

    annotated = []
    for shift_html, abbrev_col, is_home in (
        (away_shift, 'awayTeam.abbrev', False),
        (home_shift, 'homeTeam.abbrev', True),
    ):
        # Parse one team's report, time-sort it, and key each event by row
        # so the on-ice matrix can be joined back on.
        events = parse_shifts(shift_html, id_map, game_id).sort_values(by=['period', 'seconds_elapsed'])
        events['row'] = events.index
        annotated.append(
            construct_skaters_matrix(positions, events, info[abbrev_col][0], is_home).fillna("REMOVE")
        )

    combined = pd.concat(annotated).sort_values(by=['period', 'seconds_elapsed'])

    #Return: shifts dataframe with both teams' changes
    return combined.drop(columns=['row'])
|
486
|
+
|
487
|
+
def fix_names(shifts_df,json):
    """Normalize player names in the shifts dataframe using the json roster.

    The json provides language-specific alternative first names; every alternative
    spelling found in 'shifts_df' is replaced with the default spelling so shifts
    and json dataframes use the same name for each player.

    param 'shifts_df' - dataframe of shift events
    param 'json' - json play-by-play document containing 'rosterSpots'
    """
    data = pd.json_normalize(json['rosterSpots'])
    data['fullName'] = (data['firstName.default']+" "+data['lastName.default']).str.upper()

    #Language-specific first-name variants provided by the NHL API
    alt_name_col = ['firstName.cs', 'firstName.de', 'firstName.es', 'firstName.fi', 'firstName.sk', 'firstName.sv']
    for i, alt_col in enumerate(alt_name_col):
        #Explicit membership test instead of a bare except swallowing all errors
        if alt_col in data.columns:
            data['fullName.'+str(i+1)] = np.where(data[alt_col].notna(),(data[alt_col].astype(str)+" "+data['lastName.default'].astype(str)).str.upper(),np.nan)

    name_col = ['fullName', 'fullName.1', 'fullName.2', 'fullName.3', 'fullName.4', 'fullName.5', 'fullName.6']

    #Guarantee every expected name column exists
    for name in name_col:
        if name not in data.columns:
            data[name] = np.nan

    #Pair each alternate-name column with the default full name
    names_dfs = [data[[name,'fullName']].rename(columns={name:"alt","fullName":'default'})
                 for name in name_col[1:]]
    names_df = pd.concat(names_dfs)

    #Map each alternate spelling to the default spelling, skipping missing/blank entries
    #(the original compared 'alt == np.nan', which is always False; use pd.isna instead)
    replace = {}
    for default, alt in zip(names_df['default'],names_df['alt']):
        if pd.isna(alt) or alt == "":
            continue
        replace[alt] = default

    #Return: shifts dataframe with alternate names replaced by default names
    return shifts_df.replace(replace,regex=True)
|
519
|
+
|
520
|
+
def combine_data(json,html):
    """Given json pbp and html shifts, return total game play-by-play data with
    additional and corrected details.

    param 'json' - dataframe of events parsed from the json play-by-play feed
    param 'html' - dataframe of shift-change events built from the html reports

    Returns one ordered dataframe containing every event (plays and changes),
    with generated descriptions, forward-filled game context, on-ice counts,
    strength states, and situation codes.
    """
    df = pd.concat([json,html])

    #Fill period_type column and assign shifts a sub-500 event code
    df['period_type'] = np.where(df['period']<4,"REG",np.where(df['period']==4,"OT","SO"))
    df['event_type_code'] = np.where(df['event_type']!='change',df['event_type_code'],499)

    #Create priority columns designed to order events that occur at the same time in a game
    start_pri = ['period-start','game-start']
    even_pri = ['takeaway','giveaway','missed-shot','hit','shot-on-goal','blocked-shot']
    df['priority'] = np.where(df['event_type'].isin(start_pri),0,
                     np.where(df['event_type'].isin(even_pri),1,
                     np.where(df['event_type']=='goal',2,
                     np.where(df['event_type']=='stoppage',3,
                     np.where(df['event_type']=='penalty',4,
                     np.where(df['event_type']=='change',5,
                     np.where(df['event_type']=='period-end',6,
                     np.where(df['event_type']=='game-end',7,
                     np.where(df['event_type']=='faceoff',8,9)))))))))

    df = df.sort_values(by=['period','seconds_elapsed','priority']).reset_index()
    #Recreate event_num column to accurately depict the order of all events, including changes
    df['event_num'] = df.index+1
    df['event_team_status'] = np.where(df['event_team_abbr'].isna(),"",np.where(df['home_team_abbr']==df['event_team_abbr'],"home","away"))
    df['event_type_last'] = df['event_type'].shift(1)
    df['event_type_last_2'] = df['event_type_last'].shift(1)
    df['event_type_next'] = df['event_type'].shift(-1)
    lag_events = ['stoppage','goal','period-end']
    lead_events = ['faceoff','period-end']
    period_end_secs = [0,1200,2400,3600,4800,6000,7200,8400,9600,10800]

    #Define shifts by "line-change" or "on-the-fly"
    #BUG FIX: the original nested np.logical_or passed the "next event" condition as the
    #ufunc's positional 'out' argument, silently dropping it from the OR; use explicit ORs
    line_change = (df['event_type_last'].isin(lag_events) |
                   df['event_type_last_2'].isin(lag_events) |
                   df['event_type_next'].isin(lead_events) |
                   df['seconds_elapsed'].isin(period_end_secs))
    df['shift_type'] = np.where(df['event_type']=='change',np.where(line_change,"line-change","on-the-fly"),"")

    #Descriptions:
    #HTML pbp includes descriptions for each event; without the HTML pbp, play descriptions must be generated
    #Different, more original formatting is employed with these descriptions in comparison to that provided in the HTML pbp
    df['start_end_desc'] = np.where(df['event_type'].isin(['period-start','period-end']),df['away_team_abbr'] + "vs" + df['home_team_abbr'] + ": Period " + df['period'].astype(str) + " " + df['event_type'].str.replace("period-","",regex=True).str.capitalize(),np.nan)
    df['take_give_desc'] = np.where(df['event_type'].isin(['takeaway','giveaway']),df['event_team_abbr'] + " " + df['event_type'].str.upper() + " by " + df['event_player_1_name'],np.nan)
    df['stoppage_desc'] = np.where(df['event_type']=='stoppage',"STOPPAGE: " + df['reason'].str.replace("-"," ",regex=True).str.capitalize(),np.nan)
    df['blocked_desc'] = np.where(df['event_type']=='blocked-shot',df['event_team_abbr'] + " SHOT from " + df['event_player_1_name'] + " BLOCKED by " + df['event_player_2_name'],np.nan)
    df['missed_desc'] = np.where(df['event_type']=='missed-shot',df['event_team_abbr'] + " SHOT by " + df['event_player_1_name'] + " MISSED: " + df['reason'].astype(str).str.replace("-"," ",regex=True),np.nan)
    df['sog_desc'] = np.where(df['event_type']=='shot-on-goal',df['event_team_abbr'] + " SHOT by " + df['event_player_1_name'] + " SAVED by " + df['event_goalie_name'],np.nan)
    df['goal_desc'] = np.where(df['event_type']=='goal',df['event_team_abbr'] + " GOAL SCORED by " + df['event_player_1_name'],np.nan)
    df['assist_desc'] = np.where(np.logical_and(df['event_type']=='goal',df['event_player_2_name'].notna())," ASSISTED by " + df['event_player_2_name'],"")
    df['assist2_desc'] = np.where(np.logical_and(df['event_type']=='goal',df['event_player_3_name'].notna())," and ASSISTED by " + df['event_player_3_name'],"")
    df['goal_desc_complete'] = df['goal_desc'] + df['assist_desc'] + df['assist2_desc']
    df['hit_desc'] = np.where(df['event_type']=='hit',df['event_team_abbr'] + " HIT by " + df['event_player_1_name'] + " on " + df['event_player_2_name'],np.nan)
    df['faceoff_desc'] = np.where(df['event_type']=='faceoff',"FACEOFF WON by " + df['event_player_1_name'] + " AGAINST " + df['event_player_2_name'],np.nan)
    #BUG FIX: anchor the duration pattern; the unanchored regex ".0" matched any character
    #followed by a zero (so "10.0" collapsed to "")
    df['penalty_desc'] = np.where(df['event_type']=='penalty',df['event_team_abbr'] + " PENALTY on " + df['event_player_1_name'] + ": " + df['penalty_duration'].astype(str).str.replace(r'\.0$',"",regex=True) + " minutes for " + df['penalty_description'].astype(str).str.replace("-"," ",regex=True).str.upper(),np.nan)

    #First non-null per-event description wins
    df['description'] = df['start_end_desc'].combine_first(df['take_give_desc'])\
        .combine_first(df['stoppage_desc'])\
        .combine_first(df['blocked_desc'])\
        .combine_first(df['missed_desc'])\
        .combine_first(df['sog_desc'])\
        .combine_first(df['goal_desc_complete'])\
        .combine_first(df['hit_desc'])\
        .combine_first(df['faceoff_desc'])\
        .combine_first(df['penalty_desc'])

    ffill_col = ['season','season_type','game_id','game_date',
                 "start_time","venue","venue_location",
                 'away_team_abbr','home_team_abbr','home_team_defending_side',
                 'away_score','away_fenwick',
                 'home_score','home_fenwick',
                 'away_goalie','home_goalie']
    away_on = ['away_on_1','away_on_2','away_on_3','away_on_4','away_on_5','away_on_6']
    home_on = ['home_on_1','home_on_2','home_on_3','home_on_4','home_on_5','home_on_6']

    #Forward fill appropriate columns (game context only appears on some rows)
    for column in ffill_col+away_on+home_on:
        df[column] = df[column].ffill()

    #Now that forward fill is complete, replace "REMOVE" with nan
    df.replace("REMOVE",np.nan,inplace=True)

    #Reconfigure strength state and situation codes
    df['away_skaters'] = df[away_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    df['home_skaters'] = df[home_on].replace(r'^\s*$', np.nan, regex=True).notna().sum(axis=1)
    df['away_goalie_in'] = np.where(df['away_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)
    df['home_goalie_in'] = np.where(df['home_goalie'].replace(r'^\s*$', np.nan, regex=True).notna(),1,0)

    df['event_skaters'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['home_skaters'],df['away_skaters'])
    df['event_skaters_against'] = np.where(df['event_team_abbr']==df['home_team_abbr'],df['away_skaters'],df['home_skaters'])

    df['strength_state'] = df['event_skaters'].astype(str) + "v" + df['event_skaters_against'].astype(str)
    df['situation_code'] = np.where(df['situation_code'].isna(),df['away_goalie_in'].astype(str) + df['away_skaters'].astype(str) + df['home_skaters'].astype(str) + df['home_goalie_in'].astype(str),df['situation_code'])

    col = [
        'season','season_type','game_id','game_date',"start_time","venue","venue_location",
        'away_team_abbr','home_team_abbr','event_num','period','period_type',
        'seconds_elapsed', "situation_code","strength_state","home_team_defending_side","shift_type",
        "event_type_code","event_type","description","reason","penalty_duration","penalty_description",
        "event_team_abbr",'num_on', 'players_on', 'ids_on', 'num_off', 'players_off', 'ids_off',
        "event_team_status","event_player_1_id","event_player_2_id","event_player_3_id",
        "event_player_1_name","event_player_2_name","event_player_3_name","event_player_1_pos","event_player_2_pos",
        "event_player_3_pos","event_goalie_id",
        "event_goalie_name","shot_type","zone_code","x","y","x_fixed","y_fixed","x_adj","y_adj",
        "event_skaters","away_skaters","home_skaters",
        "event_distance","event_angle","away_score","home_score", "away_fenwick", "home_fenwick",
        "away_on_1","away_on_2","away_on_3","away_on_4","away_on_5","away_on_6","away_goalie",
        "home_on_1","home_on_2","home_on_3","home_on_4","home_on_5","home_on_6","home_goalie"
    ]

    #Return: complete play-by-play with all important data for each event in a provided game
    return df[col].replace(r'^\s*$', np.nan, regex=True)
|
@@ -0,0 +1,282 @@
|
|
1
|
+
import requests as rs
|
2
|
+
import pandas as pd
|
3
|
+
import numpy as np
|
4
|
+
from datetime import datetime, timedelta
|
5
|
+
from tools.scraping import *
|
6
|
+
|
7
|
+
### WSBA HOCKEY ###
|
8
|
+
## Provided below are all integral functions in the WSBA Hockey Python package. ##
|
9
|
+
|
10
|
+
## SCRAPE FUNCTIONS ##
|
11
|
+
def nhl_scrape_game(game_ids,split_shifts = False,remove = ['period-start','period-end','challenge','stoppage']):
    """Given a set of game_ids (NHL API), return complete play-by-play information as requested.

    param 'game_ids' - NHL game ids
    param 'split_shifts' - boolean which splits pbp and shift events if true
    param 'remove' - list of events to remove from final dataframe
    """
    pbps = []
    for game_id in game_ids:
        game_id = str(game_id)

        #Fail fast on unsupported seasons BEFORE issuing any web requests
        #(previously three requests were made and then discarded)
        if int(game_id[:4]) < 2010:
            raise Exception('Games before 2010-2011 are not available yet.')

        print("Scraping data from game " + game_id + "...")

        season = str(game_id[:4])+str(int(game_id[:4])+1)

        #Endpoints: json play-by-play plus home (TH) and away (TV) html shift reports
        api = "https://api-web.nhle.com/v1/gamecenter/"+game_id+"/play-by-play"
        home_log = "https://www.nhl.com/scores/htmlreports/"+season+"/TH"+game_id[-6:]+".HTM"
        away_log = "https://www.nhl.com/scores/htmlreports/"+season+"/TV"+game_id[-6:]+".HTM"

        #Retrieve raw data
        json = rs.get(api).json()
        home_shift = rs.get(home_log).content
        away_shift = rs.get(away_log).content

        #Parse Json
        pbp = parse_json(json)

        #Create shifts
        shifts = fix_names(combine_shifts(home_shift,away_shift,json,game_id),json)

        #Combine and append data to list
        pbps.append(combine_data(pbp,shifts))

    #Add all pbps together
    df = pd.concat(pbps)

    #Split pbp and shift events if necessary
    #Return: complete play-by-play with data removed or split as necessary
    if split_shifts == True:
        if len(remove) == 0:
            remove = ['change']

        #Return: dict with pbp and shifts seperated
        return {"pbp":df.loc[~df['event_type'].isin(remove)].dropna(axis=1,how='all'),
                "shifts":df.loc[df['event_type']=='change'].dropna(axis=1,how='all')
                }
    else:
        #Return: all events that are not set for removal by the provided list
        return df.loc[~df['event_type'].isin(remove)]
|
64
|
+
|
65
|
+
def nhl_scrape_schedule(season,start = "09-01", end = "08-01"):
    """Given a season, return schedule data.

    param 'season' - NHL season to scrape (e.g. "20242025")
    param 'start' - start date in season, formatted MM-DD
    param 'end' - end date in season, formatted MM-DD
    """
    api = "https://api-web.nhle.com/v1/schedule/"

    #Months after the new year belong to the second calendar year of the season
    #BUG FIX: each date's year is now derived from its own month; previously the end
    #date always took the season's first year, forcing the negative-span fallback below
    #(which is off by one across leap years)
    new_year = ["01","02","03","04","05","06"]
    first_year = season[:4]
    second_year = str(int(season[:4])+1)

    start = (second_year if start[:2] in new_year else first_year)+"-"+start
    end = (second_year if end[:2] in new_year else first_year)+"-"+end

    form = '%Y-%m-%d'

    #Create datetime values from dates
    start = datetime.strptime(start,form)
    end = datetime.strptime(end,form)

    game = []

    day = (end-start).days+1
    if day < 0:
        #Handles spans whose end month (July/August) is ambiguous and resolves to the
        #earlier year, leaving the computed span negative
        day = 365 + day
    for offset in range(day):
        #For each day, call NHL api and retrieve id, season, season_type (1,2,3), and gamecenter link
        inc = start+timedelta(days=offset)
        print("Scraping games on " + str(inc)[:10]+"...")

        get = rs.get(api+str(inc)[:10]).json()
        gameWeek = list(pd.json_normalize(get['gameWeek'])['games'])[0]

        #Inner loop variable renamed so it no longer shadows the day counter
        for matchup in gameWeek:
            game.append(pd.DataFrame({
                "id": [matchup['id']],
                "season": [matchup['season']],
                "season_type":[matchup['gameType']],
                "gamecenter_link":[matchup['gameCenterLink']]
            }))

    #Concatenate all games
    df = pd.concat(game)

    #Return: specified schedule data (excluding preseason games)
    return df.loc[df['season_type']>1]
|
115
|
+
|
116
|
+
def nhl_scrape_season(season,split_shifts = False, remove = ['period-start','period-end','challenge','stoppage','change'], start = "09-01", end = "08-01", local=False, local_path = "schedule/schedule.csv"):
    """Given a season, scrape all play-by-play occurring within the season.

    param 'season' - NHL season to scrape
    param 'split_shifts' - boolean which splits pbp and shift events if true
    param 'remove' - list of events to remove from final dataframe
    param 'start' - start date in season
    param 'end' - end date in season
    param 'local' - boolean indicating whether to use local file to scrape game_ids
    param 'local_path' - path of local file
    """
    #While the default value of local is false, schedule data is provided in the package files;
    #enabling local will automatically find and scrape games in a specified season, saving time
    #otherwise spent scraping a season's schedule
    if local == True:
        load = pd.read_csv(local_path)
        load = load.loc[load['season'].astype(str)==season]
        game_ids = list(load['id'].astype(str))
    else:
        game_ids = list(nhl_scrape_schedule(season,start,end)['id'].astype(str))

    df = []
    df_s = []
    errors = []
    for game_id in game_ids:
        try:
            if split_shifts == True:
                data = nhl_scrape_game([game_id],split_shifts=True,remove=remove)
                df.append(data['pbp'])
                df_s.append(data['shifts'])
            else:
                df.append(nhl_scrape_game([game_id],remove=remove))
        except Exception:
            #Errors should be rare; games which have not happened yet also surface here
            print("An error occurred...")
            #BUG FIX: the id must be wrapped in a list - an all-scalar dict raises
            #"must pass an index" and crashed the error path itself
            errors.append(pd.DataFrame({"id":[game_id]}))

    #Guard against seasons where every game failed (pd.concat raises on an empty list)
    pbp = pd.concat(df) if df else pd.DataFrame()
    errors = pd.concat(errors) if errors else pd.DataFrame()

    #Return: complete pbp and shifts data for the specified season as well as a dataframe
    #of game_ids which failed to return data
    if split_shifts == True:
        shifts = pd.concat(df_s) if df_s else pd.DataFrame()
        return {"pbp":pbp,
                'shifts':shifts,
                "errors":errors}
    else:
        return {"pbp":pbp,
                "errors":errors}
|
172
|
+
|
173
|
+
def nhl_scrape_seasons_info(seasons = []):
    """Return info related to NHL seasons (by default, all seasons are included).

    param 'seasons' - list of season ids (as strings) to include
    """
    print("Scraping info for seasons: " + str(seasons))

    #Two endpoints: general season records and standings-season details, merged on id
    api = "https://api.nhle.com/stats/rest/en/season"
    info = "https://api-web.nhle.com/v1/standings-season"

    base = pd.json_normalize(rs.get(api).json()['data'])
    detail = pd.json_normalize(rs.get(info).json()['seasons'])

    merged = pd.merge(base,detail,how='outer',on=['id'])

    #Filter to the requested seasons when any are given
    if len(seasons) > 0:
        merged = merged.loc[merged['id'].astype(str).isin(seasons)]

    return merged.sort_values(by=['id'])
|
192
|
+
|
193
|
+
def nhl_scrape_standings(arg = "now"):
    """Return NHL standings.

    param 'arg' - "now" (default) for active standings, or a specific date
    formatted as YYYY-MM-DD
    """
    if arg == "now":
        print("Scraping standings as of now...")
    else:
        print("Scraping standings for season: "+arg)

    standings = rs.get("https://api-web.nhle.com/v1/standings/"+arg).json()['standings']

    return pd.json_normalize(standings)
|
206
|
+
|
207
|
+
def nhl_scrape_roster(season):
    """Given an NHL season, return rosters for all participating teams.

    param 'season' - NHL season to scrape (e.g. "20242025")
    """
    #BUG FIX: user-facing message previously read "Scrpaing ... the 20242025season..."
    print("Scraping rosters for the "+ season + " season...")
    teaminfo = pd.read_csv("teaminfo/nhl_teaminfo.csv")

    rosts = []
    for team in list(teaminfo['Team']):
        try:
            print("Scraping " + team + " roster...")
            api = "https://api-web.nhle.com/v1/roster/"+team+"/"+season

            data = rs.get(api).json()
            #Tag each positional group before combining
            forwards = pd.json_normalize(data['forwards'])
            forwards['headingPosition'] = "F"
            dmen = pd.json_normalize(data['defensemen'])
            dmen['headingPosition'] = "D"
            goalies = pd.json_normalize(data['goalies'])
            goalies['headingPosition'] = "G"

            roster = pd.concat([forwards,dmen,goalies]).reset_index(drop=True)
            roster['fullName'] = (roster['firstName.default']+" "+roster['lastName.default']).str.upper()
            roster['season'] = str(season)
            roster['team_abbr'] = team

            rosts.append(roster)
        except Exception:
            #Teams with no roster for the requested season are skipped
            print("No roster found for " + team + "...")

    return pd.concat(rosts)
|
237
|
+
|
238
|
+
def nhl_scrape_player_info(roster):
    """Given compiled roster information from the nhl_scrape_roster function, return a
    list of all players (separated into team and season) and associated information.

    param 'roster' - dataframe of roster information from the nhl_scrape_roster function
    """
    #BUG FIX: copy so the caller's dataframe is not mutated by the added name columns
    data = roster.copy()

    print("Creating player info for provided roster data...")

    #Language-specific first-name variants provided by the NHL API
    alt_name_col = ['firstName.cs', 'firstName.de', 'firstName.es', 'firstName.fi', 'firstName.sk', 'firstName.sv']
    for i, alt_col in enumerate(alt_name_col):
        #Explicit membership test instead of a bare except swallowing all errors
        if alt_col in data.columns:
            data['fullName.'+str(i+1)] = np.where(data[alt_col].notna(),(data[alt_col].astype(str)+" "+data['lastName.default'].astype(str)).str.upper(),np.nan)

    name_col = ['fullName', 'fullName.1', 'fullName.2', 'fullName.3', 'fullName.4', 'fullName.5', 'fullName.6']

    #Guarantee every expected name column exists
    for name in name_col:
        if name not in data.columns:
            data[name] = np.nan

    #One pass per name variant; rows with blank names are dropped at the end
    infos = []
    for name in name_col:
        infos.append(data[[name,"id","season","team_abbr","headshot",
                           "sweaterNumber","headingPosition",
                           "positionCode",'shootsCatches',
                           'heightInInches','weightInPounds',
                           'birthDate','birthCountry']].rename(columns={
                               name:'Player',
                               'id':"API",
                               "season":"Season",
                               "team_abbr":"Team",
                               'headshot':'Headshot',
                               'sweaterNumber':"Number",
                               'headingPosition':"Primary Position",
                               'positionCode':'Position',
                               'shootsCatches':'Handedness',
                               'heightInInches':'Height',
                               'weightInPounds':'Weight',
                               'birthDate':'Birthday',
                               'birthCountry':'Nationality'}))
    players = pd.concat(infos)
    players['Season'] = players['Season'].astype(str)
    #Blank player names (unused variant slots) become NaN and are filtered out
    players['Player'] = players['Player'].replace(r'^\s*$', np.nan, regex=True)

    return players.loc[players['Player'].notna()].sort_values(by=['Player','Season','Team'])
|
@@ -0,0 +1,72 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: wsba_hockey
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: WeakSide Breakout's complete Python package of access to hockey data, primarily including the scraping of National Hockey League schedule, play-by-play, and shifts information.
|
5
|
+
Author-email: Owen Singh <owenbksingh@gmail.com>
|
6
|
+
Project-URL: Homepage, https://github.com/owensingh38/wsba_hockey/
|
7
|
+
Keywords: wsba,hockey,scraping,NHL,analytics
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Requires-Python: >=3.8
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
License-File: LICENSE
|
14
|
+
Dynamic: license-file
|
15
|
+
|
16
|
+
# WSBA HOCKEY
|
17
|
+

|
18
|
+
|
19
|
+
### A Python package for scraping and analyzing hockey data under the motto: ***Evaluating, analyzing, and understanding the game of hockey through the lens of different analytical methods, including incorporation of analytics.***
|
20
|
+
|
21
|
+
## INSTALLATION AND USAGE
|
22
|
+
```bash
|
23
|
+
pip install wsba_hockey
|
24
|
+
```
|
25
|
+
|
26
|
+
```python
|
27
|
+
import wsba_hockey as wsba
|
28
|
+
```
|
29
|
+
|
30
|
+
## ALL FEATURES
|
31
|
+
### Note: Features not yet included are marked with *
|
32
|
+
|
33
|
+
|
34
|
+
## SCRAPING
|
35
|
+
### NHL Play-by-Play (of any game frame up to a full season)
|
36
|
+
#### Functions:
|
37
|
+
|
38
|
+
```python
|
39
|
+
wsba.nhl_scrape_game(['2024020918'],split_shifts=False,remove=['game-end'])
|
40
|
+
wsba.nhl_scrape_season('20242025',split_shifts=False,remove=['game-end'],local=True)
|
41
|
+
```
|
42
|
+
|
43
|
+
### NHL Season Information
|
44
|
+
|
45
|
+
```python
|
46
|
+
wsba.nhl_scrape_schedule('20242025')
|
47
|
+
wsba.nhl_scrape_seasons_info(seasons=['20212022','20222023','20232024','20242025'])
|
48
|
+
wsba.nhl_scrape_standings(arg = '2024-03-20')
|
49
|
+
```
|
50
|
+
|
51
|
+
### NHL Rosters and Player Information
|
52
|
+
|
53
|
+
```python
|
54
|
+
wsba.nhl_scrape_player_info(wsba.nhl_scrape_roster('20242025'))
|
55
|
+
```
|
56
|
+
|
57
|
+
## DATA ANALYTICS
|
58
|
+
### Expected Goals (WeakSide Breakout and MoneyPuck models)*
|
59
|
+
### Goal Impacts and Shot Analysis*
|
60
|
+
### Stat Aggregation*
|
61
|
+
### Shot Plotting (Plots, Heatmaps, etc.)*
|
62
|
+
|
63
|
+
## REPOSITORY
|
64
|
+
### Past Season Play-by-Play*
|
65
|
+
### Team Information*
|
66
|
+
### Schedule*
|
67
|
+
|
68
|
+
## ACKNOWLEDGEMENTS AND CREDITS
|
69
|
+
### Huge thanks to the following:
|
70
|
+
Harry Shomer - Creator of the hockey_scraper package, which contains select utils functions utilized in this package and otherwise inspires the creation of this package.
|
71
|
+
|
72
|
+
Dan Morse - Creator of the hockeyR package; another important inspiration and model for developing an NHL scraper.
|
@@ -0,0 +1,9 @@
|
|
1
|
+
LICENSE
|
2
|
+
README.md
|
3
|
+
pyproject.toml
|
4
|
+
src/wsba_hockey/wsba_main.py
|
5
|
+
src/wsba_hockey.egg-info/PKG-INFO
|
6
|
+
src/wsba_hockey.egg-info/SOURCES.txt
|
7
|
+
src/wsba_hockey.egg-info/dependency_links.txt
|
8
|
+
src/wsba_hockey.egg-info/top_level.txt
|
9
|
+
src/wsba_hockey/tools/scraping.py
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
wsba_hockey
|