wsba-hockey 0.1.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +836 -369
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +275 -47
- wsba_hockey/wsba_main.py +699 -132
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/METADATA +42 -11
- wsba_hockey-1.0.0.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.2.dist-info/RECORD +0 -9
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/top_level.txt +0 -0
wsba_hockey/tools/xg_model.py
CHANGED
@@ -3,54 +3,45 @@ import numpy as np
|
|
3
3
|
import xgboost as xgb
|
4
4
|
import scipy.sparse as sp
|
5
5
|
import joblib
|
6
|
+
from zipfile import ZipFile
|
7
|
+
import requests as rs
|
6
8
|
|
7
9
|
### XG_MODEL FUNCTIONS ###
|
8
10
|
# Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
|
9
11
|
|
12
|
+
## GLOBAL VARIABLES ##
|
13
|
+
#Newest season
|
14
|
+
new_full = '20242025'
|
15
|
+
new = '2024'
|
16
|
+
|
10
17
|
def prep_xG_data(pbp):
|
11
18
|
#Prep data for xG training and calculation
|
12
19
|
|
13
20
|
events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
|
14
21
|
shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
|
15
22
|
fenwick_events = ['missed-shot','shot-on-goal','goal']
|
16
|
-
strengths = ['3v3',
|
17
|
-
'3v4',
|
18
|
-
'3v5',
|
19
|
-
'4v3',
|
20
|
-
'4v4',
|
21
|
-
'4v5',
|
22
|
-
'4v6',
|
23
|
-
'5v3',
|
24
|
-
'5v4',
|
25
|
-
'5v5',
|
26
|
-
'5v6',
|
27
|
-
'6v4',
|
28
|
-
'6v5']
|
29
23
|
|
30
|
-
#
|
31
|
-
|
32
|
-
data = pbp.loc[(pbp['event_type'].isin(events))&
|
33
|
-
(pbp['strength_state'].isin(strengths))&
|
34
|
-
(pbp['period'] < 5)&
|
35
|
-
(pbp['x_fixed'].notna())&
|
36
|
-
(pbp['y_fixed'].notna())&
|
37
|
-
~((pbp['x_fixed']==0)&(pbp['y_fixed']==0)&(pbp['x_fixed'].isin(fenwick_events))&(pbp['event_distance']!=90))]
|
38
|
-
#Create last event columns
|
39
|
-
data = data.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
|
24
|
+
#Informal groupby
|
25
|
+
data = pbp.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
|
40
26
|
|
41
|
-
|
27
|
+
#Add event time details - prevent leaking between games by setting value to zero when no time has occurred in game
|
28
|
+
data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_elapsed']-data['seconds_elapsed'].shift(1))
|
29
|
+
data["event_length"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'].shift(-1))
|
30
|
+
|
31
|
+
#Create last event columns
|
42
32
|
data["event_team_last"] = data['event_team_abbr'].shift(1)
|
43
33
|
data["event_type_last"] = data['event_type'].shift(1)
|
44
34
|
data["x_fixed_last"] = data['x_fixed'].shift(1)
|
45
35
|
data["y_fixed_last"] = data['y_fixed'].shift(1)
|
46
|
-
data["zone_code_last"] = data['zone_code'].shift(1)
|
47
|
-
data['shot_type'] = data['shot_type'].fillna('wrist')
|
48
|
-
|
36
|
+
data["zone_code_last"] = data['zone_code'].shift(1)
|
49
37
|
|
50
38
|
data.sort_values(['season','game_id','period','seconds_elapsed','event_num'],inplace=True)
|
51
39
|
data['score_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_score']-data['home_score'],data['home_score']-data['away_score'])
|
40
|
+
data['strength_diff'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
|
52
41
|
data['fenwick_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_fenwick']-data['home_fenwick'],data['home_fenwick']-data['away_fenwick'])
|
53
42
|
data['distance_from_last'] = np.sqrt((data['x_fixed'] - data['x_fixed_last'])**2 + (data['y_fixed'] - data['y_fixed_last'])**2)
|
43
|
+
|
44
|
+
#Rush and rebounds are included and graded off of the speed of the event (an event cannot be a rush event unless it also occurs in the offensive zone)
|
54
45
|
data['rush_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_fixed']>25)&(data['seconds_since_last']<5),5-data['seconds_since_last'],0)
|
55
46
|
data['rebound_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<3),3-data['seconds_since_last'],0)
|
56
47
|
|
@@ -58,19 +49,19 @@ def prep_xG_data(pbp):
|
|
58
49
|
data["is_goal"]=(data['event_type']=='goal').astype(int)
|
59
50
|
data["is_home"]=(data['home_team_abbr']==data['event_team_abbr']).astype(int)
|
60
51
|
|
61
|
-
|
52
|
+
#Boolean variables for shot types and prior events
|
62
53
|
for shot in shot_types:
|
63
54
|
data[shot] = (data['shot_type']==shot).astype(int)
|
64
|
-
for strength in strengths:
|
65
|
-
data[f'state_{strength}'] = (data['strength_state']==strength).astype(int)
|
66
55
|
for event in events[0:len(events)-1]:
|
67
56
|
data[f'prior_{event}_same'] = ((data['event_type_last']==event)&(data['event_team_last']==data['event_team_abbr'])).astype(int)
|
68
57
|
data[f'prior_{event}_opp'] = ((data['event_type_last']==event)&(data['event_team_last']!=data['event_team_abbr'])).astype(int)
|
69
58
|
|
59
|
+
data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)
|
60
|
+
|
70
61
|
#Return: pbp data prepared to train and calculate the xG model
|
71
62
|
return data
|
72
63
|
|
73
|
-
def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20,
|
64
|
+
def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
|
74
65
|
#Train and calculate the WSBA Expected Goals model
|
75
66
|
|
76
67
|
target = "is_goal"
|
@@ -85,23 +76,11 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
|
|
85
76
|
'distance_from_last',
|
86
77
|
'seconds_since_last',
|
87
78
|
'score_state',
|
79
|
+
'strength_diff',
|
88
80
|
'fenwick_state',
|
89
81
|
'rush_mod',
|
90
82
|
'rebound_mod']
|
91
83
|
boolean = ['is_home',
|
92
|
-
'state_3v3',
|
93
|
-
'state_3v4',
|
94
|
-
'state_3v5',
|
95
|
-
'state_4v3',
|
96
|
-
'state_4v4',
|
97
|
-
'state_4v5',
|
98
|
-
'state_4v6',
|
99
|
-
'state_5v3',
|
100
|
-
'state_5v4',
|
101
|
-
'state_5v5',
|
102
|
-
'state_5v6',
|
103
|
-
'state_6v4',
|
104
|
-
'state_6v5',
|
105
84
|
'wrist',
|
106
85
|
'deflected',
|
107
86
|
'tip-in',
|
@@ -128,7 +107,31 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
|
|
128
107
|
'prior_faceoff']
|
129
108
|
|
130
109
|
#Prep Data
|
131
|
-
|
110
|
+
pbp = prep_xG_data(pbp)
|
111
|
+
#Filter unwanted data:
|
112
|
+
#Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
|
113
|
+
events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
|
114
|
+
fenwick_events = ['missed-shot','shot-on-goal','goal']
|
115
|
+
strengths = ['3v3',
|
116
|
+
'3v4',
|
117
|
+
'3v5',
|
118
|
+
'4v3',
|
119
|
+
'4v4',
|
120
|
+
'4v5',
|
121
|
+
'4v6',
|
122
|
+
'5v3',
|
123
|
+
'5v4',
|
124
|
+
'5v5',
|
125
|
+
'5v6',
|
126
|
+
'6v4',
|
127
|
+
'6v5']
|
128
|
+
|
129
|
+
data = pbp.loc[(pbp['event_type'].isin(events))&
|
130
|
+
(pbp['strength_state'].isin(strengths))&
|
131
|
+
(pbp['period'] < 5)&
|
132
|
+
(pbp['x_fixed'].notna())&
|
133
|
+
(pbp['y_fixed'].notna())&
|
134
|
+
~((pbp['x_fixed']==0)&(pbp['y_fixed']==0)&(pbp['x_fixed'].isin(fenwick_events))&(pbp['event_distance']!=90))]
|
132
135
|
|
133
136
|
#Convert to sparse
|
134
137
|
data_sparse = sp.csr_matrix(data[[target]+continous+boolean])
|
@@ -141,6 +144,231 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
|
|
141
144
|
xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect)
|
142
145
|
|
143
146
|
if train == True:
|
144
|
-
|
147
|
+
# Number of runs
|
148
|
+
run_num = train_runs
|
149
|
+
|
150
|
+
# DataFrames to store results
|
151
|
+
best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
|
152
|
+
best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
|
153
|
+
|
154
|
+
# Loop
|
155
|
+
for i in range(run_num):
|
156
|
+
print(f"### LOOP: {i+1} ###")
|
157
|
+
|
158
|
+
param = {
|
159
|
+
"objective": "binary:logistic",
|
160
|
+
"eval_metric": ["logloss", "auc"],
|
161
|
+
"max_depth": 6,
|
162
|
+
"eta": np.random.uniform(0.06, 0.11),
|
163
|
+
"gamma": np.random.uniform(0.06, 0.12),
|
164
|
+
"subsample": np.random.uniform(0.76, 0.84),
|
165
|
+
"colsample_bytree": np.random.uniform(0.76, 0.8),
|
166
|
+
"min_child_weight": np.random.randint(5, 23),
|
167
|
+
"max_delta_step": np.random.randint(4, 9)
|
168
|
+
}
|
169
|
+
|
170
|
+
# Cross-validation
|
171
|
+
seed = np.random.randint(0, 10000)
|
172
|
+
np.random.seed(seed)
|
173
|
+
|
174
|
+
cv_results = xgb.cv(
|
175
|
+
params=param,
|
176
|
+
dtrain=xgb_matrix,
|
177
|
+
num_boost_round=1000,
|
178
|
+
nfold=5,
|
179
|
+
early_stopping_rounds=25,
|
180
|
+
metrics=["logloss", "auc"],
|
181
|
+
seed=seed
|
182
|
+
)
|
183
|
+
|
184
|
+
# Record results
|
185
|
+
best_df.loc[i] = param
|
186
|
+
best_ll.loc[i] = [
|
187
|
+
cv_results["test-logloss-mean"].min(),
|
188
|
+
cv_results["test-logloss-mean"].idxmin(),
|
189
|
+
cv_results["test-auc-mean"].max(),
|
190
|
+
cv_results["test-auc-mean"].idxmax(),
|
191
|
+
seed
|
192
|
+
]
|
193
|
+
|
194
|
+
# Combine results
|
195
|
+
best_all = pd.concat([best_df, best_ll], axis=1).dropna()
|
196
|
+
|
197
|
+
# Arrange to get best run
|
198
|
+
best_all = best_all.sort_values(by="auc", ascending=False)
|
199
|
+
|
200
|
+
if overwrite == True:
|
201
|
+
best_all.to_csv("xg_model/testing/xg_model_training_runs.csv",index=False)
|
202
|
+
else:
|
203
|
+
best_old = pd.read_csv("xg_model/testing/xg_model_training_runs.csv")
|
204
|
+
best_comb = pd.concat([best_old,best_all])
|
205
|
+
best_comb.to_csv("xg_model/testing/xg_model_training_runs.csv",index=False)
|
206
|
+
|
207
|
+
# Final parameters
|
208
|
+
param_7_EV = {
|
209
|
+
"objective": "binary:logistic",
|
210
|
+
"eval_metric": ["logloss", "auc"],
|
211
|
+
"eta": 0.068,
|
212
|
+
"gamma": 0.12,
|
213
|
+
"subsample": 0.78,
|
214
|
+
"max_depth": 6,
|
215
|
+
"colsample_bytree": 0.76,
|
216
|
+
"min_child_weight": 5,
|
217
|
+
"max_delta_step": 5,
|
218
|
+
}
|
219
|
+
|
220
|
+
# CV rounds Loop
|
221
|
+
run_num = cv_runs
|
222
|
+
cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
|
223
|
+
|
224
|
+
for i in range(run_num):
|
225
|
+
print(f"### LOOP: {i+1} ###")
|
226
|
+
|
227
|
+
seed = np.random.randint(0, 10000)
|
228
|
+
np.random.seed(seed)
|
229
|
+
|
230
|
+
cv_rounds = xgb.cv(
|
231
|
+
params=param_7_EV,
|
232
|
+
dtrain=xgb_matrix,
|
233
|
+
num_boost_round=1000,
|
234
|
+
nfold=5,
|
235
|
+
early_stopping_rounds=25,
|
236
|
+
metrics=["logloss", "auc"],
|
237
|
+
seed=seed
|
238
|
+
)
|
239
|
+
|
240
|
+
# Record results
|
241
|
+
cv_test.loc[i] = [
|
242
|
+
cv_rounds["test-auc-mean"].idxmax(),
|
243
|
+
cv_rounds["test-auc-mean"].max(),
|
244
|
+
cv_rounds["test-logloss-mean"].idxmin(),
|
245
|
+
cv_rounds["test-logloss-mean"].min(),
|
246
|
+
seed
|
247
|
+
]
|
248
|
+
|
249
|
+
# Clean results and sort to find the number of rounds to use and seed
|
250
|
+
cv_final = cv_test.sort_values(by="AUC", ascending=False)
|
251
|
+
if overwrite == True:
|
252
|
+
cv_final.to_csv("xg_model/testing/xg_model_cv_runs.csv",index=False)
|
253
|
+
else:
|
254
|
+
cv_old = pd.read_csv("xg_model/testing/xg_model_cv_runs.csv")
|
255
|
+
cv_comb = pd.concat([cv_old,cv_final])
|
256
|
+
cv_comb.to_csv("xg_model/testing/xg_model_cv_runs.csv")
|
257
|
+
cv_final.loc[len(cv_final)] = cv_test.mean()
|
258
|
+
|
259
|
+
# Train the final model
|
260
|
+
np.random.seed(556)
|
261
|
+
|
262
|
+
if overwrite == False:
|
263
|
+
model = joblib.load(model_path)
|
264
|
+
else:
|
265
|
+
""
|
266
|
+
|
267
|
+
model = xgb.train(
|
268
|
+
params=param_7_EV,
|
269
|
+
dtrain=xgb_matrix,
|
270
|
+
num_boost_round=189,
|
271
|
+
verbose_eval=2
|
272
|
+
)
|
273
|
+
|
274
|
+
joblib.dump(model,model_path)
|
275
|
+
|
276
|
+
else:
|
277
|
+
model = joblib.load(model_path)
|
278
|
+
pbp['xG'] = np.where(pbp['event_type'].isin(fenwick_events),model.predict(xgb_matrix),"")
|
279
|
+
return pbp
|
280
|
+
|
281
|
+
def moneypuck_xG(pbp,repo_path = "tools/xg_model/moneypuck/shots_2007-2023.zip"):
|
282
|
+
#Given play-by-play, return itself with xG column sourced from MoneyPuck.com
|
283
|
+
|
284
|
+
#If file is already in the repository, downloading is not necessary
|
285
|
+
try:
|
286
|
+
db = pd.read_parquet("tools/xg_model/moneypuck/shots/shots_2007-2023.parquet")
|
287
|
+
except:
|
288
|
+
url = 'https://peter-tanner.com/moneypuck/downloads/shots_2007-2023.zip'
|
289
|
+
|
290
|
+
response = rs.get(url)
|
291
|
+
|
292
|
+
if response.status_code == 200:
|
293
|
+
with open(repo_path, 'wb') as file:
|
294
|
+
file.write(response.content)
|
295
|
+
print('File downloaded successfully')
|
296
|
+
else:
|
297
|
+
print('Failed to download file')
|
298
|
+
|
299
|
+
with ZipFile(repo_path, 'r') as zObject:
|
300
|
+
zObject.extractall(
|
301
|
+
path="tools/xg_model/moneypuck/shots/")
|
302
|
+
|
303
|
+
db = pd.read_csv("tools/xg_model/moneypuck/shots/shots_2007-2023.csv")
|
304
|
+
|
305
|
+
#Repeat process with active/most recent season
|
306
|
+
#For the new/recent season, only scrape if the supplied pbp data contains the season
|
307
|
+
if new in list(pbp['season'].astype(str).str[0:4]):
|
308
|
+
url = f'https://peter-tanner.com/moneypuck/downloads/shots_{new}.zip'
|
309
|
+
repo_path = f"tools/xg_model/moneypuck/shots_{new}.zip"
|
310
|
+
|
311
|
+
response = rs.get(url)
|
312
|
+
|
313
|
+
if response.status_code == 200:
|
314
|
+
with open(repo_path, 'wb') as file:
|
315
|
+
file.write(response.content)
|
316
|
+
print('File downloaded successfully')
|
317
|
+
else:
|
318
|
+
print('Failed to download file')
|
319
|
+
|
320
|
+
with ZipFile(repo_path, 'r') as zObject:
|
321
|
+
zObject.extractall(
|
322
|
+
path="tools/xg_model/moneypuck/shots/")
|
323
|
+
|
324
|
+
new_season = pd.read_csv(f"tools/xg_model/moneypuck/shots/shots_{new}.csv")
|
325
|
+
#Convert to parquet
|
326
|
+
new_season.to_parquet(f"tools/xg_model/moneypuck/shots/shots_{new}.csv",index=False)
|
145
327
|
else:
|
146
|
-
|
328
|
+
new_season = pd.DataFrame()
|
329
|
+
#Combine shots
|
330
|
+
moneypuck = pd.concat([db,new_season])
|
331
|
+
|
332
|
+
#Find game ids that occur in supplied pbp and filter moneypuck shots accordingly
|
333
|
+
moneypuck['game_id'] = moneypuck['season'].astype(str)+"0"+moneypuck['game_id'].astype(str)
|
334
|
+
moneypuck['event'] = moneypuck['event'].replace({
|
335
|
+
"SHOT":"shot-on-goal",
|
336
|
+
"MISS":"missed-shot",
|
337
|
+
"BLOCK":"blocked-shot",
|
338
|
+
"GOAL":"goal"
|
339
|
+
})
|
340
|
+
|
341
|
+
#Manual Team Rename
|
342
|
+
moneypuck['teamCode'] = moneypuck['teamCode'].replace({
|
343
|
+
"L.A":"LAK",
|
344
|
+
"N.J":"NJD",
|
345
|
+
"S.J":"SJS",
|
346
|
+
"T.B":"TBL",
|
347
|
+
})
|
348
|
+
pbp['event_team_abbr'] = pbp['event_team_abbr'].replace({
|
349
|
+
"L.A":"LAK",
|
350
|
+
"N.J":"NJD",
|
351
|
+
"S.J":"SJS",
|
352
|
+
"T.B":"TBL",
|
353
|
+
"PHX":'ARI'
|
354
|
+
})
|
355
|
+
|
356
|
+
#Managing oddities in datatypes
|
357
|
+
moneypuck[['game_id','period','time']] = moneypuck[['game_id','period','time']].astype(int)
|
358
|
+
pbp[['game_id','period','seconds_elapsed']] = pbp[['game_id','period','seconds_elapsed']].astype(int)
|
359
|
+
|
360
|
+
#Modify and merge
|
361
|
+
moneypuck = moneypuck[['game_id','period','time','event','teamCode','shooterPlayerId','xGoal']]
|
362
|
+
comb = pd.merge(pbp,moneypuck
|
363
|
+
,left_on=['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
|
364
|
+
,right_on=['game_id','period','time','event','teamCode','shooterPlayerId']
|
365
|
+
,how='left')
|
366
|
+
|
367
|
+
#Drop and rename
|
368
|
+
pbp_xg = comb.drop(columns=['time', 'event', 'teamCode', 'shooterPlayerId']).rename(columns={'xGoal':'xG'})
|
369
|
+
|
370
|
+
if pbp_xg['xG'].isnull().all():
|
371
|
+
print("No MoneyPuck xG values were found for this game...")
|
372
|
+
|
373
|
+
#Return: play-by-play with moneypuck xG column
|
374
|
+
return pbp_xg
|