wsba-hockey 0.1.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,54 +3,45 @@ import numpy as np
3
3
  import xgboost as xgb
4
4
  import scipy.sparse as sp
5
5
  import joblib
6
+ from zipfile import ZipFile
7
+ import requests as rs
6
8
 
7
9
  ### XG_MODEL FUNCTIONS ###
8
10
  # Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
9
11
 
12
+ ## GLOBAL VARIABLES ##
13
+ #Newest season
14
+ new_full = '20242025'
15
+ new = '2024'
16
+
10
17
  def prep_xG_data(pbp):
11
18
  #Prep data for xG training and calculation
12
19
 
13
20
  events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
14
21
  shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
15
22
  fenwick_events = ['missed-shot','shot-on-goal','goal']
16
- strengths = ['3v3',
17
- '3v4',
18
- '3v5',
19
- '4v3',
20
- '4v4',
21
- '4v5',
22
- '4v6',
23
- '5v3',
24
- '5v4',
25
- '5v5',
26
- '5v6',
27
- '6v4',
28
- '6v5']
29
23
 
30
- #Filter unwanted date:
31
- #Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
32
- data = pbp.loc[(pbp['event_type'].isin(events))&
33
- (pbp['strength_state'].isin(strengths))&
34
- (pbp['period'] < 5)&
35
- (pbp['x_fixed'].notna())&
36
- (pbp['y_fixed'].notna())&
37
- ~((pbp['x_fixed']==0)&(pbp['y_fixed']==0)&(pbp['x_fixed'].isin(fenwick_events))&(pbp['event_distance']!=90))]
38
- #Create last event columns
39
- data = data.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
24
+ #Informal groupby
25
+ data = pbp.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
40
26
 
41
- data["seconds_since_last"] = data['seconds_elapsed']-data['seconds_elapsed'].shift(1)
27
+ #Add event time details - prevent leaking between games by setting value to zero when no time has occurred in game
28
+ data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_elapsed']-data['seconds_elapsed'].shift(1))
29
+ data["event_length"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'].shift(-1))
30
+
31
+ #Create last event columns
42
32
  data["event_team_last"] = data['event_team_abbr'].shift(1)
43
33
  data["event_type_last"] = data['event_type'].shift(1)
44
34
  data["x_fixed_last"] = data['x_fixed'].shift(1)
45
35
  data["y_fixed_last"] = data['y_fixed'].shift(1)
46
- data["zone_code_last"] = data['zone_code'].shift(1)
47
- data['shot_type'] = data['shot_type'].fillna('wrist')
48
-
36
+ data["zone_code_last"] = data['zone_code'].shift(1)
49
37
 
50
38
  data.sort_values(['season','game_id','period','seconds_elapsed','event_num'],inplace=True)
51
39
  data['score_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_score']-data['home_score'],data['home_score']-data['away_score'])
40
+ data['strength_diff'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
52
41
  data['fenwick_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_fenwick']-data['home_fenwick'],data['home_fenwick']-data['away_fenwick'])
53
42
  data['distance_from_last'] = np.sqrt((data['x_fixed'] - data['x_fixed_last'])**2 + (data['y_fixed'] - data['y_fixed_last'])**2)
43
+
44
+ #Rush and rebounds are included and graded off of the speed of the event (an event cannot be a rush event unless it also occurs in the offensive zone)
54
45
  data['rush_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_fixed']>25)&(data['seconds_since_last']<5),5-data['seconds_since_last'],0)
55
46
  data['rebound_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<3),3-data['seconds_since_last'],0)
56
47
 
@@ -58,19 +49,19 @@ def prep_xG_data(pbp):
58
49
  data["is_goal"]=(data['event_type']=='goal').astype(int)
59
50
  data["is_home"]=(data['home_team_abbr']==data['event_team_abbr']).astype(int)
60
51
 
61
-
52
+ #Boolean variables for shot types and prior events
62
53
  for shot in shot_types:
63
54
  data[shot] = (data['shot_type']==shot).astype(int)
64
- for strength in strengths:
65
- data[f'state_{strength}'] = (data['strength_state']==strength).astype(int)
66
55
  for event in events[0:len(events)-1]:
67
56
  data[f'prior_{event}_same'] = ((data['event_type_last']==event)&(data['event_team_last']==data['event_team_abbr'])).astype(int)
68
57
  data[f'prior_{event}_opp'] = ((data['event_type_last']==event)&(data['event_team_last']!=data['event_team_abbr'])).astype(int)
69
58
 
59
+ data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)
60
+
70
61
  #Return: pbp data prepared to train and calculate the xG model
71
62
  return data
72
63
 
73
- def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, test_runs = 20):
64
+ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
74
65
  #Train and calculate the WSBA Expected Goals model
75
66
 
76
67
  target = "is_goal"
@@ -85,23 +76,11 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
85
76
  'distance_from_last',
86
77
  'seconds_since_last',
87
78
  'score_state',
79
+ 'strength_diff',
88
80
  'fenwick_state',
89
81
  'rush_mod',
90
82
  'rebound_mod']
91
83
  boolean = ['is_home',
92
- 'state_3v3',
93
- 'state_3v4',
94
- 'state_3v5',
95
- 'state_4v3',
96
- 'state_4v4',
97
- 'state_4v5',
98
- 'state_4v6',
99
- 'state_5v3',
100
- 'state_5v4',
101
- 'state_5v5',
102
- 'state_5v6',
103
- 'state_6v4',
104
- 'state_6v5',
105
84
  'wrist',
106
85
  'deflected',
107
86
  'tip-in',
@@ -128,7 +107,31 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
128
107
  'prior_faceoff']
129
108
 
130
109
  #Prep Data
131
- data = prep_xG_data(pbp)
110
+ pbp = prep_xG_data(pbp)
111
+ #Filter unwanted data:
112
+ #Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
113
+ events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
114
+ fenwick_events = ['missed-shot','shot-on-goal','goal']
115
+ strengths = ['3v3',
116
+ '3v4',
117
+ '3v5',
118
+ '4v3',
119
+ '4v4',
120
+ '4v5',
121
+ '4v6',
122
+ '5v3',
123
+ '5v4',
124
+ '5v5',
125
+ '5v6',
126
+ '6v4',
127
+ '6v5']
128
+
129
+ data = pbp.loc[(pbp['event_type'].isin(events))&
130
+ (pbp['strength_state'].isin(strengths))&
131
+ (pbp['period'] < 5)&
132
+ (pbp['x_fixed'].notna())&
133
+ (pbp['y_fixed'].notna())&
134
+ ~((pbp['x_fixed']==0)&(pbp['y_fixed']==0)&(pbp['x_fixed'].isin(fenwick_events))&(pbp['event_distance']!=90))]
132
135
 
133
136
  #Convert to sparse
134
137
  data_sparse = sp.csr_matrix(data[[target]+continous+boolean])
@@ -141,6 +144,231 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
141
144
  xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect)
142
145
 
143
146
  if train == True:
144
- run_num =
147
+ # Number of runs
148
+ run_num = train_runs
149
+
150
+ # DataFrames to store results
151
+ best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
152
+ best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
153
+
154
+ # Loop
155
+ for i in range(run_num):
156
+ print(f"### LOOP: {i+1} ###")
157
+
158
+ param = {
159
+ "objective": "binary:logistic",
160
+ "eval_metric": ["logloss", "auc"],
161
+ "max_depth": 6,
162
+ "eta": np.random.uniform(0.06, 0.11),
163
+ "gamma": np.random.uniform(0.06, 0.12),
164
+ "subsample": np.random.uniform(0.76, 0.84),
165
+ "colsample_bytree": np.random.uniform(0.76, 0.8),
166
+ "min_child_weight": np.random.randint(5, 23),
167
+ "max_delta_step": np.random.randint(4, 9)
168
+ }
169
+
170
+ # Cross-validation
171
+ seed = np.random.randint(0, 10000)
172
+ np.random.seed(seed)
173
+
174
+ cv_results = xgb.cv(
175
+ params=param,
176
+ dtrain=xgb_matrix,
177
+ num_boost_round=1000,
178
+ nfold=5,
179
+ early_stopping_rounds=25,
180
+ metrics=["logloss", "auc"],
181
+ seed=seed
182
+ )
183
+
184
+ # Record results
185
+ best_df.loc[i] = param
186
+ best_ll.loc[i] = [
187
+ cv_results["test-logloss-mean"].min(),
188
+ cv_results["test-logloss-mean"].idxmin(),
189
+ cv_results["test-auc-mean"].max(),
190
+ cv_results["test-auc-mean"].idxmax(),
191
+ seed
192
+ ]
193
+
194
+ # Combine results
195
+ best_all = pd.concat([best_df, best_ll], axis=1).dropna()
196
+
197
+ # Arrange to get best run
198
+ best_all = best_all.sort_values(by="auc", ascending=False)
199
+
200
+ if overwrite == True:
201
+ best_all.to_csv("xg_model/testing/xg_model_training_runs.csv",index=False)
202
+ else:
203
+ best_old = pd.read_csv("xg_model/testing/xg_model_training_runs.csv")
204
+ best_comb = pd.concat([best_old,best_all])
205
+ best_comb.to_csv("xg_model/testing/xg_model_training_runs.csv",index=False)
206
+
207
+ # Final parameters
208
+ param_7_EV = {
209
+ "objective": "binary:logistic",
210
+ "eval_metric": ["logloss", "auc"],
211
+ "eta": 0.068,
212
+ "gamma": 0.12,
213
+ "subsample": 0.78,
214
+ "max_depth": 6,
215
+ "colsample_bytree": 0.76,
216
+ "min_child_weight": 5,
217
+ "max_delta_step": 5,
218
+ }
219
+
220
+ # CV rounds Loop
221
+ run_num = cv_runs
222
+ cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
223
+
224
+ for i in range(run_num):
225
+ print(f"### LOOP: {i+1} ###")
226
+
227
+ seed = np.random.randint(0, 10000)
228
+ np.random.seed(seed)
229
+
230
+ cv_rounds = xgb.cv(
231
+ params=param_7_EV,
232
+ dtrain=xgb_matrix,
233
+ num_boost_round=1000,
234
+ nfold=5,
235
+ early_stopping_rounds=25,
236
+ metrics=["logloss", "auc"],
237
+ seed=seed
238
+ )
239
+
240
+ # Record results
241
+ cv_test.loc[i] = [
242
+ cv_rounds["test-auc-mean"].idxmax(),
243
+ cv_rounds["test-auc-mean"].max(),
244
+ cv_rounds["test-logloss-mean"].idxmin(),
245
+ cv_rounds["test-logloss-mean"].min(),
246
+ seed
247
+ ]
248
+
249
+ # Clean results and sort to find the number of rounds to use and seed
250
+ cv_final = cv_test.sort_values(by="AUC", ascending=False)
251
+ if overwrite == True:
252
+ cv_final.to_csv("xg_model/testing/xg_model_cv_runs.csv",index=False)
253
+ else:
254
+ cv_old = pd.read_csv("xg_model/testing/xg_model_cv_runs.csv")
255
+ cv_comb = pd.concat([cv_old,cv_final])
256
+ cv_comb.to_csv("xg_model/testing/xg_model_cv_runs.csv")
257
+ cv_final.loc[len(cv_final)] = cv_test.mean()
258
+
259
+ # Train the final model
260
+ np.random.seed(556)
261
+
262
+ if overwrite == False:
263
+ model = joblib.load(model_path)
264
+ else:
265
+ ""
266
+
267
+ model = xgb.train(
268
+ params=param_7_EV,
269
+ dtrain=xgb_matrix,
270
+ num_boost_round=189,
271
+ verbose_eval=2
272
+ )
273
+
274
+ joblib.dump(model,model_path)
275
+
276
+ else:
277
+ model = joblib.load(model_path)
278
+ pbp['xG'] = np.where(pbp['event_type'].isin(fenwick_events),model.predict(xgb_matrix),"")
279
+ return pbp
280
+
281
def moneypuck_xG(pbp,repo_path = "tools/xg_model/moneypuck/shots_2007-2023.zip"):
    """Return play-by-play with an ``xG`` column sourced from MoneyPuck.com shot data.

    The 2007-2023 shot archive is read from a local parquet file when one can be
    loaded; otherwise the zip is downloaded to ``repo_path``, extracted, read from
    CSV and cached as parquet (best effort) for later calls. When the supplied
    play-by-play contains the season named by the module-level ``new``, that
    season's archive is downloaded and appended as well.

    Args:
        pbp: play-by-play DataFrame. The code reads the columns ``season``,
            ``game_id``, ``period``, ``seconds_elapsed``, ``event_type``,
            ``event_team_abbr`` and ``event_player_1_id``. The caller's frame is
            not modified; normalisation is applied to a copy.
        repo_path: where the 2007-2023 zip archive is written when it has to be
            downloaded.

    Returns:
        A new DataFrame: ``pbp`` (with team abbreviations normalised and
        ``game_id``/``period``/``seconds_elapsed`` cast to int) left-merged with
        the MoneyPuck ``xGoal`` value, renamed to ``xG``. Rows without a matching
        MoneyPuck shot have a null ``xG``.
    """
    shots_dir = "tools/xg_model/moneypuck/shots/"
    historic_parquet = shots_dir + "shots_2007-2023.parquet"

    #If file is already in the repository downloading is not necessary
    try:
        db = pd.read_parquet(historic_parquet)
    except Exception:
        #Any failure to load the cache (missing file, no parquet engine, corrupt file) falls back to a fresh download
        _fetch_moneypuck_zip('https://peter-tanner.com/moneypuck/downloads/shots_2007-2023.zip',repo_path,shots_dir)
        db = pd.read_csv(shots_dir + "shots_2007-2023.csv")

        #Cache as parquet so the lookup above can succeed next time; a failed cache write must not block the merge
        try:
            db.to_parquet(historic_parquet,index=False)
        except Exception as err:
            print(f'Could not cache MoneyPuck shots as parquet: {err}')

    #Repeat process with active/most recent season
    #For the new/recent season, only scrape if the supplied pbp data contains the season
    if pbp['season'].astype(str).str[0:4].eq(new).any():
        _fetch_moneypuck_zip(f'https://peter-tanner.com/moneypuck/downloads/shots_{new}.zip',
                             f"tools/xg_model/moneypuck/shots_{new}.zip",
                             shots_dir)

        new_season = pd.read_csv(f"{shots_dir}shots_{new}.csv")
        #Convert to parquet - written next to the csv rather than over it
        new_season.to_parquet(f"{shots_dir}shots_{new}.parquet",index=False)
    else:
        new_season = pd.DataFrame()

    #Combine shots
    moneypuck = pd.concat([db,new_season])

    #Find game ids that occur in supplied pbp and filter moneypuck shots accordingly
    moneypuck['game_id'] = moneypuck['season'].astype(str)+"0"+moneypuck['game_id'].astype(str)
    moneypuck['event'] = moneypuck['event'].replace({
        "SHOT":"shot-on-goal",
        "MISS":"missed-shot",
        "BLOCK":"blocked-shot",
        "GOAL":"goal"
    })

    #Manual Team Rename
    moneypuck['teamCode'] = moneypuck['teamCode'].replace({
        "L.A":"LAK",
        "N.J":"NJD",
        "S.J":"SJS",
        "T.B":"TBL",
    })

    #Work on a copy so the caller's play-by-play is left untouched
    pbp = pbp.copy()
    pbp['event_team_abbr'] = pbp['event_team_abbr'].replace({
        "L.A":"LAK",
        "N.J":"NJD",
        "S.J":"SJS",
        "T.B":"TBL",
        "PHX":'ARI'
    })

    #Managing oddities in datatypes
    moneypuck[['game_id','period','time']] = moneypuck[['game_id','period','time']].astype(int)
    pbp[['game_id','period','seconds_elapsed']] = pbp[['game_id','period','seconds_elapsed']].astype(int)

    #Modify and merge
    moneypuck = moneypuck[['game_id','period','time','event','teamCode','shooterPlayerId','xGoal']]
    comb = pd.merge(pbp,moneypuck
                    ,left_on=['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
                    ,right_on=['game_id','period','time','event','teamCode','shooterPlayerId']
                    ,how='left')

    #Drop and rename
    pbp_xg = comb.drop(columns=['time', 'event', 'teamCode', 'shooterPlayerId']).rename(columns={'xGoal':'xG'})

    if pbp_xg['xG'].isnull().all():
        print("No MoneyPuck xG values were found for this game...")

    #Return: play-by-play with moneypuck xG column
    return pbp_xg

def _fetch_moneypuck_zip(url,zip_path,extract_dir,timeout = 60):
    #Download a MoneyPuck shots archive to zip_path and unpack it into extract_dir
    #The timeout bounds how long the request may wait on the server so a stalled connection cannot hang forever
    response = rs.get(url,timeout=timeout)

    if response.status_code == 200:
        with open(zip_path, 'wb') as file:
            file.write(response.content)
        print('File downloaded successfully')
    else:
        print('Failed to download file')

    #Extraction still runs after a failed download so an archive left by an earlier run can be used
    with ZipFile(zip_path, 'r') as zObject:
        zObject.extractall(path=extract_dir)