wsba-hockey 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,374 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import xgboost as xgb
4
+ import scipy.sparse as sp
5
+ import joblib
6
+ from zipfile import ZipFile
7
+ import requests as rs
8
+
9
+ ### XG_MODEL FUNCTIONS ###
10
+ # Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
11
+
12
## GLOBAL VARIABLES ##
#Newest season, written as the eight-character season id
new_full = '20242025'
#First four characters of the newest season id (its starting year)
new = new_full[:4]
16
+
17
def prep_xG_data(pbp):
    """Prepare play-by-play data for xG training and calculation.

    The input is sorted by season, game, period, elapsed seconds and event
    number, and the feature columns used by the xG model are added.

    Every "previous event" value (team, type, coordinates, zone, time gap) is
    taken from the previous row *of the same season and game*; the first event
    of a game has no previous event, so those values are missing (NaN) instead
    of being inherited from the last event of another game.

    Parameters
    ----------
    pbp : pandas.DataFrame
        Play-by-play data. The columns read here are: season, game_id, period,
        seconds_elapsed, event_num, event_type, event_team_abbr, shot_type,
        x_fixed, y_fixed, zone_code, away_team_abbr, home_team_abbr,
        away_score, home_score, away_skaters, home_skaters, away_fenwick and
        home_fenwick.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``pbp`` with the added feature columns. The frame
        passed in is not modified.
    """
    events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
    shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    game_keys = ['season','game_id']

    #sort_values returns a new frame, so everything below works on a copy
    data = pbp.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])

    #Previous event of the same game for every row (missing for the first event of a game)
    prev = data.groupby(game_keys, sort=False)[
        ['seconds_elapsed','event_team_abbr','event_type','x_fixed','y_fixed','zone_code']
    ].shift(1)

    #Add event time details - value is zero when no time has occurred in the game
    elapsed = data['seconds_elapsed']
    data["seconds_since_last"] = np.where(elapsed==0,0,elapsed-prev['seconds_elapsed'])
    #Time until the next event of the same game; the last event of a game gets zero
    next_gap = data.groupby(game_keys, sort=False)['seconds_since_last'].shift(-1, fill_value=0)
    data["event_length"] = np.where(elapsed==0,0,next_gap)

    #Create last event columns (assigned positionally; prev has the same row order as data)
    data["event_team_last"] = prev['event_team_abbr'].to_numpy()
    data["event_type_last"] = prev['event_type'].to_numpy()
    data["x_fixed_last"] = prev['x_fixed'].to_numpy()
    data["y_fixed_last"] = prev['y_fixed'].to_numpy()
    data["zone_code_last"] = prev['zone_code'].to_numpy()

    #Game state from the perspective of the event team
    is_away = data['away_team_abbr']==data['event_team_abbr']
    data['score_state'] = np.where(is_away,data['away_score']-data['home_score'],data['home_score']-data['away_score'])
    data['strength_diff'] = np.where(is_away,data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
    data['fenwick_state'] = np.where(is_away,data['away_fenwick']-data['home_fenwick'],data['home_fenwick']-data['away_fenwick'])
    data['distance_from_last'] = np.sqrt((data['x_fixed'] - data['x_fixed_last'])**2 + (data['y_fixed'] - data['y_fixed_last'])**2)

    #Rush and rebounds are included and graded off of the speed of the event (an event cannot be a rush event unless it also occurs in the offensive zone)
    #Comparisons against a missing seconds_since_last are False, so the first event of a game is never a rush or rebound
    is_fenwick = data['event_type'].isin(fenwick_events)
    data['rush_mod'] = np.where(is_fenwick&(data['zone_code_last'].isin(['N','D']))&(data['x_fixed']>25)&(data['seconds_since_last']<5),5-data['seconds_since_last'],0)
    data['rebound_mod'] = np.where(is_fenwick&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<3),3-data['seconds_since_last'],0)

    #Create boolean variables
    data["is_goal"]=(data['event_type']=='goal').astype(int)
    data["is_home"]=(data['home_team_abbr']==data['event_team_abbr']).astype(int)

    #Boolean variables for shot types and prior events
    for shot in shot_types:
        data[shot] = (data['shot_type']==shot).astype(int)
    #Every event except the final entry ('goal') gets same-team / opposing-team prior flags
    same_team = data['event_team_last']==data['event_team_abbr']
    for event in events[:-1]:
        was_event = data['event_type_last']==event
        data[f'prior_{event}_same'] = (was_event&same_team).astype(int)
        data[f'prior_{event}_opp'] = (was_event&~same_team).astype(int)

    data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)

    #Return: pbp data prepared to train and calculate the xG model
    return data
63
+
64
def _append_run_log(results, csv_path, overwrite):
    #Write results to csv_path; earlier runs are kept in front unless overwrite is set or no earlier log exists
    import os

    if not overwrite and os.path.exists(csv_path):
        results = pd.concat([pd.read_csv(csv_path), results])
    results.to_csv(csv_path, index=False)


def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
    """Train (optionally) and calculate the WSBA Expected Goals model.

    Parameters
    ----------
    pbp : pandas.DataFrame
        Play-by-play data; it is passed through ``prep_xG_data`` first. In
        addition to the columns that function needs, ``strength_state``,
        ``event_distance`` and ``event_angle`` are read here.
    train : bool
        When true, run the random parameter search and the cross-validation
        rounds, log both to CSV, train a new model and store it at
        ``model_path``. When false, the model is loaded from ``model_path``.
    overwrite : bool
        When true the CSV run logs are replaced; otherwise new runs are
        appended to the existing logs (a missing log is simply created).
    model_path : str
        Location of the joblib file holding the model.
    train_runs : int
        Number of random parameter-search runs.
    cv_runs : int
        Number of cross-validation runs with the final parameters.

    Returns
    -------
    pandas.DataFrame
        The prepared play-by-play with an ``xG`` column. ``xG`` holds the
        model prediction for fenwick events (missed-shot, shot-on-goal, goal)
        that passed the filters below and NaN for every other row.
    """
    target = "is_goal"
    continuous = ['event_distance',
                  'event_angle',
                  'seconds_elapsed',
                  'period',
                  'x_fixed',
                  'y_fixed',
                  'x_fixed_last',
                  'y_fixed_last',
                  'distance_from_last',
                  'seconds_since_last',
                  'score_state',
                  'strength_diff',
                  'fenwick_state',
                  'rush_mod',
                  'rebound_mod']
    boolean = ['is_home',
               'wrist',
               'deflected',
               'tip-in',
               'slap',
               'backhand',
               'snap',
               'wrap-around',
               'poke',
               'bat',
               'cradle',
               'between-legs',
               'prior_shot-on-goal_same',
               'prior_missed-shot_same',
               'prior_blocked-shot_same',
               'prior_giveaway_same',
               'prior_takeaway_same',
               'prior_hit_same',
               'prior_shot-on-goal_opp',
               'prior_missed-shot_opp',
               'prior_blocked-shot_opp',
               'prior_giveaway_opp',
               'prior_takeaway_opp',
               'prior_hit_opp',
               'prior_faceoff']

    #Prep Data
    pbp = prep_xG_data(pbp)

    #Filter unwanted data:
    #Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
    events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
    fenwick_events = ['missed-shot','shot-on-goal','goal']
    strengths = ['3v3',
                 '3v4',
                 '3v5',
                 '4v3',
                 '4v4',
                 '4v5',
                 '4v6',
                 '5v3',
                 '5v4',
                 '5v5',
                 '5v6',
                 '6v4',
                 '6v5']

    is_fenwick = pbp['event_type'].isin(fenwick_events).to_numpy()
    #Fenwick events recorded at exactly (0,0) are dropped unless their event_distance is 90.
    #The event-type test has to look at event_type: testing x_fixed against event names never matches.
    bad_origin = ((pbp['x_fixed']==0)&(pbp['y_fixed']==0)&(pbp['event_distance']!=90)).to_numpy() & is_fenwick
    keep = ((pbp['event_type'].isin(events))&
            (pbp['strength_state'].isin(strengths))&
            (pbp['period'] < 5)&
            (pbp['x_fixed'].notna())&
            (pbp['y_fixed'].notna())).to_numpy() & ~bad_origin

    data = pbp.loc[keep]

    #Convert to sparse
    data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])

    #Target and Predictors (.toarray() replaces the sparse-matrix .A shortcut)
    is_goal_vect = data_sparse[:, 0].toarray()
    predictors = data_sparse[:, 1:]

    #XGB DataModel
    xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect)

    if train:
        # DataFrames to store results
        best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
        best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])

        # Random parameter search
        for i in range(train_runs):
            print(f"### LOOP: {i+1} ###")

            param = {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "auc"],
                "max_depth": 6,
                "eta": np.random.uniform(0.06, 0.11),
                "gamma": np.random.uniform(0.06, 0.12),
                "subsample": np.random.uniform(0.76, 0.84),
                "colsample_bytree": np.random.uniform(0.76, 0.8),
                "min_child_weight": np.random.randint(5, 23),
                "max_delta_step": np.random.randint(4, 9)
            }

            # Cross-validation
            seed = np.random.randint(0, 10000)
            np.random.seed(seed)

            cv_results = xgb.cv(
                params=param,
                dtrain=xgb_matrix,
                num_boost_round=1000,
                nfold=5,
                early_stopping_rounds=25,
                metrics=["logloss", "auc"],
                seed=seed
            )

            # Record results (only the keys matching best_df's columns are stored)
            best_df.loc[i] = param
            best_ll.loc[i] = [
                cv_results["test-logloss-mean"].min(),
                cv_results["test-logloss-mean"].idxmin(),
                cv_results["test-auc-mean"].max(),
                cv_results["test-auc-mean"].idxmax(),
                seed
            ]

        # Combine results and arrange to get best run
        best_all = pd.concat([best_df, best_ll], axis=1).dropna()
        best_all = best_all.sort_values(by="auc", ascending=False)
        _append_run_log(best_all, "xg_model/testing/xg_model_training_runs.csv", overwrite)

        # Final parameters
        param_7_EV = {
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "auc"],
            "eta": 0.068,
            "gamma": 0.12,
            "subsample": 0.78,
            "max_depth": 6,
            "colsample_bytree": 0.76,
            "min_child_weight": 5,
            "max_delta_step": 5,
        }

        # CV rounds Loop
        cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])

        for i in range(cv_runs):
            print(f"### LOOP: {i+1} ###")

            seed = np.random.randint(0, 10000)
            np.random.seed(seed)

            cv_rounds = xgb.cv(
                params=param_7_EV,
                dtrain=xgb_matrix,
                num_boost_round=1000,
                nfold=5,
                early_stopping_rounds=25,
                metrics=["logloss", "auc"],
                seed=seed
            )

            # Record results
            cv_test.loc[i] = [
                cv_rounds["test-auc-mean"].idxmax(),
                cv_rounds["test-auc-mean"].max(),
                cv_rounds["test-logloss-mean"].idxmin(),
                cv_rounds["test-logloss-mean"].min(),
                seed
            ]

        # Sort to find the number of rounds to use and seed, then log (index is never written)
        cv_final = cv_test.sort_values(by="AUC", ascending=False)
        _append_run_log(cv_final, "xg_model/testing/xg_model_cv_runs.csv", overwrite)

        # Train the final model; the file at model_path is always rewritten when train is set
        np.random.seed(556)

        model = xgb.train(
            params=param_7_EV,
            dtrain=xgb_matrix,
            num_boost_round=189,
            verbose_eval=2
        )

        joblib.dump(model,model_path)

    else:
        model = joblib.load(model_path)

    #Predictions exist only for the filtered rows, so they are placed back by position;
    #rows that were filtered out or are not fenwick events stay NaN
    xg = np.full(len(pbp), np.nan)
    xg[keep] = model.predict(xgb_matrix)
    xg[~is_fenwick] = np.nan
    pbp['xG'] = xg
    return pbp
280
+
281
def _fetch_moneypuck_zip(url, zip_path, extract_dir):
    #Download the archive at url to zip_path and extract it into extract_dir
    response = rs.get(url, timeout=60)

    if response.status_code == 200:
        with open(zip_path, 'wb') as file:
            file.write(response.content)
        print('File downloaded successfully')
    else:
        #Carry on regardless: an archive already present at zip_path is still extracted below
        print('Failed to download file')

    with ZipFile(zip_path, 'r') as zObject:
        zObject.extractall(path=extract_dir)


def moneypuck_xG(pbp,repo_path = "tools/xg_model/moneypuck/shots_2007-2023.zip"):
    """Given play-by-play, return it with an xG column sourced from MoneyPuck.com.

    Parameters
    ----------
    pbp : pandas.DataFrame
        Play-by-play data. The columns read here are season, game_id, period,
        seconds_elapsed, event_type, event_team_abbr and event_player_1_id.
        The frame passed in is left untouched; a copy is modified and merged.
    repo_path : str
        Where the downloaded 2007-2023 shots archive is stored.

    Returns
    -------
    pandas.DataFrame
        The play-by-play (team abbreviations normalised, game_id / period /
        seconds_elapsed cast to int) left-joined with MoneyPuck's ``xGoal``,
        renamed to ``xG``. Rows with no matching MoneyPuck shot have a
        missing ``xG``.
    """
    shots_dir = "tools/xg_model/moneypuck/shots/"
    history_parquet = shots_dir + "shots_2007-2023.parquet"

    #If file is already in the repository downloading is not necessary
    try:
        db = pd.read_parquet(history_parquet)
    except Exception:
        #Any failure to read the cached file falls back to a fresh download
        _fetch_moneypuck_zip('https://peter-tanner.com/moneypuck/downloads/shots_2007-2023.zip', repo_path, shots_dir)
        db = pd.read_csv(shots_dir + "shots_2007-2023.csv")

        #Store the parquet copy the lookup above expects, so later calls can skip the download
        try:
            db.to_parquet(history_parquet,index=False)
        except Exception as error:
            print(f'Could not cache MoneyPuck shots as parquet: {error}')

    frames = [db]

    #Repeat process with active/most recent season
    #For the new/recent season, only scrape if the supplied pbp data contains the season
    if pbp['season'].astype(str).str[0:4].eq(new).any():
        _fetch_moneypuck_zip(f'https://peter-tanner.com/moneypuck/downloads/shots_{new}.zip',
                             f"tools/xg_model/moneypuck/shots_{new}.zip",
                             shots_dir)

        new_season = pd.read_csv(f"{shots_dir}shots_{new}.csv")
        #Convert to parquet (written next to the CSV, not over it)
        new_season.to_parquet(f"{shots_dir}shots_{new}.parquet",index=False)
        frames.append(new_season)

    #Combine shots
    moneypuck = pd.concat(frames)

    #Find game ids that occur in supplied pbp and filter moneypuck shots accordingly
    moneypuck['game_id'] = moneypuck['season'].astype(str)+"0"+moneypuck['game_id'].astype(str)
    moneypuck['event'] = moneypuck['event'].replace({
        "SHOT":"shot-on-goal",
        "MISS":"missed-shot",
        "BLOCK":"blocked-shot",
        "GOAL":"goal"
    })

    #Manual Team Rename
    moneypuck['teamCode'] = moneypuck['teamCode'].replace({
        "L.A":"LAK",
        "N.J":"NJD",
        "S.J":"SJS",
        "T.B":"TBL",
    })
    #Work on a copy so the caller's play-by-play is not altered by the renames and casts below
    pbp = pbp.copy()
    pbp['event_team_abbr'] = pbp['event_team_abbr'].replace({
        "L.A":"LAK",
        "N.J":"NJD",
        "S.J":"SJS",
        "T.B":"TBL",
        "PHX":'ARI'
    })

    #Managing oddities in datatypes
    moneypuck[['game_id','period','time']] = moneypuck[['game_id','period','time']].astype(int)
    pbp[['game_id','period','seconds_elapsed']] = pbp[['game_id','period','seconds_elapsed']].astype(int)

    #Modify and merge
    #NOTE(review): several MoneyPuck rows sharing all six keys would multiply pbp rows in this left join - confirm the keys are unique
    moneypuck = moneypuck[['game_id','period','time','event','teamCode','shooterPlayerId','xGoal']]
    comb = pd.merge(pbp,moneypuck
                    ,left_on=['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
                    ,right_on=['game_id','period','time','event','teamCode','shooterPlayerId']
                    ,how='left')

    #Drop and rename
    pbp_xg = comb.drop(columns=['time', 'event', 'teamCode', 'shooterPlayerId']).rename(columns={'xGoal':'xG'})

    if pbp_xg['xG'].isnull().all():
        print("No MoneyPuck xG values were found for this game...")

    #Return: play-by-play with moneypuck xG column
    return pbp_xg