wsba-hockey 1.1.1-py3-none-any.whl → 1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,455 +0,0 @@
- import joblib
- import os
- import pandas as pd
- import numpy as np
- import xgboost as xgb
- import scipy.sparse as sp
- import wsba_hockey.wsba_main as wsba
- import wsba_hockey.tools.scraping as scraping
- import matplotlib.pyplot as plt
- from sklearn.calibration import calibration_curve
- from sklearn.metrics import roc_curve, auc
-
- ### XG_MODEL FUNCTIONS ###
- # Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
-
- ## GLOBAL VARIABLES ##
-
- target = "is_goal"
- continuous = ['event_distance',
-               'event_angle',
-               'seconds_elapsed',
-               'period',
-               'x_adj',
-               'y_adj',
-               'distance_from_last',
-               'angle_from_last',
-               'seconds_since_last',
-               'speed_from_last',
-               'speed_of_angle_from_last',
-               'score_state',
-               'strength_diff'
-               ]
- boolean = ['is_home',
-            'wrist',
-            'deflected',
-            'tip-in',
-            'slap',
-            'backhand',
-            'snap',
-            'wrap-around',
-            'poke',
-            'bat',
-            'cradle',
-            'between-legs',
-            'prior_shot-on-goal_same',
-            'prior_missed-shot_same',
-            'prior_blocked-shot_same',
-            'prior_giveaway_same',
-            'prior_takeaway_same',
-            'prior_hit_same',
-            'prior_shot-on-goal_opp',
-            'prior_missed-shot_opp',
-            'prior_blocked-shot_opp',
-            'prior_giveaway_opp',
-            'prior_takeaway_opp',
-            'prior_hit_opp',
-            'prior_faceoff',
-            'regular',
-            'empty_net',
-            'offwing',
-            'rush',
-            'rebound'
-            ]
-
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
- shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
- fenwick_events = ['missed-shot','shot-on-goal','goal']
- strengths = ['3v3',
-              '3v4',
-              '3v5',
-              '4v3',
-              '4v4',
-              '4v5',
-              '4v6',
-              '5v3',
-              '5v4',
-              '5v5',
-              '5v6',
-              '6v4',
-              '6v5']
-
- dir = os.path.dirname(os.path.realpath(__file__))
- roster_path = os.path.join(dir,'rosters\\nhl_rosters.csv')
- xg_model_path = os.path.join(dir,'xg_model\\wsba_xg.joblib')
- test_path = os.path.join(dir,'xg_model\\testing\\xg_model_training_runs.csv')
- cv_path = os.path.join(dir,'xg_model\\testing\\xg_model_cv_runs.csv')
-
- def fix_players(pbp):
-     #Add/fix player info for shooters and goaltenders
-     print('Adding player info to pbp...')
-
-     #Load roster and all players
-     roster = pd.read_csv(roster_path).drop_duplicates(['id'])[['fullName','id','shootsCatches']]
-
-     #Some players are missing from the roster file (generally in newer seasons); add these manually
-     miss = list(pbp.loc[~(pbp['event_player_1_id'].isin(list(roster['id'])))&(pbp['event_player_1_id'].notna()),'event_player_1_id'].drop_duplicates())
-     if miss:
-         add = wsba.nhl_scrape_player_data(miss).rename(columns={'playerId':'id'})[['fullName','id','shootsCatches']]
-         roster = pd.concat([roster,add]).reset_index(drop=True)
-
-     #Conversion dict
-     roster['id'] = roster['id'].astype(str)
-     roster_dict = roster.set_index('id').to_dict()['shootsCatches']
-     names_dict = roster.set_index('id').to_dict()['fullName']
-
-     #Add player names
-     for i in range(3):
-         pbp[f'add_player_{i+1}_name'] = np.where(pbp[f'event_player_{i+1}_name'].isna(),pbp[f'event_player_{i+1}_id'].astype(str).replace(names_dict),np.nan)
-         pbp[f'event_player_{i+1}_name'] = pbp[f'event_player_{i+1}_name'].combine_first(pbp[f'add_player_{i+1}_name'])
-
-     pbp['event_goalie_name'] = pbp['event_goalie_id'].astype(str).replace(names_dict)
-
-     #Add hands
-     pbp['event_player_1_hand'] = pbp['event_player_1_id'].astype(str).str.replace('.0','').replace(roster_dict)
-     pbp['event_player_1_hand'] = pbp['event_player_1_hand'].replace('nan',np.nan)
-
-     return pbp
-
- def prep_xG_data(data):
-     #Prep data for xG training and calculation
-     data = fix_players(data)
-
-     #Informal groupby
-     data = data.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
-
-     #Recalibrate times series data with current data
-     data['seconds_since_last'] = data['seconds_elapsed'] - data['seconds_elapsed'].shift(1)
-     #Prevent leaking between games by setting value to zero when no time has occured in game
-     data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'])
-
-     #Create last event columns
-     data["event_team_last"] = data['event_team_abbr'].shift(1)
-     data["event_type_last"] = data['event_type'].shift(1)
-     data["x_adj_last"] = data['x_adj'].shift(1)
-     data["y_adj_last"] = data['y_adj'].shift(1)
-     data["zone_code_last"] = data['zone_code'].shift(1)
-
-     data.sort_values(['season','game_id','period','seconds_elapsed','event_num'],inplace=True)
-
-     #Contextual Data (for score state minimize the capture to four goals)
-     data['score_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_score']-data['home_score'],data['home_score']-data['away_score'])
-     data['score_state'] = np.where(data['score_state']>4,4,data['score_state'])
-     data['score_state'] = np.where(data['score_state']<-4,-4,data['score_state'])
-
-     data['strength_diff'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
-     data['strength_state_venue'] = data['away_skaters'].astype(str)+'v'+data['home_skaters'].astype(str)
-     data['distance_from_last'] = np.sqrt((data['x_adj'] - data['x_adj_last'])**2 + (data['y_adj'] - data['y_adj_last'])**2)
-     data['angle_from_last'] = np.degrees(np.arctan2(abs(data['y_adj'] - data['y_adj_last']), abs(89 - (data['x_adj']-data['x_adj_last']))))
-
-     #Event speeds
-     data['speed_from_last'] = np.where(data['seconds_since_last']==0,0,data['distance_from_last']/data['seconds_since_last'])
-     data['speed_of_angle_from_last'] = np.where(data['seconds_since_last']==0,0,data['angle_from_last']/data['seconds_since_last'])
-
-     #Rush and rebounds are labelled
-     data['rush'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_adj']>25)&(data['seconds_since_last']<=5),1,0)
-     data['rebound'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<=2),1,0)
-
-     #Create boolean variables
-     data["is_goal"]=(data['event_type']=='goal').astype(int)
-     data["is_home"]=(data['home_team_abbr']==data['event_team_abbr']).astype(int)
-
-     #Boolean variables for shot types and prior events
-     for shot in shot_types:
-         data[shot] = (data['shot_type']==shot).astype(int)
-     for event in events[0:len(events)-1]:
-         data[f'prior_{event}_same'] = ((data['event_type_last']==event)&(data['event_team_last']==data['event_team_abbr'])).astype(int)
-         data[f'prior_{event}_opp'] = ((data['event_type_last']==event)&(data['event_team_last']!=data['event_team_abbr'])).astype(int)
-
-     data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)
-
-     #Misc variables
-     data['empty_net'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_goalie_id'].isna()),1,0)
-     data['regular'] = (data['season_type']==2).astype(int)
-     data['offwing'] = np.where(((data['y_adj']<0)&(data['event_player_1_hand']=='L'))|((data['y_adj']>=0)&(data['event_player_1_hand']=='R')),1,0)
-
-     #Return: pbp data prepared to train and calculate the xG model
-     return data
-
- def wsba_xG(pbp, hypertune = False, train = False, model_path = xg_model_path, train_runs = 20, cv_runs = 20):
-     #Train and calculate the WSBA Expected Goals model
-
-     #Add index for future merging
-     pbp['event_index'] = pbp.index
-
-     #Recalibrate coordinates
-     pbp = scraping.adjust_coords(pbp)
-
-     #Fix strengths
-     pbp['strength_state'] = np.where((pbp['season_type']==3)&(pbp['period']>4),(np.where(pbp['event_team_abbr']==pbp['away_team_abbr'],pbp['away_skaters'].astype(str)+"v"+pbp['home_skaters'].astype(str),pbp['home_skaters'].astype(str)+"v"+pbp['away_skaters'].astype(str))),pbp['strength_state'])
-
-     #Filter unwanted data:
-     #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
-     pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
-                        (pbp['strength_state'].isin(strengths))&
-                        (pbp['x'].notna())&
-                        (pbp['y'].notna())]
-
-     #Prep Data
-     data = prep_xG_data(pbp_prep)
-
-     #Reduce to fenwick shots
-     data = data.loc[data['event_type'].isin(fenwick_events)]
-
-     #Convert to sparse
-     data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
-
-     #Target and Predictors
-     is_goal_vect = data_sparse[:, 0].A
-     predictors = data_sparse[:, 1:]
-
-     #XGB DataModel
-     xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
-
-     if train:
-         if hypertune:
-             # Number of runs
-             run_num = train_runs
-
-             # DataFrames to store results
-             best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
-             best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
-
-             # Loop
-             for i in range(run_num):
-                 print(f"### LOOP: {i+1} ###")
-
-                 param = {
-                     "objective": "binary:logistic",
-                     "eval_metric": ["logloss", "auc"],
-                     "max_depth": 6,
-                     "eta": np.random.uniform(0.06, 0.11),
-                     "gamma": np.random.uniform(0.06, 0.12),
-                     "subsample": np.random.uniform(0.76, 0.84),
-                     "colsample_bytree": np.random.uniform(0.76, 0.8),
-                     "min_child_weight": np.random.randint(5, 23),
-                     "max_delta_step": np.random.randint(4, 9)
-                 }
-
-                 # Cross-validation
-                 seed = np.random.randint(0, 10000)
-                 np.random.seed(seed)
-
-                 cv_results = xgb.cv(
-                     params=param,
-                     dtrain=xgb_matrix,
-                     num_boost_round=1000,
-                     nfold=5,
-                     early_stopping_rounds=25,
-                     metrics=["logloss", "auc"],
-                     seed=seed
-                 )
-
-                 # Record results
-                 best_df.loc[i] = param
-                 best_ll.loc[i] = [
-                     cv_results["test-logloss-mean"].min(),
-                     cv_results["test-logloss-mean"].idxmin(),
-                     cv_results["test-auc-mean"].max(),
-                     cv_results["test-auc-mean"].idxmax(),
-                     seed
-                 ]
-
-             # Combine results
-             best_all = pd.concat([best_df, best_ll], axis=1).dropna()
-
-             # Arrange to get best run
-             best_all = best_all.sort_values(by="auc", ascending=False)
-
-             best_all.to_csv(test_path,index=False)
-
-             # Final parameters
-             param_7_EV = {
-                 "objective": "binary:logistic",
-                 "eval_metric": ["logloss", "auc"],
-                 "gamma": best_all['gamma'].iloc[0],
-                 "subsample": best_all['subsample'].iloc[0],
-                 "max_depth": best_all['max_depth'].iloc[0],
-                 "colsample_bytree": best_all['colsample_bytree'].iloc[0],
-                 "min_child_weight": best_all['min_child_weight'].iloc[0],
-                 "max_delta_step": best_all['max_delta_step'].iloc[0],
-             }
-
-             # CV rounds Loop
-             run_num = cv_runs
-             cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
-
-             for i in range(run_num):
-                 print(f"### LOOP: {i+1} ###")
-
-                 seed = np.random.randint(0, 10000)
-                 np.random.seed(seed)
-
-                 cv_rounds = xgb.cv(
-                     params=param_7_EV,
-                     dtrain=xgb_matrix,
-                     num_boost_round=1000,
-                     nfold=5,
-                     early_stopping_rounds=25,
-                     metrics=["logloss", "auc"],
-                     seed=seed
-                 )
-
-                 # Record results
-                 cv_test.loc[i] = [
-                     cv_rounds["test-auc-mean"].idxmax(),
-                     cv_rounds["test-auc-mean"].max(),
-                     cv_rounds["test-logloss-mean"].idxmin(),
-                     cv_rounds["test-logloss-mean"].min(),
-                     seed
-                 ]
-
-             # Clean results and sort to find the number of rounds to use and seed
-             cv_final = cv_test.sort_values(by="AUC", ascending=False)
-             cv_final.to_csv(cv_path,index=False)
-         else:
-             # Load previous parameters
-             best_all = pd.read_csv(test_path)
-             cv_final = pd.read_csv(cv_path)
-
-             print('Loaded hyperparameters...')
-             # Final parameters
-             param_7_EV = {
-                 "objective": "binary:logistic",
-                 "eval_metric": ["logloss", "auc"],
-                 "gamma": best_all['gamma'].iloc[0],
-                 "subsample": best_all['subsample'].iloc[0],
-                 "max_depth": best_all['max_depth'].iloc[0],
-                 "colsample_bytree": best_all['colsample_bytree'].iloc[0],
-                 "min_child_weight": best_all['min_child_weight'].iloc[0],
-                 "max_delta_step": best_all['max_delta_step'].iloc[0],
-             }
-
-         print('Training model...')
-         seed = int(cv_final['seed'].iloc[0])
-         np.random.seed(seed)
-         model = xgb.train(
-             params=param_7_EV,
-             dtrain=xgb_matrix,
-             num_boost_round=int(cv_final['AUC_rounds'].iloc[0]),
-             verbose_eval=2,
-         )
-
-         #Save model
-         joblib.dump(model,model_path)
-
-     else:
-         model = joblib.load(model_path)
-
-     #Predict goal
-     data['xG'] = model.predict(xgb_matrix)
-
-     #Drop previous xG if it exists
-     pbp = pbp.drop(columns=['xG'],errors='ignore')
-
-     #Merge
-     comm = list(data.columns.intersection(pbp.columns))
-     comm.remove('event_index')
-     data = data.drop(columns=comm)
-     pbp_xg = pd.merge(pbp,data,how='left')
-
-     return pbp_xg
-
- def feature_importance(model):
-     print('Feature importance for WSBA xG Model...')
-     model = joblib.load(model)
-
-     fig, ax = plt.subplots(figsize=(10, 7))
-     xgb.plot_importance(model,
-                         importance_type='weight',
-                         max_num_features=30,
-                         height=0.5,
-                         grid=False,
-                         show_values=False,
-                         xlabel='Weight',
-                         title='WSBA xG Feature Importance',
-                         ax=ax
-                         )
-     plt.savefig('tools/xg_model/metrics/feature_importance.png',bbox_inches='tight')
-
- def roc_auc_curve(pbp,model):
-     print('ROC-AUC Curve for WSBA xG Model...')
-
-     #Recalibrate coordinates
-     pbp = scraping.adjust_coords(pbp)
-
-     #Filter unwanted data:
-     #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
-     pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
-                        (pbp['strength_state'].isin(strengths))&
-                        (pbp['period'] < 5)&
-                        (pbp['x'].notna())&
-                        (pbp['y'].notna())]
-
-     pbp = prep_xG_data(pbp_prep)
-     model = joblib.load(model)
-
-     data = pbp.loc[pbp['event_type'].isin(fenwick_events)]
-
-     data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
-
-     is_goal_vect = data_sparse[:, 0].A
-     predictors = data_sparse[:, 1:]
-
-     xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
-
-     pred = model.predict(xgb_matrix)
-     fpr, tpr, _ = roc_curve(is_goal_vect, pred)
-     roc_auc = auc(fpr,tpr)
-
-     plt.figure()
-     plt.plot(fpr,tpr,label=f"ROC (AUC = {roc_auc:.4f})")
-     plt.plot([0, 1], [0, 1], linestyle="--")
-     plt.title("WSBA xG ROC Curve")
-     plt.xlabel("False Positive Rate")
-     plt.ylabel("True Positive Rate")
-     plt.legend(loc="lower right")
-     plt.savefig('tools/xg_model/metrics/roc_auc_curve.png')
-
- def reliability(pbp,model):
-     print('Reliability for WSBA xG Model...')
-
-     #Recalibrate coordinates
-     pbp = scraping.adjust_coords(pbp)
-
-     #Filter unwanted data:
-     #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
-     pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
-                        (pbp['strength_state'].isin(strengths))&
-                        (pbp['period'] < 5)&
-                        (pbp['x'].notna())&
-                        (pbp['y'].notna())]
-
-     pbp = prep_xG_data(pbp_prep)
-     model = joblib.load(model)
-
-     data = pbp.loc[pbp['event_type'].isin(fenwick_events)]
-
-     data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
-
-     is_goal_vect = data_sparse[:, 0].A
-     predictors = data_sparse[:, 1:]
-
-     xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
-
-     pred = model.predict(xgb_matrix)
-     fop, mpv = calibration_curve(is_goal_vect, pred, strategy='uniform')
-
-     plt.figure()
-     plt.plot(mpv, fop, "s-", label="Model")
-     plt.plot([0, 1], [0, 1], linestyle="--", label="Perfect calibration")
-     plt.title("WSBA xG Reliability Diagram")
-     plt.xlabel("Predicted Probability (mean)")
-     plt.ylabel("Fraction of positives")
-     plt.legend(loc="best")
-     plt.savefig('tools/xg_model/metrics/reliability.png')
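For context, the sketch below shows one plausible way the removed module's entry points could be driven. Only the function names and signatures (prep_xG_data, wsba_xG, feature_importance, roc_auc_curve, reliability) come from the file shown in the diff; the import path wsba_hockey.tools.xg_model and the play-by-play CSV filename are assumptions for illustration, not confirmed by this diff.

# Minimal usage sketch for the module removed above. Assumptions (not confirmed
# by this diff): the module import path and the existence of a local play-by-play
# CSV containing the columns referenced in prep_xG_data (event_type,
# strength_state, x, y, seconds_elapsed, ...).
import pandas as pd
import wsba_hockey.tools.xg_model as xg_model  # assumed import path for the file above

# Hypothetical play-by-play export produced elsewhere by the package.
pbp = pd.read_csv('pbp_sample.csv')

# Inference path: with the default train=False, wsba_xG loads the bundled
# wsba_xg.joblib model and merges an 'xG' column back onto the play-by-play.
pbp_xg = xg_model.wsba_xG(pbp)

# Retraining path: reuse the stored hyperparameters (hypertune=False) or rerun
# the random search with the given number of tuning and CV loops.
# pbp_xg = xg_model.wsba_xG(pbp, train=True, hypertune=True, train_runs=20, cv_runs=20)

print(pbp_xg[['event_type', 'xG']].head())

Note that the diagnostic helpers (feature_importance, roc_auc_curve, reliability) expect a path to the saved joblib model rather than a loaded booster, since each calls joblib.load internally.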