wsba-hockey 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. wsba_hockey/data_pipelines.py +183 -0
  2. wsba_hockey/evidence/weakside-breakout/node_modules/duckdb/vendor.py +146 -0
  3. wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/flatted.py +149 -0
  4. wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/test.py +63 -0
  5. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/gyp_main.py +45 -0
  6. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSNew.py +367 -0
  7. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSProject.py +206 -0
  8. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings.py +1270 -0
  9. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings_test.py +1547 -0
  10. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSToolFile.py +59 -0
  11. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSUserFile.py +153 -0
  12. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSUtil.py +271 -0
  13. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSVersion.py +574 -0
  14. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/__init__.py +690 -0
  15. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/common.py +661 -0
  16. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/common_test.py +78 -0
  17. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/easy_xml.py +165 -0
  18. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/easy_xml_test.py +109 -0
  19. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/flock_tool.py +55 -0
  20. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/__init__.py +0 -0
  21. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/analyzer.py +808 -0
  22. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/android.py +1173 -0
  23. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/cmake.py +1321 -0
  24. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/compile_commands_json.py +120 -0
  25. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/dump_dependency_json.py +103 -0
  26. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/eclipse.py +464 -0
  27. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/gypd.py +89 -0
  28. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/gypsh.py +58 -0
  29. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/make.py +2714 -0
  30. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs.py +3981 -0
  31. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs_test.py +44 -0
  32. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja.py +2936 -0
  33. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja_test.py +55 -0
  34. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode.py +1394 -0
  35. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode_test.py +25 -0
  36. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/input.py +3130 -0
  37. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/input_test.py +98 -0
  38. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/mac_tool.py +771 -0
  39. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/msvs_emulation.py +1271 -0
  40. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/ninja_syntax.py +174 -0
  41. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/simple_copy.py +61 -0
  42. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/win_tool.py +374 -0
  43. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcode_emulation.py +1939 -0
  44. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcode_ninja.py +302 -0
  45. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcodeproj_file.py +3197 -0
  46. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xml_fix.py +65 -0
  47. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/test_gyp.py +261 -0
  48. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/graphviz.py +102 -0
  49. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_gyp.py +156 -0
  50. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_sln.py +181 -0
  51. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_vcproj.py +339 -0
  52. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/test/fixtures/test-charmap.py +31 -0
  53. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/update-gyp.py +64 -0
  54. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/gyp_main.py +45 -0
  55. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSNew.py +367 -0
  56. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSProject.py +206 -0
  57. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings.py +1270 -0
  58. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings_test.py +1547 -0
  59. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSToolFile.py +59 -0
  60. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSUserFile.py +153 -0
  61. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSUtil.py +271 -0
  62. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSVersion.py +574 -0
  63. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/__init__.py +666 -0
  64. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/common.py +654 -0
  65. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/common_test.py +78 -0
  66. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/easy_xml.py +165 -0
  67. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/easy_xml_test.py +109 -0
  68. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/flock_tool.py +55 -0
  69. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/__init__.py +0 -0
  70. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/analyzer.py +808 -0
  71. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/android.py +1173 -0
  72. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/cmake.py +1321 -0
  73. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/compile_commands_json.py +120 -0
  74. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/dump_dependency_json.py +103 -0
  75. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/eclipse.py +464 -0
  76. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/gypd.py +89 -0
  77. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/gypsh.py +58 -0
  78. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/make.py +2518 -0
  79. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs.py +3978 -0
  80. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs_test.py +44 -0
  81. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja.py +2936 -0
  82. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja_test.py +55 -0
  83. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode.py +1394 -0
  84. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode_test.py +25 -0
  85. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/input.py +3137 -0
  86. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/input_test.py +98 -0
  87. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/mac_tool.py +771 -0
  88. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/msvs_emulation.py +1271 -0
  89. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/ninja_syntax.py +174 -0
  90. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/simple_copy.py +61 -0
  91. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/win_tool.py +374 -0
  92. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcode_emulation.py +1939 -0
  93. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcode_ninja.py +302 -0
  94. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcodeproj_file.py +3197 -0
  95. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xml_fix.py +65 -0
  96. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/setup.py +42 -0
  97. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/test_gyp.py +260 -0
  98. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/graphviz.py +102 -0
  99. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_gyp.py +156 -0
  100. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_sln.py +181 -0
  101. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_vcproj.py +339 -0
  102. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/test/fixtures/test-charmap.py +31 -0
  103. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/update-gyp.py +46 -0
  104. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/app.py +401 -0
  105. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/name_fix.py +47 -0
  106. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/app.py +108 -0
  107. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/plot.py +93 -0
  108. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/rink_plot.py +245 -0
  109. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/app.py +145 -0
  110. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/plot.py +77 -0
  111. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/rink_plot.py +245 -0
  112. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/app.py +389 -0
  113. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/plot.py +70 -0
  114. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/rink_plot.py +245 -0
  115. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/app.py +110 -0
  116. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/plot.py +58 -0
  117. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/rink_plot.py +245 -0
  118. wsba_hockey/tools/agg.py +242 -53
  119. wsba_hockey/tools/plotting.py +15 -20
  120. wsba_hockey/tools/scraping.py +149 -258
  121. wsba_hockey/tools/xg_model.py +370 -298
  122. wsba_hockey/workspace.py +22 -101
  123. wsba_hockey/wsba_main.py +494 -147
  124. {wsba_hockey-1.0.2.dist-info → wsba_hockey-1.0.4.dist-info}/METADATA +2 -2
  125. wsba_hockey-1.0.4.dist-info/RECORD +135 -0
  126. {wsba_hockey-1.0.2.dist-info → wsba_hockey-1.0.4.dist-info}/WHEEL +1 -1
  127. wsba_hockey/stats/calculate_viz/shot_impact.py +0 -2
  128. wsba_hockey-1.0.2.dist-info/RECORD +0 -19
  129. {wsba_hockey-1.0.2.dist-info → wsba_hockey-1.0.4.dist-info}/licenses/LICENSE +0 -0
  130. {wsba_hockey-1.0.2.dist-info → wsba_hockey-1.0.4.dist-info}/top_level.txt +0 -0
@@ -3,48 +3,149 @@ import numpy as np
3
3
  import xgboost as xgb
4
4
  import scipy.sparse as sp
5
5
  import joblib
6
- from zipfile import ZipFile
7
- import requests as rs
6
+ import wsba_main as wsba
7
+ import tools.scraping as scraping
8
+ from sklearn.calibration import calibration_curve
9
+ from sklearn.metrics import roc_curve, auc
10
+ import matplotlib.pyplot as plt
8
11
 
9
12
  ### XG_MODEL FUNCTIONS ###
10
13
  # Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
11
14
 
12
15
  ## GLOBAL VARIABLES ##
13
- #Newest season
14
- new_full = '20242025'
15
- new = '2024'
16
16
 
17
- def prep_xG_data(pbp):
17
+ target = "is_goal"
18
+ continuous = ['event_distance',
19
+ 'event_angle',
20
+ 'seconds_elapsed',
21
+ 'period',
22
+ 'x_adj',
23
+ 'y_adj',
24
+ 'distance_from_last',
25
+ 'angle_from_last',
26
+ 'seconds_since_last',
27
+ 'speed_from_last',
28
+ 'speed_of_angle_from_last',
29
+ 'score_state',
30
+ 'strength_diff'
31
+ ]
32
+ boolean = ['is_home',
33
+ 'wrist',
34
+ 'deflected',
35
+ 'tip-in',
36
+ 'slap',
37
+ 'backhand',
38
+ 'snap',
39
+ 'wrap-around',
40
+ 'poke',
41
+ 'bat',
42
+ 'cradle',
43
+ 'between-legs',
44
+ 'prior_shot-on-goal_same',
45
+ 'prior_missed-shot_same',
46
+ 'prior_blocked-shot_same',
47
+ 'prior_giveaway_same',
48
+ 'prior_takeaway_same',
49
+ 'prior_hit_same',
50
+ 'prior_shot-on-goal_opp',
51
+ 'prior_missed-shot_opp',
52
+ 'prior_blocked-shot_opp',
53
+ 'prior_giveaway_opp',
54
+ 'prior_takeaway_opp',
55
+ 'prior_hit_opp',
56
+ 'prior_faceoff',
57
+ 'regular',
58
+ 'empty_net',
59
+ 'offwing',
60
+ 'rush',
61
+ 'rebound'
62
+ ]
63
+
64
+ events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
65
+ shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
66
+ fenwick_events = ['missed-shot','shot-on-goal','goal']
67
+ strengths = ['3v3',
68
+ '3v4',
69
+ '3v5',
70
+ '4v3',
71
+ '4v4',
72
+ '4v5',
73
+ '4v6',
74
+ '5v3',
75
+ '5v4',
76
+ '5v5',
77
+ '5v6',
78
+ '6v4',
79
+ '6v5']
80
+
81
+ def fix_players(pbp):
82
+ #Add/fix player info for shooters and goaltenders
83
+ print('Adding player info to pbp...')
84
+
85
+ #Load roster and all players
86
+ roster = pd.read_csv('rosters/nhl_rosters.csv').drop_duplicates(['id'])[['fullName','id','shootsCatches']]
87
+
88
+ #Some players are missing from the roster file (generally in newer seasons); add these manually
89
+ miss = list(pbp.loc[~(pbp['event_player_1_id'].isin(list(roster['id'])))&(pbp['event_player_1_id'].notna()),'event_player_1_id'].drop_duplicates())
90
+ if miss:
91
+ add = wsba.nhl_scrape_player_data(miss).rename(columns={'playerId':'id'})[['fullName','id','shootsCatches']]
92
+ roster = pd.concat([roster,add]).reset_index(drop=True)
93
+
94
+ #Conversion dict
95
+ roster['id'] = roster['id'].astype(str)
96
+ roster_dict = roster.set_index('id').to_dict()['shootsCatches']
97
+ names_dict = roster.set_index('id').to_dict()['fullName']
98
+
99
+ #Add player names
100
+ for i in range(3):
101
+ pbp[f'add_player_{i+1}_name'] = np.where(pbp[f'event_player_{i+1}_name'].isna(),pbp[f'event_player_{i+1}_id'].astype(str).replace(names_dict),np.nan)
102
+ pbp[f'event_player_{i+1}_name'] = pbp[f'event_player_{i+1}_name'].combine_first(pbp[f'add_player_{i+1}_name'])
103
+
104
+ pbp['event_goalie_name'] = pbp['event_goalie_id'].astype(str).replace(names_dict)
105
+
106
+ #Add hands
107
+ pbp['event_player_1_hand'] = pbp['event_player_1_id'].astype(str).str.replace('.0','').replace(roster_dict)
108
+ pbp['event_player_1_hand'] = pbp['event_player_1_hand'].replace('nan',np.nan)
109
+
110
+ return pbp
111
+
112
+ def prep_xG_data(data):
18
113
  #Prep data for xG training and calculation
19
114
 
20
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
21
- shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
22
- fenwick_events = ['missed-shot','shot-on-goal','goal']
23
-
24
115
  #Informal groupby
25
- data = pbp.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
116
+ data = data.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
26
117
 
27
- #Add event time details - prevent leaking between games by setting value to zero when no time has occured in game
28
- data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_elapsed']-data['seconds_elapsed'].shift(1))
29
- data["event_length"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'].shift(-1))
118
+ #Recalibrate times series data with current data
119
+ data['seconds_since_last'] = data['seconds_elapsed'] - data['seconds_elapsed'].shift(1)
120
+ #Prevent leaking between games by setting value to zero when no time has occured in game
121
+ data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'])
30
122
 
31
123
  #Create last event columns
32
124
  data["event_team_last"] = data['event_team_abbr'].shift(1)
33
125
  data["event_type_last"] = data['event_type'].shift(1)
34
- data["x_fixed_last"] = data['x_fixed'].shift(1)
35
- data["y_fixed_last"] = data['y_fixed'].shift(1)
36
- data["zone_code_last"] = data['zone_code'].shift(1)
126
+ data["x_adj_last"] = data['x_adj'].shift(1)
127
+ data["y_adj_last"] = data['y_adj'].shift(1)
128
+ data["zone_code_last"] = data['zone_code'].shift(1)
37
129
 
38
130
  data.sort_values(['season','game_id','period','seconds_elapsed','event_num'],inplace=True)
131
+
132
+ #Contextual Data (for score state minimize the capture to four goals)
39
133
  data['score_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_score']-data['home_score'],data['home_score']-data['away_score'])
134
+ data['score_state'] = np.where(data['score_state']>4,4,data['score_state'])
135
+ data['score_state'] = np.where(data['score_state']<-4,-4,data['score_state'])
136
+
40
137
  data['strength_diff'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
41
138
  data['strength_state_venue'] = data['away_skaters'].astype(str)+'v'+data['home_skaters'].astype(str)
42
- data['fenwick_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_fenwick']-data['home_fenwick'],data['home_fenwick']-data['away_fenwick'])
43
- data['distance_from_last'] = np.sqrt((data['x_fixed'] - data['x_fixed_last'])**2 + (data['y_fixed'] - data['y_fixed_last'])**2)
139
+ data['distance_from_last'] = np.sqrt((data['x_adj'] - data['x_adj_last'])**2 + (data['y_adj'] - data['y_adj_last'])**2)
140
+ data['angle_from_last'] = np.degrees(np.arctan2(abs(data['y_adj'] - data['y_adj_last']), abs(89 - (data['x_adj']-data['x_adj_last']))))
44
141
 
45
- #Rush and rebounds are included and graded off of the speed of the event (an event cannot be a rush event unless it also occurs in the offensive zone)
46
- data['rush_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_fixed']>25)&(data['seconds_since_last']<5),5-data['seconds_since_last'],0)
47
- data['rebound_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<3),3-data['seconds_since_last'],0)
142
+ #Event speeds
143
+ data['speed_from_last'] = np.where(data['seconds_since_last']==0,0,data['distance_from_last']/data['seconds_since_last'])
144
+ data['speed_of_angle_from_last'] = np.where(data['seconds_since_last']==0,0,data['angle_from_last']/data['seconds_since_last'])
145
+
146
+ #Rush and rebounds are labelled
147
+ data['rush'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_adj']>25)&(data['seconds_since_last']<=5),1,0)
148
+ data['rebound'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<=2),1,0)
48
149
 
49
150
  #Create boolean variables
50
151
  data["is_goal"]=(data['event_type']=='goal').astype(int)
@@ -59,317 +160,288 @@ def prep_xG_data(pbp):
59
160
 
60
161
  data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)
61
162
 
163
+ #Misc variables
164
+ data['empty_net'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_goalie_id'].isna()),1,0)
165
+ data['regular'] = (data['season_type']==2).astype(int)
166
+ data['offwing'] = np.where(((data['y_adj']<0)&(data['event_player_1_hand']=='L'))|((data['y_adj']>=0)&(data['event_player_1_hand']=='R')),1,0)
167
+
62
168
  #Return: pbp data prepared to train and calculate the xG model
63
169
  return data
64
170
 
65
- def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
171
+ def wsba_xG(pbp, hypertune = False, train = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
66
172
  #Train and calculate the WSBA Expected Goals model
67
-
68
- target = "is_goal"
69
- continous = ['event_distance',
70
- 'event_angle',
71
- 'seconds_elapsed',
72
- 'period',
73
- 'x_fixed',
74
- 'y_fixed',
75
- 'x_fixed_last',
76
- 'y_fixed_last',
77
- 'distance_from_last',
78
- 'seconds_since_last',
79
- 'score_state',
80
- 'strength_diff',
81
- 'fenwick_state',
82
- 'rush_mod',
83
- 'rebound_mod']
84
- boolean = ['is_home',
85
- 'wrist',
86
- 'deflected',
87
- 'tip-in',
88
- 'slap',
89
- 'backhand',
90
- 'snap',
91
- 'wrap-around',
92
- 'poke',
93
- 'bat',
94
- 'cradle',
95
- 'between-legs',
96
- 'prior_shot-on-goal_same',
97
- 'prior_missed-shot_same',
98
- 'prior_blocked-shot_same',
99
- 'prior_giveaway_same',
100
- 'prior_takeaway_same',
101
- 'prior_hit_same',
102
- 'prior_shot-on-goal_opp',
103
- 'prior_missed-shot_opp',
104
- 'prior_blocked-shot_opp',
105
- 'prior_giveaway_opp',
106
- 'prior_takeaway_opp',
107
- 'prior_hit_opp',
108
- 'prior_faceoff']
109
-
110
- #Prep Data
111
- pbp = prep_xG_data(pbp)
112
- #Filter unwanted date:
113
- #Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
114
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
115
- fenwick_events = ['missed-shot','shot-on-goal','goal']
116
- strengths = ['3v3',
117
- '3v4',
118
- '3v5',
119
- '4v3',
120
- '4v4',
121
- '4v5',
122
- '4v6',
123
- '5v3',
124
- '5v4',
125
- '5v5',
126
- '5v6',
127
- '6v4',
128
- '6v5']
129
-
130
- data = pbp.loc[(pbp['event_type'].isin(events))&
173
+
174
+ #Add index for future merging
175
+ pbp['event_index'] = pbp.index
176
+
177
+ #Recalibrate coordinates
178
+ pbp = scraping.adjust_coords(pbp)
179
+
180
+ #Fix strengths
181
+ pbp['strength_state'] = np.where((pbp['season_type']==3)&(pbp['period']>4),(np.where(pbp['event_team_abbr']==pbp['away_team_abbr'],pbp['away_skaters'].astype(str)+"v"+pbp['home_skaters'].astype(str),pbp['home_skaters'].astype(str)+"v"+pbp['away_skaters'].astype(str))),pbp['strength_state'])
182
+
183
+ #Filter unwanted data:
184
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
185
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
131
186
  (pbp['strength_state'].isin(strengths))&
132
- (pbp['period'] < 5)&
133
- (pbp['x_fixed'].notna())&
134
- (pbp['y_fixed'].notna())&
135
- ~((pbp['x_fixed']==0)&(pbp['y_fixed']==0)&(pbp['x_fixed'].isin(fenwick_events))&(pbp['event_distance']!=90))]
187
+ (pbp['x'].notna())&
188
+ (pbp['y'].notna())]
189
+
190
+ #Prep Data
191
+ data = prep_xG_data(pbp_prep)
136
192
 
193
+ #Reduce to fenwick shots
194
+ data = data.loc[data['event_type'].isin(fenwick_events)]
195
+
137
196
  #Convert to sparse
138
- data_sparse = sp.csr_matrix(data[[target]+continous+boolean])
197
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
139
198
 
140
199
  #Target and Predictors
141
200
  is_goal_vect = data_sparse[:, 0].A
142
201
  predictors = data_sparse[:, 1:]
143
202
 
144
203
  #XGB DataModel
145
- xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect)
146
-
147
- if train == True:
148
- # Number of runs
149
- run_num = train_runs
150
-
151
- # DataFrames to store results
152
- best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
153
- best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
154
-
155
- # Loop
156
- for i in range(run_num):
157
- print(f"### LOOP: {i+1} ###")
158
-
159
- param = {
204
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
205
+
206
+ if train:
207
+ if hypertune:
208
+ # Number of runs
209
+ run_num = train_runs
210
+
211
+ # DataFrames to store results
212
+ best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
213
+ best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
214
+
215
+ # Loop
216
+ for i in range(run_num):
217
+ print(f"### LOOP: {i+1} ###")
218
+
219
+ param = {
220
+ "objective": "binary:logistic",
221
+ "eval_metric": ["logloss", "auc"],
222
+ "max_depth": 6,
223
+ "eta": np.random.uniform(0.06, 0.11),
224
+ "gamma": np.random.uniform(0.06, 0.12),
225
+ "subsample": np.random.uniform(0.76, 0.84),
226
+ "colsample_bytree": np.random.uniform(0.76, 0.8),
227
+ "min_child_weight": np.random.randint(5, 23),
228
+ "max_delta_step": np.random.randint(4, 9)
229
+ }
230
+
231
+ # Cross-validation
232
+ seed = np.random.randint(0, 10000)
233
+ np.random.seed(seed)
234
+
235
+ cv_results = xgb.cv(
236
+ params=param,
237
+ dtrain=xgb_matrix,
238
+ num_boost_round=1000,
239
+ nfold=5,
240
+ early_stopping_rounds=25,
241
+ metrics=["logloss", "auc"],
242
+ seed=seed
243
+ )
244
+
245
+ # Record results
246
+ best_df.loc[i] = param
247
+ best_ll.loc[i] = [
248
+ cv_results["test-logloss-mean"].min(),
249
+ cv_results["test-logloss-mean"].idxmin(),
250
+ cv_results["test-auc-mean"].max(),
251
+ cv_results["test-auc-mean"].idxmax(),
252
+ seed
253
+ ]
254
+
255
+ # Combine results
256
+ best_all = pd.concat([best_df, best_ll], axis=1).dropna()
257
+
258
+ # Arrange to get best run
259
+ best_all = best_all.sort_values(by="auc", ascending=False)
260
+
261
+ best_all.to_csv("tools/xg_model/testing/xg_model_training_runs.csv",index=False)
262
+
263
+ # Final parameters
264
+ param_7_EV = {
160
265
  "objective": "binary:logistic",
161
266
  "eval_metric": ["logloss", "auc"],
162
- "max_depth": 6,
163
- "eta": np.random.uniform(0.06, 0.11),
164
- "gamma": np.random.uniform(0.06, 0.12),
165
- "subsample": np.random.uniform(0.76, 0.84),
166
- "colsample_bytree": np.random.uniform(0.76, 0.8),
167
- "min_child_weight": np.random.randint(5, 23),
168
- "max_delta_step": np.random.randint(4, 9)
267
+ "gamma": best_all['gamma'].iloc[0],
268
+ "subsample": best_all['subsample'].iloc[0],
269
+ "max_depth": best_all['max_depth'].iloc[0],
270
+ "colsample_bytree": best_all['colsample_bytree'].iloc[0],
271
+ "min_child_weight": best_all['min_child_weight'].iloc[0],
272
+ "max_delta_step": best_all['max_delta_step'].iloc[0],
169
273
  }
170
-
171
- # Cross-validation
172
- seed = np.random.randint(0, 10000)
173
- np.random.seed(seed)
174
-
175
- cv_results = xgb.cv(
176
- params=param,
177
- dtrain=xgb_matrix,
178
- num_boost_round=1000,
179
- nfold=5,
180
- early_stopping_rounds=25,
181
- metrics=["logloss", "auc"],
182
- seed=seed
183
- )
184
-
185
- # Record results
186
- best_df.loc[i] = param
187
- best_ll.loc[i] = [
188
- cv_results["test-logloss-mean"].min(),
189
- cv_results["test-logloss-mean"].idxmin(),
190
- cv_results["test-auc-mean"].max(),
191
- cv_results["test-auc-mean"].idxmax(),
192
- seed
193
- ]
194
274
 
195
- # Combine results
196
- best_all = pd.concat([best_df, best_ll], axis=1).dropna()
197
-
198
- # Arrange to get best run
199
- best_all = best_all.sort_values(by="auc", ascending=False)
200
-
201
- if overwrite == True:
202
- best_all.to_csv("xg_model/testing/xg_model_training_runs.csv",index=False)
203
- else:
204
- best_old = pd.read_csv("xg_model/testing/xg_model_training_runs.csv")
205
- best_comb = pd.concat([best_old,best_all])
206
- best_comb.to_csv("xg_model/testing/xg_model_training_runs.csv",index=False)
207
-
208
- # Final parameters
209
- param_7_EV = {
210
- "objective": "binary:logistic",
211
- "eval_metric": ["logloss", "auc"],
212
- "eta": 0.068,
213
- "gamma": 0.12,
214
- "subsample": 0.78,
215
- "max_depth": 6,
216
- "colsample_bytree": 0.76,
217
- "min_child_weight": 5,
218
- "max_delta_step": 5,
219
- }
220
-
221
- # CV rounds Loop
222
- run_num = cv_runs
223
- cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
224
-
225
- for i in range(run_num):
226
- print(f"### LOOP: {i+1} ###")
227
-
228
- seed = np.random.randint(0, 10000)
229
- np.random.seed(seed)
230
-
231
- cv_rounds = xgb.cv(
232
- params=param_7_EV,
233
- dtrain=xgb_matrix,
234
- num_boost_round=1000,
235
- nfold=5,
236
- early_stopping_rounds=25,
237
- metrics=["logloss", "auc"],
238
- seed=seed
239
- )
240
-
241
- # Record results
242
- cv_test.loc[i] = [
243
- cv_rounds["test-auc-mean"].idxmax(),
244
- cv_rounds["test-auc-mean"].max(),
245
- cv_rounds["test-logloss-mean"].idxmin(),
246
- cv_rounds["test-logloss-mean"].min(),
247
- seed
248
- ]
249
-
250
- # Clean results and sort to find the number of rounds to use and seed
251
- cv_final = cv_test.sort_values(by="AUC", ascending=False)
252
- if overwrite == True:
253
- cv_final.to_csv("xg_model/testing/xg_model_cv_runs.csv",index=False)
275
+ # CV rounds Loop
276
+ run_num = cv_runs
277
+ cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
278
+
279
+ for i in range(run_num):
280
+ print(f"### LOOP: {i+1} ###")
281
+
282
+ seed = np.random.randint(0, 10000)
283
+ np.random.seed(seed)
284
+
285
+ cv_rounds = xgb.cv(
286
+ params=param_7_EV,
287
+ dtrain=xgb_matrix,
288
+ num_boost_round=1000,
289
+ nfold=5,
290
+ early_stopping_rounds=25,
291
+ metrics=["logloss", "auc"],
292
+ seed=seed
293
+ )
294
+
295
+ # Record results
296
+ cv_test.loc[i] = [
297
+ cv_rounds["test-auc-mean"].idxmax(),
298
+ cv_rounds["test-auc-mean"].max(),
299
+ cv_rounds["test-logloss-mean"].idxmin(),
300
+ cv_rounds["test-logloss-mean"].min(),
301
+ seed
302
+ ]
303
+
304
+ # Clean results and sort to find the number of rounds to use and seed
305
+ cv_final = cv_test.sort_values(by="AUC", ascending=False)
306
+ cv_final.to_csv("tools/xg_model/testing/xg_model_cv_runs.csv",index=False)
254
307
  else:
255
- cv_old = pd.read_csv("xg_model/testing/xg_model_cv_runs.csv")
256
- cv_comb = pd.concat([cv_old,cv_final])
257
- cv_comb.to_csv("xg_model/testing/xg_model_cv_runs.csv")
258
- cv_final.loc[len(cv_final)] = cv_test.mean()
308
+ # Load previous parameters
309
+ best_all = pd.read_csv('tools/xg_model/testing/xg_model_training_runs.csv')
310
+ cv_final = pd.read_csv("tools/xg_model/testing/xg_model_cv_runs.csv")
259
311
 
260
- # Train the final model
261
- np.random.seed(556)
262
-
263
- if overwrite == False:
264
- model = joblib.load(model_path)
265
- else:
266
- ""
312
+ print('Loaded hyperparameters...')
313
+ # Final parameters
314
+ param_7_EV = {
315
+ "objective": "binary:logistic",
316
+ "eval_metric": ["logloss", "auc"],
317
+ "gamma": best_all['gamma'].iloc[0],
318
+ "subsample": best_all['subsample'].iloc[0],
319
+ "max_depth": best_all['max_depth'].iloc[0],
320
+ "colsample_bytree": best_all['colsample_bytree'].iloc[0],
321
+ "min_child_weight": best_all['min_child_weight'].iloc[0],
322
+ "max_delta_step": best_all['max_delta_step'].iloc[0],
323
+ }
267
324
 
325
+ print('Training model...')
326
+ seed = int(cv_final['seed'].iloc[0])
327
+ np.random.seed(seed)
268
328
  model = xgb.train(
269
329
  params=param_7_EV,
270
330
  dtrain=xgb_matrix,
271
- num_boost_round=189,
272
- verbose_eval=2
331
+ num_boost_round=int(cv_final['AUC_rounds'].iloc[0]),
332
+ verbose_eval=2,
273
333
  )
274
-
334
+
335
+ #Save model
275
336
  joblib.dump(model,model_path)
276
337
 
277
338
  else:
278
339
  model = joblib.load(model_path)
279
- pbp['xG'] = np.where(pbp['event_type'].isin(fenwick_events),model.predict(xgb_matrix),"")
280
- return pbp
281
340
 
282
- def moneypuck_xG(pbp,repo_path = "tools/xg_model/moneypuck/shots_2007-2023.zip"):
283
- #Given play-by-play, return itself with xG column sourced from MoneyPuck.com
341
+ #Predict goal
342
+ data['xG'] = model.predict(xgb_matrix)
343
+
344
+ #Drop previous xG if it exists
345
+ pbp = pbp.drop(columns=['xG'],errors='ignore')
346
+
347
+ #Merge
348
+ comm = list(data.columns.intersection(pbp.columns))
349
+ comm.remove('event_index')
350
+ data = data.drop(columns=comm)
351
+ pbp_xg = pd.merge(pbp,data,how='left')
352
+
353
+ return pbp_xg
354
+
355
+ def feature_importance(model):
356
+ print('Feature importance for WSBA xG Model...')
357
+ model = joblib.load(model)
358
+
359
+ fig, ax = plt.subplots(figsize=(10, 7))
360
+ xgb.plot_importance(model,
361
+ importance_type='weight',
362
+ max_num_features=30,
363
+ height=0.5,
364
+ grid=False,
365
+ show_values=False,
366
+ xlabel='Weight',
367
+ title='WSBA xG Feature Importance',
368
+ ax=ax
369
+ )
370
+ plt.savefig('tools/xg_model/metrics/feature_importance.png',bbox_inches='tight')
284
371
 
285
- #If file is already in the repository downloading is not necessary
286
- try:
287
- db = pd.read_parquet("tools/xg_model/moneypuck/shots/shots_2007-2023.parquet")
288
- except:
289
- url = 'https://peter-tanner.com/moneypuck/downloads/shots_2007-2023.zip'
372
+ def roc_auc_curve(pbp,model):
373
+ print('ROC-AUC Curve for WSBA xG Model...')
290
374
 
291
- response = rs.get(url)
375
+ #Recalibrate coordinates
376
+ pbp = scraping.adjust_coords(pbp)
292
377
 
293
- if response.status_code == 200:
294
- with open(repo_path, 'wb') as file:
295
- file.write(response.content)
296
- print('File downloaded successfully')
297
- else:
298
- print('Failed to download file')
378
+ #Filter unwanted data:
379
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
380
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
381
+ (pbp['strength_state'].isin(strengths))&
382
+ (pbp['period'] < 5)&
383
+ (pbp['x'].notna())&
384
+ (pbp['y'].notna())]
299
385
 
300
- with ZipFile(repo_path, 'r') as zObject:
301
- zObject.extractall(
302
- path="tools/xg_model/moneypuck/shots/")
303
-
304
- db = pd.read_csv("tools/xg_model/moneypuck/shots/shots_2007-2023.csv")
386
+ pbp = prep_xG_data(pbp_prep)
387
+ model = joblib.load(model)
388
+
389
+ data = pbp.loc[pbp['event_type'].isin(fenwick_events)]
305
390
 
306
- #Repeat process with active/most recent season
307
- #For the new/recent season, only scrape if the supplied pbp data contains the season
308
- if new in list(pbp['season'].astype(str).str[0:4]):
309
- url = f'https://peter-tanner.com/moneypuck/downloads/shots_{new}.zip'
310
- repo_path = f"tools/xg_model/moneypuck/shots_{new}.zip"
311
-
312
- response = rs.get(url)
313
-
314
- if response.status_code == 200:
315
- with open(repo_path, 'wb') as file:
316
- file.write(response.content)
317
- print('File downloaded successfully')
318
- else:
319
- print('Failed to download file')
320
-
321
- with ZipFile(repo_path, 'r') as zObject:
322
- zObject.extractall(
323
- path="tools/xg_model/moneypuck/shots/")
324
-
325
- new_season = pd.read_csv(f"tools/xg_model/moneypuck/shots/shots_{new}.csv")
326
- #Convert to parquet
327
- new_season.to_parquet(f"tools/xg_model/moneypuck/shots/shots_{new}.csv",index=False)
328
- else:
329
- new_season = pd.DataFrame()
330
- #Combine shots
331
- moneypuck = pd.concat([db,new_season])
332
-
333
- #Find game ids that occur in supplied pbp and filter moneypuck shots accordingly
334
- moneypuck['game_id'] = moneypuck['season'].astype(str)+"0"+moneypuck['game_id'].astype(str)
335
- moneypuck['event'] = moneypuck['event'].replace({
336
- "SHOT":"shot-on-goal",
337
- "MISS":"missed-shot",
338
- "BLOCK":"blocked-shot",
339
- "GOAL":"goal"
340
- })
391
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
392
+
393
+ is_goal_vect = data_sparse[:, 0].A
394
+ predictors = data_sparse[:, 1:]
341
395
 
342
- #Manual Team Rename
343
- moneypuck['teamCode'] = moneypuck['teamCode'].replace({
344
- "L.A":"LAK",
345
- "N.J":"NJD",
346
- "S.J":"SJS",
347
- "T.B":"TBL",
348
- })
349
- pbp['event_team_abbr'] = pbp['event_team_abbr'].replace({
350
- "L.A":"LAK",
351
- "N.J":"NJD",
352
- "S.J":"SJS",
353
- "T.B":"TBL",
354
- "PHX":'ARI'
355
- })
356
-
357
- #Managing oddities in datatypes
358
- moneypuck[['game_id','period','time']] = moneypuck[['game_id','period','time']].astype(int)
359
- pbp[['game_id','period','seconds_elapsed']] = pbp[['game_id','period','seconds_elapsed']].astype(int)
360
-
361
- #Modify and merge
362
- moneypuck = moneypuck[['game_id','period','time','event','teamCode','shooterPlayerId','xGoal']]
363
- comb = pd.merge(pbp,moneypuck
364
- ,left_on=['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
365
- ,right_on=['game_id','period','time','event','teamCode','shooterPlayerId']
366
- ,how='left')
396
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
397
+
398
+ pred = model.predict(xgb_matrix)
399
+ fpr, tpr, _ = roc_curve(is_goal_vect, pred)
400
+ roc_auc = auc(fpr,tpr)
367
401
 
368
- #Drop and rename
369
- pbp_xg = comb.drop(columns=['time', 'event', 'teamCode', 'shooterPlayerId']).rename(columns={'xGoal':'xG'})
402
+ plt.figure()
403
+ plt.plot(fpr,tpr,label=f"ROC (AUC = {roc_auc:.4f})")
404
+ plt.plot([0, 1], [0, 1], linestyle="--")
405
+ plt.title("WSBA xG ROC Curve")
406
+ plt.xlabel("False Positive Rate")
407
+ plt.ylabel("True Positive Rate")
408
+ plt.legend(loc="lower right")
409
+ plt.savefig('tools/xg_model/metrics/roc_auc_curve.png')
410
+
411
+ def reliability(pbp,model):
412
+ print('Reliability for WSBA xG Model...')
413
+
414
+ #Recalibrate coordinates
415
+ pbp = scraping.adjust_coords(pbp)
416
+
417
+ #Filter unwanted data:
418
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
419
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
420
+ (pbp['strength_state'].isin(strengths))&
421
+ (pbp['period'] < 5)&
422
+ (pbp['x'].notna())&
423
+ (pbp['y'].notna())]
424
+
425
+ pbp = prep_xG_data(pbp_prep)
426
+ model = joblib.load(model)
427
+
428
+ data = pbp.loc[pbp['event_type'].isin(fenwick_events)]
370
429
 
371
- if pbp_xg['xG'].isnull().all():
372
- print("No MoneyPuck xG values were found for this game...")
430
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
373
431
 
374
- #Return: play-by-play with moneypuck xG column
375
- return pbp_xg
432
+ is_goal_vect = data_sparse[:, 0].A
433
+ predictors = data_sparse[:, 1:]
434
+
435
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
436
+
437
+ pred = model.predict(xgb_matrix)
438
+ fop, mpv = calibration_curve(is_goal_vect, pred, strategy='uniform')
439
+
440
+ plt.figure()
441
+ plt.plot(mpv, fop, "s-", label="Model")
442
+ plt.plot([0, 1], [0, 1], linestyle="--", label="Perfect calibration")
443
+ plt.title("WSBA xG Reliability Diagram")
444
+ plt.xlabel("Predicted Probability (mean)")
445
+ plt.ylabel("Fraction of positives")
446
+ plt.legend(loc="best")
447
+ plt.savefig('tools/xg_model/metrics/reliability.png')