wsba-hockey 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. wsba_hockey/data_pipelines.py +183 -0
  2. wsba_hockey/evidence/weakside-breakout/node_modules/duckdb/vendor.py +146 -0
  3. wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/flatted.py +149 -0
  4. wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/test.py +63 -0
  5. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/gyp_main.py +45 -0
  6. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSNew.py +367 -0
  7. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSProject.py +206 -0
  8. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings.py +1270 -0
  9. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings_test.py +1547 -0
  10. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSToolFile.py +59 -0
  11. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSUserFile.py +153 -0
  12. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSUtil.py +271 -0
  13. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSVersion.py +574 -0
  14. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/__init__.py +690 -0
  15. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/common.py +661 -0
  16. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/common_test.py +78 -0
  17. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/easy_xml.py +165 -0
  18. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/easy_xml_test.py +109 -0
  19. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/flock_tool.py +55 -0
  20. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/__init__.py +0 -0
  21. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/analyzer.py +808 -0
  22. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/android.py +1173 -0
  23. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/cmake.py +1321 -0
  24. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/compile_commands_json.py +120 -0
  25. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/dump_dependency_json.py +103 -0
  26. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/eclipse.py +464 -0
  27. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/gypd.py +89 -0
  28. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/gypsh.py +58 -0
  29. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/make.py +2714 -0
  30. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs.py +3981 -0
  31. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs_test.py +44 -0
  32. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja.py +2936 -0
  33. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja_test.py +55 -0
  34. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode.py +1394 -0
  35. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode_test.py +25 -0
  36. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/input.py +3130 -0
  37. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/input_test.py +98 -0
  38. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/mac_tool.py +771 -0
  39. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/msvs_emulation.py +1271 -0
  40. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/ninja_syntax.py +174 -0
  41. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/simple_copy.py +61 -0
  42. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/win_tool.py +374 -0
  43. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcode_emulation.py +1939 -0
  44. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcode_ninja.py +302 -0
  45. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcodeproj_file.py +3197 -0
  46. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xml_fix.py +65 -0
  47. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/test_gyp.py +261 -0
  48. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/graphviz.py +102 -0
  49. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_gyp.py +156 -0
  50. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_sln.py +181 -0
  51. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_vcproj.py +339 -0
  52. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/test/fixtures/test-charmap.py +31 -0
  53. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/update-gyp.py +64 -0
  54. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/gyp_main.py +45 -0
  55. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSNew.py +367 -0
  56. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSProject.py +206 -0
  57. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings.py +1270 -0
  58. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings_test.py +1547 -0
  59. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSToolFile.py +59 -0
  60. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSUserFile.py +153 -0
  61. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSUtil.py +271 -0
  62. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSVersion.py +574 -0
  63. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/__init__.py +666 -0
  64. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/common.py +654 -0
  65. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/common_test.py +78 -0
  66. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/easy_xml.py +165 -0
  67. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/easy_xml_test.py +109 -0
  68. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/flock_tool.py +55 -0
  69. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/__init__.py +0 -0
  70. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/analyzer.py +808 -0
  71. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/android.py +1173 -0
  72. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/cmake.py +1321 -0
  73. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/compile_commands_json.py +120 -0
  74. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/dump_dependency_json.py +103 -0
  75. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/eclipse.py +464 -0
  76. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/gypd.py +89 -0
  77. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/gypsh.py +58 -0
  78. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/make.py +2518 -0
  79. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs.py +3978 -0
  80. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs_test.py +44 -0
  81. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja.py +2936 -0
  82. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja_test.py +55 -0
  83. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode.py +1394 -0
  84. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode_test.py +25 -0
  85. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/input.py +3137 -0
  86. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/input_test.py +98 -0
  87. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/mac_tool.py +771 -0
  88. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/msvs_emulation.py +1271 -0
  89. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/ninja_syntax.py +174 -0
  90. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/simple_copy.py +61 -0
  91. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/win_tool.py +374 -0
  92. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcode_emulation.py +1939 -0
  93. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcode_ninja.py +302 -0
  94. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcodeproj_file.py +3197 -0
  95. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xml_fix.py +65 -0
  96. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/setup.py +42 -0
  97. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/test_gyp.py +260 -0
  98. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/graphviz.py +102 -0
  99. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_gyp.py +156 -0
  100. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_sln.py +181 -0
  101. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_vcproj.py +339 -0
  102. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/test/fixtures/test-charmap.py +31 -0
  103. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/update-gyp.py +46 -0
  104. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/app.py +401 -0
  105. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/name_fix.py +47 -0
  106. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/app.py +108 -0
  107. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/plot.py +93 -0
  108. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/rink_plot.py +245 -0
  109. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/app.py +145 -0
  110. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/plot.py +77 -0
  111. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/rink_plot.py +245 -0
  112. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/app.py +389 -0
  113. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/plot.py +70 -0
  114. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/rink_plot.py +245 -0
  115. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/app.py +110 -0
  116. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/plot.py +58 -0
  117. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/rink_plot.py +245 -0
  118. wsba_hockey/tools/agg.py +242 -53
  119. wsba_hockey/tools/plotting.py +12 -17
  120. wsba_hockey/tools/scraping.py +149 -258
  121. wsba_hockey/tools/xg_model.py +357 -311
  122. wsba_hockey/workspace.py +22 -117
  123. wsba_hockey/wsba_main.py +493 -165
  124. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.4.dist-info}/METADATA +1 -1
  125. wsba_hockey-1.0.4.dist-info/RECORD +135 -0
  126. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.4.dist-info}/WHEEL +1 -1
  127. wsba_hockey/stats/calculate_viz/shot_impact.py +0 -2
  128. wsba_hockey-1.0.3.dist-info/RECORD +0 -19
  129. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.4.dist-info}/licenses/LICENSE +0 -0
  130. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.4.dist-info}/top_level.txt +0 -0
@@ -3,48 +3,149 @@ import numpy as np
3
3
  import xgboost as xgb
4
4
  import scipy.sparse as sp
5
5
  import joblib
6
- from zipfile import ZipFile
7
- import requests as rs
6
+ import wsba_main as wsba
7
+ import tools.scraping as scraping
8
+ from sklearn.calibration import calibration_curve
9
+ from sklearn.metrics import roc_curve, auc
10
+ import matplotlib.pyplot as plt
8
11
 
9
12
  ### XG_MODEL FUNCTIONS ###
10
13
  # Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
11
14
 
12
15
  ## GLOBAL VARIABLES ##
13
- #Newest season
14
- new_full = '20242025'
15
- new = '2024'
16
16
 
17
- def prep_xG_data(pbp):
17
# Name of the label column the xG model is trained to predict.
target = "is_goal"

# Shot classifications; each one is also used as a 0/1 predictor column.
shot_types = [
    'wrist', 'deflected', 'tip-in', 'slap', 'backhand', 'snap',
    'wrap-around', 'poke', 'bat', 'cradle', 'between-legs',
]

# Play-by-play event types kept when filtering data for the model.
events = [
    'faceoff', 'hit', 'giveaway', 'takeaway',
    'blocked-shot', 'missed-shot', 'shot-on-goal', 'goal',
]

# Subset of `events` that the model is actually scored on.
fenwick_events = ['missed-shot', 'shot-on-goal', 'goal']

# Numeric predictors, in the column order fed to the model.
continuous = [
    'event_distance', 'event_angle',
    'seconds_elapsed', 'period',
    'x_adj', 'y_adj',
    'distance_from_last', 'angle_from_last',
    'seconds_since_last',
    'speed_from_last', 'speed_of_angle_from_last',
    'score_state', 'strength_diff',
]

# 0/1 predictors, in the column order fed to the model: venue flag, one column
# per shot type, prior-event flags for the same/opposing team, then the
# remaining situational flags.
boolean = (
    ['is_home']
    + shot_types
    + [
        f'prior_{prior_event}_{side}'
        for side in ('same', 'opp')
        for prior_event in (
            'shot-on-goal', 'missed-shot', 'blocked-shot',
            'giveaway', 'takeaway', 'hit',
        )
    ]
    + ['prior_faceoff', 'regular', 'empty_net', 'offwing', 'rush', 'rebound']
)

# Strength states accepted when filtering data for the model.
strengths = [
    '3v3', '3v4', '3v5',
    '4v3', '4v4', '4v5', '4v6',
    '5v3', '5v4', '5v5', '5v6',
    '6v4', '6v5',
]
80
+
81
def _player_id_key(ids):
    #Render player ids as roster lookup keys: the string form of the id with a
    #single trailing ".0" removed (ids read as floats print as "8478402.0").
    #The pattern is anchored and passed as an explicit regex so only that
    #suffix is stripped, whatever the installed pandas' default for `regex` is.
    return ids.astype(str).str.replace(r'\.0$', '', regex=True)

def fix_players(pbp):
    """Add/fix player names, goalie names and shooter handedness in play-by-play.

    Parameters:
        pbp: play-by-play DataFrame. Must contain 'event_player_{1,2,3}_id',
            'event_player_{1,2,3}_name' and 'event_goalie_id'.

    Returns:
        The same DataFrame object (it is modified in place) with:
          - 'add_player_{n}_name': roster name for rows whose
            'event_player_{n}_name' was missing, NaN elsewhere;
          - 'event_player_{n}_name': existing names, gaps filled from the roster;
          - 'event_goalie_name': roster name for 'event_goalie_id';
          - 'event_player_1_hand': roster 'shootsCatches' for 'event_player_1_id'.
        Ids that are missing or not found in the roster yield NaN (not the
        literal string 'nan' or the raw id).

    Notes:
        The roster is read from 'rosters/nhl_rosters.csv', a path relative to
        the current working directory. Player-1 ids absent from that file are
        fetched through wsba.nhl_scrape_player_data.
    """
    print('Adding player info to pbp...')

    #Load roster and all players
    roster = pd.read_csv('rosters/nhl_rosters.csv').drop_duplicates(['id'])[['fullName','id','shootsCatches']]

    #Some players are missing from the roster file (generally in newer seasons); add these manually
    shooter_ids = pbp['event_player_1_id']
    miss = list(shooter_ids[shooter_ids.notna() & ~shooter_ids.isin(roster['id'])].drop_duplicates())
    if miss:
        add = wsba.nhl_scrape_player_data(miss).rename(columns={'playerId':'id'})[['fullName','id','shootsCatches']]
        roster = pd.concat([roster,add]).reset_index(drop=True)

    #Conversion dicts, keyed by the normalized string id
    roster['id'] = _player_id_key(roster['id'])
    keyed = roster.set_index('id')
    roster_dict = keyed['shootsCatches'].to_dict()
    names_dict = keyed['fullName'].to_dict()

    #Add player names
    #map() (unlike replace()) returns NaN for ids that are not in the roster, so a
    #missing player is never given the name 'nan' or an id string
    for n in (1, 2, 3):
        id_col = f'event_player_{n}_id'
        name_col = f'event_player_{n}_name'
        looked_up = _player_id_key(pbp[id_col]).map(names_dict)
        pbp[f'add_player_{n}_name'] = looked_up.where(pbp[name_col].isna())
        pbp[name_col] = pbp[name_col].combine_first(pbp[f'add_player_{n}_name'])

    pbp['event_goalie_name'] = _player_id_key(pbp['event_goalie_id']).map(names_dict)

    #Add hands
    pbp['event_player_1_hand'] = _player_id_key(pbp['event_player_1_id']).map(roster_dict)

    return pbp
111
+
112
+ def prep_xG_data(data):
18
113
  #Prep data for xG training and calculation
19
114
 
20
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
21
- shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
22
- fenwick_events = ['missed-shot','shot-on-goal','goal']
23
-
24
115
  #Informal groupby
25
- data = pbp.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
116
+ data = data.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
26
117
 
27
- #Add event time details - prevent leaking between games by setting value to zero when no time has occured in game
28
- data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_elapsed']-data['seconds_elapsed'].shift(1))
29
- data["event_length"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'].shift(-1))
118
+ #Recalibrate times series data with current data
119
+ data['seconds_since_last'] = data['seconds_elapsed'] - data['seconds_elapsed'].shift(1)
120
+ #Prevent leaking between games by setting value to zero when no time has occured in game
121
+ data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'])
30
122
 
31
123
  #Create last event columns
32
124
  data["event_team_last"] = data['event_team_abbr'].shift(1)
33
125
  data["event_type_last"] = data['event_type'].shift(1)
34
- data["x_fixed_last"] = data['x_fixed'].shift(1)
35
- data["y_fixed_last"] = data['y_fixed'].shift(1)
36
- data["zone_code_last"] = data['zone_code'].shift(1)
126
+ data["x_adj_last"] = data['x_adj'].shift(1)
127
+ data["y_adj_last"] = data['y_adj'].shift(1)
128
+ data["zone_code_last"] = data['zone_code'].shift(1)
37
129
 
38
130
  data.sort_values(['season','game_id','period','seconds_elapsed','event_num'],inplace=True)
131
+
132
+ #Contextual Data (for score state minimize the capture to four goals)
39
133
  data['score_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_score']-data['home_score'],data['home_score']-data['away_score'])
134
+ data['score_state'] = np.where(data['score_state']>4,4,data['score_state'])
135
+ data['score_state'] = np.where(data['score_state']<-4,-4,data['score_state'])
136
+
40
137
  data['strength_diff'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
41
138
  data['strength_state_venue'] = data['away_skaters'].astype(str)+'v'+data['home_skaters'].astype(str)
42
- data['fenwick_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_fenwick']-data['home_fenwick'],data['home_fenwick']-data['away_fenwick'])
43
- data['distance_from_last'] = np.sqrt((data['x_fixed'] - data['x_fixed_last'])**2 + (data['y_fixed'] - data['y_fixed_last'])**2)
139
+ data['distance_from_last'] = np.sqrt((data['x_adj'] - data['x_adj_last'])**2 + (data['y_adj'] - data['y_adj_last'])**2)
140
+ data['angle_from_last'] = np.degrees(np.arctan2(abs(data['y_adj'] - data['y_adj_last']), abs(89 - (data['x_adj']-data['x_adj_last']))))
141
+
142
+ #Event speeds
143
+ data['speed_from_last'] = np.where(data['seconds_since_last']==0,0,data['distance_from_last']/data['seconds_since_last'])
144
+ data['speed_of_angle_from_last'] = np.where(data['seconds_since_last']==0,0,data['angle_from_last']/data['seconds_since_last'])
44
145
 
45
- #Rush and rebounds are included and graded off of the speed of the event (an event cannot be a rush event unless it also occurs in the offensive zone)
46
- data['rush_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_fixed']>25)&(data['seconds_since_last']<5),5-data['seconds_since_last'],0)
47
- data['rebound_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<3),3-data['seconds_since_last'],0)
146
+ #Rush and rebounds are labelled
147
+ data['rush'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_adj']>25)&(data['seconds_since_last']<=5),1,0)
148
+ data['rebound'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<=2),1,0)
48
149
 
49
150
  #Create boolean variables
50
151
  data["is_goal"]=(data['event_type']=='goal').astype(int)
@@ -59,219 +160,179 @@ def prep_xG_data(pbp):
59
160
 
60
161
  data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)
61
162
 
163
+ #Misc variables
164
+ data['empty_net'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_goalie_id'].isna()),1,0)
165
+ data['regular'] = (data['season_type']==2).astype(int)
166
+ data['offwing'] = np.where(((data['y_adj']<0)&(data['event_player_1_hand']=='L'))|((data['y_adj']>=0)&(data['event_player_1_hand']=='R')),1,0)
167
+
62
168
  #Return: pbp data prepared to train and calculate the xG model
63
169
  return data
64
170
 
65
- def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
171
+ def wsba_xG(pbp, hypertune = False, train = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
66
172
  #Train and calculate the WSBA Expected Goals model
67
-
68
- target = "is_goal"
69
- continous = ['event_distance',
70
- 'event_angle',
71
- 'seconds_elapsed',
72
- 'period',
73
- 'x_fixed',
74
- 'y_fixed',
75
- 'x_fixed_last',
76
- 'y_fixed_last',
77
- 'distance_from_last',
78
- 'seconds_since_last',
79
- 'score_state',
80
- 'strength_diff',
81
- 'fenwick_state',
82
- 'rush_mod',
83
- 'rebound_mod']
84
- boolean = ['is_home',
85
- 'wrist',
86
- 'deflected',
87
- 'tip-in',
88
- 'slap',
89
- 'backhand',
90
- 'snap',
91
- 'wrap-around',
92
- 'poke',
93
- 'bat',
94
- 'cradle',
95
- 'between-legs',
96
- 'prior_shot-on-goal_same',
97
- 'prior_missed-shot_same',
98
- 'prior_blocked-shot_same',
99
- 'prior_giveaway_same',
100
- 'prior_takeaway_same',
101
- 'prior_hit_same',
102
- 'prior_shot-on-goal_opp',
103
- 'prior_missed-shot_opp',
104
- 'prior_blocked-shot_opp',
105
- 'prior_giveaway_opp',
106
- 'prior_takeaway_opp',
107
- 'prior_hit_opp',
108
- 'prior_faceoff']
109
-
173
+
174
+ #Add index for future merging
175
+ pbp['event_index'] = pbp.index
176
+
177
+ #Recalibrate coordinates
178
+ pbp = scraping.adjust_coords(pbp)
179
+
180
+ #Fix strengths
181
+ pbp['strength_state'] = np.where((pbp['season_type']==3)&(pbp['period']>4),(np.where(pbp['event_team_abbr']==pbp['away_team_abbr'],pbp['away_skaters'].astype(str)+"v"+pbp['home_skaters'].astype(str),pbp['home_skaters'].astype(str)+"v"+pbp['away_skaters'].astype(str))),pbp['strength_state'])
182
+
183
+ #Filter unwanted data:
184
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
185
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
186
+ (pbp['strength_state'].isin(strengths))&
187
+ (pbp['x'].notna())&
188
+ (pbp['y'].notna())]
189
+
110
190
  #Prep Data
111
- pbp_prep = prep_xG_data(pbp)
112
- #Filter unwanted date:
113
- #Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
114
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
115
- fenwick_events = ['missed-shot','shot-on-goal','goal']
116
- strengths = ['3v3',
117
- '3v4',
118
- '3v5',
119
- '4v3',
120
- '4v4',
121
- '4v5',
122
- '4v6',
123
- '5v3',
124
- '5v4',
125
- '5v5',
126
- '5v6',
127
- '6v4',
128
- '6v5']
129
-
130
- data = pbp_prep.loc[(pbp_prep['event_type'].isin(events))&
131
- (pbp_prep['strength_state'].isin(strengths))&
132
- (pbp_prep['period'] < 5)&
133
- (pbp_prep['x_fixed'].notna())&
134
- (pbp_prep['y_fixed'].notna())&
135
- ~((pbp_prep['x_fixed']==0)&(pbp_prep['y_fixed']==0)&(pbp_prep['x_fixed'].isin(fenwick_events))&(pbp_prep['event_distance']!=90))]
191
+ data = prep_xG_data(pbp_prep)
136
192
 
193
+ #Reduce to fenwick shots
194
+ data = data.loc[data['event_type'].isin(fenwick_events)]
195
+
137
196
  #Convert to sparse
138
- data_sparse = sp.csr_matrix(data[[target]+continous+boolean])
197
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
139
198
 
140
199
  #Target and Predictors
141
200
  is_goal_vect = data_sparse[:, 0].A
142
201
  predictors = data_sparse[:, 1:]
143
202
 
144
203
  #XGB DataModel
145
- xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect)
204
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
205
+
206
+ if train:
207
+ if hypertune:
208
+ # Number of runs
209
+ run_num = train_runs
210
+
211
+ # DataFrames to store results
212
+ best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
213
+ best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
214
+
215
+ # Loop
216
+ for i in range(run_num):
217
+ print(f"### LOOP: {i+1} ###")
218
+
219
+ param = {
220
+ "objective": "binary:logistic",
221
+ "eval_metric": ["logloss", "auc"],
222
+ "max_depth": 6,
223
+ "eta": np.random.uniform(0.06, 0.11),
224
+ "gamma": np.random.uniform(0.06, 0.12),
225
+ "subsample": np.random.uniform(0.76, 0.84),
226
+ "colsample_bytree": np.random.uniform(0.76, 0.8),
227
+ "min_child_weight": np.random.randint(5, 23),
228
+ "max_delta_step": np.random.randint(4, 9)
229
+ }
230
+
231
+ # Cross-validation
232
+ seed = np.random.randint(0, 10000)
233
+ np.random.seed(seed)
234
+
235
+ cv_results = xgb.cv(
236
+ params=param,
237
+ dtrain=xgb_matrix,
238
+ num_boost_round=1000,
239
+ nfold=5,
240
+ early_stopping_rounds=25,
241
+ metrics=["logloss", "auc"],
242
+ seed=seed
243
+ )
244
+
245
+ # Record results
246
+ best_df.loc[i] = param
247
+ best_ll.loc[i] = [
248
+ cv_results["test-logloss-mean"].min(),
249
+ cv_results["test-logloss-mean"].idxmin(),
250
+ cv_results["test-auc-mean"].max(),
251
+ cv_results["test-auc-mean"].idxmax(),
252
+ seed
253
+ ]
254
+
255
+ # Combine results
256
+ best_all = pd.concat([best_df, best_ll], axis=1).dropna()
257
+
258
+ # Arrange to get best run
259
+ best_all = best_all.sort_values(by="auc", ascending=False)
146
260
 
147
- if train == True:
148
- # Number of runs
149
- run_num = train_runs
150
-
151
- # DataFrames to store results
152
- best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
153
- best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
261
+ best_all.to_csv("tools/xg_model/testing/xg_model_training_runs.csv",index=False)
154
262
 
155
- # Loop
156
- for i in range(run_num):
157
- print(f"### LOOP: {i+1} ###")
158
-
159
- param = {
263
+ # Final parameters
264
+ param_7_EV = {
160
265
  "objective": "binary:logistic",
161
266
  "eval_metric": ["logloss", "auc"],
162
- "max_depth": 6,
163
- "eta": np.random.uniform(0.06, 0.11),
164
- "gamma": np.random.uniform(0.06, 0.12),
165
- "subsample": np.random.uniform(0.76, 0.84),
166
- "colsample_bytree": np.random.uniform(0.76, 0.8),
167
- "min_child_weight": np.random.randint(5, 23),
168
- "max_delta_step": np.random.randint(4, 9)
267
+ "gamma": best_all['gamma'].iloc[0],
268
+ "subsample": best_all['subsample'].iloc[0],
269
+ "max_depth": best_all['max_depth'].iloc[0],
270
+ "colsample_bytree": best_all['colsample_bytree'].iloc[0],
271
+ "min_child_weight": best_all['min_child_weight'].iloc[0],
272
+ "max_delta_step": best_all['max_delta_step'].iloc[0],
169
273
  }
170
-
171
- # Cross-validation
172
- seed = np.random.randint(0, 10000)
173
- np.random.seed(seed)
174
-
175
- cv_results = xgb.cv(
176
- params=param,
177
- dtrain=xgb_matrix,
178
- num_boost_round=1000,
179
- nfold=5,
180
- early_stopping_rounds=25,
181
- metrics=["logloss", "auc"],
182
- seed=seed
183
- )
184
-
185
- # Record results
186
- best_df.loc[i] = param
187
- best_ll.loc[i] = [
188
- cv_results["test-logloss-mean"].min(),
189
- cv_results["test-logloss-mean"].idxmin(),
190
- cv_results["test-auc-mean"].max(),
191
- cv_results["test-auc-mean"].idxmax(),
192
- seed
193
- ]
194
-
195
- # Combine results
196
- best_all = pd.concat([best_df, best_ll], axis=1).dropna()
197
-
198
- # Arrange to get best run
199
- best_all = best_all.sort_values(by="auc", ascending=False)
200
-
201
- if overwrite == True:
202
- best_all.to_csv("tools/xg_model/testing/xg_model_training_runs.csv",index=False)
203
- else:
204
- best_old = pd.read_csv("tools/xg_model/testing/xg_model_training_runs.csv")
205
- best_comb = pd.concat([best_old,best_all])
206
- best_comb.to_csv("tools/xg_model/testing/xg_model_training_runs.csv",index=False)
207
-
208
- # Final parameters
209
- param_7_EV = {
210
- "objective": "binary:logistic",
211
- "eval_metric": ["logloss", "auc"],
212
- "eta": 0.068,
213
- "gamma": 0.12,
214
- "subsample": 0.78,
215
- "max_depth": 6,
216
- "colsample_bytree": 0.76,
217
- "min_child_weight": 5,
218
- "max_delta_step": 5,
219
- }
220
-
221
- # CV rounds Loop
222
- run_num = cv_runs
223
- cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
224
-
225
- for i in range(run_num):
226
- print(f"### LOOP: {i+1} ###")
227
-
228
- seed = np.random.randint(0, 10000)
229
- np.random.seed(seed)
230
-
231
- cv_rounds = xgb.cv(
232
- params=param_7_EV,
233
- dtrain=xgb_matrix,
234
- num_boost_round=1000,
235
- nfold=5,
236
- early_stopping_rounds=25,
237
- metrics=["logloss", "auc"],
238
- seed=seed
239
- )
240
-
241
- # Record results
242
- cv_test.loc[i] = [
243
- cv_rounds["test-auc-mean"].idxmax(),
244
- cv_rounds["test-auc-mean"].max(),
245
- cv_rounds["test-logloss-mean"].idxmin(),
246
- cv_rounds["test-logloss-mean"].min(),
247
- seed
248
- ]
249
274
 
250
- # Clean results and sort to find the number of rounds to use and seed
251
- cv_final = cv_test.sort_values(by="AUC", ascending=False)
252
- if overwrite == True:
275
+ # CV rounds Loop
276
+ run_num = cv_runs
277
+ cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
278
+
279
+ for i in range(run_num):
280
+ print(f"### LOOP: {i+1} ###")
281
+
282
+ seed = np.random.randint(0, 10000)
283
+ np.random.seed(seed)
284
+
285
+ cv_rounds = xgb.cv(
286
+ params=param_7_EV,
287
+ dtrain=xgb_matrix,
288
+ num_boost_round=1000,
289
+ nfold=5,
290
+ early_stopping_rounds=25,
291
+ metrics=["logloss", "auc"],
292
+ seed=seed
293
+ )
294
+
295
+ # Record results
296
+ cv_test.loc[i] = [
297
+ cv_rounds["test-auc-mean"].idxmax(),
298
+ cv_rounds["test-auc-mean"].max(),
299
+ cv_rounds["test-logloss-mean"].idxmin(),
300
+ cv_rounds["test-logloss-mean"].min(),
301
+ seed
302
+ ]
303
+
304
+ # Clean results and sort to find the number of rounds to use and seed
305
+ cv_final = cv_test.sort_values(by="AUC", ascending=False)
253
306
  cv_final.to_csv("tools/xg_model/testing/xg_model_cv_runs.csv",index=False)
254
307
  else:
255
- cv_old = pd.read_csv("tools/xg_model/testing/xg_model_cv_runs.csv")
256
- cv_comb = pd.concat([cv_old,cv_final])
257
- cv_comb.to_csv("tools/xg_model/testing/xg_model_cv_runs.csv")
258
- cv_final.loc[len(cv_final)] = cv_test.mean()
308
+ # Load previous parameters
309
+ best_all = pd.read_csv('tools/xg_model/testing/xg_model_training_runs.csv')
310
+ cv_final = pd.read_csv("tools/xg_model/testing/xg_model_cv_runs.csv")
259
311
 
260
- # Train the final model
261
- np.random.seed(556)
262
-
263
- if overwrite == False:
264
- model = joblib.load(model_path)
265
- else:
266
- ""
312
+ print('Loaded hyperparameters...')
313
+ # Final parameters
314
+ param_7_EV = {
315
+ "objective": "binary:logistic",
316
+ "eval_metric": ["logloss", "auc"],
317
+ "gamma": best_all['gamma'].iloc[0],
318
+ "subsample": best_all['subsample'].iloc[0],
319
+ "max_depth": best_all['max_depth'].iloc[0],
320
+ "colsample_bytree": best_all['colsample_bytree'].iloc[0],
321
+ "min_child_weight": best_all['min_child_weight'].iloc[0],
322
+ "max_delta_step": best_all['max_delta_step'].iloc[0],
323
+ }
267
324
 
325
+ print('Training model...')
326
+ seed = int(cv_final['seed'].iloc[0])
327
+ np.random.seed(seed)
268
328
  model = xgb.train(
269
329
  params=param_7_EV,
270
330
  dtrain=xgb_matrix,
271
- num_boost_round=189,
272
- verbose_eval=2
331
+ num_boost_round=int(cv_final['AUC_rounds'].iloc[0]),
332
+ verbose_eval=2,
273
333
  )
274
-
334
+
335
+ #Save model
275
336
  joblib.dump(model,model_path)
276
337
 
277
338
  else:
@@ -279,123 +340,108 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
279
340
 
280
341
  #Predict goal
281
342
  data['xG'] = model.predict(xgb_matrix)
282
- data['xG'] = np.where(data['event_type'].isin(fenwick_events),data['xG'],np.nan)
283
-
284
- #Avoid merging errors
285
- merge_col = ['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
286
-
287
- for df in [pbp,data]:
288
- df = df.astype({
289
- 'game_id':'int',
290
- 'period':'int',
291
- 'seconds_elapsed':'int',
292
- 'event_type':'str',
293
- 'event_team_abbr':'str',
294
- 'event_player_1_id':'float'
295
- })
296
343
 
297
344
  #Drop previous xG if it exists
298
- try: pbp = pbp.drop(columns=['xG'])
299
- except KeyError:
300
- ''
345
+ pbp = pbp.drop(columns=['xG'],errors='ignore')
301
346
 
302
347
  #Merge
303
- data = data[merge_col+['xG']]
348
+ comm = list(data.columns.intersection(pbp.columns))
349
+ comm.remove('event_index')
350
+ data = data.drop(columns=comm)
304
351
  pbp_xg = pd.merge(pbp,data,how='left')
305
352
 
306
353
  return pbp_xg
307
354
 
308
- def moneypuck_xG(pbp,repo_path = "tools/xg_model/moneypuck/shots_2007-2023.zip"):
309
- #Given play-by-play, return itself with xG column sourced from MoneyPuck.com
355
def feature_importance(model):
    """Plot and save the feature importance chart of the WSBA xG model.

    Args:
        model: Location of the serialized xG model; it is passed straight to
            ``joblib.load``.

    Returns:
        None. The chart is written to
        ``tools/xg_model/metrics/feature_importance.png`` (relative to the
        current working directory).
    """
    # Function-scope import so the block does not depend on a file-level one.
    import os

    print('Feature importance for WSBA xG Model...')
    booster = joblib.load(model)

    out_path = 'tools/xg_model/metrics/feature_importance.png'
    # Saving fails with FileNotFoundError when the folder is missing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        xgb.plot_importance(
            booster,
            importance_type='weight',
            max_num_features=30,
            height=0.5,
            grid=False,
            show_values=False,
            xlabel='Weight',
            title='WSBA xG Feature Importance',
            ax=ax,
        )
        # Save the figure we built explicitly instead of pyplot's implicit
        # "current figure".
        fig.savefig(out_path, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
310
371
 
311
- #If file is already in the repository downloading is not necessary
312
- try:
313
- db = pd.read_parquet("tools/xg_model/moneypuck/shots/shots_2007-2023.parquet")
314
- except:
315
- url = 'https://peter-tanner.com/moneypuck/downloads/shots_2007-2023.zip'
372
def roc_auc_curve(pbp, model):
    """Plot and save the ROC curve of the WSBA xG model for the given play-by-play.

    Args:
        pbp: Play-by-play DataFrame. The filter below reads its ``event_type``,
            ``strength_state``, ``period``, ``x`` and ``y`` columns; the
            columns named by the module-level ``target``, ``continuous`` and
            ``boolean`` lists are read after ``prep_xG_data`` (presumably
            produced by that helper -- confirm against its definition).
        model: Location of the serialized xG model; it is passed straight to
            ``joblib.load``.

    Returns:
        None. The plot is written to
        ``tools/xg_model/metrics/roc_auc_curve.png`` (relative to the current
        working directory).
    """
    # Function-scope import so the block does not depend on a file-level one.
    import os

    print('ROC-AUC Curve for WSBA xG Model...')

    #Recalibrate coordinates
    adjusted = scraping.adjust_coords(pbp)

    #Filter unwanted data:
    #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
    keep = (
        adjusted['event_type'].isin(events)
        & adjusted['strength_state'].isin(strengths)
        & (adjusted['period'] < 5)
        & adjusted['x'].notna()
        & adjusted['y'].notna()
    )
    prepared = prep_xG_data(adjusted.loc[keep])
    booster = joblib.load(model)

    # Only fenwick events are scored.
    shots = prepared.loc[prepared['event_type'].isin(fenwick_events)]

    feature_names = continuous + boolean
    # Column 0 holds the target, the remaining columns are the predictors.
    data_sparse = sp.csr_matrix(shots[[target] + feature_names])

    # toarray() is the documented dense conversion (the `.A` shorthand is a
    # legacy alias that newer SciPy sparse containers no longer provide);
    # ravel() turns the (n, 1) column into the 1-D label vector the metric
    # functions expect.
    is_goal_vect = data_sparse[:, 0].toarray().ravel()
    predictors = data_sparse[:, 1:]

    xgb_matrix = xgb.DMatrix(data=predictors, label=is_goal_vect, feature_names=feature_names)

    pred = booster.predict(xgb_matrix)
    fpr, tpr, _ = roc_curve(is_goal_vect, pred)
    roc_auc = auc(fpr, tpr)

    out_path = 'tools/xg_model/metrics/roc_auc_curve.png'
    # Saving fails with FileNotFoundError when the folder is missing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.plot(fpr, tpr, label=f"ROC (AUC = {roc_auc:.4f})")
        # Diagonal reference line.
        ax.plot([0, 1], [0, 1], linestyle="--")
        ax.set_title("WSBA xG ROC Curve")
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.legend(loc="lower right")
        fig.savefig(out_path)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
410
+
411
def reliability(pbp, model):
    """Plot and save the reliability diagram of the WSBA xG model.

    Args:
        pbp: Play-by-play DataFrame. The filter below reads its ``event_type``,
            ``strength_state``, ``period``, ``x`` and ``y`` columns; the
            columns named by the module-level ``target``, ``continuous`` and
            ``boolean`` lists are read after ``prep_xG_data`` (presumably
            produced by that helper -- confirm against its definition).
        model: Location of the serialized xG model; it is passed straight to
            ``joblib.load``.

    Returns:
        None. The plot is written to
        ``tools/xg_model/metrics/reliability.png`` (relative to the current
        working directory).
    """
    # Function-scope import so the block does not depend on a file-level one.
    import os

    print('Reliability for WSBA xG Model...')

    #Recalibrate coordinates
    adjusted = scraping.adjust_coords(pbp)

    #Filter unwanted data:
    #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
    keep = (
        adjusted['event_type'].isin(events)
        & adjusted['strength_state'].isin(strengths)
        & (adjusted['period'] < 5)
        & adjusted['x'].notna()
        & adjusted['y'].notna()
    )
    prepared = prep_xG_data(adjusted.loc[keep])
    booster = joblib.load(model)

    # Only fenwick events are scored.
    shots = prepared.loc[prepared['event_type'].isin(fenwick_events)]

    feature_names = continuous + boolean
    # Column 0 holds the target, the remaining columns are the predictors.
    data_sparse = sp.csr_matrix(shots[[target] + feature_names])

    # toarray() is the documented dense conversion (the `.A` shorthand is a
    # legacy alias that newer SciPy sparse containers no longer provide);
    # ravel() turns the (n, 1) column into the 1-D label vector the metric
    # functions expect.
    is_goal_vect = data_sparse[:, 0].toarray().ravel()
    predictors = data_sparse[:, 1:]

    xgb_matrix = xgb.DMatrix(data=predictors, label=is_goal_vect, feature_names=feature_names)

    pred = booster.predict(xgb_matrix)
    # calibration_curve returns (fraction of positives, mean predicted value)
    # per bin, in that order.
    fop, mpv = calibration_curve(is_goal_vect, pred, strategy='uniform')

    out_path = 'tools/xg_model/metrics/reliability.png'
    # Saving fails with FileNotFoundError when the folder is missing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.plot(mpv, fop, "s-", label="Model")
        ax.plot([0, 1], [0, 1], linestyle="--", label="Perfect calibration")
        ax.set_title("WSBA xG Reliability Diagram")
        ax.set_xlabel("Predicted Probability (mean)")
        ax.set_ylabel("Fraction of positives")
        ax.legend(loc="best")
        fig.savefig(out_path)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate open figures.
        plt.close(fig)