wsba-hockey 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. wsba_hockey/__init__.py +1 -1
  2. wsba_hockey/data_pipelines.py +183 -0
  3. wsba_hockey/evidence/weakside-breakout/node_modules/duckdb/vendor.py +146 -0
  4. wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/flatted.py +149 -0
  5. wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/test.py +63 -0
  6. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/gyp_main.py +45 -0
  7. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSNew.py +367 -0
  8. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSProject.py +206 -0
  9. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings.py +1270 -0
  10. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings_test.py +1547 -0
  11. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSToolFile.py +59 -0
  12. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSUserFile.py +153 -0
  13. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSUtil.py +271 -0
  14. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/MSVSVersion.py +574 -0
  15. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/__init__.py +690 -0
  16. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/common.py +661 -0
  17. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/common_test.py +78 -0
  18. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/easy_xml.py +165 -0
  19. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/easy_xml_test.py +109 -0
  20. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/flock_tool.py +55 -0
  21. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/__init__.py +0 -0
  22. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/analyzer.py +808 -0
  23. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/android.py +1173 -0
  24. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/cmake.py +1321 -0
  25. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/compile_commands_json.py +120 -0
  26. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/dump_dependency_json.py +103 -0
  27. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/eclipse.py +464 -0
  28. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/gypd.py +89 -0
  29. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/gypsh.py +58 -0
  30. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/make.py +2714 -0
  31. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs.py +3981 -0
  32. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs_test.py +44 -0
  33. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja.py +2936 -0
  34. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja_test.py +55 -0
  35. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode.py +1394 -0
  36. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode_test.py +25 -0
  37. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/input.py +3130 -0
  38. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/input_test.py +98 -0
  39. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/mac_tool.py +771 -0
  40. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/msvs_emulation.py +1271 -0
  41. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/ninja_syntax.py +174 -0
  42. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/simple_copy.py +61 -0
  43. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/win_tool.py +374 -0
  44. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcode_emulation.py +1939 -0
  45. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcode_ninja.py +302 -0
  46. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xcodeproj_file.py +3197 -0
  47. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/pylib/gyp/xml_fix.py +65 -0
  48. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/test_gyp.py +261 -0
  49. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/graphviz.py +102 -0
  50. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_gyp.py +156 -0
  51. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_sln.py +181 -0
  52. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/gyp/tools/pretty_vcproj.py +339 -0
  53. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/test/fixtures/test-charmap.py +31 -0
  54. wsba_hockey/evidence/weakside-breakout/node_modules/node-gyp/update-gyp.py +64 -0
  55. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/gyp_main.py +45 -0
  56. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSNew.py +367 -0
  57. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSProject.py +206 -0
  58. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings.py +1270 -0
  59. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSSettings_test.py +1547 -0
  60. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSToolFile.py +59 -0
  61. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSUserFile.py +153 -0
  62. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSUtil.py +271 -0
  63. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/MSVSVersion.py +574 -0
  64. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/__init__.py +666 -0
  65. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/common.py +654 -0
  66. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/common_test.py +78 -0
  67. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/easy_xml.py +165 -0
  68. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/easy_xml_test.py +109 -0
  69. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/flock_tool.py +55 -0
  70. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/__init__.py +0 -0
  71. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/analyzer.py +808 -0
  72. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/android.py +1173 -0
  73. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/cmake.py +1321 -0
  74. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/compile_commands_json.py +120 -0
  75. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/dump_dependency_json.py +103 -0
  76. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/eclipse.py +464 -0
  77. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/gypd.py +89 -0
  78. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/gypsh.py +58 -0
  79. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/make.py +2518 -0
  80. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs.py +3978 -0
  81. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/msvs_test.py +44 -0
  82. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja.py +2936 -0
  83. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/ninja_test.py +55 -0
  84. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode.py +1394 -0
  85. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/generator/xcode_test.py +25 -0
  86. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/input.py +3137 -0
  87. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/input_test.py +98 -0
  88. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/mac_tool.py +771 -0
  89. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/msvs_emulation.py +1271 -0
  90. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/ninja_syntax.py +174 -0
  91. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/simple_copy.py +61 -0
  92. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/win_tool.py +374 -0
  93. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcode_emulation.py +1939 -0
  94. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcode_ninja.py +302 -0
  95. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xcodeproj_file.py +3197 -0
  96. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/pylib/gyp/xml_fix.py +65 -0
  97. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/setup.py +42 -0
  98. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/test_gyp.py +260 -0
  99. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/graphviz.py +102 -0
  100. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_gyp.py +156 -0
  101. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_sln.py +181 -0
  102. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/gyp/tools/pretty_vcproj.py +339 -0
  103. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/test/fixtures/test-charmap.py +31 -0
  104. wsba_hockey/evidence/weakside-breakout/node_modules/sqlite3/node_modules/node-gyp/update-gyp.py +46 -0
  105. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/app.py +400 -0
  106. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/game_stats/name_fix.py +47 -0
  107. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/app.py +108 -0
  108. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/plot.py +93 -0
  109. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/heatmaps/rink_plot.py +245 -0
  110. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/app.py +145 -0
  111. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/plot.py +77 -0
  112. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/matchups/rink_plot.py +245 -0
  113. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/app.py +389 -0
  114. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/plot.py +70 -0
  115. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/pbp/rink_plot.py +245 -0
  116. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/app.py +110 -0
  117. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/plot.py +58 -0
  118. wsba_hockey/evidence/weakside-breakout/wsba_nhl_apps/wsba_nhl_apps/skater/rink_plot.py +245 -0
  119. wsba_hockey/tools/agg.py +243 -54
  120. wsba_hockey/tools/plotting.py +25 -25
  121. wsba_hockey/tools/scraping.py +154 -263
  122. wsba_hockey/tools/xg_model.py +369 -315
  123. wsba_hockey/workspace.py +22 -117
  124. wsba_hockey/wsba_main.py +499 -167
  125. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.5.dist-info}/METADATA +1 -1
  126. wsba_hockey-1.0.5.dist-info/RECORD +135 -0
  127. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.5.dist-info}/WHEEL +1 -1
  128. wsba_hockey/stats/calculate_viz/shot_impact.py +0 -2
  129. wsba_hockey-1.0.3.dist-info/RECORD +0 -19
  130. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.5.dist-info}/licenses/LICENSE +0 -0
  131. {wsba_hockey-1.0.3.dist-info → wsba_hockey-1.0.5.dist-info}/top_level.txt +0 -0
@@ -1,50 +1,159 @@
1
+ import joblib
2
+ import os
1
3
  import pandas as pd
2
4
  import numpy as np
3
5
  import xgboost as xgb
4
6
  import scipy.sparse as sp
5
- import joblib
6
- from zipfile import ZipFile
7
- import requests as rs
7
+ import wsba_hockey.wsba_main as wsba
8
+ import wsba_hockey.tools.scraping as scraping
9
+ import matplotlib.pyplot as plt
10
+ from sklearn.calibration import calibration_curve
11
+ from sklearn.metrics import roc_curve, auc
8
12
 
9
13
  ### XG_MODEL FUNCTIONS ###
10
14
  # Provided in this file are functions vital to the goal prediction model in the WSBA Hockey Python package. #
11
15
 
12
16
  ## GLOBAL VARIABLES ##
13
- #Newest season
14
- new_full = '20242025'
15
- new = '2024'
16
17
 
17
- def prep_xG_data(pbp):
18
+ target = "is_goal"
19
+ continuous = ['event_distance',
20
+ 'event_angle',
21
+ 'seconds_elapsed',
22
+ 'period',
23
+ 'x_adj',
24
+ 'y_adj',
25
+ 'distance_from_last',
26
+ 'angle_from_last',
27
+ 'seconds_since_last',
28
+ 'speed_from_last',
29
+ 'speed_of_angle_from_last',
30
+ 'score_state',
31
+ 'strength_diff'
32
+ ]
33
+ boolean = ['is_home',
34
+ 'wrist',
35
+ 'deflected',
36
+ 'tip-in',
37
+ 'slap',
38
+ 'backhand',
39
+ 'snap',
40
+ 'wrap-around',
41
+ 'poke',
42
+ 'bat',
43
+ 'cradle',
44
+ 'between-legs',
45
+ 'prior_shot-on-goal_same',
46
+ 'prior_missed-shot_same',
47
+ 'prior_blocked-shot_same',
48
+ 'prior_giveaway_same',
49
+ 'prior_takeaway_same',
50
+ 'prior_hit_same',
51
+ 'prior_shot-on-goal_opp',
52
+ 'prior_missed-shot_opp',
53
+ 'prior_blocked-shot_opp',
54
+ 'prior_giveaway_opp',
55
+ 'prior_takeaway_opp',
56
+ 'prior_hit_opp',
57
+ 'prior_faceoff',
58
+ 'regular',
59
+ 'empty_net',
60
+ 'offwing',
61
+ 'rush',
62
+ 'rebound'
63
+ ]
64
+
65
+ events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
66
+ shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
67
+ fenwick_events = ['missed-shot','shot-on-goal','goal']
68
+ strengths = ['3v3',
69
+ '3v4',
70
+ '3v5',
71
+ '4v3',
72
+ '4v4',
73
+ '4v5',
74
+ '4v6',
75
+ '5v3',
76
+ '5v4',
77
+ '5v5',
78
+ '5v6',
79
+ '6v4',
80
+ '6v5']
81
+
82
+ dir = os.path.dirname(os.path.realpath(__file__))
83
+ roster_path = os.path.join(dir,'rosters\\nhl_rosters.csv')
84
+ xg_model_path = os.path.join(dir,'xg_model\\wsba_xg.joblib')
85
+ test_path = os.path.join(dir,'xg_model\\testing\\xg_model_training_runs.csv')
86
+ cv_path = os.path.join(dir,'xg_model\\testing\\xg_model_cv_runs.csv')
87
+
88
+ def fix_players(pbp):
89
+ #Add/fix player info for shooters and goaltenders
90
+ print('Adding player info to pbp...')
91
+
92
+ #Load roster and all players
93
+ roster = pd.read_csv(roster_path).drop_duplicates(['id'])[['fullName','id','shootsCatches']]
94
+
95
+ #Some players are missing from the roster file (generally in newer seasons); add these manually
96
+ miss = list(pbp.loc[~(pbp['event_player_1_id'].isin(list(roster['id'])))&(pbp['event_player_1_id'].notna()),'event_player_1_id'].drop_duplicates())
97
+ if miss:
98
+ add = wsba.nhl_scrape_player_data(miss).rename(columns={'playerId':'id'})[['fullName','id','shootsCatches']]
99
+ roster = pd.concat([roster,add]).reset_index(drop=True)
100
+
101
+ #Conversion dict
102
+ roster['id'] = roster['id'].astype(str)
103
+ roster_dict = roster.set_index('id').to_dict()['shootsCatches']
104
+ names_dict = roster.set_index('id').to_dict()['fullName']
105
+
106
+ #Add player names
107
+ for i in range(3):
108
+ pbp[f'add_player_{i+1}_name'] = np.where(pbp[f'event_player_{i+1}_name'].isna(),pbp[f'event_player_{i+1}_id'].astype(str).replace(names_dict),np.nan)
109
+ pbp[f'event_player_{i+1}_name'] = pbp[f'event_player_{i+1}_name'].combine_first(pbp[f'add_player_{i+1}_name'])
110
+
111
+ pbp['event_goalie_name'] = pbp['event_goalie_id'].astype(str).replace(names_dict)
112
+
113
+ #Add hands
114
+ pbp['event_player_1_hand'] = pbp['event_player_1_id'].astype(str).str.replace('.0','').replace(roster_dict)
115
+ pbp['event_player_1_hand'] = pbp['event_player_1_hand'].replace('nan',np.nan)
116
+
117
+ return pbp
118
+
119
+ def prep_xG_data(data):
18
120
  #Prep data for xG training and calculation
121
+ data = fix_players(data)
19
122
 
20
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
21
- shot_types = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
22
- fenwick_events = ['missed-shot','shot-on-goal','goal']
23
-
24
123
  #Informal groupby
25
- data = pbp.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
124
+ data = data.sort_values(by=['season','game_id','period','seconds_elapsed','event_num'])
26
125
 
27
- #Add event time details - prevent leaking between games by setting value to zero when no time has occured in game
28
- data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_elapsed']-data['seconds_elapsed'].shift(1))
29
- data["event_length"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'].shift(-1))
126
+ #Recalibrate times series data with current data
127
+ data['seconds_since_last'] = data['seconds_elapsed'] - data['seconds_elapsed'].shift(1)
128
+ #Prevent leaking between games by setting value to zero when no time has occured in game
129
+ data["seconds_since_last"] = np.where(data['seconds_elapsed']==0,0,data['seconds_since_last'])
30
130
 
31
131
  #Create last event columns
32
132
  data["event_team_last"] = data['event_team_abbr'].shift(1)
33
133
  data["event_type_last"] = data['event_type'].shift(1)
34
- data["x_fixed_last"] = data['x_fixed'].shift(1)
35
- data["y_fixed_last"] = data['y_fixed'].shift(1)
36
- data["zone_code_last"] = data['zone_code'].shift(1)
134
+ data["x_adj_last"] = data['x_adj'].shift(1)
135
+ data["y_adj_last"] = data['y_adj'].shift(1)
136
+ data["zone_code_last"] = data['zone_code'].shift(1)
37
137
 
38
138
  data.sort_values(['season','game_id','period','seconds_elapsed','event_num'],inplace=True)
139
+
140
+ #Contextual Data (for score state minimize the capture to four goals)
39
141
  data['score_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_score']-data['home_score'],data['home_score']-data['away_score'])
142
+ data['score_state'] = np.where(data['score_state']>4,4,data['score_state'])
143
+ data['score_state'] = np.where(data['score_state']<-4,-4,data['score_state'])
144
+
40
145
  data['strength_diff'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_skaters']-data['home_skaters'],data['home_skaters']-data['away_skaters'])
41
146
  data['strength_state_venue'] = data['away_skaters'].astype(str)+'v'+data['home_skaters'].astype(str)
42
- data['fenwick_state'] = np.where(data['away_team_abbr']==data['event_team_abbr'],data['away_fenwick']-data['home_fenwick'],data['home_fenwick']-data['away_fenwick'])
43
- data['distance_from_last'] = np.sqrt((data['x_fixed'] - data['x_fixed_last'])**2 + (data['y_fixed'] - data['y_fixed_last'])**2)
147
+ data['distance_from_last'] = np.sqrt((data['x_adj'] - data['x_adj_last'])**2 + (data['y_adj'] - data['y_adj_last'])**2)
148
+ data['angle_from_last'] = np.degrees(np.arctan2(abs(data['y_adj'] - data['y_adj_last']), abs(89 - (data['x_adj']-data['x_adj_last']))))
44
149
 
45
- #Rush and rebounds are included and graded off of the speed of the event (an event cannot be a rush event unless it also occurs in the offensive zone)
46
- data['rush_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_fixed']>25)&(data['seconds_since_last']<5),5-data['seconds_since_last'],0)
47
- data['rebound_mod'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<3),3-data['seconds_since_last'],0)
150
+ #Event speeds
151
+ data['speed_from_last'] = np.where(data['seconds_since_last']==0,0,data['distance_from_last']/data['seconds_since_last'])
152
+ data['speed_of_angle_from_last'] = np.where(data['seconds_since_last']==0,0,data['angle_from_last']/data['seconds_since_last'])
153
+
154
+ #Rush and rebounds are labelled
155
+ data['rush'] = np.where((data['event_type'].isin(fenwick_events))&(data['zone_code_last'].isin(['N','D']))&(data['x_adj']>25)&(data['seconds_since_last']<=5),1,0)
156
+ data['rebound'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_type_last'].isin(fenwick_events))&(data['seconds_since_last']<=2),1,0)
48
157
 
49
158
  #Create boolean variables
50
159
  data["is_goal"]=(data['event_type']=='goal').astype(int)
@@ -59,219 +168,179 @@ def prep_xG_data(pbp):
59
168
 
60
169
  data['prior_faceoff'] = (data['event_type_last']=='faceoff').astype(int)
61
170
 
171
+ #Misc variables
172
+ data['empty_net'] = np.where((data['event_type'].isin(fenwick_events))&(data['event_goalie_id'].isna()),1,0)
173
+ data['regular'] = (data['season_type']==2).astype(int)
174
+ data['offwing'] = np.where(((data['y_adj']<0)&(data['event_player_1_hand']=='L'))|((data['y_adj']>=0)&(data['event_player_1_hand']=='R')),1,0)
175
+
62
176
  #Return: pbp data prepared to train and calculate the xG model
63
177
  return data
64
178
 
65
- def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/wsba_xg.joblib", train_runs = 20, cv_runs = 20):
179
+ def wsba_xG(pbp, hypertune = False, train = False, model_path = xg_model_path, train_runs = 20, cv_runs = 20):
66
180
  #Train and calculate the WSBA Expected Goals model
67
-
68
- target = "is_goal"
69
- continous = ['event_distance',
70
- 'event_angle',
71
- 'seconds_elapsed',
72
- 'period',
73
- 'x_fixed',
74
- 'y_fixed',
75
- 'x_fixed_last',
76
- 'y_fixed_last',
77
- 'distance_from_last',
78
- 'seconds_since_last',
79
- 'score_state',
80
- 'strength_diff',
81
- 'fenwick_state',
82
- 'rush_mod',
83
- 'rebound_mod']
84
- boolean = ['is_home',
85
- 'wrist',
86
- 'deflected',
87
- 'tip-in',
88
- 'slap',
89
- 'backhand',
90
- 'snap',
91
- 'wrap-around',
92
- 'poke',
93
- 'bat',
94
- 'cradle',
95
- 'between-legs',
96
- 'prior_shot-on-goal_same',
97
- 'prior_missed-shot_same',
98
- 'prior_blocked-shot_same',
99
- 'prior_giveaway_same',
100
- 'prior_takeaway_same',
101
- 'prior_hit_same',
102
- 'prior_shot-on-goal_opp',
103
- 'prior_missed-shot_opp',
104
- 'prior_blocked-shot_opp',
105
- 'prior_giveaway_opp',
106
- 'prior_takeaway_opp',
107
- 'prior_hit_opp',
108
- 'prior_faceoff']
109
-
181
+
182
+ #Add index for future merging
183
+ pbp['event_index'] = pbp.index
184
+
185
+ #Recalibrate coordinates
186
+ pbp = scraping.adjust_coords(pbp)
187
+
188
+ #Fix strengths
189
+ pbp['strength_state'] = np.where((pbp['season_type']==3)&(pbp['period']>4),(np.where(pbp['event_team_abbr']==pbp['away_team_abbr'],pbp['away_skaters'].astype(str)+"v"+pbp['home_skaters'].astype(str),pbp['home_skaters'].astype(str)+"v"+pbp['away_skaters'].astype(str))),pbp['strength_state'])
190
+
191
+ #Filter unwanted data:
192
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
193
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
194
+ (pbp['strength_state'].isin(strengths))&
195
+ (pbp['x'].notna())&
196
+ (pbp['y'].notna())]
197
+
110
198
  #Prep Data
111
- pbp_prep = prep_xG_data(pbp)
112
- #Filter unwanted date:
113
- #Shots must occur in specified events and strength states, occur before the shootout, and have valid coordinates
114
- events = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal']
115
- fenwick_events = ['missed-shot','shot-on-goal','goal']
116
- strengths = ['3v3',
117
- '3v4',
118
- '3v5',
119
- '4v3',
120
- '4v4',
121
- '4v5',
122
- '4v6',
123
- '5v3',
124
- '5v4',
125
- '5v5',
126
- '5v6',
127
- '6v4',
128
- '6v5']
129
-
130
- data = pbp_prep.loc[(pbp_prep['event_type'].isin(events))&
131
- (pbp_prep['strength_state'].isin(strengths))&
132
- (pbp_prep['period'] < 5)&
133
- (pbp_prep['x_fixed'].notna())&
134
- (pbp_prep['y_fixed'].notna())&
135
- ~((pbp_prep['x_fixed']==0)&(pbp_prep['y_fixed']==0)&(pbp_prep['x_fixed'].isin(fenwick_events))&(pbp_prep['event_distance']!=90))]
199
+ data = prep_xG_data(pbp_prep)
136
200
 
201
+ #Reduce to fenwick shots
202
+ data = data.loc[data['event_type'].isin(fenwick_events)]
203
+
137
204
  #Convert to sparse
138
- data_sparse = sp.csr_matrix(data[[target]+continous+boolean])
205
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
139
206
 
140
207
  #Target and Predictors
141
208
  is_goal_vect = data_sparse[:, 0].A
142
209
  predictors = data_sparse[:, 1:]
143
210
 
144
211
  #XGB DataModel
145
- xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect)
146
-
147
- if train == True:
148
- # Number of runs
149
- run_num = train_runs
150
-
151
- # DataFrames to store results
152
- best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
153
- best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
154
-
155
- # Loop
156
- for i in range(run_num):
157
- print(f"### LOOP: {i+1} ###")
158
-
159
- param = {
212
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
213
+
214
+ if train:
215
+ if hypertune:
216
+ # Number of runs
217
+ run_num = train_runs
218
+
219
+ # DataFrames to store results
220
+ best_df = pd.DataFrame(columns=["max_depth", "eta", "gamma", "subsample", "colsample_bytree", "min_child_weight", "max_delta_step"])
221
+ best_ll = pd.DataFrame(columns=["ll", "ll_rounds", "auc", "auc_rounds", "seed"])
222
+
223
+ # Loop
224
+ for i in range(run_num):
225
+ print(f"### LOOP: {i+1} ###")
226
+
227
+ param = {
228
+ "objective": "binary:logistic",
229
+ "eval_metric": ["logloss", "auc"],
230
+ "max_depth": 6,
231
+ "eta": np.random.uniform(0.06, 0.11),
232
+ "gamma": np.random.uniform(0.06, 0.12),
233
+ "subsample": np.random.uniform(0.76, 0.84),
234
+ "colsample_bytree": np.random.uniform(0.76, 0.8),
235
+ "min_child_weight": np.random.randint(5, 23),
236
+ "max_delta_step": np.random.randint(4, 9)
237
+ }
238
+
239
+ # Cross-validation
240
+ seed = np.random.randint(0, 10000)
241
+ np.random.seed(seed)
242
+
243
+ cv_results = xgb.cv(
244
+ params=param,
245
+ dtrain=xgb_matrix,
246
+ num_boost_round=1000,
247
+ nfold=5,
248
+ early_stopping_rounds=25,
249
+ metrics=["logloss", "auc"],
250
+ seed=seed
251
+ )
252
+
253
+ # Record results
254
+ best_df.loc[i] = param
255
+ best_ll.loc[i] = [
256
+ cv_results["test-logloss-mean"].min(),
257
+ cv_results["test-logloss-mean"].idxmin(),
258
+ cv_results["test-auc-mean"].max(),
259
+ cv_results["test-auc-mean"].idxmax(),
260
+ seed
261
+ ]
262
+
263
+ # Combine results
264
+ best_all = pd.concat([best_df, best_ll], axis=1).dropna()
265
+
266
+ # Arrange to get best run
267
+ best_all = best_all.sort_values(by="auc", ascending=False)
268
+
269
+ best_all.to_csv(test_path,index=False)
270
+
271
+ # Final parameters
272
+ param_7_EV = {
160
273
  "objective": "binary:logistic",
161
274
  "eval_metric": ["logloss", "auc"],
162
- "max_depth": 6,
163
- "eta": np.random.uniform(0.06, 0.11),
164
- "gamma": np.random.uniform(0.06, 0.12),
165
- "subsample": np.random.uniform(0.76, 0.84),
166
- "colsample_bytree": np.random.uniform(0.76, 0.8),
167
- "min_child_weight": np.random.randint(5, 23),
168
- "max_delta_step": np.random.randint(4, 9)
275
+ "gamma": best_all['gamma'].iloc[0],
276
+ "subsample": best_all['subsample'].iloc[0],
277
+ "max_depth": best_all['max_depth'].iloc[0],
278
+ "colsample_bytree": best_all['colsample_bytree'].iloc[0],
279
+ "min_child_weight": best_all['min_child_weight'].iloc[0],
280
+ "max_delta_step": best_all['max_delta_step'].iloc[0],
169
281
  }
170
-
171
- # Cross-validation
172
- seed = np.random.randint(0, 10000)
173
- np.random.seed(seed)
174
-
175
- cv_results = xgb.cv(
176
- params=param,
177
- dtrain=xgb_matrix,
178
- num_boost_round=1000,
179
- nfold=5,
180
- early_stopping_rounds=25,
181
- metrics=["logloss", "auc"],
182
- seed=seed
183
- )
184
-
185
- # Record results
186
- best_df.loc[i] = param
187
- best_ll.loc[i] = [
188
- cv_results["test-logloss-mean"].min(),
189
- cv_results["test-logloss-mean"].idxmin(),
190
- cv_results["test-auc-mean"].max(),
191
- cv_results["test-auc-mean"].idxmax(),
192
- seed
193
- ]
194
-
195
- # Combine results
196
- best_all = pd.concat([best_df, best_ll], axis=1).dropna()
197
-
198
- # Arrange to get best run
199
- best_all = best_all.sort_values(by="auc", ascending=False)
200
-
201
- if overwrite == True:
202
- best_all.to_csv("tools/xg_model/testing/xg_model_training_runs.csv",index=False)
203
- else:
204
- best_old = pd.read_csv("tools/xg_model/testing/xg_model_training_runs.csv")
205
- best_comb = pd.concat([best_old,best_all])
206
- best_comb.to_csv("tools/xg_model/testing/xg_model_training_runs.csv",index=False)
207
-
208
- # Final parameters
209
- param_7_EV = {
210
- "objective": "binary:logistic",
211
- "eval_metric": ["logloss", "auc"],
212
- "eta": 0.068,
213
- "gamma": 0.12,
214
- "subsample": 0.78,
215
- "max_depth": 6,
216
- "colsample_bytree": 0.76,
217
- "min_child_weight": 5,
218
- "max_delta_step": 5,
219
- }
220
-
221
- # CV rounds Loop
222
- run_num = cv_runs
223
- cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
224
-
225
- for i in range(run_num):
226
- print(f"### LOOP: {i+1} ###")
227
-
228
- seed = np.random.randint(0, 10000)
229
- np.random.seed(seed)
230
-
231
- cv_rounds = xgb.cv(
232
- params=param_7_EV,
233
- dtrain=xgb_matrix,
234
- num_boost_round=1000,
235
- nfold=5,
236
- early_stopping_rounds=25,
237
- metrics=["logloss", "auc"],
238
- seed=seed
239
- )
240
-
241
- # Record results
242
- cv_test.loc[i] = [
243
- cv_rounds["test-auc-mean"].idxmax(),
244
- cv_rounds["test-auc-mean"].max(),
245
- cv_rounds["test-logloss-mean"].idxmin(),
246
- cv_rounds["test-logloss-mean"].min(),
247
- seed
248
- ]
249
282
 
250
- # Clean results and sort to find the number of rounds to use and seed
251
- cv_final = cv_test.sort_values(by="AUC", ascending=False)
252
- if overwrite == True:
253
- cv_final.to_csv("tools/xg_model/testing/xg_model_cv_runs.csv",index=False)
283
+ # CV rounds Loop
284
+ run_num = cv_runs
285
+ cv_test = pd.DataFrame(columns=["AUC_rounds", "AUC", "LL_rounds", "LL", "seed"])
286
+
287
+ for i in range(run_num):
288
+ print(f"### LOOP: {i+1} ###")
289
+
290
+ seed = np.random.randint(0, 10000)
291
+ np.random.seed(seed)
292
+
293
+ cv_rounds = xgb.cv(
294
+ params=param_7_EV,
295
+ dtrain=xgb_matrix,
296
+ num_boost_round=1000,
297
+ nfold=5,
298
+ early_stopping_rounds=25,
299
+ metrics=["logloss", "auc"],
300
+ seed=seed
301
+ )
302
+
303
+ # Record results
304
+ cv_test.loc[i] = [
305
+ cv_rounds["test-auc-mean"].idxmax(),
306
+ cv_rounds["test-auc-mean"].max(),
307
+ cv_rounds["test-logloss-mean"].idxmin(),
308
+ cv_rounds["test-logloss-mean"].min(),
309
+ seed
310
+ ]
311
+
312
+ # Clean results and sort to find the number of rounds to use and seed
313
+ cv_final = cv_test.sort_values(by="AUC", ascending=False)
314
+ cv_final.to_csv(cv_path,index=False)
254
315
  else:
255
- cv_old = pd.read_csv("tools/xg_model/testing/xg_model_cv_runs.csv")
256
- cv_comb = pd.concat([cv_old,cv_final])
257
- cv_comb.to_csv("tools/xg_model/testing/xg_model_cv_runs.csv")
258
- cv_final.loc[len(cv_final)] = cv_test.mean()
316
+ # Load previous parameters
317
+ best_all = pd.read_csv(test_path)
318
+ cv_final = pd.read_csv(cv_path)
259
319
 
260
- # Train the final model
261
- np.random.seed(556)
262
-
263
- if overwrite == False:
264
- model = joblib.load(model_path)
265
- else:
266
- ""
320
+ print('Loaded hyperparameters...')
321
+ # Final parameters
322
+ param_7_EV = {
323
+ "objective": "binary:logistic",
324
+ "eval_metric": ["logloss", "auc"],
325
+ "gamma": best_all['gamma'].iloc[0],
326
+ "subsample": best_all['subsample'].iloc[0],
327
+ "max_depth": best_all['max_depth'].iloc[0],
328
+ "colsample_bytree": best_all['colsample_bytree'].iloc[0],
329
+ "min_child_weight": best_all['min_child_weight'].iloc[0],
330
+ "max_delta_step": best_all['max_delta_step'].iloc[0],
331
+ }
267
332
 
333
+ print('Training model...')
334
+ seed = int(cv_final['seed'].iloc[0])
335
+ np.random.seed(seed)
268
336
  model = xgb.train(
269
337
  params=param_7_EV,
270
338
  dtrain=xgb_matrix,
271
- num_boost_round=189,
272
- verbose_eval=2
339
+ num_boost_round=int(cv_final['AUC_rounds'].iloc[0]),
340
+ verbose_eval=2,
273
341
  )
274
-
342
+
343
+ #Save model
275
344
  joblib.dump(model,model_path)
276
345
 
277
346
  else:
@@ -279,123 +348,108 @@ def wsba_xG(pbp, train = False, overwrite = False, model_path = "tools/xg_model/
279
348
 
280
349
  #Predict goal
281
350
  data['xG'] = model.predict(xgb_matrix)
282
- data['xG'] = np.where(data['event_type'].isin(fenwick_events),data['xG'],np.nan)
283
-
284
- #Avoid merging errors
285
- merge_col = ['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
286
-
287
- for df in [pbp,data]:
288
- df = df.astype({
289
- 'game_id':'int',
290
- 'period':'int',
291
- 'seconds_elapsed':'int',
292
- 'event_type':'str',
293
- 'event_team_abbr':'str',
294
- 'event_player_1_id':'float'
295
- })
296
351
 
297
352
  #Drop previous xG if it exists
298
- try: pbp = pbp.drop(columns=['xG'])
299
- except KeyError:
300
- ''
353
+ pbp = pbp.drop(columns=['xG'],errors='ignore')
301
354
 
302
355
  #Merge
303
- data = data[merge_col+['xG']]
356
+ comm = list(data.columns.intersection(pbp.columns))
357
+ comm.remove('event_index')
358
+ data = data.drop(columns=comm)
304
359
  pbp_xg = pd.merge(pbp,data,how='left')
305
360
 
306
361
  return pbp_xg
307
362
 
308
- def moneypuck_xG(pbp,repo_path = "tools/xg_model/moneypuck/shots_2007-2023.zip"):
309
- #Given play-by-play, return itself with xG column sourced from MoneyPuck.com
363
+ def feature_importance(model):
364
+ print('Feature importance for WSBA xG Model...')
365
+ model = joblib.load(model)
366
+
367
+ fig, ax = plt.subplots(figsize=(10, 7))
368
+ xgb.plot_importance(model,
369
+ importance_type='weight',
370
+ max_num_features=30,
371
+ height=0.5,
372
+ grid=False,
373
+ show_values=False,
374
+ xlabel='Weight',
375
+ title='WSBA xG Feature Importance',
376
+ ax=ax
377
+ )
378
+ plt.savefig('tools/xg_model/metrics/feature_importance.png',bbox_inches='tight')
310
379
 
311
- #If file is already in the repository downloading is not necessary
312
- try:
313
- db = pd.read_parquet("tools/xg_model/moneypuck/shots/shots_2007-2023.parquet")
314
- except:
315
- url = 'https://peter-tanner.com/moneypuck/downloads/shots_2007-2023.zip'
380
+ def roc_auc_curve(pbp,model):
381
+ print('ROC-AUC Curve for WSBA xG Model...')
316
382
 
317
- response = rs.get(url)
383
+ #Recalibrate coordinates
384
+ pbp = scraping.adjust_coords(pbp)
318
385
 
319
- if response.status_code == 200:
320
- with open(repo_path, 'wb') as file:
321
- file.write(response.content)
322
- print('File downloaded successfully')
323
- else:
324
- print('Failed to download file')
386
+ #Filter unwanted data:
387
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
388
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
389
+ (pbp['strength_state'].isin(strengths))&
390
+ (pbp['period'] < 5)&
391
+ (pbp['x'].notna())&
392
+ (pbp['y'].notna())]
325
393
 
326
- with ZipFile(repo_path, 'r') as zObject:
327
- zObject.extractall(
328
- path="tools/xg_model/moneypuck/shots/")
329
-
330
- db = pd.read_csv("tools/xg_model/moneypuck/shots/shots_2007-2023.csv")
394
+ pbp = prep_xG_data(pbp_prep)
395
+ model = joblib.load(model)
396
+
397
+ data = pbp.loc[pbp['event_type'].isin(fenwick_events)]
331
398
 
332
- #Repeat process with active/most recent season
333
- #For the new/recent season, only scrape if the supplied pbp data contains the season
334
- if new in list(pbp['season'].astype(str).str[0:4]):
335
- url = f'https://peter-tanner.com/moneypuck/downloads/shots_{new}.zip'
336
- repo_path = f"tools/xg_model/moneypuck/shots_{new}.zip"
337
-
338
- response = rs.get(url)
339
-
340
- if response.status_code == 200:
341
- with open(repo_path, 'wb') as file:
342
- file.write(response.content)
343
- print('File downloaded successfully')
344
- else:
345
- print('Failed to download file')
346
-
347
- with ZipFile(repo_path, 'r') as zObject:
348
- zObject.extractall(
349
- path="tools/xg_model/moneypuck/shots/")
350
-
351
- new_season = pd.read_csv(f"tools/xg_model/moneypuck/shots/shots_{new}.csv")
352
- #Convert to parquet
353
- new_season.to_parquet(f"tools/xg_model/moneypuck/shots/shots_{new}.csv",index=False)
354
- else:
355
- new_season = pd.DataFrame()
356
- #Combine shots
357
- moneypuck = pd.concat([db,new_season])
358
-
359
- #Find game ids that occur in supplied pbp and filter moneypuck shots accordingly
360
- moneypuck['game_id'] = moneypuck['season'].astype(str)+"0"+moneypuck['game_id'].astype(str)
361
- moneypuck['event'] = moneypuck['event'].replace({
362
- "SHOT":"shot-on-goal",
363
- "MISS":"missed-shot",
364
- "BLOCK":"blocked-shot",
365
- "GOAL":"goal"
366
- })
399
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
400
+
401
+ is_goal_vect = data_sparse[:, 0].A
402
+ predictors = data_sparse[:, 1:]
367
403
 
368
- #Manual Team Rename
369
- moneypuck['teamCode'] = moneypuck['teamCode'].replace({
370
- "L.A":"LAK",
371
- "N.J":"NJD",
372
- "S.J":"SJS",
373
- "T.B":"TBL",
374
- })
375
- pbp['event_team_abbr'] = pbp['event_team_abbr'].replace({
376
- "L.A":"LAK",
377
- "N.J":"NJD",
378
- "S.J":"SJS",
379
- "T.B":"TBL",
380
- "PHX":'ARI'
381
- })
382
-
383
- #Managing oddities in datatypes
384
- moneypuck[['game_id','period','time']] = moneypuck[['game_id','period','time']].astype(int)
385
- pbp[['game_id','period','seconds_elapsed']] = pbp[['game_id','period','seconds_elapsed']].astype(int)
386
-
387
- #Modify and merge
388
- moneypuck = moneypuck[['game_id','period','time','event','teamCode','shooterPlayerId','xGoal']]
389
- comb = pd.merge(pbp,moneypuck
390
- ,left_on=['game_id','period','seconds_elapsed','event_type','event_team_abbr','event_player_1_id']
391
- ,right_on=['game_id','period','time','event','teamCode','shooterPlayerId']
392
- ,how='left')
404
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
405
+
406
+ pred = model.predict(xgb_matrix)
407
+ fpr, tpr, _ = roc_curve(is_goal_vect, pred)
408
+ roc_auc = auc(fpr,tpr)
393
409
 
394
- #Drop and rename
395
- pbp_xg = comb.drop(columns=['time', 'event', 'teamCode', 'shooterPlayerId']).rename(columns={'xGoal':'xG'})
410
+ plt.figure()
411
+ plt.plot(fpr,tpr,label=f"ROC (AUC = {roc_auc:.4f})")
412
+ plt.plot([0, 1], [0, 1], linestyle="--")
413
+ plt.title("WSBA xG ROC Curve")
414
+ plt.xlabel("False Positive Rate")
415
+ plt.ylabel("True Positive Rate")
416
+ plt.legend(loc="lower right")
417
+ plt.savefig('tools/xg_model/metrics/roc_auc_curve.png')
418
+
419
+ def reliability(pbp,model):
420
+ print('Reliability for WSBA xG Model...')
421
+
422
+ #Recalibrate coordinates
423
+ pbp = scraping.adjust_coords(pbp)
424
+
425
+ #Filter unwanted data:
426
+ #Shots must occur in specified events and strength states, occur in open play, and have valid coordinates
427
+ pbp_prep = pbp.loc[(pbp['event_type'].isin(events))&
428
+ (pbp['strength_state'].isin(strengths))&
429
+ (pbp['period'] < 5)&
430
+ (pbp['x'].notna())&
431
+ (pbp['y'].notna())]
432
+
433
+ pbp = prep_xG_data(pbp_prep)
434
+ model = joblib.load(model)
435
+
436
+ data = pbp.loc[pbp['event_type'].isin(fenwick_events)]
396
437
 
397
- if pbp_xg['xG'].isnull().all():
398
- print("No MoneyPuck xG values were found for this game...")
438
+ data_sparse = sp.csr_matrix(data[[target]+continuous+boolean])
399
439
 
400
- #Return: play-by-play with moneypuck xG column
401
- return pbp_xg
440
+ is_goal_vect = data_sparse[:, 0].A
441
+ predictors = data_sparse[:, 1:]
442
+
443
+ xgb_matrix = xgb.DMatrix(data=predictors,label=is_goal_vect,feature_names=(continuous+boolean))
444
+
445
+ pred = model.predict(xgb_matrix)
446
+ fop, mpv = calibration_curve(is_goal_vect, pred, strategy='uniform')
447
+
448
+ plt.figure()
449
+ plt.plot(mpv, fop, "s-", label="Model")
450
+ plt.plot([0, 1], [0, 1], linestyle="--", label="Perfect calibration")
451
+ plt.title("WSBA xG Reliability Diagram")
452
+ plt.xlabel("Predicted Probability (mean)")
453
+ plt.ylabel("Fraction of positives")
454
+ plt.legend(loc="best")
455
+ plt.savefig('tools/xg_model/metrics/reliability.png')