deep-rapm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. deep_rapm-0.1.0/LICENSE +21 -0
  2. deep_rapm-0.1.0/PKG-INFO +314 -0
  3. deep_rapm-0.1.0/README.md +270 -0
  4. deep_rapm-0.1.0/deep_rapm/__init__.py +5 -0
  5. deep_rapm-0.1.0/deep_rapm/data/__init__.py +28 -0
  6. deep_rapm-0.1.0/deep_rapm/data/box_scores.py +263 -0
  7. deep_rapm-0.1.0/deep_rapm/data/dataset.py +272 -0
  8. deep_rapm-0.1.0/deep_rapm/data/feature_lookup.py +293 -0
  9. deep_rapm-0.1.0/deep_rapm/data/game.py +300 -0
  10. deep_rapm-0.1.0/deep_rapm/data/players.py +517 -0
  11. deep_rapm-0.1.0/deep_rapm/data/season.py +416 -0
  12. deep_rapm-0.1.0/deep_rapm/data/stats.py +213 -0
  13. deep_rapm-0.1.0/deep_rapm/data/validate.py +306 -0
  14. deep_rapm-0.1.0/deep_rapm/model.py +487 -0
  15. deep_rapm-0.1.0/deep_rapm/model_cross_rapm.py +290 -0
  16. deep_rapm-0.1.0/deep_rapm/model_linear.py +148 -0
  17. deep_rapm-0.1.0/deep_rapm/rapm.py +415 -0
  18. deep_rapm-0.1.0/deep_rapm/scripts/__init__.py +0 -0
  19. deep_rapm-0.1.0/deep_rapm/scripts/collect.py +124 -0
  20. deep_rapm-0.1.0/deep_rapm/scripts/collect_box_scores.py +75 -0
  21. deep_rapm-0.1.0/deep_rapm/scripts/collect_players.py +146 -0
  22. deep_rapm-0.1.0/deep_rapm/scripts/solve_rapm.py +135 -0
  23. deep_rapm-0.1.0/deep_rapm/scripts/train_script.py +157 -0
  24. deep_rapm-0.1.0/deep_rapm/train.py +458 -0
  25. deep_rapm-0.1.0/deep_rapm.egg-info/PKG-INFO +314 -0
  26. deep_rapm-0.1.0/deep_rapm.egg-info/SOURCES.txt +34 -0
  27. deep_rapm-0.1.0/deep_rapm.egg-info/dependency_links.txt +1 -0
  28. deep_rapm-0.1.0/deep_rapm.egg-info/entry_points.txt +4 -0
  29. deep_rapm-0.1.0/deep_rapm.egg-info/requires.txt +15 -0
  30. deep_rapm-0.1.0/deep_rapm.egg-info/top_level.txt +1 -0
  31. deep_rapm-0.1.0/pyproject.toml +48 -0
  32. deep_rapm-0.1.0/setup.cfg +4 -0
  33. deep_rapm-0.1.0/tests/test_dataset.py +350 -0
  34. deep_rapm-0.1.0/tests/test_players.py +249 -0
  35. deep_rapm-0.1.0/tests/test_possessions.py +585 -0
  36. deep_rapm-0.1.0/tests/test_sample_games.py +326 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aaron Danielson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,314 @@
1
+ Metadata-Version: 2.4
2
+ Name: deep-rapm
3
+ Version: 0.1.0
4
+ Summary: Regularized Adjusted Plus-Minus (RAPM) for NBA possession data — analytical ridge regression with recency weighting and a cross-attention neural model for lineup interaction effects.
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 Aaron Danielson
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+
27
+ Requires-Python: >=3.10
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: pandas>=2.0
31
+ Requires-Dist: numpy>=1.24
32
+ Requires-Dist: pyarrow>=14.0
33
+ Requires-Dist: nba-api>=1.4
34
+ Requires-Dist: pbpstats>=1.0
35
+ Requires-Dist: tqdm>=4.66
36
+ Requires-Dist: scipy>=1.11
37
+ Provides-Extra: neural
38
+ Requires-Dist: torch>=2.1; extra == "neural"
39
+ Requires-Dist: scikit-learn>=1.3; extra == "neural"
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=7.4; extra == "dev"
42
+ Requires-Dist: black>=23.12; extra == "dev"
43
+ Dynamic: license-file
44
+
45
+ # Deep RAPM
46
+
47
+ Regularized Adjusted Plus-Minus (RAPM) for NBA possession data, with both
48
+ an analytical ridge regression solver and a Set Transformer neural model.
49
+
50
+ ---
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install -e .
56
+ ```
57
+
58
+ Requires Python ≥ 3.10. PyTorch ≥ 2.1 is needed only for the neural model (install with the `neural` extra: `pip install -e .[neural]`).
59
+
60
+ ---
61
+
62
+ ## Data pipeline
63
+
64
+ **Step 1 — collect possessions** (calls the NBA Stats API; takes ~10 min per season):
65
+
66
+ ```bash
67
+ collect-possessions --season 2022-23 --output-dir data/2022-23
68
+ ```
69
+
70
+ **Step 2 — build player vocab and position table**:
71
+
72
+ ```bash
73
+ collect-players --seasons 2018-19 2019-20 2020-21 2021-22 2022-23 2023-24
74
+ ```
75
+
76
+ Produces `data/player_vocab.parquet` and `data/players.parquet`.
77
+
78
+ ---
79
+
80
+ ## Analytical RAPM
81
+
82
+ Fits ridge regression via the normal equations — exact, fast (~1 s), and
83
+ noise-immune. This is the recommended starting point.
84
+
85
+ ### CLI
86
+
87
+ **Season mode** (uses pre-collected parquets):
88
+
89
+ ```bash
90
+ solve-rapm # default: 5 training seasons, alpha=2000
91
+ solve-rapm --alpha 1000 --top 20 # tune regularisation, show more players
92
+ solve-rapm --output-dir runs/rapm # custom output directory
93
+ solve-rapm --half-life 365 # down-weight older games (1-year half-life)
94
+ ```
95
+
96
+ **Date-range mode** (auto-fetches and caches games from the NBA API):
97
+
98
+ ```bash
99
+ # Fit on a specific date window; games cached to data/games/<game_id>.parquet
100
+ solve-rapm --from-date 2024-10-01 --to-date 2025-04-15
101
+
102
+ # With recency weighting — games from 180 days ago count half as much
103
+ solve-rapm --from-date 2023-10-01 --to-date 2025-04-15 --half-life 180
104
+ ```
105
+
106
+ Output: `checkpoints/rapm/rapm.parquet` and `rapm_summary.json`.
107
+
108
+ ### Python API
109
+
110
+ ```python
111
+ from pathlib import Path
112
+ from deep_rapm import fit_rapm, load_rapm
113
+
114
+ # Season mode — load from pre-collected parquets
115
+ results = fit_rapm(
116
+ data_dir=Path("data"),
117
+ seasons=["2018-19", "2019-20", "2020-21", "2021-22", "2022-23"],
118
+ player_vocab_path=Path("data/player_vocab.parquet"),
119
+ player_table_path=Path("data/players.parquet"),
120
+ alpha=2000,
121
+ output_dir=Path("checkpoints/rapm"),
122
+ )
123
+
124
+ # Date-range mode — auto-fetch from NBA API, cache per game
125
+ results = fit_rapm(
126
+ data_dir=Path("data"),
127
+ from_date="2024-10-01",
128
+ to_date="2025-04-15",
129
+ player_vocab_path=Path("data/player_vocab.parquet"),
130
+ player_table_path=Path("data/players.parquet"),
131
+ alpha=2000,
132
+ half_life_days=180, # optional: down-weight older games
133
+ output_dir=Path("checkpoints/rapm"),
134
+ )
135
+
136
+ # Load pre-computed results
137
+ results = load_rapm(Path("checkpoints/rapm"))
138
+
139
+ # Work with the DataFrame
140
+ qualified = results[results["qualified"]] # min 100 poss each role
141
+ print(qualified.nlargest(10, "rapm")[["player_name", "orapm", "drapm", "rapm"]])
142
+ ```
143
+
144
+ Result columns (all per 100 possessions):
145
+
146
+ | Column | Description |
147
+ |--------|-------------|
148
+ | `orapm` | Offensive RAPM — points added per 100 offensive possessions |
149
+ | `drapm` | Defensive RAPM — points prevented per 100 defensive possessions (positive = good defender) |
150
+ | `rapm` | Total RAPM = `orapm + drapm` |
151
+ | `n_off` / `n_def` | Offensive / defensive possession counts |
152
+ | `qualified` | `True` if ≥ 100 possessions in each role |
153
+
154
+ ### Model
155
+
156
+ Each possession $i$ is labelled by which players are on the court. Define the
157
+ indicator matrix $X \in \{0,1\}^{n \times 2p}$ where $p$ is the number of
158
+ players: the first $p$ columns are offense indicators and the last $p$ columns
159
+ are defense indicators. Each row has exactly 10 ones — one per player on the
160
+ court.
161
+
162
+ The predicted points scored on possession $i$ is
163
+
164
+ $$\hat{y}_i = \mu + \sum_{j \in \text{off}(i)} \alpha_j + \sum_{k \in \text{def}(i)} \delta_k = \mu + X_i \beta$$
165
+
166
+ where $\beta = [\alpha_1, \ldots, \alpha_p, \delta_1, \ldots, \delta_p]^\top$
167
+ collects the offensive and defensive parameters.
168
+
169
+ **Unweighted ridge.** Fit by minimising
170
+
171
+ $$\mathcal{L}(\beta) = \|y_c - X\beta\|^2 + \alpha \|\beta\|^2$$
172
+
173
+ where $y_c = y - \mu$ is mean-centred. Setting the gradient to zero gives the
174
+ normal equations
175
+
176
+ $$\bigl(X^\top X + \alpha I\bigr)\,\beta = X^\top y_c$$
177
+
178
+ which are solved exactly via Cholesky decomposition. The matrix $X^\top X$ is
179
+ $2p \times 2p$ (typically $3000 \times 3000$) and dense after forming, making
180
+ the direct solve fast and numerically stable.
181
+
182
+ **Recency-weighted ridge.** When a half-life $\tau$ (days) is specified, each
183
+ possession is down-weighted exponentially by its age:
184
+
185
+ $$w_i = 0.5^{\,d_i / \tau}$$
186
+
187
+ where $d_i$ is the number of days between possession $i$ and the most recent
188
+ possession in the dataset. The weighted objective becomes
189
+
190
+ $$\mathcal{L}_W(\beta) = \|W^{1/2}(y_c - X\beta)\|^2 + \alpha\|\beta\|^2$$
191
+
192
+ with $W = \operatorname{diag}(w)$. The weighted normal equations are
193
+
194
+ $$\bigl(X^\top W X + \alpha I\bigr)\,\beta = X^\top W y_c$$
195
+
196
+ $X^\top W X$ is computed efficiently as $(X \odot \sqrt{w})^\top (X \odot \sqrt{w})$, keeping $X$ sparse throughout.
197
+
198
+ **Intercept.** The intercept $\mu$ is the (weighted) mean points per
199
+ possession and is removed before solving, then added back at prediction time.
200
+ This decouples the mean from the ridge penalty.
201
+
202
+ **Reported values** (per 100 possessions):
203
+
204
+ $$\text{ORAPM}_j = 100 \cdot \alpha_j \qquad \text{DRAPM}_k = -100 \cdot \delta_k \qquad \text{RAPM} = \text{ORAPM} + \text{DRAPM}$$
205
+
206
+ The sign flip on DRAPM makes positive values mean *good defender* (a defender
207
+ who suppresses scoring has $\delta_k < 0$, so $\text{DRAPM}_k > 0$).
208
+
209
+ ### Sample output (2018-19 through 2022-23, alpha=2000)
210
+
211
+ ```
212
+ Player ORAPM DRAPM RAPM
213
+ Nikola Jokić +7.74 +1.94 +9.68
214
+ Joel Embiid +4.44 +4.56 +9.00
215
+ Stephen Curry +6.09 +2.29 +8.38
216
+ Giannis Antetokounmpo +4.33 +4.03 +8.35
217
+ LeBron James +6.01 +2.03 +8.04
218
+ Alex Caruso +0.96 +6.24 +7.20 ← elite defender
219
+ Rudy Gobert +0.20 +6.28 +6.48 ← elite defender
220
+ Damian Lillard          +7.44   -0.52  +6.92   ← scorer, defensive liability
221
+ ```
222
+
223
+ ---
224
+
225
+ ## Neural model (experimental)
226
+
227
+ Trains a Set Transformer on the possession data, warm-started from the
228
+ analytical RAPM solution.
229
+
230
+ ```bash
231
+ # Fit analytical RAPM first (required for warm-start)
232
+ solve-rapm --output-dir checkpoints/rapm
233
+
234
+ # Train neural model warm-started from RAPM
235
+ train-deep-rapm --model linear \
236
+ --rapm-dir checkpoints/rapm \
237
+ --output-dir checkpoints/neural
238
+
239
+ # Train full Set Transformer (DeepRAPM)
240
+ train-deep-rapm --model deep \
241
+ --rapm-dir checkpoints/rapm \
242
+ --output-dir checkpoints/deep
243
+ ```
244
+
245
+ Key hyperparameters:
246
+
247
+ | Flag | Default | Description |
248
+ |------|---------|-------------|
249
+ | `--model` | `deep` | `deep` (Set Transformer) or `linear` (ridge analog) |
250
+ | `--d` | 64 | Embedding dimension |
251
+ | `--num-layers` | 2 | Transformer layers |
252
+ | `--epochs` | 30 | Training epochs |
253
+ | `--embedding-reg` | 1e-4 | L2 penalty on player embeddings |
254
+ | `--rapm-dir` | None | Warm-start from analytical RAPM |
255
+
256
+ ---
257
+
258
+ ## Design notes
259
+
260
+ The general RAPM prediction can be written as
261
+
262
+ $$\hat{y}_i = f_{\theta}(\mathbf{o}, \mathbf{x}, \mathbf{g})$$
263
+
264
+ where $\mathbf{o}$ are the indices of the 5 offensive players, $\mathbf{x}$ are the indices of the 5 defensive players, $\mathbf{g}$ is a gamestate vector, and $\theta$ parameterises $f$. The target $y$ is an outcome of interest — points scored, possession length, assist probability, etc.
265
+
266
+ The analytical model uses a design matrix with $2p$ columns ($p$ = number of players) so each player is represented by two scalars: one offensive, one defensive. This does not capture player-player interactions or lineup synergies.
267
+
268
+ The neural models replace the two scalars with two latent vectors $\mathbf{u}_i^o, \mathbf{u}_i^d \in \mathbb{R}^d$ per player, enabling richer lineup representations.
269
+
270
+ ### What does NOT increase expressiveness
271
+
272
+ A natural first idea is to give each player a higher-dimensional embedding $\mathbf{u}_i^o \in \mathbb{R}^d$ and project to a scalar with a shared weight vector $\mathbf{w}_o \in \mathbb{R}^d$:
273
+
274
+ $$\hat{y} = \text{bias} + \sum_i \mathbf{w}_o^\top \mathbf{u}_i^o + \sum_j \mathbf{w}_d^\top \mathbf{u}_j^d$$
275
+
276
+ This looks richer, but it is not. The composition $\mathbf{w}_o^\top \mathbf{u}_i^o$ is a linear map $\mathbb{R}^d \to \mathbb{R}$, which spans the same function class as a single scalar $\alpha_i$ per player. Any assignment of real numbers to players can be represented with $d=1$. Under joint optimization the higher-dimensional vectors collapse to rank-1 — equivalent to standard RAPM, just overparameterized.
277
+
278
+ **The root constraint:** whenever the lineup score decomposes as a *sum of independent player terms*, the model is equivalent to RAPM regardless of the embedding dimension.
279
+
280
+ ### What does increase expressiveness
281
+
282
+ Expressiveness requires that the lineup encoding cannot be decomposed additively. The key tools:
283
+
284
+ 1. **Cross-player attention before aggregation.** Allow each player's representation to attend to teammates and opponents before being summed:
285
+
286
+ $$\mathbf{h}_i^o = \text{Attention}\!\left(\mathbf{u}_i^o;\, \{\mathbf{u}_1^o, \ldots, \mathbf{u}_5^o, \mathbf{u}_1^d, \ldots, \mathbf{u}_5^d\}\right)$$
287
+
288
+ $$\hat{y} = \text{MLP}\!\left(\textstyle\sum_i \mathbf{h}_i^o,\; \sum_j \mathbf{h}_j^d\right)$$
289
+
290
+ After attention, $\mathbf{h}_i^o$ encodes matchup and lineup context — the final sum is no longer a sum of pre-fixed scalars.
291
+
292
+ 2. **Nonlinear pooling (Deep Sets).** $\rho\!\left(\sum_i \varphi(\mathbf{u}_i^o)\right)$ where $\varphi$ and $\rho$ are nonlinear MLPs. By the universal approximation theorem for set functions, this can represent any permutation-invariant function of the lineup.
293
+
294
+ 3. **Bilinear cross-team interactions.** $\sum_i \sum_j (\mathbf{u}_i^o)^\top M\, \mathbf{u}_j^d$ captures matchup-level terms with $O(d^2)$ parameters (the matrix $M$) at the cost of $O(25d^2)$ operations per possession.
295
+
296
+ ### CrossRAPM architecture (implemented)
297
+
298
+ Each player $i$ is enriched with a feature projection before cross-attention:
299
+
300
+ $$E_i^o = \mathbf{u}_i^o + W_o f_i, \qquad E_j^d = \mathbf{u}_j^d + W_d f_j$$
301
+
302
+ where $f_i \in \mathbb{R}^{14}$ is a per-player feature vector (one-hot position + EWMA rate stats). Offense and defense then attend to each other:
303
+
304
+ $$H^o = \text{LayerNorm}\!\left(E^o + \text{CrossAttn}(Q{=}E^o,\, K{=}E^d,\, V{=}E^d)\right)$$
305
+
306
+ $$H^d = \text{LayerNorm}\!\left(E^d + \text{CrossAttn}(Q{=}E^d,\, K{=}E^o,\, V{=}E^o)\right)$$
307
+
308
+ The attention kernel is the standard scaled dot-product:
309
+
310
+ $$\text{Attn}(Q, K, V) = \text{softmax}\!\left(\frac{QK^\top}{\sqrt{d}}\right)V$$
311
+
312
+ The lineup representations are mean-pooled and concatenated with the gamestate for prediction:
313
+
314
+ $$\hat{y} = \mathbf{w}^\top \bigl[\bar{H}^o \;\|\; \bar{H}^d \;\|\; \mathbf{g}\bigr] + b$$
@@ -0,0 +1,270 @@
1
+ # Deep RAPM
2
+
3
+ Regularized Adjusted Plus-Minus (RAPM) for NBA possession data, with both
4
+ an analytical ridge regression solver and a Set Transformer neural model.
5
+
6
+ ---
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ pip install -e .
12
+ ```
13
+
14
+ Requires Python ≥ 3.10. PyTorch ≥ 2.1 is needed only for the neural model (install with the `neural` extra: `pip install -e .[neural]`).
15
+
16
+ ---
17
+
18
+ ## Data pipeline
19
+
20
+ **Step 1 — collect possessions** (calls the NBA Stats API; takes ~10 min per season):
21
+
22
+ ```bash
23
+ collect-possessions --season 2022-23 --output-dir data/2022-23
24
+ ```
25
+
26
+ **Step 2 — build player vocab and position table**:
27
+
28
+ ```bash
29
+ collect-players --seasons 2018-19 2019-20 2020-21 2021-22 2022-23 2023-24
30
+ ```
31
+
32
+ Produces `data/player_vocab.parquet` and `data/players.parquet`.
33
+
34
+ ---
35
+
36
+ ## Analytical RAPM
37
+
38
+ Fits ridge regression via the normal equations — exact, fast (~1 s), and
39
+ noise-immune. This is the recommended starting point.
40
+
41
+ ### CLI
42
+
43
+ **Season mode** (uses pre-collected parquets):
44
+
45
+ ```bash
46
+ solve-rapm # default: 5 training seasons, alpha=2000
47
+ solve-rapm --alpha 1000 --top 20 # tune regularisation, show more players
48
+ solve-rapm --output-dir runs/rapm # custom output directory
49
+ solve-rapm --half-life 365 # down-weight older games (1-year half-life)
50
+ ```
51
+
52
+ **Date-range mode** (auto-fetches and caches games from the NBA API):
53
+
54
+ ```bash
55
+ # Fit on a specific date window; games cached to data/games/<game_id>.parquet
56
+ solve-rapm --from-date 2024-10-01 --to-date 2025-04-15
57
+
58
+ # With recency weighting — games from 180 days ago count half as much
59
+ solve-rapm --from-date 2023-10-01 --to-date 2025-04-15 --half-life 180
60
+ ```
61
+
62
+ Output: `checkpoints/rapm/rapm.parquet` and `rapm_summary.json`.
63
+
64
+ ### Python API
65
+
66
+ ```python
67
+ from pathlib import Path
68
+ from deep_rapm import fit_rapm, load_rapm
69
+
70
+ # Season mode — load from pre-collected parquets
71
+ results = fit_rapm(
72
+ data_dir=Path("data"),
73
+ seasons=["2018-19", "2019-20", "2020-21", "2021-22", "2022-23"],
74
+ player_vocab_path=Path("data/player_vocab.parquet"),
75
+ player_table_path=Path("data/players.parquet"),
76
+ alpha=2000,
77
+ output_dir=Path("checkpoints/rapm"),
78
+ )
79
+
80
+ # Date-range mode — auto-fetch from NBA API, cache per game
81
+ results = fit_rapm(
82
+ data_dir=Path("data"),
83
+ from_date="2024-10-01",
84
+ to_date="2025-04-15",
85
+ player_vocab_path=Path("data/player_vocab.parquet"),
86
+ player_table_path=Path("data/players.parquet"),
87
+ alpha=2000,
88
+ half_life_days=180, # optional: down-weight older games
89
+ output_dir=Path("checkpoints/rapm"),
90
+ )
91
+
92
+ # Load pre-computed results
93
+ results = load_rapm(Path("checkpoints/rapm"))
94
+
95
+ # Work with the DataFrame
96
+ qualified = results[results["qualified"]] # min 100 poss each role
97
+ print(qualified.nlargest(10, "rapm")[["player_name", "orapm", "drapm", "rapm"]])
98
+ ```
99
+
100
+ Result columns (all per 100 possessions):
101
+
102
+ | Column | Description |
103
+ |--------|-------------|
104
+ | `orapm` | Offensive RAPM — points added per 100 offensive possessions |
105
+ | `drapm` | Defensive RAPM — points prevented per 100 defensive possessions (positive = good defender) |
106
+ | `rapm` | Total RAPM = `orapm + drapm` |
107
+ | `n_off` / `n_def` | Offensive / defensive possession counts |
108
+ | `qualified` | `True` if ≥ 100 possessions in each role |
109
+
110
+ ### Model
111
+
112
+ Each possession $i$ is labelled by which players are on the court. Define the
113
+ indicator matrix $X \in \{0,1\}^{n \times 2p}$ where $p$ is the number of
114
+ players: the first $p$ columns are offense indicators and the last $p$ columns
115
+ are defense indicators. Each row has exactly 10 ones — one per player on the
116
+ court.
117
+
118
+ The predicted points scored on possession $i$ is
119
+
120
+ $$\hat{y}_i = \mu + \sum_{j \in \text{off}(i)} \alpha_j + \sum_{k \in \text{def}(i)} \delta_k = \mu + X_i \beta$$
121
+
122
+ where $\beta = [\alpha_1, \ldots, \alpha_p, \delta_1, \ldots, \delta_p]^\top$
123
+ collects the offensive and defensive parameters.
124
+
125
+ **Unweighted ridge.** Fit by minimising
126
+
127
+ $$\mathcal{L}(\beta) = \|y_c - X\beta\|^2 + \alpha \|\beta\|^2$$
128
+
129
+ where $y_c = y - \mu$ is mean-centred. Setting the gradient to zero gives the
130
+ normal equations
131
+
132
+ $$\bigl(X^\top X + \alpha I\bigr)\,\beta = X^\top y_c$$
133
+
134
+ which are solved exactly via Cholesky decomposition. The matrix $X^\top X$ is
135
+ $2p \times 2p$ (typically $3000 \times 3000$) and dense after forming, making
136
+ the direct solve fast and numerically stable.
137
+
138
+ **Recency-weighted ridge.** When a half-life $\tau$ (days) is specified, each
139
+ possession is down-weighted exponentially by its age:
140
+
141
+ $$w_i = 0.5^{\,d_i / \tau}$$
142
+
143
+ where $d_i$ is the number of days between possession $i$ and the most recent
144
+ possession in the dataset. The weighted objective becomes
145
+
146
+ $$\mathcal{L}_W(\beta) = \|W^{1/2}(y_c - X\beta)\|^2 + \alpha\|\beta\|^2$$
147
+
148
+ with $W = \operatorname{diag}(w)$. The weighted normal equations are
149
+
150
+ $$\bigl(X^\top W X + \alpha I\bigr)\,\beta = X^\top W y_c$$
151
+
152
+ $X^\top W X$ is computed efficiently as $(X \odot \sqrt{w})^\top (X \odot \sqrt{w})$, keeping $X$ sparse throughout.
153
+
154
+ **Intercept.** The intercept $\mu$ is the (weighted) mean points per
155
+ possession and is removed before solving, then added back at prediction time.
156
+ This decouples the mean from the ridge penalty.
157
+
158
+ **Reported values** (per 100 possessions):
159
+
160
+ $$\text{ORAPM}_j = 100 \cdot \alpha_j \qquad \text{DRAPM}_k = -100 \cdot \delta_k \qquad \text{RAPM} = \text{ORAPM} + \text{DRAPM}$$
161
+
162
+ The sign flip on DRAPM makes positive values mean *good defender* (a defender
163
+ who suppresses scoring has $\delta_k < 0$, so $\text{DRAPM}_k > 0$).
164
+
165
+ ### Sample output (2018-19 through 2022-23, alpha=2000)
166
+
167
+ ```
168
+ Player ORAPM DRAPM RAPM
169
+ Nikola Jokić +7.74 +1.94 +9.68
170
+ Joel Embiid +4.44 +4.56 +9.00
171
+ Stephen Curry +6.09 +2.29 +8.38
172
+ Giannis Antetokounmpo +4.33 +4.03 +8.35
173
+ LeBron James +6.01 +2.03 +8.04
174
+ Alex Caruso +0.96 +6.24 +7.20 ← elite defender
175
+ Rudy Gobert +0.20 +6.28 +6.48 ← elite defender
176
+ Damian Lillard          +7.44   -0.52  +6.92   ← scorer, defensive liability
177
+ ```
178
+
179
+ ---
180
+
181
+ ## Neural model (experimental)
182
+
183
+ Trains a Set Transformer on the possession data, warm-started from the
184
+ analytical RAPM solution.
185
+
186
+ ```bash
187
+ # Fit analytical RAPM first (required for warm-start)
188
+ solve-rapm --output-dir checkpoints/rapm
189
+
190
+ # Train neural model warm-started from RAPM
191
+ train-deep-rapm --model linear \
192
+ --rapm-dir checkpoints/rapm \
193
+ --output-dir checkpoints/neural
194
+
195
+ # Train full Set Transformer (DeepRAPM)
196
+ train-deep-rapm --model deep \
197
+ --rapm-dir checkpoints/rapm \
198
+ --output-dir checkpoints/deep
199
+ ```
200
+
201
+ Key hyperparameters:
202
+
203
+ | Flag | Default | Description |
204
+ |------|---------|-------------|
205
+ | `--model` | `deep` | `deep` (Set Transformer) or `linear` (ridge analog) |
206
+ | `--d` | 64 | Embedding dimension |
207
+ | `--num-layers` | 2 | Transformer layers |
208
+ | `--epochs` | 30 | Training epochs |
209
+ | `--embedding-reg` | 1e-4 | L2 penalty on player embeddings |
210
+ | `--rapm-dir` | None | Warm-start from analytical RAPM |
211
+
212
+ ---
213
+
214
+ ## Design notes
215
+
216
+ The general RAPM prediction can be written as
217
+
218
+ $$\hat{y}_i = f_{\theta}(\mathbf{o}, \mathbf{x}, \mathbf{g})$$
219
+
220
+ where $\mathbf{o}$ are the indices of the 5 offensive players, $\mathbf{x}$ are the indices of the 5 defensive players, $\mathbf{g}$ is a gamestate vector, and $\theta$ parameterises $f$. The target $y$ is an outcome of interest — points scored, possession length, assist probability, etc.
221
+
222
+ The analytical model uses a design matrix with $2p$ columns ($p$ = number of players) so each player is represented by two scalars: one offensive, one defensive. This does not capture player-player interactions or lineup synergies.
223
+
224
+ The neural models replace the two scalars with two latent vectors $\mathbf{u}_i^o, \mathbf{u}_i^d \in \mathbb{R}^d$ per player, enabling richer lineup representations.
225
+
226
+ ### What does NOT increase expressiveness
227
+
228
+ A natural first idea is to give each player a higher-dimensional embedding $\mathbf{u}_i^o \in \mathbb{R}^d$ and project to a scalar with a shared weight vector $\mathbf{w}_o \in \mathbb{R}^d$:
229
+
230
+ $$\hat{y} = \text{bias} + \sum_i \mathbf{w}_o^\top \mathbf{u}_i^o + \sum_j \mathbf{w}_d^\top \mathbf{u}_j^d$$
231
+
232
+ This looks richer, but it is not. The composition $\mathbf{w}_o^\top \mathbf{u}_i^o$ is a linear map $\mathbb{R}^d \to \mathbb{R}$, which spans the same function class as a single scalar $\alpha_i$ per player. Any assignment of real numbers to players can be represented with $d=1$. Under joint optimization the higher-dimensional vectors collapse to rank-1 — equivalent to standard RAPM, just overparameterized.
233
+
234
+ **The root constraint:** whenever the lineup score decomposes as a *sum of independent player terms*, the model is equivalent to RAPM regardless of the embedding dimension.
235
+
236
+ ### What does increase expressiveness
237
+
238
+ Expressiveness requires that the lineup encoding cannot be decomposed additively. The key tools:
239
+
240
+ 1. **Cross-player attention before aggregation.** Allow each player's representation to attend to teammates and opponents before being summed:
241
+
242
+ $$\mathbf{h}_i^o = \text{Attention}\!\left(\mathbf{u}_i^o;\, \{\mathbf{u}_1^o, \ldots, \mathbf{u}_5^o, \mathbf{u}_1^d, \ldots, \mathbf{u}_5^d\}\right)$$
243
+
244
+ $$\hat{y} = \text{MLP}\!\left(\textstyle\sum_i \mathbf{h}_i^o,\; \sum_j \mathbf{h}_j^d\right)$$
245
+
246
+ After attention, $\mathbf{h}_i^o$ encodes matchup and lineup context — the final sum is no longer a sum of pre-fixed scalars.
247
+
248
+ 2. **Nonlinear pooling (Deep Sets).** $\rho\!\left(\sum_i \varphi(\mathbf{u}_i^o)\right)$ where $\varphi$ and $\rho$ are nonlinear MLPs. By the universal approximation theorem for set functions, this can represent any permutation-invariant function of the lineup.
249
+
250
+ 3. **Bilinear cross-team interactions.** $\sum_i \sum_j (\mathbf{u}_i^o)^\top M\, \mathbf{u}_j^d$ captures matchup-level terms with $O(d^2)$ parameters (the matrix $M$) at the cost of $O(25d^2)$ operations per possession.
251
+
252
+ ### CrossRAPM architecture (implemented)
253
+
254
+ Each player $i$ is enriched with a feature projection before cross-attention:
255
+
256
+ $$E_i^o = \mathbf{u}_i^o + W_o f_i, \qquad E_j^d = \mathbf{u}_j^d + W_d f_j$$
257
+
258
+ where $f_i \in \mathbb{R}^{14}$ is a per-player feature vector (one-hot position + EWMA rate stats). Offense and defense then attend to each other:
259
+
260
+ $$H^o = \text{LayerNorm}\!\left(E^o + \text{CrossAttn}(Q{=}E^o,\, K{=}E^d,\, V{=}E^d)\right)$$
261
+
262
+ $$H^d = \text{LayerNorm}\!\left(E^d + \text{CrossAttn}(Q{=}E^d,\, K{=}E^o,\, V{=}E^o)\right)$$
263
+
264
+ The attention kernel is the standard scaled dot-product:
265
+
266
+ $$\text{Attn}(Q, K, V) = \text{softmax}\!\left(\frac{QK^\top}{\sqrt{d}}\right)V$$
267
+
268
+ The lineup representations are mean-pooled and concatenated with the gamestate for prediction:
269
+
270
+ $$\hat{y} = \mathbf{w}^\top \bigl[\bar{H}^o \;\|\; \bar{H}^d \;\|\; \mathbf{g}\bigr] + b$$
@@ -0,0 +1,5 @@
1
+ """Deep RAPM: Regularized Adjusted Plus-Minus for NBA possession data."""
2
+
3
+ from .rapm import fit_rapm, load_rapm
4
+
5
+ __all__ = ["fit_rapm", "load_rapm"]
@@ -0,0 +1,28 @@
1
+ from .dataset import PossessionDataset, make_possession_splits
2
+ from .game import get_game_possessions
3
+ from .players import (
4
+ UNKNOWN_PLAYER_IDX,
5
+ build_player_table,
6
+ build_player_vocab,
7
+ load_player_table,
8
+ load_player_vocab,
9
+ make_player_index_lookup,
10
+ make_position_lookup,
11
+ supplement_player_table,
12
+ )
13
+ from .season import collect_season
14
+
15
+ __all__ = [
16
+ "PossessionDataset",
17
+ "make_possession_splits",
18
+ "get_game_possessions",
19
+ "collect_season",
20
+ "UNKNOWN_PLAYER_IDX",
21
+ "build_player_table",
22
+ "build_player_vocab",
23
+ "load_player_table",
24
+ "load_player_vocab",
25
+ "make_player_index_lookup",
26
+ "make_position_lookup",
27
+ "supplement_player_table",
28
+ ]