spforge-0.8.4-py3-none-any.whl → spforge-0.8.18-py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Potentially problematic release: this version of spforge might be problematic.
- examples/lol/pipeline_transformer_example.py +69 -86
- examples/nba/cross_validation_example.py +4 -11
- examples/nba/feature_engineering_example.py +33 -15
- examples/nba/game_winner_example.py +24 -14
- examples/nba/predictor_transformers_example.py +29 -16
- spforge/__init__.py +1 -0
- spforge/autopipeline.py +169 -5
- spforge/estimator/_group_by_estimator.py +11 -3
- spforge/features_generator_pipeline.py +8 -4
- spforge/hyperparameter_tuning/__init__.py +12 -0
- spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
- spforge/hyperparameter_tuning/_tuner.py +192 -0
- spforge/performance_transformers/_performance_manager.py +2 -4
- spforge/ratings/__init__.py +4 -0
- spforge/ratings/_player_rating.py +142 -28
- spforge/ratings/league_start_rating_optimizer.py +201 -0
- spforge/ratings/start_rating_generator.py +1 -1
- spforge/ratings/team_start_rating_generator.py +1 -1
- spforge/ratings/utils.py +16 -6
- spforge/scorer/_score.py +42 -11
- spforge/transformers/_other_transformer.py +38 -8
- {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA +12 -19
- {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/RECORD +37 -31
- {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
- tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
- tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
- tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
- tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
- tests/performance_transformers/test_performance_manager.py +15 -0
- tests/ratings/test_player_rating_generator.py +154 -0
- tests/ratings/test_player_rating_no_mutation.py +214 -0
- tests/ratings/test_utils_scaled_weights.py +136 -0
- tests/scorer/test_score.py +232 -0
- tests/test_autopipeline.py +336 -6
- tests/test_feature_generator_pipeline.py +43 -0
- {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
spforge/ratings/league_start_rating_optimizer.py ADDED
@@ -0,0 +1,201 @@
+from __future__ import annotations
+
+import copy
+from dataclasses import dataclass
+
+import narwhals.stable.v2 as nw
+import polars as pl
+from narwhals.stable.v2.typing import IntoFrameT
+
+
+DEFAULT_START_RATING = 1000.0
+
+
+@dataclass
+class LeagueStartRatingOptimizationResult:
+    league_ratings: dict[str, float]
+    iteration_errors: list[dict[str, float]]
+
+
+class LeagueStartRatingOptimizer:
+    def __init__(
+        self,
+        rating_generator: object,
+        n_iterations: int = 3,
+        learning_rate: float = 0.2,
+        min_cross_region_rows: int = 10,
+        rating_scale: float | None = None,
+    ):
+        self.rating_generator = rating_generator
+        self.n_iterations = int(n_iterations)
+        self.learning_rate = float(learning_rate)
+        self.min_cross_region_rows = int(min_cross_region_rows)
+        self.rating_scale = rating_scale
+
+    @nw.narwhalify
+    def optimize(self, df: IntoFrameT) -> LeagueStartRatingOptimizationResult:
+        pl_df = df.to_native() if df.implementation.is_polars() else df.to_polars()
+        league_ratings = self._get_league_ratings(self.rating_generator)
+        iteration_errors: list[dict[str, float]] = []
+
+        for _ in range(self.n_iterations):
+            gen = copy.deepcopy(self.rating_generator)
+            self._set_league_ratings(gen, league_ratings)
+            self._ensure_prediction_columns(gen)
+
+            pred_df = gen.fit_transform(pl_df)
+            error_df = self._cross_region_error_df(pl_df, pred_df, gen)
+            if error_df.is_empty():
+                break
+
+            error_summary = (
+                error_df.group_by(self._league_column_name(gen))
+                .agg(
+                    pl.col("error").mean().alias("mean_error"),
+                    pl.len().alias("row_count"),
+                )
+                .to_dicts()
+            )
+            league_key = self._league_column_name(gen)
+            iteration_errors.append({r[league_key]: r["mean_error"] for r in error_summary})
+            league_ratings = self._apply_error_updates(
+                gen, league_ratings, error_summary, league_key
+            )
+
+        self._set_league_ratings(self.rating_generator, league_ratings)
+        return LeagueStartRatingOptimizationResult(
+            league_ratings=league_ratings, iteration_errors=iteration_errors
+        )
+
+    def _cross_region_error_df(
+        self,
+        df: pl.DataFrame,
+        pred_df: pl.DataFrame,
+        rating_generator: object,
+    ) -> pl.DataFrame:
+        column_names = getattr(rating_generator, "column_names", None)
+        if column_names is None:
+            raise ValueError("rating_generator must define column_names")
+
+        match_id = getattr(column_names, "match_id", None)
+        team_id = getattr(column_names, "team_id", None)
+        league_col = getattr(column_names, "league", None)
+        if not match_id or not team_id or not league_col:
+            raise ValueError("column_names must include match_id, team_id, and league")
+
+        pred_col, entity_cols, perf_col = self._prediction_spec(rating_generator)
+        base_cols = [match_id, team_id, league_col, perf_col]
+        for col in base_cols + entity_cols:
+            if col not in df.columns:
+                raise ValueError(f"{col} missing from input dataframe")
+
+        join_cols = [match_id, team_id] + entity_cols
+        joined = df.select(base_cols + entity_cols).join(
+            pred_df.select(join_cols + [pred_col]),
+            on=join_cols,
+            how="inner",
+        )
+        opp_league = self._opponent_mode_league(joined, match_id, team_id, league_col)
+        enriched = joined.join(opp_league, on=[match_id, team_id], how="left").with_columns(
+            (pl.col(perf_col) - pl.col(pred_col)).alias("error")
+        )
+        return enriched.filter(pl.col("opp_mode_league").is_not_null()).filter(
+            pl.col(league_col) != pl.col("opp_mode_league")
+        )
+
+    def _opponent_mode_league(
+        self, df: pl.DataFrame, match_id: str, team_id: str, league_col: str
+    ) -> pl.DataFrame:
+        team_mode = (
+            df.group_by([match_id, team_id, league_col])
+            .agg(pl.len().alias("__count"))
+            .sort(["__count"], descending=True)
+            .unique([match_id, team_id])
+            .select([match_id, team_id, league_col])
+            .rename({league_col: "team_mode_league"})
+        )
+        opponents = (
+            team_mode.join(team_mode, on=match_id, suffix="_opp")
+            .filter(pl.col(team_id) != pl.col(f"{team_id}_opp"))
+            .group_by([match_id, team_id, "team_mode_league_opp"])
+            .agg(pl.len().alias("__count"))
+            .sort(["__count"], descending=True)
+            .unique([match_id, team_id])
+            .select([match_id, team_id, "team_mode_league_opp"])
+            .rename({"team_mode_league_opp": "opp_mode_league"})
+        )
+        return opponents
+
+    def _prediction_spec(self, rating_generator: object) -> tuple[str, list[str], str]:
+        perf_col = getattr(rating_generator, "performance_column", None)
+        if not perf_col:
+            raise ValueError("rating_generator must define performance_column")
+        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+            pred_col = rating_generator.PLAYER_PRED_PERF_COL
+            column_names = rating_generator.column_names
+            player_id = getattr(column_names, "player_id", None)
+            if not player_id:
+                raise ValueError("column_names must include player_id for player ratings")
+            return pred_col, [player_id], perf_col
+        if hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+            pred_col = rating_generator.TEAM_PRED_OFF_PERF_COL
+            return pred_col, [], perf_col
+        raise ValueError("rating_generator must expose a predicted performance column")
+
+    def _ensure_prediction_columns(self, rating_generator: object) -> None:
+        pred_cols: list[str] = []
+        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+            pred_cols.append(rating_generator.PLAYER_PRED_PERF_COL)
+        elif hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+            pred_cols.append(rating_generator.TEAM_PRED_OFF_PERF_COL)
+
+        if not pred_cols:
+            return
+
+        existing = list(getattr(rating_generator, "non_predictor_features_out", []) or [])
+        for col in pred_cols:
+            if col not in existing:
+                existing.append(col)
+        rating_generator.non_predictor_features_out = existing
+
+    def _apply_error_updates(
+        self,
+        rating_generator: object,
+        league_ratings: dict[str, float],
+        error_summary: list[dict[str, float]],
+        league_key: str,
+    ) -> dict[str, float]:
+        scale = self.rating_scale
+        if scale is None:
+            scale = getattr(rating_generator, "rating_change_multiplier_offense", 1.0)
+
+        updated = dict(league_ratings)
+        for row in error_summary:
+            if row["row_count"] < self.min_cross_region_rows:
+                continue
+            league = row[league_key]
+            mean_error = row["mean_error"]
+            base_rating = updated.get(league, DEFAULT_START_RATING)
+            updated[league] = base_rating + self.learning_rate * mean_error * scale
+        return updated
+
+    def _league_column_name(self, rating_generator: object) -> str:
+        column_names = getattr(rating_generator, "column_names", None)
+        league_col = getattr(column_names, "league", None)
+        if not league_col:
+            raise ValueError("column_names must include league for league adjustments")
+        return league_col
+
+    def _get_league_ratings(self, rating_generator: object) -> dict[str, float]:
+        start_gen = getattr(rating_generator, "start_rating_generator", None)
+        if start_gen is None or not hasattr(start_gen, "league_ratings"):
+            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+        return dict(start_gen.league_ratings)
+
+    def _set_league_ratings(self, rating_generator: object, league_ratings: dict[str, float]) -> None:
+        start_gen = getattr(rating_generator, "start_rating_generator", None)
+        if start_gen is None or not hasattr(start_gen, "league_ratings"):
+            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+        start_gen.league_ratings = dict(league_ratings)
+        if hasattr(rating_generator, "start_league_ratings"):
+            rating_generator.start_league_ratings = dict(league_ratings)
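The heart of the new optimizer is the update rule in `_apply_error_updates`: leagues whose entities systematically over-perform their predictions in cross-league matches get their start rating nudged upward by `learning_rate * mean_error * scale`. A self-contained restatement of that rule; the league names and numbers below are illustrative, not from the package:

```python
DEFAULT_START_RATING = 1000.0

def apply_error_updates(league_ratings, error_summary, learning_rate=0.2, scale=50.0, min_rows=10):
    # Mirror of LeagueStartRatingOptimizer._apply_error_updates: skip leagues
    # with too few cross-region rows, then move each remaining league's start
    # rating in the direction of its mean prediction error.
    updated = dict(league_ratings)
    for row in error_summary:
        if row["row_count"] < min_rows:
            continue
        base = updated.get(row["league"], DEFAULT_START_RATING)
        updated[row["league"]] = base + learning_rate * row["mean_error"] * scale
    return updated

print(apply_error_updates(
    {"LCK": 1000.0, "LPL": 1000.0},
    [
        {"league": "LCK", "mean_error": 0.08, "row_count": 40},  # updated
        {"league": "LPL", "mean_error": -0.30, "row_count": 5},  # skipped: too few rows
    ],
))
# {'LCK': 1000.8, 'LPL': 1000.0}
```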
spforge/ratings/start_rating_generator.py CHANGED
@@ -28,7 +28,7 @@ class StartRatingGenerator:
         min_count_for_percentiles: int = 50,
         team_rating_subtract: float = 80,
         team_weight: float = 0,
-        max_days_ago_league_entities: int =
+        max_days_ago_league_entities: int = 600,
         min_match_count_team_rating: int = 2,
         harcoded_start_rating: float | None = None,
     ):
spforge/ratings/team_start_rating_generator.py CHANGED
@@ -24,7 +24,7 @@ class TeamStartRatingGenerator:
         league_ratings: dict[str, float] | None = None,
         league_quantile: float = 0.2,
         min_count_for_percentiles: int = 50,
-        max_days_ago_league_entities: int =
+        max_days_ago_league_entities: int = 600,
         min_match_count_team_rating: int = 2,
         harcoded_start_rating: float | None = None,
     ):
spforge/ratings/utils.py CHANGED
@@ -2,6 +2,10 @@ import polars as pl
 
 from spforge.data_structures import ColumnNames
 
+# Internal column names for scaled participation weights
+_SCALED_PW = "__scaled_participation_weight__"
+_SCALED_PPW = "__scaled_projected_participation_weight__"
+
 
 def add_team_rating(
     df: pl.DataFrame,
@@ -46,11 +50,14 @@ def add_team_rating_projected(
     tid = column_names.team_id
     ppw = column_names.projected_participation_weight
 
-    if
+    # Use scaled column if available (clipped to [0, 1]), otherwise raw column
+    weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else ppw
+
+    if weight_col and weight_col in df.columns:
         return df.with_columns(
             (
-                (pl.col(
-                / pl.col(
+                (pl.col(weight_col) * pl.col(player_rating_col)).sum().over([mid, tid])
+                / pl.col(weight_col).sum().over([mid, tid])
             ).alias(team_rating_out)
         )
 
@@ -118,11 +125,14 @@ def add_rating_mean_projected(
     mid = column_names.match_id
     ppw = column_names.projected_participation_weight
 
-    if
+    # Use scaled column if available (clipped to [0, 1]), otherwise raw column
+    weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else ppw
+
+    if weight_col and weight_col in df.columns:
         return df.with_columns(
             (
-                (pl.col(
-                / pl.col(
+                (pl.col(weight_col) * pl.col(player_rating_col)).sum().over(mid)
+                / pl.col(weight_col).sum().over(mid)
             ).alias(rating_mean_out)
        )
 
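Both projected-rating helpers now compute the same participation-weighted mean, just over different windows (per match-and-team vs. per match). A standalone polars sketch of the team-level version; column names and values are illustrative:

```python
import polars as pl

_SCALED_PPW = "__scaled_projected_participation_weight__"

df = pl.DataFrame({
    "match_id": [1, 1, 1, 1],
    "team_id": ["A", "A", "B", "B"],
    "player_rating": [1100.0, 900.0, 1000.0, 1050.0],
    _SCALED_PPW: [0.75, 0.25, 0.5, 0.5],
})

# Prefer the internal scaled-weight column when present, as the diff does.
weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else "projected_participation_weight"

df = df.with_columns(
    (
        (pl.col(weight_col) * pl.col("player_rating")).sum().over(["match_id", "team_id"])
        / pl.col(weight_col).sum().over(["match_id", "team_id"])
    ).alias("team_rating_projected")
)
print(df)  # team A: 0.75 * 1100 + 0.25 * 900 = 1050.0; team B: 1025.0
```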
spforge/scorer/_score.py CHANGED
@@ -366,18 +366,49 @@ class PWMSE(BaseScorer):
         self.labels = labels
         self.evaluation_labels = evaluation_labels
 
+        self._needs_extension = False
+        self._needs_slicing = False
         self._eval_indices: list[int] | None = None
+        self._extension_mapping: dict[int, int] | None = None
+
         if self.evaluation_labels is not None and self.labels is not None:
-
-
+            training_set = set(self.labels)
+            eval_set = set(self.evaluation_labels)
+
+            if eval_set <= training_set:
+                self._needs_slicing = True
+                label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
+                self._eval_indices = [label_to_idx[lbl] for lbl in self.evaluation_labels]
+            elif training_set <= eval_set:
+                self._needs_extension = True
+                eval_label_to_idx = {lbl: i for i, lbl in enumerate(self.evaluation_labels)}
+                self._extension_mapping = {
+                    train_idx: eval_label_to_idx[lbl]
+                    for train_idx, lbl in enumerate(self.labels)
+                }
+            else:
+                raise ValueError(
+                    f"evaluation_labels must be a subset or superset of labels. "
+                    f"labels={self.labels}, evaluation_labels={self.evaluation_labels}"
+                )
+
+    def _align_predictions(self, preds: np.ndarray) -> np.ndarray:
+        if self._needs_slicing and self._eval_indices is not None:
+            sliced = preds[:, self._eval_indices]
+            row_sums = sliced.sum(axis=1, keepdims=True)
+            row_sums = np.where(row_sums == 0, 1.0, row_sums)
+            return sliced / row_sums
+
+        if self._needs_extension and self._extension_mapping is not None:
+            n_samples = preds.shape[0]
+            n_eval_labels = len(self.evaluation_labels)
+            extended = np.full((n_samples, n_eval_labels), 1e-5, dtype=np.float64)
+            for train_idx, eval_idx in self._extension_mapping.items():
+                extended[:, eval_idx] = preds[:, train_idx]
+            row_sums = extended.sum(axis=1, keepdims=True)
+            return extended / row_sums
 
-
-        if self._eval_indices is None:
-            return preds
-        sliced = preds[:, self._eval_indices]
-        row_sums = sliced.sum(axis=1, keepdims=True)
-        row_sums = np.where(row_sums == 0, 1.0, row_sums)
-        return sliced / row_sums
+        return preds
 
     def _get_scoring_labels(self) -> list[int]:
         if self.evaluation_labels is not None:
@@ -446,7 +477,7 @@ class PWMSE(BaseScorer):
 
         targets = gran_df[self.target].to_numpy().astype(np.float64)
         preds = np.asarray(gran_df[self.pred_column].to_list(), dtype=np.float64)
-        preds = self.
+        preds = self._align_predictions(preds)
         score = self._pwmse_score(targets, preds)
         if self.compare_to_naive:
             naive_probs_list = _naive_probability_predictions_for_df(
@@ -464,7 +495,7 @@ class PWMSE(BaseScorer):
 
         targets = df[self.target].to_numpy().astype(np.float64)
         preds = np.asarray(df[self.pred_column].to_list(), dtype=np.float64)
-        preds = self.
+        preds = self._align_predictions(preds)
         score = self._pwmse_score(targets, preds)
         if self.compare_to_naive:
             naive_probs_list = _naive_probability_predictions_for_df(
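The new `_align_predictions` replaces the old slice-only helper with two symmetric cases: slice-and-renormalize when the evaluation labels are a subset of the training labels, pad-with-epsilon-and-renormalize when they are a superset. A standalone numpy illustration of both; the label sets are chosen for the example:

```python
import numpy as np

preds = np.array([[0.2, 0.5, 0.3]])  # probabilities for training labels [0, 1, 2]

# Subset case: evaluation_labels = [1, 2] -> slice those columns, renormalize rows.
sliced = preds[:, [1, 2]]
sliced = sliced / sliced.sum(axis=1, keepdims=True)
print(sliced)  # [[0.625 0.375]]

# Superset case: evaluation_labels = [0, 1, 2, 3] -> the unseen label 3 gets the
# 1e-5 floor, then rows are renormalized.
extended = np.full((1, 4), 1e-5)
extended[:, :3] = preds
extended = extended / extended.sum(axis=1, keepdims=True)
print(extended)  # approximately [[0.2 0.5 0.3 1e-05]]
```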
spforge/transformers/_other_transformer.py CHANGED
@@ -8,8 +8,9 @@ from sklearn.base import BaseEstimator, TransformerMixin
 
 
 class GroupByReducer(BaseEstimator, TransformerMixin):
-    def __init__(self, granularity: list[str]):
+    def __init__(self, granularity: list[str], aggregation_weight: str | None = None):
         self.granularity = granularity
+        self.aggregation_weight = aggregation_weight
 
     @nw.narwhalify
     def fit(self, X: IntoFrameT, y: Any = None):
@@ -26,18 +27,47 @@ class GroupByReducer(BaseEstimator, TransformerMixin):
             raise ValueError("Could not find granularity columns in dataframe %s", self.granularity)
 
         non_keys = [c for c in df.columns if c not in keys]
-
+        schema = df.schema
+        num_cols = [c for c in non_keys if schema[c].is_numeric()]
         other_cols = [c for c in non_keys if c not in num_cols]
 
         aggs: list[nw.Expr] = []
 
+        # Backwards compatibility: old pickled objects may not have aggregation_weight
+        weight_col = getattr(self, "aggregation_weight", None)
+        has_weight = weight_col and weight_col in df.columns
+
         for c in num_cols:
-
+            if c == weight_col:
+                aggs.append(nw.col(c).sum().alias(c))
+            elif has_weight:
+                aggs.append((nw.col(c) * nw.col(weight_col)).sum().alias(f"__{c}_weighted_sum"))
+                aggs.append(nw.col(c).mean().alias(f"__{c}_fallback"))
+            else:
+                aggs.append(nw.col(c).mean().alias(c))
 
         for c in other_cols:
             aggs.append(nw.col(c).first().alias(c))
 
+        if has_weight:
+            aggs.append(nw.col(weight_col).sum().alias("__weight_sum"))
+
         out = df.group_by(keys).agg(aggs)
+
+        if has_weight:
+            weighted_cols = [c for c in num_cols if c != weight_col]
+            for c in weighted_cols:
+                out = out.with_columns(
+                    nw.when((~nw.col("__weight_sum").is_null()) & (nw.col("__weight_sum") != 0))
+                    .then(nw.col(f"__{c}_weighted_sum") / nw.col("__weight_sum"))
+                    .otherwise(nw.col(f"__{c}_fallback"))
+                    .alias(c)
+                )
+            drop_cols = [f"__{c}_weighted_sum" for c in weighted_cols]
+            drop_cols += [f"__{c}_fallback" for c in weighted_cols]
+            drop_cols.append("__weight_sum")
+            out = out.drop(drop_cols)
+
         return out
 
     @nw.narwhalify
@@ -59,12 +89,12 @@ class GroupByReducer(BaseEstimator, TransformerMixin):
         if sample_weight is not None:
             df = df.with_columns(nw.lit(sample_weight).alias("__sw"))
 
-
+        y_uniques = df.group_by(keys).agg(nw.col("__y").n_unique().alias("__y_nunique"))
+        non_uniform = y_uniques.filter(nw.col("__y_nunique") > 1)
+        if len(non_uniform) > 0:
+            raise ValueError("Target (y) must be uniform within each granularity group")
 
-
-            agg_exprs = [nw.col("__y").mean().alias("__y")]
-        else:
-            agg_exprs = [nw.col("__y").first().alias("__y")]
+        agg_exprs = [nw.col("__y").first().alias("__y")]
 
         if sample_weight is not None:
             agg_exprs.append(nw.col("__sw").sum().alias("__sw"))
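A standalone polars sketch of what `GroupByReducer` now does when `aggregation_weight` is set: numeric columns become weighted means, the weight column itself is summed, and groups whose weight sum is zero fall back to a plain mean. Data and column names are illustrative:

```python
import polars as pl

df = pl.DataFrame({
    "game_id": [1, 1, 2, 2],
    "team_id": ["A", "A", "B", "B"],
    "points": [30.0, 10.0, 20.0, 40.0],
    "minutes": [30.0, 10.0, 0.0, 0.0],  # weight column; game 2 sums to zero
})

out = (
    df.group_by(["game_id", "team_id"])
    .agg(
        (pl.col("points") * pl.col("minutes")).sum().alias("__points_weighted_sum"),
        pl.col("points").mean().alias("__points_fallback"),
        pl.col("minutes").sum().alias("__weight_sum"),
    )
    .with_columns(
        pl.when(pl.col("__weight_sum") != 0)
        .then(pl.col("__points_weighted_sum") / pl.col("__weight_sum"))
        .otherwise(pl.col("__points_fallback"))
        .alias("points")
    )
    .drop("__points_weighted_sum", "__points_fallback", "__weight_sum")
)
print(out.sort("game_id"))
# game 1: weighted mean (30*30 + 10*10) / 40 = 25.0; game 2: fallback mean 30.0
```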
{spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.4
+Version: 0.8.18
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file
@@ -17,7 +17,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy>=1.23.4
 Requires-Dist: optuna>=3.4.0
-Requires-Dist: pandas
+Requires-Dist: pandas<3.0.0,>=2.0.0
 Requires-Dist: pendulum>=1.0.0
 Requires-Dist: scikit-learn>=1.4.0
 Requires-Dist: lightgbm>=4.0.0
@@ -85,12 +85,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 
+from examples import get_sub_sample_nba_data
 from spforge.autopipeline import AutoPipeline
 from spforge.data_structures import ColumnNames
-from spforge.ratings import RatingKnownFeatures
-from spforge.ratings._player_rating import PlayerRatingGenerator
+from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 
-df =
+df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
 
 # Step 1: Define column mappings for your dataset
 column_names = ColumnNames(
@@ -144,7 +144,7 @@ historical_df = rating_generator.fit_transform(historical_df)
 pipeline = AutoPipeline(
     estimator=LogisticRegression(),
     granularity=["game_id", "team_id"], # Aggregate players → teams
-
+    estimator_features=rating_generator.features_out + ["location"], # Rating + home/away
 )
 
 # Train on historical data
@@ -302,8 +302,8 @@ cross_validator = MatchKFoldCrossValidator(
     prediction_column_name="points_pred",
     target_column="points",
     n_splits=3, # Number of temporal folds
-    # Must include both
-    features=pipeline.
+    # Must include both estimator features and context features
+    features=pipeline.required_features,
 )
 
 # Generate validation predictions
@@ -330,7 +330,7 @@ print(f"Validation MAE: {mae:.2f}")
 - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
 - Use `validation_column` in scorer to score only validation rows
 - Training data always comes BEFORE validation data chronologically
-- Must pass
+- Must pass all required features (use `pipeline.required_features`)
 - Scorers can filter rows (e.g., only score players who played minutes > 0)
 
 See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -371,7 +371,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
 # Approach 1: LGBMClassifier (direct probability prediction)
 pipeline_classifier = AutoPipeline(
     estimator=LGBMClassifier(verbose=-100, random_state=42),
-
+    estimator_features=features_pipeline.features_out,
 )
 
 # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -385,13 +385,7 @@ distribution_estimator = NegativeBinomialEstimator(
 
 pipeline_negbin = AutoPipeline(
     estimator=distribution_estimator,
-
-    context_feature_names=[
-        column_names.player_id,
-        column_names.start_date,
-        column_names.team_id,
-        column_names.match_id,
-    ],
+    estimator_features=features_pipeline.features_out,
     predictor_transformers=[
         EstimatorTransformer(
             prediction_column_name="points_estimate",
@@ -439,7 +433,7 @@ points_estimate_transformer = EstimatorTransformer(
 # Stage 2: Refine estimate using Stage 1 output
 player_points_pipeline = AutoPipeline(
     estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-
+    estimator_features=features_pipeline.features_out, # Original features
     # predictor_transformers execute first, adding their predictions
     predictor_transformers=[points_estimate_transformer],
 )
@@ -474,4 +468,3 @@ For complete, runnable examples with detailed explanations:
 - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
 - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
 - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
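Taken together, the README edits converge on one pattern: `AutoPipeline` takes an explicit `estimator_features` list (the removed `context_feature_names` block is gone), and downstream cross-validators consume `pipeline.required_features`. A hedged sketch of that pattern; the feature names are placeholders, not taken from the diff:

```python
from sklearn.linear_model import LogisticRegression

from spforge.autopipeline import AutoPipeline

pipeline = AutoPipeline(
    estimator=LogisticRegression(),
    granularity=["game_id", "team_id"],
    estimator_features=["player_rating", "location"],  # placeholder feature names
)
# Cross-validators should now be fed pipeline.required_features instead of a
# hand-maintained list of estimator + context features.
```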