spforge 0.8.4__py3-none-any.whl → 0.8.18__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of spforge might be problematic.

Files changed (37)
  1. examples/lol/pipeline_transformer_example.py +69 -86
  2. examples/nba/cross_validation_example.py +4 -11
  3. examples/nba/feature_engineering_example.py +33 -15
  4. examples/nba/game_winner_example.py +24 -14
  5. examples/nba/predictor_transformers_example.py +29 -16
  6. spforge/__init__.py +1 -0
  7. spforge/autopipeline.py +169 -5
  8. spforge/estimator/_group_by_estimator.py +11 -3
  9. spforge/features_generator_pipeline.py +8 -4
  10. spforge/hyperparameter_tuning/__init__.py +12 -0
  11. spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
  12. spforge/hyperparameter_tuning/_tuner.py +192 -0
  13. spforge/performance_transformers/_performance_manager.py +2 -4
  14. spforge/ratings/__init__.py +4 -0
  15. spforge/ratings/_player_rating.py +142 -28
  16. spforge/ratings/league_start_rating_optimizer.py +201 -0
  17. spforge/ratings/start_rating_generator.py +1 -1
  18. spforge/ratings/team_start_rating_generator.py +1 -1
  19. spforge/ratings/utils.py +16 -6
  20. spforge/scorer/_score.py +42 -11
  21. spforge/transformers/_other_transformer.py +38 -8
  22. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA +12 -19
  23. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/RECORD +37 -31
  24. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
  25. tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
  26. tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  27. tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
  28. tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
  29. tests/performance_transformers/test_performance_manager.py +15 -0
  30. tests/ratings/test_player_rating_generator.py +154 -0
  31. tests/ratings/test_player_rating_no_mutation.py +214 -0
  32. tests/ratings/test_utils_scaled_weights.py +136 -0
  33. tests/scorer/test_score.py +232 -0
  34. tests/test_autopipeline.py +336 -6
  35. tests/test_feature_generator_pipeline.py +43 -0
  36. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
  37. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
spforge/ratings/league_start_rating_optimizer.py ADDED
@@ -0,0 +1,201 @@
+ from __future__ import annotations
+
+ import copy
+ from dataclasses import dataclass
+
+ import narwhals.stable.v2 as nw
+ import polars as pl
+ from narwhals.stable.v2.typing import IntoFrameT
+
+
+ DEFAULT_START_RATING = 1000.0
+
+
+ @dataclass
+ class LeagueStartRatingOptimizationResult:
+     league_ratings: dict[str, float]
+     iteration_errors: list[dict[str, float]]
+
+
+ class LeagueStartRatingOptimizer:
+     def __init__(
+         self,
+         rating_generator: object,
+         n_iterations: int = 3,
+         learning_rate: float = 0.2,
+         min_cross_region_rows: int = 10,
+         rating_scale: float | None = None,
+     ):
+         self.rating_generator = rating_generator
+         self.n_iterations = int(n_iterations)
+         self.learning_rate = float(learning_rate)
+         self.min_cross_region_rows = int(min_cross_region_rows)
+         self.rating_scale = rating_scale
+
+     @nw.narwhalify
+     def optimize(self, df: IntoFrameT) -> LeagueStartRatingOptimizationResult:
+         pl_df = df.to_native() if df.implementation.is_polars() else df.to_polars()
+         league_ratings = self._get_league_ratings(self.rating_generator)
+         iteration_errors: list[dict[str, float]] = []
+
+         for _ in range(self.n_iterations):
+             gen = copy.deepcopy(self.rating_generator)
+             self._set_league_ratings(gen, league_ratings)
+             self._ensure_prediction_columns(gen)
+
+             pred_df = gen.fit_transform(pl_df)
+             error_df = self._cross_region_error_df(pl_df, pred_df, gen)
+             if error_df.is_empty():
+                 break
+
+             error_summary = (
+                 error_df.group_by(self._league_column_name(gen))
+                 .agg(
+                     pl.col("error").mean().alias("mean_error"),
+                     pl.len().alias("row_count"),
+                 )
+                 .to_dicts()
+             )
+             league_key = self._league_column_name(gen)
+             iteration_errors.append({r[league_key]: r["mean_error"] for r in error_summary})
+             league_ratings = self._apply_error_updates(
+                 gen, league_ratings, error_summary, league_key
+             )
+
+         self._set_league_ratings(self.rating_generator, league_ratings)
+         return LeagueStartRatingOptimizationResult(
+             league_ratings=league_ratings, iteration_errors=iteration_errors
+         )
+
+     def _cross_region_error_df(
+         self,
+         df: pl.DataFrame,
+         pred_df: pl.DataFrame,
+         rating_generator: object,
+     ) -> pl.DataFrame:
+         column_names = getattr(rating_generator, "column_names", None)
+         if column_names is None:
+             raise ValueError("rating_generator must define column_names")
+
+         match_id = getattr(column_names, "match_id", None)
+         team_id = getattr(column_names, "team_id", None)
+         league_col = getattr(column_names, "league", None)
+         if not match_id or not team_id or not league_col:
+             raise ValueError("column_names must include match_id, team_id, and league")
+
+         pred_col, entity_cols, perf_col = self._prediction_spec(rating_generator)
+         base_cols = [match_id, team_id, league_col, perf_col]
+         for col in base_cols + entity_cols:
+             if col not in df.columns:
+                 raise ValueError(f"{col} missing from input dataframe")
+
+         join_cols = [match_id, team_id] + entity_cols
+         joined = df.select(base_cols + entity_cols).join(
+             pred_df.select(join_cols + [pred_col]),
+             on=join_cols,
+             how="inner",
+         )
+         opp_league = self._opponent_mode_league(joined, match_id, team_id, league_col)
+         enriched = joined.join(opp_league, on=[match_id, team_id], how="left").with_columns(
+             (pl.col(perf_col) - pl.col(pred_col)).alias("error")
+         )
+         return enriched.filter(pl.col("opp_mode_league").is_not_null()).filter(
+             pl.col(league_col) != pl.col("opp_mode_league")
+         )
+
+     def _opponent_mode_league(
+         self, df: pl.DataFrame, match_id: str, team_id: str, league_col: str
+     ) -> pl.DataFrame:
+         team_mode = (
+             df.group_by([match_id, team_id, league_col])
+             .agg(pl.len().alias("__count"))
+             .sort(["__count"], descending=True)
+             .unique([match_id, team_id])
+             .select([match_id, team_id, league_col])
+             .rename({league_col: "team_mode_league"})
+         )
+         opponents = (
+             team_mode.join(team_mode, on=match_id, suffix="_opp")
+             .filter(pl.col(team_id) != pl.col(f"{team_id}_opp"))
+             .group_by([match_id, team_id, "team_mode_league_opp"])
+             .agg(pl.len().alias("__count"))
+             .sort(["__count"], descending=True)
+             .unique([match_id, team_id])
+             .select([match_id, team_id, "team_mode_league_opp"])
+             .rename({"team_mode_league_opp": "opp_mode_league"})
+         )
+         return opponents
+
+     def _prediction_spec(self, rating_generator: object) -> tuple[str, list[str], str]:
+         perf_col = getattr(rating_generator, "performance_column", None)
+         if not perf_col:
+             raise ValueError("rating_generator must define performance_column")
+         if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+             pred_col = rating_generator.PLAYER_PRED_PERF_COL
+             column_names = rating_generator.column_names
+             player_id = getattr(column_names, "player_id", None)
+             if not player_id:
+                 raise ValueError("column_names must include player_id for player ratings")
+             return pred_col, [player_id], perf_col
+         if hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+             pred_col = rating_generator.TEAM_PRED_OFF_PERF_COL
+             return pred_col, [], perf_col
+         raise ValueError("rating_generator must expose a predicted performance column")
+
+     def _ensure_prediction_columns(self, rating_generator: object) -> None:
+         pred_cols: list[str] = []
+         if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+             pred_cols.append(rating_generator.PLAYER_PRED_PERF_COL)
+         elif hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+             pred_cols.append(rating_generator.TEAM_PRED_OFF_PERF_COL)
+
+         if not pred_cols:
+             return
+
+         existing = list(getattr(rating_generator, "non_predictor_features_out", []) or [])
+         for col in pred_cols:
+             if col not in existing:
+                 existing.append(col)
+         rating_generator.non_predictor_features_out = existing
+
+     def _apply_error_updates(
+         self,
+         rating_generator: object,
+         league_ratings: dict[str, float],
+         error_summary: list[dict[str, float]],
+         league_key: str,
+     ) -> dict[str, float]:
+         scale = self.rating_scale
+         if scale is None:
+             scale = getattr(rating_generator, "rating_change_multiplier_offense", 1.0)
+
+         updated = dict(league_ratings)
+         for row in error_summary:
+             if row["row_count"] < self.min_cross_region_rows:
+                 continue
+             league = row[league_key]
+             mean_error = row["mean_error"]
+             base_rating = updated.get(league, DEFAULT_START_RATING)
+             updated[league] = base_rating + self.learning_rate * mean_error * scale
+         return updated
+
+     def _league_column_name(self, rating_generator: object) -> str:
+         column_names = getattr(rating_generator, "column_names", None)
+         league_col = getattr(column_names, "league", None)
+         if not league_col:
+             raise ValueError("column_names must include league for league adjustments")
+         return league_col
+
+     def _get_league_ratings(self, rating_generator: object) -> dict[str, float]:
+         start_gen = getattr(rating_generator, "start_rating_generator", None)
+         if start_gen is None or not hasattr(start_gen, "league_ratings"):
+             raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+         return dict(start_gen.league_ratings)
+
+     def _set_league_ratings(self, rating_generator: object, league_ratings: dict[str, float]) -> None:
+         start_gen = getattr(rating_generator, "start_rating_generator", None)
+         if start_gen is None or not hasattr(start_gen, "league_ratings"):
+             raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+         start_gen.league_ratings = dict(league_ratings)
+         if hasattr(rating_generator, "start_league_ratings"):
+             rating_generator.start_league_ratings = dict(league_ratings)
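Reviewer's note: `optimize` repeatedly refits a deep copy of the rating generator, measures the mean prediction error on cross-region rows (rows whose league differs from the opponent's modal league), and moves each league's start rating by `learning_rate * mean_error * scale`. A minimal usage sketch; the `PlayerRatingGenerator` and `ColumnNames` constructor arguments below are illustrative assumptions, not taken from this diff:

    from spforge.data_structures import ColumnNames
    from spforge.ratings import PlayerRatingGenerator
    from spforge.ratings.league_start_rating_optimizer import LeagueStartRatingOptimizer

    # Hypothetical column mapping; the input dataframe must include a league column.
    column_names = ColumnNames(
        match_id="game_id",
        team_id="team_id",
        player_id="player_id",
        start_date="start_date",
        league="league",
    )
    rating_generator = PlayerRatingGenerator(
        column_names=column_names,
        performance_column="performance",  # assumed name of the performance column
    )

    optimizer = LeagueStartRatingOptimizer(
        rating_generator=rating_generator,
        n_iterations=3,            # refit passes over the data
        learning_rate=0.2,         # fraction of the mean error applied per pass
        min_cross_region_rows=10,  # leagues with fewer cross-region rows are skipped
    )
    result = optimizer.optimize(df)   # df: historical player-match dataframe
    print(result.league_ratings)      # optimized per-league start ratings
    print(result.iteration_errors)    # mean cross-region error per league, per pass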
spforge/ratings/start_rating_generator.py CHANGED
@@ -28,7 +28,7 @@ class StartRatingGenerator:
          min_count_for_percentiles: int = 50,
          team_rating_subtract: float = 80,
          team_weight: float = 0,
-         max_days_ago_league_entities: int = 120,
+         max_days_ago_league_entities: int = 600,
          min_match_count_team_rating: int = 2,
          harcoded_start_rating: float | None = None,
      ):
spforge/ratings/team_start_rating_generator.py CHANGED
@@ -24,7 +24,7 @@ class TeamStartRatingGenerator:
          league_ratings: dict[str, float] | None = None,
          league_quantile: float = 0.2,
          min_count_for_percentiles: int = 50,
-         max_days_ago_league_entities: int = 120,
+         max_days_ago_league_entities: int = 600,
          min_match_count_team_rating: int = 2,
          harcoded_start_rating: float | None = None,
      ):
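Reviewer's note: both start-rating generators raise the default `max_days_ago_league_entities` look-back from 120 to 600 days, so start ratings now consider league entities from a much longer history. A caller that depends on the previous window can pin it explicitly; a sketch showing only the relevant argument:

    from spforge.ratings.start_rating_generator import StartRatingGenerator

    # Restore the pre-0.8.18 default look-back window of 120 days.
    start_rating_generator = StartRatingGenerator(max_days_ago_league_entities=120)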
spforge/ratings/utils.py CHANGED
@@ -2,6 +2,10 @@ import polars as pl
 
  from spforge.data_structures import ColumnNames
 
+ # Internal column names for scaled participation weights
+ _SCALED_PW = "__scaled_participation_weight__"
+ _SCALED_PPW = "__scaled_projected_participation_weight__"
+
 
  def add_team_rating(
      df: pl.DataFrame,
@@ -46,11 +50,14 @@ def add_team_rating_projected(
      tid = column_names.team_id
      ppw = column_names.projected_participation_weight
 
-     if ppw:
+     # Use scaled column if available (clipped to [0, 1]), otherwise raw column
+     weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else ppw
+
+     if weight_col and weight_col in df.columns:
          return df.with_columns(
              (
-                 (pl.col(ppw) * pl.col(player_rating_col)).sum().over([mid, tid])
-                 / pl.col(ppw).sum().over([mid, tid])
+                 (pl.col(weight_col) * pl.col(player_rating_col)).sum().over([mid, tid])
+                 / pl.col(weight_col).sum().over([mid, tid])
              ).alias(team_rating_out)
          )
 
@@ -118,11 +125,14 @@ def add_rating_mean_projected(
      mid = column_names.match_id
      ppw = column_names.projected_participation_weight
 
-     if ppw:
+     # Use scaled column if available (clipped to [0, 1]), otherwise raw column
+     weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else ppw
+
+     if weight_col and weight_col in df.columns:
          return df.with_columns(
              (
-                 (pl.col(ppw) * pl.col(player_rating_col)).sum().over(mid)
-                 / pl.col(ppw).sum().over(mid)
+                 (pl.col(weight_col) * pl.col(player_rating_col)).sum().over(mid)
+                 / pl.col(weight_col).sum().over(mid)
              ).alias(rating_mean_out)
          )
 
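Reviewer's note: both projected helpers now prefer the internal `__scaled_projected_participation_weight__` column when it exists and otherwise fall back to the raw projected participation weight, computing a weighted mean of player ratings per window. A standalone polars sketch of the same weighted-mean expression, with made-up column names:

    import polars as pl

    df = pl.DataFrame(
        {
            "match_id": [1, 1, 1, 1],
            "team_id": ["A", "A", "B", "B"],
            "player_rating": [1000.0, 1200.0, 900.0, 1100.0],
            "weight": [0.8, 0.2, 0.5, 0.5],  # stands in for the scaled weight column
        }
    )

    # Weighted mean of player ratings per (match, team), as in add_team_rating_projected.
    df = df.with_columns(
        (
            (pl.col("weight") * pl.col("player_rating")).sum().over(["match_id", "team_id"])
            / pl.col("weight").sum().over(["match_id", "team_id"])
        ).alias("team_rating_projected")
    )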
spforge/scorer/_score.py CHANGED
@@ -366,18 +366,49 @@ class PWMSE(BaseScorer):
          self.labels = labels
          self.evaluation_labels = evaluation_labels
 
+         self._needs_extension = False
+         self._needs_slicing = False
          self._eval_indices: list[int] | None = None
+         self._extension_mapping: dict[int, int] | None = None
+
          if self.evaluation_labels is not None and self.labels is not None:
-             label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
-             self._eval_indices = [label_to_idx[lbl] for lbl in self.evaluation_labels]
+             training_set = set(self.labels)
+             eval_set = set(self.evaluation_labels)
+
+             if eval_set <= training_set:
+                 self._needs_slicing = True
+                 label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
+                 self._eval_indices = [label_to_idx[lbl] for lbl in self.evaluation_labels]
+             elif training_set <= eval_set:
+                 self._needs_extension = True
+                 eval_label_to_idx = {lbl: i for i, lbl in enumerate(self.evaluation_labels)}
+                 self._extension_mapping = {
+                     train_idx: eval_label_to_idx[lbl]
+                     for train_idx, lbl in enumerate(self.labels)
+                 }
+             else:
+                 raise ValueError(
+                     f"evaluation_labels must be a subset or superset of labels. "
+                     f"labels={self.labels}, evaluation_labels={self.evaluation_labels}"
+                 )
+
+     def _align_predictions(self, preds: np.ndarray) -> np.ndarray:
+         if self._needs_slicing and self._eval_indices is not None:
+             sliced = preds[:, self._eval_indices]
+             row_sums = sliced.sum(axis=1, keepdims=True)
+             row_sums = np.where(row_sums == 0, 1.0, row_sums)
+             return sliced / row_sums
+
+         if self._needs_extension and self._extension_mapping is not None:
+             n_samples = preds.shape[0]
+             n_eval_labels = len(self.evaluation_labels)
+             extended = np.full((n_samples, n_eval_labels), 1e-5, dtype=np.float64)
+             for train_idx, eval_idx in self._extension_mapping.items():
+                 extended[:, eval_idx] = preds[:, train_idx]
+             row_sums = extended.sum(axis=1, keepdims=True)
+             return extended / row_sums
 
-     def _slice_and_renormalize(self, preds: np.ndarray) -> np.ndarray:
-         if self._eval_indices is None:
-             return preds
-         sliced = preds[:, self._eval_indices]
-         row_sums = sliced.sum(axis=1, keepdims=True)
-         row_sums = np.where(row_sums == 0, 1.0, row_sums)
-         return sliced / row_sums
+         return preds
 
      def _get_scoring_labels(self) -> list[int]:
          if self.evaluation_labels is not None:
@@ -446,7 +477,7 @@ class PWMSE(BaseScorer):
 
          targets = gran_df[self.target].to_numpy().astype(np.float64)
          preds = np.asarray(gran_df[self.pred_column].to_list(), dtype=np.float64)
-         preds = self._slice_and_renormalize(preds)
+         preds = self._align_predictions(preds)
          score = self._pwmse_score(targets, preds)
          if self.compare_to_naive:
              naive_probs_list = _naive_probability_predictions_for_df(
@@ -464,7 +495,7 @@ class PWMSE(BaseScorer):
 
          targets = df[self.target].to_numpy().astype(np.float64)
          preds = np.asarray(df[self.pred_column].to_list(), dtype=np.float64)
-         preds = self._slice_and_renormalize(preds)
+         preds = self._align_predictions(preds)
          score = self._pwmse_score(targets, preds)
          if self.compare_to_naive:
              naive_probs_list = _naive_probability_predictions_for_df(
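Reviewer's note: `_align_predictions` replaces `_slice_and_renormalize` and handles both directions of label mismatch. When the evaluation labels are a subset of the training labels, it slices those columns and renormalizes each row; when they are a superset, it widens the prediction matrix, filling unseen labels with 1e-5 before renormalizing. A plain numpy sketch of the two branches, with made-up label sets:

    import numpy as np

    preds = np.array([[0.2, 0.3, 0.5]])  # trained on labels [0, 1, 2]

    # Subset case: evaluate on labels [1, 2] -> slice, then renormalize rows.
    sliced = preds[:, [1, 2]]
    sliced = sliced / sliced.sum(axis=1, keepdims=True)
    # -> [[0.375, 0.625]]

    # Superset case: evaluate on labels [0, 1, 2, 3] -> widen, pad with 1e-5, renormalize.
    extended = np.full((preds.shape[0], 4), 1e-5)
    for train_idx, eval_idx in {0: 0, 1: 1, 2: 2}.items():
        extended[:, eval_idx] = preds[:, train_idx]
    extended = extended / extended.sum(axis=1, keepdims=True)
    # -> label 3 receives a near-zero probability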
spforge/transformers/_other_transformer.py CHANGED
@@ -8,8 +8,9 @@ from sklearn.base import BaseEstimator, TransformerMixin
 
 
  class GroupByReducer(BaseEstimator, TransformerMixin):
-     def __init__(self, granularity: list[str]):
+     def __init__(self, granularity: list[str], aggregation_weight: str | None = None):
          self.granularity = granularity
+         self.aggregation_weight = aggregation_weight
 
      @nw.narwhalify
      def fit(self, X: IntoFrameT, y: Any = None):
@@ -26,18 +27,47 @@ class GroupByReducer(BaseEstimator, TransformerMixin):
              raise ValueError("Could not find granularity columns in dataframe %s", self.granularity)
 
          non_keys = [c for c in df.columns if c not in keys]
-         num_cols = [c for c in non_keys if pd.api.types.is_numeric_dtype(df[c])]
+         schema = df.schema
+         num_cols = [c for c in non_keys if schema[c].is_numeric()]
          other_cols = [c for c in non_keys if c not in num_cols]
 
          aggs: list[nw.Expr] = []
 
+         # Backwards compatibility: old pickled objects may not have aggregation_weight
+         weight_col = getattr(self, "aggregation_weight", None)
+         has_weight = weight_col and weight_col in df.columns
+
          for c in num_cols:
-             aggs.append(nw.col(c).mean().alias(c))
+             if c == weight_col:
+                 aggs.append(nw.col(c).sum().alias(c))
+             elif has_weight:
+                 aggs.append((nw.col(c) * nw.col(weight_col)).sum().alias(f"__{c}_weighted_sum"))
+                 aggs.append(nw.col(c).mean().alias(f"__{c}_fallback"))
+             else:
+                 aggs.append(nw.col(c).mean().alias(c))
 
          for c in other_cols:
              aggs.append(nw.col(c).first().alias(c))
 
+         if has_weight:
+             aggs.append(nw.col(weight_col).sum().alias("__weight_sum"))
+
          out = df.group_by(keys).agg(aggs)
+
+         if has_weight:
+             weighted_cols = [c for c in num_cols if c != weight_col]
+             for c in weighted_cols:
+                 out = out.with_columns(
+                     nw.when((~nw.col("__weight_sum").is_null()) & (nw.col("__weight_sum") != 0))
+                     .then(nw.col(f"__{c}_weighted_sum") / nw.col("__weight_sum"))
+                     .otherwise(nw.col(f"__{c}_fallback"))
+                     .alias(c)
+                 )
+             drop_cols = [f"__{c}_weighted_sum" for c in weighted_cols]
+             drop_cols += [f"__{c}_fallback" for c in weighted_cols]
+             drop_cols.append("__weight_sum")
+             out = out.drop(drop_cols)
+
          return out
 
      @nw.narwhalify
@@ -59,12 +89,12 @@ class GroupByReducer(BaseEstimator, TransformerMixin):
          if sample_weight is not None:
              df = df.with_columns(nw.lit(sample_weight).alias("__sw"))
 
-         y_is_numeric = df.select(nw.col("__y")).schema["__y"].is_numeric()
+         y_uniques = df.group_by(keys).agg(nw.col("__y").n_unique().alias("__y_nunique"))
+         non_uniform = y_uniques.filter(nw.col("__y_nunique") > 1)
+         if len(non_uniform) > 0:
+             raise ValueError("Target (y) must be uniform within each granularity group")
 
-         if y_is_numeric:
-             agg_exprs = [nw.col("__y").mean().alias("__y")]
-         else:
-             agg_exprs = [nw.col("__y").first().alias("__y")]
+         agg_exprs = [nw.col("__y").first().alias("__y")]
 
          if sample_weight is not None:
              agg_exprs.append(nw.col("__sw").sum().alias("__sw"))
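Reviewer's note: with `aggregation_weight` set, `GroupByReducer` now produces a weighted mean for every other numeric column (sum of value times weight divided by the summed weight, falling back to the plain mean when a group's weight sum is null or zero), while the weight column itself is summed. A polars sketch of the intended semantics, with hypothetical data:

    import polars as pl

    df = pl.DataFrame(
        {
            "game_id": [1, 1, 1],
            "team_id": ["A", "A", "A"],
            "points_pred": [20.0, 10.0, 30.0],
            "minutes": [30.0, 10.0, 0.0],  # stands in for the aggregation_weight column
        }
    )

    out = df.group_by(["game_id", "team_id"]).agg(
        # Weighted mean: (20*30 + 10*10 + 30*0) / 40 = 17.5
        ((pl.col("points_pred") * pl.col("minutes")).sum() / pl.col("minutes").sum()).alias("points_pred"),
        pl.col("minutes").sum().alias("minutes"),
    )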
{spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: spforge
- Version: 0.8.4
+ Version: 0.8.18
  Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
  Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
  License: See LICENSE file
@@ -17,7 +17,7 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: numpy>=1.23.4
  Requires-Dist: optuna>=3.4.0
- Requires-Dist: pandas>=2.0.0
+ Requires-Dist: pandas<3.0.0,>=2.0.0
  Requires-Dist: pendulum>=1.0.0
  Requires-Dist: scikit-learn>=1.4.0
  Requires-Dist: lightgbm>=4.0.0
@@ -85,12 +85,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
  import pandas as pd
  from sklearn.linear_model import LogisticRegression
 
+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
- from spforge.ratings import RatingKnownFeatures
- from spforge.ratings._player_rating import PlayerRatingGenerator
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 
- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
 
  # Step 1: Define column mappings for your dataset
  column_names = ColumnNames(
@@ -144,7 +144,7 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],  # Aggregate players → teams
-     feature_names=rating_generator.features_out + ["location"],  # Rating + home/away
+     estimator_features=rating_generator.features_out + ["location"],  # Rating + home/away
  )
 
  # Train on historical data
@@ -302,8 +302,8 @@ cross_validator = MatchKFoldCrossValidator(
      prediction_column_name="points_pred",
      target_column="points",
      n_splits=3,  # Number of temporal folds
-     # Must include both feature_names AND context_feature_names
-     features=pipeline.feature_names + pipeline.context_feature_names,
+     # Must include both estimator features and context features
+     features=pipeline.required_features,
  )
 
  # Generate validation predictions
@@ -330,7 +330,7 @@ print(f"Validation MAE: {mae:.2f}")
  - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
  - Use `validation_column` in scorer to score only validation rows
  - Training data always comes BEFORE validation data chronologically
- - Must pass both `feature_names` + `context_feature_names` to `features` parameter
+ - Must pass all required features (use `pipeline.required_features`)
  - Scorers can filter rows (e.g., only score players who played minutes > 0)
 
  See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -371,7 +371,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
  # Approach 1: LGBMClassifier (direct probability prediction)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )
 
  # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -385,13 +385,7 @@ distribution_estimator = NegativeBinomialEstimator(
 
  pipeline_negbin = AutoPipeline(
      estimator=distribution_estimator,
-     feature_names=features_pipeline.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_pipeline.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -439,7 +433,7 @@ points_estimate_transformer = EstimatorTransformer(
  # Stage 2: Refine estimate using Stage 1 output
  player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,  # Original features
+     estimator_features=features_pipeline.features_out,  # Original features
      # predictor_transformers execute first, adding their predictions
      predictor_transformers=[points_estimate_transformer],
  )
@@ -474,4 +468,3 @@ For complete, runnable examples with detailed explanations:
  - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
  - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
  - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
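Reviewer's note: the README edits track an `AutoPipeline` rename: `feature_names` becomes `estimator_features`, the separate `context_feature_names` argument is gone, and cross validators should now receive `pipeline.required_features`. A migration sketch, reusing the `rating_generator` from the README example above:

    from sklearn.linear_model import LogisticRegression

    from spforge.autopipeline import AutoPipeline

    # Before (0.8.4):
    #   AutoPipeline(estimator=..., feature_names=..., context_feature_names=[...])
    #   features=pipeline.feature_names + pipeline.context_feature_names

    # After (0.8.18):
    pipeline = AutoPipeline(
        estimator=LogisticRegression(),
        granularity=["game_id", "team_id"],
        estimator_features=rating_generator.features_out + ["location"],
    )
    # Pass everything the pipeline needs to the cross validator in one attribute:
    # features=pipeline.required_features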