spforge 0.8.4__py3-none-any.whl → 0.8.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. examples/lol/pipeline_transformer_example.py +69 -86
  2. examples/nba/cross_validation_example.py +4 -11
  3. examples/nba/feature_engineering_example.py +33 -15
  4. examples/nba/game_winner_example.py +24 -14
  5. examples/nba/predictor_transformers_example.py +29 -16
  6. spforge/__init__.py +1 -0
  7. spforge/autopipeline.py +169 -5
  8. spforge/estimator/_group_by_estimator.py +11 -3
  9. spforge/features_generator_pipeline.py +8 -4
  10. spforge/hyperparameter_tuning/__init__.py +12 -0
  11. spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
  12. spforge/hyperparameter_tuning/_tuner.py +192 -0
  13. spforge/performance_transformers/_performance_manager.py +2 -4
  14. spforge/ratings/__init__.py +4 -0
  15. spforge/ratings/_player_rating.py +142 -28
  16. spforge/ratings/league_start_rating_optimizer.py +201 -0
  17. spforge/ratings/start_rating_generator.py +1 -1
  18. spforge/ratings/team_start_rating_generator.py +1 -1
  19. spforge/ratings/utils.py +16 -6
  20. spforge/scorer/_score.py +42 -11
  21. spforge/transformers/_other_transformer.py +38 -8
  22. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA +12 -19
  23. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/RECORD +37 -31
  24. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
  25. tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
  26. tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  27. tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
  28. tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
  29. tests/performance_transformers/test_performance_manager.py +15 -0
  30. tests/ratings/test_player_rating_generator.py +154 -0
  31. tests/ratings/test_player_rating_no_mutation.py +214 -0
  32. tests/ratings/test_utils_scaled_weights.py +136 -0
  33. tests/scorer/test_score.py +232 -0
  34. tests/test_autopipeline.py +336 -6
  35. tests/test_feature_generator_pipeline.py +43 -0
  36. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
  37. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
spforge/autopipeline.py CHANGED
@@ -195,6 +195,40 @@ def lgbm_in_root(root) -> bool:
     return any(_is_lightgbm_estimator(obj) for obj in _walk_objects(root))
 
 
+def _get_importance_estimator(estimator) -> tuple[Any, str] | None:
+    """Recursively find innermost estimator with feature_importances_ or coef_."""
+    if hasattr(estimator, "feature_importances_"):
+        inner = _get_importance_estimator_inner(estimator)
+        if inner is not None:
+            return inner
+        return (estimator, "feature_importances_")
+
+    if hasattr(estimator, "coef_"):
+        inner = _get_importance_estimator_inner(estimator)
+        if inner is not None:
+            return inner
+        return (estimator, "coef_")
+
+    return _get_importance_estimator_inner(estimator)
+
+
+def _get_importance_estimator_inner(estimator) -> tuple[Any, str] | None:
+    """Check wrapped estimators for importance attributes."""
+    # Check estimator_ (sklearn fitted wrapper convention)
+    if hasattr(estimator, "estimator_") and estimator.estimator_ is not None:
+        result = _get_importance_estimator(estimator.estimator_)
+        if result is not None:
+            return result
+
+    # Check _est (GroupByEstimator convention)
+    if hasattr(estimator, "_est") and estimator._est is not None:
+        result = _get_importance_estimator(estimator._est)
+        if result is not None:
+            return result
+
+    return None
+
+
 class AutoPipeline(BaseEstimator):
     def __init__(
         self,
@@ -202,6 +236,7 @@ class AutoPipeline(BaseEstimator):
         estimator_features: list[str],
         predictor_transformers: list[PredictorTransformer] | None = None,
         granularity: list[str] | None = None,
+        aggregation_weight: str | None = None,
         filters: list[Filter] | None = None,
         scale_features: bool = False,
         categorical_handling: CategoricalHandling = "auto",
@@ -216,6 +251,7 @@ class AutoPipeline(BaseEstimator):
         self.estimator_features = estimator_features
         self.feature_names = estimator_features  # Internal compat
         self.granularity = granularity or []
+        self.aggregation_weight = aggregation_weight
         self.predictor_transformers = predictor_transformers
         self.estimator = estimator
         self.filters = filters or []
@@ -230,6 +266,7 @@ class AutoPipeline(BaseEstimator):
         self.numeric_features = numeric_features
         self.remainder = remainder
         self._cat_feats = []
+        self._filter_feature_names: list[str] = []
 
         # Auto-compute context features
         self.context_feature_names = self._compute_context_features()
@@ -242,11 +279,12 @@ class AutoPipeline(BaseEstimator):
         self._resolved_categorical_handling: CategoricalHandling | None = None
 
     def _compute_context_features(self) -> list[str]:
-        """Auto-compute context features from estimator, granularity, and filters.
+        """Auto-compute context features from estimator and granularity.
 
         Note: Context from predictor_transformers is tracked separately in
         context_predictor_transformer_feature_names and is dropped before
-        the final estimator.
+        the final estimator. Filter columns are tracked separately and are
+        dropped before the final estimator.
         """
         from spforge.transformers._base import PredictorTransformer
 
@@ -290,9 +328,15 @@ class AutoPipeline(BaseEstimator):
         # Add granularity columns
         context.extend(self.granularity)
 
+        # Add aggregation weight column
+        if self.aggregation_weight:
+            context.append(self.aggregation_weight)
+
         # Add filter columns
+        self._filter_feature_names = []
         for f in self.filters:
-            context.append(f.column_name)
+            if f.column_name not in self._filter_feature_names:
+                self._filter_feature_names.append(f.column_name)
 
         # Dedupe while preserving order, excluding estimator_features
         seen = set()
@@ -454,7 +498,11 @@ class AutoPipeline(BaseEstimator):
         pre = PreprocessorToDataFrame(pre_raw)
 
         est = (
-            GroupByEstimator(self.estimator, granularity=[f"{c}" for c in self.granularity])
+            GroupByEstimator(
+                self.estimator,
+                granularity=[f"{c}" for c in self.granularity],
+                aggregation_weight=self.aggregation_weight,
+            )
            if do_groupby
            else self.estimator
         )
@@ -506,8 +554,10 @@ class AutoPipeline(BaseEstimator):
             prev_transformer_feats_out.extend(feats_out)
 
         # Use FunctionTransformer with global function for serializability
+        drop_filter_cols = set(self._filter_feature_names)
+        drop_cols = drop_ctx_set | drop_filter_cols
         final = FunctionTransformer(
-            _drop_columns_transformer, validate=False, kw_args={"drop_cols": drop_ctx_set}
+            _drop_columns_transformer, validate=False, kw_args={"drop_cols": drop_cols}
         )
         steps.append(("final", final))
 
@@ -538,6 +588,7 @@ class AutoPipeline(BaseEstimator):
                 self.feature_names
                 + self.context_feature_names
                 + self.context_predictor_transformer_feature_names
+                + self._filter_feature_names
                 + self.granularity
             )
         )
@@ -626,4 +677,117 @@ class AutoPipeline(BaseEstimator):
             if ctx not in all_features:
                 all_features.append(ctx)
 
+        # Add filter columns (needed for fit-time filtering)
+        for col in self._filter_feature_names:
+            if col not in all_features:
+                all_features.append(col)
+
         return all_features
+
+    def _get_estimator_feature_names(self) -> list[str]:
+        """Get feature names as seen by the final estimator after all transformations."""
+        pre_out = list(self.sklearn_pipeline.named_steps["pre"].get_feature_names_out())
+
+        # Remove context columns dropped by "final" step
+        final_step = self.sklearn_pipeline.named_steps["final"]
+        drop_cols = final_step.kw_args.get("drop_cols", set()) if final_step.kw_args else set()
+        features = [f for f in pre_out if f not in drop_cols]
+
+        # Remove granularity columns (dropped by GroupByEstimator)
+        granularity_set = set(self.granularity)
+        features = [f for f in features if f not in granularity_set]
+
+        # Remove context features (used by wrapper estimators, not inner model)
+        context_set = set(self.context_feature_names)
+        features = [f for f in features if f not in context_set]
+
+        # Remove filter columns (used only for fit-time filtering)
+        filter_set = set(self._filter_feature_names)
+        features = [f for f in features if f not in filter_set]
+
+        return features
+
+    def _resolve_importance_feature_names(self, estimator, n_features: int) -> list[str]:
+        names = None
+        if hasattr(estimator, "feature_names_in_") and estimator.feature_names_in_ is not None:
+            names = list(estimator.feature_names_in_)
+        elif hasattr(estimator, "feature_name_") and estimator.feature_name_ is not None:
+            names = list(estimator.feature_name_)
+        elif hasattr(estimator, "feature_names_") and estimator.feature_names_ is not None:
+            names = list(estimator.feature_names_)
+        if names is None:
+            names = self._get_estimator_feature_names()
+        if len(names) != n_features:
+            raise ValueError(
+                f"Feature names length ({len(names)}) does not match importances length ({n_features})."
+            )
+        return names
+
+    @property
+    def feature_importances_(self) -> pd.DataFrame:
+        """Get feature importances from the fitted estimator.
+
+        Returns a DataFrame with columns ["feature", "importance"] sorted by
+        absolute importance descending. Works with tree-based models
+        (feature_importances_) and linear models (coef_).
+        """
+        if self.sklearn_pipeline is None:
+            raise RuntimeError("Pipeline not fitted. Call fit() first.")
+
+        est = self.sklearn_pipeline.named_steps["est"]
+        result = _get_importance_estimator(est)
+
+        if result is None:
+            raise RuntimeError(
+                "Estimator does not support feature importances. "
+                "Requires feature_importances_ or coef_ attribute."
+            )
+
+        inner_est, attr_name = result
+        raw = getattr(inner_est, attr_name)
+
+        if attr_name == "coef_":
+            # Linear models: use absolute value of coefficients
+            if raw.ndim == 2:
+                # Multi-class: average absolute values across classes
+                importances = np.abs(raw).mean(axis=0)
+            else:
+                importances = np.abs(raw)
+        else:
+            importances = raw
+
+        feature_names = self._get_estimator_feature_names()
+
+        df = pd.DataFrame({"feature": feature_names, "importance": importances})
+        df = df.sort_values("importance", ascending=False, key=abs).reset_index(drop=True)
+        return df
+
+    @property
+    def feature_importance_names(self) -> dict[str, float]:
+        """Map deepest estimator feature names to importances."""
+        if self.sklearn_pipeline is None:
+            raise RuntimeError("Pipeline not fitted. Call fit() first.")
+
+        est = self.sklearn_pipeline.named_steps["est"]
+        result = _get_importance_estimator(est)
+
+        if result is None:
+            raise RuntimeError(
+                "Estimator does not support feature importances. "
+                "Requires feature_importances_ or coef_ attribute."
+            )
+
+        inner_est, attr_name = result
+        raw = getattr(inner_est, attr_name)
+
+        if attr_name == "coef_":
+            if raw.ndim == 2:
+                importances = np.abs(raw).mean(axis=0)
+            else:
+                importances = np.abs(raw)
+        else:
+            importances = raw
+
+        importances = np.asarray(importances)
+        feature_names = self._resolve_importance_feature_names(inner_est, len(importances))
+        return dict(zip(feature_names, importances.tolist()))
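The new aggregation_weight argument is passed through to GroupByEstimator, and the new feature_importances_ / feature_importance_names properties surface the fitted model's importances. A minimal usage sketch, not part of the diff: it assumes AutoPipeline is importable from spforge.autopipeline and exposes the usual sklearn-style fit(X, y); the LGBMRegressor and every column name below are illustrative only.

import pandas as pd
from lightgbm import LGBMRegressor
from spforge.autopipeline import AutoPipeline  # assumed import path

# Hypothetical per-player rows; "game_id" groups rows, "minutes" weights the aggregation.
df = pd.DataFrame(
    {
        "game_id": [1, 1, 2, 2],
        "player_rating": [1000.0, 980.0, 1010.0, 995.0],
        "minutes": [30.0, 22.0, 35.0, 18.0],
        "team_points": [98, 98, 105, 105],
    }
)

pipeline = AutoPipeline(
    estimator=LGBMRegressor(),
    estimator_features=["player_rating"],
    granularity=["game_id"],        # rows are reduced per game by GroupByEstimator
    aggregation_weight="minutes",   # new parameter: weight column used when aggregating
)
pipeline.fit(df, df["team_points"])  # assumed sklearn-style fit signature

# New properties added in this release:
print(pipeline.feature_importances_)      # DataFrame["feature", "importance"], sorted by |importance|
print(pipeline.feature_importance_names)  # {feature_name: importance}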
spforge/estimator/_group_by_estimator.py CHANGED
@@ -10,10 +10,16 @@ from spforge.transformers._other_transformer import GroupByReducer
 
 
 class GroupByEstimator(BaseEstimator):
-    def __init__(self, estimator: Any, granularity: list[str] | None = None):
+    def __init__(
+        self,
+        estimator: Any,
+        granularity: list[str] | None = None,
+        aggregation_weight: str | None = None,
+    ):
         self.estimator = estimator
         self.granularity = granularity or []
-        self._reducer = GroupByReducer(self.granularity)
+        self.aggregation_weight = aggregation_weight
+        self._reducer = GroupByReducer(self.granularity, aggregation_weight=aggregation_weight)
         self._est = None
 
     def __sklearn_is_fitted__(self):
@@ -22,7 +28,9 @@ class GroupByEstimator(BaseEstimator):
     @nw.narwhalify
     def fit(self, X: IntoFrameT, y: Any, sample_weight: np.ndarray | None = None):
         X = X.to_pandas()
-        self._reducer = GroupByReducer(self.granularity)
+        # Backwards compatibility: old pickled objects may not have aggregation_weight
+        agg_weight = getattr(self, "aggregation_weight", None)
+        self._reducer = GroupByReducer(self.granularity, aggregation_weight=agg_weight)
         X_red = nw.from_native(self._reducer.fit_transform(X))
         y_red, sw_red = self._reducer.reduce_y(X, y, sample_weight=sample_weight)
 
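The getattr(self, "aggregation_weight", None) guard exists because unpickling bypasses __init__: an object serialized by an older spforge release carries no aggregation_weight in its state, so direct attribute access would raise at fit() time. A tiny self-contained illustration of that failure mode, using a stub class rather than spforge code:

class GroupByEstimatorStub:
    def __init__(self, aggregation_weight=None):
        self.aggregation_weight = aggregation_weight

# Unpickling restores instances via __new__ plus saved state; __init__ never runs,
# so an object saved before the attribute existed simply lacks it.
old_style = GroupByEstimatorStub.__new__(GroupByEstimatorStub)

print(getattr(old_style, "aggregation_weight", None))  # -> None
# print(old_style.aggregation_weight)                   # would raise AttributeError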
spforge/features_generator_pipeline.py CHANGED
@@ -120,7 +120,8 @@ class FeatureGeneratorPipeline(FeatureGenerator):
 
         for transformer in self.feature_generators:
             pre_row_count = len(df)
-            df = nw.from_native(transformer.fit_transform(df, column_names=column_names))
+            native_df = df.to_native()
+            df = nw.from_native(transformer.fit_transform(native_df, column_names=column_names))
             assert len(df) == pre_row_count
             for f in transformer.features_out:
                 if f in expected_feats_added:
@@ -151,7 +152,8 @@ class FeatureGeneratorPipeline(FeatureGenerator):
 
         for transformer in self.feature_generators:
             pre_row_count = len(df)
-            df = nw.from_native(transformer.transform(df))
+            native_df = df.to_native()
+            df = nw.from_native(transformer.transform(native_df))
             assert len(df) == pre_row_count
             for f in transformer.features_out:
                 if f in expected_feats_added:
@@ -181,9 +183,11 @@
         for transformer in self.feature_generators:
             pre_row_count = len(df)
             if hasattr(transformer, "future_transform") and callable(transformer.future_transform):
-                df = nw.from_native(transformer.future_transform(df))
+                native_df = df.to_native()
+                df = nw.from_native(transformer.future_transform(native_df))
             else:
-                df = nw.from_native(transformer.transform(df))
+                native_df = df.to_native()
+                df = nw.from_native(transformer.transform(native_df))
             assert len(df) == pre_row_count
             for f in transformer.features_out:
                 if f in expected_feats_added:
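The change above unwraps the narwhals frame with to_native() before each feature generator call and re-wraps the result with nw.from_native(), so generators always receive a native pandas/polars frame instead of the pipeline's narwhals wrapper. The round-trip pattern in isolation; the add_double_points transformer is a stand-in, not spforge code:

import narwhals as nw
import pandas as pd

def add_double_points(native_df: pd.DataFrame) -> pd.DataFrame:
    # Stand-in for transformer.transform: operates on a native frame.
    out = native_df.copy()
    out["points_x2"] = out["points"] * 2
    return out

df = nw.from_native(pd.DataFrame({"points": [10, 20]}))  # pipeline-internal narwhals frame
native_df = df.to_native()                               # hand the generator a native frame
df = nw.from_native(add_double_points(native_df))        # wrap the result again
print(df.to_native())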
spforge/hyperparameter_tuning/__init__.py CHANGED
@@ -1,9 +1,15 @@
 from spforge.hyperparameter_tuning._default_search_spaces import (
+    get_default_estimator_search_space,
+    get_default_lgbm_search_space,
+    get_default_negative_binomial_search_space,
+    get_default_normal_distribution_search_space,
     get_default_player_rating_search_space,
     get_default_search_space,
+    get_default_student_t_search_space,
     get_default_team_rating_search_space,
 )
 from spforge.hyperparameter_tuning._tuner import (
+    EstimatorHyperparameterTuner,
     OptunaResult,
     ParamSpec,
     RatingHyperparameterTuner,
@@ -11,9 +17,15 @@ from spforge.hyperparameter_tuning._tuner import (
 
 __all__ = [
     "RatingHyperparameterTuner",
+    "EstimatorHyperparameterTuner",
     "ParamSpec",
     "OptunaResult",
+    "get_default_estimator_search_space",
+    "get_default_lgbm_search_space",
+    "get_default_negative_binomial_search_space",
+    "get_default_normal_distribution_search_space",
     "get_default_player_rating_search_space",
    "get_default_team_rating_search_space",
+    "get_default_student_t_search_space",
     "get_default_search_space",
 ]
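With these exports, the new tuner and search-space helpers are importable from the package namespace; a short sketch (the EstimatorHyperparameterTuner implementation itself lives in _tuner.py, whose additions are not shown in this diff):

from spforge.hyperparameter_tuning import (
    EstimatorHyperparameterTuner,
    get_default_estimator_search_space,
    get_default_lgbm_search_space,
)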
spforge/hyperparameter_tuning/_default_search_spaces.py CHANGED
@@ -1,12 +1,133 @@
 from spforge.hyperparameter_tuning._tuner import ParamSpec
 from spforge.ratings import PlayerRatingGenerator, TeamRatingGenerator
+from spforge.distributions import (
+    NegativeBinomialEstimator,
+    NormalDistributionPredictor,
+    StudentTDistributionEstimator,
+)
+
+
+def _is_lightgbm_estimator(obj: object) -> bool:
+    mod = (getattr(type(obj), "__module__", "") or "").lower()
+    name = type(obj).__name__
+    if "lightgbm" in mod:
+        return True
+    return bool(name.startswith("LGBM"))
+
+
+def get_default_lgbm_search_space() -> dict[str, ParamSpec]:
+    return {
+        "n_estimators": ParamSpec(
+            param_type="int",
+            low=50,
+            high=800,
+            log=True,
+        ),
+        "num_leaves": ParamSpec(
+            param_type="int",
+            low=16,
+            high=256,
+            log=True,
+        ),
+        "max_depth": ParamSpec(
+            param_type="int",
+            low=3,
+            high=12,
+        ),
+        "min_child_samples": ParamSpec(
+            param_type="int",
+            low=10,
+            high=200,
+            log=True,
+        ),
+        "subsample": ParamSpec(
+            param_type="float",
+            low=0.6,
+            high=1.0,
+        ),
+        "subsample_freq": ParamSpec(
+            param_type="int",
+            low=1,
+            high=7,
+        ),
+        "reg_alpha": ParamSpec(
+            param_type="float",
+            low=1e-8,
+            high=10.0,
+            log=True,
+        ),
+        "reg_lambda": ParamSpec(
+            param_type="float",
+            low=1e-8,
+            high=10.0,
+            log=True,
+        ),
+    }
+
+
+def get_default_negative_binomial_search_space() -> dict[str, ParamSpec]:
+    return {
+        "predicted_r_weight": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=1.0,
+        ),
+        "r_rolling_mean_window": ParamSpec(
+            param_type="int",
+            low=10,
+            high=120,
+        ),
+        "predicted_r_iterations": ParamSpec(
+            param_type="int",
+            low=2,
+            high=12,
+        ),
+    }
+
+
+def get_default_normal_distribution_search_space() -> dict[str, ParamSpec]:
+    return {
+        "sigma": ParamSpec(
+            param_type="float",
+            low=0.5,
+            high=30.0,
+            log=True,
+        ),
+    }
+
+
+def get_default_student_t_search_space() -> dict[str, ParamSpec]:
+    return {
+        "df": ParamSpec(
+            param_type="float",
+            low=3.0,
+            high=30.0,
+            log=True,
+        ),
+        "min_sigma": ParamSpec(
+            param_type="float",
+            low=0.5,
+            high=10.0,
+            log=True,
+        ),
+        "sigma_bins": ParamSpec(
+            param_type="int",
+            low=4,
+            high=12,
+        ),
+        "min_bin_rows": ParamSpec(
+            param_type="int",
+            low=10,
+            high=100,
+        ),
+    }
 
 
 def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
     """
     Default search space for PlayerRatingGenerator.
 
-    Focuses on 5-8 core parameters that have the most impact on performance.
+    Focuses on core parameters that have the most impact on performance.
 
     Returns:
         Dictionary mapping parameter names to ParamSpec objects
@@ -46,6 +167,31 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
            param_type="categorical",
            choices=["difference", "mean", "ignore_opponent"],
        ),
+        "start_league_quantile": ParamSpec(
+            param_type="float",
+            low=0.05,
+            high=0.5,
+        ),
+        "start_min_count_for_percentiles": ParamSpec(
+            param_type="int",
+            low=40,
+            high=500,
+        ),
+        "start_team_rating_subtract": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=200.0,
+        ),
+        "start_team_weight": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=1.0,
+        ),
+        "start_min_match_count_team_rating": ParamSpec(
+            param_type="int",
+            low=1,
+            high=10,
+        ),
     }
 
 
@@ -120,3 +266,15 @@ def get_default_search_space(
        f"Unsupported rating generator type: {type(rating_generator)}. "
        "Expected PlayerRatingGenerator or TeamRatingGenerator."
    )
+
+
+def get_default_estimator_search_space(estimator: object) -> dict[str, ParamSpec]:
+    if _is_lightgbm_estimator(estimator):
+        return get_default_lgbm_search_space()
+    if isinstance(estimator, NegativeBinomialEstimator):
+        return get_default_negative_binomial_search_space()
+    if isinstance(estimator, NormalDistributionPredictor):
+        return get_default_normal_distribution_search_space()
+    if isinstance(estimator, StudentTDistributionEstimator):
+        return get_default_student_t_search_space()
+    return {}
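get_default_estimator_search_space dispatches on the estimator type: LightGBM models (detected via module or class name), the three spforge distribution estimators, or an empty dict for anything else. A sketch of inspecting the selected space; it assumes ParamSpec exposes its fields as attributes, and lightgbm is only an example estimator:

from lightgbm import LGBMRegressor
from spforge.hyperparameter_tuning import get_default_estimator_search_space

space = get_default_estimator_search_space(LGBMRegressor())  # matched by module/class name
for name, spec in space.items():
    # param_type/low/high mirror the ParamSpec definitions above (attribute access assumed)
    print(name, spec.param_type, spec.low, spec.high)

print(get_default_estimator_search_space(object()))  # unknown estimator -> {}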