spforge 0.8.8__py3-none-any.whl → 0.8.19__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release.


This version of spforge might be problematic; review the advisory details for more information.

spforge/autopipeline.py CHANGED
@@ -195,6 +195,40 @@ def lgbm_in_root(root) -> bool:
195
195
  return any(_is_lightgbm_estimator(obj) for obj in _walk_objects(root))
196
196
 
197
197
 
198
+ def _get_importance_estimator(estimator) -> tuple[Any, str] | None:
199
+ """Recursively find innermost estimator with feature_importances_ or coef_."""
200
+ if hasattr(estimator, "feature_importances_"):
201
+ inner = _get_importance_estimator_inner(estimator)
202
+ if inner is not None:
203
+ return inner
204
+ return (estimator, "feature_importances_")
205
+
206
+ if hasattr(estimator, "coef_"):
207
+ inner = _get_importance_estimator_inner(estimator)
208
+ if inner is not None:
209
+ return inner
210
+ return (estimator, "coef_")
211
+
212
+ return _get_importance_estimator_inner(estimator)
213
+
214
+
215
+ def _get_importance_estimator_inner(estimator) -> tuple[Any, str] | None:
216
+ """Check wrapped estimators for importance attributes."""
217
+ # Check estimator_ (sklearn fitted wrapper convention)
218
+ if hasattr(estimator, "estimator_") and estimator.estimator_ is not None:
219
+ result = _get_importance_estimator(estimator.estimator_)
220
+ if result is not None:
221
+ return result
222
+
223
+ # Check _est (GroupByEstimator convention)
224
+ if hasattr(estimator, "_est") and estimator._est is not None:
225
+ result = _get_importance_estimator(estimator._est)
226
+ if result is not None:
227
+ return result
228
+
229
+ return None
230
+
231
+
198
232
  class AutoPipeline(BaseEstimator):
199
233
  def __init__(
200
234
  self,
@@ -202,6 +236,7 @@ class AutoPipeline(BaseEstimator):
202
236
  estimator_features: list[str],
203
237
  predictor_transformers: list[PredictorTransformer] | None = None,
204
238
  granularity: list[str] | None = None,
239
+ aggregation_weight: str | None = None,
205
240
  filters: list[Filter] | None = None,
206
241
  scale_features: bool = False,
207
242
  categorical_handling: CategoricalHandling = "auto",
@@ -216,6 +251,7 @@ class AutoPipeline(BaseEstimator):
216
251
  self.estimator_features = estimator_features
217
252
  self.feature_names = estimator_features # Internal compat
218
253
  self.granularity = granularity or []
254
+ self.aggregation_weight = aggregation_weight
219
255
  self.predictor_transformers = predictor_transformers
220
256
  self.estimator = estimator
221
257
  self.filters = filters or []
@@ -230,6 +266,7 @@ class AutoPipeline(BaseEstimator):
230
266
  self.numeric_features = numeric_features
231
267
  self.remainder = remainder
232
268
  self._cat_feats = []
269
+ self._filter_feature_names: list[str] = []
233
270
 
234
271
  # Auto-compute context features
235
272
  self.context_feature_names = self._compute_context_features()
@@ -242,11 +279,12 @@ class AutoPipeline(BaseEstimator):
242
279
  self._resolved_categorical_handling: CategoricalHandling | None = None
243
280
 
244
281
  def _compute_context_features(self) -> list[str]:
245
- """Auto-compute context features from estimator, granularity, and filters.
282
+ """Auto-compute context features from estimator and granularity.
246
283
 
247
284
  Note: Context from predictor_transformers is tracked separately in
248
285
  context_predictor_transformer_feature_names and is dropped before
249
- the final estimator.
286
+ the final estimator. Filter columns are tracked separately and are
287
+ dropped before the final estimator.
250
288
  """
251
289
  from spforge.transformers._base import PredictorTransformer
252
290
 
@@ -290,9 +328,15 @@ class AutoPipeline(BaseEstimator):
290
328
  # Add granularity columns
291
329
  context.extend(self.granularity)
292
330
 
331
+ # Add aggregation weight column
332
+ if self.aggregation_weight:
333
+ context.append(self.aggregation_weight)
334
+
293
335
  # Add filter columns
336
+ self._filter_feature_names = []
294
337
  for f in self.filters:
295
- context.append(f.column_name)
338
+ if f.column_name not in self._filter_feature_names:
339
+ self._filter_feature_names.append(f.column_name)
296
340
 
297
341
  # Dedupe while preserving order, excluding estimator_features
298
342
  seen = set()
@@ -454,7 +498,11 @@ class AutoPipeline(BaseEstimator):
454
498
  pre = PreprocessorToDataFrame(pre_raw)
455
499
 
456
500
  est = (
457
- GroupByEstimator(self.estimator, granularity=[f"{c}" for c in self.granularity])
501
+ GroupByEstimator(
502
+ self.estimator,
503
+ granularity=[f"{c}" for c in self.granularity],
504
+ aggregation_weight=self.aggregation_weight,
505
+ )
458
506
  if do_groupby
459
507
  else self.estimator
460
508
  )
@@ -506,8 +554,10 @@ class AutoPipeline(BaseEstimator):
506
554
  prev_transformer_feats_out.extend(feats_out)
507
555
 
508
556
  # Use FunctionTransformer with global function for serializability
557
+ drop_filter_cols = set(self._filter_feature_names)
558
+ drop_cols = drop_ctx_set | drop_filter_cols
509
559
  final = FunctionTransformer(
510
- _drop_columns_transformer, validate=False, kw_args={"drop_cols": drop_ctx_set}
560
+ _drop_columns_transformer, validate=False, kw_args={"drop_cols": drop_cols}
511
561
  )
512
562
  steps.append(("final", final))
513
563
 
@@ -538,6 +588,7 @@ class AutoPipeline(BaseEstimator):
538
588
  self.feature_names
539
589
  + self.context_feature_names
540
590
  + self.context_predictor_transformer_feature_names
591
+ + self._filter_feature_names
541
592
  + self.granularity
542
593
  )
543
594
  )
@@ -626,4 +677,117 @@ class AutoPipeline(BaseEstimator):
626
677
  if ctx not in all_features:
627
678
  all_features.append(ctx)
628
679
 
680
+ # Add filter columns (needed for fit-time filtering)
681
+ for col in self._filter_feature_names:
682
+ if col not in all_features:
683
+ all_features.append(col)
684
+
629
685
  return all_features
686
+
687
+ def _get_estimator_feature_names(self) -> list[str]:
688
+ """Get feature names as seen by the final estimator after all transformations."""
689
+ pre_out = list(self.sklearn_pipeline.named_steps["pre"].get_feature_names_out())
690
+
691
+ # Remove context columns dropped by "final" step
692
+ final_step = self.sklearn_pipeline.named_steps["final"]
693
+ drop_cols = final_step.kw_args.get("drop_cols", set()) if final_step.kw_args else set()
694
+ features = [f for f in pre_out if f not in drop_cols]
695
+
696
+ # Remove granularity columns (dropped by GroupByEstimator)
697
+ granularity_set = set(self.granularity)
698
+ features = [f for f in features if f not in granularity_set]
699
+
700
+ # Remove context features (used by wrapper estimators, not inner model)
701
+ context_set = set(self.context_feature_names)
702
+ features = [f for f in features if f not in context_set]
703
+
704
+ # Remove filter columns (used only for fit-time filtering)
705
+ filter_set = set(self._filter_feature_names)
706
+ features = [f for f in features if f not in filter_set]
707
+
708
+ return features
709
+
710
+ def _resolve_importance_feature_names(self, estimator, n_features: int) -> list[str]:
711
+ names = None
712
+ if hasattr(estimator, "feature_names_in_") and estimator.feature_names_in_ is not None:
713
+ names = list(estimator.feature_names_in_)
714
+ elif hasattr(estimator, "feature_name_") and estimator.feature_name_ is not None:
715
+ names = list(estimator.feature_name_)
716
+ elif hasattr(estimator, "feature_names_") and estimator.feature_names_ is not None:
717
+ names = list(estimator.feature_names_)
718
+ if names is None:
719
+ names = self._get_estimator_feature_names()
720
+ if len(names) != n_features:
721
+ raise ValueError(
722
+ f"Feature names length ({len(names)}) does not match importances length ({n_features})."
723
+ )
724
+ return names
725
+
726
+ @property
727
+ def feature_importances_(self) -> pd.DataFrame:
728
+ """Get feature importances from the fitted estimator.
729
+
730
+ Returns a DataFrame with columns ["feature", "importance"] sorted by
731
+ absolute importance descending. Works with tree-based models
732
+ (feature_importances_) and linear models (coef_).
733
+ """
734
+ if self.sklearn_pipeline is None:
735
+ raise RuntimeError("Pipeline not fitted. Call fit() first.")
736
+
737
+ est = self.sklearn_pipeline.named_steps["est"]
738
+ result = _get_importance_estimator(est)
739
+
740
+ if result is None:
741
+ raise RuntimeError(
742
+ "Estimator does not support feature importances. "
743
+ "Requires feature_importances_ or coef_ attribute."
744
+ )
745
+
746
+ inner_est, attr_name = result
747
+ raw = getattr(inner_est, attr_name)
748
+
749
+ if attr_name == "coef_":
750
+ # Linear models: use absolute value of coefficients
751
+ if raw.ndim == 2:
752
+ # Multi-class: average absolute values across classes
753
+ importances = np.abs(raw).mean(axis=0)
754
+ else:
755
+ importances = np.abs(raw)
756
+ else:
757
+ importances = raw
758
+
759
+ feature_names = self._get_estimator_feature_names()
760
+
761
+ df = pd.DataFrame({"feature": feature_names, "importance": importances})
762
+ df = df.sort_values("importance", ascending=False, key=abs).reset_index(drop=True)
763
+ return df
764
+
765
+ @property
766
+ def feature_importance_names(self) -> dict[str, float]:
767
+ """Map deepest estimator feature names to importances."""
768
+ if self.sklearn_pipeline is None:
769
+ raise RuntimeError("Pipeline not fitted. Call fit() first.")
770
+
771
+ est = self.sklearn_pipeline.named_steps["est"]
772
+ result = _get_importance_estimator(est)
773
+
774
+ if result is None:
775
+ raise RuntimeError(
776
+ "Estimator does not support feature importances. "
777
+ "Requires feature_importances_ or coef_ attribute."
778
+ )
779
+
780
+ inner_est, attr_name = result
781
+ raw = getattr(inner_est, attr_name)
782
+
783
+ if attr_name == "coef_":
784
+ if raw.ndim == 2:
785
+ importances = np.abs(raw).mean(axis=0)
786
+ else:
787
+ importances = np.abs(raw)
788
+ else:
789
+ importances = raw
790
+
791
+ importances = np.asarray(importances)
792
+ feature_names = self._resolve_importance_feature_names(inner_est, len(importances))
793
+ return dict(zip(feature_names, importances.tolist()))
@@ -10,10 +10,16 @@ from spforge.transformers._other_transformer import GroupByReducer
10
10
 
11
11
 
12
12
  class GroupByEstimator(BaseEstimator):
13
- def __init__(self, estimator: Any, granularity: list[str] | None = None):
13
+ def __init__(
14
+ self,
15
+ estimator: Any,
16
+ granularity: list[str] | None = None,
17
+ aggregation_weight: str | None = None,
18
+ ):
14
19
  self.estimator = estimator
15
20
  self.granularity = granularity or []
16
- self._reducer = GroupByReducer(self.granularity)
21
+ self.aggregation_weight = aggregation_weight
22
+ self._reducer = GroupByReducer(self.granularity, aggregation_weight=aggregation_weight)
17
23
  self._est = None
18
24
 
19
25
  def __sklearn_is_fitted__(self):
@@ -22,7 +28,9 @@ class GroupByEstimator(BaseEstimator):
22
28
  @nw.narwhalify
23
29
  def fit(self, X: IntoFrameT, y: Any, sample_weight: np.ndarray | None = None):
24
30
  X = X.to_pandas()
25
- self._reducer = GroupByReducer(self.granularity)
31
+ # Backwards compatibility: old pickled objects may not have aggregation_weight
32
+ agg_weight = getattr(self, "aggregation_weight", None)
33
+ self._reducer = GroupByReducer(self.granularity, aggregation_weight=agg_weight)
26
34
  X_red = nw.from_native(self._reducer.fit_transform(X))
27
35
  y_red, sw_red = self._reducer.reduce_y(X, y, sample_weight=sample_weight)
28
36
 
@@ -7,6 +7,7 @@ from spforge.hyperparameter_tuning._default_search_spaces import (
7
7
  get_default_search_space,
8
8
  get_default_student_t_search_space,
9
9
  get_default_team_rating_search_space,
10
+ get_full_player_rating_search_space,
10
11
  )
11
12
  from spforge.hyperparameter_tuning._tuner import (
12
13
  EstimatorHyperparameterTuner,
@@ -28,4 +29,5 @@ __all__ = [
28
29
  "get_default_team_rating_search_space",
29
30
  "get_default_student_t_search_space",
30
31
  "get_default_search_space",
32
+ "get_full_player_rating_search_space",
31
33
  ]
@@ -128,6 +128,7 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
128
128
  Default search space for PlayerRatingGenerator.
129
129
 
130
130
  Focuses on core parameters that have the most impact on performance.
131
+ Excludes performance_predictor and team-based start rating params.
131
132
 
132
133
  Returns:
133
134
  Dictionary mapping parameter names to ParamSpec objects
@@ -163,10 +164,6 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
163
164
  "use_off_def_split": ParamSpec(
164
165
  param_type="bool",
165
166
  ),
166
- "performance_predictor": ParamSpec(
167
- param_type="categorical",
168
- choices=["difference", "mean", "ignore_opponent"],
169
- ),
170
167
  "start_league_quantile": ParamSpec(
171
168
  param_type="float",
172
169
  low=0.05,
@@ -177,24 +174,46 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
177
174
  low=40,
178
175
  high=500,
179
176
  ),
180
- "start_team_rating_subtract": ParamSpec(
181
- param_type="float",
182
- low=0.0,
183
- high=200.0,
184
- ),
185
- "start_team_weight": ParamSpec(
186
- param_type="float",
187
- low=0.0,
188
- high=1.0,
189
- ),
190
- "start_min_match_count_team_rating": ParamSpec(
191
- param_type="int",
192
- low=1,
193
- high=10,
194
- ),
195
177
  }
196
178
 
197
179
 
180
+ def get_full_player_rating_search_space() -> dict[str, ParamSpec]:
181
+ """
182
+ Full search space for PlayerRatingGenerator including all tunable parameters.
183
+
184
+ Includes performance_predictor and team-based start rating parameters.
185
+ Use this when you want to tune all parameters.
186
+
187
+ Returns:
188
+ Dictionary mapping parameter names to ParamSpec objects
189
+ """
190
+ base = get_default_player_rating_search_space()
191
+ base.update(
192
+ {
193
+ "performance_predictor": ParamSpec(
194
+ param_type="categorical",
195
+ choices=["difference", "mean", "ignore_opponent"],
196
+ ),
197
+ "start_team_rating_subtract": ParamSpec(
198
+ param_type="float",
199
+ low=0.0,
200
+ high=200.0,
201
+ ),
202
+ "start_team_weight": ParamSpec(
203
+ param_type="float",
204
+ low=0.0,
205
+ high=1.0,
206
+ ),
207
+ "start_min_match_count_team_rating": ParamSpec(
208
+ param_type="int",
209
+ low=1,
210
+ high=10,
211
+ ),
212
+ }
213
+ )
214
+ return base
215
+
216
+
198
217
  def get_default_team_rating_search_space() -> dict[str, ParamSpec]:
199
218
  """
200
219
  Default search space for TeamRatingGenerator.
@@ -235,10 +254,6 @@ def get_default_team_rating_search_space() -> dict[str, ParamSpec]:
235
254
  "use_off_def_split": ParamSpec(
236
255
  param_type="bool",
237
256
  ),
238
- "performance_predictor": ParamSpec(
239
- param_type="categorical",
240
- choices=["difference", "mean", "ignore_opponent"],
241
- ),
242
257
  }
243
258
 
244
259
 
@@ -91,6 +91,9 @@ class RatingHyperparameterTuner:
91
91
  scorer: BaseScorer,
92
92
  direction: Literal["minimize", "maximize"],
93
93
  param_search_space: dict[str, ParamSpec] | None = None,
94
+ param_ranges: dict[str, tuple[float | int, float | int]] | None = None,
95
+ exclude_params: list[str] | None = None,
96
+ fixed_params: dict[str, Any] | None = None,
94
97
  n_trials: int = 50,
95
98
  n_jobs: int = 1,
96
99
  storage: str | None = None,
@@ -109,6 +112,14 @@ class RatingHyperparameterTuner:
109
112
  scorer: Scorer for evaluation (must have score(df) -> float | dict)
110
113
  direction: "minimize" or "maximize"
111
114
  param_search_space: Custom search space (merges with defaults if provided)
115
+ param_ranges: Easy range override for float/int params. Maps param name to
116
+ (low, high) tuple. Preserves param_type and log scale from defaults.
117
+ Example: {"confidence_weight": (0.2, 1.0)}
118
+ exclude_params: List of param names to exclude from tuning entirely.
119
+ Example: ["performance_predictor", "use_off_def_split"]
120
+ fixed_params: Parameters to fix at specific values (not tuned).
121
+ These values are applied to the rating generator each trial.
122
+ Example: {"performance_predictor": "mean"}
112
123
  n_trials: Number of optimization trials
113
124
  n_jobs: Number of parallel jobs (1 = sequential)
114
125
  storage: Optuna storage URL (e.g., "sqlite:///optuna.db") for persistence
@@ -123,6 +134,9 @@ class RatingHyperparameterTuner:
123
134
  self.scorer = scorer
124
135
  self.direction = direction
125
136
  self.custom_search_space = param_search_space
137
+ self.param_ranges = param_ranges
138
+ self.exclude_params = exclude_params or []
139
+ self.fixed_params = fixed_params or {}
126
140
  self.n_trials = n_trials
127
141
  self.n_jobs = n_jobs
128
142
  self.storage = storage
@@ -196,6 +210,9 @@ class RatingHyperparameterTuner:
196
210
  try:
197
211
  copied_gen = copy.deepcopy(self.rating_generator)
198
212
 
213
+ for param_name, param_value in self.fixed_params.items():
214
+ setattr(copied_gen, param_name, param_value)
215
+
199
216
  trial_params = self._suggest_params(trial, search_space)
200
217
 
201
218
  for param_name, param_value in trial_params.items():
@@ -243,18 +260,54 @@ class RatingHyperparameterTuner:
243
260
  defaults: dict[str, ParamSpec],
244
261
  ) -> dict[str, ParamSpec]:
245
262
  """
246
- Merge custom search space with defaults (custom takes precedence).
263
+ Merge custom search space with defaults.
264
+
265
+ Priority order (highest to lowest):
266
+ 1. exclude_params - removes param entirely
267
+ 2. fixed_params - removes from search (applied separately)
268
+ 3. custom (param_search_space) - full ParamSpec override
269
+ 4. param_ranges - updates only low/high bounds
270
+ 5. defaults - base search space
247
271
 
248
272
  Args:
249
273
  custom: Custom search space (may be None)
250
274
  defaults: Default search space
251
275
 
252
276
  Returns:
253
- Merged search space
277
+ Merged search space (excludes fixed_params, those are applied separately)
254
278
  """
255
279
  merged = defaults.copy()
280
+
281
+ if self.param_ranges:
282
+ for param_name, (low, high) in self.param_ranges.items():
283
+ if param_name not in merged:
284
+ raise ValueError(
285
+ f"param_ranges contains unknown parameter: '{param_name}'. "
286
+ f"Available parameters: {list(merged.keys())}"
287
+ )
288
+ existing = merged[param_name]
289
+ if existing.param_type not in ("float", "int"):
290
+ raise ValueError(
291
+ f"param_ranges can only override float/int parameters. "
292
+ f"'{param_name}' is {existing.param_type}."
293
+ )
294
+ merged[param_name] = ParamSpec(
295
+ param_type=existing.param_type,
296
+ low=low,
297
+ high=high,
298
+ log=existing.log,
299
+ step=existing.step,
300
+ )
301
+
256
302
  if custom:
257
303
  merged.update(custom)
304
+
305
+ for param_name in self.exclude_params:
306
+ merged.pop(param_name, None)
307
+
308
+ for param_name in self.fixed_params:
309
+ merged.pop(param_name, None)
310
+
258
311
  return merged
259
312
 
260
313
  @staticmethod
@@ -250,8 +250,6 @@ class PerformanceWeightsManager(PerformanceManager):
250
250
  )
251
251
  )
252
252
 
253
- sum_weight = sum([w.weight for w in self.weights])
254
-
255
253
  for column_weight in self.weights:
256
254
  weight_col = f"weight__{column_weight.name}"
257
255
  feature_col = column_weight.name
@@ -261,14 +259,14 @@ class PerformanceWeightsManager(PerformanceManager):
261
259
  df = df.with_columns(
262
260
  (
263
261
  nw.col(tmp_out_performance_colum_name)
264
- + (nw.col(weight_col) / sum_weight * (1 - nw.col(feature_name)))
262
+ + (nw.col(weight_col) * (1 - nw.col(feature_name)))
265
263
  ).alias(tmp_out_performance_colum_name)
266
264
  )
267
265
  else:
268
266
  df = df.with_columns(
269
267
  (
270
268
  nw.col(tmp_out_performance_colum_name)
271
- + (nw.col(weight_col) / sum_weight * nw.col(feature_name))
269
+ + (nw.col(weight_col) * nw.col(feature_name))
272
270
  ).alias(tmp_out_performance_colum_name)
273
271
  )
274
272