spforge 0.8.4__py3-none-any.whl → 0.8.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (37)
  1. examples/lol/pipeline_transformer_example.py +69 -86
  2. examples/nba/cross_validation_example.py +4 -11
  3. examples/nba/feature_engineering_example.py +33 -15
  4. examples/nba/game_winner_example.py +24 -14
  5. examples/nba/predictor_transformers_example.py +29 -16
  6. spforge/__init__.py +1 -0
  7. spforge/autopipeline.py +169 -5
  8. spforge/estimator/_group_by_estimator.py +11 -3
  9. spforge/features_generator_pipeline.py +8 -4
  10. spforge/hyperparameter_tuning/__init__.py +12 -0
  11. spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
  12. spforge/hyperparameter_tuning/_tuner.py +192 -0
  13. spforge/performance_transformers/_performance_manager.py +2 -4
  14. spforge/ratings/__init__.py +4 -0
  15. spforge/ratings/_player_rating.py +142 -28
  16. spforge/ratings/league_start_rating_optimizer.py +201 -0
  17. spforge/ratings/start_rating_generator.py +1 -1
  18. spforge/ratings/team_start_rating_generator.py +1 -1
  19. spforge/ratings/utils.py +16 -6
  20. spforge/scorer/_score.py +42 -11
  21. spforge/transformers/_other_transformer.py +38 -8
  22. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA +12 -19
  23. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/RECORD +37 -31
  24. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
  25. tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
  26. tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  27. tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
  28. tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
  29. tests/performance_transformers/test_performance_manager.py +15 -0
  30. tests/ratings/test_player_rating_generator.py +154 -0
  31. tests/ratings/test_player_rating_no_mutation.py +214 -0
  32. tests/ratings/test_utils_scaled_weights.py +136 -0
  33. tests/scorer/test_score.py +232 -0
  34. tests/test_autopipeline.py +336 -6
  35. tests/test_feature_generator_pipeline.py +43 -0
  36. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
  37. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
tests/scorer/test_score.py
@@ -2048,3 +2048,235 @@ def test_all_scorers_handle_all_nan_targets(df_type):
         assert np.isnan(score) or score == 0.0
     except (ValueError, IndexError):
         pass
+SCORER_VALIDATION_CASES = [
+    pytest.param(
+        lambda: MeanBiasScorer(pred_column="pred", target="target", validation_column="is_validation"),
+        lambda: pd.DataFrame(
+            {
+                "pred": [2.0, 0.0],
+                "target": [1.0, 2.0],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="mean_bias",
+    ),
+    pytest.param(
+        lambda: PWMSE(pred_column="pred", target="target", labels=[0, 1], validation_column="is_validation"),
+        lambda: pd.DataFrame(
+            {
+                "pred": [[0.7, 0.3], [0.4, 0.6]],
+                "target": [0, 1],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="pwmse",
+    ),
+    pytest.param(
+        lambda: SklearnScorer(
+            scorer_function=mean_absolute_error, pred_column="pred", target="target", validation_column="is_validation"
+        ),
+        lambda: pd.DataFrame(
+            {
+                "pred": [1.0, 0.0],
+                "target": [1.0, 0.0],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="sklearn",
+    ),
+    pytest.param(
+        lambda: ProbabilisticMeanBias(
+            pred_column="pred", target="target", class_column_name="classes", validation_column="is_validation"
+        ),
+        lambda: pd.DataFrame(
+            {
+                "pred": [[0.2, 0.8], [0.6, 0.4]],
+                "target": [1, 0],
+                "classes": [[0, 1], [0, 1]],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="probabilistic_mean_bias",
+    ),
+    pytest.param(
+        lambda: OrdinalLossScorer(pred_column="pred", target="target", classes=[0, 1], validation_column="is_validation"),
+        lambda: pd.DataFrame(
+            {
+                "pred": [[0.2, 0.8], [0.6, 0.4]],
+                "target": [1, 0],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="ordinal_loss",
+    ),
+    pytest.param(
+        lambda: ThresholdEventScorer(
+            dist_column="dist",
+            threshold_column="threshold",
+            outcome_column="outcome",
+            comparator=Operator.GREATER_THAN_OR_EQUALS,
+            validation_column="is_validation",
+        ),
+        lambda: pd.DataFrame(
+            {
+                "dist": [[0.2, 0.8], [0.6, 0.4], [0.3, 0.7]],
+                "threshold": [0.5, 0.2, 0.3],
+                "outcome": [1, 0, 1],
+                "is_validation": [1, 1, 0],
+            }
+        ),
+        id="threshold_event",
+    ),
+]
+
+
+@pytest.mark.parametrize("scorer_factory, df_factory", SCORER_VALIDATION_CASES)
+def test_scorers_respect_validation_column(scorer_factory, df_factory):
+    """Scorers should filter on validation_column when specified."""
+    df = df_factory()
+    df_valid = df[df["is_validation"] == 1]
+    score_all = scorer_factory().score(df)
+    score_valid = scorer_factory().score(df_valid)
+    assert score_all == score_valid
+
+
+# ============================================================================
+# PWMSE evaluation_labels Extension Tests
+# ============================================================================
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_extends_predictions(df_type):
+    """PWMSE with evaluation_labels as a superset extends predictions with small probabilities."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.3, 0.5, 0.2],
+                [0.2, 0.6, 0.2],
+            ],
+            "target": [0, 1],
+        },
+    )
+
+    scorer = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+        evaluation_labels=[-1, 0, 1, 2, 3],
+    )
+    score = scorer.score(df)
+
+    n_eval_labels = 5
+    eps = 1e-5
+    preds_original = np.array([[0.3, 0.5, 0.2], [0.2, 0.6, 0.2]])
+    extended = np.full((2, n_eval_labels), eps, dtype=np.float64)
+    extended[:, 1] = preds_original[:, 0]
+    extended[:, 2] = preds_original[:, 1]
+    extended[:, 3] = preds_original[:, 2]
+    row_sums = extended.sum(axis=1, keepdims=True)
+    preds_renorm = extended / row_sums
+
+    eval_labels = np.array([-1, 0, 1, 2, 3], dtype=np.float64)
+    targets = np.array([0, 1], dtype=np.float64)
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    expected = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    assert abs(score - expected) < 1e-10
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_exact_match(df_type):
+    """PWMSE with evaluation_labels identical to labels (no-op)."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.3, 0.5, 0.2],
+                [0.2, 0.6, 0.2],
+            ],
+            "target": [0, 1],
+        },
+    )
+
+    scorer_with_eval = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+        evaluation_labels=[0, 1, 2],
+    )
+    scorer_without_eval = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+    )
+
+    score_with = scorer_with_eval.score(df)
+    score_without = scorer_without_eval.score(df)
+
+    assert abs(score_with - score_without) < 1e-10
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_partial_overlap_raises(df_type):
+    """PWMSE with partial overlap between labels and evaluation_labels raises."""
+    with pytest.raises(ValueError, match="evaluation_labels must be a subset or superset"):
+        PWMSE(
+            pred_column="pred",
+            target="target",
+            labels=[0, 1, 2],
+            evaluation_labels=[1, 2, 3],
+        )
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_extends_with_compare_to_naive(df_type):
+    """PWMSE extension mode works correctly with compare_to_naive."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.8, 0.15, 0.05],
+                [0.1, 0.7, 0.2],
+                [0.05, 0.15, 0.8],
+                [0.3, 0.4, 0.3],
+            ],
+            "target": [0, 1, 2, 1],
+        },
+    )
+
+    scorer = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+        evaluation_labels=[-1, 0, 1, 2, 3],
+        compare_to_naive=True,
+    )
+    score = scorer.score(df)
+
+    n_eval_labels = 5
+    eps = 1e-5
+    preds_original = np.array([
+        [0.8, 0.15, 0.05],
+        [0.1, 0.7, 0.2],
+        [0.05, 0.15, 0.8],
+        [0.3, 0.4, 0.3],
+    ])
+    extended = np.full((4, n_eval_labels), eps, dtype=np.float64)
+    extended[:, 1] = preds_original[:, 0]
+    extended[:, 2] = preds_original[:, 1]
+    extended[:, 3] = preds_original[:, 2]
+    row_sums = extended.sum(axis=1, keepdims=True)
+    preds_renorm = extended / row_sums
+
+    eval_labels = np.array([-1, 0, 1, 2, 3], dtype=np.float64)
+    targets = np.array([0, 1, 2, 1], dtype=np.float64)
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    model_score = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    naive_probs = np.array([0.0, 0.25, 0.5, 0.25, 0.0])
+    naive_preds = np.tile(naive_probs, (4, 1))
+    naive_score = float((diffs_sqd * naive_preds).sum(axis=1).mean())
+
+    expected = naive_score - model_score
+    assert abs(score - expected) < 1e-10
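The four evaluation_labels tests above all recompute the expected score by hand, so the extension rule they pin down is easy to state: predictions defined over labels are re-indexed into the wider evaluation_labels grid, labels the model never predicts get a small epsilon probability (1e-5 in these tests), each row is renormalized to sum to 1, and the score is the probability-weighted squared distance between each evaluation label and the target. A standalone sketch of that computation using only numpy and the fixture values from the first test (the eps value is taken from the tests, not from the library source):

import numpy as np

# Fixture values from test_pwmse__evaluation_labels_extends_predictions.
labels = [0, 1, 2]
eval_labels = [-1, 0, 1, 2, 3]
preds = np.array([[0.3, 0.5, 0.2], [0.2, 0.6, 0.2]])
targets = np.array([0.0, 1.0])

eps = 1e-5  # probability mass given to evaluation labels outside `labels`
extended = np.full((len(preds), len(eval_labels)), eps)
for j, label in enumerate(labels):
    extended[:, eval_labels.index(label)] = preds[:, j]
extended /= extended.sum(axis=1, keepdims=True)  # renormalize each row to 1

# Probability-weighted squared error over the evaluation grid.
grid = np.asarray(eval_labels, dtype=np.float64)
diffs_sqd = (grid[None, :] - targets[:, None]) ** 2
pwmse = float((diffs_sqd * extended).sum(axis=1).mean())
print(pwmse)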
tests/test_autopipeline.py
@@ -12,6 +12,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
 
 from spforge import AutoPipeline
 from spforge.estimator import SkLearnEnhancerEstimator
+from spforge.scorer import Filter, Operator
 from spforge.transformers import EstimatorTransformer
 
 
@@ -231,6 +232,27 @@ def test_predict_proba(df_clf):
     assert np.allclose(proba.sum(axis=1), 1.0, atol=1e-6)
 
 
+def test_filter_columns_not_passed_to_estimator(frame):
+    df_pd = pd.DataFrame(
+        {"x": [1.0, 2.0, 3.0, 4.0], "keep": [1, 0, 1, 0], "y": [1.0, 2.0, 3.0, 4.0]}
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    model = AutoPipeline(
+        estimator=CaptureEstimator(),
+        estimator_features=["x"],
+        filters=[Filter(column_name="keep", value=1, operator=Operator.EQUALS)],
+    )
+
+    X = _select(df, ["x", "keep"])
+    y = _col(df, "y")
+    model.fit(X, y=y)
+
+    est = _inner_estimator(model)
+    assert "keep" in model.required_features
+    assert "keep" not in est.fit_columns
+
+
 def test_predict_proba_raises_if_not_supported(df_reg):
     model = AutoPipeline(
         estimator=LinearRegression(),
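A contract worth noting in the new filter test: a column referenced only by a Filter still counts toward AutoPipeline.required_features, so callers must pass it at fit and predict time, but it is never forwarded to the wrapped estimator. A minimal usage sketch under that assumption, with a stock sklearn estimator standing in for the test's CaptureEstimator (that rows failing the filter are excluded during fitting is implied by the Filter/Operator.EQUALS semantics rather than asserted by this test):

import pandas as pd
from sklearn.linear_model import LinearRegression

from spforge import AutoPipeline
from spforge.scorer import Filter, Operator

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "keep": [1, 0, 1, 0]})
y = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")

model = AutoPipeline(
    estimator=LinearRegression(),
    estimator_features=["x"],  # "keep" is deliberately not a feature
    filters=[Filter(column_name="keep", value=1, operator=Operator.EQUALS)],
)
model.fit(df[["x", "keep"]], y=y)

# The filter column is required input but not an estimator feature.
assert "keep" in model.required_features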
@@ -306,7 +328,18 @@ def test_infer_categorical_from_feature_names_when_only_numeric_features_given(d
     assert any(c.startswith("cat") for c in cap.fit_columns)
 
 
-def test_granularity_groups_rows_before_estimator_fit_and_predict(df_reg):
+def test_granularity_groups_rows_before_estimator_fit_and_predict(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2", "g3", "g3"],
+            "num1": [1.0, 2.0, np.nan, 4.0, 5.0, 6.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, np.nan, 60.0],
+            "cat1": ["a", "b", "a", None, "b", "c"],
+            "y": [1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
     model = AutoPipeline(
         estimator=CaptureEstimator(),
         estimator_features=["gameid", "num1", "num2", "cat1"],
@@ -317,16 +350,16 @@ def test_granularity_groups_rows_before_estimator_fit_and_predict(df_reg):
         remainder="drop",
     )
 
-    X = _select(df_reg, ["gameid", "num1", "num2", "cat1"])
-    y = _col(df_reg, "y")
+    X = _select(df, ["gameid", "num1", "num2", "cat1"])
+    y = _col(df, "y")
     model.fit(X, y=y)
 
     inner = _inner_estimator(model)
 
-    if isinstance(df_reg, pl.DataFrame):
-        n_groups = df_reg.select(pl.col("gameid").n_unique()).item()
+    if isinstance(df, pl.DataFrame):
+        n_groups = df.select(pl.col("gameid").n_unique()).item()
     else:
-        n_groups = df_reg["gameid"].nunique()
+        n_groups = df["gameid"].nunique()
 
     assert inner.fit_shape[0] == n_groups
 
@@ -551,3 +584,300 @@ def test_autopipeline_is_picklable_after_fit():
     model.fit(df, y)
 
     pickle.dumps(model)
+
+
+# --- Feature Importances Tests ---
+
+
+def test_feature_importances__tree_model():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+            "cat1": ["a", "b", "a", "b", "a", "b"],
+        }
+    )
+    y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="y")
+
+    model = AutoPipeline(
+        estimator=RandomForestRegressor(n_estimators=5, random_state=42),
+        estimator_features=["num1", "num2", "cat1"],
+        categorical_handling="ordinal",
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    assert len(importances) == 3
+    assert set(importances["feature"].tolist()) == {"num1", "num2", "cat1"}
+    assert all(importances["importance"] >= 0)
+
+
+def test_feature_importances__linear_model():
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0],
+        }
+    )
+    y = pd.Series([0, 1, 0, 1, 0, 1, 0, 1], name="y")
+
+    model = AutoPipeline(
+        estimator=LogisticRegression(max_iter=1000),
+        estimator_features=["num1", "num2"],
+        scale_features=True,
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    assert len(importances) == 2
+    assert set(importances["feature"].tolist()) == {"num1", "num2"}
+    assert all(importances["importance"] >= 0)
+
+
+def test_feature_importances__not_fitted_raises():
+    model = AutoPipeline(
+        estimator=LinearRegression(),
+        estimator_features=["x"],
+    )
+
+    with pytest.raises(RuntimeError, match="Pipeline not fitted"):
+        _ = model.feature_importances_
+
+
+def test_feature_importances__unsupported_estimator_raises():
+    df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
+    y = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")
+
+    model = AutoPipeline(
+        estimator=DummyRegressor(),
+        estimator_features=["x"],
+    )
+    model.fit(df, y)
+
+    with pytest.raises(RuntimeError, match="does not support feature importances"):
+        _ = model.feature_importances_
+
+
+def test_feature_importances__with_sklearn_enhancer():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+            "start_date": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", "2022-01-05", "2022-01-06"],
+        }
+    )
+    y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="y")
+
+    inner = RandomForestRegressor(n_estimators=5, random_state=42)
+    enhancer = SkLearnEnhancerEstimator(
+        estimator=inner,
+        date_column="start_date",
+        day_weight_epsilon=0.1,
+    )
+
+    model = AutoPipeline(
+        estimator=enhancer,
+        estimator_features=["num1", "num2"],
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    assert len(importances) == 2
+    assert set(importances["feature"].tolist()) == {"num1", "num2"}
+
+
+def test_feature_importances__onehot_features():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            "cat1": ["a", "b", "c", "a", "b", "c"],
+        }
+    )
+    y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="y")
+
+    model = AutoPipeline(
+        estimator=RandomForestRegressor(n_estimators=5, random_state=42),
+        estimator_features=["num1", "cat1"],
+        categorical_handling="onehot",
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    # Should have expanded features: num1 + cat1_a, cat1_b, cat1_c
+    assert len(importances) == 4
+    assert "num1" in importances["feature"].tolist()
+    assert any("cat1_" in f for f in importances["feature"].tolist())
+
+
+def test_feature_importance_names__granularity_uses_deep_feature_names():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2"],
+            "num1": [1.0, 2.0, 3.0, 4.0],
+            "num2": [10.0, 20.0, 30.0, 40.0],
+            "y": [1.0, 1.0, 2.0, 2.0],
+        }
+    )
+    y = df["y"]
+
+    model = AutoPipeline(
+        estimator=RandomForestRegressor(n_estimators=5, random_state=42),
+        estimator_features=["gameid", "num1", "num2"],
+        predictor_transformers=[AddConstantPredictionTransformer(col_name="const_pred")],
+        granularity=["gameid"],
+        categorical_features=["gameid"],
+        categorical_handling="ordinal",
+        remainder="drop",
+    )
+    model.fit(df, y)
+
+    names = model.feature_importance_names
+
+    inner = _inner_estimator(model)
+    assert list(names.keys()) == list(inner.feature_names_in_)
+    assert "gameid" not in names
+    assert "const_pred" in names
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_granularity_with_aggregation_weight__features_weighted(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2"],
+            "num1": [10.0, 30.0, 20.0, 40.0],
+            "weight": [0.25, 0.75, 0.5, 0.5],
+            "y": [1.0, 1.0, 2.0, 2.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    cap = CaptureEstimator()
+    model = AutoPipeline(
+        estimator=cap,
+        estimator_features=["num1"],
+        granularity=["gameid"],
+        aggregation_weight="weight",
+        remainder="drop",
+    )
+
+    X = _select(df, ["gameid", "num1", "weight"])
+    y = _col(df, "y")
+    model.fit(X, y=y)
+
+    inner = _inner_estimator(model)
+    assert inner.fit_shape[0] == 2
+
+    preds = model.predict(X)
+    assert preds.shape[0] == len(X)
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_granularity_aggregation_weight__weighted_mean_correct(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1"],
+            "num1": [10.0, 30.0],
+            "weight": [0.25, 0.75],
+            "y": [1.0, 1.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"], aggregation_weight="weight")
+    transformed = reducer.fit_transform(df)
+
+    if frame == "pl":
+        num1_val = transformed["num1"].to_list()[0]
+    else:
+        num1_val = transformed["num1"].iloc[0]
+
+    expected = (10.0 * 0.25 + 30.0 * 0.75) / (0.25 + 0.75)
+    assert abs(num1_val - expected) < 1e-6
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_reduce_y_raises_when_target_not_uniform_per_group(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1"],
+            "num1": [10.0, 30.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"])
+
+    y = np.array([1.0, 2.0])
+    with pytest.raises(ValueError, match="Target.*must be uniform"):
+        reducer.reduce_y(df, y)
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_reduce_y_works_when_target_uniform_per_group(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2"],
+            "num1": [10.0, 30.0, 20.0, 40.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"])
+
+    y = np.array([1.0, 1.0, 2.0, 2.0])
+    y_out, _ = reducer.reduce_y(df, y)
+
+    assert len(y_out) == 2
+    assert set(y_out) == {1.0, 2.0}
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_aggregation_weight_sums_weight_column(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1"],
+            "num1": [10.0, 30.0],
+            "weight": [0.25, 0.75],
+            "y": [1.0, 1.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"], aggregation_weight="weight")
+    transformed = reducer.fit_transform(df)
+
+    if frame == "pl":
+        weight_val = transformed["weight"].to_list()[0]
+    else:
+        weight_val = transformed["weight"].iloc[0]
+
+    expected = 0.25 + 0.75
+    assert abs(weight_val - expected) < 1e-6
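The GroupByReducer tests above fix its aggregation contract: numeric feature columns collapse to a per-group weighted mean, the aggregation_weight column itself is summed, and reduce_y refuses targets that vary within a group. A small sketch exercising that contract directly (the import path is a private module, as in the tests, so it may move between releases):

import numpy as np
import pandas as pd

from spforge.transformers._other_transformer import GroupByReducer

df = pd.DataFrame(
    {"gameid": ["g1", "g1"], "num1": [10.0, 30.0], "weight": [0.25, 0.75]}
)

reducer = GroupByReducer(granularity=["gameid"], aggregation_weight="weight")
reduced = reducer.fit_transform(df)

# num1 -> (10 * 0.25 + 30 * 0.75) / (0.25 + 0.75) = 25.0; weight -> 0.25 + 0.75 = 1.0
assert abs(reduced["num1"].iloc[0] - 25.0) < 1e-6
assert abs(reduced["weight"].iloc[0] - 1.0) < 1e-6

# reduce_y collapses the target alongside the features, but only when the
# target is constant within each group; otherwise it raises ValueError.
df2 = pd.DataFrame(
    {"gameid": ["g1", "g1", "g2", "g2"], "num1": [10.0, 30.0, 20.0, 40.0]}
)
y = np.array([1.0, 1.0, 2.0, 2.0])
y_out, _ = GroupByReducer(granularity=["gameid"]).reduce_y(df2, y)
assert len(y_out) == 2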
tests/test_feature_generator_pipeline.py
@@ -16,6 +16,49 @@ def column_names():
     )
 
 
+class PolarsOnlyGenerator:
+    def __init__(self):
+        self._features_out = ["polars_only_feature"]
+
+    @property
+    def features_out(self):
+        return self._features_out
+
+    def fit_transform(self, df, column_names=None):
+        if not isinstance(df, pl.DataFrame):
+            raise TypeError("Expected polars DataFrame")
+        return df.with_columns((pl.col("points") * 2).alias("polars_only_feature"))
+
+    def transform(self, df):
+        if not isinstance(df, pl.DataFrame):
+            raise TypeError("Expected polars DataFrame")
+        return df.with_columns((pl.col("points") * 2).alias("polars_only_feature"))
+
+    def future_transform(self, df):
+        return self.transform(df)
+
+
+def test_feature_generator_pipeline__passes_native_polars_to_custom_generator(column_names):
+    data = pl.DataFrame(
+        {
+            "game_id": [1, 1],
+            "team_id": ["A", "B"],
+            "player_id": ["p1", "p2"],
+            "date": pd.to_datetime(["2023-01-01", "2023-01-01"]),
+            "points": [10, 15],
+        }
+    )
+
+    pipeline = FeatureGeneratorPipeline(
+        feature_generators=[PolarsOnlyGenerator()],
+        column_names=column_names,
+    )
+
+    result = pipeline.fit_transform(data, column_names=column_names)
+
+    assert "polars_only_feature" in result.columns
+
+
 @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
 def test_feature_generator_pipeline__fit_transform_preserves_row_count(df_type, column_names):
     """FeatureGeneratorPipeline.fit_transform should preserve row count."""