spforge 0.8.4__py3-none-any.whl → 0.8.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spforge might be problematic. Click here for more details.

Files changed (37)
  1. examples/lol/pipeline_transformer_example.py +69 -86
  2. examples/nba/cross_validation_example.py +4 -11
  3. examples/nba/feature_engineering_example.py +33 -15
  4. examples/nba/game_winner_example.py +24 -14
  5. examples/nba/predictor_transformers_example.py +29 -16
  6. spforge/__init__.py +1 -0
  7. spforge/autopipeline.py +169 -5
  8. spforge/estimator/_group_by_estimator.py +11 -3
  9. spforge/features_generator_pipeline.py +8 -4
  10. spforge/hyperparameter_tuning/__init__.py +12 -0
  11. spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
  12. spforge/hyperparameter_tuning/_tuner.py +192 -0
  13. spforge/performance_transformers/_performance_manager.py +2 -4
  14. spforge/ratings/__init__.py +4 -0
  15. spforge/ratings/_player_rating.py +142 -28
  16. spforge/ratings/league_start_rating_optimizer.py +201 -0
  17. spforge/ratings/start_rating_generator.py +1 -1
  18. spforge/ratings/team_start_rating_generator.py +1 -1
  19. spforge/ratings/utils.py +16 -6
  20. spforge/scorer/_score.py +42 -11
  21. spforge/transformers/_other_transformer.py +38 -8
  22. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/METADATA +12 -19
  23. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/RECORD +37 -31
  24. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
  25. tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
  26. tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  27. tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
  28. tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
  29. tests/performance_transformers/test_performance_manager.py +15 -0
  30. tests/ratings/test_player_rating_generator.py +154 -0
  31. tests/ratings/test_player_rating_no_mutation.py +214 -0
  32. tests/ratings/test_utils_scaled_weights.py +136 -0
  33. tests/scorer/test_score.py +232 -0
  34. tests/test_autopipeline.py +336 -6
  35. tests/test_feature_generator_pipeline.py +43 -0
  36. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
  37. {spforge-0.8.4.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
@@ -1,123 +1,106 @@
1
+ import polars as pl
1
2
  from lightgbm import LGBMRegressor
2
- from sklearn.linear_model import LogisticRegression
3
3
 
4
4
  from examples import get_sub_sample_lol_data
5
5
  from spforge import AutoPipeline, ColumnNames, FeatureGeneratorPipeline
6
- from spforge.cross_validator import MatchKFoldCrossValidator
7
- from spforge.distributions import (
8
- NegativeBinomialEstimator,
9
- )
6
+ from spforge.distributions import NegativeBinomialEstimator
10
7
  from spforge.feature_generator import LagTransformer, RollingWindowTransformer
11
- from spforge.performance_transformers._performance_manager import ColumnWeight
12
- from spforge.ratings import (
13
- PlayerRatingGenerator,
14
- RatingKnownFeatures,
15
- )
8
+ from spforge.transformers import EstimatorTransformer
16
9
 
17
10
  column_names = ColumnNames(
18
11
  team_id="teamname",
19
12
  match_id="gameid",
20
13
  start_date="date",
21
- player_id="playername",
14
+ player_id="player_uid",
22
15
  league="league",
23
16
  position="position",
24
17
  )
25
- df = get_sub_sample_lol_data(as_pandas=True)
18
+
19
+ df = get_sub_sample_lol_data(as_pandas=False, as_polars=True)
26
20
  df = (
27
- df.loc[lambda x: x.position != "team"]
28
- .assign(team_count=df.groupby("gameid")["teamname"].transform("nunique"))
29
- .loc[lambda x: x.team_count == 2]
30
- .assign(player_count=df.groupby(["gameid", "teamname"])["playername"].transform("nunique"))
31
- .loc[lambda x: x.player_count == 5]
21
+ df.with_columns(
22
+ pl.concat_str([pl.col("playername"), pl.col("teamname")], separator="__").alias(
23
+ column_names.player_id
24
+ )
25
+ )
26
+ .filter(pl.col(column_names.position) != "team")
27
+ .with_columns(
28
+ pl.col(column_names.team_id)
29
+ .n_unique()
30
+ .over(column_names.match_id)
31
+ .alias("team_count"),
32
+ pl.col(column_names.player_id)
33
+ .n_unique()
34
+ .over([column_names.match_id, column_names.team_id])
35
+ .alias("player_count"),
36
+ )
37
+ .filter((pl.col("team_count") == 2) & (pl.col("player_count") == 5))
38
+ .drop(["team_count", "player_count"])
39
+ .unique(subset=[column_names.match_id, column_names.player_id, column_names.team_id])
40
+ .sort(
41
+ [
42
+ column_names.start_date,
43
+ column_names.match_id,
44
+ column_names.team_id,
45
+ column_names.player_id,
46
+ ]
47
+ )
32
48
  )
33
- df = df.assign(team_count=df.groupby("gameid")["teamname"].transform("nunique")).loc[
34
- lambda x: x.team_count == 2
35
- ]
36
-
37
- df = df.drop_duplicates(subset=["gameid", "playername", "teamname"])
38
49
 
39
- # Pretends the last 10 games are future games. The model will be trained on everything before that.
40
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
41
- historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
42
- future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["result"])
43
- rating_generator_player_kills = PlayerRatingGenerator(
44
- features_out=[RatingKnownFeatures.PLAYER_RATING],
45
- performance_column="performance_kills",
46
- auto_scale_performance=True,
47
- performance_weights=[ColumnWeight(name="kills", weight=1)],
50
+ most_recent_10_games = (
51
+ df.select(pl.col(column_names.match_id))
52
+ .unique(maintain_order=True)
53
+ .tail(10)
54
+ .get_column(column_names.match_id)
55
+ .to_list()
48
56
  )
49
- rating_generator_result = PlayerRatingGenerator(
50
- features_out=[RatingKnownFeatures.TEAM_RATING_DIFFERENCE_PROJECTED],
51
- performance_column="result",
52
- non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
53
- )
54
-
57
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
58
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("kills")
55
59
 
56
- lag_generators = [
57
- LagTransformer(
58
- features=["kills", "deaths", "result"], lag_length=3, granularity=["playername"]
59
- ),
60
+ lag_transformers = [
61
+ LagTransformer(features=["kills", "deaths"], lag_length=3, granularity=["player_uid"]),
60
62
  RollingWindowTransformer(
61
- features=["kills", "deaths", "result"],
63
+ features=["kills", "deaths"],
62
64
  window=20,
63
65
  min_periods=1,
64
- granularity=["playername"],
66
+ granularity=["player_uid"],
65
67
  ),
66
68
  ]
67
69
 
68
70
  features_generator = FeatureGeneratorPipeline(
69
71
  column_names=column_names,
70
- feature_generators=[rating_generator_player_kills, rating_generator_result, *lag_generators],
72
+ feature_generators=lag_transformers,
71
73
  )
72
74
 
73
- historical_df = features_generator.fit_transform(historical_df)
74
-
75
- game_winner_predictor = SklearnPredictor(
76
- estimator=LogisticRegression(),
77
- target="result",
78
- features=rating_generator_result.features_out,
79
- granularity=[column_names.match_id, column_names.team_id],
80
- )
81
- game_winner_pipeline = AutoPipeline(
82
- predictor=game_winner_predictor, one_hot_encode_cat_features=True, impute_missing_values=True
83
- )
75
+ historical_df = features_generator.fit_transform(historical_df).to_pandas()
76
+ future_df = features_generator.future_transform(future_df).to_pandas()
84
77
 
85
- player_kills_predictor = SklearnPredictor(
86
- estimator=LGBMRegressor(verbose=-100),
87
- target="kills",
88
- features=[game_winner_predictor.pred_column, *features_generator.features_out],
78
+ point_estimate_transformer = EstimatorTransformer(
79
+ prediction_column_name="kills_estimate",
80
+ estimator=LGBMRegressor(verbose=-100, random_state=42),
81
+ features=features_generator.features_out,
89
82
  )
90
83
 
91
- cross_validator_game_winner = MatchKFoldCrossValidator(
92
- date_column_name=column_names.start_date,
93
- match_id_column_name=column_names.match_id,
94
- estimator=game_winner_predictor,
84
+ probability_estimator = NegativeBinomialEstimator(
85
+ max_value=15,
86
+ point_estimate_pred_column="kills_estimate",
87
+ r_specific_granularity=[column_names.player_id],
88
+ predicted_r_weight=1,
89
+ column_names=column_names,
95
90
  )
96
91
 
97
- game_winner_predictor.train(historical_df)
98
- historical_df = cross_validator_game_winner.generate_validation_df(historical_df)
99
-
100
- cross_validator_player_kills = MatchKFoldCrossValidator(
101
- date_column_name=column_names.start_date,
102
- match_id_column_name=column_names.match_id,
103
- estimator=player_kills_predictor,
92
+ pipeline = AutoPipeline(
93
+ estimator=probability_estimator,
94
+ estimator_features=features_generator.features_out,
95
+ predictor_transformers=[point_estimate_transformer],
104
96
  )
105
97
 
106
- player_kills_predictor.train(historical_df)
107
- print(player_kills_predictor.features)
108
- historical_df = cross_validator_player_kills.generate_validation_df(historical_df)
109
-
110
- future_df = features_generator.future_transform(future_df)
111
- future_df = game_winner_predictor.predict(future_df)
112
- future_df = player_kills_predictor.predict(future_df)
113
-
114
- probability_predictor = NegativeBinomialEstimator(
115
- target="kills",
116
- point_estimate_pred_column=player_kills_predictor.pred_column,
117
- max_value=15,
118
- )
98
+ pipeline.fit(X=historical_df, y=historical_df["kills"])
119
99
 
120
- probability_predictor.train(historical_df)
121
- future_df = probability_predictor.predict(future_df)
100
+ future_point_estimates = pipeline.predict(future_df)
101
+ future_probabilities = pipeline.predict_proba(future_df)
102
+ future_df["kills_pred"] = future_point_estimates
122
103
 
123
- print(future_df.head(10))
104
+ print(future_df.head(5))
105
+ print(f"Probability matrix shape: {future_probabilities.shape}")
106
+ print(f"First row probabilities (0-15 kills): {future_probabilities[0]}")
@@ -51,7 +51,7 @@ print("\nApproach 1: LGBMClassifier (direct probability prediction)")
51
51
  print("-" * 70)
52
52
  pipeline_classifier = AutoPipeline(
53
53
  estimator=LGBMClassifier(verbose=-100, random_state=42),
54
- feature_names=features_generator.features_out,
54
+ estimator_features=features_generator.features_out,
55
55
  )
56
56
 
57
57
  cross_validator_classifier = MatchKFoldCrossValidator(
@@ -60,7 +60,7 @@ cross_validator_classifier = MatchKFoldCrossValidator(
60
60
  estimator=pipeline_classifier,
61
61
  prediction_column_name="points_probabilities_classifier",
62
62
  target_column="points",
63
- features=pipeline_classifier.feature_names,
63
+ features=pipeline_classifier.required_features,
64
64
  )
65
65
  validation_df_classifier = cross_validator_classifier.generate_validation_df(df=df)
66
66
 
@@ -80,20 +80,13 @@ print("-" * 70)
80
80
  predictor_negbin = NegativeBinomialEstimator(
81
81
  max_value=40,
82
82
  point_estimate_pred_column="points_estimate",
83
- r_specific_granularity=["player_id"],
84
83
  predicted_r_weight=1,
85
84
  column_names=column_names,
86
85
  )
87
86
 
88
87
  pipeline_negbin = AutoPipeline(
89
88
  estimator=predictor_negbin,
90
- feature_names=features_generator.features_out,
91
- context_feature_names=[
92
- column_names.player_id,
93
- column_names.start_date,
94
- column_names.team_id,
95
- column_names.match_id,
96
- ],
89
+ estimator_features=features_generator.features_out,
97
90
  predictor_transformers=[
98
91
  EstimatorTransformer(
99
92
  prediction_column_name="points_estimate",
@@ -109,7 +102,7 @@ cross_validator_negbin = MatchKFoldCrossValidator(
109
102
  estimator=pipeline_negbin,
110
103
  prediction_column_name="points_probabilities_negbin",
111
104
  target_column="points",
112
- features=pipeline_negbin.context_feature_names + pipeline_negbin.feature_names,
105
+ features=pipeline_negbin.required_features,
113
106
  )
114
107
  validation_df_negbin = cross_validator_negbin.generate_validation_df(df=df)
115
108
 
@@ -13,7 +13,7 @@ Key concepts covered:
13
13
  - State management: fit_transform vs future_transform
14
14
  """
15
15
 
16
- import pandas as pd
16
+ import polars as pl
17
17
 
18
18
  from examples import get_sub_sample_nba_data
19
19
  from spforge import FeatureGeneratorPipeline
@@ -22,7 +22,7 @@ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
22
22
  from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
23
23
 
24
24
  # Load sample NBA data
25
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
25
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
26
26
 
27
27
  # Define column mappings for your dataset
28
28
  # This tells spforge which columns contain team IDs, player IDs, dates, etc.
@@ -35,7 +35,7 @@ column_names = ColumnNames(
35
35
 
36
36
  # CRITICAL: Always sort data chronologically before generating features
37
37
  # This ensures temporal ordering and prevents future leakage (using future data to predict the past)
38
- df = df.sort_values(
38
+ df = df.sort(
39
39
  [
40
40
  column_names.start_date, # First by date
41
41
  column_names.match_id, # Then by match
@@ -46,13 +46,21 @@ df = df.sort_values(
46
46
 
47
47
  # Keep only games with exactly 2 teams (filter out invalid data)
48
48
  df = (
49
- df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
50
- .loc[lambda x: x.team_count == 2]
51
- .drop(columns=["team_count"])
49
+ df.with_columns(
50
+ pl.col(column_names.team_id)
51
+ .n_unique()
52
+ .over(column_names.match_id)
53
+ .alias("team_count")
54
+ )
55
+ .filter(pl.col("team_count") == 2)
56
+ .drop("team_count")
52
57
  )
53
58
 
54
- print(f"Dataset: {len(df)} rows, {df[column_names.match_id].nunique()} games")
55
- print(f"Date range: {df[column_names.start_date].min()} to {df[column_names.start_date].max()}")
59
+ match_count = df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
60
+ start_date = df.select(pl.col(column_names.start_date).min()).to_series().item()
61
+ end_date = df.select(pl.col(column_names.start_date).max()).to_series().item()
62
+ print(f"Dataset: {len(df)} rows, {match_count} games")
63
+ print(f"Date range: {start_date} to {end_date}")
56
64
  print()
57
65
 
58
66
  # ====================================================================
@@ -125,12 +133,22 @@ print()
125
133
  # ====================================================================
126
134
 
127
135
  # Split data into historical (for training) and future (for prediction)
128
- most_recent_5_games = df[column_names.match_id].unique()[-5:]
129
- historical_df = df[~df[column_names.match_id].isin(most_recent_5_games)].copy()
130
- future_df = df[df[column_names.match_id].isin(most_recent_5_games)].copy()
136
+ most_recent_5_games = (
137
+ df.select(pl.col(column_names.match_id))
138
+ .unique(maintain_order=True)
139
+ .tail(5)
140
+ .get_column(column_names.match_id)
141
+ .to_list()
142
+ )
143
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_5_games))
144
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_5_games))
131
145
 
132
- print(f"Historical data: {len(historical_df)} rows, {historical_df[column_names.match_id].nunique()} games")
133
- print(f"Future data: {len(future_df)} rows, {future_df[column_names.match_id].nunique()} games")
146
+ historical_games = (
147
+ historical_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
148
+ )
149
+ future_games = future_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
150
+ print(f"Historical data: {len(historical_df)} rows, {historical_games} games")
151
+ print(f"Future data: {len(future_df)} rows, {future_games} games")
134
152
  print()
135
153
 
136
154
  # FIT_TRANSFORM: Learn from historical data
@@ -138,7 +156,7 @@ print()
138
156
  # - Lags/rolling windows build up from initial games
139
157
  # - Internal state (ratings, windows) is MUTATED
140
158
  print("Applying fit_transform to historical data...")
141
- historical_df = features_pipeline.fit_transform(historical_df)
159
+ historical_df = features_pipeline.fit_transform(historical_df).to_pandas()
142
160
  print(f" Generated {len(features_pipeline.features_out)} features:")
143
161
  for feature in features_pipeline.features_out:
144
162
  print(f" - {feature}")
@@ -149,7 +167,7 @@ print()
149
167
  # - Appends current game to lag/rolling windows but doesn't persist the update
150
168
  # - This is what you use in production: generate features without affecting your model's state
151
169
  print("Applying future_transform to future data (read-only)...")
152
- future_df_transformed = features_pipeline.future_transform(future_df)
170
+ future_df_transformed = features_pipeline.future_transform(future_df).to_pandas()
153
171
  print(f" Future data now has {len(future_df_transformed.columns)} columns")
154
172
  print()
155
173
 
@@ -1,12 +1,13 @@
1
- import pandas as pd
1
+ import polars as pl
2
2
  from sklearn.linear_model import LogisticRegression
3
3
 
4
+ from examples import get_sub_sample_nba_data
4
5
  from spforge.autopipeline import AutoPipeline
5
6
  from spforge.data_structures import ColumnNames
6
7
  from spforge.ratings import RatingKnownFeatures
7
8
  from spforge.ratings._player_rating import PlayerRatingGenerator
8
9
 
9
- df = pd.read_parquet("data/game_player_subsample.parquet")
10
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
10
11
 
11
12
  # Defines the column names as they appear in the dataframe
12
13
  column_names = ColumnNames(
@@ -16,8 +17,8 @@ column_names = ColumnNames(
16
17
  player_id="player_name",
17
18
  )
18
19
  # Sorts the dataframe. The dataframe must always be sorted as below
19
- df = df.sort_values(
20
- by=[
20
+ df = df.sort(
21
+ [
21
22
  column_names.start_date,
22
23
  column_names.match_id,
23
24
  column_names.team_id,
@@ -27,17 +28,26 @@ df = df.sort_values(
27
28
 
28
29
  # Drops games with less or more than 2 teams
29
30
  df = (
30
- df.assign(
31
- team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique")
31
+ df.with_columns(
32
+ pl.col(column_names.team_id)
33
+ .n_unique()
34
+ .over(column_names.match_id)
35
+ .alias("team_count")
32
36
  )
33
- .loc[lambda x: x.team_count == 2]
34
- .drop(columns=["team_count"])
37
+ .filter(pl.col("team_count") == 2)
38
+ .drop("team_count")
35
39
  )
36
40
 
37
41
  # Pretends the last 10 games are future games. The model will be trained on everything before that.
38
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
39
- historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
40
- future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["won"])
42
+ most_recent_10_games = (
43
+ df.select(pl.col(column_names.match_id))
44
+ .unique(maintain_order=True)
45
+ .tail(10)
46
+ .get_column(column_names.match_id)
47
+ .to_list()
48
+ )
49
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
50
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("won")
41
51
 
42
52
  # Defining a simple rating-generator. It will use the "won" column to update the ratings.
43
53
  # In contrast to a typical Elo, ratings will follow players.
@@ -49,7 +59,7 @@ rating_generator = PlayerRatingGenerator(
49
59
  column_names=column_names,
50
60
  non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
51
61
  )
52
- historical_df = rating_generator.fit_transform(historical_df)
62
+ historical_df = rating_generator.fit_transform(historical_df).to_pandas()
53
63
 
54
64
  # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
55
65
  # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
@@ -61,13 +71,13 @@ historical_df = rating_generator.fit_transform(historical_df)
61
71
  pipeline = AutoPipeline(
62
72
  estimator=LogisticRegression(),
63
73
  granularity=["game_id", "team_id"],
64
- feature_names=rating_generator.features_out + ["location"],
74
+ estimator_features=rating_generator.features_out + ["location"],
65
75
  )
66
76
 
67
77
  pipeline.fit(X=historical_df, y=historical_df["won"])
68
78
 
69
79
  # Future predictions on future results
70
- future_df = rating_generator.future_transform(future_df)
80
+ future_df = rating_generator.future_transform(future_df).to_pandas()
71
81
  future_predictions = pipeline.predict_proba(future_df)[:, 1]
72
82
  future_df["game_winner_probability"] = future_predictions
73
83
  # Grouping predictions from game-player level to game-level.
@@ -12,7 +12,7 @@ Key concepts covered:
12
12
  - Hierarchical modeling: Team strength → Player performance
13
13
  """
14
14
 
15
- import pandas as pd
15
+ import polars as pl
16
16
  from lightgbm import LGBMRegressor
17
17
  from sklearn.linear_model import LogisticRegression
18
18
 
@@ -24,7 +24,7 @@ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
24
24
  from spforge.transformers import EstimatorTransformer
25
25
 
26
26
  # Load sample NBA data
27
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
27
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
28
28
 
29
29
  # Define column mappings
30
30
  column_names = ColumnNames(
@@ -35,7 +35,7 @@ column_names = ColumnNames(
35
35
  )
36
36
 
37
37
  # Sort data chronologically (critical for temporal correctness)
38
- df = df.sort_values(
38
+ df = df.sort(
39
39
  [
40
40
  column_names.start_date,
41
41
  column_names.match_id,
@@ -46,18 +46,31 @@ df = df.sort_values(
46
46
 
47
47
  # Filter to valid games
48
48
  df = (
49
- df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
50
- .loc[lambda x: x.team_count == 2]
51
- .drop(columns=["team_count"])
49
+ df.with_columns(
50
+ pl.col(column_names.team_id)
51
+ .n_unique()
52
+ .over(column_names.match_id)
53
+ .alias("team_count")
54
+ )
55
+ .filter(pl.col("team_count") == 2)
56
+ .drop("team_count")
52
57
  )
53
58
 
54
59
  # Train/test split (using temporal ordering)
55
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
56
- train_df = df[~df[column_names.match_id].isin(most_recent_10_games)].copy()
57
- test_df = df[df[column_names.match_id].isin(most_recent_10_games)].copy()
60
+ most_recent_10_games = (
61
+ df.select(pl.col(column_names.match_id))
62
+ .unique(maintain_order=True)
63
+ .tail(10)
64
+ .get_column(column_names.match_id)
65
+ .to_list()
66
+ )
67
+ train_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
68
+ test_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games))
58
69
 
59
- print(f"Training: {len(train_df)} rows, {train_df[column_names.match_id].nunique()} games")
60
- print(f"Testing: {len(test_df)} rows, {test_df[column_names.match_id].nunique()} games")
70
+ train_games = train_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
71
+ test_games = test_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
72
+ print(f"Training: {len(train_df)} rows, {train_games} games")
73
+ print(f"Testing: {len(test_df)} rows, {test_games} games")
61
74
  print()
62
75
 
63
76
  # ====================================================================
@@ -86,8 +99,8 @@ features_pipeline = FeatureGeneratorPipeline(
86
99
  )
87
100
 
88
101
  # Generate features
89
- train_df = features_pipeline.fit_transform(train_df)
90
- test_df = features_pipeline.future_transform(test_df)
102
+ train_df = features_pipeline.fit_transform(train_df).to_pandas()
103
+ test_df = features_pipeline.future_transform(test_df).to_pandas()
91
104
 
92
105
  print(f"Generated {len(features_pipeline.features_out)} baseline features")
93
106
  print()
@@ -121,7 +134,7 @@ player_points_pipeline = AutoPipeline(
121
134
  estimator=LGBMRegressor(verbose=-100, n_estimators=50),
122
135
  # Features for the final estimator (only pre-game information)
123
136
  # Note: points_estimate_raw will be added by the transformer
124
- feature_names=features_pipeline.features_out,
137
+ estimator_features=features_pipeline.features_out,
125
138
  # The predictor_transformers parameter chains the estimators
126
139
  predictor_transformers=[points_estimate_transformer], # Stage 1 executes first
127
140
  )
@@ -150,7 +163,7 @@ print()
150
163
 
151
164
  # Fit the pipeline
152
165
  # The y target here is for the FINAL estimator (player points)
153
- # Each predictor_transformer has its own target_column specified
166
+ # Predictor_transformers are trained on the same target during fit()
154
167
  player_points_pipeline.fit(X=train_df, y=train_df["points"])
155
168
 
156
169
  print("Training complete!")
@@ -188,7 +201,7 @@ print()
188
201
 
189
202
  single_stage_pipeline = AutoPipeline(
190
203
  estimator=LGBMRegressor(verbose=-100, n_estimators=50),
191
- feature_names=features_pipeline.features_out,
204
+ estimator_features=features_pipeline.features_out,
192
205
  )
193
206
 
194
207
  print("Training single-stage baseline for comparison...")
spforge/__init__.py CHANGED
@@ -2,6 +2,7 @@ from .autopipeline import AutoPipeline as AutoPipeline
2
2
  from .data_structures import ColumnNames as ColumnNames, GameColumnNames as GameColumnNames
3
3
  from .features_generator_pipeline import FeatureGeneratorPipeline as FeatureGeneratorPipeline
4
4
  from .hyperparameter_tuning import (
5
+ EstimatorHyperparameterTuner as EstimatorHyperparameterTuner,
5
6
  OptunaResult as OptunaResult,
6
7
  ParamSpec as ParamSpec,
7
8
  RatingHyperparameterTuner as RatingHyperparameterTuner,