spforge 0.8.4__py3-none-any.whl → 0.8.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/lol/pipeline_transformer_example.py +69 -86
- examples/nba/cross_validation_example.py +4 -11
- examples/nba/feature_engineering_example.py +33 -15
- examples/nba/game_winner_example.py +24 -14
- examples/nba/predictor_transformers_example.py +29 -16
- spforge/__init__.py +1 -0
- spforge/hyperparameter_tuning/__init__.py +12 -0
- spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
- spforge/hyperparameter_tuning/_tuner.py +192 -0
- spforge/ratings/__init__.py +4 -0
- spforge/ratings/_player_rating.py +11 -0
- spforge/ratings/league_start_rating_optimizer.py +201 -0
- {spforge-0.8.4.dist-info → spforge-0.8.7.dist-info}/METADATA +12 -19
- {spforge-0.8.4.dist-info → spforge-0.8.7.dist-info}/RECORD +23 -19
- tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
- tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
- tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
- tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
- tests/ratings/test_player_rating_generator.py +27 -0
- tests/scorer/test_score.py +90 -0
- {spforge-0.8.4.dist-info → spforge-0.8.7.dist-info}/WHEEL +0 -0
- {spforge-0.8.4.dist-info → spforge-0.8.7.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.4.dist-info → spforge-0.8.7.dist-info}/top_level.txt +0 -0
examples/lol/pipeline_transformer_example.py
CHANGED
@@ -1,123 +1,106 @@
+import polars as pl
 from lightgbm import LGBMRegressor
-from sklearn.linear_model import LogisticRegression
 
 from examples import get_sub_sample_lol_data
 from spforge import AutoPipeline, ColumnNames, FeatureGeneratorPipeline
-from spforge.
-from spforge.distributions import (
-    NegativeBinomialEstimator,
-)
+from spforge.distributions import NegativeBinomialEstimator
 from spforge.feature_generator import LagTransformer, RollingWindowTransformer
-from spforge.
-from spforge.ratings import (
-    PlayerRatingGenerator,
-    RatingKnownFeatures,
-)
+from spforge.transformers import EstimatorTransformer
 
 column_names = ColumnNames(
     team_id="teamname",
     match_id="gameid",
     start_date="date",
-    player_id="
+    player_id="player_uid",
     league="league",
     position="position",
 )
-
+
+df = get_sub_sample_lol_data(as_pandas=False, as_polars=True)
 df = (
-    df.
-
-
-
-
+    df.with_columns(
+        pl.concat_str([pl.col("playername"), pl.col("teamname")], separator="__").alias(
+            column_names.player_id
+        )
+    )
+    .filter(pl.col(column_names.position) != "team")
+    .with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count"),
+        pl.col(column_names.player_id)
+        .n_unique()
+        .over([column_names.match_id, column_names.team_id])
+        .alias("player_count"),
+    )
+    .filter((pl.col("team_count") == 2) & (pl.col("player_count") == 5))
+    .drop(["team_count", "player_count"])
+    .unique(subset=[column_names.match_id, column_names.player_id, column_names.team_id])
+    .sort(
+        [
+            column_names.start_date,
+            column_names.match_id,
+            column_names.team_id,
+            column_names.player_id,
+        ]
+    )
 )
-df = df.assign(team_count=df.groupby("gameid")["teamname"].transform("nunique")).loc[
-    lambda x: x.team_count == 2
-]
-
-df = df.drop_duplicates(subset=["gameid", "playername", "teamname"])
 
-
-
-
-
-
-
-    performance_column="performance_kills",
-    auto_scale_performance=True,
-    performance_weights=[ColumnWeight(name="kills", weight=1)],
+most_recent_10_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(10)
+    .get_column(column_names.match_id)
+    .to_list()
 )
-
-
-    performance_column="result",
-    non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
-)
-
+historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("kills")
 
-
-    LagTransformer(
-        features=["kills", "deaths", "result"], lag_length=3, granularity=["playername"]
-    ),
+lag_transformers = [
+    LagTransformer(features=["kills", "deaths"], lag_length=3, granularity=["player_uid"]),
     RollingWindowTransformer(
-        features=["kills", "deaths"
+        features=["kills", "deaths"],
         window=20,
         min_periods=1,
-        granularity=["
+        granularity=["player_uid"],
     ),
 ]
 
 features_generator = FeatureGeneratorPipeline(
     column_names=column_names,
-    feature_generators=
+    feature_generators=lag_transformers,
 )
 
-historical_df = features_generator.fit_transform(historical_df)
-
-game_winner_predictor = SklearnPredictor(
-    estimator=LogisticRegression(),
-    target="result",
-    features=rating_generator_result.features_out,
-    granularity=[column_names.match_id, column_names.team_id],
-)
-game_winner_pipeline = AutoPipeline(
-    predictor=game_winner_predictor, one_hot_encode_cat_features=True, impute_missing_values=True
-)
+historical_df = features_generator.fit_transform(historical_df).to_pandas()
+future_df = features_generator.future_transform(future_df).to_pandas()
 
-
-
-
-    features=
+point_estimate_transformer = EstimatorTransformer(
+    prediction_column_name="kills_estimate",
+    estimator=LGBMRegressor(verbose=-100, random_state=42),
+    features=features_generator.features_out,
 )
 
-
-
-
-
+probability_estimator = NegativeBinomialEstimator(
+    max_value=15,
+    point_estimate_pred_column="kills_estimate",
+    r_specific_granularity=[column_names.player_id],
+    predicted_r_weight=1,
+    column_names=column_names,
 )
 
-
-
-
-
-    date_column_name=column_names.start_date,
-    match_id_column_name=column_names.match_id,
-    estimator=player_kills_predictor,
+pipeline = AutoPipeline(
+    estimator=probability_estimator,
+    estimator_features=features_generator.features_out,
+    predictor_transformers=[point_estimate_transformer],
 )
 
-
-print(player_kills_predictor.features)
-historical_df = cross_validator_player_kills.generate_validation_df(historical_df)
-
-future_df = features_generator.future_transform(future_df)
-future_df = game_winner_predictor.predict(future_df)
-future_df = player_kills_predictor.predict(future_df)
-
-probability_predictor = NegativeBinomialEstimator(
-    target="kills",
-    point_estimate_pred_column=player_kills_predictor.pred_column,
-    max_value=15,
-)
+pipeline.fit(X=historical_df, y=historical_df["kills"])
 
-
-
+future_point_estimates = pipeline.predict(future_df)
+future_probabilities = pipeline.predict_proba(future_df)
+future_df["kills_pred"] = future_point_estimates
 
-print(future_df.head(
+print(future_df.head(5))
+print(f"Probability matrix shape: {future_probabilities.shape}")
+print(f"First row probabilities (0-15 kills): {future_probabilities[0]}")
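
The rewritten example ends by exposing the full probability matrix from predict_proba. A short post-processing sketch (not part of the package; it assumes, based on the prints above, that future_probabilities has shape (n_rows, max_value + 1) with columns indexed by kill count 0..15):

import numpy as np

# Derive point summaries from the per-count probability matrix printed above.
support = np.arange(future_probabilities.shape[1])
expected_kills = future_probabilities @ support        # E[kills] per row
p_over_3_5 = future_probabilities[:, 4:].sum(axis=1)   # P(kills >= 4), i.e. over 3.5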
|
@@ -51,7 +51,7 @@ print("\nApproach 1: LGBMClassifier (direct probability prediction)")
|
|
|
51
51
|
print("-" * 70)
|
|
52
52
|
pipeline_classifier = AutoPipeline(
|
|
53
53
|
estimator=LGBMClassifier(verbose=-100, random_state=42),
|
|
54
|
-
|
|
54
|
+
estimator_features=features_generator.features_out,
|
|
55
55
|
)
|
|
56
56
|
|
|
57
57
|
cross_validator_classifier = MatchKFoldCrossValidator(
|
|
@@ -60,7 +60,7 @@ cross_validator_classifier = MatchKFoldCrossValidator(
|
|
|
60
60
|
estimator=pipeline_classifier,
|
|
61
61
|
prediction_column_name="points_probabilities_classifier",
|
|
62
62
|
target_column="points",
|
|
63
|
-
features=pipeline_classifier.
|
|
63
|
+
features=pipeline_classifier.required_features,
|
|
64
64
|
)
|
|
65
65
|
validation_df_classifier = cross_validator_classifier.generate_validation_df(df=df)
|
|
66
66
|
|
|
@@ -80,20 +80,13 @@ print("-" * 70)
|
|
|
80
80
|
predictor_negbin = NegativeBinomialEstimator(
|
|
81
81
|
max_value=40,
|
|
82
82
|
point_estimate_pred_column="points_estimate",
|
|
83
|
-
r_specific_granularity=["player_id"],
|
|
84
83
|
predicted_r_weight=1,
|
|
85
84
|
column_names=column_names,
|
|
86
85
|
)
|
|
87
86
|
|
|
88
87
|
pipeline_negbin = AutoPipeline(
|
|
89
88
|
estimator=predictor_negbin,
|
|
90
|
-
|
|
91
|
-
context_feature_names=[
|
|
92
|
-
column_names.player_id,
|
|
93
|
-
column_names.start_date,
|
|
94
|
-
column_names.team_id,
|
|
95
|
-
column_names.match_id,
|
|
96
|
-
],
|
|
89
|
+
estimator_features=features_generator.features_out,
|
|
97
90
|
predictor_transformers=[
|
|
98
91
|
EstimatorTransformer(
|
|
99
92
|
prediction_column_name="points_estimate",
|
|
@@ -109,7 +102,7 @@ cross_validator_negbin = MatchKFoldCrossValidator(
|
|
|
109
102
|
estimator=pipeline_negbin,
|
|
110
103
|
prediction_column_name="points_probabilities_negbin",
|
|
111
104
|
target_column="points",
|
|
112
|
-
features=pipeline_negbin.
|
|
105
|
+
features=pipeline_negbin.required_features,
|
|
113
106
|
)
|
|
114
107
|
validation_df_negbin = cross_validator_negbin.generate_validation_df(df=df)
|
|
115
108
|
|
|
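
The practical upshot of this file's changes is that the cross validator is now pointed at pipeline.required_features instead of a hand-maintained column list. A minimal wiring sketch; the date_column_name and match_id_column_name arguments are assumptions carried over from the 0.8.4 code removed in the lol example above, not confirmed for 0.8.7:

cross_validator = MatchKFoldCrossValidator(
    date_column_name=column_names.start_date,    # assumed, per the old 0.8.4 example
    match_id_column_name=column_names.match_id,  # assumed, per the old 0.8.4 example
    estimator=pipeline_negbin,
    prediction_column_name="points_probabilities_negbin",
    target_column="points",
    features=pipeline_negbin.required_features,  # new in this release
)
validation_df = cross_validator.generate_validation_df(df=df)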
examples/nba/feature_engineering_example.py
CHANGED
@@ -13,7 +13,7 @@ Key concepts covered:
 - State management: fit_transform vs future_transform
 """
 
-import
+import polars as pl
 
 from examples import get_sub_sample_nba_data
 from spforge import FeatureGeneratorPipeline
@@ -22,7 +22,7 @@ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
 from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 
 # Load sample NBA data
-df = get_sub_sample_nba_data(as_pandas=
+df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
 
 # Define column mappings for your dataset
 # This tells spforge which columns contain team IDs, player IDs, dates, etc.
@@ -35,7 +35,7 @@ column_names = ColumnNames(
 
 # CRITICAL: Always sort data chronologically before generating features
 # This ensures temporal ordering and prevents future leakage (using future data to predict the past)
-df = df.sort_values(
+df = df.sort(
     [
         column_names.start_date,  # First by date
        column_names.match_id,  # Then by match
@@ -46,13 +46,21 @@ df = df.sort_values(
 
 # Keep only games with exactly 2 teams (filter out invalid data)
 df = (
-    df.
-
-
+    df.with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count")
+    )
+    .filter(pl.col("team_count") == 2)
+    .drop("team_count")
 )
 
-
-
+match_count = df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+start_date = df.select(pl.col(column_names.start_date).min()).to_series().item()
+end_date = df.select(pl.col(column_names.start_date).max()).to_series().item()
+print(f"Dataset: {len(df)} rows, {match_count} games")
+print(f"Date range: {start_date} to {end_date}")
 print()
 
 # ====================================================================
@@ -125,12 +133,22 @@ print()
 # ====================================================================
 
 # Split data into historical (for training) and future (for prediction)
-most_recent_5_games =
-
-
+most_recent_5_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(5)
+    .get_column(column_names.match_id)
+    .to_list()
+)
+historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_5_games))
+future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_5_games))
 
-
-
+historical_games = (
+    historical_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+)
+future_games = future_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+print(f"Historical data: {len(historical_df)} rows, {historical_games} games")
+print(f"Future data: {len(future_df)} rows, {future_games} games")
 print()
 
 # FIT_TRANSFORM: Learn from historical data
@@ -138,7 +156,7 @@ print()
 # - Lags/rolling windows build up from initial games
 # - Internal state (ratings, windows) is MUTATED
 print("Applying fit_transform to historical data...")
-historical_df = features_pipeline.fit_transform(historical_df)
+historical_df = features_pipeline.fit_transform(historical_df).to_pandas()
 print(f"  Generated {len(features_pipeline.features_out)} features:")
 for feature in features_pipeline.features_out:
     print(f"  - {feature}")
@@ -149,7 +167,7 @@ print()
 # - Appends current game to lag/rolling windows but doesn't persist the update
 # - This is what you use in production: generate features without affecting your model's state
 print("Applying future_transform to future data (read-only)...")
-future_df_transformed = features_pipeline.future_transform(future_df)
+future_df_transformed = features_pipeline.future_transform(future_df).to_pandas()
 print(f"  Future data now has {len(future_df_transformed.columns)} columns")
 print()
 
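
Several of these rewrites lean on the same polars idiom: a windowed n_unique().over(...) count followed by a filter, which avoids a groupby-plus-join round trip. A standalone toy illustration (data invented for the example):

import polars as pl

toy = pl.DataFrame(
    {"game_id": [1, 1, 2, 2, 2], "team_id": ["a", "b", "a", "b", "c"]}
)
valid = (
    toy.with_columns(
        # Count distinct teams per game without leaving the frame.
        pl.col("team_id").n_unique().over("game_id").alias("team_count")
    )
    .filter(pl.col("team_count") == 2)  # keep only well-formed games
    .drop("team_count")
)
print(valid)  # game 2 has three teams and is dropped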
examples/nba/game_winner_example.py
CHANGED
@@ -1,12 +1,13 @@
-import
+import polars as pl
 from sklearn.linear_model import LogisticRegression
 
+from examples import get_sub_sample_nba_data
 from spforge.autopipeline import AutoPipeline
 from spforge.data_structures import ColumnNames
 from spforge.ratings import RatingKnownFeatures
 from spforge.ratings._player_rating import PlayerRatingGenerator
 
-df =
+df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
 
 # Defines the column names as they appear in the dataframe
 column_names = ColumnNames(
@@ -16,8 +17,8 @@ column_names = ColumnNames(
     player_id="player_name",
 )
 # Sorts the dataframe. The dataframe must always be sorted as below
-df = df.sort_values(
-
+df = df.sort(
+    [
         column_names.start_date,
         column_names.match_id,
         column_names.team_id,
@@ -27,17 +28,26 @@ df = df.sort_values(
 
 # Drops games with less or more than 2 teams
 df = (
-    df.
-
+    df.with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count")
     )
-    .
-    .drop(
+    .filter(pl.col("team_count") == 2)
+    .drop("team_count")
 )
 
 # Pretends the last 10 games are future games. The most will be trained on everything before that.
-most_recent_10_games =
-
-
+most_recent_10_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(10)
+    .get_column(column_names.match_id)
+    .to_list()
+)
+historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("won")
 
 # Defining a simple rating-generator. It will use the "won" column to update the ratings.
 # In contrast to a typical Elo, ratings will follow players.
@@ -49,7 +59,7 @@ rating_generator = PlayerRatingGenerator(
     column_names=column_names,
     non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
 )
-historical_df = rating_generator.fit_transform(historical_df)
+historical_df = rating_generator.fit_transform(historical_df).to_pandas()
 
 # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
 # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
@@ -61,13 +71,13 @@ historical_df = rating_generator.fit_transform(historical_df)
 pipeline = AutoPipeline(
     estimator=LogisticRegression(),
     granularity=["game_id", "team_id"],
-
+    estimator_features=rating_generator.features_out + ["location"],
 )
 
 pipeline.fit(X=historical_df, y=historical_df["won"])
 
 # Future predictions on future results
-future_df = rating_generator.future_transform(future_df)
+future_df = rating_generator.future_transform(future_df).to_pandas()
 future_predictions = pipeline.predict_proba(future_df)[:, 1]
 future_df["game_winner_probability"] = future_predictions
 # Grouping predictions from game-player level to game-level.
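
The file's closing comment mentions grouping predictions from game-player level to game level, but the diff cuts off before that code. One plausible sketch (the aggregation choice is an assumption, not the package's code; future_df is a pandas frame at this point per the .to_pandas() call above):

# Average player-level win probabilities up to one row per game-team.
# "game_id"/"team_id" match the granularity passed to AutoPipeline above.
game_level = (
    future_df.groupby(["game_id", "team_id"], as_index=False)[
        "game_winner_probability"
    ].mean()
)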
examples/nba/predictor_transformers_example.py
CHANGED
@@ -12,7 +12,7 @@ Key concepts covered:
 - Hierarchical modeling: Team strength → Player performance
 """
 
-import
+import polars as pl
 from lightgbm import LGBMRegressor
 from sklearn.linear_model import LogisticRegression
 
@@ -24,7 +24,7 @@ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 from spforge.transformers import EstimatorTransformer
 
 # Load sample NBA data
-df = get_sub_sample_nba_data(as_pandas=
+df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
 
 # Define column mappings
 column_names = ColumnNames(
@@ -35,7 +35,7 @@ column_names = ColumnNames(
 )
 
 # Sort data chronologically (critical for temporal correctness)
-df = df.sort_values(
+df = df.sort(
     [
         column_names.start_date,
         column_names.match_id,
@@ -46,18 +46,31 @@ df = df.sort_values(
 
 # Filter to valid games
 df = (
-    df.
-
-
+    df.with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count")
+    )
+    .filter(pl.col("team_count") == 2)
+    .drop("team_count")
 )
 
 # Train/test split (using temporal ordering)
-most_recent_10_games =
-
-
+most_recent_10_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(10)
+    .get_column(column_names.match_id)
+    .to_list()
+)
+train_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+test_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games))
 
-
-
+train_games = train_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+test_games = test_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+print(f"Training: {len(train_df)} rows, {train_games} games")
+print(f"Testing: {len(test_df)} rows, {test_games} games")
 print()
 
 # ====================================================================
@@ -86,8 +99,8 @@ features_pipeline = FeatureGeneratorPipeline(
 )
 
 # Generate features
-train_df = features_pipeline.fit_transform(train_df)
-test_df = features_pipeline.future_transform(test_df)
+train_df = features_pipeline.fit_transform(train_df).to_pandas()
+test_df = features_pipeline.future_transform(test_df).to_pandas()
 
 print(f"Generated {len(features_pipeline.features_out)} baseline features")
 print()
@@ -121,7 +134,7 @@ player_points_pipeline = AutoPipeline(
     estimator=LGBMRegressor(verbose=-100, n_estimators=50),
     # Features for the final estimator (only pre-game information)
     # Note: points_estimate_raw will be added by the transformer
-
+    estimator_features=features_pipeline.features_out,
     # The predictor_transformers parameter chains the estimators
     predictor_transformers=[points_estimate_transformer],  # Stage 1 executes first
 )
@@ -150,7 +163,7 @@ print()
 
 # Fit the pipeline
 # The y target here is for the FINAL estimator (player points)
-#
+# Predictor_transformers are trained on the same target during fit()
 player_points_pipeline.fit(X=train_df, y=train_df["points"])
 
 print("Training complete!")
@@ -188,7 +201,7 @@ print()
 
 single_stage_pipeline = AutoPipeline(
     estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-
+    estimator_features=features_pipeline.features_out,
 )
 
 print("Training single-stage baseline for comparison...")
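
Since this example trains both a two-stage (transformer then estimator) pipeline and a single-stage baseline on the same features, a natural follow-up is a side-by-side error check. A hedged sketch using scikit-learn; it assumes both pipelines are fit as above and that test_df still contains the "points" target, which the diff's split does not drop:

from sklearn.metrics import mean_absolute_error

# Compare the chained pipeline against the single-stage baseline on held-out games.
two_stage_preds = player_points_pipeline.predict(test_df)
single_stage_preds = single_stage_pipeline.predict(test_df)
print("two-stage MAE:   ", mean_absolute_error(test_df["points"], two_stage_preds))
print("single-stage MAE:", mean_absolute_error(test_df["points"], single_stage_preds))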
spforge/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from .autopipeline import AutoPipeline as AutoPipeline
 from .data_structures import ColumnNames as ColumnNames, GameColumnNames as GameColumnNames
 from .features_generator_pipeline import FeatureGeneratorPipeline as FeatureGeneratorPipeline
 from .hyperparameter_tuning import (
+    EstimatorHyperparameterTuner as EstimatorHyperparameterTuner,
     OptunaResult as OptunaResult,
     ParamSpec as ParamSpec,
     RatingHyperparameterTuner as RatingHyperparameterTuner,
spforge/hyperparameter_tuning/__init__.py
CHANGED
@@ -1,9 +1,15 @@
 from spforge.hyperparameter_tuning._default_search_spaces import (
+    get_default_estimator_search_space,
+    get_default_lgbm_search_space,
+    get_default_negative_binomial_search_space,
+    get_default_normal_distribution_search_space,
     get_default_player_rating_search_space,
     get_default_search_space,
+    get_default_student_t_search_space,
     get_default_team_rating_search_space,
 )
 from spforge.hyperparameter_tuning._tuner import (
+    EstimatorHyperparameterTuner,
     OptunaResult,
     ParamSpec,
     RatingHyperparameterTuner,
@@ -11,9 +17,15 @@ from spforge.hyperparameter_tuning._tuner import (
 
 __all__ = [
     "RatingHyperparameterTuner",
+    "EstimatorHyperparameterTuner",
     "ParamSpec",
     "OptunaResult",
+    "get_default_estimator_search_space",
+    "get_default_lgbm_search_space",
+    "get_default_negative_binomial_search_space",
+    "get_default_normal_distribution_search_space",
     "get_default_player_rating_search_space",
     "get_default_team_rating_search_space",
+    "get_default_student_t_search_space",
     "get_default_search_space",
 ]
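
Taken together, the two __init__.py changes define the new import surface in 0.8.7. Everything in the snippet below is confirmed by the diffs above: the tuner classes re-exported from the package root, and the search-space helpers exported from the hyperparameter_tuning subpackage:

from spforge import EstimatorHyperparameterTuner, OptunaResult, ParamSpec, RatingHyperparameterTuner
from spforge.hyperparameter_tuning import (
    get_default_estimator_search_space,
    get_default_lgbm_search_space,
    get_default_negative_binomial_search_space,
    get_default_normal_distribution_search_space,
    get_default_student_t_search_space,
    get_default_search_space,
)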