spforge 0.8.5__py3-none-any.whl → 0.8.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/lol/pipeline_transformer_example.py +69 -86
- examples/nba/cross_validation_example.py +4 -11
- examples/nba/feature_engineering_example.py +33 -15
- examples/nba/game_winner_example.py +24 -14
- examples/nba/predictor_transformers_example.py +29 -16
- spforge/hyperparameter_tuning/_default_search_spaces.py +26 -1
- spforge/ratings/__init__.py +4 -0
- spforge/ratings/_player_rating.py +11 -0
- spforge/ratings/league_start_rating_optimizer.py +201 -0
- {spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/METADATA +11 -18
- {spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/RECORD +18 -16
- tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
- tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
- tests/ratings/test_player_rating_generator.py +27 -0
- tests/scorer/test_score.py +90 -0
- {spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/WHEEL +0 -0
- {spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/top_level.txt +0 -0
examples/lol/pipeline_transformer_example.py
CHANGED
@@ -1,123 +1,106 @@
+import polars as pl
 from lightgbm import LGBMRegressor
-from sklearn.linear_model import LogisticRegression
 
 from examples import get_sub_sample_lol_data
 from spforge import AutoPipeline, ColumnNames, FeatureGeneratorPipeline
-from spforge.
-from spforge.distributions import (
-    NegativeBinomialEstimator,
-)
+from spforge.distributions import NegativeBinomialEstimator
 from spforge.feature_generator import LagTransformer, RollingWindowTransformer
-from spforge.
-from spforge.ratings import (
-    PlayerRatingGenerator,
-    RatingKnownFeatures,
-)
+from spforge.transformers import EstimatorTransformer
 
 column_names = ColumnNames(
     team_id="teamname",
     match_id="gameid",
     start_date="date",
-    player_id="
+    player_id="player_uid",
     league="league",
     position="position",
 )
-
+
+df = get_sub_sample_lol_data(as_pandas=False, as_polars=True)
 df = (
-    df.
-
-
-
-
+    df.with_columns(
+        pl.concat_str([pl.col("playername"), pl.col("teamname")], separator="__").alias(
+            column_names.player_id
+        )
+    )
+    .filter(pl.col(column_names.position) != "team")
+    .with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count"),
+        pl.col(column_names.player_id)
+        .n_unique()
+        .over([column_names.match_id, column_names.team_id])
+        .alias("player_count"),
+    )
+    .filter((pl.col("team_count") == 2) & (pl.col("player_count") == 5))
+    .drop(["team_count", "player_count"])
+    .unique(subset=[column_names.match_id, column_names.player_id, column_names.team_id])
+    .sort(
+        [
+            column_names.start_date,
+            column_names.match_id,
+            column_names.team_id,
+            column_names.player_id,
+        ]
+    )
 )
-df = df.assign(team_count=df.groupby("gameid")["teamname"].transform("nunique")).loc[
-    lambda x: x.team_count == 2
-]
-
-df = df.drop_duplicates(subset=["gameid", "playername", "teamname"])
 
-
-
-
-
-
-
-    performance_column="performance_kills",
-    auto_scale_performance=True,
-    performance_weights=[ColumnWeight(name="kills", weight=1)],
+most_recent_10_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(10)
+    .get_column(column_names.match_id)
+    .to_list()
 )
-
-
-    performance_column="result",
-    non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
-)
-
+historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("kills")
 
-
-    LagTransformer(
-        features=["kills", "deaths", "result"], lag_length=3, granularity=["playername"]
-    ),
+lag_transformers = [
+    LagTransformer(features=["kills", "deaths"], lag_length=3, granularity=["player_uid"]),
     RollingWindowTransformer(
-        features=["kills", "deaths"
+        features=["kills", "deaths"],
        window=20,
        min_periods=1,
-        granularity=["
+        granularity=["player_uid"],
     ),
 ]
 
 features_generator = FeatureGeneratorPipeline(
     column_names=column_names,
-    feature_generators=
+    feature_generators=lag_transformers,
 )
 
-historical_df = features_generator.fit_transform(historical_df)
-
-game_winner_predictor = SklearnPredictor(
-    estimator=LogisticRegression(),
-    target="result",
-    features=rating_generator_result.features_out,
-    granularity=[column_names.match_id, column_names.team_id],
-)
-game_winner_pipeline = AutoPipeline(
-    predictor=game_winner_predictor, one_hot_encode_cat_features=True, impute_missing_values=True
-)
+historical_df = features_generator.fit_transform(historical_df).to_pandas()
+future_df = features_generator.future_transform(future_df).to_pandas()
 
-
-
-
-    features=
+point_estimate_transformer = EstimatorTransformer(
+    prediction_column_name="kills_estimate",
+    estimator=LGBMRegressor(verbose=-100, random_state=42),
+    features=features_generator.features_out,
 )
 
-
-
-
-
+probability_estimator = NegativeBinomialEstimator(
+    max_value=15,
+    point_estimate_pred_column="kills_estimate",
+    r_specific_granularity=[column_names.player_id],
+    predicted_r_weight=1,
+    column_names=column_names,
 )
 
-
-
-
-
-    date_column_name=column_names.start_date,
-    match_id_column_name=column_names.match_id,
-    estimator=player_kills_predictor,
+pipeline = AutoPipeline(
+    estimator=probability_estimator,
+    estimator_features=features_generator.features_out,
+    predictor_transformers=[point_estimate_transformer],
 )
 
-
-print(player_kills_predictor.features)
-historical_df = cross_validator_player_kills.generate_validation_df(historical_df)
-
-future_df = features_generator.future_transform(future_df)
-future_df = game_winner_predictor.predict(future_df)
-future_df = player_kills_predictor.predict(future_df)
-
-probability_predictor = NegativeBinomialEstimator(
-    target="kills",
-    point_estimate_pred_column=player_kills_predictor.pred_column,
-    max_value=15,
-)
+pipeline.fit(X=historical_df, y=historical_df["kills"])
 
-
-
+future_point_estimates = pipeline.predict(future_df)
+future_probabilities = pipeline.predict_proba(future_df)
+future_df["kills_pred"] = future_point_estimates
 
-print(future_df.head(
+print(future_df.head(5))
+print(f"Probability matrix shape: {future_probabilities.shape}")
+print(f"First row probabilities (0-15 kills): {future_probabilities[0]}")
examples/nba/cross_validation_example.py
CHANGED
@@ -51,7 +51,7 @@ print("\nApproach 1: LGBMClassifier (direct probability prediction)")
 print("-" * 70)
 pipeline_classifier = AutoPipeline(
     estimator=LGBMClassifier(verbose=-100, random_state=42),
-
+    estimator_features=features_generator.features_out,
 )
 
 cross_validator_classifier = MatchKFoldCrossValidator(
@@ -60,7 +60,7 @@ cross_validator_classifier = MatchKFoldCrossValidator(
     estimator=pipeline_classifier,
     prediction_column_name="points_probabilities_classifier",
     target_column="points",
-    features=pipeline_classifier.
+    features=pipeline_classifier.required_features,
 )
 validation_df_classifier = cross_validator_classifier.generate_validation_df(df=df)
 
@@ -80,20 +80,13 @@ print("-" * 70)
 predictor_negbin = NegativeBinomialEstimator(
     max_value=40,
     point_estimate_pred_column="points_estimate",
-    r_specific_granularity=["player_id"],
     predicted_r_weight=1,
     column_names=column_names,
 )
 
 pipeline_negbin = AutoPipeline(
     estimator=predictor_negbin,
-
-    context_feature_names=[
-        column_names.player_id,
-        column_names.start_date,
-        column_names.team_id,
-        column_names.match_id,
-    ],
+    estimator_features=features_generator.features_out,
     predictor_transformers=[
         EstimatorTransformer(
             prediction_column_name="points_estimate",
@@ -109,7 +102,7 @@ cross_validator_negbin = MatchKFoldCrossValidator(
     estimator=pipeline_negbin,
     prediction_column_name="points_probabilities_negbin",
     target_column="points",
-    features=pipeline_negbin.
+    features=pipeline_negbin.required_features,
 )
 validation_df_negbin = cross_validator_negbin.generate_validation_df(df=df)
 
examples/nba/feature_engineering_example.py
CHANGED
@@ -13,7 +13,7 @@ Key concepts covered:
 - State management: fit_transform vs future_transform
 """
 
-import
+import polars as pl
 
 from examples import get_sub_sample_nba_data
 from spforge import FeatureGeneratorPipeline
@@ -22,7 +22,7 @@ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
 from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 
 # Load sample NBA data
-df = get_sub_sample_nba_data(as_pandas=
+df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
 
 # Define column mappings for your dataset
 # This tells spforge which columns contain team IDs, player IDs, dates, etc.
@@ -35,7 +35,7 @@ column_names = ColumnNames(
 
 # CRITICAL: Always sort data chronologically before generating features
 # This ensures temporal ordering and prevents future leakage (using future data to predict the past)
-df = df.
+df = df.sort(
     [
         column_names.start_date,  # First by date
         column_names.match_id,  # Then by match
@@ -46,13 +46,21 @@ df = df.sort_values(
 
 # Keep only games with exactly 2 teams (filter out invalid data)
 df = (
-    df.
-
-
+    df.with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count")
+    )
+    .filter(pl.col("team_count") == 2)
+    .drop("team_count")
 )
 
-
-
+match_count = df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+start_date = df.select(pl.col(column_names.start_date).min()).to_series().item()
+end_date = df.select(pl.col(column_names.start_date).max()).to_series().item()
+print(f"Dataset: {len(df)} rows, {match_count} games")
+print(f"Date range: {start_date} to {end_date}")
 print()
 
 # ====================================================================
@@ -125,12 +133,22 @@ print()
 # ====================================================================
 
 # Split data into historical (for training) and future (for prediction)
-most_recent_5_games =
-
-
+most_recent_5_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(5)
+    .get_column(column_names.match_id)
+    .to_list()
+)
+historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_5_games))
+future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_5_games))
 
-
-
+historical_games = (
+    historical_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+)
+future_games = future_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+print(f"Historical data: {len(historical_df)} rows, {historical_games} games")
+print(f"Future data: {len(future_df)} rows, {future_games} games")
 print()
 
 # FIT_TRANSFORM: Learn from historical data
@@ -138,7 +156,7 @@ print()
 # - Lags/rolling windows build up from initial games
 # - Internal state (ratings, windows) is MUTATED
 print("Applying fit_transform to historical data...")
-historical_df = features_pipeline.fit_transform(historical_df)
+historical_df = features_pipeline.fit_transform(historical_df).to_pandas()
 print(f" Generated {len(features_pipeline.features_out)} features:")
 for feature in features_pipeline.features_out:
     print(f" - {feature}")
@@ -149,7 +167,7 @@ print()
 # - Appends current game to lag/rolling windows but doesn't persist the update
 # - This is what you use in production: generate features without affecting your model's state
 print("Applying future_transform to future data (read-only)...")
-future_df_transformed = features_pipeline.future_transform(future_df)
+future_df_transformed = features_pipeline.future_transform(future_df).to_pandas()
 print(f" Future data now has {len(future_df_transformed.columns)} columns")
 print()
 
examples/nba/game_winner_example.py
CHANGED
@@ -1,12 +1,13 @@
-import
+import polars as pl
 from sklearn.linear_model import LogisticRegression
 
+from examples import get_sub_sample_nba_data
 from spforge.autopipeline import AutoPipeline
 from spforge.data_structures import ColumnNames
 from spforge.ratings import RatingKnownFeatures
 from spforge.ratings._player_rating import PlayerRatingGenerator
 
-df =
+df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
 
 # Defines the column names as they appear in the dataframe
 column_names = ColumnNames(
@@ -16,8 +17,8 @@ column_names = ColumnNames(
     player_id="player_name",
 )
 # Sorts the dataframe. The dataframe must always be sorted as below
-df = df.
-
+df = df.sort(
+    [
         column_names.start_date,
         column_names.match_id,
         column_names.team_id,
@@ -27,17 +28,26 @@ df = df.sort_values(
 
 # Drops games with less or more than 2 teams
 df = (
-    df.
-
+    df.with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count")
     )
-    .
-    .drop(
+    .filter(pl.col("team_count") == 2)
+    .drop("team_count")
 )
 
 # Pretends the last 10 games are future games. The most will be trained on everything before that.
-most_recent_10_games =
-
-
+most_recent_10_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(10)
+    .get_column(column_names.match_id)
+    .to_list()
+)
+historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("won")
 
 # Defining a simple rating-generator. It will use the "won" column to update the ratings.
 # In contrast to a typical Elo, ratings will follow players.
@@ -49,7 +59,7 @@ rating_generator = PlayerRatingGenerator(
     column_names=column_names,
     non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
 )
-historical_df = rating_generator.fit_transform(historical_df)
+historical_df = rating_generator.fit_transform(historical_df).to_pandas()
 
 # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
 # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
@@ -61,13 +71,13 @@ historical_df = rating_generator.fit_transform(historical_df)
 pipeline = AutoPipeline(
     estimator=LogisticRegression(),
     granularity=["game_id", "team_id"],
-
+    estimator_features=rating_generator.features_out + ["location"],
 )
 
 pipeline.fit(X=historical_df, y=historical_df["won"])
 
 # Future predictions on future results
-future_df = rating_generator.future_transform(future_df)
+future_df = rating_generator.future_transform(future_df).to_pandas()
 future_predictions = pipeline.predict_proba(future_df)[:, 1]
 future_df["game_winner_probability"] = future_predictions
 # Grouping predictions from game-player level to game-level.
examples/nba/predictor_transformers_example.py
CHANGED
@@ -12,7 +12,7 @@ Key concepts covered:
 - Hierarchical modeling: Team strength → Player performance
 """
 
-import
+import polars as pl
 from lightgbm import LGBMRegressor
 from sklearn.linear_model import LogisticRegression
 
@@ -24,7 +24,7 @@ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 from spforge.transformers import EstimatorTransformer
 
 # Load sample NBA data
-df = get_sub_sample_nba_data(as_pandas=
+df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
 
 # Define column mappings
 column_names = ColumnNames(
@@ -35,7 +35,7 @@ column_names = ColumnNames(
 )
 
 # Sort data chronologically (critical for temporal correctness)
-df = df.
+df = df.sort(
     [
         column_names.start_date,
         column_names.match_id,
@@ -46,18 +46,31 @@ df = df.sort_values(
 
 # Filter to valid games
 df = (
-    df.
-
-
+    df.with_columns(
+        pl.col(column_names.team_id)
+        .n_unique()
+        .over(column_names.match_id)
+        .alias("team_count")
+    )
+    .filter(pl.col("team_count") == 2)
+    .drop("team_count")
 )
 
 # Train/test split (using temporal ordering)
-most_recent_10_games =
-
-
+most_recent_10_games = (
+    df.select(pl.col(column_names.match_id))
+    .unique(maintain_order=True)
+    .tail(10)
+    .get_column(column_names.match_id)
+    .to_list()
+)
+train_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+test_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games))
 
-
-
+train_games = train_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+test_games = test_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+print(f"Training: {len(train_df)} rows, {train_games} games")
+print(f"Testing: {len(test_df)} rows, {test_games} games")
 print()
 
 # ====================================================================
@@ -86,8 +99,8 @@ features_pipeline = FeatureGeneratorPipeline(
 )
 
 # Generate features
-train_df = features_pipeline.fit_transform(train_df)
-test_df = features_pipeline.future_transform(test_df)
+train_df = features_pipeline.fit_transform(train_df).to_pandas()
+test_df = features_pipeline.future_transform(test_df).to_pandas()
 
 print(f"Generated {len(features_pipeline.features_out)} baseline features")
 print()
@@ -121,7 +134,7 @@ player_points_pipeline = AutoPipeline(
     estimator=LGBMRegressor(verbose=-100, n_estimators=50),
     # Features for the final estimator (only pre-game information)
     # Note: points_estimate_raw will be added by the transformer
-
+    estimator_features=features_pipeline.features_out,
     # The predictor_transformers parameter chains the estimators
     predictor_transformers=[points_estimate_transformer],  # Stage 1 executes first
 )
@@ -150,7 +163,7 @@ print()
 
 # Fit the pipeline
 # The y target here is for the FINAL estimator (player points)
-#
+# Predictor_transformers are trained on the same target during fit()
 player_points_pipeline.fit(X=train_df, y=train_df["points"])
 
 print("Training complete!")
@@ -188,7 +201,7 @@ print()
 
 single_stage_pipeline = AutoPipeline(
     estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-
+    estimator_features=features_pipeline.features_out,
 )
 
 print("Training single-stage baseline for comparison...")
spforge/hyperparameter_tuning/_default_search_spaces.py
CHANGED
@@ -127,7 +127,7 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
     """
     Default search space for PlayerRatingGenerator.
 
-    Focuses on
+    Focuses on core parameters that have the most impact on performance.
 
     Returns:
         Dictionary mapping parameter names to ParamSpec objects
@@ -167,6 +167,31 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
             param_type="categorical",
             choices=["difference", "mean", "ignore_opponent"],
         ),
+        "start_league_quantile": ParamSpec(
+            param_type="float",
+            low=0.05,
+            high=0.5,
+        ),
+        "start_min_count_for_percentiles": ParamSpec(
+            param_type="int",
+            low=40,
+            high=500,
+        ),
+        "start_team_rating_subtract": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=200.0,
+        ),
+        "start_team_weight": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=1.0,
+        ),
+        "start_min_match_count_team_rating": ParamSpec(
+            param_type="int",
+            low=1,
+            high=10,
+        ),
     }
 
 
spforge/ratings/__init__.py
CHANGED
@@ -6,3 +6,7 @@ from .enums import (
     RatingUnknownFeatures as RatingUnknownFeatures,
 )
 from .league_identifier import LeagueIdentifier as LeagueIdentifier
+from .league_start_rating_optimizer import (
+    LeagueStartRatingOptimizationResult as LeagueStartRatingOptimizationResult,
+    LeagueStartRatingOptimizer as LeagueStartRatingOptimizer,
+)
spforge/ratings/_player_rating.py
CHANGED
@@ -129,6 +129,9 @@ class PlayerRatingGenerator(RatingGenerator):
             str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_PROJECTED)
         )
         self.MEAN_PROJ_COL = self._suffix(str(RatingKnownFeatures.RATING_MEAN_PROJECTED))
+        self.PLAYER_DIFF_FROM_TEAM_PROJ_COL = self._suffix(
+            str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED)
+        )
 
         self.TEAM_OFF_RATING_PROJ_COL = self._suffix(
             str(RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED)
@@ -618,6 +621,7 @@ class PlayerRatingGenerator(RatingGenerator):
             or self.OPP_RATING_PROJ_COL in cols_to_add
             or self.DIFF_PROJ_COL in cols_to_add
             or self.MEAN_PROJ_COL in cols_to_add
+            or self.PLAYER_DIFF_FROM_TEAM_PROJ_COL in cols_to_add
         ):
             df = add_team_rating_projected(
                 df=df,
@@ -673,6 +677,13 @@ class PlayerRatingGenerator(RatingGenerator):
                 )
             )
 
+        if self.PLAYER_DIFF_FROM_TEAM_PROJ_COL in cols_to_add:
+            df = df.with_columns(
+                (pl.col(self.PLAYER_OFF_RATING_COL) - pl.col(self.TEAM_OFF_RATING_PROJ_COL)).alias(
+                    self.PLAYER_DIFF_FROM_TEAM_PROJ_COL
+                )
+            )
+
         if (
             self.TEAM_RATING_COL in cols_to_add
             or self.OPP_RATING_COL in cols_to_add
spforge/ratings/league_start_rating_optimizer.py
ADDED
@@ -0,0 +1,201 @@
+from __future__ import annotations
+
+import copy
+from dataclasses import dataclass
+
+import narwhals.stable.v2 as nw
+import polars as pl
+from narwhals.stable.v2.typing import IntoFrameT
+
+
+DEFAULT_START_RATING = 1000.0
+
+
+@dataclass
+class LeagueStartRatingOptimizationResult:
+    league_ratings: dict[str, float]
+    iteration_errors: list[dict[str, float]]
+
+
+class LeagueStartRatingOptimizer:
+    def __init__(
+        self,
+        rating_generator: object,
+        n_iterations: int = 3,
+        learning_rate: float = 0.2,
+        min_cross_region_rows: int = 10,
+        rating_scale: float | None = None,
+    ):
+        self.rating_generator = rating_generator
+        self.n_iterations = int(n_iterations)
+        self.learning_rate = float(learning_rate)
+        self.min_cross_region_rows = int(min_cross_region_rows)
+        self.rating_scale = rating_scale
+
+    @nw.narwhalify
+    def optimize(self, df: IntoFrameT) -> LeagueStartRatingOptimizationResult:
+        pl_df = df.to_native() if df.implementation.is_polars() else df.to_polars()
+        league_ratings = self._get_league_ratings(self.rating_generator)
+        iteration_errors: list[dict[str, float]] = []
+
+        for _ in range(self.n_iterations):
+            gen = copy.deepcopy(self.rating_generator)
+            self._set_league_ratings(gen, league_ratings)
+            self._ensure_prediction_columns(gen)
+
+            pred_df = gen.fit_transform(pl_df)
+            error_df = self._cross_region_error_df(pl_df, pred_df, gen)
+            if error_df.is_empty():
+                break
+
+            error_summary = (
+                error_df.group_by(self._league_column_name(gen))
+                .agg(
+                    pl.col("error").mean().alias("mean_error"),
+                    pl.len().alias("row_count"),
+                )
+                .to_dicts()
+            )
+            league_key = self._league_column_name(gen)
+            iteration_errors.append({r[league_key]: r["mean_error"] for r in error_summary})
+            league_ratings = self._apply_error_updates(
+                gen, league_ratings, error_summary, league_key
+            )
+
+        self._set_league_ratings(self.rating_generator, league_ratings)
+        return LeagueStartRatingOptimizationResult(
+            league_ratings=league_ratings, iteration_errors=iteration_errors
+        )
+
+    def _cross_region_error_df(
+        self,
+        df: pl.DataFrame,
+        pred_df: pl.DataFrame,
+        rating_generator: object,
+    ) -> pl.DataFrame:
+        column_names = getattr(rating_generator, "column_names", None)
+        if column_names is None:
+            raise ValueError("rating_generator must define column_names")
+
+        match_id = getattr(column_names, "match_id", None)
+        team_id = getattr(column_names, "team_id", None)
+        league_col = getattr(column_names, "league", None)
+        if not match_id or not team_id or not league_col:
+            raise ValueError("column_names must include match_id, team_id, and league")
+
+        pred_col, entity_cols, perf_col = self._prediction_spec(rating_generator)
+        base_cols = [match_id, team_id, league_col, perf_col]
+        for col in base_cols + entity_cols:
+            if col not in df.columns:
+                raise ValueError(f"{col} missing from input dataframe")
+
+        join_cols = [match_id, team_id] + entity_cols
+        joined = df.select(base_cols + entity_cols).join(
+            pred_df.select(join_cols + [pred_col]),
+            on=join_cols,
+            how="inner",
+        )
+        opp_league = self._opponent_mode_league(joined, match_id, team_id, league_col)
+        enriched = joined.join(opp_league, on=[match_id, team_id], how="left").with_columns(
+            (pl.col(perf_col) - pl.col(pred_col)).alias("error")
+        )
+        return enriched.filter(pl.col("opp_mode_league").is_not_null()).filter(
+            pl.col(league_col) != pl.col("opp_mode_league")
+        )
+
+    def _opponent_mode_league(
+        self, df: pl.DataFrame, match_id: str, team_id: str, league_col: str
+    ) -> pl.DataFrame:
+        team_mode = (
+            df.group_by([match_id, team_id, league_col])
+            .agg(pl.len().alias("__count"))
+            .sort(["__count"], descending=True)
+            .unique([match_id, team_id])
+            .select([match_id, team_id, league_col])
+            .rename({league_col: "team_mode_league"})
+        )
+        opponents = (
+            team_mode.join(team_mode, on=match_id, suffix="_opp")
+            .filter(pl.col(team_id) != pl.col(f"{team_id}_opp"))
+            .group_by([match_id, team_id, "team_mode_league_opp"])
+            .agg(pl.len().alias("__count"))
+            .sort(["__count"], descending=True)
+            .unique([match_id, team_id])
+            .select([match_id, team_id, "team_mode_league_opp"])
+            .rename({"team_mode_league_opp": "opp_mode_league"})
+        )
+        return opponents
+
+    def _prediction_spec(self, rating_generator: object) -> tuple[str, list[str], str]:
+        perf_col = getattr(rating_generator, "performance_column", None)
+        if not perf_col:
+            raise ValueError("rating_generator must define performance_column")
+        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+            pred_col = rating_generator.PLAYER_PRED_PERF_COL
+            column_names = rating_generator.column_names
+            player_id = getattr(column_names, "player_id", None)
+            if not player_id:
+                raise ValueError("column_names must include player_id for player ratings")
+            return pred_col, [player_id], perf_col
+        if hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+            pred_col = rating_generator.TEAM_PRED_OFF_PERF_COL
+            return pred_col, [], perf_col
+        raise ValueError("rating_generator must expose a predicted performance column")
+
+    def _ensure_prediction_columns(self, rating_generator: object) -> None:
+        pred_cols: list[str] = []
+        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+            pred_cols.append(rating_generator.PLAYER_PRED_PERF_COL)
+        elif hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+            pred_cols.append(rating_generator.TEAM_PRED_OFF_PERF_COL)
+
+        if not pred_cols:
+            return
+
+        existing = list(getattr(rating_generator, "non_predictor_features_out", []) or [])
+        for col in pred_cols:
+            if col not in existing:
+                existing.append(col)
+        rating_generator.non_predictor_features_out = existing
+
+    def _apply_error_updates(
+        self,
+        rating_generator: object,
+        league_ratings: dict[str, float],
+        error_summary: list[dict[str, float]],
+        league_key: str,
+    ) -> dict[str, float]:
+        scale = self.rating_scale
+        if scale is None:
+            scale = getattr(rating_generator, "rating_change_multiplier_offense", 1.0)
+
+        updated = dict(league_ratings)
+        for row in error_summary:
+            if row["row_count"] < self.min_cross_region_rows:
+                continue
+            league = row[league_key]
+            mean_error = row["mean_error"]
+            base_rating = updated.get(league, DEFAULT_START_RATING)
+            updated[league] = base_rating + self.learning_rate * mean_error * scale
+        return updated
+
+    def _league_column_name(self, rating_generator: object) -> str:
+        column_names = getattr(rating_generator, "column_names", None)
+        league_col = getattr(column_names, "league", None)
+        if not league_col:
+            raise ValueError("column_names must include league for league adjustments")
+        return league_col
+
+    def _get_league_ratings(self, rating_generator: object) -> dict[str, float]:
+        start_gen = getattr(rating_generator, "start_rating_generator", None)
+        if start_gen is None or not hasattr(start_gen, "league_ratings"):
+            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+        return dict(start_gen.league_ratings)
+
+    def _set_league_ratings(self, rating_generator: object, league_ratings: dict[str, float]) -> None:
+        start_gen = getattr(rating_generator, "start_rating_generator", None)
+        if start_gen is None or not hasattr(start_gen, "league_ratings"):
+            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+        start_gen.league_ratings = dict(league_ratings)
+        if hasattr(rating_generator, "start_league_ratings"):
+            rating_generator.start_league_ratings = dict(league_ratings)
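For orientation, here is a minimal usage sketch of the new LeagueStartRatingOptimizer. It is adapted from the end-to-end test added in this release (tests/end_to_end/test_league_start_rating_optimizer.py); the toy dataframe, column names, and parameter values are illustrative assumptions, not recommended settings.

```python
import pandas as pd

from spforge import ColumnNames
from spforge.ratings import LeagueStartRatingOptimizer, PlayerRatingGenerator

# Toy cross-league data: an LCK team repeatedly under-performs an LEC team.
# Values are invented for illustration; real example data ships with spforge.
rows = []
for i, date in enumerate(pd.date_range("2024-01-01", periods=3, freq="D")):
    for pid, tid, league, perf in [
        ("A0", "TA", "LCK", 0.4),
        ("A1", "TA", "LCK", 0.4),
        ("B0", "TB", "LEC", 0.6),
        ("B1", "TB", "LEC", 0.6),
    ]:
        rows.append(
            {"pid": pid, "tid": tid, "mid": f"M{i}", "date": date, "league": league, "perf": perf}
        )
df = pd.DataFrame(rows)

cn = ColumnNames(player_id="pid", team_id="tid", match_id="mid", start_date="date", league="league")
generator = PlayerRatingGenerator(performance_column="perf", column_names=cn)
optimizer = LeagueStartRatingOptimizer(
    rating_generator=generator,
    n_iterations=1,
    learning_rate=0.5,
    min_cross_region_rows=1,
)

result = optimizer.optimize(df)
# Leagues that over-perform the model's predictions in cross-league matches are nudged
# above the 1000 default start rating; under-performing leagues drop below it.
print(result.league_ratings)
print(result.iteration_errors)
```

As the implementation above shows, the tuned ratings are also written back onto the generator's start_rating_generator, so the same generator instance can subsequently be fit with the optimized league start ratings.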
{spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.5
+Version: 0.8.7
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file
@@ -85,12 +85,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 
+from examples import get_sub_sample_nba_data
 from spforge.autopipeline import AutoPipeline
 from spforge.data_structures import ColumnNames
-from spforge.ratings import RatingKnownFeatures
-from spforge.ratings._player_rating import PlayerRatingGenerator
+from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
 
-df =
+df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
 
 # Step 1: Define column mappings for your dataset
 column_names = ColumnNames(
@@ -144,7 +144,7 @@ historical_df = rating_generator.fit_transform(historical_df)
 pipeline = AutoPipeline(
     estimator=LogisticRegression(),
     granularity=["game_id", "team_id"],  # Aggregate players → teams
-
+    estimator_features=rating_generator.features_out + ["location"],  # Rating + home/away
 )
 
 # Train on historical data
@@ -302,8 +302,8 @@ cross_validator = MatchKFoldCrossValidator(
     prediction_column_name="points_pred",
     target_column="points",
     n_splits=3,  # Number of temporal folds
-    # Must include both
-    features=pipeline.
+    # Must include both estimator features and context features
+    features=pipeline.required_features,
 )
 
 # Generate validation predictions
@@ -330,7 +330,7 @@ print(f"Validation MAE: {mae:.2f}")
 - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
 - Use `validation_column` in scorer to score only validation rows
 - Training data always comes BEFORE validation data chronologically
-- Must pass
+- Must pass all required features (use `pipeline.required_features`)
 - Scorers can filter rows (e.g., only score players who played minutes > 0)
 
 See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -371,7 +371,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
 # Approach 1: LGBMClassifier (direct probability prediction)
 pipeline_classifier = AutoPipeline(
     estimator=LGBMClassifier(verbose=-100, random_state=42),
-
+    estimator_features=features_pipeline.features_out,
 )
 
 # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -385,13 +385,7 @@ distribution_estimator = NegativeBinomialEstimator(
 
 pipeline_negbin = AutoPipeline(
     estimator=distribution_estimator,
-
-    context_feature_names=[
-        column_names.player_id,
-        column_names.start_date,
-        column_names.team_id,
-        column_names.match_id,
-    ],
+    estimator_features=features_pipeline.features_out,
     predictor_transformers=[
         EstimatorTransformer(
             prediction_column_name="points_estimate",
@@ -439,7 +433,7 @@ points_estimate_transformer = EstimatorTransformer(
 # Stage 2: Refine estimate using Stage 1 output
 player_points_pipeline = AutoPipeline(
     estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-
+    estimator_features=features_pipeline.features_out,  # Original features
     # predictor_transformers execute first, adding their predictions
     predictor_transformers=[points_estimate_transformer],
 )
@@ -474,4 +468,3 @@ For complete, runnable examples with detailed explanations:
 - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
 - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
 - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
{spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/RECORD
CHANGED
@@ -1,15 +1,15 @@
 examples/__init__.py,sha256=qGLpphvrjQj0-zS9vP0Q07L-anDnmw7gFZJUEBgYG3U,158
 examples/game_level_example.py,sha256=EOr-H0K79O3Zah4wWuqa5DLmT2iZGbfgxD-xSU2-dfI,2244
 examples/lol/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-examples/lol/pipeline_transformer_example.py,sha256=
+examples/lol/pipeline_transformer_example.py,sha256=XVmm6Xya5z7JyOA0s-DISOlR2I1wpUthCyhRSt9n6qE,3402
 examples/lol/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 examples/lol/data/subsample_lol_data.parquet,sha256=tl04XDslylECJUV1e0DGeqMb6D0Uh6_48NO6TykdgQI,343549
 examples/lol/data/utils.py,sha256=Lt3XNNa5cavvFXHaTQ-GOPxSuWmPEfEO0CVXQEyF_s0,486
 examples/nba/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-examples/nba/cross_validation_example.py,sha256=
-examples/nba/feature_engineering_example.py,sha256=
-examples/nba/game_winner_example.py,sha256=
-examples/nba/predictor_transformers_example.py,sha256=
+examples/nba/cross_validation_example.py,sha256=XVnQJ5mqMou9z83ML5J0wS3gk-pa56sdvahJYQgZ8os,5056
+examples/nba/feature_engineering_example.py,sha256=BDd5594Yi_56lGDqz3SYQkwT8NVZyFkgv3gKPCsAjz4,8197
+examples/nba/game_winner_example.py,sha256=7VVHxGyU2uPjT9q6lDMHJ5KpkWp9gU8brxr_UZfuSHg,3189
+examples/nba/predictor_transformers_example.py,sha256=Fl4BY_hVW0iYERolN6s-ZB2xv-UxOK547L6iI5t0r0Y,8807
 examples/nba/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 examples/nba/data/game_player_subsample.parquet,sha256=ODJxHC-mUYbJ7r-ScUFtPU7hrFuxLUbbDSobmpCkw0w,279161
 examples/nba/data/utils.py,sha256=41hxLQ1d6ZgBEcHa5MI0-fG5KbsRi07cclMPQZM95ek,509
@@ -44,17 +44,18 @@ spforge/feature_generator/_rolling_mean_days.py,sha256=EZQmFmYVQB-JjZV5k8bOWnaTx
 spforge/feature_generator/_rolling_window.py,sha256=HT8LezsRIPNAlMEoP9oTPW2bKFu55ZSRnQZGST7fncw,8836
 spforge/feature_generator/_utils.py,sha256=KDn33ia1OYJTK8THFpvc_uRiH_Bl3fImGqqbfzs0YA4,9654
 spforge/hyperparameter_tuning/__init__.py,sha256=N2sKG4SvG41hlsFT2kx_DQYMmXsQr-8031Tu_rxlxyY,1015
-spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=
+spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=Sm5IrHAW0-vRC8jqCPX0pDi_C-W3L_MoEKGA8bx1Zbc,7546
 spforge/hyperparameter_tuning/_tuner.py,sha256=uovhGqhe8-fdhi79aErUmE2h5NCycFQEIRv5WCjpC7E,16732
 spforge/performance_transformers/__init__.py,sha256=U6d7_kltbUMLYCGBk4QAFVPJTxXD3etD9qUftV-O3q4,422
 spforge/performance_transformers/_performance_manager.py,sha256=KwAga6dGhNkXi-MDW6LPjwk6VZwCcjo5L--jnk9aio8,9706
 spforge/performance_transformers/_performances_transformers.py,sha256=0lxuWjAfWBRXRgQsNJHjw3P-nlTtHBu4_bOVdoy7hq4,15536
-spforge/ratings/__init__.py,sha256=
+spforge/ratings/__init__.py,sha256=OZVH2Lo6END3n1X8qi4QcyAPlThIwAYwVKCiIuOQSQU,576
 spforge/ratings/_base.py,sha256=dRMkIGj5-2zKddygaEA4g16WCyXon7v8Xa1ymm7IuoM,14335
-spforge/ratings/_player_rating.py,sha256=
+spforge/ratings/_player_rating.py,sha256=MyqsyLSY6d7_bxDSnF8eWOyXpSCADWGdepdFSGM4cHw,51365
 spforge/ratings/_team_rating.py,sha256=T0kFiv3ykYSrVGGsVRa8ZxLB0WMnagxqdFDzl9yZ_9g,24813
 spforge/ratings/enums.py,sha256=s7z_RcZS6Nlgfa_6tasO8_IABZJwywexe7sep9DJBgo,1739
 spforge/ratings/league_identifier.py,sha256=_KDUKOwoNU6RNFKE5jju4eYFGVNGBdJsv5mhNvMakfc,6019
+spforge/ratings/league_start_rating_optimizer.py,sha256=Q4Vo3QT-r55qP4aD9WftsTB00UOSRvxM1khlyuAGWNM,8582
 spforge/ratings/player_performance_predictor.py,sha256=cMxzQuk0nF1MsT_M32g-3mxVdAEbZ-S7TUjEPYdo3Yg,8361
 spforge/ratings/start_rating_generator.py,sha256=_7hIJ9KRVCwsCoY1GIzY8cuOdHR8RH_BCMeMwQG3E04,6776
 spforge/ratings/team_performance_predictor.py,sha256=ThQOmYQUqKBB46ONYHOMM2arXFH8AkyKpAZzs80SjHA,7217
@@ -70,16 +71,17 @@ spforge/transformers/_other_transformer.py,sha256=xLfaFIhkFsigAoitB4x3F8An2j9ymd
 spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
 spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
 spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
-spforge-0.8.
+spforge-0.8.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 tests/test_autopipeline.py,sha256=WXHeqBdjQD6xaXVkzvS8ocz0WVP9R7lN0PiHJ2iD8nA,16911
 tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
 tests/test_feature_generator_pipeline.py,sha256=CAgBknWqawqYi5_hxcPmpxrLVa5elMHVv1VrSVRKXEA,17705
 tests/cross_validator/test_cross_validator.py,sha256=itCGhNY8-NbDbKbhxHW20wiLuRst7-Rixpmi3FSKQtA,17474
 tests/distributions/test_distribution.py,sha256=aU8hfCgliM80TES4WGjs9KFXpV8XghBGF7Hu9sqEVSE,10982
 tests/end_to_end/test_estimator_hyperparameter_tuning.py,sha256=fZCJ9rrED2vT68B9ovmVA1cIG2pHRTjy9xzZLxxpEBo,2513
+tests/end_to_end/test_league_start_rating_optimizer.py,sha256=Mmct2ixp4c6L7PGym8wZc7E-Csozryt1g4_o6OCc1uI,3141
 tests/end_to_end/test_lol_player_kills.py,sha256=RJSYUbPrZ-RzSxGggj03yN0JKYeTB1JghVGYFMYia3Y,11891
 tests/end_to_end/test_nba_player_points.py,sha256=kyzjo7QIcvpteps29Wix6IS_eJG9d1gHLeWtIHpkWMs,9066
-tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py,sha256=
+tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py,sha256=LXRkI_6Ho2kzJVbNAM17QFhx_MP9WdDJXCO9dWgJGNA,6491
 tests/end_to_end/test_nba_prediction_consistency.py,sha256=o3DckJasx_I1ed6MhMYZUo2WSDvQ_p3HtJa9DCWTIYU,9857
 tests/estimator/test_sklearn_estimator.py,sha256=tVfOP9Wx-tV1b6DcHbGxQHZQzNPA0Iobq8jTcUrk59U,48668
 tests/feature_generator/test_lag.py,sha256=5Ffrv0V9cwkbkzRMPBe3_c_YNW-W2al-XH_acQIvdeg,19531
@@ -92,10 +94,10 @@ tests/hyperparameter_tuning/test_estimator_tuner.py,sha256=iewME41d6LR2aQ0OtohGF
 tests/hyperparameter_tuning/test_rating_tuner.py,sha256=PyCFP3KPc4Iy9E_X9stCVxra14uMgC1tuRwuQ30rO_o,13195
 tests/performance_transformers/test_performance_manager.py,sha256=bfC5GiBuzHw-mLmKeEzBUUPuKm0ayax2bsF1j88W8L0,10120
 tests/performance_transformers/test_performances_transformers.py,sha256=A-tGiCx7kXrj1cVj03Bc7prOeZ1_Ryz8YFx9uj3eK6w,11064
-tests/ratings/test_player_rating_generator.py,sha256=
+tests/ratings/test_player_rating_generator.py,sha256=FGH3Tq0uFoSlkS_XMldsUKhsovBRBvzH9EbqjKvg2O0,59601
 tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561
 tests/ratings/test_team_rating_generator.py,sha256=cDnf1zHiYC7pkgydE3MYr8wSTJIq-bPfSqhIRI_4Tic,95357
-tests/scorer/test_score.py,sha256=
+tests/scorer/test_score.py,sha256=_Vd6tKpy_1GeOxU7Omxci4CFf7PvRGMefEI0gv2gV6A,74688
 tests/scorer/test_score_aggregation_granularity.py,sha256=h-hyFOLzwp-92hYVU7CwvlRJ8jhB4DzXCtqgI-zcoqM,13677
 tests/transformers/test_estimator_transformer_context.py,sha256=5GOHbuWCWBMFwwOTJOuD4oNDsv-qDR0OxNZYGGuMdag,1819
 tests/transformers/test_net_over_predicted.py,sha256=vh7O1iRRPf4vcW9aLhOMAOyatfM5ZnLsQBKNAYsR3SU,3363
@@ -103,7 +105,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
 tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
 tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
-spforge-0.8.
-spforge-0.8.
-spforge-0.8.
-spforge-0.8.
+spforge-0.8.7.dist-info/METADATA,sha256=7vwprmmFvSpEL3lC0HqFZPbzxMi8mRzI0yOsa7pUlNQ,20047
+spforge-0.8.7.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+spforge-0.8.7.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
+spforge-0.8.7.dist-info/RECORD,,
tests/end_to_end/test_league_start_rating_optimizer.py
ADDED
@@ -0,0 +1,117 @@
+import pandas as pd
+import polars as pl
+import pytest
+
+from spforge import ColumnNames
+from spforge.ratings import (
+    LeagueStartRatingOptimizer,
+    PlayerRatingGenerator,
+    TeamRatingGenerator,
+)
+
+
+def _player_df():
+    dates = pd.date_range("2024-01-01", periods=3, freq="D")
+    rows = []
+    for i, date in enumerate(dates):
+        mid = f"M{i}"
+        for player_idx in range(2):
+            rows.append(
+                {
+                    "pid": f"A{player_idx}",
+                    "tid": "TA",
+                    "mid": mid,
+                    "date": date,
+                    "league": "LCK",
+                    "perf": 0.4,
+                }
+            )
+        for player_idx in range(2):
+            rows.append(
+                {
+                    "pid": f"B{player_idx}",
+                    "tid": "TB",
+                    "mid": mid,
+                    "date": date,
+                    "league": "LEC",
+                    "perf": 0.6,
+                }
+            )
+    return pd.DataFrame(rows)
+
+
+def _team_df():
+    dates = pd.date_range("2024-01-01", periods=3, freq="D")
+    rows = []
+    for i, date in enumerate(dates):
+        mid = f"M{i}"
+        rows.extend(
+            [
+                {
+                    "tid": "TA",
+                    "mid": mid,
+                    "date": date,
+                    "league": "LCK",
+                    "perf": 0.4,
+                },
+                {
+                    "tid": "TB",
+                    "mid": mid,
+                    "date": date,
+                    "league": "LEC",
+                    "perf": 0.6,
+                },
+            ]
+        )
+    return pd.DataFrame(rows)
+
+
+@pytest.mark.parametrize("use_polars", [False, True])
+def test_league_start_rating_optimizer__adjusts_player_leagues(use_polars):
+    cn = ColumnNames(
+        player_id="pid",
+        team_id="tid",
+        match_id="mid",
+        start_date="date",
+        league="league",
+    )
+    df = _player_df()
+    if use_polars:
+        df = pl.from_pandas(df)
+    generator = PlayerRatingGenerator(performance_column="perf", column_names=cn)
+    optimizer = LeagueStartRatingOptimizer(
+        rating_generator=generator,
+        n_iterations=1,
+        learning_rate=0.5,
+        min_cross_region_rows=1,
+    )
+
+    result = optimizer.optimize(df)
+
+    assert result.league_ratings["LCK"] < 1000
+    assert result.league_ratings["LEC"] > 1000
+
+
+@pytest.mark.parametrize("use_polars", [False, True])
+def test_league_start_rating_optimizer__adjusts_team_leagues(use_polars):
+    cn = ColumnNames(
+        team_id="tid",
+        match_id="mid",
+        start_date="date",
+        league="league",
+    )
+    df = _team_df()
+    if use_polars:
+        df = pl.from_pandas(df)
+    generator = TeamRatingGenerator(performance_column="perf", column_names=cn)
+    optimizer = LeagueStartRatingOptimizer(
+        rating_generator=generator,
+        n_iterations=1,
+        learning_rate=0.5,
+        min_cross_region_rows=1,
+    )
+
+    result = optimizer.optimize(df)
+
+    assert result.league_ratings["LCK"] < 1000
+    assert result.league_ratings["LEC"] > 1000
tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py
CHANGED
@@ -97,6 +97,11 @@ def test_nba_player_ratings_hyperparameter_tuning__workflow_completes(
         "confidence_max_sum",
         "use_off_def_split",
         "performance_predictor",
+        "start_team_weight",
+        "start_league_quantile",
+        "start_min_count_for_percentiles",
+        "start_min_match_count_team_rating",
+        "start_team_rating_subtract",
     }
     assert set(result.best_params.keys()) == expected_params
 
tests/ratings/test_player_rating_generator.py
CHANGED
@@ -1662,3 +1662,30 @@ def test_player_rating_team_with_strong_offense_and_weak_defense_gets_expected_r
 
     assert a_off > start_rating
     assert a_def < start_rating
+
+
+def test_fit_transform__player_rating_difference_from_team_projected_feature(base_cn, sample_df):
+    """PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED computes player_off_rating - team_off_rating_projected."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=base_cn,
+        auto_scale_performance=True,
+        features_out=[
+            RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED,
+            RatingKnownFeatures.PLAYER_OFF_RATING,
+            RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED,
+        ],
+    )
+    result = gen.fit_transform(sample_df)
+
+    diff_col = "player_rating_difference_from_team_projected_perf"
+    player_col = "player_off_rating_perf"
+    team_col = "team_off_rating_projected_perf"
+
+    assert diff_col in result.columns
+    assert player_col in result.columns
+    assert team_col in result.columns
+
+    for row in result.iter_rows(named=True):
+        expected = row[player_col] - row[team_col]
+        assert row[diff_col] == pytest.approx(expected, rel=1e-9)
tests/scorer/test_score.py
CHANGED
@@ -2048,3 +2048,93 @@ def test_all_scorers_handle_all_nan_targets(df_type):
             assert np.isnan(score) or score == 0.0
         except (ValueError, IndexError):
             pass
+SCORER_VALIDATION_CASES = [
+    pytest.param(
+        lambda: MeanBiasScorer(pred_column="pred", target="target", validation_column="is_validation"),
+        lambda: pd.DataFrame(
+            {
+                "pred": [2.0, 0.0],
+                "target": [1.0, 2.0],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="mean_bias",
+    ),
+    pytest.param(
+        lambda: PWMSE(pred_column="pred", target="target", labels=[0, 1], validation_column="is_validation"),
+        lambda: pd.DataFrame(
+            {
+                "pred": [[0.7, 0.3], [0.4, 0.6]],
+                "target": [0, 1],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="pwmse",
+    ),
+    pytest.param(
+        lambda: SklearnScorer(
+            scorer_function=mean_absolute_error, pred_column="pred", target="target", validation_column="is_validation"
+        ),
+        lambda: pd.DataFrame(
+            {
+                "pred": [1.0, 0.0],
+                "target": [1.0, 0.0],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="sklearn",
+    ),
+    pytest.param(
+        lambda: ProbabilisticMeanBias(
+            pred_column="pred", target="target", class_column_name="classes", validation_column="is_validation"
+        ),
+        lambda: pd.DataFrame(
+            {
+                "pred": [[0.2, 0.8], [0.6, 0.4]],
+                "target": [1, 0],
+                "classes": [[0, 1], [0, 1]],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="probabilistic_mean_bias",
+    ),
+    pytest.param(
+        lambda: OrdinalLossScorer(pred_column="pred", target="target", classes=[0, 1], validation_column="is_validation"),
+        lambda: pd.DataFrame(
+            {
+                "pred": [[0.2, 0.8], [0.6, 0.4]],
+                "target": [1, 0],
+                "is_validation": [1, 0],
+            }
+        ),
+        id="ordinal_loss",
+    ),
+    pytest.param(
+        lambda: ThresholdEventScorer(
+            dist_column="dist",
+            threshold_column="threshold",
+            outcome_column="outcome",
+            comparator=Operator.GREATER_THAN_OR_EQUALS,
+            validation_column="is_validation",
+        ),
+        lambda: pd.DataFrame(
+            {
+                "dist": [[0.2, 0.8], [0.6, 0.4], [0.3, 0.7]],
+                "threshold": [0.5, 0.2, 0.3],
+                "outcome": [1, 0, 1],
+                "is_validation": [1, 1, 0],
+            }
+        ),
+        id="threshold_event",
+    ),
+]
+
+
+@pytest.mark.parametrize("scorer_factory, df_factory", SCORER_VALIDATION_CASES)
+def test_scorers_respect_validation_column(scorer_factory, df_factory):
+    """Scorers should filter on validation_column when specified."""
+    df = df_factory()
+    df_valid = df[df["is_validation"] == 1]
+    score_all = scorer_factory().score(df)
+    score_valid = scorer_factory().score(df_valid)
+    assert score_all == score_valid
{spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/WHEEL
File without changes
{spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/licenses/LICENSE
File without changes
{spforge-0.8.5.dist-info → spforge-0.8.7.dist-info}/top_level.txt
File without changes