spforge 0.8.5__py3-none-any.whl → 0.8.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,123 +1,106 @@
1
+ import polars as pl
1
2
  from lightgbm import LGBMRegressor
2
- from sklearn.linear_model import LogisticRegression
3
3
 
4
4
  from examples import get_sub_sample_lol_data
5
5
  from spforge import AutoPipeline, ColumnNames, FeatureGeneratorPipeline
6
- from spforge.cross_validator import MatchKFoldCrossValidator
7
- from spforge.distributions import (
8
- NegativeBinomialEstimator,
9
- )
6
+ from spforge.distributions import NegativeBinomialEstimator
10
7
  from spforge.feature_generator import LagTransformer, RollingWindowTransformer
11
- from spforge.performance_transformers._performance_manager import ColumnWeight
12
- from spforge.ratings import (
13
- PlayerRatingGenerator,
14
- RatingKnownFeatures,
15
- )
8
+ from spforge.transformers import EstimatorTransformer
16
9
 
17
10
  column_names = ColumnNames(
18
11
  team_id="teamname",
19
12
  match_id="gameid",
20
13
  start_date="date",
21
- player_id="playername",
14
+ player_id="player_uid",
22
15
  league="league",
23
16
  position="position",
24
17
  )
25
- df = get_sub_sample_lol_data(as_pandas=True)
18
+
19
+ df = get_sub_sample_lol_data(as_pandas=False, as_polars=True)
26
20
  df = (
27
- df.loc[lambda x: x.position != "team"]
28
- .assign(team_count=df.groupby("gameid")["teamname"].transform("nunique"))
29
- .loc[lambda x: x.team_count == 2]
30
- .assign(player_count=df.groupby(["gameid", "teamname"])["playername"].transform("nunique"))
31
- .loc[lambda x: x.player_count == 5]
21
+ df.with_columns(
22
+ pl.concat_str([pl.col("playername"), pl.col("teamname")], separator="__").alias(
23
+ column_names.player_id
24
+ )
25
+ )
26
+ .filter(pl.col(column_names.position) != "team")
27
+ .with_columns(
28
+ pl.col(column_names.team_id)
29
+ .n_unique()
30
+ .over(column_names.match_id)
31
+ .alias("team_count"),
32
+ pl.col(column_names.player_id)
33
+ .n_unique()
34
+ .over([column_names.match_id, column_names.team_id])
35
+ .alias("player_count"),
36
+ )
37
+ .filter((pl.col("team_count") == 2) & (pl.col("player_count") == 5))
38
+ .drop(["team_count", "player_count"])
39
+ .unique(subset=[column_names.match_id, column_names.player_id, column_names.team_id])
40
+ .sort(
41
+ [
42
+ column_names.start_date,
43
+ column_names.match_id,
44
+ column_names.team_id,
45
+ column_names.player_id,
46
+ ]
47
+ )
32
48
  )
33
- df = df.assign(team_count=df.groupby("gameid")["teamname"].transform("nunique")).loc[
34
- lambda x: x.team_count == 2
35
- ]
36
-
37
- df = df.drop_duplicates(subset=["gameid", "playername", "teamname"])
38
49
 
39
- # Pretends the last 10 games are future games. The most will be trained on everything before that.
40
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
41
- historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
42
- future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["result"])
43
- rating_generator_player_kills = PlayerRatingGenerator(
44
- features_out=[RatingKnownFeatures.PLAYER_RATING],
45
- performance_column="performance_kills",
46
- auto_scale_performance=True,
47
- performance_weights=[ColumnWeight(name="kills", weight=1)],
50
+ most_recent_10_games = (
51
+ df.select(pl.col(column_names.match_id))
52
+ .unique(maintain_order=True)
53
+ .tail(10)
54
+ .get_column(column_names.match_id)
55
+ .to_list()
48
56
  )
49
- rating_generator_result = PlayerRatingGenerator(
50
- features_out=[RatingKnownFeatures.TEAM_RATING_DIFFERENCE_PROJECTED],
51
- performance_column="result",
52
- non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
53
- )
54
-
57
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
58
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("kills")
55
59
 
56
- lag_generators = [
57
- LagTransformer(
58
- features=["kills", "deaths", "result"], lag_length=3, granularity=["playername"]
59
- ),
60
+ lag_transformers = [
61
+ LagTransformer(features=["kills", "deaths"], lag_length=3, granularity=["player_uid"]),
60
62
  RollingWindowTransformer(
61
- features=["kills", "deaths", "result"],
63
+ features=["kills", "deaths"],
62
64
  window=20,
63
65
  min_periods=1,
64
- granularity=["playername"],
66
+ granularity=["player_uid"],
65
67
  ),
66
68
  ]
67
69
 
68
70
  features_generator = FeatureGeneratorPipeline(
69
71
  column_names=column_names,
70
- feature_generators=[rating_generator_player_kills, rating_generator_result, *lag_generators],
72
+ feature_generators=lag_transformers,
71
73
  )
72
74
 
73
- historical_df = features_generator.fit_transform(historical_df)
74
-
75
- game_winner_predictor = SklearnPredictor(
76
- estimator=LogisticRegression(),
77
- target="result",
78
- features=rating_generator_result.features_out,
79
- granularity=[column_names.match_id, column_names.team_id],
80
- )
81
- game_winner_pipeline = AutoPipeline(
82
- predictor=game_winner_predictor, one_hot_encode_cat_features=True, impute_missing_values=True
83
- )
75
+ historical_df = features_generator.fit_transform(historical_df).to_pandas()
76
+ future_df = features_generator.future_transform(future_df).to_pandas()
84
77
 
85
- player_kills_predictor = SklearnPredictor(
86
- estimator=LGBMRegressor(verbose=-100),
87
- target="kills",
88
- features=[game_winner_predictor.pred_column, *features_generator.features_out],
78
+ point_estimate_transformer = EstimatorTransformer(
79
+ prediction_column_name="kills_estimate",
80
+ estimator=LGBMRegressor(verbose=-100, random_state=42),
81
+ features=features_generator.features_out,
89
82
  )
90
83
 
91
- cross_validator_game_winner = MatchKFoldCrossValidator(
92
- date_column_name=column_names.start_date,
93
- match_id_column_name=column_names.match_id,
94
- estimator=game_winner_predictor,
84
+ probability_estimator = NegativeBinomialEstimator(
85
+ max_value=15,
86
+ point_estimate_pred_column="kills_estimate",
87
+ r_specific_granularity=[column_names.player_id],
88
+ predicted_r_weight=1,
89
+ column_names=column_names,
95
90
  )
96
91
 
97
- game_winner_predictor.train(historical_df)
98
- historical_df = cross_validator_game_winner.generate_validation_df(historical_df)
99
-
100
- cross_validator_player_kills = MatchKFoldCrossValidator(
101
- date_column_name=column_names.start_date,
102
- match_id_column_name=column_names.match_id,
103
- estimator=player_kills_predictor,
92
+ pipeline = AutoPipeline(
93
+ estimator=probability_estimator,
94
+ estimator_features=features_generator.features_out,
95
+ predictor_transformers=[point_estimate_transformer],
104
96
  )
105
97
 
106
- player_kills_predictor.train(historical_df)
107
- print(player_kills_predictor.features)
108
- historical_df = cross_validator_player_kills.generate_validation_df(historical_df)
109
-
110
- future_df = features_generator.future_transform(future_df)
111
- future_df = game_winner_predictor.predict(future_df)
112
- future_df = player_kills_predictor.predict(future_df)
113
-
114
- probability_predictor = NegativeBinomialEstimator(
115
- target="kills",
116
- point_estimate_pred_column=player_kills_predictor.pred_column,
117
- max_value=15,
118
- )
98
+ pipeline.fit(X=historical_df, y=historical_df["kills"])
119
99
 
120
- probability_predictor.train(historical_df)
121
- future_df = probability_predictor.predict(future_df)
100
+ future_point_estimates = pipeline.predict(future_df)
101
+ future_probabilities = pipeline.predict_proba(future_df)
102
+ future_df["kills_pred"] = future_point_estimates
122
103
 
123
- print(future_df.head(10))
104
+ print(future_df.head(5))
105
+ print(f"Probability matrix shape: {future_probabilities.shape}")
106
+ print(f"First row probabilities (0-15 kills): {future_probabilities[0]}")
@@ -51,7 +51,7 @@ print("\nApproach 1: LGBMClassifier (direct probability prediction)")
51
51
  print("-" * 70)
52
52
  pipeline_classifier = AutoPipeline(
53
53
  estimator=LGBMClassifier(verbose=-100, random_state=42),
54
- feature_names=features_generator.features_out,
54
+ estimator_features=features_generator.features_out,
55
55
  )
56
56
 
57
57
  cross_validator_classifier = MatchKFoldCrossValidator(
@@ -60,7 +60,7 @@ cross_validator_classifier = MatchKFoldCrossValidator(
60
60
  estimator=pipeline_classifier,
61
61
  prediction_column_name="points_probabilities_classifier",
62
62
  target_column="points",
63
- features=pipeline_classifier.feature_names,
63
+ features=pipeline_classifier.required_features,
64
64
  )
65
65
  validation_df_classifier = cross_validator_classifier.generate_validation_df(df=df)
66
66
 
@@ -80,20 +80,13 @@ print("-" * 70)
80
80
  predictor_negbin = NegativeBinomialEstimator(
81
81
  max_value=40,
82
82
  point_estimate_pred_column="points_estimate",
83
- r_specific_granularity=["player_id"],
84
83
  predicted_r_weight=1,
85
84
  column_names=column_names,
86
85
  )
87
86
 
88
87
  pipeline_negbin = AutoPipeline(
89
88
  estimator=predictor_negbin,
90
- feature_names=features_generator.features_out,
91
- context_feature_names=[
92
- column_names.player_id,
93
- column_names.start_date,
94
- column_names.team_id,
95
- column_names.match_id,
96
- ],
89
+ estimator_features=features_generator.features_out,
97
90
  predictor_transformers=[
98
91
  EstimatorTransformer(
99
92
  prediction_column_name="points_estimate",
@@ -109,7 +102,7 @@ cross_validator_negbin = MatchKFoldCrossValidator(
109
102
  estimator=pipeline_negbin,
110
103
  prediction_column_name="points_probabilities_negbin",
111
104
  target_column="points",
112
- features=pipeline_negbin.context_feature_names + pipeline_negbin.feature_names,
105
+ features=pipeline_negbin.required_features,
113
106
  )
114
107
  validation_df_negbin = cross_validator_negbin.generate_validation_df(df=df)
115
108
 
@@ -13,7 +13,7 @@ Key concepts covered:
13
13
  - State management: fit_transform vs future_transform
14
14
  """
15
15
 
16
- import pandas as pd
16
+ import polars as pl
17
17
 
18
18
  from examples import get_sub_sample_nba_data
19
19
  from spforge import FeatureGeneratorPipeline
@@ -22,7 +22,7 @@ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
22
22
  from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
23
23
 
24
24
  # Load sample NBA data
25
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
25
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
26
26
 
27
27
  # Define column mappings for your dataset
28
28
  # This tells spforge which columns contain team IDs, player IDs, dates, etc.
@@ -35,7 +35,7 @@ column_names = ColumnNames(
35
35
 
36
36
  # CRITICAL: Always sort data chronologically before generating features
37
37
  # This ensures temporal ordering and prevents future leakage (using future data to predict the past)
38
- df = df.sort_values(
38
+ df = df.sort(
39
39
  [
40
40
  column_names.start_date, # First by date
41
41
  column_names.match_id, # Then by match
@@ -46,13 +46,21 @@ df = df.sort_values(
46
46
 
47
47
  # Keep only games with exactly 2 teams (filter out invalid data)
48
48
  df = (
49
- df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
50
- .loc[lambda x: x.team_count == 2]
51
- .drop(columns=["team_count"])
49
+ df.with_columns(
50
+ pl.col(column_names.team_id)
51
+ .n_unique()
52
+ .over(column_names.match_id)
53
+ .alias("team_count")
54
+ )
55
+ .filter(pl.col("team_count") == 2)
56
+ .drop("team_count")
52
57
  )
53
58
 
54
- print(f"Dataset: {len(df)} rows, {df[column_names.match_id].nunique()} games")
55
- print(f"Date range: {df[column_names.start_date].min()} to {df[column_names.start_date].max()}")
59
+ match_count = df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
60
+ start_date = df.select(pl.col(column_names.start_date).min()).to_series().item()
61
+ end_date = df.select(pl.col(column_names.start_date).max()).to_series().item()
62
+ print(f"Dataset: {len(df)} rows, {match_count} games")
63
+ print(f"Date range: {start_date} to {end_date}")
56
64
  print()
57
65
 
58
66
  # ====================================================================
@@ -125,12 +133,22 @@ print()
125
133
  # ====================================================================
126
134
 
127
135
  # Split data into historical (for training) and future (for prediction)
128
- most_recent_5_games = df[column_names.match_id].unique()[-5:]
129
- historical_df = df[~df[column_names.match_id].isin(most_recent_5_games)].copy()
130
- future_df = df[df[column_names.match_id].isin(most_recent_5_games)].copy()
136
+ most_recent_5_games = (
137
+ df.select(pl.col(column_names.match_id))
138
+ .unique(maintain_order=True)
139
+ .tail(5)
140
+ .get_column(column_names.match_id)
141
+ .to_list()
142
+ )
143
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_5_games))
144
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_5_games))
131
145
 
132
- print(f"Historical data: {len(historical_df)} rows, {historical_df[column_names.match_id].nunique()} games")
133
- print(f"Future data: {len(future_df)} rows, {future_df[column_names.match_id].nunique()} games")
146
+ historical_games = (
147
+ historical_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
148
+ )
149
+ future_games = future_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
150
+ print(f"Historical data: {len(historical_df)} rows, {historical_games} games")
151
+ print(f"Future data: {len(future_df)} rows, {future_games} games")
134
152
  print()
135
153
 
136
154
  # FIT_TRANSFORM: Learn from historical data
@@ -138,7 +156,7 @@ print()
138
156
  # - Lags/rolling windows build up from initial games
139
157
  # - Internal state (ratings, windows) is MUTATED
140
158
  print("Applying fit_transform to historical data...")
141
- historical_df = features_pipeline.fit_transform(historical_df)
159
+ historical_df = features_pipeline.fit_transform(historical_df).to_pandas()
142
160
  print(f" Generated {len(features_pipeline.features_out)} features:")
143
161
  for feature in features_pipeline.features_out:
144
162
  print(f" - {feature}")
@@ -149,7 +167,7 @@ print()
149
167
  # - Appends current game to lag/rolling windows but doesn't persist the update
150
168
  # - This is what you use in production: generate features without affecting your model's state
151
169
  print("Applying future_transform to future data (read-only)...")
152
- future_df_transformed = features_pipeline.future_transform(future_df)
170
+ future_df_transformed = features_pipeline.future_transform(future_df).to_pandas()
153
171
  print(f" Future data now has {len(future_df_transformed.columns)} columns")
154
172
  print()
155
173
 
@@ -1,12 +1,13 @@
1
- import pandas as pd
1
+ import polars as pl
2
2
  from sklearn.linear_model import LogisticRegression
3
3
 
4
+ from examples import get_sub_sample_nba_data
4
5
  from spforge.autopipeline import AutoPipeline
5
6
  from spforge.data_structures import ColumnNames
6
7
  from spforge.ratings import RatingKnownFeatures
7
8
  from spforge.ratings._player_rating import PlayerRatingGenerator
8
9
 
9
- df = pd.read_parquet("data/game_player_subsample.parquet")
10
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
10
11
 
11
12
  # Defines the column names as they appear in the dataframe
12
13
  column_names = ColumnNames(
@@ -16,8 +17,8 @@ column_names = ColumnNames(
16
17
  player_id="player_name",
17
18
  )
18
19
  # Sorts the dataframe. The dataframe must always be sorted as below
19
- df = df.sort_values(
20
- by=[
20
+ df = df.sort(
21
+ [
21
22
  column_names.start_date,
22
23
  column_names.match_id,
23
24
  column_names.team_id,
@@ -27,17 +28,26 @@ df = df.sort_values(
27
28
 
28
29
  # Drops games with less or more than 2 teams
29
30
  df = (
30
- df.assign(
31
- team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique")
31
+ df.with_columns(
32
+ pl.col(column_names.team_id)
33
+ .n_unique()
34
+ .over(column_names.match_id)
35
+ .alias("team_count")
32
36
  )
33
- .loc[lambda x: x.team_count == 2]
34
- .drop(columns=["team_count"])
37
+ .filter(pl.col("team_count") == 2)
38
+ .drop("team_count")
35
39
  )
36
40
 
37
41
  # Pretends the last 10 games are future games. The model will be trained on everything before that.
38
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
39
- historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
40
- future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["won"])
42
+ most_recent_10_games = (
43
+ df.select(pl.col(column_names.match_id))
44
+ .unique(maintain_order=True)
45
+ .tail(10)
46
+ .get_column(column_names.match_id)
47
+ .to_list()
48
+ )
49
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
50
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("won")
41
51
 
42
52
  # Defining a simple rating-generator. It will use the "won" column to update the ratings.
43
53
  # In contrast to a typical Elo, ratings will follow players.
@@ -49,7 +59,7 @@ rating_generator = PlayerRatingGenerator(
49
59
  column_names=column_names,
50
60
  non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
51
61
  )
52
- historical_df = rating_generator.fit_transform(historical_df)
62
+ historical_df = rating_generator.fit_transform(historical_df).to_pandas()
53
63
 
54
64
  # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
55
65
  # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
@@ -61,13 +71,13 @@ historical_df = rating_generator.fit_transform(historical_df)
61
71
  pipeline = AutoPipeline(
62
72
  estimator=LogisticRegression(),
63
73
  granularity=["game_id", "team_id"],
64
- feature_names=rating_generator.features_out + ["location"],
74
+ estimator_features=rating_generator.features_out + ["location"],
65
75
  )
66
76
 
67
77
  pipeline.fit(X=historical_df, y=historical_df["won"])
68
78
 
69
79
  # Future predictions on future results
70
- future_df = rating_generator.future_transform(future_df)
80
+ future_df = rating_generator.future_transform(future_df).to_pandas()
71
81
  future_predictions = pipeline.predict_proba(future_df)[:, 1]
72
82
  future_df["game_winner_probability"] = future_predictions
73
83
  # Grouping predictions from game-player level to game-level.
@@ -12,7 +12,7 @@ Key concepts covered:
12
12
  - Hierarchical modeling: Team strength → Player performance
13
13
  """
14
14
 
15
- import pandas as pd
15
+ import polars as pl
16
16
  from lightgbm import LGBMRegressor
17
17
  from sklearn.linear_model import LogisticRegression
18
18
 
@@ -24,7 +24,7 @@ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
24
24
  from spforge.transformers import EstimatorTransformer
25
25
 
26
26
  # Load sample NBA data
27
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
27
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)
28
28
 
29
29
  # Define column mappings
30
30
  column_names = ColumnNames(
@@ -35,7 +35,7 @@ column_names = ColumnNames(
35
35
  )
36
36
 
37
37
  # Sort data chronologically (critical for temporal correctness)
38
- df = df.sort_values(
38
+ df = df.sort(
39
39
  [
40
40
  column_names.start_date,
41
41
  column_names.match_id,
@@ -46,18 +46,31 @@ df = df.sort_values(
46
46
 
47
47
  # Filter to valid games
48
48
  df = (
49
- df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
50
- .loc[lambda x: x.team_count == 2]
51
- .drop(columns=["team_count"])
49
+ df.with_columns(
50
+ pl.col(column_names.team_id)
51
+ .n_unique()
52
+ .over(column_names.match_id)
53
+ .alias("team_count")
54
+ )
55
+ .filter(pl.col("team_count") == 2)
56
+ .drop("team_count")
52
57
  )
53
58
 
54
59
  # Train/test split (using temporal ordering)
55
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
56
- train_df = df[~df[column_names.match_id].isin(most_recent_10_games)].copy()
57
- test_df = df[df[column_names.match_id].isin(most_recent_10_games)].copy()
60
+ most_recent_10_games = (
61
+ df.select(pl.col(column_names.match_id))
62
+ .unique(maintain_order=True)
63
+ .tail(10)
64
+ .get_column(column_names.match_id)
65
+ .to_list()
66
+ )
67
+ train_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
68
+ test_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games))
58
69
 
59
- print(f"Training: {len(train_df)} rows, {train_df[column_names.match_id].nunique()} games")
60
- print(f"Testing: {len(test_df)} rows, {test_df[column_names.match_id].nunique()} games")
70
+ train_games = train_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
71
+ test_games = test_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
72
+ print(f"Training: {len(train_df)} rows, {train_games} games")
73
+ print(f"Testing: {len(test_df)} rows, {test_games} games")
61
74
  print()
62
75
 
63
76
  # ====================================================================
@@ -86,8 +99,8 @@ features_pipeline = FeatureGeneratorPipeline(
86
99
  )
87
100
 
88
101
  # Generate features
89
- train_df = features_pipeline.fit_transform(train_df)
90
- test_df = features_pipeline.future_transform(test_df)
102
+ train_df = features_pipeline.fit_transform(train_df).to_pandas()
103
+ test_df = features_pipeline.future_transform(test_df).to_pandas()
91
104
 
92
105
  print(f"Generated {len(features_pipeline.features_out)} baseline features")
93
106
  print()
@@ -121,7 +134,7 @@ player_points_pipeline = AutoPipeline(
121
134
  estimator=LGBMRegressor(verbose=-100, n_estimators=50),
122
135
  # Features for the final estimator (only pre-game information)
123
136
  # Note: points_estimate_raw will be added by the transformer
124
- feature_names=features_pipeline.features_out,
137
+ estimator_features=features_pipeline.features_out,
125
138
  # The predictor_transformers parameter chains the estimators
126
139
  predictor_transformers=[points_estimate_transformer], # Stage 1 executes first
127
140
  )
@@ -150,7 +163,7 @@ print()
150
163
 
151
164
  # Fit the pipeline
152
165
  # The y target here is for the FINAL estimator (player points)
153
- # Each predictor_transformer has its own target_column specified
166
+ # Predictor_transformers are trained on the same target during fit()
154
167
  player_points_pipeline.fit(X=train_df, y=train_df["points"])
155
168
 
156
169
  print("Training complete!")
@@ -188,7 +201,7 @@ print()
188
201
 
189
202
  single_stage_pipeline = AutoPipeline(
190
203
  estimator=LGBMRegressor(verbose=-100, n_estimators=50),
191
- feature_names=features_pipeline.features_out,
204
+ estimator_features=features_pipeline.features_out,
192
205
  )
193
206
 
194
207
  print("Training single-stage baseline for comparison...")
@@ -127,7 +127,7 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
127
127
  """
128
128
  Default search space for PlayerRatingGenerator.
129
129
 
130
- Focuses on 5-8 core parameters that have the most impact on performance.
130
+ Focuses on core parameters that have the most impact on performance.
131
131
 
132
132
  Returns:
133
133
  Dictionary mapping parameter names to ParamSpec objects
@@ -167,6 +167,31 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
167
167
  param_type="categorical",
168
168
  choices=["difference", "mean", "ignore_opponent"],
169
169
  ),
170
+ "start_league_quantile": ParamSpec(
171
+ param_type="float",
172
+ low=0.05,
173
+ high=0.5,
174
+ ),
175
+ "start_min_count_for_percentiles": ParamSpec(
176
+ param_type="int",
177
+ low=40,
178
+ high=500,
179
+ ),
180
+ "start_team_rating_subtract": ParamSpec(
181
+ param_type="float",
182
+ low=0.0,
183
+ high=200.0,
184
+ ),
185
+ "start_team_weight": ParamSpec(
186
+ param_type="float",
187
+ low=0.0,
188
+ high=1.0,
189
+ ),
190
+ "start_min_match_count_team_rating": ParamSpec(
191
+ param_type="int",
192
+ low=1,
193
+ high=10,
194
+ ),
170
195
  }
171
196
 
172
197
 
@@ -6,3 +6,7 @@ from .enums import (
6
6
  RatingUnknownFeatures as RatingUnknownFeatures,
7
7
  )
8
8
  from .league_identifier import LeagueIdentifier as LeagueIdentifier
9
+ from .league_start_rating_optimizer import (
10
+ LeagueStartRatingOptimizationResult as LeagueStartRatingOptimizationResult,
11
+ LeagueStartRatingOptimizer as LeagueStartRatingOptimizer,
12
+ )
@@ -129,6 +129,9 @@ class PlayerRatingGenerator(RatingGenerator):
129
129
  str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_PROJECTED)
130
130
  )
131
131
  self.MEAN_PROJ_COL = self._suffix(str(RatingKnownFeatures.RATING_MEAN_PROJECTED))
132
+ self.PLAYER_DIFF_FROM_TEAM_PROJ_COL = self._suffix(
133
+ str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED)
134
+ )
132
135
 
133
136
  self.TEAM_OFF_RATING_PROJ_COL = self._suffix(
134
137
  str(RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED)
@@ -618,6 +621,7 @@ class PlayerRatingGenerator(RatingGenerator):
618
621
  or self.OPP_RATING_PROJ_COL in cols_to_add
619
622
  or self.DIFF_PROJ_COL in cols_to_add
620
623
  or self.MEAN_PROJ_COL in cols_to_add
624
+ or self.PLAYER_DIFF_FROM_TEAM_PROJ_COL in cols_to_add
621
625
  ):
622
626
  df = add_team_rating_projected(
623
627
  df=df,
@@ -673,6 +677,13 @@ class PlayerRatingGenerator(RatingGenerator):
673
677
  )
674
678
  )
675
679
 
680
+ if self.PLAYER_DIFF_FROM_TEAM_PROJ_COL in cols_to_add:
681
+ df = df.with_columns(
682
+ (pl.col(self.PLAYER_OFF_RATING_COL) - pl.col(self.TEAM_OFF_RATING_PROJ_COL)).alias(
683
+ self.PLAYER_DIFF_FROM_TEAM_PROJ_COL
684
+ )
685
+ )
686
+
676
687
  if (
677
688
  self.TEAM_RATING_COL in cols_to_add
678
689
  or self.OPP_RATING_COL in cols_to_add
@@ -0,0 +1,201 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from dataclasses import dataclass
5
+
6
+ import narwhals.stable.v2 as nw
7
+ import polars as pl
8
+ from narwhals.stable.v2.typing import IntoFrameT
9
+
10
+
11
+ DEFAULT_START_RATING = 1000.0
12
+
13
+
14
@dataclass
class LeagueStartRatingOptimizationResult:
    """Container for what ``LeagueStartRatingOptimizer.optimize`` returns."""

    # League -> start rating after the last completed iteration.
    league_ratings: dict[str, float]
    # One dict per completed iteration: league -> mean cross-league error.
    iteration_errors: list[dict[str, float]]
18
+
19
+
20
class LeagueStartRatingOptimizer:
    """Iteratively adjust per-league start ratings from cross-league prediction errors.

    Each iteration deep-copies ``rating_generator``, installs the current league
    start ratings on the copy, runs ``fit_transform`` and measures
    ``performance - predicted performance`` on rows whose league differs from the
    opponent's most common league. Every league's start rating is then moved by
    ``learning_rate * mean_error * scale``.

    Args:
        rating_generator: Object exposing ``column_names``, ``performance_column``,
            ``start_rating_generator.league_ratings``, ``fit_transform`` and either
            ``PLAYER_PRED_PERF_COL`` or ``TEAM_PRED_OFF_PERF_COL``.
        n_iterations: Maximum number of refit/update rounds.
        learning_rate: Step size applied to each league's mean error.
        min_cross_region_rows: Leagues with fewer cross-league rows than this are
            left unchanged in a given iteration.
        rating_scale: Multiplier converting performance error into rating points.
            When ``None``, the generator's ``rating_change_multiplier_offense``
            attribute is used (falling back to ``1.0`` if it is absent).
    """

    def __init__(
        self,
        rating_generator: object,
        n_iterations: int = 3,
        learning_rate: float = 0.2,
        min_cross_region_rows: int = 10,
        rating_scale: float | None = None,
    ):
        self.rating_generator = rating_generator
        self.n_iterations = int(n_iterations)
        self.learning_rate = float(learning_rate)
        self.min_cross_region_rows = int(min_cross_region_rows)
        self.rating_scale = rating_scale

    @nw.narwhalify
    def optimize(self, df: IntoFrameT) -> LeagueStartRatingOptimizationResult:
        """Run the optimisation loop and store the result on ``self.rating_generator``.

        Args:
            df: Input frame; converted to polars if it is not already polars-backed.

        Returns:
            The final league ratings and, per completed iteration, the mean
            cross-league error of every league that had cross-league rows.

        Raises:
            ValueError: If the rating generator lacks the attributes or columns
                the helpers below require.
        """
        pl_df = df.to_native() if df.implementation.is_polars() else df.to_polars()
        league_ratings = self._get_league_ratings(self.rating_generator)
        iteration_errors: list[dict[str, float]] = []

        for _ in range(self.n_iterations):
            # Work on a copy so each refit starts from a clean generator state.
            gen = copy.deepcopy(self.rating_generator)
            self._set_league_ratings(gen, league_ratings)
            self._ensure_prediction_columns(gen)

            pred_df = gen.fit_transform(pl_df)
            error_df = self._cross_region_error_df(pl_df, pred_df, gen)
            if error_df.is_empty():
                # No cross-league rows: nothing to learn from.
                break

            league_key = self._league_column_name(gen)
            error_summary = (
                error_df.group_by(league_key)
                .agg(
                    pl.col("error").mean().alias("mean_error"),
                    pl.len().alias("row_count"),
                )
                .to_dicts()
            )
            iteration_errors.append({r[league_key]: r["mean_error"] for r in error_summary})
            league_ratings = self._apply_error_updates(
                gen, league_ratings, error_summary, league_key
            )

        self._set_league_ratings(self.rating_generator, league_ratings)
        return LeagueStartRatingOptimizationResult(
            league_ratings=league_ratings, iteration_errors=iteration_errors
        )

    def _cross_region_error_df(
        self,
        df: pl.DataFrame,
        pred_df: pl.DataFrame,
        rating_generator: object,
    ) -> pl.DataFrame:
        """Return rows whose league differs from the opponent's, with an ``error`` column.

        ``error`` is ``performance_column - predicted performance``.

        Raises:
            ValueError: If ``column_names`` is missing, lacks match/team/league
                names, or a required column is absent from ``df``.
        """
        column_names = getattr(rating_generator, "column_names", None)
        if column_names is None:
            raise ValueError("rating_generator must define column_names")

        match_id = getattr(column_names, "match_id", None)
        team_id = getattr(column_names, "team_id", None)
        league_col = getattr(column_names, "league", None)
        if not match_id or not team_id or not league_col:
            raise ValueError("column_names must include match_id, team_id, and league")

        pred_col, entity_cols, perf_col = self._prediction_spec(rating_generator)
        base_cols = [match_id, team_id, league_col, perf_col]
        for col in base_cols + entity_cols:
            if col not in df.columns:
                raise ValueError(f"{col} missing from input dataframe")

        join_cols = [match_id, team_id] + entity_cols
        joined = df.select(base_cols + entity_cols).join(
            pred_df.select(join_cols + [pred_col]),
            on=join_cols,
            how="inner",
        )
        opp_league = self._opponent_mode_league(joined, match_id, team_id, league_col)
        enriched = joined.join(opp_league, on=[match_id, team_id], how="left").with_columns(
            (pl.col(perf_col) - pl.col(pred_col)).alias("error")
        )
        return enriched.filter(pl.col("opp_mode_league").is_not_null()).filter(
            pl.col(league_col) != pl.col("opp_mode_league")
        )

    def _opponent_mode_league(
        self, df: pl.DataFrame, match_id: str, team_id: str, league_col: str
    ) -> pl.DataFrame:
        """Return one row per (match, team) with the opponent's most common league.

        The result has columns ``match_id``, ``team_id`` and ``opp_mode_league``.
        """
        # Most frequent league per (match, team). `unique` defaults to keep="any"
        # without order guarantees, so keep="first" + maintain_order=True is
        # required for the preceding sort to actually select the mode. The league
        # name is a secondary sort key so ties resolve deterministically.
        team_mode = (
            df.group_by([match_id, team_id, league_col])
            .agg(pl.len().alias("__count"))
            .sort(["__count", league_col], descending=[True, False])
            .unique(subset=[match_id, team_id], keep="first", maintain_order=True)
            .select([match_id, team_id, league_col])
            .rename({league_col: "team_mode_league"})
        )
        # Pair every team with the other team(s) of the same match and take the
        # most frequent opponent league, again with a deterministic pick.
        opponents = (
            team_mode.join(team_mode, on=match_id, suffix="_opp")
            .filter(pl.col(team_id) != pl.col(f"{team_id}_opp"))
            .group_by([match_id, team_id, "team_mode_league_opp"])
            .agg(pl.len().alias("__count"))
            .sort(["__count", "team_mode_league_opp"], descending=[True, False])
            .unique(subset=[match_id, team_id], keep="first", maintain_order=True)
            .select([match_id, team_id, "team_mode_league_opp"])
            .rename({"team_mode_league_opp": "opp_mode_league"})
        )
        return opponents

    def _prediction_spec(self, rating_generator: object) -> tuple[str, list[str], str]:
        """Return ``(prediction column, extra join columns, performance column)``.

        Generators exposing ``PLAYER_PRED_PERF_COL`` additionally join on the
        player id; generators exposing only ``TEAM_PRED_OFF_PERF_COL`` do not.

        Raises:
            ValueError: If no performance column, no prediction column, or (for
                player generators) no player id is available.
        """
        perf_col = getattr(rating_generator, "performance_column", None)
        if not perf_col:
            raise ValueError("rating_generator must define performance_column")
        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
            pred_col = rating_generator.PLAYER_PRED_PERF_COL
            column_names = rating_generator.column_names
            player_id = getattr(column_names, "player_id", None)
            if not player_id:
                raise ValueError("column_names must include player_id for player ratings")
            return pred_col, [player_id], perf_col
        if hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
            pred_col = rating_generator.TEAM_PRED_OFF_PERF_COL
            return pred_col, [], perf_col
        raise ValueError("rating_generator must expose a predicted performance column")

    def _ensure_prediction_columns(self, rating_generator: object) -> None:
        """Append the predicted-performance column to ``non_predictor_features_out``."""
        pred_cols: list[str] = []
        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
            pred_cols.append(rating_generator.PLAYER_PRED_PERF_COL)
        elif hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
            pred_cols.append(rating_generator.TEAM_PRED_OFF_PERF_COL)

        if not pred_cols:
            return

        # Copy into a fresh list so the generator's original list is not mutated.
        existing = list(getattr(rating_generator, "non_predictor_features_out", []) or [])
        for col in pred_cols:
            if col not in existing:
                existing.append(col)
        rating_generator.non_predictor_features_out = existing

    def _apply_error_updates(
        self,
        rating_generator: object,
        league_ratings: dict[str, float],
        error_summary: list[dict[str, float]],
        league_key: str,
    ) -> dict[str, float]:
        """Return a copy of ``league_ratings`` shifted by each league's mean error.

        Leagues with fewer than ``min_cross_region_rows`` rows, or whose mean
        error is null, are left unchanged. Leagues not yet present start from
        ``DEFAULT_START_RATING``.
        """
        scale = self.rating_scale
        if scale is None:
            scale = getattr(rating_generator, "rating_change_multiplier_offense", 1.0)

        updated = dict(league_ratings)
        for row in error_summary:
            if row["row_count"] < self.min_cross_region_rows:
                continue
            mean_error = row["mean_error"]
            if mean_error is None:
                # All errors were null for this league; there is nothing to apply.
                continue
            league = row[league_key]
            base_rating = updated.get(league, DEFAULT_START_RATING)
            updated[league] = base_rating + self.learning_rate * mean_error * scale
        return updated

    def _league_column_name(self, rating_generator: object) -> str:
        """Return the league column name, raising ``ValueError`` if it is not set."""
        column_names = getattr(rating_generator, "column_names", None)
        league_col = getattr(column_names, "league", None)
        if not league_col:
            raise ValueError("column_names must include league for league adjustments")
        return league_col

    def _get_league_ratings(self, rating_generator: object) -> dict[str, float]:
        """Return a copy of ``start_rating_generator.league_ratings``.

        Raises:
            ValueError: If the generator has no such attribute chain.
        """
        start_gen = getattr(rating_generator, "start_rating_generator", None)
        if start_gen is None or not hasattr(start_gen, "league_ratings"):
            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
        return dict(start_gen.league_ratings)

    def _set_league_ratings(
        self, rating_generator: object, league_ratings: dict[str, float]
    ) -> None:
        """Install a copy of ``league_ratings`` on the generator's start-rating state.

        Also updates ``rating_generator.start_league_ratings`` when that
        attribute exists.

        Raises:
            ValueError: If the generator has no ``start_rating_generator.league_ratings``.
        """
        start_gen = getattr(rating_generator, "start_rating_generator", None)
        if start_gen is None or not hasattr(start_gen, "league_ratings"):
            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
        start_gen.league_ratings = dict(league_ratings)
        if hasattr(rating_generator, "start_league_ratings"):
            rating_generator.start_league_ratings = dict(league_ratings)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spforge
3
- Version: 0.8.5
3
+ Version: 0.8.7
4
4
  Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
5
5
  Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
6
6
  License: See LICENSE file
@@ -85,12 +85,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
85
85
  import pandas as pd
86
86
  from sklearn.linear_model import LogisticRegression
87
87
 
88
+ from examples import get_sub_sample_nba_data
88
89
  from spforge.autopipeline import AutoPipeline
89
90
  from spforge.data_structures import ColumnNames
90
- from spforge.ratings import RatingKnownFeatures
91
- from spforge.ratings._player_rating import PlayerRatingGenerator
91
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
92
92
 
93
- df = pd.read_parquet("data/game_player_subsample.parquet")
93
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
94
94
 
95
95
  # Step 1: Define column mappings for your dataset
96
96
  column_names = ColumnNames(
@@ -144,7 +144,7 @@ historical_df = rating_generator.fit_transform(historical_df)
144
144
  pipeline = AutoPipeline(
145
145
  estimator=LogisticRegression(),
146
146
  granularity=["game_id", "team_id"], # Aggregate players → teams
147
- feature_names=rating_generator.features_out + ["location"], # Rating + home/away
147
+ estimator_features=rating_generator.features_out + ["location"], # Rating + home/away
148
148
  )
149
149
 
150
150
  # Train on historical data
@@ -302,8 +302,8 @@ cross_validator = MatchKFoldCrossValidator(
302
302
  prediction_column_name="points_pred",
303
303
  target_column="points",
304
304
  n_splits=3, # Number of temporal folds
305
- # Must include both feature_names AND context_feature_names
306
- features=pipeline.feature_names + pipeline.context_feature_names,
305
+ # Must include both estimator features and context features
306
+ features=pipeline.required_features,
307
307
  )
308
308
 
309
309
  # Generate validation predictions
@@ -330,7 +330,7 @@ print(f"Validation MAE: {mae:.2f}")
330
330
  - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
331
331
  - Use `validation_column` in scorer to score only validation rows
332
332
  - Training data always comes BEFORE validation data chronologically
333
- - Must pass both `feature_names` + `context_feature_names` to `features` parameter
333
+ - Must pass all required features (use `pipeline.required_features`)
334
334
  - Scorers can filter rows (e.g., only score players who played minutes > 0)
335
335
 
336
336
  See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -371,7 +371,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
371
371
  # Approach 1: LGBMClassifier (direct probability prediction)
372
372
  pipeline_classifier = AutoPipeline(
373
373
  estimator=LGBMClassifier(verbose=-100, random_state=42),
374
- feature_names=features_pipeline.features_out,
374
+ estimator_features=features_pipeline.features_out,
375
375
  )
376
376
 
377
377
  # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -385,13 +385,7 @@ distribution_estimator = NegativeBinomialEstimator(
385
385
 
386
386
  pipeline_negbin = AutoPipeline(
387
387
  estimator=distribution_estimator,
388
- feature_names=features_pipeline.features_out,
389
- context_feature_names=[
390
- column_names.player_id,
391
- column_names.start_date,
392
- column_names.team_id,
393
- column_names.match_id,
394
- ],
388
+ estimator_features=features_pipeline.features_out,
395
389
  predictor_transformers=[
396
390
  EstimatorTransformer(
397
391
  prediction_column_name="points_estimate",
@@ -439,7 +433,7 @@ points_estimate_transformer = EstimatorTransformer(
439
433
  # Stage 2: Refine estimate using Stage 1 output
440
434
  player_points_pipeline = AutoPipeline(
441
435
  estimator=LGBMRegressor(verbose=-100, n_estimators=50),
442
- feature_names=features_pipeline.features_out, # Original features
436
+ estimator_features=features_pipeline.features_out, # Original features
443
437
  # predictor_transformers execute first, adding their predictions
444
438
  predictor_transformers=[points_estimate_transformer],
445
439
  )
@@ -474,4 +468,3 @@ For complete, runnable examples with detailed explanations:
474
468
  - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
475
469
  - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
476
470
  - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
477
-
@@ -1,15 +1,15 @@
1
1
  examples/__init__.py,sha256=qGLpphvrjQj0-zS9vP0Q07L-anDnmw7gFZJUEBgYG3U,158
2
2
  examples/game_level_example.py,sha256=EOr-H0K79O3Zah4wWuqa5DLmT2iZGbfgxD-xSU2-dfI,2244
3
3
  examples/lol/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- examples/lol/pipeline_transformer_example.py,sha256=HousFjE2dbJgdONur4PxwhW2SGQIJGI8aZUIb4TEvIo,4317
4
+ examples/lol/pipeline_transformer_example.py,sha256=XVmm6Xya5z7JyOA0s-DISOlR2I1wpUthCyhRSt9n6qE,3402
5
5
  examples/lol/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  examples/lol/data/subsample_lol_data.parquet,sha256=tl04XDslylECJUV1e0DGeqMb6D0Uh6_48NO6TykdgQI,343549
7
7
  examples/lol/data/utils.py,sha256=Lt3XNNa5cavvFXHaTQ-GOPxSuWmPEfEO0CVXQEyF_s0,486
8
8
  examples/nba/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- examples/nba/cross_validation_example.py,sha256=WD_52vO9m1rILVfXXf8uIb_odpaK-TZ4iOewHU19lTg,5281
10
- examples/nba/feature_engineering_example.py,sha256=0OHJ2w6vkHvFB2bYwIQQb8HjFA5bfXc7tLmngnahG74,7708
11
- examples/nba/game_winner_example.py,sha256=RNKYSwpArr08yDWOtkxjx7eAldf97WYDBBfb3tsVSZc,2975
12
- examples/nba/predictor_transformers_example.py,sha256=mPXRVPx4J5VZtxYH89k7pwh7_EGZ0CXoNHeh2s0AOp8,8499
9
+ examples/nba/cross_validation_example.py,sha256=XVnQJ5mqMou9z83ML5J0wS3gk-pa56sdvahJYQgZ8os,5056
10
+ examples/nba/feature_engineering_example.py,sha256=BDd5594Yi_56lGDqz3SYQkwT8NVZyFkgv3gKPCsAjz4,8197
11
+ examples/nba/game_winner_example.py,sha256=7VVHxGyU2uPjT9q6lDMHJ5KpkWp9gU8brxr_UZfuSHg,3189
12
+ examples/nba/predictor_transformers_example.py,sha256=Fl4BY_hVW0iYERolN6s-ZB2xv-UxOK547L6iI5t0r0Y,8807
13
13
  examples/nba/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  examples/nba/data/game_player_subsample.parquet,sha256=ODJxHC-mUYbJ7r-ScUFtPU7hrFuxLUbbDSobmpCkw0w,279161
15
15
  examples/nba/data/utils.py,sha256=41hxLQ1d6ZgBEcHa5MI0-fG5KbsRi07cclMPQZM95ek,509
@@ -44,17 +44,18 @@ spforge/feature_generator/_rolling_mean_days.py,sha256=EZQmFmYVQB-JjZV5k8bOWnaTx
44
44
  spforge/feature_generator/_rolling_window.py,sha256=HT8LezsRIPNAlMEoP9oTPW2bKFu55ZSRnQZGST7fncw,8836
45
45
  spforge/feature_generator/_utils.py,sha256=KDn33ia1OYJTK8THFpvc_uRiH_Bl3fImGqqbfzs0YA4,9654
46
46
  spforge/hyperparameter_tuning/__init__.py,sha256=N2sKG4SvG41hlsFT2kx_DQYMmXsQr-8031Tu_rxlxyY,1015
47
- spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=entdE7gtj8JM5C47-lLd93CoEsXjw8YfcWeWS8d0AZk,6882
47
+ spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=Sm5IrHAW0-vRC8jqCPX0pDi_C-W3L_MoEKGA8bx1Zbc,7546
48
48
  spforge/hyperparameter_tuning/_tuner.py,sha256=uovhGqhe8-fdhi79aErUmE2h5NCycFQEIRv5WCjpC7E,16732
49
49
  spforge/performance_transformers/__init__.py,sha256=U6d7_kltbUMLYCGBk4QAFVPJTxXD3etD9qUftV-O3q4,422
50
50
  spforge/performance_transformers/_performance_manager.py,sha256=KwAga6dGhNkXi-MDW6LPjwk6VZwCcjo5L--jnk9aio8,9706
51
51
  spforge/performance_transformers/_performances_transformers.py,sha256=0lxuWjAfWBRXRgQsNJHjw3P-nlTtHBu4_bOVdoy7hq4,15536
52
- spforge/ratings/__init__.py,sha256=jAa_xF2e-96FoyD57EYFKE-mO6OnK23siJOB4tzbyek,387
52
+ spforge/ratings/__init__.py,sha256=OZVH2Lo6END3n1X8qi4QcyAPlThIwAYwVKCiIuOQSQU,576
53
53
  spforge/ratings/_base.py,sha256=dRMkIGj5-2zKddygaEA4g16WCyXon7v8Xa1ymm7IuoM,14335
54
- spforge/ratings/_player_rating.py,sha256=05CuiSa2_uM0xtYpxT00OOxU_TmW4qt6dsXvn7seFss,50861
54
+ spforge/ratings/_player_rating.py,sha256=MyqsyLSY6d7_bxDSnF8eWOyXpSCADWGdepdFSGM4cHw,51365
55
55
  spforge/ratings/_team_rating.py,sha256=T0kFiv3ykYSrVGGsVRa8ZxLB0WMnagxqdFDzl9yZ_9g,24813
56
56
  spforge/ratings/enums.py,sha256=s7z_RcZS6Nlgfa_6tasO8_IABZJwywexe7sep9DJBgo,1739
57
57
  spforge/ratings/league_identifier.py,sha256=_KDUKOwoNU6RNFKE5jju4eYFGVNGBdJsv5mhNvMakfc,6019
58
+ spforge/ratings/league_start_rating_optimizer.py,sha256=Q4Vo3QT-r55qP4aD9WftsTB00UOSRvxM1khlyuAGWNM,8582
58
59
  spforge/ratings/player_performance_predictor.py,sha256=cMxzQuk0nF1MsT_M32g-3mxVdAEbZ-S7TUjEPYdo3Yg,8361
59
60
  spforge/ratings/start_rating_generator.py,sha256=_7hIJ9KRVCwsCoY1GIzY8cuOdHR8RH_BCMeMwQG3E04,6776
60
61
  spforge/ratings/team_performance_predictor.py,sha256=ThQOmYQUqKBB46ONYHOMM2arXFH8AkyKpAZzs80SjHA,7217
@@ -70,16 +71,17 @@ spforge/transformers/_other_transformer.py,sha256=xLfaFIhkFsigAoitB4x3F8An2j9ymd
70
71
  spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
71
72
  spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
72
73
  spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
73
- spforge-0.8.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
74
+ spforge-0.8.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
74
75
  tests/test_autopipeline.py,sha256=WXHeqBdjQD6xaXVkzvS8ocz0WVP9R7lN0PiHJ2iD8nA,16911
75
76
  tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
76
77
  tests/test_feature_generator_pipeline.py,sha256=CAgBknWqawqYi5_hxcPmpxrLVa5elMHVv1VrSVRKXEA,17705
77
78
  tests/cross_validator/test_cross_validator.py,sha256=itCGhNY8-NbDbKbhxHW20wiLuRst7-Rixpmi3FSKQtA,17474
78
79
  tests/distributions/test_distribution.py,sha256=aU8hfCgliM80TES4WGjs9KFXpV8XghBGF7Hu9sqEVSE,10982
79
80
  tests/end_to_end/test_estimator_hyperparameter_tuning.py,sha256=fZCJ9rrED2vT68B9ovmVA1cIG2pHRTjy9xzZLxxpEBo,2513
81
+ tests/end_to_end/test_league_start_rating_optimizer.py,sha256=Mmct2ixp4c6L7PGym8wZc7E-Csozryt1g4_o6OCc1uI,3141
80
82
  tests/end_to_end/test_lol_player_kills.py,sha256=RJSYUbPrZ-RzSxGggj03yN0JKYeTB1JghVGYFMYia3Y,11891
81
83
  tests/end_to_end/test_nba_player_points.py,sha256=kyzjo7QIcvpteps29Wix6IS_eJG9d1gHLeWtIHpkWMs,9066
82
- tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py,sha256=eOsTSVWv16bc0l_nCxH4x8jF-gsmn4Ttfv92mHqSXzc,6303
84
+ tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py,sha256=LXRkI_6Ho2kzJVbNAM17QFhx_MP9WdDJXCO9dWgJGNA,6491
83
85
  tests/end_to_end/test_nba_prediction_consistency.py,sha256=o3DckJasx_I1ed6MhMYZUo2WSDvQ_p3HtJa9DCWTIYU,9857
84
86
  tests/estimator/test_sklearn_estimator.py,sha256=tVfOP9Wx-tV1b6DcHbGxQHZQzNPA0Iobq8jTcUrk59U,48668
85
87
  tests/feature_generator/test_lag.py,sha256=5Ffrv0V9cwkbkzRMPBe3_c_YNW-W2al-XH_acQIvdeg,19531
@@ -92,10 +94,10 @@ tests/hyperparameter_tuning/test_estimator_tuner.py,sha256=iewME41d6LR2aQ0OtohGF
92
94
  tests/hyperparameter_tuning/test_rating_tuner.py,sha256=PyCFP3KPc4Iy9E_X9stCVxra14uMgC1tuRwuQ30rO_o,13195
93
95
  tests/performance_transformers/test_performance_manager.py,sha256=bfC5GiBuzHw-mLmKeEzBUUPuKm0ayax2bsF1j88W8L0,10120
94
96
  tests/performance_transformers/test_performances_transformers.py,sha256=A-tGiCx7kXrj1cVj03Bc7prOeZ1_Ryz8YFx9uj3eK6w,11064
95
- tests/ratings/test_player_rating_generator.py,sha256=3mjqlX159QqOlBoY3r_TFkvLwpE4zlLE0fiqpbfk3ps,58547
97
+ tests/ratings/test_player_rating_generator.py,sha256=FGH3Tq0uFoSlkS_XMldsUKhsovBRBvzH9EbqjKvg2O0,59601
96
98
  tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561
97
99
  tests/ratings/test_team_rating_generator.py,sha256=cDnf1zHiYC7pkgydE3MYr8wSTJIq-bPfSqhIRI_4Tic,95357
98
- tests/scorer/test_score.py,sha256=KTrGJypQEpU8tmgJ6LU8wK1SRC3PLUXFzZIyiA-UY7U,71749
100
+ tests/scorer/test_score.py,sha256=_Vd6tKpy_1GeOxU7Omxci4CFf7PvRGMefEI0gv2gV6A,74688
99
101
  tests/scorer/test_score_aggregation_granularity.py,sha256=h-hyFOLzwp-92hYVU7CwvlRJ8jhB4DzXCtqgI-zcoqM,13677
100
102
  tests/transformers/test_estimator_transformer_context.py,sha256=5GOHbuWCWBMFwwOTJOuD4oNDsv-qDR0OxNZYGGuMdag,1819
101
103
  tests/transformers/test_net_over_predicted.py,sha256=vh7O1iRRPf4vcW9aLhOMAOyatfM5ZnLsQBKNAYsR3SU,3363
@@ -103,7 +105,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
103
105
  tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
104
106
  tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
105
107
  tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
106
- spforge-0.8.5.dist-info/METADATA,sha256=bqArRdOKZYvSc47sa9cJsOhsDxh0q4T6GoF_xIBkjpA,20226
107
- spforge-0.8.5.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
108
- spforge-0.8.5.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
109
- spforge-0.8.5.dist-info/RECORD,,
108
+ spforge-0.8.7.dist-info/METADATA,sha256=7vwprmmFvSpEL3lC0HqFZPbzxMi8mRzI0yOsa7pUlNQ,20047
109
+ spforge-0.8.7.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
110
+ spforge-0.8.7.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
111
+ spforge-0.8.7.dist-info/RECORD,,
@@ -0,0 +1,117 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+ import pytest
4
+
5
+ from spforge import ColumnNames
6
+ from spforge.ratings import (
7
+ LeagueStartRatingOptimizer,
8
+ PlayerRatingGenerator,
9
+ TeamRatingGenerator,
10
+ )
11
+
12
+
13
+ def _player_df():
14
+ dates = pd.date_range("2024-01-01", periods=3, freq="D")
15
+ rows = []
16
+ for i, date in enumerate(dates):
17
+ mid = f"M{i}"
18
+ for player_idx in range(2):
19
+ rows.append(
20
+ {
21
+ "pid": f"A{player_idx}",
22
+ "tid": "TA",
23
+ "mid": mid,
24
+ "date": date,
25
+ "league": "LCK",
26
+ "perf": 0.4,
27
+ }
28
+ )
29
+ for player_idx in range(2):
30
+ rows.append(
31
+ {
32
+ "pid": f"B{player_idx}",
33
+ "tid": "TB",
34
+ "mid": mid,
35
+ "date": date,
36
+ "league": "LEC",
37
+ "perf": 0.6,
38
+ }
39
+ )
40
+ return pd.DataFrame(rows)
41
+
42
+
43
+ def _team_df():
44
+ dates = pd.date_range("2024-01-01", periods=3, freq="D")
45
+ rows = []
46
+ for i, date in enumerate(dates):
47
+ mid = f"M{i}"
48
+ rows.extend(
49
+ [
50
+ {
51
+ "tid": "TA",
52
+ "mid": mid,
53
+ "date": date,
54
+ "league": "LCK",
55
+ "perf": 0.4,
56
+ },
57
+ {
58
+ "tid": "TB",
59
+ "mid": mid,
60
+ "date": date,
61
+ "league": "LEC",
62
+ "perf": 0.6,
63
+ },
64
+ ]
65
+ )
66
+ return pd.DataFrame(rows)
67
+
68
+
69
@pytest.mark.parametrize("use_polars", [False, True])
def test_league_start_rating_optimizer__adjusts_player_leagues(use_polars):
    """One optimizer iteration on player data moves LCK below and LEC above 1000."""
    frame = _player_df()
    if use_polars:
        frame = pl.from_pandas(frame)

    columns = ColumnNames(
        player_id="pid",
        team_id="tid",
        match_id="mid",
        start_date="date",
        league="league",
    )
    optimizer = LeagueStartRatingOptimizer(
        rating_generator=PlayerRatingGenerator(performance_column="perf", column_names=columns),
        n_iterations=1,
        learning_rate=0.5,
        min_cross_region_rows=1,
    )

    outcome = optimizer.optimize(frame)

    # In the fixture LCK always records perf 0.4 and LEC records 0.6.
    assert outcome.league_ratings["LCK"] < 1000
    assert outcome.league_ratings["LEC"] > 1000
93
+
94
+
95
@pytest.mark.parametrize("use_polars", [False, True])
def test_league_start_rating_optimizer__adjusts_team_leagues(use_polars):
    """One optimizer iteration on team data moves LCK below and LEC above 1000."""
    frame = _team_df()
    if use_polars:
        frame = pl.from_pandas(frame)

    columns = ColumnNames(
        team_id="tid",
        match_id="mid",
        start_date="date",
        league="league",
    )
    optimizer = LeagueStartRatingOptimizer(
        rating_generator=TeamRatingGenerator(performance_column="perf", column_names=columns),
        n_iterations=1,
        learning_rate=0.5,
        min_cross_region_rows=1,
    )

    outcome = optimizer.optimize(frame)

    # In the fixture LCK always records perf 0.4 and LEC records 0.6.
    assert outcome.league_ratings["LCK"] < 1000
    assert outcome.league_ratings["LEC"] > 1000
@@ -97,6 +97,11 @@ def test_nba_player_ratings_hyperparameter_tuning__workflow_completes(
97
97
  "confidence_max_sum",
98
98
  "use_off_def_split",
99
99
  "performance_predictor",
100
+ "start_team_weight",
101
+ "start_league_quantile",
102
+ "start_min_count_for_percentiles",
103
+ "start_min_match_count_team_rating",
104
+ "start_team_rating_subtract",
100
105
  }
101
106
  assert set(result.best_params.keys()) == expected_params
102
107
 
@@ -1662,3 +1662,30 @@ def test_player_rating_team_with_strong_offense_and_weak_defense_gets_expected_r
1662
1662
 
1663
1663
  assert a_off > start_rating
1664
1664
  assert a_def < start_rating
1665
+
1666
+
1667
def test_fit_transform__player_rating_difference_from_team_projected_feature(base_cn, sample_df):
    """The difference feature equals player_off_rating minus team_off_rating_projected on every row."""
    requested_features = [
        RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED,
        RatingKnownFeatures.PLAYER_OFF_RATING,
        RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED,
    ]
    generator = PlayerRatingGenerator(
        performance_column="perf",
        column_names=base_cn,
        auto_scale_performance=True,
        features_out=requested_features,
    )
    transformed = generator.fit_transform(sample_df)

    diff_col = "player_rating_difference_from_team_projected_perf"
    player_col = "player_off_rating_perf"
    team_col = "team_off_rating_projected_perf"

    # All three requested features must be present in the output.
    for column in (diff_col, player_col, team_col):
        assert column in transformed.columns

    for record in transformed.iter_rows(named=True):
        assert record[diff_col] == pytest.approx(record[player_col] - record[team_col], rel=1e-9)
@@ -2048,3 +2048,93 @@ def test_all_scorers_handle_all_nan_targets(df_type):
2048
2048
  assert np.isnan(score) or score == 0.0
2049
2049
  except (ValueError, IndexError):
2050
2050
  pass
2051
# Each case pairs a scorer factory with a frame factory. Rows flagged
# is_validation == 0 must be ignored by the scorer, so every frame is built so
# that including those rows would change the score.
SCORER_VALIDATION_CASES = [
    pytest.param(
        lambda: MeanBiasScorer(pred_column="pred", target="target", validation_column="is_validation"),
        lambda: pd.DataFrame(
            {
                "pred": [2.0, 0.0],
                "target": [1.0, 2.0],
                "is_validation": [1, 0],
            }
        ),
        id="mean_bias",
    ),
    pytest.param(
        lambda: PWMSE(pred_column="pred", target="target", labels=[0, 1], validation_column="is_validation"),
        lambda: pd.DataFrame(
            {
                "pred": [[0.7, 0.3], [0.4, 0.6]],
                "target": [0, 1],
                "is_validation": [1, 0],
            }
        ),
        id="pwmse",
    ),
    pytest.param(
        lambda: SklearnScorer(
            scorer_function=mean_absolute_error,
            pred_column="pred",
            target="target",
            validation_column="is_validation",
        ),
        lambda: pd.DataFrame(
            {
                # The non-validation row must carry a non-zero error. With
                # pred == target on both rows the MAE is 0.0 whether or not
                # the scorer filters, and the case could never fail.
                "pred": [1.0, 0.0],
                "target": [1.0, 2.0],
                "is_validation": [1, 0],
            }
        ),
        id="sklearn",
    ),
    pytest.param(
        lambda: ProbabilisticMeanBias(
            pred_column="pred", target="target", class_column_name="classes", validation_column="is_validation"
        ),
        lambda: pd.DataFrame(
            {
                "pred": [[0.2, 0.8], [0.6, 0.4]],
                "target": [1, 0],
                "classes": [[0, 1], [0, 1]],
                "is_validation": [1, 0],
            }
        ),
        id="probabilistic_mean_bias",
    ),
    pytest.param(
        lambda: OrdinalLossScorer(pred_column="pred", target="target", classes=[0, 1], validation_column="is_validation"),
        lambda: pd.DataFrame(
            {
                "pred": [[0.2, 0.8], [0.6, 0.4]],
                "target": [1, 0],
                "is_validation": [1, 0],
            }
        ),
        id="ordinal_loss",
    ),
    pytest.param(
        lambda: ThresholdEventScorer(
            dist_column="dist",
            threshold_column="threshold",
            outcome_column="outcome",
            comparator=Operator.GREATER_THAN_OR_EQUALS,
            validation_column="is_validation",
        ),
        lambda: pd.DataFrame(
            {
                "dist": [[0.2, 0.8], [0.6, 0.4], [0.3, 0.7]],
                "threshold": [0.5, 0.2, 0.3],
                "outcome": [1, 0, 1],
                "is_validation": [1, 1, 0],
            }
        ),
        id="threshold_event",
    ),
]
2131
+
2132
+
2133
@pytest.mark.parametrize("scorer_factory, df_factory", SCORER_VALIDATION_CASES)
def test_scorers_respect_validation_column(scorer_factory, df_factory):
    """Scoring the full frame must equal scoring only its is_validation == 1 rows."""
    full_frame = df_factory()
    validation_only = full_frame[full_frame["is_validation"] == 1]
    # A fresh scorer is built for each call so no state is shared between them.
    full_score = scorer_factory().score(full_frame)
    validation_score = scorer_factory().score(validation_only)
    assert full_score == validation_score