spforge 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. examples/__init__.py +2 -0
  2. examples/lol/__init__.py +0 -0
  3. examples/lol/data/__init__.py +0 -0
  4. examples/lol/data/subsample_lol_data.parquet +0 -0
  5. examples/lol/data/utils.py +20 -0
  6. examples/lol/pipeline_transformer_example.py +158 -0
  7. examples/nba/__init__.py +0 -0
  8. examples/nba/cross_validation_example.py +131 -0
  9. examples/nba/data/__init__.py +0 -0
  10. examples/nba/data/game_player_subsample.parquet +0 -0
  11. examples/nba/data/utils.py +19 -0
  12. examples/nba/game_winner_example.py +94 -0
  13. spforge/__init__.py +3 -0
  14. spforge/cross_validator/__init__.py +2 -0
  15. spforge/cross_validator/_base.py +47 -0
  16. spforge/cross_validator/cross_validator.py +203 -0
  17. spforge/data_structures.py +195 -0
  18. spforge/pipeline.py +293 -0
  19. spforge/pipeline_factory.py +87 -0
  20. spforge/pipeline_transformer.py +129 -0
  21. spforge/predictor/__init__.py +8 -0
  22. spforge/predictor/_base.py +239 -0
  23. spforge/predictor/classifier.py +196 -0
  24. spforge/predictor/predictor.py +586 -0
  25. spforge/predictor/sklearn_estimator.py +77 -0
  26. spforge/predictor_transformer/__init__.py +5 -0
  27. spforge/predictor_transformer/_simple_transformer.py +120 -0
  28. spforge/predictor_transformer/transformer.py +111 -0
  29. spforge/ratings/__init__.py +15 -0
  30. spforge/ratings/enums.py +43 -0
  31. spforge/ratings/league_identifier.py +108 -0
  32. spforge/ratings/match_generator.py +399 -0
  33. spforge/ratings/performance_generator/__init__.py +6 -0
  34. spforge/ratings/performance_generator/_performances_generator.py +250 -0
  35. spforge/ratings/rating_calculators/__init__.py +6 -0
  36. spforge/ratings/rating_calculators/match_rating_generator.py +526 -0
  37. spforge/ratings/rating_calculators/performance_predictor.py +356 -0
  38. spforge/ratings/rating_calculators/start_rating_generator.py +199 -0
  39. spforge/ratings/rating_generator.py +158 -0
  40. spforge/ratings/update_rating_generator.py +908 -0
  41. spforge/scorer/__init__.py +1 -0
  42. spforge/scorer/_score.py +493 -0
  43. spforge/transformation_pipeline.py +0 -0
  44. spforge/transformers/__init__.py +12 -0
  45. spforge/transformers/_lag.py +225 -0
  46. spforge/transformers/_rolling_mean.py +237 -0
  47. spforge/transformers/_rolling_mean_binary.py +272 -0
  48. spforge/transformers/_rolling_mean_days.py +222 -0
  49. spforge/transformers/base_transformer.py +513 -0
  50. spforge/transformers/performances_transformers.py +525 -0
  51. spforge/transformers/transformers.py +332 -0
  52. spforge/tuner/__init__.py +5 -0
  53. spforge/tuner/base_tuner.py +13 -0
  54. spforge/tuner/performances_generator_tuner.py +208 -0
  55. spforge/tuner/pipeline_tuner.py +306 -0
  56. spforge/tuner/predictor_tuner.py +253 -0
  57. spforge/tuner/rating_generator_tuner.py +403 -0
  58. spforge/tuner/start_rating_optimizer.py +287 -0
  59. spforge/tuner/utils.py +171 -0
  60. spforge/utils.py +70 -0
  61. spforge-0.0.0.dist-info/METADATA +155 -0
  62. spforge-0.0.0.dist-info/RECORD +65 -0
  63. spforge-0.0.0.dist-info/WHEEL +5 -0
  64. spforge-0.0.0.dist-info/licenses/LICENSE +201 -0
  65. spforge-0.0.0.dist-info/top_level.txt +2 -0
examples/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .nba.data.utils import get_sub_sample_nba_data
2
+ from .lol.data.utils import get_sub_sample_lol_data
File without changes
File without changes
@@ -0,0 +1,20 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ import polars as pl
6
+ import pandas as pd
7
+
8
+
9
def get_sub_sample_lol_data(
    as_pandas: bool = True, as_polars: bool = False
) -> Union[pd.DataFrame, "pl.DataFrame"]:
    """Load the bundled League of Legends sub-sample parquet file.

    The file ``subsample_lol_data.parquet`` is read from the directory that
    contains this module.

    Args:
        as_pandas: Return a pandas DataFrame (the default).
        as_polars: Return a polars DataFrame. Takes precedence over
            ``as_pandas`` when both flags are true.

    Returns:
        The sample data as a polars DataFrame when ``as_polars`` is true,
        otherwise as a pandas DataFrame.

    Raises:
        ValueError: If both ``as_pandas`` and ``as_polars`` are false.
    """
    if not (as_polars or as_pandas):
        raise ValueError("Must specify either as_pandas or as_polars")

    file_path = os.path.join(Path(__file__).parent, "subsample_lol_data.parquet")
    # polars is checked first so that as_polars=True wins over the
    # as_pandas=True default.
    if as_polars:
        return pl.read_parquet(file_path)
    return pd.read_parquet(file_path)
@@ -0,0 +1,158 @@
1
+ from lightgbm import LGBMRegressor
2
+ from sklearn.linear_model import LogisticRegression
3
+
4
+ from examples import get_sub_sample_lol_data
5
+ from spforge import ColumnNames
6
+ from spforge.cross_validator import MatchKFoldCrossValidator
7
+ from spforge.pipeline_transformer import PipelineTransformer
8
+ from spforge.predictor import (
9
+ GameTeamPredictor,
10
+ SklearnPredictor,
11
+ )
12
+ from spforge.predictor.classifier import NegativeBinomialPredictor
13
+ from spforge.ratings import (
14
+ UpdateRatingGenerator,
15
+ RatingKnownFeatures,
16
+ )
17
+ from spforge.ratings.performance_generator import (
18
+ PerformancesGenerator,
19
+ Performance,
20
+ ColumnWeight,
21
+ )
22
+ from spforge.transformers import LagTransformer
23
+ from spforge.transformers import (
24
+ RollingMeanTransformer,
25
+ )
26
+
27
+ column_names = ColumnNames(
28
+ team_id="teamname",
29
+ match_id="gameid",
30
+ start_date="date",
31
+ player_id="playername",
32
+ league="league",
33
+ position="position",
34
+ )
35
+ df = get_sub_sample_lol_data(as_pandas=True)
36
+ df = (
37
+ df.loc[lambda x: x.position != "team"]
38
+ .assign(team_count=df.groupby("gameid")["teamname"].transform("nunique"))
39
+ .loc[lambda x: x.team_count == 2]
40
+ .assign(
41
+ player_count=df.groupby(["gameid", "teamname"])["playername"].transform(
42
+ "nunique"
43
+ )
44
+ )
45
+ .loc[lambda x: x.player_count == 5]
46
+ )
47
+ df = df.assign(team_count=df.groupby("gameid")["teamname"].transform("nunique")).loc[
48
+ lambda x: x.team_count == 2
49
+ ]
50
+
51
+ df = df.drop_duplicates(subset=["gameid", "playername"])
52
+
53
+ # Pretends the last 10 games are future games. The model will be trained on everything before that.
54
+ most_recent_10_games = df[column_names.match_id].unique()[-10:]
55
+ historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
56
+ future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(
57
+ columns=["result"]
58
+ )
59
+
60
+ rating_generator_result = UpdateRatingGenerator(
61
+ features_out=[RatingKnownFeatures.RATING_DIFFERENCE_PROJECTED],
62
+ performance_column="result",
63
+ )
64
+
65
+ rating_generator_player_kills = UpdateRatingGenerator(
66
+ features_out=[RatingKnownFeatures.RATING_MEAN_PROJECTED],
67
+ performances_generator=PerformancesGenerator(
68
+ performances=Performance(
69
+ name="performance_kills",
70
+ weights=[
71
+ ColumnWeight(name="kills", weight=1),
72
+ ],
73
+ ),
74
+ ),
75
+ )
76
+
77
+ lag_generators = [
78
+ LagTransformer(
79
+ features=["kills", "deaths", "result"], lag_length=3, granularity=["playername"]
80
+ ),
81
+ RollingMeanTransformer(
82
+ features=["kills", "deaths", "result"],
83
+ window=20,
84
+ min_periods=1,
85
+ granularity=["playername"],
86
+ ),
87
+ ]
88
+
89
+ transformer = PipelineTransformer(
90
+ column_names=column_names,
91
+ rating_generators=[rating_generator_result, rating_generator_player_kills],
92
+ lag_transformers=lag_generators,
93
+ )
94
+
95
+ historical_df = transformer.fit_transform(historical_df)
96
+
97
+ game_winner_predictor = GameTeamPredictor(
98
+ predictor=SklearnPredictor(
99
+ estimator=LogisticRegression(),
100
+ target="result",
101
+ features=[RatingKnownFeatures.RATING_DIFFERENCE_PROJECTED],
102
+ ),
103
+ one_hot_encode_cat_features=True,
104
+ impute_missing_values=True,
105
+ game_id_colum=column_names.match_id,
106
+ team_id_column=column_names.team_id,
107
+ )
108
+
109
+ player_kills_predictor = SklearnPredictor(
110
+ estimator=LGBMRegressor(verbose=-100),
111
+ target="kills",
112
+ features=[game_winner_predictor.pred_column],
113
+ features_contain_str=["rolling_mean_kills", "lag_kills"],
114
+ )
115
+
116
+ cross_validator_game_winner = MatchKFoldCrossValidator(
117
+ date_column_name=column_names.start_date,
118
+ match_id_column_name=column_names.match_id,
119
+ predictor=game_winner_predictor,
120
+ )
121
+
122
+ game_winner_predictor.train(historical_df)
123
+ historical_df = cross_validator_game_winner.generate_validation_df(
124
+ historical_df, column_names
125
+ )
126
+
127
+ cross_validator_player_kills = MatchKFoldCrossValidator(
128
+ date_column_name=column_names.start_date,
129
+ match_id_column_name=column_names.match_id,
130
+ predictor=player_kills_predictor,
131
+ )
132
+
133
+ player_kills_predictor.train(historical_df)
134
+ print(player_kills_predictor.features)
135
+ historical_df = cross_validator_player_kills.generate_validation_df(
136
+ historical_df, column_names
137
+ )
138
+
139
+ future_df = transformer.transform(future_df)
140
+ future_df = game_winner_predictor.predict(future_df)
141
+ future_df = player_kills_predictor.predict(future_df)
142
+
143
+ probability_predictor = NegativeBinomialPredictor(
144
+ target="kills",
145
+ point_estimate_pred_column=player_kills_predictor.pred_column,
146
+ relative_error_predictor=SklearnPredictor(
147
+ estimator=LGBMRegressor(),
148
+ target=None,
149
+ features=["position"],
150
+ convert_cat_features_to_cat_dtype=True,
151
+ ),
152
+ max_value=15,
153
+ )
154
+
155
+ probability_predictor.train(historical_df)
156
+ future_df = probability_predictor.predict(future_df)
157
+
158
+ print(future_df.head(10))
File without changes
@@ -0,0 +1,131 @@
1
+ import polars as pl
2
+ from sklearn.metrics import mean_absolute_error
3
+ from lightgbm import LGBMRegressor, LGBMClassifier
4
+
5
+ from examples import get_sub_sample_nba_data
6
+ from spforge.cross_validator import MatchKFoldCrossValidator
7
+
8
+ from spforge.pipeline import Pipeline
9
+ from spforge.predictor import SklearnPredictor
10
+
11
+ from spforge.data_structures import ColumnNames
12
+ from spforge.predictor.classifier import NegativeBinomialPredictor
13
+ from spforge.predictor.predictor import DistributionPredictor
14
+ from spforge.scorer import SklearnScorer, OrdinalLossScorer
15
+ from spforge.scorer import Filter, Operator
16
+ from spforge.transformers import (
17
+ RollingMeanTransformer, LagTransformer,
18
+ )
19
+
20
+ df = get_sub_sample_nba_data(as_polars=True, as_pandas=False)
21
+ # df = df.filter(pl.col('minutes')>0)
22
+ column_names = ColumnNames(
23
+ team_id="team_id",
24
+ match_id="game_id",
25
+ start_date="start_date",
26
+ player_id="player_name",
27
+ )
28
+ df = df.sort(
29
+ [
30
+ column_names.start_date,
31
+ column_names.match_id,
32
+ column_names.team_id,
33
+ column_names.player_id,
34
+ ]
35
+ )
36
+
37
+ df = df.with_columns(pl.col("points").clip(0, 40).alias("points"))
38
+
39
+ predictor = DistributionPredictor(
40
+ point_predictor=SklearnPredictor(
41
+ estimator=LGBMRegressor(verbose=-100, random_state=42),
42
+ features=["location"],
43
+ target="points",
44
+ convert_cat_features_to_cat_dtype=True,
45
+ pred_column="points_estimate",
46
+ ),
47
+ distribution_predictor=NegativeBinomialPredictor(
48
+ max_value=40, target="points", point_estimate_pred_column="points_estimate"
49
+ ),
50
+ )
51
+
52
+ pipeline = Pipeline(
53
+ lag_transformers=[
54
+ RollingMeanTransformer(features=["points"], window=15, granularity=["player_id"]),
55
+ LagTransformer(features=['points'],lag_length=3, granularity=['player_id'])
56
+ ],
57
+ predictor=predictor,
58
+ column_names=column_names,
59
+ )
60
+
61
+ cross_validator = MatchKFoldCrossValidator(
62
+ date_column_name=column_names.start_date,
63
+ match_id_column_name=column_names.match_id,
64
+ predictor=pipeline,
65
+ )
66
+ validation_df = cross_validator.generate_validation_df(
67
+ df=df, column_names=column_names, return_features=True
68
+ )
69
+
70
+ mean_absolute_scorer = SklearnScorer(
71
+ pred_column=predictor.point_predictor.pred_column,
72
+ target=predictor.target,
73
+ scorer_function=mean_absolute_error,
74
+ validation_column="is_validation",
75
+ filters=[Filter(column_name="minutes", value=0, operator=Operator.GREATER_THAN)],
76
+ )
77
+
78
+ mae_score = cross_validator.cross_validation_score(
79
+ validation_df=validation_df, scorer=mean_absolute_scorer
80
+ )
81
+ print(f"MAE {mae_score}")
82
+
83
+ ordinal_scorer = OrdinalLossScorer(
84
+ pred_column=predictor.pred_column,
85
+ target=predictor.target,
86
+ validation_column="is_validation",
87
+ filters=[Filter(column_name="minutes", value=0, operator=Operator.GREATER_THAN)],
88
+ )
89
+
90
+ ordinal_loss_score = cross_validator.cross_validation_score(
91
+ validation_df=validation_df, scorer=ordinal_scorer
92
+ )
93
+ print(f"Ordinal Loss {ordinal_loss_score}")
94
+
95
+ lgbm_classifier_predictor = SklearnPredictor(
96
+ estimator=LGBMClassifier(verbose=-100, random_state=42, max_depth=2),
97
+ features=[
98
+ *pipeline.lag_transformers[0].features_out,
99
+ "location",
100
+ predictor.point_predictor.pred_column,
101
+ ],
102
+ target=predictor.target,
103
+ pred_column="lgbm_classifier_point_estimate",
104
+ convert_cat_features_to_cat_dtype=True,
105
+ multiclass_output_as_struct=True,
106
+ )
107
+
108
+ lgbm_classifier_cross_validator = MatchKFoldCrossValidator(
109
+ date_column_name=column_names.start_date,
110
+ match_id_column_name=column_names.match_id,
111
+ predictor=lgbm_classifier_predictor,
112
+ )
113
+
114
+ validation_df = lgbm_classifier_cross_validator.generate_validation_df(
115
+ df=validation_df, column_names=column_names
116
+ )
117
+
118
+ ordinal_scorer_lgbm_classifier = OrdinalLossScorer(
119
+ pred_column=lgbm_classifier_predictor.pred_column,
120
+ target=predictor.target,
121
+ validation_column="is_validation",
122
+ filters=[Filter(column_name="minutes", value=0, operator=Operator.GREATER_THAN)],
123
+ )
124
+
125
+
126
+ lgbm_classifier_ordinal_loss_score = (
127
+ lgbm_classifier_cross_validator.cross_validation_score(
128
+ validation_df=validation_df, scorer=ordinal_scorer_lgbm_classifier
129
+ )
130
+ )
131
+ print(f"Ordinal Loss Lgbm Classifier {lgbm_classifier_ordinal_loss_score}")
File without changes
@@ -0,0 +1,19 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ import polars as pl
6
+ import pandas as pd
7
+
8
+
9
def get_sub_sample_nba_data(
    as_pandas: bool = True, as_polars: bool = False
) -> Union[pd.DataFrame, "pl.DataFrame"]:
    """Load the bundled NBA game-player sub-sample parquet file.

    The file ``game_player_subsample.parquet`` is read from the directory that
    contains this module.

    Args:
        as_pandas: Return a pandas DataFrame (the default).
        as_polars: Return a polars DataFrame. Takes precedence over
            ``as_pandas`` when both flags are true.

    Returns:
        The sample data as a polars DataFrame when ``as_polars`` is true,
        otherwise as a pandas DataFrame.

    Raises:
        ValueError: If both ``as_pandas`` and ``as_polars`` are false.
    """
    if not (as_polars or as_pandas):
        raise ValueError("Must specify either as_pandas or as_polars")

    file_path = os.path.join(Path(__file__).parent, "game_player_subsample.parquet")
    # polars is checked first so that as_polars=True wins over the
    # as_pandas=True default.
    if as_polars:
        return pl.read_parquet(file_path)
    return pd.read_parquet(file_path)
@@ -0,0 +1,94 @@
1
+ import pandas as pd
2
+ from sklearn.linear_model import LogisticRegression
3
+
4
+ from spforge.pipeline import Pipeline
5
+ from spforge.predictor import GameTeamPredictor, SklearnPredictor
6
+
7
+ from spforge.ratings import UpdateRatingGenerator
8
+
9
+ from spforge.data_structures import ColumnNames
10
+ from spforge.ratings.rating_calculators import MatchRatingGenerator
11
+
12
+ df = pd.read_parquet("data/game_player_subsample.parquet")
13
+
14
+ # Defines the column names as they appear in the dataframe
15
+ column_names = ColumnNames(
16
+ team_id="team_id",
17
+ match_id="game_id",
18
+ start_date="start_date",
19
+ player_id="player_name",
20
+ )
21
+ # Sorts the dataframe. The dataframe must always be sorted as below
22
+ df = df.sort_values(
23
+ by=[
24
+ column_names.start_date,
25
+ column_names.match_id,
26
+ column_names.team_id,
27
+ column_names.player_id,
28
+ ]
29
+ )
30
+
31
+ # Drops games that do not have exactly 2 teams
32
+ df = (
33
+ df.assign(
34
+ team_count=df.groupby(column_names.match_id)[column_names.team_id].transform(
35
+ "nunique"
36
+ )
37
+ )
38
+ .loc[lambda x: x.team_count == 2]
39
+ .drop(columns=["team_count"])
40
+ )
41
+
42
+ # Pretends the last 10 games are future games. The model will be trained on everything before that.
43
+ most_recent_10_games = df[column_names.match_id].unique()[-10:]
44
+ historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
45
+ future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(
46
+ columns=["won"]
47
+ )
48
+
49
+ # Defining a simple rating-generator. It will use the "won" column to update the ratings.
50
+ # In contrast to a typical Elo, ratings will follow players.
51
+
52
+ match_rating_generator = MatchRatingGenerator(rating_change_multiplier=30)
53
+
54
+ rating_generator = UpdateRatingGenerator(
55
+ performance_column="won", match_rating_generator=match_rating_generator
56
+ )
57
+
58
+ # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
59
+ # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
60
+ # It will also use the location of the game as a feature.
61
+ predictor = GameTeamPredictor(
62
+ game_id_colum=column_names.match_id,
63
+ team_id_column=column_names.team_id,
64
+ predictor=SklearnPredictor(
65
+ features=["location"], target="won", estimator=LogisticRegression()
66
+ ),
67
+ one_hot_encode_cat_features=True,
68
+ )
69
+
70
+ # Pipeline is where we define all the steps. Other transformations can take place as well.
71
+ # However, in our simple example we only have a simple rating-generator and a predictor.
72
+ pipeline = Pipeline(
73
+ rating_generators=rating_generator,
74
+ predictor=predictor,
75
+ column_names=column_names,
76
+ )
77
+
78
+ # Trains the model and returns historical predictions
79
+ pipeline.train(df=historical_df)
80
+
81
+ # Generates predictions for the future games
82
+ future_predictions = pipeline.predict(df=future_df)
83
+
84
+ # Grouping predictions from game-player level to game-level.
85
+ team_grouped_predictions = future_predictions.groupby(column_names.match_id).first()[
86
+ [
87
+ column_names.start_date,
88
+ column_names.team_id,
89
+ "team_id_opponent",
90
+ predictor.pred_column,
91
+ ]
92
+ ]
93
+
94
+ print(team_grouped_predictions)
spforge/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .data_structures import ColumnNames
2
+ from .pipeline import Pipeline
3
+ from .pipeline_factory import PipelineFactory
@@ -0,0 +1,2 @@
1
+ from ._base import CrossValidator
2
+ from .cross_validator import MatchKFoldCrossValidator
@@ -0,0 +1,47 @@
1
+ from abc import abstractmethod, ABC
2
+ from typing import Optional
3
+
4
+ from spforge import ColumnNames
5
+
6
+ from spforge.predictor._base import BasePredictor
7
+ from spforge.scorer import BaseScorer
8
+
9
+ from narwhals.typing import FrameT, IntoFrameT
10
+
11
+
12
class CrossValidator(ABC):
    """Abstract base class for cross-validators.

    Stores an optional default scorer, a minimum validation date and the
    predictor, and provides scoring of a validation dataframe. Subclasses
    implement ``generate_validation_df``.
    """

    def __init__(
        self,
        scorer: Optional[BaseScorer],
        min_validation_date: str,
        predictor: BasePredictor,
    ):
        """Store the configuration on the instance.

        Args:
            scorer: Default scorer used by ``cross_validation_score`` when no
                scorer is passed to that method. May be ``None``.
            min_validation_date: Stored as ``self.min_validation_date``; how it
                is used is up to the subclass.
            predictor: Stored as ``self.predictor``.
        """
        self.scorer = scorer
        self.min_validation_date = min_validation_date
        self.predictor = predictor

    @property
    def validation_column_name(self) -> str:
        """Name of the column that flags validation rows."""
        return "is_validation"

    @abstractmethod
    def generate_validation_df(
        self,
        df: FrameT,
        column_names: ColumnNames,
        return_features: bool = False,
        add_train_prediction: bool = False,
    ) -> IntoFrameT:
        """Produce the validation dataframe; must be implemented by subclasses."""
        pass

    def cross_validation_score(
        self, validation_df: FrameT, scorer: Optional[BaseScorer] = None
    ) -> float:
        """Score ``validation_df`` with the given or the default scorer.

        Args:
            validation_df: Dataframe passed to the scorer's ``score`` method.
            scorer: Scorer to use. Falls back to the scorer passed to the
                constructor when ``None``.

        Returns:
            The value returned by ``scorer.score(df=validation_df)``.

        Raises:
            ValueError: If neither the argument nor the constructor supplied a
                scorer.
        """
        # Compare against None explicitly: a scorer object that happens to be
        # falsy (e.g. defines __len__ or __bool__) must still be honoured.
        active_scorer = scorer if scorer is not None else self.scorer
        if active_scorer is None:
            raise ValueError(
                "scorer is not defined. Either pass into constructor or as argument to method"
            )

        return active_scorer.score(df=validation_df)