spforge 0.4.3__tar.gz → 0.8.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spforge might be problematic. Click here for more details.

Files changed (171) hide show
  1. spforge-0.8.35/MANIFEST.in +2 -0
  2. spforge-0.8.35/PKG-INFO +470 -0
  3. spforge-0.8.35/README.md +442 -0
  4. spforge-0.8.35/examples/__init__.py +2 -0
  5. spforge-0.8.35/examples/game_level_example.py +73 -0
  6. {spforge-0.4.3 → spforge-0.8.35}/examples/lol/data/utils.py +2 -5
  7. spforge-0.8.35/examples/lol/pipeline_transformer_example.py +106 -0
  8. spforge-0.8.35/examples/nba/cross_validation_example.py +137 -0
  9. {spforge-0.4.3 → spforge-0.8.35}/examples/nba/data/utils.py +2 -3
  10. spforge-0.8.35/examples/nba/feature_engineering_example.py +196 -0
  11. {spforge-0.4.3 → spforge-0.8.35}/examples/nba/game_winner_example.py +39 -39
  12. spforge-0.8.35/examples/nba/predictor_transformers_example.py +233 -0
  13. spforge-0.8.35/pyproject.toml +106 -0
  14. spforge-0.8.35/spforge/__init__.py +9 -0
  15. spforge-0.8.35/spforge/autopipeline.py +793 -0
  16. spforge-0.8.35/spforge/base_feature_generator.py +26 -0
  17. spforge-0.8.35/spforge/cross_validator/__init__.py +2 -0
  18. {spforge-0.4.3 → spforge-0.8.35}/spforge/cross_validator/_base.py +11 -11
  19. spforge-0.8.35/spforge/cross_validator/cross_validator.py +180 -0
  20. spforge-0.8.35/spforge/data_structures.py +286 -0
  21. spforge-0.8.35/spforge/distributions/__init__.py +9 -0
  22. spforge-0.4.3/spforge/predictor/_distribution.py → spforge-0.8.35/spforge/distributions/_negative_binomial_estimator.py +210 -171
  23. spforge-0.8.35/spforge/distributions/_normal_distribution_predictor.py +94 -0
  24. spforge-0.8.35/spforge/distributions/_student_t_distribution_estimator.py +289 -0
  25. spforge-0.8.35/spforge/estimator/__init__.py +8 -0
  26. spforge-0.8.35/spforge/estimator/_conditional_estimator.py +135 -0
  27. spforge-0.8.35/spforge/estimator/_frequency_bucketing_classifier.py +167 -0
  28. spforge-0.8.35/spforge/estimator/_granularity_estimator.py +101 -0
  29. spforge-0.8.35/spforge/estimator/_group_by_estimator.py +83 -0
  30. spforge-0.8.35/spforge/estimator/_ordinal_classifier.py +60 -0
  31. spforge-0.8.35/spforge/estimator/_sklearn_enhancer_estimator.py +131 -0
  32. spforge-0.8.35/spforge/feature_generator/__init__.py +10 -0
  33. {spforge-0.4.3/spforge/transformers/lag_transformers → spforge-0.8.35/spforge/feature_generator}/_base.py +96 -119
  34. {spforge-0.4.3/spforge/transformers/lag_transformers → spforge-0.8.35/spforge/feature_generator}/_lag.py +20 -26
  35. spforge-0.8.35/spforge/feature_generator/_net_over_predicted.py +57 -0
  36. spforge-0.8.35/spforge/feature_generator/_regressor_feature_generator.py +136 -0
  37. spforge-0.4.3/spforge/transformers/lag_transformers/_opponent_transformer.py → spforge-0.8.35/spforge/feature_generator/_rolling_against_opponent.py +50 -56
  38. {spforge-0.4.3/spforge/transformers/lag_transformers → spforge-0.8.35/spforge/feature_generator}/_rolling_mean_binary.py +92 -107
  39. {spforge-0.4.3/spforge/transformers/lag_transformers → spforge-0.8.35/spforge/feature_generator}/_rolling_mean_days.py +44 -69
  40. {spforge-0.4.3/spforge/transformers/lag_transformers → spforge-0.8.35/spforge/feature_generator}/_rolling_window.py +24 -26
  41. {spforge-0.4.3/spforge/transformers/lag_transformers → spforge-0.8.35/spforge/feature_generator}/_utils.py +106 -27
  42. spforge-0.8.35/spforge/features_generator_pipeline.py +209 -0
  43. spforge-0.8.35/spforge/hyperparameter_tuning/__init__.py +33 -0
  44. spforge-0.8.35/spforge/hyperparameter_tuning/_default_search_spaces.py +295 -0
  45. spforge-0.8.35/spforge/hyperparameter_tuning/_tuner.py +519 -0
  46. spforge-0.8.35/spforge/performance_transformers/__init__.py +11 -0
  47. spforge-0.8.35/spforge/performance_transformers/_performance_manager.py +327 -0
  48. spforge-0.4.3/spforge/transformers/fit_transformers/performances_transformers.py → spforge-0.8.35/spforge/performance_transformers/_performances_transformers.py +182 -196
  49. spforge-0.8.35/spforge/ratings/__init__.py +12 -0
  50. spforge-0.8.35/spforge/ratings/_base.py +348 -0
  51. spforge-0.8.35/spforge/ratings/_player_rating.py +1608 -0
  52. spforge-0.8.35/spforge/ratings/_team_rating.py +562 -0
  53. {spforge-0.4.3 → spforge-0.8.35}/spforge/ratings/enums.py +15 -16
  54. {spforge-0.4.3 → spforge-0.8.35}/spforge/ratings/league_identifier.py +68 -17
  55. spforge-0.8.35/spforge/ratings/league_start_rating_optimizer.py +201 -0
  56. spforge-0.4.3/spforge/ratings/rating_calculators/performance_predictor.py → spforge-0.8.35/spforge/ratings/player_performance_predictor.py +21 -149
  57. {spforge-0.4.3/spforge/ratings/rating_calculators → spforge-0.8.35/spforge/ratings}/start_rating_generator.py +21 -35
  58. spforge-0.8.35/spforge/ratings/team_performance_predictor.py +208 -0
  59. spforge-0.8.35/spforge/ratings/team_start_rating_generator.py +148 -0
  60. spforge-0.8.35/spforge/ratings/utils.py +152 -0
  61. spforge-0.8.35/spforge/scorer/__init__.py +9 -0
  62. spforge-0.8.35/spforge/scorer/_score.py +1610 -0
  63. spforge-0.8.35/spforge/transformers/__init__.py +7 -0
  64. spforge-0.8.35/spforge/transformers/_base.py +29 -0
  65. spforge-0.8.35/spforge/transformers/_net_over_predicted.py +72 -0
  66. {spforge-0.4.3/spforge/transformers/simple_transformer → spforge-0.8.35/spforge/transformers}/_operator.py +15 -50
  67. spforge-0.8.35/spforge/transformers/_other_transformer.py +137 -0
  68. spforge-0.8.35/spforge/transformers/_predictor.py +90 -0
  69. spforge-0.8.35/spforge/transformers/_simple_transformer.py +154 -0
  70. spforge-0.8.35/spforge/transformers/_team_ratio_predictor.py +185 -0
  71. {spforge-0.4.3 → spforge-0.8.35}/spforge/utils.py +38 -11
  72. spforge-0.8.35/spforge.egg-info/PKG-INFO +470 -0
  73. spforge-0.8.35/spforge.egg-info/SOURCES.txt +118 -0
  74. {spforge-0.4.3 → spforge-0.8.35}/spforge.egg-info/requires.txt +3 -4
  75. {spforge-0.4.3 → spforge-0.8.35}/spforge.egg-info/top_level.txt +3 -0
  76. spforge-0.8.35/tests/cross_validator/test_cross_validator.py +563 -0
  77. spforge-0.8.35/tests/distributions/test_distribution.py +299 -0
  78. spforge-0.8.35/tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
  79. spforge-0.8.35/tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  80. spforge-0.8.35/tests/end_to_end/test_lol_player_kills.py +292 -0
  81. spforge-0.8.35/tests/end_to_end/test_nba_player_points.py +228 -0
  82. spforge-0.8.35/tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +219 -0
  83. spforge-0.8.35/tests/end_to_end/test_nba_prediction_consistency.py +258 -0
  84. spforge-0.8.35/tests/estimator/test_sklearn_estimator.py +1250 -0
  85. spforge-0.8.35/tests/feature_generator/test_lag.py +586 -0
  86. spforge-0.8.35/tests/feature_generator/test_regressor_feature_generator.py +87 -0
  87. spforge-0.8.35/tests/feature_generator/test_rolling_against_opponent.py +182 -0
  88. spforge-0.8.35/tests/feature_generator/test_rolling_mean_binary.py +521 -0
  89. spforge-0.8.35/tests/feature_generator/test_rolling_mean_days.py +581 -0
  90. spforge-0.8.35/tests/feature_generator/test_rolling_window.py +722 -0
  91. spforge-0.8.35/tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
  92. spforge-0.8.35/tests/hyperparameter_tuning/test_rating_tuner.py +613 -0
  93. spforge-0.8.35/tests/performance_transformers/test_performance_manager.py +436 -0
  94. spforge-0.8.35/tests/performance_transformers/test_performances_transformers.py +553 -0
  95. spforge-0.8.35/tests/ratings/test_player_rating_generator.py +2940 -0
  96. spforge-0.8.35/tests/ratings/test_player_rating_no_mutation.py +214 -0
  97. spforge-0.8.35/tests/ratings/test_ratings_property.py +185 -0
  98. spforge-0.8.35/tests/ratings/test_team_rating_generator.py +2949 -0
  99. spforge-0.8.35/tests/ratings/test_utils_scaled_weights.py +136 -0
  100. spforge-0.8.35/tests/scorer/test_score.py +2282 -0
  101. spforge-0.8.35/tests/scorer/test_score_aggregation_granularity.py +494 -0
  102. spforge-0.8.35/tests/scorer/test_scorer_name.py +305 -0
  103. spforge-0.8.35/tests/test_autopipeline.py +883 -0
  104. spforge-0.8.35/tests/test_autopipeline_context.py +32 -0
  105. spforge-0.8.35/tests/test_feature_generator_pipeline.py +592 -0
  106. spforge-0.8.35/tests/transformers/test_estimator_transformer_context.py +42 -0
  107. spforge-0.8.35/tests/transformers/test_net_over_predicted.py +104 -0
  108. spforge-0.8.35/tests/transformers/test_predictor_transformer.py +228 -0
  109. spforge-0.8.35/tests/transformers/test_simple_transformer.py +142 -0
  110. spforge-0.8.35/tests/transformers/test_team_ratio_predictor.py +256 -0
  111. spforge-0.4.3/MANIFEST.in +0 -3
  112. spforge-0.4.3/PKG-INFO +0 -150
  113. spforge-0.4.3/README.md +0 -122
  114. spforge-0.4.3/examples/__init__.py +0 -2
  115. spforge-0.4.3/examples/lol/pipeline_transformer_example.py +0 -143
  116. spforge-0.4.3/examples/nba/cross_validation_example.py +0 -139
  117. spforge-0.4.3/requirements.txt +0 -10
  118. spforge-0.4.3/setup.py +0 -21
  119. spforge-0.4.3/spforge/__init__.py +0 -3
  120. spforge-0.4.3/spforge/cross_validator/__init__.py +0 -2
  121. spforge-0.4.3/spforge/cross_validator/cross_validator.py +0 -205
  122. spforge-0.4.3/spforge/data_structures.py +0 -195
  123. spforge-0.4.3/spforge/pipeline.py +0 -401
  124. spforge-0.4.3/spforge/pipeline_factory.py +0 -85
  125. spforge-0.4.3/spforge/pipeline_transformer.py +0 -218
  126. spforge-0.4.3/spforge/predictor/__init__.py +0 -9
  127. spforge-0.4.3/spforge/predictor/_base.py +0 -247
  128. spforge-0.4.3/spforge/predictor/predictor.py +0 -583
  129. spforge-0.4.3/spforge/predictor/sklearn_estimator.py +0 -77
  130. spforge-0.4.3/spforge/predictor_transformer/__init__.py +0 -5
  131. spforge-0.4.3/spforge/predictor_transformer/_simple_transformer.py +0 -120
  132. spforge-0.4.3/spforge/predictor_transformer/transformer.py +0 -111
  133. spforge-0.4.3/spforge/ratings/__init__.py +0 -15
  134. spforge-0.4.3/spforge/ratings/_player_rating_generator.py +0 -1062
  135. spforge-0.4.3/spforge/ratings/match_generator.py +0 -401
  136. spforge-0.4.3/spforge/ratings/rating_calculators/__init__.py +0 -6
  137. spforge-0.4.3/spforge/ratings/rating_calculators/match_rating_generator.py +0 -525
  138. spforge-0.4.3/spforge/ratings/rating_generator.py +0 -156
  139. spforge-0.4.3/spforge/scorer/__init__.py +0 -9
  140. spforge-0.4.3/spforge/scorer/_score.py +0 -514
  141. spforge-0.4.3/spforge/transformers/__init__.py +0 -14
  142. spforge-0.4.3/spforge/transformers/base_transformer.py +0 -44
  143. spforge-0.4.3/spforge/transformers/fit_transformers/__init__.py +0 -14
  144. spforge-0.4.3/spforge/transformers/fit_transformers/_net_over_predicted.py +0 -100
  145. spforge-0.4.3/spforge/transformers/fit_transformers/_performance_manager.py +0 -320
  146. spforge-0.4.3/spforge/transformers/fit_transformers/_predictor.py +0 -37
  147. spforge-0.4.3/spforge/transformers/fit_transformers/_team_ratio_predictor.py +0 -127
  148. spforge-0.4.3/spforge/transformers/lag_transformers/__init__.py +0 -14
  149. spforge-0.4.3/spforge/transformers/simple_transformer/__init__.py +0 -1
  150. spforge-0.4.3/spforge/tuner/__init__.py +0 -5
  151. spforge-0.4.3/spforge/tuner/base_tuner.py +0 -13
  152. spforge-0.4.3/spforge/tuner/performances_generator_tuner.py +0 -208
  153. spforge-0.4.3/spforge/tuner/pipeline_tuner.py +0 -306
  154. spforge-0.4.3/spforge/tuner/predictor_tuner.py +0 -253
  155. spforge-0.4.3/spforge/tuner/rating_generator_tuner.py +0 -403
  156. spforge-0.4.3/spforge/tuner/start_rating_optimizer.py +0 -287
  157. spforge-0.4.3/spforge/tuner/utils.py +0 -171
  158. spforge-0.4.3/spforge.egg-info/PKG-INFO +0 -150
  159. spforge-0.4.3/spforge.egg-info/SOURCES.txt +0 -80
  160. spforge-0.4.3/tests/test_pipeline.py +0 -474
  161. spforge-0.4.3/tests/test_pipeline_transformer.py +0 -90
  162. {spforge-0.4.3 → spforge-0.8.35}/LICENSE +0 -0
  163. {spforge-0.4.3 → spforge-0.8.35}/examples/lol/__init__.py +0 -0
  164. {spforge-0.4.3 → spforge-0.8.35}/examples/lol/data/__init__.py +0 -0
  165. {spforge-0.4.3 → spforge-0.8.35}/examples/lol/data/subsample_lol_data.parquet +0 -0
  166. {spforge-0.4.3 → spforge-0.8.35}/examples/nba/__init__.py +0 -0
  167. {spforge-0.4.3 → spforge-0.8.35}/examples/nba/data/__init__.py +0 -0
  168. {spforge-0.4.3 → spforge-0.8.35}/examples/nba/data/game_player_subsample.parquet +0 -0
  169. {spforge-0.4.3 → spforge-0.8.35}/setup.cfg +0 -0
  170. {spforge-0.4.3 → spforge-0.8.35}/spforge.egg-info/dependency_links.txt +0 -0
  171. /spforge-0.4.3/spforge/transformation_pipeline.py → /spforge-0.8.35/tests/transformers/test_other_transformer.py +0 -0
@@ -0,0 +1,2 @@
1
+ recursive-include examples/nba/data *.parquet
2
+ recursive-include examples/lol/data *.parquet
@@ -0,0 +1,470 @@
1
+ Metadata-Version: 2.4
2
+ Name: spforge
3
+ Version: 0.8.35
4
+ Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
5
+ Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
6
+ License: See LICENSE file
7
+ Project-URL: Homepage, https://github.com/Hiderdk/player-performance-ratings
8
+ Project-URL: Repository, https://github.com/Hiderdk/player-performance-ratings
9
+ Keywords: sports,machine-learning,ratings,features
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: numpy>=1.23.4
19
+ Requires-Dist: optuna>=3.4.0
20
+ Requires-Dist: pandas<3.0.0,>=2.0.0
21
+ Requires-Dist: pendulum>=1.0.0
22
+ Requires-Dist: scikit-learn>=1.4.0
23
+ Requires-Dist: lightgbm>=4.0.0
24
+ Requires-Dist: narwhals>2.0.0
25
+ Requires-Dist: polars>=1.17.0
26
+ Requires-Dist: pyarrow>=19.0.0
27
+ Dynamic: license-file
28
+
29
+ # spforge
30
+
31
+ **spforge** is a sports prediction framework for building feature-rich, stateful, and
32
+ sklearn-compatible modeling pipelines.
33
+
34
+ It is designed for:
35
+ - player- and team-level ratings
36
+ - rolling and lagged feature generation
37
+ - match-aware cross-validation
38
+ - probabilistic and point-estimate models
39
+ - pandas **and** polars DataFrames (via narwhals)
40
+
41
+ Typical use cases include:
42
+ - predicting game winners
43
+ - predicting player or team points
44
+ - generating probabilities using either machine learning models or distributions
45
+ - feature engineering and cross-validation
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ```bash
52
+ pip install spforge
53
+ ```
54
+
55
+ ## Core assumptions
56
+
57
+ spforge assumes your data is structured as:
58
+
59
+ - **One row per entity per match**
60
+ - e.g. `(game_id, player_id)` or `(game_id, team_id)`
61
+ - Higher-level predictions (team/game) are handled via aggregation or grouping.
62
+
63
+ ## Key concepts
64
+
65
+ Before diving into examples, here are fundamental concepts that guide how spforge works:
66
+
67
+ - **Temporal ordering prevents future leakage**: Data must be sorted chronologically (by date, then match, then team/player). This ensures models never "see the future" when making predictions.
68
+
69
+ - **Elo-style ratings**: Player and team ratings evolve over time based on match performance. Think of it like a chess rating — winning against strong opponents increases your rating more. Ratings are calculated BEFORE each match to avoid leakage.
70
+
71
+ - **State management lifecycle**:
72
+ - `fit_transform(df)`: Learn patterns from historical data (ratings update, windows build up)
73
+ - `transform(df)`: Apply to more historical data (continues updating state)
74
+ - `future_transform(df)`: Generate features for prediction WITHOUT updating internal state (read-only)
75
+
76
+ - **Granularity-based aggregation**: Player-level data (e.g., individual stats) can be automatically aggregated to team-level for game winner predictions.
77
+
78
+ - **pandas and polars support**: All components work identically with both DataFrame types via the narwhals library.
79
+
80
+ ## Example
81
+
82
+ This example demonstrates predicting NBA game winners using player-level ratings.
83
+
84
+ ```python
85
+ import pandas as pd
86
+ from sklearn.linear_model import LogisticRegression
87
+
88
+ from examples import get_sub_sample_nba_data
89
+ from spforge.autopipeline import AutoPipeline
90
+ from spforge.data_structures import ColumnNames
91
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
92
+
93
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
94
+
95
+ # Step 1: Define column mappings for your dataset
96
+ column_names = ColumnNames(
97
+ team_id="team_id",
98
+ match_id="game_id",
99
+ start_date="start_date",
100
+ player_id="player_name",
101
+ )
102
+
103
+ # Step 2: CRITICAL - Sort data chronologically to prevent future leakage
104
+ # This ensures ratings and features only use past information
105
+ df = df.sort_values(
106
+ by=[
107
+ column_names.start_date, # First by date
108
+ column_names.match_id, # Then by match
109
+ column_names.team_id, # Then by team
110
+ column_names.player_id, # Finally by player
111
+ ]
112
+ )
113
+
114
+ # Step 3: Filter to valid games (exactly 2 teams)
115
+ df = (
116
+ df.assign(
117
+ team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique")
118
+ )
119
+ .loc[lambda x: x.team_count == 2]
120
+ .drop(columns=["team_count"])
121
+ )
122
+
123
+ # Step 4: Split into historical (training) and future (prediction) data
124
+ # In production, "future" would be upcoming games without outcomes
125
+ most_recent_10_games = df[column_names.match_id].unique()[-10:]
126
+ historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
127
+ future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["won"])
128
+
129
+ # Step 5: Generate player ratings based on win/loss history
130
+ # Each player gets a rating that updates after each game
131
+ # Unlike traditional team Elo, ratings follow individual players
132
+ rating_generator = PlayerRatingGenerator(
133
+ performance_column="won", # Update ratings based on wins/losses
134
+ rating_change_multiplier=30, # How quickly ratings adjust (higher = more volatile)
135
+ column_names=column_names,
136
+ non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
137
+ )
138
+ # fit_transform learns ratings from historical games
139
+ historical_df = rating_generator.fit_transform(historical_df)
140
+
141
+ # Step 6: Create prediction pipeline
142
+ # AutoPipeline automatically handles preprocessing (encoding, scaling)
143
+ # granularity aggregates player-level data to team-level before fitting
144
+ pipeline = AutoPipeline(
145
+ estimator=LogisticRegression(),
146
+ granularity=["game_id", "team_id"], # Aggregate players → teams
147
+ estimator_features=rating_generator.features_out + ["location"], # Rating + home/away
148
+ )
149
+
150
+ # Train on historical data
151
+ pipeline.fit(X=historical_df, y=historical_df["won"])
152
+
153
+ # Step 7: Make predictions on future games
154
+ # future_transform generates features WITHOUT updating rating state
155
+ # This is crucial: we don't want to update ratings until games actually happen
156
+ future_df = rating_generator.future_transform(future_df)
157
+ future_predictions = pipeline.predict_proba(future_df)[:, 1] # Probability of winning
158
+ future_df["game_winner_probability"] = future_predictions
159
+
160
+ # Aggregate player-level predictions to team-level for final output
161
+ team_grouped_predictions = future_df.groupby(column_names.match_id).first()[
162
+ [
163
+ column_names.start_date,
164
+ column_names.team_id,
165
+ "team_id_opponent",
166
+ "game_winner_probability",
167
+ ]
168
+ ]
169
+
170
+ print(team_grouped_predictions)
171
+ ```
172
+ Output:
173
+ ```
174
+ start_date team_id team_id_opponent game_winner_probability
175
+ game_id
176
+ 0022200767 2023-01-31 1610612749 1610612766 0.731718
177
+ 0022200768 2023-01-31 1610612740 1610612743 0.242622
178
+ 0022200770 2023-02-01 1610612753 1610612755 0.278237
179
+ 0022200771 2023-02-01 1610612757 1610612763 0.340883
180
+ 0022200772 2023-02-01 1610612738 1610612751 0.629010
181
+ 0022200773 2023-02-01 1610612745 1610612760 0.401803
182
+ 0022200774 2023-02-01 1610612744 1610612750 0.430164
183
+ 0022200775 2023-02-01 1610612758 1610612759 0.587513
184
+ 0022200776 2023-02-01 1610612761 1610612762 0.376864
185
+ 0022200777 2023-02-01 1610612737 1610612756 0.371888
186
+ ```
187
+ ## AutoPipeline
188
+
189
+ `AutoPipeline` is a sklearn-compatible wrapper that handles the full modeling pipeline,
190
+ from preprocessing to final estimation.
191
+
192
+ - Builds all required preprocessing steps automatically based on the estimator:
193
+ - One-hot encoding and imputation for linear models (e.g. `LogisticRegression`)
194
+ - Native categorical handling for LightGBM
195
+ - Ordinal encoding where appropriate
196
+ - Supports **predictor transformers**, allowing upstream models to generate features
197
+ that are consumed by the final estimator.
198
+ - Supports optional **granularity-based aggregation**, enabling row-level data
199
+ (e.g. player-game) to be grouped before fitting (e.g. game-team level).
200
+ - Provides additional functionality such as:
201
+ - training-time row filtering
202
+ - target clipping and validation handling
203
+ - consistent feature tracking for sklearn integration
204
+
205
+ ## Feature Engineering
206
+
207
+ spforge provides stateful feature generators that create rich features from historical match data while maintaining temporal ordering to prevent data leakage.
208
+
209
+ ### Feature types available
210
+
211
+ - **Ratings**: Elo-style player/team ratings that evolve based on performance (separate offense/defense ratings)
212
+ - Can combine multiple stats into a composite performance metric using `performance_weights` (e.g., 60% kills + 40% assists)
213
+ - Auto-normalizes raw stats to 0-1 range with `auto_scale_performance=True`
214
+ - **Lags**: Previous match statistics, automatically shifted to prevent leakage
215
+ - **Rolling windows**: Averages/sums over the last N matches
216
+ - **FeatureGeneratorPipeline**: Chain multiple generators together sequentially
217
+
218
+ ### Example: Building a feature pipeline
219
+
220
+ ```python
221
+ from spforge import FeatureGeneratorPipeline
222
+ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
223
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
224
+ from spforge.performance_transformers import ColumnWeight
225
+
226
+ # Create individual feature generators
227
+ player_rating_generator = PlayerRatingGenerator(
228
+ performance_column="points",
229
+ auto_scale_performance=True, # Normalizes points to 0-1 range
230
+ column_names=column_names,
231
+ features_out=[RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_PROJECTED],
232
+ )
233
+
234
+ # Alternative: Combine multiple stats into a composite performance metric
235
+ # player_rating_generator = PlayerRatingGenerator(
236
+ # performance_column="weighted_performance", # Name for the composite metric
237
+ # performance_weights=[
238
+ # ColumnWeight(name="kills", weight=0.6),
239
+ # ColumnWeight(name="assists", weight=0.4),
240
+ # ],
241
+ # column_names=column_names,
242
+ # features_out=[RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_PROJECTED],
243
+ # )
244
+
245
+ lag_transformer = LagTransformer(
246
+ features=["points"],
247
+ lag_length=3, # Last 3 games
248
+ granularity=["player_id"],
249
+ )
250
+
251
+ rolling_transformer = RollingWindowTransformer(
252
+ features=["points"],
253
+ window=10, # Last 10 games average
254
+ granularity=["player_id"],
255
+ )
256
+
257
+ # Chain them together
258
+ features_pipeline = FeatureGeneratorPipeline(
259
+ column_names=column_names,
260
+ feature_generators=[
261
+ player_rating_generator,
262
+ lag_transformer,
263
+ rolling_transformer,
264
+ ],
265
+ )
266
+
267
+ # Learn from historical data
268
+ historical_df = features_pipeline.fit_transform(historical_df)
269
+
270
+ # For production predictions (doesn't update internal state)
271
+ future_df = features_pipeline.future_transform(future_df)
272
+ ```
273
+
274
+ **Key points:**
275
+ - `fit_transform`: Learn ratings/patterns from historical data (updates internal state)
276
+ - `transform`: Apply to more historical data (continues updating state)
277
+ - `future_transform`: Generate features for prediction (read-only, no state updates)
278
+ - Features are automatically shifted by 1 match to prevent data leakage
279
+
280
+ See [examples/nba/feature_engineering_example.py](examples/nba/feature_engineering_example.py) for a complete example with detailed explanations.
281
+
282
+ ## Cross Validation and Scorer metrics
283
+
284
+ Regular k-fold cross-validation doesn't work for time-series sports data because it can create "future leakage" — using future games to predict past games. `MatchKFoldCrossValidator` ensures training data is always BEFORE validation data, respecting temporal ordering.
285
+
286
+ ### Why this matters
287
+
288
+ Sports data has strong time dependencies: teams improve, players get injured, strategies evolve. Standard CV would overestimate model performance by allowing the model to "see the future."
289
+
290
+ ### Example: Time-series cross-validation
291
+
292
+ ```python
293
+ from spforge.cross_validator import MatchKFoldCrossValidator
294
+ from spforge.scorer import SklearnScorer, Filter, Operator
295
+ from sklearn.metrics import mean_absolute_error
296
+
297
+ # Set up temporal cross-validation
298
+ cross_validator = MatchKFoldCrossValidator(
299
+ date_column_name=column_names.start_date,
300
+ match_id_column_name=column_names.match_id,
301
+ estimator=pipeline, # Your AutoPipeline
302
+ prediction_column_name="points_pred",
303
+ target_column="points",
304
+ n_splits=3, # Number of temporal folds
305
+ # Must include both estimator features and context features
306
+ features=pipeline.required_features,
307
+ )
308
+
309
+ # Generate validation predictions
310
+ # add_training_predictions=True also returns predictions on training data
311
+ validation_df = cross_validator.generate_validation_df(df=df, add_training_predictions=True)
312
+
313
+ # Score only validation rows, filtering to players who actually played
314
+ scorer = SklearnScorer(
315
+ pred_column="points_pred",
316
+ target="points",
317
+ scorer_function=mean_absolute_error,
318
+ validation_column="is_validation", # Only score where is_validation == 1
319
+ filters=[
320
+ Filter(column_name="minutes", value=0, operator=Operator.GREATER_THAN)
321
+ ],
322
+ )
323
+
324
+ mae = scorer.score(validation_df)
325
+ print(f"Validation MAE: {mae:.2f}")
326
+ ```
327
+
328
+ **Key points:**
329
+ - `add_training_predictions=True` returns both training and validation predictions
330
+ - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
331
+ - Use `validation_column` in scorer to score only validation rows
332
+ - Training data always comes BEFORE validation data chronologically
333
+ - Must pass all required features (use `pipeline.required_features`)
334
+ - Scorers can filter rows (e.g., only score players who played minutes > 0)
335
+
336
+ See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
337
+
338
+ ## Distributions (Advanced)
339
+
340
+ Instead of predicting a single point estimate, you can predict full probability distributions. For example, instead of "player will score 15 points", predict P(0 points), P(1 point), ..., P(40 points).
341
+
342
+ ### When to use distributions
343
+
344
+ - Modeling count data (points, goals, kills, assists)
345
+ - When you need uncertainty estimates or confidence intervals
346
+ - For expected value calculations in betting or DFS
347
+ - When the outcome has inherent randomness
348
+
349
+ ### What NegativeBinomialEstimator does during fit
350
+
351
+ During training, `NegativeBinomialEstimator`:
352
+
353
+ 1. Takes the point estimates (from `point_estimate_pred_column`) and actual target values
354
+ 2. Optimizes a dispersion parameter `r` using maximum likelihood estimation on the negative binomial distribution
355
+ 3. If `r_specific_granularity` is set (e.g., per player), calculates entity-specific `r` values by:
356
+ - Computing rolling means and variances of point estimates over recent matches
357
+ - Binning entities by quantiles of mean and variance
358
+ - Fitting separate `r` values for each bin to capture different uncertainty patterns
359
+
360
+ During prediction, it uses the learned `r` parameter(s) and the point estimates to generate a full probability distribution over all possible values (0 to max_value).
361
+
362
+ ### Example: Comparing classifiers vs distribution estimators
363
+
364
+ A key advantage is comparing different approaches for generating probability distributions. Both LGBMClassifier and LGBMRegressor+NegativeBinomial output probabilities in the same format, making them directly comparable.
365
+
366
+ ```python
367
+ from spforge.distributions import NegativeBinomialEstimator
368
+ from spforge.transformers import EstimatorTransformer
369
+ from lightgbm import LGBMClassifier, LGBMRegressor
370
+
371
+ # Approach 1: LGBMClassifier (direct probability prediction)
372
+ pipeline_classifier = AutoPipeline(
373
+ estimator=LGBMClassifier(verbose=-100, random_state=42),
374
+ estimator_features=features_pipeline.features_out,
375
+ )
376
+
377
+ # Approach 2: LGBMRegressor + NegativeBinomialEstimator
378
+ distribution_estimator = NegativeBinomialEstimator(
379
+ max_value=40, # Predict 0-40 points
380
+ point_estimate_pred_column="points_estimate", # Uses regressor output
381
+ r_specific_granularity=["player_id"], # Player-specific dispersion
382
+ predicted_r_weight=1,
383
+ column_names=column_names,
384
+ )
385
+
386
+ pipeline_negbin = AutoPipeline(
387
+ estimator=distribution_estimator,
388
+ estimator_features=features_pipeline.features_out,
389
+ predictor_transformers=[
390
+ EstimatorTransformer(
391
+ prediction_column_name="points_estimate",
392
+ estimator=LGBMRegressor(verbose=-100, random_state=42),
393
+ features=features_pipeline.features_out,
394
+ )
395
+ ],
396
+ )
397
+
398
+ # Compare using cross-validation (see examples for full setup)
399
+ # Results on NBA player points prediction:
400
+ # LGBMClassifier Ordinal Loss: 1.0372
401
+ # LGBMRegressor + NegativeBinomial Ordinal Loss: 0.3786
402
+ # LGBMRegressor + NegativeBinomial Point Est MAE: 4.5305
403
+ ```
404
+
405
+ **Key points:**
406
+ - Both approaches output probability distributions over the same range
407
+ - `NegativeBinomialEstimator` performs significantly better (lower ordinal loss)
408
+ - Distribution approach provides both probability distributions and point estimates
409
+ - Can model player-specific variance with `r_specific_granularity`
410
+
411
+ See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete runnable example with both approaches.
412
+
413
+ ## Predictions as features for downstream models (Advanced)
414
+
415
+ A common pattern in sports analytics is using output from one model as input to another. For example, team strength (game winner probability) often influences individual player performance.
416
+
417
+ ### Why this matters
418
+
419
+ Hierarchical modeling captures dependencies: team context → player performance, game flow → outcome probabilities. By chaining models, each stage can specialize and the final model combines their insights.
420
+
421
+ ### Example: Two-stage modeling with predictor_transformers
422
+
423
+ ```python
424
+ from spforge.transformers import EstimatorTransformer
425
+ from lightgbm import LGBMRegressor
426
+
427
+ # Stage 1: Create a raw point estimate
428
+ points_estimate_transformer = EstimatorTransformer(
429
+ prediction_column_name="points_estimate_raw",
430
+ estimator=LGBMRegressor(verbose=-100, n_estimators=30),
431
+ )
432
+
433
+ # Stage 2: Refine estimate using Stage 1 output
434
+ player_points_pipeline = AutoPipeline(
435
+ estimator=LGBMRegressor(verbose=-100, n_estimators=50),
436
+ estimator_features=features_pipeline.features_out, # Original features
437
+ # predictor_transformers execute first, adding their predictions
438
+ predictor_transformers=[points_estimate_transformer],
439
+ )
440
+
441
+ # During fit:
442
+ # 1. Stage 1 fits and generates "points_estimate_raw" column
443
+ # 2. Stage 2 fits using original features + points_estimate_raw
444
+ player_points_pipeline.fit(X=train_df, y=train_df["points"])
445
+
446
+ # During predict:
447
+ # 1. Stage 1 generates "points_estimate_raw"
448
+ # 2. Stage 2 uses it to make final prediction
449
+ predictions = player_points_pipeline.predict(test_df)
450
+ ```
451
+
452
+ **Key points:**
453
+ - `predictor_transformers` chains estimators: output of one becomes input to next
454
+ - All transformers share the same target (y) during fit
455
+ - Transformers execute during both `fit()` and `predict()`
456
+ - Common use cases:
457
+ - Generate point estimates for distribution models
458
+ - Multi-stage refinement of predictions
459
+ - Combining different model types (linear → tree-based)
460
+
461
+ See [examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py) for a complete example. Also demonstrated in [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py).
462
+
463
+ ## More Examples
464
+
465
+ For complete, runnable examples with detailed explanations:
466
+
467
+ - **[examples/nba/feature_engineering_example.py](examples/nba/feature_engineering_example.py)** - Feature generation lifecycle (ratings, lags, rolling windows)
468
+ - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
469
+ - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
470
+ - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction