spforge 0.8.5.tar.gz → 0.8.8.tar.gz

This diff shows the changes between two publicly released versions of spforge, as they appear in their public registry. It is provided for informational purposes only.

Files changed (118)
  1. {spforge-0.8.5/spforge.egg-info → spforge-0.8.8}/PKG-INFO +11 -18
  2. {spforge-0.8.5 → spforge-0.8.8}/README.md +10 -17
  3. spforge-0.8.8/examples/lol/pipeline_transformer_example.py +106 -0
  4. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/cross_validation_example.py +4 -11
  5. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/feature_engineering_example.py +33 -15
  6. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/game_winner_example.py +24 -14
  7. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/predictor_transformers_example.py +29 -16
  8. {spforge-0.8.5 → spforge-0.8.8}/pyproject.toml +1 -1
  9. {spforge-0.8.5 → spforge-0.8.8}/spforge/features_generator_pipeline.py +8 -4
  10. {spforge-0.8.5 → spforge-0.8.8}/spforge/hyperparameter_tuning/_default_search_spaces.py +26 -1
  11. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/__init__.py +4 -0
  12. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/_player_rating.py +11 -0
  13. spforge-0.8.8/spforge/ratings/league_start_rating_optimizer.py +201 -0
  14. {spforge-0.8.5 → spforge-0.8.8/spforge.egg-info}/PKG-INFO +11 -18
  15. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/SOURCES.txt +2 -0
  16. spforge-0.8.8/tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  17. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
  18. {spforge-0.8.5 → spforge-0.8.8}/tests/ratings/test_player_rating_generator.py +27 -0
  19. {spforge-0.8.5 → spforge-0.8.8}/tests/scorer/test_score.py +90 -0
  20. {spforge-0.8.5 → spforge-0.8.8}/tests/test_feature_generator_pipeline.py +43 -0
  21. spforge-0.8.5/examples/lol/pipeline_transformer_example.py +0 -123
  22. {spforge-0.8.5 → spforge-0.8.8}/LICENSE +0 -0
  23. {spforge-0.8.5 → spforge-0.8.8}/MANIFEST.in +0 -0
  24. {spforge-0.8.5 → spforge-0.8.8}/examples/__init__.py +0 -0
  25. {spforge-0.8.5 → spforge-0.8.8}/examples/game_level_example.py +0 -0
  26. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/__init__.py +0 -0
  27. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/data/__init__.py +0 -0
  28. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/data/subsample_lol_data.parquet +0 -0
  29. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/data/utils.py +0 -0
  30. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/__init__.py +0 -0
  31. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/data/__init__.py +0 -0
  32. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/data/game_player_subsample.parquet +0 -0
  33. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/data/utils.py +0 -0
  34. {spforge-0.8.5 → spforge-0.8.8}/setup.cfg +0 -0
  35. {spforge-0.8.5 → spforge-0.8.8}/spforge/__init__.py +0 -0
  36. {spforge-0.8.5 → spforge-0.8.8}/spforge/autopipeline.py +0 -0
  37. {spforge-0.8.5 → spforge-0.8.8}/spforge/base_feature_generator.py +0 -0
  38. {spforge-0.8.5 → spforge-0.8.8}/spforge/cross_validator/__init__.py +0 -0
  39. {spforge-0.8.5 → spforge-0.8.8}/spforge/cross_validator/_base.py +0 -0
  40. {spforge-0.8.5 → spforge-0.8.8}/spforge/cross_validator/cross_validator.py +0 -0
  41. {spforge-0.8.5 → spforge-0.8.8}/spforge/data_structures.py +0 -0
  42. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/__init__.py +0 -0
  43. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/_negative_binomial_estimator.py +0 -0
  44. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/_normal_distribution_predictor.py +0 -0
  45. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/_student_t_distribution_estimator.py +0 -0
  46. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/__init__.py +0 -0
  47. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_conditional_estimator.py +0 -0
  48. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_frequency_bucketing_classifier.py +0 -0
  49. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_granularity_estimator.py +0 -0
  50. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_group_by_estimator.py +0 -0
  51. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_ordinal_classifier.py +0 -0
  52. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_sklearn_enhancer_estimator.py +0 -0
  53. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/__init__.py +0 -0
  54. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_base.py +0 -0
  55. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_lag.py +0 -0
  56. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_net_over_predicted.py +0 -0
  57. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_regressor_feature_generator.py +0 -0
  58. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_against_opponent.py +0 -0
  59. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_mean_binary.py +0 -0
  60. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_mean_days.py +0 -0
  61. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_window.py +0 -0
  62. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_utils.py +0 -0
  63. {spforge-0.8.5 → spforge-0.8.8}/spforge/hyperparameter_tuning/__init__.py +0 -0
  64. {spforge-0.8.5 → spforge-0.8.8}/spforge/hyperparameter_tuning/_tuner.py +0 -0
  65. {spforge-0.8.5 → spforge-0.8.8}/spforge/performance_transformers/__init__.py +0 -0
  66. {spforge-0.8.5 → spforge-0.8.8}/spforge/performance_transformers/_performance_manager.py +0 -0
  67. {spforge-0.8.5 → spforge-0.8.8}/spforge/performance_transformers/_performances_transformers.py +0 -0
  68. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/_base.py +0 -0
  69. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/_team_rating.py +0 -0
  70. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/enums.py +0 -0
  71. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/league_identifier.py +0 -0
  72. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/player_performance_predictor.py +0 -0
  73. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/start_rating_generator.py +0 -0
  74. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/team_performance_predictor.py +0 -0
  75. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/team_start_rating_generator.py +0 -0
  76. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/utils.py +0 -0
  77. {spforge-0.8.5 → spforge-0.8.8}/spforge/scorer/__init__.py +0 -0
  78. {spforge-0.8.5 → spforge-0.8.8}/spforge/scorer/_score.py +0 -0
  79. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/__init__.py +0 -0
  80. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_base.py +0 -0
  81. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_net_over_predicted.py +0 -0
  82. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_operator.py +0 -0
  83. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_other_transformer.py +0 -0
  84. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_predictor.py +0 -0
  85. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_simple_transformer.py +0 -0
  86. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_team_ratio_predictor.py +0 -0
  87. {spforge-0.8.5 → spforge-0.8.8}/spforge/utils.py +0 -0
  88. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/dependency_links.txt +0 -0
  89. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/requires.txt +0 -0
  90. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/top_level.txt +0 -0
  91. {spforge-0.8.5 → spforge-0.8.8}/tests/cross_validator/test_cross_validator.py +0 -0
  92. {spforge-0.8.5 → spforge-0.8.8}/tests/distributions/test_distribution.py +0 -0
  93. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_estimator_hyperparameter_tuning.py +0 -0
  94. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_lol_player_kills.py +0 -0
  95. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_nba_player_points.py +0 -0
  96. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_nba_prediction_consistency.py +0 -0
  97. {spforge-0.8.5 → spforge-0.8.8}/tests/estimator/test_sklearn_estimator.py +0 -0
  98. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_lag.py +0 -0
  99. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_regressor_feature_generator.py +0 -0
  100. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_against_opponent.py +0 -0
  101. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_mean_binary.py +0 -0
  102. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_mean_days.py +0 -0
  103. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_window.py +0 -0
  104. {spforge-0.8.5 → spforge-0.8.8}/tests/hyperparameter_tuning/test_estimator_tuner.py +0 -0
  105. {spforge-0.8.5 → spforge-0.8.8}/tests/hyperparameter_tuning/test_rating_tuner.py +0 -0
  106. {spforge-0.8.5 → spforge-0.8.8}/tests/performance_transformers/test_performance_manager.py +0 -0
  107. {spforge-0.8.5 → spforge-0.8.8}/tests/performance_transformers/test_performances_transformers.py +0 -0
  108. {spforge-0.8.5 → spforge-0.8.8}/tests/ratings/test_ratings_property.py +0 -0
  109. {spforge-0.8.5 → spforge-0.8.8}/tests/ratings/test_team_rating_generator.py +0 -0
  110. {spforge-0.8.5 → spforge-0.8.8}/tests/scorer/test_score_aggregation_granularity.py +0 -0
  111. {spforge-0.8.5 → spforge-0.8.8}/tests/test_autopipeline.py +0 -0
  112. {spforge-0.8.5 → spforge-0.8.8}/tests/test_autopipeline_context.py +0 -0
  113. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_estimator_transformer_context.py +0 -0
  114. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_net_over_predicted.py +0 -0
  115. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_other_transformer.py +0 -0
  116. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_predictor_transformer.py +0 -0
  117. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_simple_transformer.py +0 -0
  118. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_team_ratio_predictor.py +0 -0
{spforge-0.8.5/spforge.egg-info → spforge-0.8.8}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: spforge
- Version: 0.8.5
+ Version: 0.8.8
  Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
  Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
  License: See LICENSE file
@@ -85,12 +85,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
  import pandas as pd
  from sklearn.linear_model import LogisticRegression

+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
- from spforge.ratings import RatingKnownFeatures
- from spforge.ratings._player_rating import PlayerRatingGenerator
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures

- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)

  # Step 1: Define column mappings for your dataset
  column_names = ColumnNames(
@@ -144,7 +144,7 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],  # Aggregate players → teams
-     feature_names=rating_generator.features_out + ["location"],  # Rating + home/away
+     estimator_features=rating_generator.features_out + ["location"],  # Rating + home/away
  )

  # Train on historical data
@@ -302,8 +302,8 @@ cross_validator = MatchKFoldCrossValidator(
      prediction_column_name="points_pred",
      target_column="points",
      n_splits=3,  # Number of temporal folds
-     # Must include both feature_names AND context_feature_names
-     features=pipeline.feature_names + pipeline.context_feature_names,
+     # Must include both estimator features and context features
+     features=pipeline.required_features,
  )

  # Generate validation predictions
@@ -330,7 +330,7 @@ print(f"Validation MAE: {mae:.2f}")
  - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
  - Use `validation_column` in scorer to score only validation rows
  - Training data always comes BEFORE validation data chronologically
- - Must pass both `feature_names` + `context_feature_names` to `features` parameter
+ - Must pass all required features (use `pipeline.required_features`)
  - Scorers can filter rows (e.g., only score players who played minutes > 0)

  See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -371,7 +371,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
  # Approach 1: LGBMClassifier (direct probability prediction)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )

  # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -385,13 +385,7 @@ distribution_estimator = NegativeBinomialEstimator(

  pipeline_negbin = AutoPipeline(
      estimator=distribution_estimator,
-     feature_names=features_pipeline.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_pipeline.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -439,7 +433,7 @@ points_estimate_transformer = EstimatorTransformer(
  # Stage 2: Refine estimate using Stage 1 output
  player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,  # Original features
+     estimator_features=features_pipeline.features_out,  # Original features
      # predictor_transformers execute first, adding their predictions
      predictor_transformers=[points_estimate_transformer],
  )
@@ -474,4 +468,3 @@ For complete, runnable examples with detailed explanations:
  - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
  - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
  - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
{spforge-0.8.5 → spforge-0.8.8}/README.md
@@ -57,12 +57,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
  import pandas as pd
  from sklearn.linear_model import LogisticRegression

+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
- from spforge.ratings import RatingKnownFeatures
- from spforge.ratings._player_rating import PlayerRatingGenerator
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures

- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)

  # Step 1: Define column mappings for your dataset
  column_names = ColumnNames(
@@ -116,7 +116,7 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],  # Aggregate players → teams
-     feature_names=rating_generator.features_out + ["location"],  # Rating + home/away
+     estimator_features=rating_generator.features_out + ["location"],  # Rating + home/away
  )

  # Train on historical data
@@ -274,8 +274,8 @@ cross_validator = MatchKFoldCrossValidator(
      prediction_column_name="points_pred",
      target_column="points",
      n_splits=3,  # Number of temporal folds
-     # Must include both feature_names AND context_feature_names
-     features=pipeline.feature_names + pipeline.context_feature_names,
+     # Must include both estimator features and context features
+     features=pipeline.required_features,
  )

  # Generate validation predictions
@@ -302,7 +302,7 @@ print(f"Validation MAE: {mae:.2f}")
  - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
  - Use `validation_column` in scorer to score only validation rows
  - Training data always comes BEFORE validation data chronologically
- - Must pass both `feature_names` + `context_feature_names` to `features` parameter
+ - Must pass all required features (use `pipeline.required_features`)
  - Scorers can filter rows (e.g., only score players who played minutes > 0)

  See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -343,7 +343,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
  # Approach 1: LGBMClassifier (direct probability prediction)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )

  # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -357,13 +357,7 @@ distribution_estimator = NegativeBinomialEstimator(

  pipeline_negbin = AutoPipeline(
      estimator=distribution_estimator,
-     feature_names=features_pipeline.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_pipeline.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -411,7 +405,7 @@ points_estimate_transformer = EstimatorTransformer(
  # Stage 2: Refine estimate using Stage 1 output
  player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,  # Original features
+     estimator_features=features_pipeline.features_out,  # Original features
      # predictor_transformers execute first, adding their predictions
      predictor_transformers=[points_estimate_transformer],
  )
@@ -446,4 +440,3 @@ For complete, runnable examples with detailed explanations:
  - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
  - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
  - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
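The change repeated across the PKG-INFO and README hunks above is an AutoPipeline API rename: `feature_names` becomes `estimator_features`, the explicit `context_feature_names` list disappears, and `feature_names + context_feature_names` is replaced by the single `required_features` property. A minimal before/after sketch of the migration, using only names that appear in the hunks (`rating_generator` is assumed to be the fitted generator from the README example):

```python
from sklearn.linear_model import LogisticRegression

from spforge.autopipeline import AutoPipeline

# 0.8.5 (removed): feature_names plus an explicit context_feature_names list
# pipeline = AutoPipeline(
#     estimator=LogisticRegression(),
#     granularity=["game_id", "team_id"],
#     feature_names=rating_generator.features_out + ["location"],
# )
# cv_features = pipeline.feature_names + pipeline.context_feature_names

# 0.8.8 (added): one estimator_features argument; required_features bundles
# the estimator features and context features the cross-validator needs
pipeline = AutoPipeline(
    estimator=LogisticRegression(),
    granularity=["game_id", "team_id"],
    estimator_features=rating_generator.features_out + ["location"],
)
cv_features = pipeline.required_features
```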
spforge-0.8.8/examples/lol/pipeline_transformer_example.py (new file)
@@ -0,0 +1,106 @@
+ import polars as pl
+ from lightgbm import LGBMRegressor
+
+ from examples import get_sub_sample_lol_data
+ from spforge import AutoPipeline, ColumnNames, FeatureGeneratorPipeline
+ from spforge.distributions import NegativeBinomialEstimator
+ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
+ from spforge.transformers import EstimatorTransformer
+
+ column_names = ColumnNames(
+     team_id="teamname",
+     match_id="gameid",
+     start_date="date",
+     player_id="player_uid",
+     league="league",
+     position="position",
+ )
+
+ df = get_sub_sample_lol_data(as_pandas=False, as_polars=True)
+ df = (
+     df.with_columns(
+         pl.concat_str([pl.col("playername"), pl.col("teamname")], separator="__").alias(
+             column_names.player_id
+         )
+     )
+     .filter(pl.col(column_names.position) != "team")
+     .with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count"),
+         pl.col(column_names.player_id)
+         .n_unique()
+         .over([column_names.match_id, column_names.team_id])
+         .alias("player_count"),
+     )
+     .filter((pl.col("team_count") == 2) & (pl.col("player_count") == 5))
+     .drop(["team_count", "player_count"])
+     .unique(subset=[column_names.match_id, column_names.player_id, column_names.team_id])
+     .sort(
+         [
+             column_names.start_date,
+             column_names.match_id,
+             column_names.team_id,
+             column_names.player_id,
+         ]
+     )
+ )
+
+ most_recent_10_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(10)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("kills")
+
+ lag_transformers = [
+     LagTransformer(features=["kills", "deaths"], lag_length=3, granularity=["player_uid"]),
+     RollingWindowTransformer(
+         features=["kills", "deaths"],
+         window=20,
+         min_periods=1,
+         granularity=["player_uid"],
+     ),
+ ]
+
+ features_generator = FeatureGeneratorPipeline(
+     column_names=column_names,
+     feature_generators=lag_transformers,
+ )
+
+ historical_df = features_generator.fit_transform(historical_df).to_pandas()
+ future_df = features_generator.future_transform(future_df).to_pandas()
+
+ point_estimate_transformer = EstimatorTransformer(
+     prediction_column_name="kills_estimate",
+     estimator=LGBMRegressor(verbose=-100, random_state=42),
+     features=features_generator.features_out,
+ )
+
+ probability_estimator = NegativeBinomialEstimator(
+     max_value=15,
+     point_estimate_pred_column="kills_estimate",
+     r_specific_granularity=[column_names.player_id],
+     predicted_r_weight=1,
+     column_names=column_names,
+ )
+
+ pipeline = AutoPipeline(
+     estimator=probability_estimator,
+     estimator_features=features_generator.features_out,
+     predictor_transformers=[point_estimate_transformer],
+ )
+
+ pipeline.fit(X=historical_df, y=historical_df["kills"])
+
+ future_point_estimates = pipeline.predict(future_df)
+ future_probabilities = pipeline.predict_proba(future_df)
+ future_df["kills_pred"] = future_point_estimates
+
+ print(future_df.head(5))
+ print(f"Probability matrix shape: {future_probabilities.shape}")
+ print(f"First row probabilities (0-15 kills): {future_probabilities[0]}")
{spforge-0.8.5 → spforge-0.8.8}/examples/nba/cross_validation_example.py
@@ -51,7 +51,7 @@ print("\nApproach 1: LGBMClassifier (direct probability prediction)")
  print("-" * 70)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_generator.features_out,
+     estimator_features=features_generator.features_out,
  )

  cross_validator_classifier = MatchKFoldCrossValidator(
@@ -60,7 +60,7 @@ cross_validator_classifier = MatchKFoldCrossValidator(
      estimator=pipeline_classifier,
      prediction_column_name="points_probabilities_classifier",
      target_column="points",
-     features=pipeline_classifier.feature_names,
+     features=pipeline_classifier.required_features,
  )
  validation_df_classifier = cross_validator_classifier.generate_validation_df(df=df)

@@ -80,20 +80,13 @@ print("-" * 70)
  predictor_negbin = NegativeBinomialEstimator(
      max_value=40,
      point_estimate_pred_column="points_estimate",
-     r_specific_granularity=["player_id"],
      predicted_r_weight=1,
      column_names=column_names,
  )

  pipeline_negbin = AutoPipeline(
      estimator=predictor_negbin,
-     feature_names=features_generator.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_generator.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -109,7 +102,7 @@ cross_validator_negbin = MatchKFoldCrossValidator(
      estimator=pipeline_negbin,
      prediction_column_name="points_probabilities_negbin",
      target_column="points",
-     features=pipeline_negbin.context_feature_names + pipeline_negbin.feature_names,
+     features=pipeline_negbin.required_features,
  )
  validation_df_negbin = cross_validator_negbin.generate_validation_df(df=df)

{spforge-0.8.5 → spforge-0.8.8}/examples/nba/feature_engineering_example.py
@@ -13,7 +13,7 @@ Key concepts covered:
  - State management: fit_transform vs future_transform
  """

- import pandas as pd
+ import polars as pl

  from examples import get_sub_sample_nba_data
  from spforge import FeatureGeneratorPipeline
@@ -22,7 +22,7 @@ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
  from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures

  # Load sample NBA data
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)

  # Define column mappings for your dataset
  # This tells spforge which columns contain team IDs, player IDs, dates, etc.
@@ -35,7 +35,7 @@ column_names = ColumnNames(

  # CRITICAL: Always sort data chronologically before generating features
  # This ensures temporal ordering and prevents future leakage (using future data to predict the past)
- df = df.sort_values(
+ df = df.sort(
      [
          column_names.start_date,  # First by date
          column_names.match_id,  # Then by match
@@ -46,13 +46,21 @@ df = df.sort_values(

  # Keep only games with exactly 2 teams (filter out invalid data)
  df = (
-     df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
-     .loc[lambda x: x.team_count == 2]
-     .drop(columns=["team_count"])
+     df.with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count")
+     )
+     .filter(pl.col("team_count") == 2)
+     .drop("team_count")
  )

- print(f"Dataset: {len(df)} rows, {df[column_names.match_id].nunique()} games")
- print(f"Date range: {df[column_names.start_date].min()} to {df[column_names.start_date].max()}")
+ match_count = df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ start_date = df.select(pl.col(column_names.start_date).min()).to_series().item()
+ end_date = df.select(pl.col(column_names.start_date).max()).to_series().item()
+ print(f"Dataset: {len(df)} rows, {match_count} games")
+ print(f"Date range: {start_date} to {end_date}")
  print()

  # ====================================================================
@@ -125,12 +133,22 @@ print()
  # ====================================================================

  # Split data into historical (for training) and future (for prediction)
- most_recent_5_games = df[column_names.match_id].unique()[-5:]
- historical_df = df[~df[column_names.match_id].isin(most_recent_5_games)].copy()
- future_df = df[df[column_names.match_id].isin(most_recent_5_games)].copy()
+ most_recent_5_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(5)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_5_games))
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_5_games))

- print(f"Historical data: {len(historical_df)} rows, {historical_df[column_names.match_id].nunique()} games")
- print(f"Future data: {len(future_df)} rows, {future_df[column_names.match_id].nunique()} games")
+ historical_games = (
+     historical_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ )
+ future_games = future_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ print(f"Historical data: {len(historical_df)} rows, {historical_games} games")
+ print(f"Future data: {len(future_df)} rows, {future_games} games")
  print()

  # FIT_TRANSFORM: Learn from historical data
@@ -138,7 +156,7 @@ print()
  # - Lags/rolling windows build up from initial games
  # - Internal state (ratings, windows) is MUTATED
  print("Applying fit_transform to historical data...")
- historical_df = features_pipeline.fit_transform(historical_df)
+ historical_df = features_pipeline.fit_transform(historical_df).to_pandas()
  print(f" Generated {len(features_pipeline.features_out)} features:")
  for feature in features_pipeline.features_out:
      print(f" - {feature}")
@@ -149,7 +167,7 @@ print()
  # - Appends current game to lag/rolling windows but doesn't persist the update
  # - This is what you use in production: generate features without affecting your model's state
  print("Applying future_transform to future data (read-only)...")
- future_df_transformed = features_pipeline.future_transform(future_df)
+ future_df_transformed = features_pipeline.future_transform(future_df).to_pandas()
  print(f" Future data now has {len(future_df_transformed.columns)} columns")
  print()

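Most of this rewrite is one recurring pandas-to-polars idiom: `groupby(...).transform("nunique")` becomes the window expression `n_unique().over(...)`. A minimal, self-contained sketch of the two forms on toy data (column names here are illustrative):

```python
import pandas as pd
import polars as pl

# pandas idiom removed in this diff: per-game distinct team count as a column
pdf = pd.DataFrame({"game_id": [1, 1, 2], "team_id": ["a", "b", "a"]})
pdf["team_count"] = pdf.groupby("game_id")["team_id"].transform("nunique")
valid_pd = pdf[pdf["team_count"] == 2].drop(columns=["team_count"])

# polars replacement used throughout the 0.8.8 examples
pldf = pl.DataFrame({"game_id": [1, 1, 2], "team_id": ["a", "b", "a"]})
valid_pl = (
    pldf.with_columns(
        pl.col("team_id").n_unique().over("game_id").alias("team_count")
    )
    .filter(pl.col("team_count") == 2)
    .drop("team_count")
)
print(valid_pd)
print(valid_pl)
```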
{spforge-0.8.5 → spforge-0.8.8}/examples/nba/game_winner_example.py
@@ -1,12 +1,13 @@
- import pandas as pd
+ import polars as pl
  from sklearn.linear_model import LogisticRegression

+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
  from spforge.ratings import RatingKnownFeatures
  from spforge.ratings._player_rating import PlayerRatingGenerator

- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)

  # Defines the column names as they appear in the dataframe
  column_names = ColumnNames(
@@ -16,8 +17,8 @@ column_names = ColumnNames(
      player_id="player_name",
  )
  # Sorts the dataframe. The dataframe must always be sorted as below
- df = df.sort_values(
-     by=[
+ df = df.sort(
+     [
          column_names.start_date,
          column_names.match_id,
          column_names.team_id,
@@ -27,17 +28,26 @@ df = df.sort_values(

  # Drops games with less or more than 2 teams
  df = (
-     df.assign(
-         team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique")
+     df.with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count")
      )
-     .loc[lambda x: x.team_count == 2]
-     .drop(columns=["team_count"])
+     .filter(pl.col("team_count") == 2)
+     .drop("team_count")
  )

  # Pretends the last 10 games are future games. The most will be trained on everything before that.
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
- historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
- future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["won"])
+ most_recent_10_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(10)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("won")

  # Defining a simple rating-generator. It will use the "won" column to update the ratings.
  # In contrast to a typical Elo, ratings will follow players.
@@ -49,7 +59,7 @@ rating_generator = PlayerRatingGenerator(
      column_names=column_names,
      non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
  )
- historical_df = rating_generator.fit_transform(historical_df)
+ historical_df = rating_generator.fit_transform(historical_df).to_pandas()

  # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
  # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
@@ -61,13 +71,13 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],
-     feature_names=rating_generator.features_out + ["location"],
+     estimator_features=rating_generator.features_out + ["location"],
  )

  pipeline.fit(X=historical_df, y=historical_df["won"])

  # Future predictions on future results
- future_df = rating_generator.future_transform(future_df)
+ future_df = rating_generator.future_transform(future_df).to_pandas()
  future_predictions = pipeline.predict_proba(future_df)[:, 1]
  future_df["game_winner_probability"] = future_predictions
  # Grouping predictions from game-player level to game-level.
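The hunk ends at the comment about grouping game-player predictions up to game level; the aggregation itself falls outside the diff context. A hypothetical continuation that averages the player-level probabilities per game-team, consistent with the pipeline's `granularity=["game_id", "team_id"]`:

```python
# future_df is a pandas frame at this point (future_transform(...).to_pandas()).
game_level = future_df.groupby(["game_id", "team_id"], as_index=False)[
    "game_winner_probability"
].mean()
print(game_level.head())
```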
{spforge-0.8.5 → spforge-0.8.8}/examples/nba/predictor_transformers_example.py
@@ -12,7 +12,7 @@ Key concepts covered:
  - Hierarchical modeling: Team strength → Player performance
  """

- import pandas as pd
+ import polars as pl
  from lightgbm import LGBMRegressor
  from sklearn.linear_model import LogisticRegression

@@ -24,7 +24,7 @@ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
  from spforge.transformers import EstimatorTransformer

  # Load sample NBA data
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)

  # Define column mappings
  column_names = ColumnNames(
@@ -35,7 +35,7 @@ column_names = ColumnNames(
  )

  # Sort data chronologically (critical for temporal correctness)
- df = df.sort_values(
+ df = df.sort(
      [
          column_names.start_date,
          column_names.match_id,
@@ -46,18 +46,31 @@ df = df.sort_values(

  # Filter to valid games
  df = (
-     df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
-     .loc[lambda x: x.team_count == 2]
-     .drop(columns=["team_count"])
+     df.with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count")
+     )
+     .filter(pl.col("team_count") == 2)
+     .drop("team_count")
  )

  # Train/test split (using temporal ordering)
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
- train_df = df[~df[column_names.match_id].isin(most_recent_10_games)].copy()
- test_df = df[df[column_names.match_id].isin(most_recent_10_games)].copy()
+ most_recent_10_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(10)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ train_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+ test_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games))

- print(f"Training: {len(train_df)} rows, {train_df[column_names.match_id].nunique()} games")
- print(f"Testing: {len(test_df)} rows, {test_df[column_names.match_id].nunique()} games")
+ train_games = train_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ test_games = test_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ print(f"Training: {len(train_df)} rows, {train_games} games")
+ print(f"Testing: {len(test_df)} rows, {test_games} games")
  print()

  # ====================================================================
@@ -86,8 +99,8 @@ features_pipeline = FeatureGeneratorPipeline(
  )

  # Generate features
- train_df = features_pipeline.fit_transform(train_df)
- test_df = features_pipeline.future_transform(test_df)
+ train_df = features_pipeline.fit_transform(train_df).to_pandas()
+ test_df = features_pipeline.future_transform(test_df).to_pandas()

  print(f"Generated {len(features_pipeline.features_out)} baseline features")
  print()
@@ -121,7 +134,7 @@ player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
      # Features for the final estimator (only pre-game information)
      # Note: points_estimate_raw will be added by the transformer
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
      # The predictor_transformers parameter chains the estimators
      predictor_transformers=[points_estimate_transformer],  # Stage 1 executes first
  )
@@ -150,7 +163,7 @@ print()

  # Fit the pipeline
  # The y target here is for the FINAL estimator (player points)
- # Each predictor_transformer has its own target_column specified
+ # Predictor_transformers are trained on the same target during fit()
  player_points_pipeline.fit(X=train_df, y=train_df["points"])

  print("Training complete!")
@@ -188,7 +201,7 @@ print()

  single_stage_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )

  print("Training single-stage baseline for comparison...")
{spforge-0.8.5 → spforge-0.8.8}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "spforge"
- version = "0.8.5"
+ version = "0.8.8"
  description = "A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data."
  readme = "README.md"
  requires-python = ">=3.11"
{spforge-0.8.5 → spforge-0.8.8}/spforge/features_generator_pipeline.py
@@ -120,7 +120,8 @@ class FeatureGeneratorPipeline(FeatureGenerator):

          for transformer in self.feature_generators:
              pre_row_count = len(df)
-             df = nw.from_native(transformer.fit_transform(df, column_names=column_names))
+             native_df = df.to_native()
+             df = nw.from_native(transformer.fit_transform(native_df, column_names=column_names))
              assert len(df) == pre_row_count
              for f in transformer.features_out:
                  if f in expected_feats_added:
@@ -151,7 +152,8 @@ class FeatureGeneratorPipeline(FeatureGenerator):

          for transformer in self.feature_generators:
              pre_row_count = len(df)
-             df = nw.from_native(transformer.transform(df))
+             native_df = df.to_native()
+             df = nw.from_native(transformer.transform(native_df))
              assert len(df) == pre_row_count
              for f in transformer.features_out:
                  if f in expected_feats_added:
@@ -181,9 +183,11 @@ class FeatureGeneratorPipeline(FeatureGenerator):
          for transformer in self.feature_generators:
              pre_row_count = len(df)
              if hasattr(transformer, "future_transform") and callable(transformer.future_transform):
-                 df = nw.from_native(transformer.future_transform(df))
+                 native_df = df.to_native()
+                 df = nw.from_native(transformer.future_transform(native_df))
              else:
-                 df = nw.from_native(transformer.transform(df))
+                 native_df = df.to_native()
+                 df = nw.from_native(transformer.transform(native_df))
              assert len(df) == pre_row_count
              for f in transformer.features_out:
                  if f in expected_feats_added:
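All four changes in this file are the same pattern: unwrap the narwhals frame to its native backend before calling the transformer, then re-wrap the result. A minimal standalone sketch of that round-trip, assuming narwhals is imported as `nw` (as the calls above suggest) and polars as the native backend:

```python
import narwhals as nw
import polars as pl

df = nw.from_native(pl.DataFrame({"kills": [1, 3, 2]}))

# Unwrap to the native (polars) frame so backend-specific code can run on it...
native_df = df.to_native()
native_df = native_df.with_columns((pl.col("kills") * 2).alias("kills_x2"))

# ...then wrap the result again so downstream code stays backend-agnostic.
df = nw.from_native(native_df)
print(df.columns)
```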