spforge 0.8.5.tar.gz → 0.8.8.tar.gz

This diff shows the changes between two publicly released versions of spforge, as they appear in their public registry. It is provided for informational purposes only.

Files changed (118)
  1. {spforge-0.8.5/spforge.egg-info → spforge-0.8.8}/PKG-INFO +11 -18
  2. {spforge-0.8.5 → spforge-0.8.8}/README.md +10 -17
  3. spforge-0.8.8/examples/lol/pipeline_transformer_example.py +106 -0
  4. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/cross_validation_example.py +4 -11
  5. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/feature_engineering_example.py +33 -15
  6. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/game_winner_example.py +24 -14
  7. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/predictor_transformers_example.py +29 -16
  8. {spforge-0.8.5 → spforge-0.8.8}/pyproject.toml +1 -1
  9. {spforge-0.8.5 → spforge-0.8.8}/spforge/features_generator_pipeline.py +8 -4
  10. {spforge-0.8.5 → spforge-0.8.8}/spforge/hyperparameter_tuning/_default_search_spaces.py +26 -1
  11. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/__init__.py +4 -0
  12. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/_player_rating.py +11 -0
  13. spforge-0.8.8/spforge/ratings/league_start_rating_optimizer.py +201 -0
  14. {spforge-0.8.5 → spforge-0.8.8/spforge.egg-info}/PKG-INFO +11 -18
  15. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/SOURCES.txt +2 -0
  16. spforge-0.8.8/tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
  17. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
  18. {spforge-0.8.5 → spforge-0.8.8}/tests/ratings/test_player_rating_generator.py +27 -0
  19. {spforge-0.8.5 → spforge-0.8.8}/tests/scorer/test_score.py +90 -0
  20. {spforge-0.8.5 → spforge-0.8.8}/tests/test_feature_generator_pipeline.py +43 -0
  21. spforge-0.8.5/examples/lol/pipeline_transformer_example.py +0 -123
  22. {spforge-0.8.5 → spforge-0.8.8}/LICENSE +0 -0
  23. {spforge-0.8.5 → spforge-0.8.8}/MANIFEST.in +0 -0
  24. {spforge-0.8.5 → spforge-0.8.8}/examples/__init__.py +0 -0
  25. {spforge-0.8.5 → spforge-0.8.8}/examples/game_level_example.py +0 -0
  26. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/__init__.py +0 -0
  27. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/data/__init__.py +0 -0
  28. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/data/subsample_lol_data.parquet +0 -0
  29. {spforge-0.8.5 → spforge-0.8.8}/examples/lol/data/utils.py +0 -0
  30. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/__init__.py +0 -0
  31. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/data/__init__.py +0 -0
  32. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/data/game_player_subsample.parquet +0 -0
  33. {spforge-0.8.5 → spforge-0.8.8}/examples/nba/data/utils.py +0 -0
  34. {spforge-0.8.5 → spforge-0.8.8}/setup.cfg +0 -0
  35. {spforge-0.8.5 → spforge-0.8.8}/spforge/__init__.py +0 -0
  36. {spforge-0.8.5 → spforge-0.8.8}/spforge/autopipeline.py +0 -0
  37. {spforge-0.8.5 → spforge-0.8.8}/spforge/base_feature_generator.py +0 -0
  38. {spforge-0.8.5 → spforge-0.8.8}/spforge/cross_validator/__init__.py +0 -0
  39. {spforge-0.8.5 → spforge-0.8.8}/spforge/cross_validator/_base.py +0 -0
  40. {spforge-0.8.5 → spforge-0.8.8}/spforge/cross_validator/cross_validator.py +0 -0
  41. {spforge-0.8.5 → spforge-0.8.8}/spforge/data_structures.py +0 -0
  42. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/__init__.py +0 -0
  43. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/_negative_binomial_estimator.py +0 -0
  44. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/_normal_distribution_predictor.py +0 -0
  45. {spforge-0.8.5 → spforge-0.8.8}/spforge/distributions/_student_t_distribution_estimator.py +0 -0
  46. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/__init__.py +0 -0
  47. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_conditional_estimator.py +0 -0
  48. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_frequency_bucketing_classifier.py +0 -0
  49. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_granularity_estimator.py +0 -0
  50. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_group_by_estimator.py +0 -0
  51. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_ordinal_classifier.py +0 -0
  52. {spforge-0.8.5 → spforge-0.8.8}/spforge/estimator/_sklearn_enhancer_estimator.py +0 -0
  53. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/__init__.py +0 -0
  54. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_base.py +0 -0
  55. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_lag.py +0 -0
  56. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_net_over_predicted.py +0 -0
  57. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_regressor_feature_generator.py +0 -0
  58. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_against_opponent.py +0 -0
  59. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_mean_binary.py +0 -0
  60. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_mean_days.py +0 -0
  61. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_rolling_window.py +0 -0
  62. {spforge-0.8.5 → spforge-0.8.8}/spforge/feature_generator/_utils.py +0 -0
  63. {spforge-0.8.5 → spforge-0.8.8}/spforge/hyperparameter_tuning/__init__.py +0 -0
  64. {spforge-0.8.5 → spforge-0.8.8}/spforge/hyperparameter_tuning/_tuner.py +0 -0
  65. {spforge-0.8.5 → spforge-0.8.8}/spforge/performance_transformers/__init__.py +0 -0
  66. {spforge-0.8.5 → spforge-0.8.8}/spforge/performance_transformers/_performance_manager.py +0 -0
  67. {spforge-0.8.5 → spforge-0.8.8}/spforge/performance_transformers/_performances_transformers.py +0 -0
  68. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/_base.py +0 -0
  69. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/_team_rating.py +0 -0
  70. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/enums.py +0 -0
  71. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/league_identifier.py +0 -0
  72. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/player_performance_predictor.py +0 -0
  73. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/start_rating_generator.py +0 -0
  74. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/team_performance_predictor.py +0 -0
  75. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/team_start_rating_generator.py +0 -0
  76. {spforge-0.8.5 → spforge-0.8.8}/spforge/ratings/utils.py +0 -0
  77. {spforge-0.8.5 → spforge-0.8.8}/spforge/scorer/__init__.py +0 -0
  78. {spforge-0.8.5 → spforge-0.8.8}/spforge/scorer/_score.py +0 -0
  79. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/__init__.py +0 -0
  80. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_base.py +0 -0
  81. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_net_over_predicted.py +0 -0
  82. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_operator.py +0 -0
  83. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_other_transformer.py +0 -0
  84. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_predictor.py +0 -0
  85. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_simple_transformer.py +0 -0
  86. {spforge-0.8.5 → spforge-0.8.8}/spforge/transformers/_team_ratio_predictor.py +0 -0
  87. {spforge-0.8.5 → spforge-0.8.8}/spforge/utils.py +0 -0
  88. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/dependency_links.txt +0 -0
  89. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/requires.txt +0 -0
  90. {spforge-0.8.5 → spforge-0.8.8}/spforge.egg-info/top_level.txt +0 -0
  91. {spforge-0.8.5 → spforge-0.8.8}/tests/cross_validator/test_cross_validator.py +0 -0
  92. {spforge-0.8.5 → spforge-0.8.8}/tests/distributions/test_distribution.py +0 -0
  93. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_estimator_hyperparameter_tuning.py +0 -0
  94. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_lol_player_kills.py +0 -0
  95. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_nba_player_points.py +0 -0
  96. {spforge-0.8.5 → spforge-0.8.8}/tests/end_to_end/test_nba_prediction_consistency.py +0 -0
  97. {spforge-0.8.5 → spforge-0.8.8}/tests/estimator/test_sklearn_estimator.py +0 -0
  98. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_lag.py +0 -0
  99. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_regressor_feature_generator.py +0 -0
  100. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_against_opponent.py +0 -0
  101. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_mean_binary.py +0 -0
  102. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_mean_days.py +0 -0
  103. {spforge-0.8.5 → spforge-0.8.8}/tests/feature_generator/test_rolling_window.py +0 -0
  104. {spforge-0.8.5 → spforge-0.8.8}/tests/hyperparameter_tuning/test_estimator_tuner.py +0 -0
  105. {spforge-0.8.5 → spforge-0.8.8}/tests/hyperparameter_tuning/test_rating_tuner.py +0 -0
  106. {spforge-0.8.5 → spforge-0.8.8}/tests/performance_transformers/test_performance_manager.py +0 -0
  107. {spforge-0.8.5 → spforge-0.8.8}/tests/performance_transformers/test_performances_transformers.py +0 -0
  108. {spforge-0.8.5 → spforge-0.8.8}/tests/ratings/test_ratings_property.py +0 -0
  109. {spforge-0.8.5 → spforge-0.8.8}/tests/ratings/test_team_rating_generator.py +0 -0
  110. {spforge-0.8.5 → spforge-0.8.8}/tests/scorer/test_score_aggregation_granularity.py +0 -0
  111. {spforge-0.8.5 → spforge-0.8.8}/tests/test_autopipeline.py +0 -0
  112. {spforge-0.8.5 → spforge-0.8.8}/tests/test_autopipeline_context.py +0 -0
  113. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_estimator_transformer_context.py +0 -0
  114. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_net_over_predicted.py +0 -0
  115. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_other_transformer.py +0 -0
  116. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_predictor_transformer.py +0 -0
  117. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_simple_transformer.py +0 -0
  118. {spforge-0.8.5 → spforge-0.8.8}/tests/transformers/test_team_ratio_predictor.py +0 -0
{spforge-0.8.5/spforge.egg-info → spforge-0.8.8}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: spforge
- Version: 0.8.5
+ Version: 0.8.8
  Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
  Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
  License: See LICENSE file
@@ -85,12 +85,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
  import pandas as pd
  from sklearn.linear_model import LogisticRegression

+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
- from spforge.ratings import RatingKnownFeatures
- from spforge.ratings._player_rating import PlayerRatingGenerator
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures

- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)

  # Step 1: Define column mappings for your dataset
  column_names = ColumnNames(
@@ -144,7 +144,7 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],  # Aggregate players → teams
-     feature_names=rating_generator.features_out + ["location"],  # Rating + home/away
+     estimator_features=rating_generator.features_out + ["location"],  # Rating + home/away
  )

  # Train on historical data
@@ -302,8 +302,8 @@ cross_validator = MatchKFoldCrossValidator(
      prediction_column_name="points_pred",
      target_column="points",
      n_splits=3,  # Number of temporal folds
-     # Must include both feature_names AND context_feature_names
-     features=pipeline.feature_names + pipeline.context_feature_names,
+     # Must include both estimator features and context features
+     features=pipeline.required_features,
  )

  # Generate validation predictions
@@ -330,7 +330,7 @@ print(f"Validation MAE: {mae:.2f}")
  - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
  - Use `validation_column` in scorer to score only validation rows
  - Training data always comes BEFORE validation data chronologically
- - Must pass both `feature_names` + `context_feature_names` to `features` parameter
+ - Must pass all required features (use `pipeline.required_features`)
  - Scorers can filter rows (e.g., only score players who played minutes > 0)

  See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -371,7 +371,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
  # Approach 1: LGBMClassifier (direct probability prediction)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )

  # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -385,13 +385,7 @@ distribution_estimator = NegativeBinomialEstimator(

  pipeline_negbin = AutoPipeline(
      estimator=distribution_estimator,
-     feature_names=features_pipeline.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_pipeline.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -439,7 +433,7 @@ points_estimate_transformer = EstimatorTransformer(
  # Stage 2: Refine estimate using Stage 1 output
  player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,  # Original features
+     estimator_features=features_pipeline.features_out,  # Original features
      # predictor_transformers execute first, adding their predictions
      predictor_transformers=[points_estimate_transformer],
  )
@@ -474,4 +468,3 @@ For complete, runnable examples with detailed explanations:
  - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
  - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
  - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
{spforge-0.8.5 → spforge-0.8.8}/README.md
@@ -57,12 +57,12 @@ This example demonstrates predicting NBA game winners using player-level ratings
  import pandas as pd
  from sklearn.linear_model import LogisticRegression

+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
- from spforge.ratings import RatingKnownFeatures
- from spforge.ratings._player_rating import PlayerRatingGenerator
+ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures

- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)

  # Step 1: Define column mappings for your dataset
  column_names = ColumnNames(
@@ -116,7 +116,7 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],  # Aggregate players → teams
-     feature_names=rating_generator.features_out + ["location"],  # Rating + home/away
+     estimator_features=rating_generator.features_out + ["location"],  # Rating + home/away
  )

  # Train on historical data
@@ -274,8 +274,8 @@ cross_validator = MatchKFoldCrossValidator(
      prediction_column_name="points_pred",
      target_column="points",
      n_splits=3,  # Number of temporal folds
-     # Must include both feature_names AND context_feature_names
-     features=pipeline.feature_names + pipeline.context_feature_names,
+     # Must include both estimator features and context features
+     features=pipeline.required_features,
  )

  # Generate validation predictions
@@ -302,7 +302,7 @@ print(f"Validation MAE: {mae:.2f}")
  - `is_validation=1` marks validation rows, `is_validation=0` marks training rows
  - Use `validation_column` in scorer to score only validation rows
  - Training data always comes BEFORE validation data chronologically
- - Must pass both `feature_names` + `context_feature_names` to `features` parameter
+ - Must pass all required features (use `pipeline.required_features`)
  - Scorers can filter rows (e.g., only score players who played minutes > 0)

  See [examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py) for a complete example.
@@ -343,7 +343,7 @@ from lightgbm import LGBMClassifier, LGBMRegressor
  # Approach 1: LGBMClassifier (direct probability prediction)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )

  # Approach 2: LGBMRegressor + NegativeBinomialEstimator
@@ -357,13 +357,7 @@ distribution_estimator = NegativeBinomialEstimator(

  pipeline_negbin = AutoPipeline(
      estimator=distribution_estimator,
-     feature_names=features_pipeline.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_pipeline.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -411,7 +405,7 @@ points_estimate_transformer = EstimatorTransformer(
  # Stage 2: Refine estimate using Stage 1 output
  player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,  # Original features
+     estimator_features=features_pipeline.features_out,  # Original features
      # predictor_transformers execute first, adding their predictions
      predictor_transformers=[points_estimate_transformer],
  )
@@ -446,4 +440,3 @@ For complete, runnable examples with detailed explanations:
  - **[examples/nba/cross_validation_example.py](examples/nba/cross_validation_example.py)** - Time-series CV, distributions, and scoring
  - **[examples/nba/predictor_transformers_example.py](examples/nba/predictor_transformers_example.py)** - Multi-stage hierarchical modeling
  - **[examples/nba/game_winner_example.py](examples/nba/game_winner_example.py)** - Basic workflow for game winner prediction
-
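The change repeated across the PKG-INFO and README hunks above is an AutoPipeline API rename: `feature_names` becomes `estimator_features`, the explicit `context_feature_names` list disappears, and `feature_names + context_feature_names` is replaced by the single `required_features` property. A minimal before/after sketch of the migration, using only names that appear in the hunks (`rating_generator` is assumed to be the fitted generator from the README example):

```python
from sklearn.linear_model import LogisticRegression

from spforge.autopipeline import AutoPipeline

# 0.8.5 (removed): feature_names plus an explicit context_feature_names list
# pipeline = AutoPipeline(
#     estimator=LogisticRegression(),
#     granularity=["game_id", "team_id"],
#     feature_names=rating_generator.features_out + ["location"],
# )
# cv_features = pipeline.feature_names + pipeline.context_feature_names

# 0.8.8 (added): one estimator_features argument; required_features bundles
# the estimator features and context features the cross-validator needs
pipeline = AutoPipeline(
    estimator=LogisticRegression(),
    granularity=["game_id", "team_id"],
    estimator_features=rating_generator.features_out + ["location"],
)
cv_features = pipeline.required_features
```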
spforge-0.8.8/examples/lol/pipeline_transformer_example.py (new file)
@@ -0,0 +1,106 @@
+ import polars as pl
+ from lightgbm import LGBMRegressor
+
+ from examples import get_sub_sample_lol_data
+ from spforge import AutoPipeline, ColumnNames, FeatureGeneratorPipeline
+ from spforge.distributions import NegativeBinomialEstimator
+ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
+ from spforge.transformers import EstimatorTransformer
+
+ column_names = ColumnNames(
+     team_id="teamname",
+     match_id="gameid",
+     start_date="date",
+     player_id="player_uid",
+     league="league",
+     position="position",
+ )
+
+ df = get_sub_sample_lol_data(as_pandas=False, as_polars=True)
+ df = (
+     df.with_columns(
+         pl.concat_str([pl.col("playername"), pl.col("teamname")], separator="__").alias(
+             column_names.player_id
+         )
+     )
+     .filter(pl.col(column_names.position) != "team")
+     .with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count"),
+         pl.col(column_names.player_id)
+         .n_unique()
+         .over([column_names.match_id, column_names.team_id])
+         .alias("player_count"),
+     )
+     .filter((pl.col("team_count") == 2) & (pl.col("player_count") == 5))
+     .drop(["team_count", "player_count"])
+     .unique(subset=[column_names.match_id, column_names.player_id, column_names.team_id])
+     .sort(
+         [
+             column_names.start_date,
+             column_names.match_id,
+             column_names.team_id,
+             column_names.player_id,
+         ]
+     )
+ )
+
+ most_recent_10_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(10)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("kills")
+
+ lag_transformers = [
+     LagTransformer(features=["kills", "deaths"], lag_length=3, granularity=["player_uid"]),
+     RollingWindowTransformer(
+         features=["kills", "deaths"],
+         window=20,
+         min_periods=1,
+         granularity=["player_uid"],
+     ),
+ ]
+
+ features_generator = FeatureGeneratorPipeline(
+     column_names=column_names,
+     feature_generators=lag_transformers,
+ )
+
+ historical_df = features_generator.fit_transform(historical_df).to_pandas()
+ future_df = features_generator.future_transform(future_df).to_pandas()
+
+ point_estimate_transformer = EstimatorTransformer(
+     prediction_column_name="kills_estimate",
+     estimator=LGBMRegressor(verbose=-100, random_state=42),
+     features=features_generator.features_out,
+ )
+
+ probability_estimator = NegativeBinomialEstimator(
+     max_value=15,
+     point_estimate_pred_column="kills_estimate",
+     r_specific_granularity=[column_names.player_id],
+     predicted_r_weight=1,
+     column_names=column_names,
+ )
+
+ pipeline = AutoPipeline(
+     estimator=probability_estimator,
+     estimator_features=features_generator.features_out,
+     predictor_transformers=[point_estimate_transformer],
+ )
+
+ pipeline.fit(X=historical_df, y=historical_df["kills"])
+
+ future_point_estimates = pipeline.predict(future_df)
+ future_probabilities = pipeline.predict_proba(future_df)
+ future_df["kills_pred"] = future_point_estimates
+
+ print(future_df.head(5))
+ print(f"Probability matrix shape: {future_probabilities.shape}")
+ print(f"First row probabilities (0-15 kills): {future_probabilities[0]}")
{spforge-0.8.5 → spforge-0.8.8}/examples/nba/cross_validation_example.py
@@ -51,7 +51,7 @@ print("\nApproach 1: LGBMClassifier (direct probability prediction)")
  print("-" * 70)
  pipeline_classifier = AutoPipeline(
      estimator=LGBMClassifier(verbose=-100, random_state=42),
-     feature_names=features_generator.features_out,
+     estimator_features=features_generator.features_out,
  )

  cross_validator_classifier = MatchKFoldCrossValidator(
@@ -60,7 +60,7 @@ cross_validator_classifier = MatchKFoldCrossValidator(
      estimator=pipeline_classifier,
      prediction_column_name="points_probabilities_classifier",
      target_column="points",
-     features=pipeline_classifier.feature_names,
+     features=pipeline_classifier.required_features,
  )
  validation_df_classifier = cross_validator_classifier.generate_validation_df(df=df)

@@ -80,20 +80,13 @@ print("-" * 70)
  predictor_negbin = NegativeBinomialEstimator(
      max_value=40,
      point_estimate_pred_column="points_estimate",
-     r_specific_granularity=["player_id"],
      predicted_r_weight=1,
      column_names=column_names,
  )

  pipeline_negbin = AutoPipeline(
      estimator=predictor_negbin,
-     feature_names=features_generator.features_out,
-     context_feature_names=[
-         column_names.player_id,
-         column_names.start_date,
-         column_names.team_id,
-         column_names.match_id,
-     ],
+     estimator_features=features_generator.features_out,
      predictor_transformers=[
          EstimatorTransformer(
              prediction_column_name="points_estimate",
@@ -109,7 +102,7 @@ cross_validator_negbin = MatchKFoldCrossValidator(
      estimator=pipeline_negbin,
      prediction_column_name="points_probabilities_negbin",
      target_column="points",
-     features=pipeline_negbin.context_feature_names + pipeline_negbin.feature_names,
+     features=pipeline_negbin.required_features,
  )
  validation_df_negbin = cross_validator_negbin.generate_validation_df(df=df)

{spforge-0.8.5 → spforge-0.8.8}/examples/nba/feature_engineering_example.py
@@ -13,7 +13,7 @@ Key concepts covered:
  - State management: fit_transform vs future_transform
  """

- import pandas as pd
+ import polars as pl

  from examples import get_sub_sample_nba_data
  from spforge import FeatureGeneratorPipeline
@@ -22,7 +22,7 @@ from spforge.feature_generator import LagTransformer, RollingWindowTransformer
  from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures

  # Load sample NBA data
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)

  # Define column mappings for your dataset
  # This tells spforge which columns contain team IDs, player IDs, dates, etc.
@@ -35,7 +35,7 @@ column_names = ColumnNames(

  # CRITICAL: Always sort data chronologically before generating features
  # This ensures temporal ordering and prevents future leakage (using future data to predict the past)
- df = df.sort_values(
+ df = df.sort(
      [
          column_names.start_date,  # First by date
          column_names.match_id,  # Then by match
@@ -46,13 +46,21 @@ df = df.sort_values(

  # Keep only games with exactly 2 teams (filter out invalid data)
  df = (
-     df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
-     .loc[lambda x: x.team_count == 2]
-     .drop(columns=["team_count"])
+     df.with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count")
+     )
+     .filter(pl.col("team_count") == 2)
+     .drop("team_count")
  )

- print(f"Dataset: {len(df)} rows, {df[column_names.match_id].nunique()} games")
- print(f"Date range: {df[column_names.start_date].min()} to {df[column_names.start_date].max()}")
+ match_count = df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ start_date = df.select(pl.col(column_names.start_date).min()).to_series().item()
+ end_date = df.select(pl.col(column_names.start_date).max()).to_series().item()
+ print(f"Dataset: {len(df)} rows, {match_count} games")
+ print(f"Date range: {start_date} to {end_date}")
  print()

  # ====================================================================
@@ -125,12 +133,22 @@ print()
  # ====================================================================

  # Split data into historical (for training) and future (for prediction)
- most_recent_5_games = df[column_names.match_id].unique()[-5:]
- historical_df = df[~df[column_names.match_id].isin(most_recent_5_games)].copy()
- future_df = df[df[column_names.match_id].isin(most_recent_5_games)].copy()
+ most_recent_5_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(5)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_5_games))
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_5_games))

- print(f"Historical data: {len(historical_df)} rows, {historical_df[column_names.match_id].nunique()} games")
- print(f"Future data: {len(future_df)} rows, {future_df[column_names.match_id].nunique()} games")
+ historical_games = (
+     historical_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ )
+ future_games = future_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ print(f"Historical data: {len(historical_df)} rows, {historical_games} games")
+ print(f"Future data: {len(future_df)} rows, {future_games} games")
  print()

  # FIT_TRANSFORM: Learn from historical data
@@ -138,7 +156,7 @@ print()
  # - Lags/rolling windows build up from initial games
  # - Internal state (ratings, windows) is MUTATED
  print("Applying fit_transform to historical data...")
- historical_df = features_pipeline.fit_transform(historical_df)
+ historical_df = features_pipeline.fit_transform(historical_df).to_pandas()
  print(f" Generated {len(features_pipeline.features_out)} features:")
  for feature in features_pipeline.features_out:
      print(f" - {feature}")
@@ -149,7 +167,7 @@ print()
  # - Appends current game to lag/rolling windows but doesn't persist the update
  # - This is what you use in production: generate features without affecting your model's state
  print("Applying future_transform to future data (read-only)...")
- future_df_transformed = features_pipeline.future_transform(future_df)
+ future_df_transformed = features_pipeline.future_transform(future_df).to_pandas()
  print(f" Future data now has {len(future_df_transformed.columns)} columns")
  print()

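Most of this rewrite is one recurring pandas-to-polars idiom: `groupby(...).transform("nunique")` becomes the window expression `n_unique().over(...)`. A minimal, self-contained sketch of the two forms on toy data (column names here are illustrative):

```python
import pandas as pd
import polars as pl

# pandas idiom removed in this diff: per-game distinct team count as a column
pdf = pd.DataFrame({"game_id": [1, 1, 2], "team_id": ["a", "b", "a"]})
pdf["team_count"] = pdf.groupby("game_id")["team_id"].transform("nunique")
valid_pd = pdf[pdf["team_count"] == 2].drop(columns=["team_count"])

# polars replacement used throughout the 0.8.8 examples
pldf = pl.DataFrame({"game_id": [1, 1, 2], "team_id": ["a", "b", "a"]})
valid_pl = (
    pldf.with_columns(
        pl.col("team_id").n_unique().over("game_id").alias("team_count")
    )
    .filter(pl.col("team_count") == 2)
    .drop("team_count")
)
print(valid_pd)
print(valid_pl)
```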
{spforge-0.8.5 → spforge-0.8.8}/examples/nba/game_winner_example.py
@@ -1,12 +1,13 @@
- import pandas as pd
+ import polars as pl
  from sklearn.linear_model import LogisticRegression

+ from examples import get_sub_sample_nba_data
  from spforge.autopipeline import AutoPipeline
  from spforge.data_structures import ColumnNames
  from spforge.ratings import RatingKnownFeatures
  from spforge.ratings._player_rating import PlayerRatingGenerator

- df = pd.read_parquet("data/game_player_subsample.parquet")
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)

  # Defines the column names as they appear in the dataframe
  column_names = ColumnNames(
@@ -16,8 +17,8 @@ column_names = ColumnNames(
      player_id="player_name",
  )
  # Sorts the dataframe. The dataframe must always be sorted as below
- df = df.sort_values(
-     by=[
+ df = df.sort(
+     [
          column_names.start_date,
          column_names.match_id,
          column_names.team_id,
@@ -27,17 +28,26 @@ df = df.sort_values(

  # Drops games with less or more than 2 teams
  df = (
-     df.assign(
-         team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique")
+     df.with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count")
      )
-     .loc[lambda x: x.team_count == 2]
-     .drop(columns=["team_count"])
+     .filter(pl.col("team_count") == 2)
+     .drop("team_count")
  )

  # Pretends the last 10 games are future games. The most will be trained on everything before that.
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
- historical_df = df[~df[column_names.match_id].isin(most_recent_10_games)]
- future_df = df[df[column_names.match_id].isin(most_recent_10_games)].drop(columns=["won"])
+ most_recent_10_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(10)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ historical_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+ future_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games)).drop("won")

  # Defining a simple rating-generator. It will use the "won" column to update the ratings.
  # In contrast to a typical Elo, ratings will follow players.
@@ -49,7 +59,7 @@ rating_generator = PlayerRatingGenerator(
      column_names=column_names,
      non_predictor_features_out=[RatingKnownFeatures.PLAYER_RATING],
  )
- historical_df = rating_generator.fit_transform(historical_df)
+ historical_df = rating_generator.fit_transform(historical_df).to_pandas()

  # Defines the predictor. A machine-learning model will be used to predict game winner on a game-team-level.
  # Mean team-ratings will be calculated (from player-level) and rating-difference between the 2 teams calculated.
@@ -61,13 +71,13 @@ historical_df = rating_generator.fit_transform(historical_df)
  pipeline = AutoPipeline(
      estimator=LogisticRegression(),
      granularity=["game_id", "team_id"],
-     feature_names=rating_generator.features_out + ["location"],
+     estimator_features=rating_generator.features_out + ["location"],
  )

  pipeline.fit(X=historical_df, y=historical_df["won"])

  # Future predictions on future results
- future_df = rating_generator.future_transform(future_df)
+ future_df = rating_generator.future_transform(future_df).to_pandas()
  future_predictions = pipeline.predict_proba(future_df)[:, 1]
  future_df["game_winner_probability"] = future_predictions
  # Grouping predictions from game-player level to game-level.
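The hunk ends at the comment about grouping game-player predictions up to game level; the aggregation itself falls outside the diff context. A hypothetical continuation that averages the player-level probabilities per game-team, consistent with the pipeline's `granularity=["game_id", "team_id"]`:

```python
# future_df is a pandas frame at this point (future_transform(...).to_pandas()).
game_level = future_df.groupby(["game_id", "team_id"], as_index=False)[
    "game_winner_probability"
].mean()
print(game_level.head())
```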
{spforge-0.8.5 → spforge-0.8.8}/examples/nba/predictor_transformers_example.py
@@ -12,7 +12,7 @@ Key concepts covered:
  - Hierarchical modeling: Team strength → Player performance
  """

- import pandas as pd
+ import polars as pl
  from lightgbm import LGBMRegressor
  from sklearn.linear_model import LogisticRegression

@@ -24,7 +24,7 @@ from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
  from spforge.transformers import EstimatorTransformer

  # Load sample NBA data
- df = get_sub_sample_nba_data(as_pandas=True, as_polars=False)
+ df = get_sub_sample_nba_data(as_pandas=False, as_polars=True)

  # Define column mappings
  column_names = ColumnNames(
@@ -35,7 +35,7 @@ column_names = ColumnNames(
  )

  # Sort data chronologically (critical for temporal correctness)
- df = df.sort_values(
+ df = df.sort(
      [
          column_names.start_date,
          column_names.match_id,
@@ -46,18 +46,31 @@ df = df.sort_values(

  # Filter to valid games
  df = (
-     df.assign(team_count=df.groupby(column_names.match_id)[column_names.team_id].transform("nunique"))
-     .loc[lambda x: x.team_count == 2]
-     .drop(columns=["team_count"])
+     df.with_columns(
+         pl.col(column_names.team_id)
+         .n_unique()
+         .over(column_names.match_id)
+         .alias("team_count")
+     )
+     .filter(pl.col("team_count") == 2)
+     .drop("team_count")
  )

  # Train/test split (using temporal ordering)
- most_recent_10_games = df[column_names.match_id].unique()[-10:]
- train_df = df[~df[column_names.match_id].isin(most_recent_10_games)].copy()
- test_df = df[df[column_names.match_id].isin(most_recent_10_games)].copy()
+ most_recent_10_games = (
+     df.select(pl.col(column_names.match_id))
+     .unique(maintain_order=True)
+     .tail(10)
+     .get_column(column_names.match_id)
+     .to_list()
+ )
+ train_df = df.filter(~pl.col(column_names.match_id).is_in(most_recent_10_games))
+ test_df = df.filter(pl.col(column_names.match_id).is_in(most_recent_10_games))

- print(f"Training: {len(train_df)} rows, {train_df[column_names.match_id].nunique()} games")
- print(f"Testing: {len(test_df)} rows, {test_df[column_names.match_id].nunique()} games")
+ train_games = train_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ test_games = test_df.select(pl.col(column_names.match_id).n_unique()).to_series().item()
+ print(f"Training: {len(train_df)} rows, {train_games} games")
+ print(f"Testing: {len(test_df)} rows, {test_games} games")
  print()

  # ====================================================================
@@ -86,8 +99,8 @@ features_pipeline = FeatureGeneratorPipeline(
  )

  # Generate features
- train_df = features_pipeline.fit_transform(train_df)
- test_df = features_pipeline.future_transform(test_df)
+ train_df = features_pipeline.fit_transform(train_df).to_pandas()
+ test_df = features_pipeline.future_transform(test_df).to_pandas()

  print(f"Generated {len(features_pipeline.features_out)} baseline features")
  print()
@@ -121,7 +134,7 @@ player_points_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
      # Features for the final estimator (only pre-game information)
      # Note: points_estimate_raw will be added by the transformer
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
      # The predictor_transformers parameter chains the estimators
      predictor_transformers=[points_estimate_transformer],  # Stage 1 executes first
  )
@@ -150,7 +163,7 @@ print()

  # Fit the pipeline
  # The y target here is for the FINAL estimator (player points)
- # Each predictor_transformer has its own target_column specified
+ # Predictor_transformers are trained on the same target during fit()
  player_points_pipeline.fit(X=train_df, y=train_df["points"])

  print("Training complete!")
@@ -188,7 +201,7 @@ print()

  single_stage_pipeline = AutoPipeline(
      estimator=LGBMRegressor(verbose=-100, n_estimators=50),
-     feature_names=features_pipeline.features_out,
+     estimator_features=features_pipeline.features_out,
  )

  print("Training single-stage baseline for comparison...")
{spforge-0.8.5 → spforge-0.8.8}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "spforge"
- version = "0.8.5"
+ version = "0.8.8"
  description = "A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data."
  readme = "README.md"
  requires-python = ">=3.11"
{spforge-0.8.5 → spforge-0.8.8}/spforge/features_generator_pipeline.py
@@ -120,7 +120,8 @@ class FeatureGeneratorPipeline(FeatureGenerator):

          for transformer in self.feature_generators:
              pre_row_count = len(df)
-             df = nw.from_native(transformer.fit_transform(df, column_names=column_names))
+             native_df = df.to_native()
+             df = nw.from_native(transformer.fit_transform(native_df, column_names=column_names))
              assert len(df) == pre_row_count
              for f in transformer.features_out:
                  if f in expected_feats_added:
@@ -151,7 +152,8 @@ class FeatureGeneratorPipeline(FeatureGenerator):

          for transformer in self.feature_generators:
              pre_row_count = len(df)
-             df = nw.from_native(transformer.transform(df))
+             native_df = df.to_native()
+             df = nw.from_native(transformer.transform(native_df))
              assert len(df) == pre_row_count
              for f in transformer.features_out:
                  if f in expected_feats_added:
@@ -181,9 +183,11 @@ class FeatureGeneratorPipeline(FeatureGenerator):
          for transformer in self.feature_generators:
              pre_row_count = len(df)
              if hasattr(transformer, "future_transform") and callable(transformer.future_transform):
-                 df = nw.from_native(transformer.future_transform(df))
+                 native_df = df.to_native()
+                 df = nw.from_native(transformer.future_transform(native_df))
              else:
-                 df = nw.from_native(transformer.transform(df))
+                 native_df = df.to_native()
+                 df = nw.from_native(transformer.transform(native_df))
              assert len(df) == pre_row_count
              for f in transformer.features_out:
                  if f in expected_feats_added:
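All four changes in this file are the same pattern: unwrap the narwhals frame to its native backend before calling the transformer, then re-wrap the result. A minimal standalone sketch of that round-trip, assuming narwhals is imported as `nw` (as the calls above suggest) and polars as the native backend:

```python
import narwhals as nw
import polars as pl

df = nw.from_native(pl.DataFrame({"kills": [1, 3, 2]}))

# Unwrap to the native (polars) frame so backend-specific code can run on it...
native_df = df.to_native()
native_df = native_df.with_columns((pl.col("kills") * 2).alias("kills_x2"))

# ...then wrap the result again so downstream code stays backend-agnostic.
df = nw.from_native(native_df)
print(df.columns)
```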