PyPI - spforge - Versions diffs - 0.8.10__tar.gz → 0.8.13__tar.gz - Mend

spforge 0.8.10.tar.gz → 0.8.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of spforge might be problematic. More details are linked from the original diff page.

Files changed (119)

{spforge-0.8.10/spforge.egg-info → spforge-0.8.13}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.10
+Version: 0.8.13
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file

{spforge-0.8.10 → spforge-0.8.13}/pyproject.toml RENAMED

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "spforge"
-version = "0.8.10"
+version = "0.8.13"
 description = "A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data."
 readme = "README.md"
 requires-python = ">=3.11"

{spforge-0.8.10 → spforge-0.8.13}/spforge/autopipeline.py RENAMED

@@ -195,6 +195,40 @@ def lgbm_in_root(root) -> bool:
     return any(_is_lightgbm_estimator(obj) for obj in _walk_objects(root))
+def _get_importance_estimator(estimator) -> tuple[Any, str] | None:
+    """Recursively find innermost estimator with feature_importances_ or coef_."""
+    if hasattr(estimator, "feature_importances_"):
+        inner = _get_importance_estimator_inner(estimator)
+        if inner is not None:
+            return inner
+        return (estimator, "feature_importances_")
+    if hasattr(estimator, "coef_"):
+        inner = _get_importance_estimator_inner(estimator)
+        if inner is not None:
+            return inner
+        return (estimator, "coef_")
+    return _get_importance_estimator_inner(estimator)
+def _get_importance_estimator_inner(estimator) -> tuple[Any, str] | None:
+    """Check wrapped estimators for importance attributes."""
+    # Check estimator_ (sklearn fitted wrapper convention)
+    if hasattr(estimator, "estimator_") and estimator.estimator_ is not None:
+        result = _get_importance_estimator(estimator.estimator_)
+        if result is not None:
+            return result
+    # Check _est (GroupByEstimator convention)
+    if hasattr(estimator, "_est") and estimator._est is not None:
+        result = _get_importance_estimator(estimator._est)
+        if result is not None:
+            return result
+    return None
 class AutoPipeline(BaseEstimator):
     def __init__(
         self,
@@ -627,3 +661,61 @@ class AutoPipeline(BaseEstimator):
                 all_features.append(ctx)
         return all_features
+    def _get_estimator_feature_names(self) -> list[str]:
+        """Get feature names as seen by the final estimator after all transformations."""
+        pre_out = list(self.sklearn_pipeline.named_steps["pre"].get_feature_names_out())
+        # Remove context columns dropped by "final" step
+        final_step = self.sklearn_pipeline.named_steps["final"]
+        drop_cols = final_step.kw_args.get("drop_cols", set()) if final_step.kw_args else set()
+        features = [f for f in pre_out if f not in drop_cols]
+        # Remove granularity columns (dropped by GroupByEstimator)
+        granularity_set = set(self.granularity)
+        features = [f for f in features if f not in granularity_set]
+        # Remove context features (used by wrapper estimators, not inner model)
+        context_set = set(self.context_feature_names)
+        features = [f for f in features if f not in context_set]
+        return features
+    @property
+    def feature_importances_(self) -> pd.DataFrame:
+        """Get feature importances from the fitted estimator.
+        Returns a DataFrame with columns ["feature", "importance"] sorted by
+        absolute importance descending. Works with tree-based models
+        (feature_importances_) and linear models (coef_).
+        """
+        if self.sklearn_pipeline is None:
+            raise RuntimeError("Pipeline not fitted. Call fit() first.")
+        est = self.sklearn_pipeline.named_steps["est"]
+        result = _get_importance_estimator(est)
+        if result is None:
+            raise RuntimeError(
+                "Estimator does not support feature importances. "
+                "Requires feature_importances_ or coef_ attribute."
+            )
+        inner_est, attr_name = result
+        raw = getattr(inner_est, attr_name)
+        if attr_name == "coef_":
+            # Linear models: use absolute value of coefficients
+            if raw.ndim == 2:
+                # Multi-class: average absolute values across classes
+                importances = np.abs(raw).mean(axis=0)
+            else:
+                importances = np.abs(raw)
+        else:
+            importances = raw
+        feature_names = self._get_estimator_feature_names()
+        df = pd.DataFrame({"feature": feature_names, "importance": importances})
+        df = df.sort_values("importance", ascending=False, key=abs).reset_index(drop=True)
+        return df

{spforge-0.8.10 → spforge-0.8.13}/spforge/ratings/_player_rating.py RENAMED

@@ -34,6 +34,8 @@ from spforge.ratings.utils import (
 from spforge.feature_generator._utils import to_polars
 PLAYER_STATS = "__PLAYER_STATS"
+_SCALED_PW = "__scaled_participation_weight__"
+_SCALED_PPW = "__scaled_projected_participation_weight__"
 class PlayerRatingGenerator(RatingGenerator):
@@ -273,6 +275,7 @@ class PlayerRatingGenerator(RatingGenerator):
             self._projected_participation_weight_max = self._participation_weight_max
     def _scale_participation_weight_columns(self, df: pl.DataFrame) -> pl.DataFrame:
+        """Create internal scaled participation weight columns without mutating originals."""
         if not self.scale_participation_weights:
             return df
         if self._participation_weight_max is None or self._participation_weight_max <= 0:
@@ -287,7 +290,7 @@ class PlayerRatingGenerator(RatingGenerator):
             df = df.with_columns(
                 (pl.col(cn.participation_weight) / denom)
                 .clip(0.0, 1.0)
-                .alias(cn.participation_weight)
+                .alias(_SCALED_PW)
             )
         if (
@@ -300,16 +303,38 @@ class PlayerRatingGenerator(RatingGenerator):
             df = df.with_columns(
                 (pl.col(cn.projected_participation_weight) / denom)
                 .clip(0.0, 1.0)
-                .alias(cn.projected_participation_weight)
+                .alias(_SCALED_PPW)
             )
         return df
+    def _get_participation_weight_col(self) -> str:
+        """Get the column name to use for participation weight (scaled if available)."""
+        cn = self.column_names
+        if self.scale_participation_weights and cn and cn.participation_weight:
+            return _SCALED_PW
+        return cn.participation_weight if cn else ""
+    def _get_projected_participation_weight_col(self) -> str:
+        """Get the column name to use for projected participation weight (scaled if available)."""
+        cn = self.column_names
+        if self.scale_participation_weights and cn and cn.projected_participation_weight:
+            return _SCALED_PPW
+        return cn.projected_participation_weight if cn else ""
+    def _remove_internal_scaled_columns(self, df: pl.DataFrame) -> pl.DataFrame:
+        """Remove internal scaled columns before returning."""
+        cols_to_drop = [c for c in [_SCALED_PW, _SCALED_PPW] if c in df.columns]
+        if cols_to_drop:
+            df = df.drop(cols_to_drop)
+        return df
     def _historical_transform(self, df: pl.DataFrame) -> pl.DataFrame:
         df = self._scale_participation_weight_columns(df)
         match_df = self._create_match_df(df)
         ratings = self._calculate_ratings(match_df)
+        # Keep scaled columns for now - they're needed by _add_rating_features
         cols = [
             c
             for c in df.columns
@@ -329,13 +354,15 @@ class PlayerRatingGenerator(RatingGenerator):
             on=[self.column_names.player_id, self.column_names.match_id, self.column_names.team_id],
         )
-        return self._add_rating_features(df)
+        result = self._add_rating_features(df)
+        return self._remove_internal_scaled_columns(result)
     def _future_transform(self, df: pl.DataFrame) -> pl.DataFrame:
         df = self._scale_participation_weight_columns(df)
         match_df = self._create_match_df(df)
         ratings = self._calculate_future_ratings(match_df)
+        # Keep scaled columns for now - they're needed by _add_rating_features
         cols = [
             c
             for c in df.columns
@@ -360,7 +387,8 @@ class PlayerRatingGenerator(RatingGenerator):
             how="left",
         )
-        return self._add_rating_features(df_with_ratings)
+        result = self._add_rating_features(df_with_ratings)
+        return self._remove_internal_scaled_columns(result)
     def _calculate_ratings(self, match_df: pl.DataFrame) -> pl.DataFrame:
         cn = self.column_names
@@ -796,9 +824,13 @@ class PlayerRatingGenerator(RatingGenerator):
         if cn.participation_weight and cn.participation_weight in df.columns:
             player_stat_cols.append(cn.participation_weight)
+        if _SCALED_PW in df.columns:
+            player_stat_cols.append(_SCALED_PW)
         if cn.projected_participation_weight and cn.projected_participation_weight in df.columns:
             player_stat_cols.append(cn.projected_participation_weight)
+        if _SCALED_PPW in df.columns:
+            player_stat_cols.append(_SCALED_PPW)
         if cn.position and cn.position in df.columns:
             player_stat_cols.append(cn.position)
@@ -854,14 +886,23 @@ class PlayerRatingGenerator(RatingGenerator):
             position = team_player.get(cn.position)
             player_league = team_player.get(cn.league, None)
-            participation_weight = (
-                team_player.get(cn.participation_weight, 1.0) if cn.participation_weight else 1.0
-            )
-            projected_participation_weight = (
-                team_player.get(cn.projected_participation_weight, participation_weight)
-                if cn.projected_participation_weight
-                else participation_weight
-            )
+            # Use scaled participation weight if available, otherwise use original
+            if _SCALED_PW in team_player:
+                participation_weight = team_player.get(_SCALED_PW, 1.0)
+            elif cn.participation_weight:
+                participation_weight = team_player.get(cn.participation_weight, 1.0)
+            else:
+                participation_weight = 1.0
+            # Use scaled projected participation weight if available, otherwise use original
+            if _SCALED_PPW in team_player:
+                projected_participation_weight = team_player.get(_SCALED_PPW, participation_weight)
+            elif cn.projected_participation_weight:
+                projected_participation_weight = team_player.get(
+                    cn.projected_participation_weight, participation_weight
+                )
+            else:
+                projected_participation_weight = participation_weight
             projected_participation_weights.append(projected_participation_weight)
             perf_val = (
@@ -1087,14 +1128,21 @@ class PlayerRatingGenerator(RatingGenerator):
                     position = tp.get(cn.position)
                     league = tp.get(cn.league, None)
-                    pw = (
-                        tp.get(cn.participation_weight, 1.0) if cn.participation_weight else 1.0
-                    )
-                    ppw = (
-                        tp.get(cn.projected_participation_weight, pw)
-                        if cn.projected_participation_weight
-                        else pw
-                    )
+                    # Use scaled participation weight if available, otherwise use original
+                    if _SCALED_PW in tp:
+                        pw = tp.get(_SCALED_PW, 1.0)
+                    elif cn.participation_weight:
+                        pw = tp.get(cn.participation_weight, 1.0)
+                    else:
+                        pw = 1.0
+                    # Use scaled projected participation weight if available, otherwise use original
+                    if _SCALED_PPW in tp:
+                        ppw = tp.get(_SCALED_PPW, pw)
+                    elif cn.projected_participation_weight:
+                        ppw = tp.get(cn.projected_participation_weight, pw)
+                    else:
+                        ppw = pw
                     proj_w.append(float(ppw))
                     mp = MatchPerformance(

{spforge-0.8.10 → spforge-0.8.13}/spforge/ratings/utils.py RENAMED

@@ -2,6 +2,10 @@ import polars as pl
 from spforge.data_structures import ColumnNames
+# Internal column names for scaled participation weights
+_SCALED_PW = "__scaled_participation_weight__"
+_SCALED_PPW = "__scaled_projected_participation_weight__"
 def add_team_rating(
     df: pl.DataFrame,
@@ -46,11 +50,14 @@ def add_team_rating_projected(
     tid = column_names.team_id
     ppw = column_names.projected_participation_weight
-    if ppw:
+    # Use scaled column if available (clipped to [0, 1]), otherwise raw column
+    weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else ppw
+    if weight_col and weight_col in df.columns:
         return df.with_columns(
             (
-                (pl.col(ppw) * pl.col(player_rating_col)).sum().over([mid, tid])
-                / pl.col(ppw).sum().over([mid, tid])
+                (pl.col(weight_col) * pl.col(player_rating_col)).sum().over([mid, tid])
+                / pl.col(weight_col).sum().over([mid, tid])
             ).alias(team_rating_out)
         )
@@ -118,11 +125,14 @@ def add_rating_mean_projected(
     mid = column_names.match_id
     ppw = column_names.projected_participation_weight
-    if ppw:
+    # Use scaled column if available (clipped to [0, 1]), otherwise raw column
+    weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else ppw
+    if weight_col and weight_col in df.columns:
         return df.with_columns(
             (
-                (pl.col(ppw) * pl.col(player_rating_col)).sum().over(mid)
-                / pl.col(ppw).sum().over(mid)
+                (pl.col(weight_col) * pl.col(player_rating_col)).sum().over(mid)
+                / pl.col(weight_col).sum().over(mid)
             ).alias(rating_mean_out)
         )

{spforge-0.8.10 → spforge-0.8.13/spforge.egg-info}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.10
+Version: 0.8.13
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file

{spforge-0.8.10 → spforge-0.8.13}/spforge.egg-info/SOURCES.txt RENAMED

@@ -103,8 +103,10 @@ tests/hyperparameter_tuning/test_rating_tuner.py
 tests/performance_transformers/test_performance_manager.py
 tests/performance_transformers/test_performances_transformers.py
 tests/ratings/test_player_rating_generator.py
+tests/ratings/test_player_rating_no_mutation.py
 tests/ratings/test_ratings_property.py
 tests/ratings/test_team_rating_generator.py
+tests/ratings/test_utils_scaled_weights.py
 tests/scorer/test_score.py
 tests/scorer/test_score_aggregation_granularity.py
 tests/transformers/test_estimator_transformer_context.py

spforge-0.8.13/tests/ratings/test_player_rating_no_mutation.py ADDED

@@ -0,0 +1,214 @@
+"""Tests to ensure PlayerRatingGenerator does not mutate input columns."""
+import polars as pl
+import pytest
+from spforge import ColumnNames
+from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
+@pytest.fixture
+def cn_with_projected():
+    """ColumnNames with both participation_weight and projected_participation_weight."""
+    return ColumnNames(
+        player_id="pid",
+        team_id="tid",
+        match_id="mid",
+        start_date="dt",
+        update_match_id="mid",
+        participation_weight="minutes",
+        projected_participation_weight="minutes_prediction",
+    )
+@pytest.fixture
+def fit_df():
+    """Training data with minutes > 1 (will trigger auto-scaling)."""
+    return pl.DataFrame(
+        {
+            "pid": ["P1", "P2", "P3", "P4"],
+            "tid": ["T1", "T1", "T2", "T2"],
+            "mid": ["M1", "M1", "M1", "M1"],
+            "dt": ["2024-01-01"] * 4,
+            "perf": [0.6, 0.4, 0.7, 0.3],
+            "minutes": [30.0, 25.0, 32.0, 28.0],
+            "minutes_prediction": [28.0, 24.0, 30.0, 26.0],
+        }
+    )
+@pytest.fixture
+def future_df():
+    """Future prediction data with minutes > 1 (will trigger auto-scaling)."""
+    return pl.DataFrame(
+        {
+            "pid": ["P1", "P2", "P3", "P4"],
+            "tid": ["T1", "T1", "T2", "T2"],
+            "mid": ["M2", "M2", "M2", "M2"],
+            "dt": ["2024-01-02"] * 4,
+            "minutes": [30.0, 25.0, 32.0, 28.0],
+            "minutes_prediction": [28.0, 24.0, 30.0, 26.0],
+        }
+    )
+def test_fit_transform_does_not_mutate_participation_weight(cn_with_projected, fit_df):
+    """fit_transform should not modify the participation_weight column values."""
+    # Join result with original to compare values by player_id
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    result = gen.fit_transform(fit_df)
+    # Check that each player's minutes value is preserved
+    original_by_player = dict(zip(fit_df["pid"].to_list(), fit_df["minutes"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes"].to_list()))
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"participation_weight for player {pid} was mutated. "
+            f"Expected {original_val}, got {result_val}"
+        )
+def test_fit_transform_does_not_mutate_projected_participation_weight(cn_with_projected, fit_df):
+    """fit_transform should not modify the projected_participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    result = gen.fit_transform(fit_df)
+    # Check that each player's minutes_prediction value is preserved
+    original_by_player = dict(zip(fit_df["pid"].to_list(), fit_df["minutes_prediction"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes_prediction"].to_list()))
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"projected_participation_weight for player {pid} was mutated. "
+            f"Expected {original_val}, got {result_val}"
+        )
+def test_transform_does_not_mutate_participation_weight(cn_with_projected, fit_df, future_df):
+    """transform should not modify the participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+    result = gen.transform(future_df)
+    # Check that each player's minutes value is preserved
+    original_by_player = dict(zip(future_df["pid"].to_list(), future_df["minutes"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes"].to_list()))
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"participation_weight for player {pid} was mutated during transform. "
+            f"Expected {original_val}, got {result_val}"
+        )
+def test_transform_does_not_mutate_projected_participation_weight(cn_with_projected, fit_df, future_df):
+    """transform should not modify the projected_participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+    result = gen.transform(future_df)
+    # Check that each player's minutes_prediction value is preserved
+    original_by_player = dict(zip(future_df["pid"].to_list(), future_df["minutes_prediction"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes_prediction"].to_list()))
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"projected_participation_weight for player {pid} was mutated during transform. "
+            f"Expected {original_val}, got {result_val}"
+        )
+def test_future_transform_does_not_mutate_participation_weight(cn_with_projected, fit_df, future_df):
+    """future_transform should not modify the participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+    original_minutes = future_df["minutes"].to_list()
+    result = gen.future_transform(future_df)
+    # The minutes column should have the same values as before
+    result_minutes = result["minutes"].to_list()
+    assert result_minutes == original_minutes, (
+        f"participation_weight column was mutated during future_transform. "
+        f"Expected {original_minutes}, got {result_minutes}"
+    )
+def test_future_transform_does_not_mutate_projected_participation_weight(cn_with_projected, fit_df, future_df):
+    """future_transform should not modify the projected_participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+    original_minutes_pred = future_df["minutes_prediction"].to_list()
+    result = gen.future_transform(future_df)
+    # The minutes_prediction column should have the same values as before
+    result_minutes_pred = result["minutes_prediction"].to_list()
+    assert result_minutes_pred == original_minutes_pred, (
+        f"projected_participation_weight column was mutated during future_transform. "
+        f"Expected {original_minutes_pred}, got {result_minutes_pred}"
+    )
+def test_multiple_transforms_do_not_compound_scaling(cn_with_projected, fit_df, future_df):
+    """Multiple transform calls should not compound the scaling effect."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+    # Call transform multiple times
+    result1 = gen.transform(future_df)
+    result2 = gen.transform(result1)
+    result3 = gen.transform(result2)
+    # After 3 transforms, each player's values should still be the same as original
+    original_by_player = dict(zip(future_df["pid"].to_list(), future_df["minutes_prediction"].to_list()))
+    final_by_player = dict(zip(result3["pid"].to_list(), result3["minutes_prediction"].to_list()))
+    for pid, original_val in original_by_player.items():
+        final_val = final_by_player[pid]
+        assert final_val == original_val, (
+            f"Multiple transforms compounded the scaling for player {pid}. "
+            f"Expected {original_val}, got {final_val}"
+        )

spforge 0.8.10__tar.gz → 0.8.13__tar.gz

Potentially problematic release.

spforge 0.8.10.tar.gz → 0.8.13.tar.gz