spforge 0.8.8__py3-none-any.whl → 0.8.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spforge might be problematic.
- spforge/autopipeline.py +169 -5
- spforge/estimator/_group_by_estimator.py +11 -3
- spforge/performance_transformers/_performance_manager.py +2 -4
- spforge/ratings/_player_rating.py +131 -28
- spforge/ratings/start_rating_generator.py +1 -1
- spforge/ratings/team_start_rating_generator.py +1 -1
- spforge/ratings/utils.py +16 -6
- spforge/scorer/_score.py +42 -11
- spforge/transformers/_other_transformer.py +38 -8
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/METADATA +1 -1
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/RECORD +20 -18
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
- tests/performance_transformers/test_performance_manager.py +15 -0
- tests/ratings/test_player_rating_generator.py +127 -0
- tests/ratings/test_player_rating_no_mutation.py +214 -0
- tests/ratings/test_utils_scaled_weights.py +136 -0
- tests/scorer/test_score.py +142 -0
- tests/test_autopipeline.py +336 -6
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
tests/test_autopipeline.py
CHANGED
@@ -12,6 +12,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression

 from spforge import AutoPipeline
 from spforge.estimator import SkLearnEnhancerEstimator
+from spforge.scorer import Filter, Operator
 from spforge.transformers import EstimatorTransformer


@@ -231,6 +232,27 @@ def test_predict_proba(df_clf):
     assert np.allclose(proba.sum(axis=1), 1.0, atol=1e-6)


+def test_filter_columns_not_passed_to_estimator(frame):
+    df_pd = pd.DataFrame(
+        {"x": [1.0, 2.0, 3.0, 4.0], "keep": [1, 0, 1, 0], "y": [1.0, 2.0, 3.0, 4.0]}
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    model = AutoPipeline(
+        estimator=CaptureEstimator(),
+        estimator_features=["x"],
+        filters=[Filter(column_name="keep", value=1, operator=Operator.EQUALS)],
+    )
+
+    X = _select(df, ["x", "keep"])
+    y = _col(df, "y")
+    model.fit(X, y=y)
+
+    est = _inner_estimator(model)
+    assert "keep" in model.required_features
+    assert "keep" not in est.fit_columns
+
+
 def test_predict_proba_raises_if_not_supported(df_reg):
     model = AutoPipeline(
         estimator=LinearRegression(),
@@ -306,7 +328,18 @@ def test_infer_categorical_from_feature_names_when_only_numeric_features_given(d
     assert any(c.startswith("cat") for c in cap.fit_columns)


-def test_granularity_groups_rows_before_estimator_fit_and_predict(df_reg):
+def test_granularity_groups_rows_before_estimator_fit_and_predict(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2", "g3", "g3"],
+            "num1": [1.0, 2.0, np.nan, 4.0, 5.0, 6.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, np.nan, 60.0],
+            "cat1": ["a", "b", "a", None, "b", "c"],
+            "y": [1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
     model = AutoPipeline(
         estimator=CaptureEstimator(),
         estimator_features=["gameid", "num1", "num2", "cat1"],
@@ -317,16 +350,16 @@ def test_granularity_groups_rows_before_estimator_fit_and_predict(df_reg):
         remainder="drop",
     )

-    X = _select(
-    y = _col(
+    X = _select(df, ["gameid", "num1", "num2", "cat1"])
+    y = _col(df, "y")
     model.fit(X, y=y)

     inner = _inner_estimator(model)

-    if isinstance(
-        n_groups =
+    if isinstance(df, pl.DataFrame):
+        n_groups = df.select(pl.col("gameid").n_unique()).item()
     else:
-        n_groups =
+        n_groups = df["gameid"].nunique()

     assert inner.fit_shape[0] == n_groups

@@ -551,3 +584,300 @@ def test_autopipeline_is_picklable_after_fit():
     model.fit(df, y)

     pickle.dumps(model)
+
+
+# --- Feature Importances Tests ---
+
+
+def test_feature_importances__tree_model():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+            "cat1": ["a", "b", "a", "b", "a", "b"],
+        }
+    )
+    y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="y")
+
+    model = AutoPipeline(
+        estimator=RandomForestRegressor(n_estimators=5, random_state=42),
+        estimator_features=["num1", "num2", "cat1"],
+        categorical_handling="ordinal",
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    assert len(importances) == 3
+    assert set(importances["feature"].tolist()) == {"num1", "num2", "cat1"}
+    assert all(importances["importance"] >= 0)
+
+
+def test_feature_importances__linear_model():
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0],
+        }
+    )
+    y = pd.Series([0, 1, 0, 1, 0, 1, 0, 1], name="y")
+
+    model = AutoPipeline(
+        estimator=LogisticRegression(max_iter=1000),
+        estimator_features=["num1", "num2"],
+        scale_features=True,
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    assert len(importances) == 2
+    assert set(importances["feature"].tolist()) == {"num1", "num2"}
+    assert all(importances["importance"] >= 0)
+
+
+def test_feature_importances__not_fitted_raises():
+    model = AutoPipeline(
+        estimator=LinearRegression(),
+        estimator_features=["x"],
+    )
+
+    with pytest.raises(RuntimeError, match="Pipeline not fitted"):
+        _ = model.feature_importances_
+
+
+def test_feature_importances__unsupported_estimator_raises():
+    df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
+    y = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")
+
+    model = AutoPipeline(
+        estimator=DummyRegressor(),
+        estimator_features=["x"],
+    )
+    model.fit(df, y)
+
+    with pytest.raises(RuntimeError, match="does not support feature importances"):
+        _ = model.feature_importances_
+
+
+def test_feature_importances__with_sklearn_enhancer():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            "num2": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+            "start_date": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04", "2022-01-05", "2022-01-06"],
+        }
+    )
+    y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="y")
+
+    inner = RandomForestRegressor(n_estimators=5, random_state=42)
+    enhancer = SkLearnEnhancerEstimator(
+        estimator=inner,
+        date_column="start_date",
+        day_weight_epsilon=0.1,
+    )
+
+    model = AutoPipeline(
+        estimator=enhancer,
+        estimator_features=["num1", "num2"],
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    assert len(importances) == 2
+    assert set(importances["feature"].tolist()) == {"num1", "num2"}
+
+
+def test_feature_importances__onehot_features():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            "cat1": ["a", "b", "c", "a", "b", "c"],
+        }
+    )
+    y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="y")
+
+    model = AutoPipeline(
+        estimator=RandomForestRegressor(n_estimators=5, random_state=42),
+        estimator_features=["num1", "cat1"],
+        categorical_handling="onehot",
+    )
+    model.fit(df, y)
+
+    importances = model.feature_importances_
+
+    assert isinstance(importances, pd.DataFrame)
+    assert list(importances.columns) == ["feature", "importance"]
+    # Should have expanded features: num1 + cat1_a, cat1_b, cat1_c
+    assert len(importances) == 4
+    assert "num1" in importances["feature"].tolist()
+    assert any("cat1_" in f for f in importances["feature"].tolist())
+
+
+def test_feature_importance_names__granularity_uses_deep_feature_names():
+    from sklearn.ensemble import RandomForestRegressor
+
+    df = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2"],
+            "num1": [1.0, 2.0, 3.0, 4.0],
+            "num2": [10.0, 20.0, 30.0, 40.0],
+            "y": [1.0, 1.0, 2.0, 2.0],
+        }
+    )
+    y = df["y"]
+
+    model = AutoPipeline(
+        estimator=RandomForestRegressor(n_estimators=5, random_state=42),
+        estimator_features=["gameid", "num1", "num2"],
+        predictor_transformers=[AddConstantPredictionTransformer(col_name="const_pred")],
+        granularity=["gameid"],
+        categorical_features=["gameid"],
+        categorical_handling="ordinal",
+        remainder="drop",
+    )
+    model.fit(df, y)
+
+    names = model.feature_importance_names
+
+    inner = _inner_estimator(model)
+    assert list(names.keys()) == list(inner.feature_names_in_)
+    assert "gameid" not in names
+    assert "const_pred" in names
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_granularity_with_aggregation_weight__features_weighted(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2"],
+            "num1": [10.0, 30.0, 20.0, 40.0],
+            "weight": [0.25, 0.75, 0.5, 0.5],
+            "y": [1.0, 1.0, 2.0, 2.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    cap = CaptureEstimator()
+    model = AutoPipeline(
+        estimator=cap,
+        estimator_features=["num1"],
+        granularity=["gameid"],
+        aggregation_weight="weight",
+        remainder="drop",
+    )
+
+    X = _select(df, ["gameid", "num1", "weight"])
+    y = _col(df, "y")
+    model.fit(X, y=y)
+
+    inner = _inner_estimator(model)
+    assert inner.fit_shape[0] == 2
+
+    preds = model.predict(X)
+    assert preds.shape[0] == len(X)
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_granularity_aggregation_weight__weighted_mean_correct(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1"],
+            "num1": [10.0, 30.0],
+            "weight": [0.25, 0.75],
+            "y": [1.0, 1.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"], aggregation_weight="weight")
+    transformed = reducer.fit_transform(df)
+
+    if frame == "pl":
+        num1_val = transformed["num1"].to_list()[0]
+    else:
+        num1_val = transformed["num1"].iloc[0]
+
+    expected = (10.0 * 0.25 + 30.0 * 0.75) / (0.25 + 0.75)
+    assert abs(num1_val - expected) < 1e-6
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_reduce_y_raises_when_target_not_uniform_per_group(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1"],
+            "num1": [10.0, 30.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"])
+
+    y = np.array([1.0, 2.0])
+    with pytest.raises(ValueError, match="Target.*must be uniform"):
+        reducer.reduce_y(df, y)
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_reduce_y_works_when_target_uniform_per_group(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1", "g2", "g2"],
+            "num1": [10.0, 30.0, 20.0, 40.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"])
+
+    y = np.array([1.0, 1.0, 2.0, 2.0])
+    y_out, _ = reducer.reduce_y(df, y)
+
+    assert len(y_out) == 2
+    assert set(y_out) == {1.0, 2.0}
+
+
+@pytest.mark.parametrize("frame", ["pd", "pl"])
+def test_aggregation_weight_sums_weight_column(frame):
+    df_pd = pd.DataFrame(
+        {
+            "gameid": ["g1", "g1"],
+            "num1": [10.0, 30.0],
+            "weight": [0.25, 0.75],
+            "y": [1.0, 1.0],
+        }
+    )
+    df = df_pd if frame == "pd" else pl.from_pandas(df_pd)
+
+    from spforge.transformers._other_transformer import GroupByReducer
+
+    reducer = GroupByReducer(granularity=["gameid"], aggregation_weight="weight")
+    transformed = reducer.fit_transform(df)
+
+    if frame == "pl":
+        weight_val = transformed["weight"].to_list()[0]
+    else:
+        weight_val = transformed["weight"].iloc[0]
+
+    expected = 0.25 + 0.75
+    assert abs(weight_val - expected) < 1e-6
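Based solely on what the new tests assert, a fitted AutoPipeline exposes feature importances as a pandas DataFrame with "feature" and "importance" columns, raises RuntimeError before fit or for estimators without importances, and expands one-hot features into names like cat1_a. The sketch below mirrors test_feature_importances__tree_model above; treat it as an illustration of the tested behavior, not as API documentation.

from sklearn.ensemble import RandomForestRegressor
import pandas as pd

from spforge import AutoPipeline

df = pd.DataFrame({"num1": [1.0, 2.0, 3.0, 4.0], "cat1": ["a", "b", "a", "b"]})
y = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")

model = AutoPipeline(
    estimator=RandomForestRegressor(n_estimators=5, random_state=42),
    estimator_features=["num1", "cat1"],
    categorical_handling="ordinal",
)
model.fit(df, y)

# Tests assert a DataFrame with "feature"/"importance" columns and
# non-negative importances; accessing it before fit raises RuntimeError.
importances = model.feature_importances_
print(importances.sort_values("importance", ascending=False))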
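The GroupByReducer tests pin down the aggregation arithmetic: per group, numeric features become a weighted mean and the weight column itself is summed. A minimal pandas sketch of that arithmetic, independent of spforge and using the same numbers as test_granularity_aggregation_weight__weighted_mean_correct:

import pandas as pd

df = pd.DataFrame(
    {"gameid": ["g1", "g1"], "num1": [10.0, 30.0], "weight": [0.25, 0.75]}
)

# Weighted mean per group: sum(value * weight) / sum(weight); weights are summed.
weighted = (
    df.assign(num1_w=df["num1"] * df["weight"])
    .groupby("gameid", as_index=False)
    .agg(num1_w=("num1_w", "sum"), weight=("weight", "sum"))
    .assign(num1=lambda d: d["num1_w"] / d["weight"])
    [["gameid", "num1", "weight"]]
)
print(weighted)  # num1 == (10*0.25 + 30*0.75) / 1.0 == 25.0, weight == 1.0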