spforge 0.8.4__py3-none-any.whl → 0.8.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/lol/pipeline_transformer_example.py +69 -86
- examples/nba/cross_validation_example.py +4 -11
- examples/nba/feature_engineering_example.py +33 -15
- examples/nba/game_winner_example.py +24 -14
- examples/nba/predictor_transformers_example.py +29 -16
- spforge/__init__.py +1 -0
- spforge/features_generator_pipeline.py +8 -4
- spforge/hyperparameter_tuning/__init__.py +12 -0
- spforge/hyperparameter_tuning/_default_search_spaces.py +159 -1
- spforge/hyperparameter_tuning/_tuner.py +192 -0
- spforge/ratings/__init__.py +4 -0
- spforge/ratings/_player_rating.py +11 -0
- spforge/ratings/league_start_rating_optimizer.py +201 -0
- {spforge-0.8.4.dist-info → spforge-0.8.8.dist-info}/METADATA +12 -19
- {spforge-0.8.4.dist-info → spforge-0.8.8.dist-info}/RECORD +25 -21
- tests/end_to_end/test_estimator_hyperparameter_tuning.py +85 -0
- tests/end_to_end/test_league_start_rating_optimizer.py +117 -0
- tests/end_to_end/test_nba_player_ratings_hyperparameter_tuning.py +5 -0
- tests/hyperparameter_tuning/test_estimator_tuner.py +167 -0
- tests/ratings/test_player_rating_generator.py +27 -0
- tests/scorer/test_score.py +90 -0
- tests/test_feature_generator_pipeline.py +43 -0
- {spforge-0.8.4.dist-info → spforge-0.8.8.dist-info}/WHEEL +0 -0
- {spforge-0.8.4.dist-info → spforge-0.8.8.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.4.dist-info → spforge-0.8.8.dist-info}/top_level.txt +0 -0
spforge/hyperparameter_tuning/_default_search_spaces.py
CHANGED

@@ -1,12 +1,133 @@
 from spforge.hyperparameter_tuning._tuner import ParamSpec
 from spforge.ratings import PlayerRatingGenerator, TeamRatingGenerator
+from spforge.distributions import (
+    NegativeBinomialEstimator,
+    NormalDistributionPredictor,
+    StudentTDistributionEstimator,
+)
+
+
+def _is_lightgbm_estimator(obj: object) -> bool:
+    mod = (getattr(type(obj), "__module__", "") or "").lower()
+    name = type(obj).__name__
+    if "lightgbm" in mod:
+        return True
+    return bool(name.startswith("LGBM"))
+
+
+def get_default_lgbm_search_space() -> dict[str, ParamSpec]:
+    return {
+        "n_estimators": ParamSpec(
+            param_type="int",
+            low=50,
+            high=800,
+            log=True,
+        ),
+        "num_leaves": ParamSpec(
+            param_type="int",
+            low=16,
+            high=256,
+            log=True,
+        ),
+        "max_depth": ParamSpec(
+            param_type="int",
+            low=3,
+            high=12,
+        ),
+        "min_child_samples": ParamSpec(
+            param_type="int",
+            low=10,
+            high=200,
+            log=True,
+        ),
+        "subsample": ParamSpec(
+            param_type="float",
+            low=0.6,
+            high=1.0,
+        ),
+        "subsample_freq": ParamSpec(
+            param_type="int",
+            low=1,
+            high=7,
+        ),
+        "reg_alpha": ParamSpec(
+            param_type="float",
+            low=1e-8,
+            high=10.0,
+            log=True,
+        ),
+        "reg_lambda": ParamSpec(
+            param_type="float",
+            low=1e-8,
+            high=10.0,
+            log=True,
+        ),
+    }
+
+
+def get_default_negative_binomial_search_space() -> dict[str, ParamSpec]:
+    return {
+        "predicted_r_weight": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=1.0,
+        ),
+        "r_rolling_mean_window": ParamSpec(
+            param_type="int",
+            low=10,
+            high=120,
+        ),
+        "predicted_r_iterations": ParamSpec(
+            param_type="int",
+            low=2,
+            high=12,
+        ),
+    }
+
+
+def get_default_normal_distribution_search_space() -> dict[str, ParamSpec]:
+    return {
+        "sigma": ParamSpec(
+            param_type="float",
+            low=0.5,
+            high=30.0,
+            log=True,
+        ),
+    }
+
+
+def get_default_student_t_search_space() -> dict[str, ParamSpec]:
+    return {
+        "df": ParamSpec(
+            param_type="float",
+            low=3.0,
+            high=30.0,
+            log=True,
+        ),
+        "min_sigma": ParamSpec(
+            param_type="float",
+            low=0.5,
+            high=10.0,
+            log=True,
+        ),
+        "sigma_bins": ParamSpec(
+            param_type="int",
+            low=4,
+            high=12,
+        ),
+        "min_bin_rows": ParamSpec(
+            param_type="int",
+            low=10,
+            high=100,
+        ),
+    }
 
 
 def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
     """
     Default search space for PlayerRatingGenerator.
 
-    Focuses on
+    Focuses on core parameters that have the most impact on performance.
 
     Returns:
         Dictionary mapping parameter names to ParamSpec objects
@@ -46,6 +167,31 @@ def get_default_player_rating_search_space() -> dict[str, ParamSpec]:
             param_type="categorical",
             choices=["difference", "mean", "ignore_opponent"],
         ),
+        "start_league_quantile": ParamSpec(
+            param_type="float",
+            low=0.05,
+            high=0.5,
+        ),
+        "start_min_count_for_percentiles": ParamSpec(
+            param_type="int",
+            low=40,
+            high=500,
+        ),
+        "start_team_rating_subtract": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=200.0,
+        ),
+        "start_team_weight": ParamSpec(
+            param_type="float",
+            low=0.0,
+            high=1.0,
+        ),
+        "start_min_match_count_team_rating": ParamSpec(
+            param_type="int",
+            low=1,
+            high=10,
+        ),
     }
 
 
@@ -120,3 +266,15 @@ def get_default_search_space(
         f"Unsupported rating generator type: {type(rating_generator)}. "
         "Expected PlayerRatingGenerator or TeamRatingGenerator."
     )
+
+
+def get_default_estimator_search_space(estimator: object) -> dict[str, ParamSpec]:
+    if _is_lightgbm_estimator(estimator):
+        return get_default_lgbm_search_space()
+    if isinstance(estimator, NegativeBinomialEstimator):
+        return get_default_negative_binomial_search_space()
+    if isinstance(estimator, NormalDistributionPredictor):
+        return get_default_normal_distribution_search_space()
+    if isinstance(estimator, StudentTDistributionEstimator):
+        return get_default_student_t_search_space()
+    return {}
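A hedged sketch of how this dispatch resolves defaults. It assumes lightgbm is installed; the empty-dict fallback is the signal the tuner (see `_tuner.py` below) reads as "no default search space available":

# Hedged usage sketch of the dispatch added above. LGBMRegressor is
# matched by _is_lightgbm_estimator via its module/class name; any
# estimator without a registered default falls through to {}.
from lightgbm import LGBMRegressor

from spforge.hyperparameter_tuning._default_search_spaces import (
    get_default_estimator_search_space,
)

space = get_default_estimator_search_space(LGBMRegressor())
print(sorted(space))  # ['max_depth', 'min_child_samples', ..., 'subsample_freq']

print(get_default_estimator_search_space(object()))  # {} -> no default space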
spforge/hyperparameter_tuning/_tuner.py
CHANGED

@@ -45,6 +45,8 @@ class ParamSpec:
         elif self.param_type == "int":
             if self.low is None or self.high is None:
                 raise ValueError(f"int parameter '{name}' requires low and high bounds")
+            if self.step is None:
+                return trial.suggest_int(name, int(self.low), int(self.high))
             return trial.suggest_int(name, int(self.low), int(self.high), step=self.step)
         elif self.param_type == "categorical":
             if self.choices is None:
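The two added lines matter because Optuna's Trial.suggest_int defaults step to 1 and does not accept step=None, so an unset step must be omitted rather than forwarded. A minimal plain-Optuna sketch of the two call shapes:

# Minimal sketch of the step distinction handled above (plain Optuna,
# nothing spforge-specific): suggest_int must either receive an explicit
# integer step or no step argument at all.
import optuna


def objective(trial: optuna.Trial) -> float:
    n_estimators = trial.suggest_int("n_estimators", 50, 800)  # no step -> default 1
    num_leaves = trial.suggest_int("num_leaves", 16, 256, step=16)  # explicit step
    return float(n_estimators + num_leaves)


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)
print(study.best_params)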
@@ -272,3 +274,193 @@ class RatingHyperparameterTuner:
                raise ValueError("Scorer returned invalid values in dict")
            return float(np.mean(values))
        return float(score)
+
+
+def _is_estimator(obj: object) -> bool:
+    return hasattr(obj, "get_params") and hasattr(obj, "set_params")
+
+
+def _get_leaf_estimator_paths(estimator: Any) -> dict[str, Any]:
+    if not _is_estimator(estimator):
+        raise ValueError("estimator must implement get_params and set_params")
+
+    params = estimator.get_params(deep=True)
+    estimator_keys = [k for k, v in params.items() if _is_estimator(v)]
+
+    if not estimator_keys:
+        return {"": estimator}
+
+    leaves: list[str] = []
+    for key in estimator_keys:
+        if not any(other != key and other.startswith(f"{key}__") for other in estimator_keys):
+            leaves.append(key)
+
+    return {key: params[key] for key in sorted(leaves)}
+
+
+def _build_search_space_for_targets(
+    targets: dict[str, dict[str, ParamSpec]],
+) -> dict[str, ParamSpec]:
+    search_space: dict[str, ParamSpec] = {}
+    for path, params in targets.items():
+        for param_name, param_spec in params.items():
+            full_name = f"{path}__{param_name}" if path else param_name
+            if full_name in search_space:
+                raise ValueError(f"Duplicate parameter name detected: {full_name}")
+            search_space[full_name] = param_spec
+    return search_space
+
+
+def _enqueue_predicted_r_weight_zero(study: optuna.Study, search_space: dict[str, ParamSpec]):
+    zero_params: dict[str, float] = {}
+    for name, spec in search_space.items():
+        if not name.endswith("predicted_r_weight"):
+            continue
+        if spec.param_type not in {"float", "int"}:
+            continue
+        if spec.low is None or spec.high is None:
+            continue
+        if spec.low <= 0 <= spec.high:
+            zero_params[name] = 0.0
+
+    if zero_params:
+        study.enqueue_trial(zero_params)
+
+
+class EstimatorHyperparameterTuner:
+    """
+    Hyperparameter tuner for sklearn-compatible estimators.
+
+    Supports nested estimators and can target deepest leaf estimators.
+    """
+
+    def __init__(
+        self,
+        estimator: Any,
+        cross_validator: MatchKFoldCrossValidator,
+        scorer: BaseScorer,
+        direction: Literal["minimize", "maximize"],
+        param_search_space: dict[str, ParamSpec] | None = None,
+        param_targets: dict[str, dict[str, ParamSpec]] | None = None,
+        n_trials: int = 50,
+        n_jobs: int = 1,
+        storage: str | None = None,
+        study_name: str | None = None,
+        timeout: float | None = None,
+        show_progress_bar: bool = True,
+        sampler: optuna.samplers.BaseSampler | None = None,
+        pruner: optuna.pruners.BasePruner | None = None,
+    ):
+        self.estimator = estimator
+        self.cross_validator = cross_validator
+        self.scorer = scorer
+        self.direction = direction
+        self.param_search_space = param_search_space
+        self.param_targets = param_targets
+        self.n_trials = n_trials
+        self.n_jobs = n_jobs
+        self.storage = storage
+        self.study_name = study_name
+        self.timeout = timeout
+        self.show_progress_bar = show_progress_bar
+        self.sampler = sampler
+        self.pruner = pruner
+
+        if direction not in ["minimize", "maximize"]:
+            raise ValueError(f"direction must be 'minimize' or 'maximize', got: {direction}")
+
+        if storage is not None and study_name is None:
+            raise ValueError("study_name is required when using storage")
+
+        if param_search_space is not None and param_targets is not None:
+            raise ValueError("param_search_space and param_targets cannot both be provided")
+
+    def optimize(self, df: IntoFrameT) -> OptunaResult:
+        from spforge.hyperparameter_tuning._default_search_spaces import (
+            get_default_estimator_search_space,
+        )
+
+        leaf_estimators = _get_leaf_estimator_paths(self.estimator)
+        default_targets = {
+            path: get_default_estimator_search_space(est)
+            for path, est in leaf_estimators.items()
+        }
+        default_targets = {path: space for path, space in default_targets.items() if space}
+
+        if self.param_targets is not None:
+            unknown = set(self.param_targets) - set(leaf_estimators)
+            if unknown:
+                raise ValueError(f"param_targets contains unknown estimator paths: {unknown}")
+            targets = self.param_targets
+        elif self.param_search_space is not None:
+            targets = {path: self.param_search_space for path in leaf_estimators}
+        elif default_targets:
+            targets = default_targets
+        else:
+            raise ValueError(
+                "param_search_space is required when no default search space is available"
+            )
+
+        search_space = _build_search_space_for_targets(targets)
+        if not search_space:
+            raise ValueError("Resolved search space is empty")
+
+        study = optuna.create_study(
+            direction=self.direction,
+            sampler=self.sampler,
+            pruner=self.pruner,
+            storage=self.storage,
+            study_name=self.study_name,
+            load_if_exists=True if self.storage else False,
+        )
+
+        _enqueue_predicted_r_weight_zero(study, search_space)
+
+        study.optimize(
+            lambda trial: self._objective(trial, df, search_space),
+            n_trials=self.n_trials,
+            n_jobs=self.n_jobs,
+            timeout=self.timeout,
+            show_progress_bar=self.show_progress_bar,
+        )
+
+        return OptunaResult(
+            best_params=study.best_params,
+            best_value=study.best_value,
+            best_trial=study.best_trial,
+            study=study,
+        )
+
+    def _objective(
+        self, trial: optuna.Trial, df: IntoFrameT, search_space: dict[str, ParamSpec]
+    ) -> float:
+        try:
+            trial_params = self._suggest_params(trial, search_space)
+
+            copied_estimator = copy.deepcopy(self.estimator)
+            copied_estimator.set_params(**trial_params)
+
+            cv = copy.deepcopy(self.cross_validator)
+            cv.estimator = copied_estimator
+
+            validation_df = cv.generate_validation_df(df)
+            score = self.scorer.score(validation_df)
+            score_value = RatingHyperparameterTuner._aggregate_score(score)
+
+            if math.isnan(score_value) or math.isinf(score_value):
+                logger.warning(f"Trial {trial.number} returned invalid score: {score_value}")
+                return float("inf") if self.direction == "minimize" else float("-inf")
+
+            return score_value
+
+        except Exception as e:
+            logger.warning(f"Trial {trial.number} failed with error: {e}")
+            return float("inf") if self.direction == "minimize" else float("-inf")
+
+    def _suggest_params(
+        self, trial: optuna.Trial, search_space: dict[str, ParamSpec]
+    ) -> dict[str, Any]:
+        params: dict[str, Any] = {}
+        for param_name, param_spec in search_space.items():
+            params[param_name] = param_spec.suggest(trial, param_name)
+        return params
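To make the leaf-targeting concrete, here is a runnable sketch against a plain sklearn pipeline; the pipeline is an invented example, and only _get_leaf_estimator_paths comes from the diff above:

# Runnable sketch of leaf-estimator discovery, using an invented sklearn
# pipeline. get_params(deep=True) exposes nested estimators under
# double-underscore paths, so tuned parameters become e.g. "model__alpha",
# which set_params(**trial_params) routes to the right sub-estimator.
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from spforge.hyperparameter_tuning._tuner import _get_leaf_estimator_paths

pipe = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
print(_get_leaf_estimator_paths(pipe))
# {'model': Ridge(), 'scaler': StandardScaler()} -- both are leaves here

pipe.set_params(**{"model__alpha": 0.5})  # how a suggested param is applied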
spforge/ratings/__init__.py
CHANGED
@@ -6,3 +6,7 @@ from .enums import (
     RatingUnknownFeatures as RatingUnknownFeatures,
 )
 from .league_identifier import LeagueIdentifier as LeagueIdentifier
+from .league_start_rating_optimizer import (
+    LeagueStartRatingOptimizationResult as LeagueStartRatingOptimizationResult,
+    LeagueStartRatingOptimizer as LeagueStartRatingOptimizer,
+)
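The four added lines widen the public surface of spforge.ratings, so after this change the optimizer resolves directly from the package:

# After the re-export above, both names import from spforge.ratings;
# the "X as X" form marks them as intentional public API for type checkers.
from spforge.ratings import (
    LeagueStartRatingOptimizationResult,
    LeagueStartRatingOptimizer,
)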
spforge/ratings/_player_rating.py
CHANGED

@@ -129,6 +129,9 @@ class PlayerRatingGenerator(RatingGenerator):
             str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_PROJECTED)
         )
         self.MEAN_PROJ_COL = self._suffix(str(RatingKnownFeatures.RATING_MEAN_PROJECTED))
+        self.PLAYER_DIFF_FROM_TEAM_PROJ_COL = self._suffix(
+            str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED)
+        )
 
         self.TEAM_OFF_RATING_PROJ_COL = self._suffix(
             str(RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED)
@@ -618,6 +621,7 @@ class PlayerRatingGenerator(RatingGenerator):
             or self.OPP_RATING_PROJ_COL in cols_to_add
             or self.DIFF_PROJ_COL in cols_to_add
             or self.MEAN_PROJ_COL in cols_to_add
+            or self.PLAYER_DIFF_FROM_TEAM_PROJ_COL in cols_to_add
         ):
             df = add_team_rating_projected(
                 df=df,
@@ -673,6 +677,13 @@ class PlayerRatingGenerator(RatingGenerator):
             )
         )
 
+        if self.PLAYER_DIFF_FROM_TEAM_PROJ_COL in cols_to_add:
+            df = df.with_columns(
+                (pl.col(self.PLAYER_OFF_RATING_COL) - pl.col(self.TEAM_OFF_RATING_PROJ_COL)).alias(
+                    self.PLAYER_DIFF_FROM_TEAM_PROJ_COL
+                )
+            )
+
         if (
             self.TEAM_RATING_COL in cols_to_add
             or self.OPP_RATING_COL in cols_to_add
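The new projected-difference column is a plain elementwise subtraction: the player's offensive rating minus the team's projected offensive rating. A runnable polars sketch with invented column names and values:

# Minimal polars sketch of the new feature computed above. The column
# names and values here are illustrative, not spforge's actual suffixed
# column names.
import polars as pl

df = pl.DataFrame(
    {
        "player_rating_offense": [1050.0, 980.0, 1010.0],
        "team_off_rating_projected": [1000.0, 1000.0, 1020.0],
    }
)
df = df.with_columns(
    (pl.col("player_rating_offense") - pl.col("team_off_rating_projected")).alias(
        "player_rating_difference_from_team_projected"
    )
)
print(df)  # differences: 50.0, -20.0, -10.0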
spforge/ratings/league_start_rating_optimizer.py
ADDED

@@ -0,0 +1,201 @@
+from __future__ import annotations
+
+import copy
+from dataclasses import dataclass
+
+import narwhals.stable.v2 as nw
+import polars as pl
+from narwhals.stable.v2.typing import IntoFrameT
+
+
+DEFAULT_START_RATING = 1000.0
+
+
+@dataclass
+class LeagueStartRatingOptimizationResult:
+    league_ratings: dict[str, float]
+    iteration_errors: list[dict[str, float]]
+
+
+class LeagueStartRatingOptimizer:
+    def __init__(
+        self,
+        rating_generator: object,
+        n_iterations: int = 3,
+        learning_rate: float = 0.2,
+        min_cross_region_rows: int = 10,
+        rating_scale: float | None = None,
+    ):
+        self.rating_generator = rating_generator
+        self.n_iterations = int(n_iterations)
+        self.learning_rate = float(learning_rate)
+        self.min_cross_region_rows = int(min_cross_region_rows)
+        self.rating_scale = rating_scale
+
+    @nw.narwhalify
+    def optimize(self, df: IntoFrameT) -> LeagueStartRatingOptimizationResult:
+        pl_df = df.to_native() if df.implementation.is_polars() else df.to_polars()
+        league_ratings = self._get_league_ratings(self.rating_generator)
+        iteration_errors: list[dict[str, float]] = []
+
+        for _ in range(self.n_iterations):
+            gen = copy.deepcopy(self.rating_generator)
+            self._set_league_ratings(gen, league_ratings)
+            self._ensure_prediction_columns(gen)
+
+            pred_df = gen.fit_transform(pl_df)
+            error_df = self._cross_region_error_df(pl_df, pred_df, gen)
+            if error_df.is_empty():
+                break
+
+            error_summary = (
+                error_df.group_by(self._league_column_name(gen))
+                .agg(
+                    pl.col("error").mean().alias("mean_error"),
+                    pl.len().alias("row_count"),
+                )
+                .to_dicts()
+            )
+            league_key = self._league_column_name(gen)
+            iteration_errors.append({r[league_key]: r["mean_error"] for r in error_summary})
+            league_ratings = self._apply_error_updates(
+                gen, league_ratings, error_summary, league_key
+            )
+
+        self._set_league_ratings(self.rating_generator, league_ratings)
+        return LeagueStartRatingOptimizationResult(
+            league_ratings=league_ratings, iteration_errors=iteration_errors
+        )
+
+    def _cross_region_error_df(
+        self,
+        df: pl.DataFrame,
+        pred_df: pl.DataFrame,
+        rating_generator: object,
+    ) -> pl.DataFrame:
+        column_names = getattr(rating_generator, "column_names", None)
+        if column_names is None:
+            raise ValueError("rating_generator must define column_names")
+
+        match_id = getattr(column_names, "match_id", None)
+        team_id = getattr(column_names, "team_id", None)
+        league_col = getattr(column_names, "league", None)
+        if not match_id or not team_id or not league_col:
+            raise ValueError("column_names must include match_id, team_id, and league")
+
+        pred_col, entity_cols, perf_col = self._prediction_spec(rating_generator)
+        base_cols = [match_id, team_id, league_col, perf_col]
+        for col in base_cols + entity_cols:
+            if col not in df.columns:
+                raise ValueError(f"{col} missing from input dataframe")
+
+        join_cols = [match_id, team_id] + entity_cols
+        joined = df.select(base_cols + entity_cols).join(
+            pred_df.select(join_cols + [pred_col]),
+            on=join_cols,
+            how="inner",
+        )
+        opp_league = self._opponent_mode_league(joined, match_id, team_id, league_col)
+        enriched = joined.join(opp_league, on=[match_id, team_id], how="left").with_columns(
+            (pl.col(perf_col) - pl.col(pred_col)).alias("error")
+        )
+        return enriched.filter(pl.col("opp_mode_league").is_not_null()).filter(
+            pl.col(league_col) != pl.col("opp_mode_league")
+        )
+
+    def _opponent_mode_league(
+        self, df: pl.DataFrame, match_id: str, team_id: str, league_col: str
+    ) -> pl.DataFrame:
+        team_mode = (
+            df.group_by([match_id, team_id, league_col])
+            .agg(pl.len().alias("__count"))
+            .sort(["__count"], descending=True)
+            .unique([match_id, team_id])
+            .select([match_id, team_id, league_col])
+            .rename({league_col: "team_mode_league"})
+        )
+        opponents = (
+            team_mode.join(team_mode, on=match_id, suffix="_opp")
+            .filter(pl.col(team_id) != pl.col(f"{team_id}_opp"))
+            .group_by([match_id, team_id, "team_mode_league_opp"])
+            .agg(pl.len().alias("__count"))
+            .sort(["__count"], descending=True)
+            .unique([match_id, team_id])
+            .select([match_id, team_id, "team_mode_league_opp"])
+            .rename({"team_mode_league_opp": "opp_mode_league"})
+        )
+        return opponents
+
+    def _prediction_spec(self, rating_generator: object) -> tuple[str, list[str], str]:
+        perf_col = getattr(rating_generator, "performance_column", None)
+        if not perf_col:
+            raise ValueError("rating_generator must define performance_column")
+        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+            pred_col = rating_generator.PLAYER_PRED_PERF_COL
+            column_names = rating_generator.column_names
+            player_id = getattr(column_names, "player_id", None)
+            if not player_id:
+                raise ValueError("column_names must include player_id for player ratings")
+            return pred_col, [player_id], perf_col
+        if hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+            pred_col = rating_generator.TEAM_PRED_OFF_PERF_COL
+            return pred_col, [], perf_col
+        raise ValueError("rating_generator must expose a predicted performance column")
+
+    def _ensure_prediction_columns(self, rating_generator: object) -> None:
+        pred_cols: list[str] = []
+        if hasattr(rating_generator, "PLAYER_PRED_PERF_COL"):
+            pred_cols.append(rating_generator.PLAYER_PRED_PERF_COL)
+        elif hasattr(rating_generator, "TEAM_PRED_OFF_PERF_COL"):
+            pred_cols.append(rating_generator.TEAM_PRED_OFF_PERF_COL)
+
+        if not pred_cols:
+            return
+
+        existing = list(getattr(rating_generator, "non_predictor_features_out", []) or [])
+        for col in pred_cols:
+            if col not in existing:
+                existing.append(col)
+        rating_generator.non_predictor_features_out = existing
+
+    def _apply_error_updates(
+        self,
+        rating_generator: object,
+        league_ratings: dict[str, float],
+        error_summary: list[dict[str, float]],
+        league_key: str,
+    ) -> dict[str, float]:
+        scale = self.rating_scale
+        if scale is None:
+            scale = getattr(rating_generator, "rating_change_multiplier_offense", 1.0)
+
+        updated = dict(league_ratings)
+        for row in error_summary:
+            if row["row_count"] < self.min_cross_region_rows:
+                continue
+            league = row[league_key]
+            mean_error = row["mean_error"]
+            base_rating = updated.get(league, DEFAULT_START_RATING)
+            updated[league] = base_rating + self.learning_rate * mean_error * scale
+        return updated
+
+    def _league_column_name(self, rating_generator: object) -> str:
+        column_names = getattr(rating_generator, "column_names", None)
+        league_col = getattr(column_names, "league", None)
+        if not league_col:
+            raise ValueError("column_names must include league for league adjustments")
+        return league_col
+
+    def _get_league_ratings(self, rating_generator: object) -> dict[str, float]:
+        start_gen = getattr(rating_generator, "start_rating_generator", None)
+        if start_gen is None or not hasattr(start_gen, "league_ratings"):
+            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+        return dict(start_gen.league_ratings)
+
+    def _set_league_ratings(self, rating_generator: object, league_ratings: dict[str, float]) -> None:
+        start_gen = getattr(rating_generator, "start_rating_generator", None)
+        if start_gen is None or not hasattr(start_gen, "league_ratings"):
+            raise ValueError("rating_generator must define start_rating_generator.league_ratings")
+        start_gen.league_ratings = dict(league_ratings)
+        if hasattr(rating_generator, "start_league_ratings"):
+            rating_generator.start_league_ratings = dict(league_ratings)