autogluon.tabular 1.4.0 → 1.4.1b20251128 (py3-none-any wheels)
This diff shows the changes between the publicly released versions autogluon.tabular 1.4.0 and 1.4.1b20251128 as they appear in their public registry. It is provided for informational purposes only.
- autogluon/tabular/configs/pipeline_presets.py +130 -0
- autogluon/tabular/configs/presets_configs.py +0 -3
- autogluon/tabular/models/__init__.py +1 -0
- autogluon/tabular/models/catboost/catboost_model.py +4 -1
- autogluon/tabular/models/ebm/__init__.py +0 -0
- autogluon/tabular/models/ebm/ebm_model.py +259 -0
- autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +4 -2
- autogluon/tabular/models/knn/knn_model.py +7 -3
- autogluon/tabular/models/lgb/lgb_model.py +56 -18
- autogluon/tabular/models/lr/lr_model.py +6 -1
- autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +10 -10
- autogluon/tabular/models/mitra/mitra_model.py +43 -3
- autogluon/tabular/models/mitra/sklearn_interface.py +8 -21
- autogluon/tabular/models/realmlp/realmlp_model.py +1 -3
- autogluon/tabular/models/rf/rf_model.py +5 -1
- autogluon/tabular/models/tabicl/tabicl_model.py +1 -7
- autogluon/tabular/models/tabm/tabm_model.py +76 -6
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +6 -4
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +1 -7
- autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +2 -1
- autogluon/tabular/models/xgboost/xgboost_model.py +8 -1
- autogluon/tabular/predictor/predictor.py +63 -55
- autogluon/tabular/registry/_ag_model_registry.py +2 -0
- autogluon/tabular/testing/fit_helper.py +28 -0
- autogluon/tabular/version.py +1 -1
- autogluon.tabular-1.4.1b20251128-py3.11-nspkg.pth +1 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/METADATA +87 -71
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/RECORD +39 -33
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/WHEEL +1 -1
- autogluon.tabular-1.4.0-py3.9-nspkg.pth +0 -1
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info/licenses}/LICENSE +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info/licenses}/NOTICE +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.4.0.dist-info → autogluon_tabular-1.4.1b20251128.dist-info}/zip-safe +0 -0
autogluon/tabular/configs/pipeline_presets.py (new file, +130)

```diff
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import math
+
+from autogluon.core.constants import BINARY, PROBLEM_TYPES
+from autogluon.core.utils.utils import default_holdout_frac
+
+USE_BAG_HOLDOUT_AUTO_THRESHOLD = 1_000_000
+
+
+def _get_validation_preset(num_train_rows: int, hpo_enabled: bool) -> dict[str, int | float]:
+    """Recommended validation preset manually defined by the AutoGluon developers."""
+
+    # -- Default recommendation
+    # max 8 due to 8 cores per CPU being very common.
+    # down to 5 folds for small datasets to have enough samples for a representative validation set.
+    num_bag_folds = min(8, max(5, math.floor(num_train_rows / 10)))
+
+    num_bag_sets = 1  # More repeats do not seem to help due to overfitting on val data.
+    use_bag_holdout = num_train_rows >= USE_BAG_HOLDOUT_AUTO_THRESHOLD
+    holdout_frac = round(default_holdout_frac(num_train_rows=num_train_rows, hyperparameter_tune=hpo_enabled), 4)
+
+    return dict(
+        num_bag_sets=num_bag_sets,
+        num_bag_folds=num_bag_folds,
+        use_bag_holdout=use_bag_holdout,
+        holdout_frac=holdout_frac,
+    )
+
+
+# TODO(refactor): use a data class for the config of the validation method.
+# TODO(improvement): Implement a more sophisticated solution.
+#   Could also use more metadata such as num_features, num_models,
+#   or time_limit for a heuristic.
+#       num_features: The number of features in the dataset.
+#       num_models: The number of models in the portfolio to fit.
+#       time_limit: The time limit for fitting models.
+#   Pointer for non-heuristic approach:
+#       -> meta-learning like Auto-Sklearn 2.0, needs a lot of metadata
+def get_validation_and_stacking_method(
+    # Validation parameters
+    num_bag_folds: int | None,
+    num_bag_sets: int | None,
+    use_bag_holdout: bool | None,
+    holdout_frac: float | None,
+    # Stacking/Pipeline parameters
+    auto_stack: bool,
+    num_stack_levels: int | None,
+    dynamic_stacking: bool | None,
+    refit_full: bool | None,
+    # Metadata
+    num_train_rows: int,
+    problem_type: PROBLEM_TYPES,
+    hpo_enabled: bool,
+) -> tuple[int, int, int, bool, bool, float, bool]:
+    """Get the validation method for AutoGluon via a heuristic.
+
+    Input variables are `None` if they were not specified by the user or have an explicit default.
+
+    Parameters
+    ----------
+    num_bag_folds: int | None
+        The number of folds for cross-validation.
+    num_bag_sets: int | None
+        The number of repeats for cross-validation.
+    use_bag_holdout: bool | None
+        Whether to use (additional) holdout validation.
+    holdout_frac: float | None
+        The fraction of data to holdout for validation.
+    auto_stack: bool
+        Whether to automatically determine the stacking method.
+    num_stack_levels: int | None
+        The number of stacking levels.
+    dynamic_stacking: bool | None
+        Whether to use dynamic stacking.
+    refit_full: bool
+        Whether to refit the full training dataset.
+    num_train_rows: int
+        The number of rows in the training dataset.
+    problem_type: PROBLEM_TYPES
+        The type of problem to solve.
+    hpo_enabled: bool
+        If True, HPO is enabled during the run of AutoGluon.
+
+    Returns:
+    --------
+    Returns all variables needed to define the validation method.
+    """
+
+    cv_preset = _get_validation_preset(num_train_rows=num_train_rows, hpo_enabled=hpo_enabled)
+
+    # Independent of `auto_stack`
+    if use_bag_holdout is None:
+        use_bag_holdout = cv_preset["use_bag_holdout"]
+    if holdout_frac is None:
+        holdout_frac = cv_preset["holdout_frac"]
+    if dynamic_stacking is None:
+        dynamic_stacking = not use_bag_holdout
+    if refit_full is None:
+        refit_full = False
+
+    # Changed by `auto_stack`
+    if num_bag_folds is None:
+        # `num_bag_folds == 0` -> only use holdout validation
+        num_bag_folds = cv_preset["num_bag_folds"] if auto_stack else 0
+    if num_bag_sets is None:
+        # `num_bag_sets == 1` -> no repeats
+        num_bag_sets = cv_preset["num_bag_sets"] if auto_stack else 1
+    if num_stack_levels is None:
+        # Disable multi-layer stacking by default
+        num_stack_levels = 0
+
+        # Activate multi-layer stacking for `auto_stack` if
+        if auto_stack and (
+            dynamic_stacking  # -> We use dynamic stacking
+            or
+            # -> We have holdout validation or a non-binary problem with more than 750 training rows
+            ((use_bag_holdout or (problem_type != BINARY)) and (num_train_rows >= 750))
+        ):
+            num_stack_levels = 1
+
+    return (
+        num_bag_folds,
+        num_bag_sets,
+        num_stack_levels,
+        dynamic_stacking,
+        use_bag_holdout,
+        holdout_frac,
+        refit_full,
+    )
```
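The fold-count formula and the holdout threshold above are easy to sanity-check in isolation. Below is a minimal sketch that reproduces just the arithmetic of `_get_validation_preset`; the `holdout_frac` part is omitted because it delegates to `default_holdout_frac`, whose definition lives in autogluon.core and is not part of this diff:

```python
import math

USE_BAG_HOLDOUT_AUTO_THRESHOLD = 1_000_000  # same constant as in the new file

def sketch_validation_preset(num_train_rows: int) -> dict:
    # Fold count clipped to [5, 8]: roughly one fold per 10 rows for tiny datasets,
    # capped at 8 because 8-core CPUs are common.
    num_bag_folds = min(8, max(5, math.floor(num_train_rows / 10)))
    return {
        "num_bag_folds": num_bag_folds,
        "num_bag_sets": 1,  # repeats disabled: they tend to overfit the validation data
        "use_bag_holdout": num_train_rows >= USE_BAG_HOLDOUT_AUTO_THRESHOLD,
    }

for n in (40, 10_000, 2_000_000):
    print(n, sketch_validation_preset(n))
# 40        -> 5 folds, no bag holdout
# 10000     -> 8 folds, no bag holdout
# 2000000   -> 8 folds, bag holdout enabled
```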
autogluon/tabular/configs/presets_configs.py

```diff
@@ -6,7 +6,6 @@ tabular_presets_dict = dict(
     best_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "zeroshot",
         "time_limit": 3600,
     },
@@ -16,7 +15,6 @@ tabular_presets_dict = dict(
     high_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "zeroshot",
         "time_limit": 3600,
         "refit_full": True,
@@ -29,7 +27,6 @@ tabular_presets_dict = dict(
     good_quality={
         "auto_stack": True,
         "dynamic_stacking": "auto",
-        "num_bag_sets": 1,
         "hyperparameters": "light",
         "time_limit": 3600,
         "refit_full": True,
```
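For context, these preset dictionaries are what `TabularPredictor.fit` expands when a preset name is passed; after this change `num_bag_sets` is left unset so the new heuristic in pipeline_presets.py can fill it in. A typical invocation (standard AutoGluon API; `train_data` is assumed to be a caller-supplied DataFrame with a `class` label column):

```python
from autogluon.tabular import TabularPredictor

# "best_quality" no longer pins num_bag_sets; get_validation_and_stacking_method
# now derives it (along with folds and holdout settings) from the training data size.
predictor = TabularPredictor(label="class").fit(train_data, presets="best_quality")
```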
autogluon/tabular/models/__init__.py

```diff
@@ -3,6 +3,7 @@ from autogluon.core.models.abstract.abstract_model import AbstractModel
 from .automm.automm_model import MultiModalPredictorModel
 from .automm.ft_transformer import FTTransformerModel
 from .catboost.catboost_model import CatBoostModel
+from .ebm.ebm_model import EBMModel
 from .fastainn.tabular_nn_fastai import NNFastAiTabularModel
 from .fasttext.fasttext_model import FastTextModel
 from .image_prediction.image_predictor import ImagePredictorModel
```
autogluon/tabular/models/catboost/catboost_model.py

```diff
@@ -39,6 +39,7 @@ class CatBoostModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         SOFTCLASS: 60
     })
+    seed_name = "random_seed"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -48,7 +49,6 @@ class CatBoostModel(AbstractModel):
         default_params = get_param_baseline(problem_type=self.problem_type)
         for param, val in default_params.items():
             self._set_default_param_value(param, val)
-        self._set_default_param_value("random_seed", 0)  # Remove randomness for reproducibility
         # Set 'allow_writing_files' to True in order to keep log files created by catboost during training (these will be saved in the directory where AutoGluon stores this model)
         self._set_default_param_value("allow_writing_files", False)  # Disables creation of catboost logging files during training by default
         if self.problem_type != SOFTCLASS:  # TODO: remove this after catboost 0.24
@@ -126,6 +126,7 @@ class CatBoostModel(AbstractModel):

         ag_params = self._get_ag_params()
         params = self._get_model_params()
+
         params["thread_count"] = num_cpus
         if self.problem_type == SOFTCLASS:
             # FIXME: This is extremely slow due to unoptimized metric / objective sent to CatBoost
@@ -310,6 +311,8 @@ class CatBoostModel(AbstractModel):
         max_memory_iters = math.floor(available_mem * max_memory_proportion / mem_usage_per_iter)

         final_iters = min(default_iters, min(max_memory_iters, estimated_iters_in_time))
+        if final_iters < 1:
+            raise TimeLimitExceeded
         return final_iters

     def _predict_proba(self, X, **kwargs):
```

autogluon/tabular/models/ebm/__init__.py: added as an empty file (no content to diff).
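The second CatBoost change guards against a degenerate iteration budget: when memory and time constraints squeeze the allowed iterations below 1, the fit now aborts instead of training a useless 0-iteration booster. A minimal sketch of that logic, with names following the diff (the stand-in `TimeLimitExceeded` is an assumption for self-containment; the real exception comes from autogluon.core):

```python
import math

class TimeLimitExceeded(Exception):
    """Stand-in for autogluon.core's TimeLimitExceeded."""

def cap_iterations(default_iters: int, available_mem: float, max_memory_proportion: float,
                   mem_usage_per_iter: float, estimated_iters_in_time: int) -> int:
    # Iterations that fit within the memory budget.
    max_memory_iters = math.floor(available_mem * max_memory_proportion / mem_usage_per_iter)
    # Take the most restrictive of the default, memory, and time budgets.
    final_iters = min(default_iters, max_memory_iters, estimated_iters_in_time)
    if final_iters < 1:
        raise TimeLimitExceeded  # new behavior: fail fast rather than fit 0 iterations
    return final_iters
```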
autogluon/tabular/models/ebm/ebm_model.py (new file, +259)

```diff
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+from autogluon.core.models import AbstractModel
+
+from .hyperparameters.parameters import get_param_baseline
+from .hyperparameters.searchspaces import get_default_searchspace
+
+if TYPE_CHECKING:
+    from autogluon.core.metrics import Scorer
+
+
+class EbmCallback:
+    """Time limit callback for EBM."""
+
+    def __init__(self, seconds: float):
+        self.seconds = seconds
+        self.end_time: float | None = None
+
+    def __call__(self, *args, **kwargs):
+        if self.end_time is None:
+            self.end_time = time.monotonic() + self.seconds
+            return False
+        return time.monotonic() > self.end_time
+
+
+class EBMModel(AbstractModel):
+    """
+    The Explainable Boosting Machine (EBM) is a glass-box generalized additive model
+    with automatic interaction detection (https://interpret.ml/docs). EBMs are
+    designed to be highly interpretable while achieving accuracy comparable to
+    black-box models on a wide range of tabular datasets.
+
+    Requires the 'interpret' or 'interpret-core' package. Install via:
+
+        pip install interpret
+
+    Paper: InterpretML: A Unified Framework for Machine Learning Interpretability
+
+    Authors: H. Nori, S. Jenkins, P. Koch, and R. Caruana, 2019
+
+    Codebase: https://github.com/interpretml/interpret
+
+    License: MIT
+
+    .. versionadded:: 1.5.0
+    """
+
+    ag_key = "EBM"
+    ag_name = "EBM"
+    ag_priority = 35
+    seed_name = "random_state"
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        X_val: pd.DataFrame | None = None,
+        y_val: pd.Series | None = None,
+        time_limit: float | None = None,
+        sample_weight: np.ndarray | None = None,
+        sample_weight_val: np.ndarray | None = None,
+        num_cpus: int | str = "auto",
+        **kwargs,
+    ):
+        # Preprocess data.
+        X = self.preprocess(X)
+        if X_val is not None:
+            X_val = self.preprocess(X_val)
+
+        features = self._features
+        if features is None:
+            features = X.columns
+
+        params = construct_ebm_params(
+            self.problem_type,
+            self._get_model_params(),
+            features,
+            self.stopping_metric,
+            num_cpus,
+            time_limit,
+        )
+
+        # Init Class
+        model_cls = get_class_from_problem_type(self.problem_type)
+        self.model = model_cls(**params)
+
+        # Handle validation data format for EBM
+        fit_X = X
+        fit_y = y
+        fit_sample_weight = sample_weight
+        bags = None
+        if X_val is not None:
+            fit_X = pd.concat([X, X_val], ignore_index=True)
+            fit_y = pd.concat([y, y_val], ignore_index=True)
+            if sample_weight is not None:
+                fit_sample_weight = np.hstack([sample_weight, sample_weight_val])
+            bags = np.full((len(fit_X), 1), 1, np.int8)
+            bags[len(X) :, 0] = -1
+
+        with warnings.catch_warnings():  # try to filter joblib warnings
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message=".*resource_tracker: process died.*",
+            )
+            self.model.fit(fit_X, fit_y, sample_weight=fit_sample_weight, bags=bags)
+
+    def _set_default_params(self):
+        default_params = get_param_baseline(problem_type=self.problem_type, num_classes=self.num_classes)
+        for param, val in default_params.items():
+            self._set_default_param_value(param, val)
+
+    def _get_default_searchspace(self):
+        return get_default_searchspace(problem_type=self.problem_type, num_classes=self.num_classes)
+
+    def _get_default_auxiliary_params(self) -> dict:
+        default_auxiliary_params = super()._get_default_auxiliary_params()
+        extra_auxiliary_params = {
+            "valid_raw_types": ["int", "float", "category"],
+        }
+        default_auxiliary_params.update(extra_auxiliary_params)
+        return default_auxiliary_params
+
+    @classmethod
+    def supported_problem_types(cls) -> list[str] | None:
+        return ["binary", "multiclass", "regression"]
+
+    @classmethod
+    def _class_tags(cls) -> dict:
+        return {"can_estimate_memory_usage_static": True}
+
+    def _more_tags(self) -> dict:
+        """EBMs support refit full."""
+        return {"can_refit_full": True}
+
+    def _estimate_memory_usage(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> int:
+        return self.estimate_memory_usage_static(
+            X=X,
+            y=y,
+            hyperparameters=self._get_model_params(),
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            features=self._features,
+            **kwargs,
+        )
+
+    @classmethod
+    def _estimate_memory_usage_static(
+        cls,
+        *,
+        X: pd.DataFrame,
+        y: pd.Series | None = None,
+        hyperparameters: dict | None = None,
+        problem_type: str = "infer",
+        num_classes: int = 1,
+        features=None,
+        **kwargs,
+    ) -> int:
+        """Returns the expected peak memory usage in bytes of the EBM model during fit."""
+        # TODO: we can improve the memory estimate slightly by using num_classes if y is None
+
+        if features is None:
+            features = X.columns
+
+        model_cls = get_class_from_problem_type(problem_type)
+        params = construct_ebm_params(problem_type, hyperparameters, features)
+        baseline_memory_bytes = 400_000_000  # 400 MB baseline memory
+
+        # assuming we call pd.concat([X, X_val], ignore_index=True), then X size will be doubled
+        return baseline_memory_bytes + model_cls(**params).estimate_mem(
+            X, y, data_multiplier=2.0
+        )
+
+    def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs):
+        # Given the good mem estimates with overhead, we set the threshold to 1.
+        return super()._validate_fit_memory_usage(
+            mem_error_threshold=mem_error_threshold, **kwargs
+        )
+
+
```
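The `bags` array built in `_fit` above is how interpret's EBM receives a fixed train/validation split: a single "outer bag" column in which, as the diff uses it, `1` marks training rows and `-1` marks validation rows. A standalone illustration of just that marking, with hypothetical sizes:

```python
import numpy as np
import pandas as pd

# Hypothetical 6-row train set and 2-row validation set.
X = pd.DataFrame({"a": range(6)})
X_val = pd.DataFrame({"a": range(6, 8)})

fit_X = pd.concat([X, X_val], ignore_index=True)
bags = np.full((len(fit_X), 1), 1, np.int8)  # one outer bag; all rows start as training rows
bags[len(X):, 0] = -1                        # rows appended from X_val become validation rows

print(bags.ravel())  # [ 1  1  1  1  1  1 -1 -1]
```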
(autogluon/tabular/models/ebm/ebm_model.py, continued)

```diff
+def construct_ebm_params(
+    problem_type,
+    hyperparameters=None,
+    features=None,
+    stopping_metric=None,
+    num_cpus=-1,
+    time_limit=None,
+):
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    hyperparameters = hyperparameters.copy()  # we pop values below, so copy.
+
+    # The user can specify nominal and continuous columns.
+    continuous_columns = hyperparameters.pop("continuous_columns", [])
+    nominal_columns = hyperparameters.pop("nominal_columns", [])
+
+    feature_types = None
+    if features is not None:
+        feature_types = []
+        for c in features:
+            if c in continuous_columns:
+                f_type = "continuous"
+            elif c in nominal_columns:
+                f_type = "nominal"
+            else:
+                f_type = "auto"
+            feature_types.append(f_type)
+
+    # Default parameters for EBM
+    params = {
+        "outer_bags": 1,  # AutoGluon ensemble creates outer bags, no need for this overhead.
+        "n_jobs": 1,  # EBM only parallelizes across outer bags currently, so ignore num_cpus
+        "feature_names": features,
+        "feature_types": feature_types,
+    }
+    if stopping_metric is not None:
+        params["objective"] = get_metric_from_ag_metric(
+            metric=stopping_metric, problem_type=problem_type
+        )
+    if time_limit is not None:
+        params["callback"] = EbmCallback(time_limit)
+
+    params.update(hyperparameters)
+    return params
+
+
+def get_class_from_problem_type(problem_type: str):
+    if problem_type in [BINARY, MULTICLASS]:
+        from interpret.glassbox import ExplainableBoostingClassifier
+
+        model_cls = ExplainableBoostingClassifier
+    elif problem_type == REGRESSION:
+        from interpret.glassbox import ExplainableBoostingRegressor
+
+        model_cls = ExplainableBoostingRegressor
+    else:
+        raise ValueError(f"Unsupported problem type: {problem_type}")
+    return model_cls
+
+
+def get_metric_from_ag_metric(*, metric: Scorer, problem_type: str):
+    """Map AutoGluon metric to EBM metric for early stopping."""
+    if problem_type in [BINARY, MULTICLASS]:
+        metric_class = "log_loss"
+    elif problem_type == REGRESSION:
+        metric_class = "rmse"
+    else:
+        raise AssertionError(f"EBM does not support {problem_type} problem type.")
+
+    return metric_class
```

autogluon/tabular/models/ebm/hyperparameters/__init__.py: added as an empty file (no content to diff).
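Once registered (see the models/__init__.py import above and the _ag_model_registry.py change in the file list), the new model should be reachable through its `ag_key`. A sketch of opting into it, assuming a `train_data` DataFrame with a `label` column:

```python
from autogluon.tabular import TabularPredictor

# "EBM" is the ag_key defined above; an empty dict requests the
# defaults supplied by get_param_baseline().
predictor = TabularPredictor(label="label").fit(
    train_data,
    hyperparameters={"EBM": {}},
)
```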
autogluon/tabular/models/ebm/hyperparameters/parameters.py (new file, +39)

```diff
@@ -0,0 +1,39 @@
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
+
+def get_param_baseline(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_param_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == SOFTCLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_param_regression_baseline()
+    else:
+        return get_param_binary_baseline()
+
+
+def get_base_params():
+    base_params = {}
+    return base_params
+
+
+def get_param_binary_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_multiclass_baseline(num_classes):
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_regression_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
```
autogluon/tabular/models/ebm/hyperparameters/searchspaces.py (new file, +72)

```diff
@@ -0,0 +1,72 @@
+"""Default hyperparameter search spaces used in EBM model"""
+
+from autogluon.common import space
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+
+def get_default_searchspace(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_searchspace_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_searchspace_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_searchspace_regression_baseline()
+    else:
+        return get_searchspace_binary_baseline()
+
+
+def get_base_searchspace():
+    base_params = {
+        "max_leaves": space.Int(2, 3, default=2),
+        "smoothing_rounds": space.Int(0, 1000, default=200),
+        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
+        "interactions": space.Categorical(
+            0,
+            "0.5x",
+            "1x",
+            "1.5x",
+            "2x",
+            "2.5x",
+            "3x",
+            "3.5x",
+            "4x",
+            "4.5x",
+            "5x",
+            "6x",
+            "7x",
+            "8x",
+            "9x",
+            "10x",
+            "15x",
+            "20x",
+            "25x",
+        ),
+        "interaction_smoothing_rounds": space.Int(0, 200, default=90),
+        "min_hessian": space.Real(1e-10, 1e-2, default=1e-4, log=True),
+        "min_samples_leaf": space.Int(2, 20, default=4),
+        "gain_scale": space.Real(0.5, 5.0, default=5.0, log=True),
+        "min_cat_samples": space.Int(5, 20, default=10),
+        "cat_smooth": space.Real(5.0, 100.0, default=10.0, log=True),
+        "missing": space.Categorical("separate", "low", "high", "gain"),
+    }
+    return base_params
+
+
+def get_searchspace_multiclass_baseline(num_classes):
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_binary_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_regression_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
```
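These spaces are what `EBMModel._get_default_searchspace` returns during hyperparameter tuning. The same `space` primitives can also be passed explicitly; a sketch, again assuming a `train_data` DataFrame with a `label` column:

```python
from autogluon.common import space
from autogluon.tabular import TabularPredictor

predictor = TabularPredictor(label="label").fit(
    train_data,
    hyperparameters={"EBM": {
        # Override two of the default spaces shown above.
        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
        "max_leaves": space.Int(2, 3, default=2),
    }},
    hyperparameter_tune_kwargs="auto",  # enables HPO over the declared spaces
)
```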
autogluon/tabular/models/fastainn/tabular_nn_fastai.py

```diff
@@ -103,6 +103,7 @@ class NNFastAiTabularModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         MULTICLASS: 95,
     })
+    seed_name = "random_seed"

     model_internals_file_name = "model-internals.pkl"

@@ -322,8 +323,9 @@ class NNFastAiTabularModel(AbstractModel):
         # Make deterministic
         from fastai.torch_core import set_seed

-
-
+        random_seed = params.pop(self.seed_name, self.default_random_seed)
+        set_seed(random_seed, True)
+        dls.rng.seed(random_seed)

         if self.problem_type == QUANTILE:
             dls.c = len(self.quantile_levels)
```

(The two bare `-` lines reflect removed lines whose content is not rendered in the source diff view.)
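The fastai change follows the same `seed_name` convention added to CatBoostModel above: the seed is popped out of the model's params so it is not forwarded to the underlying learner, with a model-level default as fallback. In isolation (the `default_random_seed` fallback value here is an assumption; the diff only shows the attribute being read):

```python
params = {"bs": 256, "random_seed": 42}  # hypothetical fastai model params

seed_name = "random_seed"
default_random_seed = 0  # assumed fallback

random_seed = params.pop(seed_name, default_random_seed)
print(random_seed)  # 42
print(params)       # {'bs': 256} -- the seed is consumed, not passed through to fastai
```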
autogluon/tabular/models/knn/knn_model.py

```diff
@@ -214,7 +214,7 @@ class KNNModel(AbstractModel):
         def sample_func(chunk, frac):
             # Guarantee at least 1 sample (otherwise log_loss would crash or model would return different column counts in pred_proba)
             n = max(math.ceil(len(chunk) * frac), 1)
-            return chunk.sample(n=n, replace=False, random_state=
+            return chunk.sample(n=n, replace=False, random_state=self.random_seed)

         if self.problem_type != REGRESSION:
             y_df = y.to_frame(name="label").reset_index(drop=True)
@@ -255,9 +255,13 @@ class KNNModel(AbstractModel):
         self._X_unused_index = [i for i in range(num_rows_max) if i not in idx]
         return self.model

-    def _get_maximum_resources(self) ->
+    def _get_maximum_resources(self) -> dict[str, int | float]:
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
-
+        # no GPU support
+        return {
+            "num_cpus": 32,
+            "num_gpus": 0,
+        }

     def _get_default_resources(self):
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
```

(The `-` lines that end abruptly or are empty reflect old-side content truncated in the source diff view; the section itself ends mid-hunk.)