PyPI - autogluon.tabular - Versions diffs - 1.4.1b20251014__py3-none-any.whl → 1.5.0b20251222__py3-none-any.whl - Mend

autogluon.tabular 1.4.1b20251014__py3-none-any.whl → 1.5.0b20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

autogluon/tabular/models/tabprep/prep_mixin.py ADDED Viewed

@@ -0,0 +1,220 @@
+from __future__ import annotations
+import logging
+from typing import Type
+import numpy as np
+import pandas as pd
+from autogluon.features import ArithmeticFeatureGenerator
+from autogluon.features import CategoricalInteractionFeatureGenerator
+from autogluon.features import OOFTargetEncodingFeatureGenerator
+from autogluon.features import BulkFeatureGenerator
+from autogluon.features.generators.abstract import AbstractFeatureGenerator
+logger = logging.getLogger(__name__)
+# TODO: In future we can have a feature generator registry like what is done for models
+# Feature generator classes that can be looked up by their class name.
+_feature_generator_class_lst = [
+    ArithmeticFeatureGenerator,
+    CategoricalInteractionFeatureGenerator,
+    OOFTargetEncodingFeatureGenerator,
+]
+# Lookup table: class name (str) -> feature generator class.
+_feature_generator_class_map = {generator_cls.__name__: generator_cls for generator_cls in _feature_generator_class_lst}
+def _recursive_expand_prep_param(prep_param: tuple | list[list | tuple]) -> list[tuple]:
+    """
+    Flatten a (possibly nested) `prep_params` value into a flat list of generator specs.
+    A generator spec is either a tuple, or a 2-element list whose first element is a
+    feature generator class name (str), a feature generator class, or a feature generator instance.
+    Every other list is treated as a container and expanded recursively, depth-first.
+    Parameters
+    ----------
+    prep_param : tuple or list
+        A single generator spec or an arbitrarily nested list of specs.
+    Returns
+    -------
+    list
+        The generator specs in encounter order. Specs are returned as given (not copied or converted).
+    Raises
+    ------
+    ValueError
+        If `prep_param`, or any nested element, is neither a list nor a tuple.
+    """
+    if isinstance(prep_param, tuple):
+        return [prep_param]
+    if not isinstance(prep_param, list):
+        raise ValueError(f"Invalid value for prep_param: {prep_param}")
+    if len(prep_param) == 2:
+        head = prep_param[0]
+        is_generator_spec = (
+            isinstance(head, str)
+            or isinstance(head, AbstractFeatureGenerator)
+            # `_init_preprocessor` accepts generator classes, so `[GeneratorCls, {...}]` is a spec as well;
+            # previously the class was recursed into as if it were a nested list and raised ValueError.
+            or (isinstance(head, type) and issubclass(head, AbstractFeatureGenerator))
+        )
+        if is_generator_spec:
+            return [prep_param]
+    out = []
+    for p in prep_param:
+        out += _recursive_expand_prep_param(p)
+    return out
+# FIXME: Why is preprocessing twice as slow per fold when bagging LightGBM??? Need to investigate. Try sequential fold fit
+# TODO: Why is `prep_params` a dict instead of a list?
+class ModelAgnosticPrepMixin:
+    """
+    Mixin that runs the feature generators configured via the `prep_params` ag param
+    before delegating to the parent class' `_preprocess`.
+    The host class is expected to provide `_get_ag_params`, `_get_model_params`, `problem_type`,
+    `num_classes`, `random_seed`, `_feature_metadata` and `estimate_memory_usage_static`.
+    """
+    @staticmethod
+    def _is_generator_spec(prep_param) -> bool:
+        """
+        Return True if `prep_param` describes a single generator, False if it is a list of further specs.
+        Raises ValueError if `prep_param` is neither a list nor a tuple.
+        """
+        if isinstance(prep_param, tuple):
+            return True
+        if not isinstance(prep_param, list):
+            raise ValueError(f"Invalid value for prep_param: {prep_param}")
+        if len(prep_param) != 2:
+            return False
+        head = prep_param[0]
+        return (
+            isinstance(head, str)
+            or isinstance(head, AbstractFeatureGenerator)
+            # `_init_preprocessor` accepts generator classes, so `[GeneratorCls, {...}]` is a spec too.
+            or (isinstance(head, type) and issubclass(head, AbstractFeatureGenerator))
+        )
+    def _estimate_dtypes_after_preprocessing(self, X: pd.DataFrame, **kwargs) -> tuple[int, int, int]:
+        """
+        Estimate the feature counts of `X` after the configured feature generators are applied.
+        Returns
+        -------
+        (n_numeric, n_categorical, n_binary)
+            Starting from the counts observed in `X` (numeric columns with > 2 unique values,
+            non-numeric columns, numeric columns with <= 2 unique values), each configured
+            generator's `estimate_new_dtypes` is applied in order.
+        Raises
+        ------
+        ValueError
+            If a spec names a generator that is not in `_feature_generator_class_map`.
+        """
+        prep_params = self._get_ag_params().get("prep_params", None)
+        if prep_params is None:
+            prep_params = []
+        # FIXME: Temporarily simplify for memory calculation
+        prep_params = _recursive_expand_prep_param(prep_params)
+        X_nunique = X.nunique().values
+        n_categorical = X.select_dtypes(exclude=[np.number]).shape[1]
+        n_numeric = X.loc[:, X_nunique > 2].select_dtypes(include=[np.number]).shape[1]
+        # NOTE: It can happen that features have less than two unique values if cleaning is applied before the bagging, i.e. Bioresponse
+        n_binary = X.loc[:, X_nunique <= 2].select_dtypes(include=[np.number]).shape[1]
+        assert n_numeric + n_categorical + n_binary == X.shape[1]  # NOTE: For debugging, to be removed later
+        for preprocessor_cls_name, init_params in prep_params:
+            # Resolve through the shared name -> class map instead of a hand-maintained if/elif chain.
+            generator_cls = None
+            if isinstance(preprocessor_cls_name, str):
+                generator_cls = _feature_generator_class_map.get(preprocessor_cls_name)
+            if generator_cls is None:
+                raise ValueError(f"Unknown preprocessor class name: {preprocessor_cls_name}")
+            # `init_params` may be None (as `_init_preprocessor` allows); treat it as "no extra params".
+            prep_cls = generator_cls(target_type=self.problem_type, **(init_params or {}))
+            n_numeric, n_categorical, n_binary = prep_cls.estimate_new_dtypes(
+                n_numeric, n_categorical, n_binary, num_classes=self.num_classes
+            )
+        return n_numeric, n_categorical, n_binary
+    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
+        """
+        Estimate memory usage based on the feature counts expected after preprocessing.
+        Uses `_estimate_memory_usage_static_lite` when the host class defines it; otherwise builds a
+        random stand-in DataFrame with the estimated column counts and passes it to
+        `estimate_memory_usage_static`.
+        """
+        hyperparameters = self._get_model_params()
+        n_numeric, n_categorical, n_binary = self._estimate_dtypes_after_preprocessing(X=X, **kwargs)
+        if hasattr(self, "_estimate_memory_usage_static_lite"):
+            return self._estimate_memory_usage_static_lite(
+                num_samples=X.shape[0],
+                num_features=n_numeric + n_categorical + n_binary,
+                num_bytes_per_cell=4,
+                hyperparameters=hyperparameters,
+                problem_type=self.problem_type,
+                num_classes=self.num_classes,
+                **kwargs,
+            )
+        # TODO: Replace with memory estimation logic based on no. of features instead of dataframe generation
+        n_rows = X.shape[0]
+        df_lst = []
+        if n_numeric > 0:
+            df_lst.append(pd.DataFrame(np.random.random(size=[n_rows, n_numeric]).astype(np.float32)))
+        if n_categorical > 0:
+            mean_cardinality = X.select_dtypes(exclude=[np.number]).nunique().mean()
+            # The mean is NaN when X has no non-numeric columns and can be 0 for all-missing columns;
+            # `int(NaN)` and `randint(0, 0)` both raise, so fall back to a cardinality of 1.
+            cardinality = 1 if pd.isna(mean_cardinality) else max(int(mean_cardinality), 1)
+            df_lst.append(pd.DataFrame(np.random.randint(0, cardinality, [n_rows, n_categorical]).astype('str')))
+        if n_binary > 0:
+            df_lst.append(pd.DataFrame(np.random.randint(0, 2, [n_rows, n_binary]).astype(np.int8)))
+        X = pd.concat(df_lst, ignore_index=True, axis=1)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )
+    def _init_preprocessor(
+            self,
+            preprocessor_cls: Type[AbstractFeatureGenerator] | str,
+            init_params: dict | None,
+    ) -> AbstractFeatureGenerator:
+        """
+        Instantiate one feature generator.
+        `preprocessor_cls` may be a class or a class name present in `_feature_generator_class_map`
+        (an unknown name raises KeyError). `init_params` override the defaults set below.
+        """
+        if isinstance(preprocessor_cls, str):
+            preprocessor_cls = _feature_generator_class_map[preprocessor_cls]
+        if init_params is None:
+            init_params = {}
+        _init_params = dict(
+            verbosity=0,
+            random_state=self.random_seed,  # FIXME: Not a generic param
+            target_type=self.problem_type,  # FIXME: Not a generic param
+        )
+        _init_params.update(**init_params)
+        return preprocessor_cls(
+            **_init_params,
+        )
+    def _recursive_init_preprocessors(self, prep_param: tuple | list[list | tuple]) -> AbstractFeatureGenerator | list:
+        """
+        Instantiate the generators described by `prep_param`, preserving its nesting.
+        Returns a single generator when `prep_param` is one generator spec, otherwise a list with one
+        entry (generator or nested list) per element. Raises ValueError for values that are neither
+        a list nor a tuple.
+        """
+        if not self._is_generator_spec(prep_param):
+            return [self._recursive_init_preprocessors(p) for p in prep_param]
+        assert len(prep_param) == 2
+        preprocessor_cls, init_params = prep_param
+        return self._init_preprocessor(preprocessor_cls=preprocessor_cls, init_params=init_params)
+    def get_preprocessors(self) -> list[AbstractFeatureGenerator]:
+        """
+        Build the preprocessors configured in the `prep_params` ag param.
+        Returns an empty list when nothing is configured, the generator itself (wrapped in a list)
+        when exactly one is configured, and otherwise a single `BulkFeatureGenerator` wrapping all of them.
+        """
+        ag_params = self._get_ag_params()
+        prep_params = ag_params.get("prep_params", None)
+        passthrough_types = ag_params.get("prep_params.passthrough_types", None)
+        if not prep_params:  # covers both None and empty containers
+            return []
+        preprocessors = self._recursive_init_preprocessors(prep_param=prep_params)
+        if isinstance(preprocessors, AbstractFeatureGenerator):
+            # `prep_params` was itself a single generator spec, so a bare generator came back instead of a list.
+            return [preprocessors]
+        if len(preprocessors) == 0:
+            return []
+        if len(preprocessors) == 1 and isinstance(preprocessors[0], AbstractFeatureGenerator):
+            return preprocessors
+        return [BulkFeatureGenerator(
+            generators=preprocessors,
+            # TODO: "false_recursive" technically can slow down inference, but need to optimize `True` first
+            #  Refer to `Bioresponse` dataset where setting to `True` -> 200s fit time vs `false_recursive` -> 1s fit time
+            remove_unused_features="false_recursive",
+            post_drop_duplicates=True,
+            passthrough=True,
+            passthrough_types=passthrough_types,
+            verbosity=0,
+        )]
+    def _preprocess(self, X: pd.DataFrame, y=None, is_train: bool = False, **kwargs):
+        """
+        Apply the configured preprocessors to `X`, then delegate to the parent `_preprocess`.
+        When `is_train` is True the preprocessors are (re)created and fit on `X` and `y`, and
+        `_feature_metadata` / `_features_internal` are updated from the last preprocessor's output metadata.
+        Otherwise the previously fit preprocessors in `self.preprocessors` are applied with `transform`.
+        """
+        if is_train:
+            self.preprocessors = self.get_preprocessors()
+            if self.preprocessors:
+                assert y is not None, f"y must be specified to fit preprocessors... Likely the inheriting class isn't passing `y` in its `preprocess` call."
+                # FIXME: add `post_drop_useless`, example: anneal has many useless features
+                feature_metadata_in = self._feature_metadata
+                for prep in self.preprocessors:
+                    X = prep.fit_transform(X, y, feature_metadata_in=feature_metadata_in)
+                    # FIXME: Nick: This is incorrect because it strips away special dtypes. Need to do this properly by fixing in the preprocessors
+                    feature_metadata_in = prep.feature_metadata
+                self._feature_metadata = feature_metadata_in
+                self._features_internal = self._feature_metadata.get_features()
+        else:
+            for prep in self.preprocessors:
+                X = prep.transform(X)
+        return super()._preprocess(X, y=y, is_train=is_train, **kwargs)

autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py CHANGED Viewed

@@ -50,6 +50,7 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
     ag_key = "NN_TORCH"
     ag_name = "NeuralNetTorch"
     ag_priority = 25
+    seed_name = "seed_value"
     # Constants used throughout this class:
     unique_category_str = np.nan  # string used to represent missing values and unknown categories for categorical features.
@@ -164,9 +165,6 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         return processor_kwargs, optimizer_kwargs, fit_kwargs, loss_kwargs, params
-    def _get_random_seed_from_hyperparameters(self, hyperparameters: dict) -> int | None | str:
-        return hyperparameters.get("seed_value", "N/A")
     def _fit(
         self,
         X: pd.DataFrame,
@@ -194,7 +192,7 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         processor_kwargs, optimizer_kwargs, fit_kwargs, loss_kwargs, params = self._prepare_params(params=params)
-        seed_value = self.random_seed
+        seed_value = params.pop(self.seed_name, self.default_random_seed)
         self._num_cpus_infer = params.pop("_num_cpus_infer", 1)
         if seed_value is not None:  # Set seeds
@@ -373,7 +371,6 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         best_epoch = 0
         best_val_metric = -np.inf  # higher = better
         best_val_update = 0
-        val_improve_epoch = 0  # most recent epoch where validation-score strictly improved
         start_fit_time = time.time()
         if time_limit is not None:
             time_limit = time_limit - (start_fit_time - start_time)
@@ -498,7 +495,7 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
             if time_limit is not None:
                 time_elapsed = time.time() - start_fit_time
-                time_epoch_average = time_elapsed / (epoch + 1)
+                time_epoch_average = time_elapsed / max(epoch, 1)  # avoid divide by 0
                 time_left = time_limit - time_elapsed
                 if time_left < time_epoch_average:
                     logger.log(20, f"\tRan out of time, stopping training early. (Stopping on epoch {epoch})")

autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py CHANGED Viewed

@@ -37,10 +37,18 @@ def create_preprocessor(
             steps=[("ordinal", OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))]
         )  # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
         transformers.append(("ordinal", ordinal_transformer, embed_features))
-    return ColumnTransformer(
-        transformers=transformers, remainder="passthrough", force_int_remainder_cols=False,
-    )  # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.
+    try:
+        out = ColumnTransformer(
+            transformers=transformers, remainder="passthrough", force_int_remainder_cols=False,
+        )  # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.
+    except:
+        # TODO: Avoid try/except once scikit-learn 1.5 is minimum
+        # Needed for scikit-learn 1.4 and 1.9+, force_int_remainder_cols is deprecated in 1.7 and introduced in 1.5
+        # ref: https://github.com/autogluon/autogluon/issues/5289
+        out = ColumnTransformer(
+            transformers=transformers, remainder="passthrough",
+        )  # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.
+    return out
 def convert_df_dtype_to_str(df):
     return df.astype(str)

autogluon/tabular/models/xgboost/xgboost_model.py CHANGED Viewed

@@ -32,6 +32,7 @@ class XGBoostModel(AbstractModel):
     ag_key = "XGB"
     ag_name = "XGBoost"
     ag_priority = 40
+    seed_name = "seed"
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -75,15 +76,11 @@ class XGBoostModel(AbstractModel):
         return X
-    def _get_random_seed_from_hyperparameters(self, hyperparameters: dict) -> int | None | str:
-        return hyperparameters.get("seed", "N/A")
     def _fit(self, X, y, X_val=None, y_val=None, time_limit=None, num_gpus=0, num_cpus=None, sample_weight=None, sample_weight_val=None, verbosity=2, **kwargs):
         # TODO: utilize sample_weight_val in early-stopping if provided
         start_time = time.time()
         ag_params = self._get_ag_params()
         params = self._get_model_params()
-        params["seed"] = self.random_seed
         generate_curves = ag_params.get("generate_curves", False)
         if generate_curves:
@@ -125,6 +122,8 @@ class XGBoostModel(AbstractModel):
             if eval_metric is not None:
                 params["eval_metric"] = eval_metric
                 eval_metric_name = eval_metric.__name__ if not isinstance(eval_metric, str) else eval_metric
+        else:
+            eval_metric_name = params["eval_metric"].__name__ if not isinstance(params["eval_metric"], str) else params["eval_metric"]
         if X_val is None:
             early_stopping_rounds = None

autogluon/tabular/predictor/predictor.py CHANGED Viewed

@@ -19,7 +19,10 @@ from packaging import version
 from autogluon.common import FeatureMetadata, TabularDataset
 from autogluon.common.loaders import load_json
 from autogluon.common.savers import save_json
+from autogluon.common.utils.cv_splitter import CVSplitter
+from autogluon.common.utils.decorators import apply_presets
 from autogluon.common.utils.file_utils import get_directory_size, get_directory_size_per_file
+from autogluon.common.utils.resource_utils import ResourceManager, get_resource_manager
 from autogluon.common.utils.hyperparameter_utils import get_hyperparameter_str_deprecation_msg, is_advanced_hyperparameter_format
 from autogluon.common.utils.log_utils import add_log_to_file, set_logger_verbosity, warn_if_mlflow_autologging_is_enabled
 from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
@@ -45,10 +48,9 @@ from autogluon.core.pseudolabeling.pseudolabeling import filter_ensemble_pseudo,
 from autogluon.core.scheduler.scheduler_factory import scheduler_factory
 from autogluon.core.stacked_overfitting.utils import check_stacked_overfitting_from_leaderboard
 from autogluon.core.utils import get_pred_from_proba_df, plot_performance_vs_trials, plot_summary_of_models, plot_tabular_models
-from autogluon.core.utils.decorators import apply_presets
 from autogluon.core.utils.loaders import load_pkl, load_str
 from autogluon.core.utils.savers import save_pkl, save_str
-from autogluon.core.utils.utils import CVSplitter, generate_train_test_split_combined
+from autogluon.core.utils.utils import generate_train_test_split_combined
 from ..configs.feature_generator_presets import get_default_feature_generator
 from ..configs.hyperparameter_configs import get_hyperparameter_config
@@ -421,7 +423,7 @@ class TabularPredictor:
         num_gpus: int | str = "auto",
         fit_strategy: Literal["sequential", "parallel"] = "sequential",
         memory_limit: float | str = "auto",
-        callbacks: list[AbstractCallback] = None,
+        callbacks: list[AbstractCallback | list | tuple] = None,
         **kwargs,
     ) -> "TabularPredictor":
         """
@@ -462,16 +464,23 @@ class TabularPredictor:
             It is recommended to only use one `quality` based preset in a given call to `fit()` as they alter many of the same arguments and are not compatible with each-other.
             In-depth Preset Info:
-                extreme_quality={"auto_stack": True, "dynamic_stacking": "auto", "_experimental_dynamic_hyperparameters": True, "hyperparameters": None}
-                    Significantly more accurate than `best_quality` on datasets <= 30000 samples. Requires a GPU for best results.
-                    For datasets <= 30000 samples, will use recent tabular foundation models TabPFNv2, TabICL, and Mitra to maximize performance.
-                    For datasets > 30000 samples, will behave identically to `best_quality`.
+                extreme_quality={...}
+                    New in v1.5: The state-of-the-art for tabular machine learning.
+                    Requires `pip install autogluon.tabular[tabarena]` to install TabPFN, TabICL, and TabDPT.
+                    Significantly more accurate than `best_quality` on datasets <= 100000 samples. Requires a GPU.
+                    Will use recent tabular foundation models TabPFNv2, TabICL, TabDPT, and Mitra to maximize performance.
                     Recommended for applications that benefit from the best possible model accuracy.
+                best_quality_v150={...}
+                    New in v1.5: Better quality than 'best_quality' and 5x+ faster to train. Give it a try!
                 best_quality={'auto_stack': True, 'dynamic_stacking': 'auto', 'hyperparameters': 'zeroshot'}
                     Best predictive accuracy with little consideration to inference time or disk usage. Achieve even better results by specifying a large time_limit value.
                     Recommended for applications that benefit from the best possible model accuracy.
+                high_quality_v150={...}
+                    New in v1.5: Better quality than 'high_quality' and 5x+ faster to train. Give it a try!
                 high_quality={'auto_stack': True, 'dynamic_stacking': 'auto', 'hyperparameters': 'zeroshot', 'refit_full': True, 'set_best_to_refit_full': True, 'save_bag_folds': False}
                     High predictive accuracy with fast inference. ~8x faster inference and ~8x lower disk usage than `best_quality`.
                     Recommended for applications that require reasonable inference speed and/or model size.
@@ -1091,7 +1100,8 @@ class TabularPredictor:
             elif verbosity >= 4:
                 logger.log(20, f"Verbosity: {verbosity} (Maximum Logging)")
-        include_gpu_count = verbosity >= 3
+        resource_manager: ResourceManager = get_resource_manager()
+        include_gpu_count = resource_manager.get_gpu_count_torch() or verbosity >= 3
         sys_msg = get_ag_system_info(path=self.path, include_gpu_count=include_gpu_count)
         logger.log(20, sys_msg)
@@ -1104,11 +1114,13 @@ class TabularPredictor:
                 20,
                 "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...\n"
                 "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n"
-                "\tpresets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.\n"
-                "\tpresets='best'    : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.\n"
-                "\tpresets='high'    : Strong accuracy with fast inference speed.\n"
-                "\tpresets='good'    : Good accuracy with very fast inference speed.\n"
-                "\tpresets='medium'  : Fast training time, ideal for initial prototyping.",
+                "\tpresets='extreme'  : New in v1.5: The state-of-the-art for tabular data. Massively better than 'best' on datasets <100000 samples by using new Tabular Foundation Models (TFMs) meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, TabDPT, and TabM. Requires a GPU and `pip install autogluon.tabular[tabarena]` to install TabPFN, TabICL, and TabDPT.\n"
+                "\tpresets='best'     : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.\n"
+                "\tpresets='best_v150': New in v1.5: Better quality than 'best' and 5x+ faster to train. Give it a try!\n"
+                "\tpresets='high'     : Strong accuracy with fast inference speed.\n"
+                "\tpresets='high_v150': New in v1.5: Better quality than 'high' and 5x+ faster to train. Give it a try!\n"
+                "\tpresets='good'     : Good accuracy with very fast inference speed.\n"
+                "\tpresets='medium'   : Fast training time, ideal for initial prototyping.",
             )
         kwargs_orig = kwargs.copy()
@@ -1162,7 +1174,7 @@ class TabularPredictor:
         # TODO: Temporary for v1.4. Make this more extensible for v1.5 by letting users make their own dynamic hyperparameters.
         dynamic_hyperparameters = kwargs["_experimental_dynamic_hyperparameters"]
         if dynamic_hyperparameters:
-            logger.log(20, f"`extreme` preset uses a dynamic portfolio based on dataset size...")
+            logger.log(20, f"`extreme_v140` preset uses a dynamic portfolio based on dataset size...")
             assert hyperparameters is None, f"hyperparameters must be unspecified when `_experimental_dynamic_hyperparameters=True`."
             n_samples = len(train_data)
             if n_samples > 30000:
@@ -1591,6 +1603,25 @@ class TabularPredictor:
         memory_safe_fits = ds_fit_kwargs.get("memory_safe_fits", True)
         enable_ray_logging = ds_fit_kwargs.get("enable_ray_logging", True)
         normal_fit = False
+        total_resources = ag_fit_kwargs["core_kwargs"]["total_resources"]
+        if memory_safe_fits == "auto":
+            num_gpus = total_resources.get("num_gpus", "auto")
+            if num_gpus == "auto":
+                num_gpus = ResourceManager.get_gpu_count_torch()
+                if num_gpus > 0:
+                    logger.log(
+                        30,
+                        f"DyStack: Disabling memory safe fit mode in DyStack "
+                        f"because GPUs were detected and num_gpus='auto' (GPUs cannot be used in memory safe fit mode). "
+                        f"If you want to use memory safe fit mode, manually set `num_gpus=0`."
+                    )
+            if num_gpus > 0:
+                memory_safe_fits = False
+            else:
+                memory_safe_fits = True
         if memory_safe_fits:
             try:
                 _ds_ray = try_import_ray()
@@ -1630,9 +1661,6 @@ class TabularPredictor:
             if _ds_ray is not None:
                 # Handle resources
                 # FIXME: what about distributed?
-                from autogluon.common.utils.resource_utils import ResourceManager
-                total_resources = ag_fit_kwargs["core_kwargs"]["total_resources"]
                 num_cpus = total_resources.get("num_cpus", "auto")
@@ -5243,11 +5271,11 @@ class TabularPredictor:
             holdout_frac=1 / 9,
             n_folds=2,
             n_repeats=1,
-            memory_safe_fits=True,
+            memory_safe_fits="auto",
             clean_up_fits=True,
             holdout_data=None,
             enable_ray_logging=True,
-            enable_callbacks=False,
+            enable_callbacks=True,
         )
         allowed_kes = set(ds_args.keys())
@@ -5262,9 +5290,11 @@ class TabularPredictor:
             (not isinstance(ds_args["validation_procedure"], str)) or (ds_args["validation_procedure"] not in ["holdout", "cv"])
         ):
             raise ValueError("`validation_procedure` in `ds_args` must be str in {'holdout','cv'}. " + f"Got: {ds_args['validation_procedure']}")
-        for arg_name in ["memory_safe_fits", "clean_up_fits", "enable_ray_logging"]:
+        for arg_name in ["clean_up_fits", "enable_ray_logging"]:
             if (arg_name in ds_args) and (not isinstance(ds_args[arg_name], bool)):
                 raise ValueError(f"`{arg_name}` in `ds_args` must be bool.  Got: {type(ds_args[arg_name])}")
+        if "memory_safe_fits" in ds_args and not isinstance(ds_args["memory_safe_fits"], (bool, str)):
+            raise ValueError(f"`memory_safe_fits` in `ds_args` must be bool or 'auto'.  Got: {type(ds_args['memory_safe_fits'])}")
         for arg_name in ["detection_time_frac", "holdout_frac"]:
             if (arg_name in ds_args) and ((not isinstance(ds_args[arg_name], float)) or (ds_args[arg_name] >= 1) or (ds_args[arg_name] <= 0)):
                 raise ValueError(f"`{arg_name}` in `ds_args` must be float in (0,1).  Got: {type(ds_args[arg_name])}, {ds_args[arg_name]}")

autogluon/tabular/registry/_ag_model_registry.py CHANGED Viewed

@@ -20,14 +20,17 @@ from ..models import (
     LinearModel,
     MultiModalPredictorModel,
     NNFastAiTabularModel,
+    PrepLGBModel,
     RealMLPModel,
     RFModel,
     RuleFitModel,
+    TabDPTModel,
     TabICLModel,
     TabMModel,
     TabPFNMixModel,
     MitraModel,
-    TabPFNV2Model,
+    RealTabPFNv2Model,
+    RealTabPFNv25Model,
     TabularNeuralNetTorchModel,
     TextPredictorModel,
     XGBoostModel,
@@ -47,14 +50,17 @@ REGISTERED_MODEL_CLS_LST = [
     TabularNeuralNetTorchModel,
     LinearModel,
     NNFastAiTabularModel,
+    PrepLGBModel,
     TextPredictorModel,
     ImagePredictorModel,
     MultiModalPredictorModel,
     FTTransformerModel,
+    TabDPTModel,
     TabICLModel,
     TabMModel,
     TabPFNMixModel,
-    TabPFNV2Model,
+    RealTabPFNv2Model,
+    RealTabPFNv25Model,
     MitraModel,
     FastTextModel,
     GreedyWeightedEnsembleModel,

autogluon/tabular/testing/fit_helper.py CHANGED Viewed

@@ -4,6 +4,9 @@ import copy
 import os
 import pandas as pd
 import shutil
+import sys
+import subprocess
+import textwrap
 import uuid
 from typing import Any, Type
@@ -12,6 +15,7 @@ from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
 from autogluon.core.metrics import METRICS
 from autogluon.core.models import AbstractModel, BaggedEnsembleModel
 from autogluon.core.stacked_overfitting.utils import check_stacked_overfitting_from_leaderboard
+from autogluon.core.testing.global_context_snapshot import GlobalContextSnapshot
 from autogluon.core.utils import download, generate_train_test_split_combined, infer_problem_type, unzip
 from autogluon.tabular import TabularDataset, TabularPredictor
@@ -175,6 +179,8 @@ class FitHelper:
         use_test_for_val: bool = False,
         raise_on_model_failure: bool | None = None,
         deepcopy_fit_args: bool = True,
+        verify_model_seed: bool = False,
+        verify_load_wo_cuda: bool = False,
     ) -> TabularPredictor:
         if compiler_configs is None:
             compiler_configs = {}
@@ -218,6 +224,8 @@ class FitHelper:
                 expected_model_count -= 1
             fit_args["fit_weighted_ensemble"] = fit_weighted_ensemble
+        ctx_before = GlobalContextSnapshot.capture()
         predictor: TabularPredictor = FitHelper.fit_dataset(
             train_data=train_data,
             init_args=init_args,
@@ -226,6 +234,10 @@ class FitHelper:
             scikit_api=scikit_api,
             min_cls_count_train=min_cls_count_train,
         )
+        ctx_after = GlobalContextSnapshot.capture()
+        ctx_before.assert_unchanged(ctx_after)
         if compile:
             predictor.compile(models="all", compiler_configs=compiler_configs)
             predictor.persist(models="all")
@@ -269,6 +281,11 @@ class FitHelper:
                 assert not model_info["val_in_fit"], f"val data must not be present in refit model if `can_refit_full=True`. Maybe an exception occurred?"
             else:
                 assert model_info["val_in_fit"], f"val data must be present in refit model if `can_refit_full=False`"
+        if verify_model_seed:
+            model_names = predictor.model_names()
+            for model_name in model_names:
+                model = predictor._trainer.load_model(model_name)
+                _verify_model_seed(model=model)
         if predictor_info:
             predictor.info()
@@ -281,6 +298,28 @@ class FitHelper:
         predictor_load = predictor.load(path=predictor.path)
         predictor_load.predict(test_data)
+        # TODO: This is expensive, only do this sparingly.
+        if verify_load_wo_cuda:
+            import torch
+            if torch.cuda.is_available():
+                # Checks if the model is able to predict w/o CUDA.
+                # This verifies that a model artifact works on a CPU machine.
+                predictor_path = predictor.path
+                code = textwrap.dedent(f"""
+                        import os
+                        os.environ["CUDA_VISIBLE_DEVICES"] = ""
+                        from autogluon.tabular import TabularPredictor
+                        import torch
+                        assert torch.cuda.is_available() is False
+                        predictor = TabularPredictor.load(r"{predictor_path}")
+                        X, y = predictor.load_data_internal()
+                        predictor.persist("all")
+                        predictor.predict_multi(X, transform_features=False)
+                    """)
+                subprocess.run([sys.executable, "-c", code], check=True)
         assert os.path.realpath(save_path) == os.path.realpath(predictor.path)
         if delete_directory:
             shutil.rmtree(save_path, ignore_errors=True)  # Delete AutoGluon output directory to ensure runs' information has been removed.
@@ -339,6 +378,7 @@ class FitHelper:
         require_known_problem_types: bool = True,
         raise_on_model_failure: bool = True,
         problem_types: list[str] | None = None,
+        verify_model_seed: bool = True,
         **kwargs,
     ):
         """
@@ -355,12 +395,18 @@ class FitHelper:
         problem_types: list[str], optional
             If specified, checks the given problem_types.
             If None, checks `model_cls.supported_problem_types()`
+        verify_model_seed: bool = True
         **kwargs
         Returns
         -------
         """
+        if verify_model_seed and model_cls.seed_name is not None:
+            # verify that the seed logic works
+            model_hyperparameters = model_hyperparameters.copy()
+            model_hyperparameters[model_cls.seed_name] = 42
         fit_args = dict(
             hyperparameters={model_cls: model_hyperparameters},
         )
@@ -429,6 +475,7 @@ class FitHelper:
                     refit_full=refit_full,
                     extra_metrics=_extra_metrics,
                     raise_on_model_failure=raise_on_model_failure,
+                    verify_model_seed=verify_model_seed,
                     **kwargs,
                 )
@@ -460,6 +507,7 @@ class FitHelper:
                         refit_full=refit_full,
                         extra_metrics=_extra_metrics,
                         raise_on_model_failure=raise_on_model_failure,
+                        verify_model_seed=verify_model_seed,
                         **kwargs,
                     )
@@ -476,3 +524,16 @@ def stacked_overfitting_assert(
     if expected_stacked_overfitting_at_test is not None:
         stacked_overfitting = check_stacked_overfitting_from_leaderboard(lb)
         assert stacked_overfitting == expected_stacked_overfitting_at_test, "Expected stacked overfitting at test mismatch!"
+def _verify_model_seed(model: AbstractModel):
+    """
+    Assert that the model's `random_seed` agrees with the seed stored in its parameters.
+    `random_seed` must be None or an int. When the model declares a `seed_name`, that key must be
+    present in `model.params` with the same value as `random_seed`, and it must also match the
+    user-specified value if the key is in `model._user_params`.
+    For a `BaggedEnsembleModel`, every child model is loaded and checked the same way.
+    """
+    assert model.random_seed is None or isinstance(model.random_seed, int)
+    seed_key = model.seed_name
+    if seed_key is not None:
+        if seed_key in model._user_params:
+            assert model.random_seed == model._user_params[seed_key]
+        assert seed_key in model.params
+        assert model.random_seed == model.params[seed_key]
+    if not isinstance(model, BaggedEnsembleModel):
+        return
+    for child_ref in model.models:
+        _verify_model_seed(model.load_child(child_ref))

autogluon.tabular 1.4.1b20251014__py3-none-any.whl → 1.5.0b20251222__py3-none-any.whl

autogluon.tabular 1.4.1b20251014__py3-none-any.whl → 1.5.0b20251222__py3-none-any.whl