autogluon.tabular 1.5.1b20260105__py3-none-any.whl → 1.5.1b20260117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of autogluon.tabular might be problematic.
- autogluon/tabular/__init__.py +1 -0
- autogluon/tabular/configs/config_helper.py +18 -6
- autogluon/tabular/configs/feature_generator_presets.py +3 -1
- autogluon/tabular/configs/hyperparameter_configs.py +42 -9
- autogluon/tabular/configs/presets_configs.py +38 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +84 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +48 -48
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +774 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +421 -1
- autogluon/tabular/experimental/_scikit_mixin.py +6 -2
- autogluon/tabular/experimental/_tabular_classifier.py +3 -1
- autogluon/tabular/experimental/_tabular_regressor.py +3 -1
- autogluon/tabular/experimental/plot_leaderboard.py +73 -19
- autogluon/tabular/learner/abstract_learner.py +160 -42
- autogluon/tabular/learner/default_learner.py +78 -22
- autogluon/tabular/models/__init__.py +2 -2
- autogluon/tabular/models/_utils/rapids_utils.py +3 -1
- autogluon/tabular/models/abstract/abstract_torch_model.py +2 -0
- autogluon/tabular/models/automm/automm_model.py +12 -3
- autogluon/tabular/models/automm/ft_transformer.py +5 -1
- autogluon/tabular/models/catboost/callbacks.py +2 -2
- autogluon/tabular/models/catboost/catboost_model.py +93 -29
- autogluon/tabular/models/catboost/catboost_softclass_utils.py +4 -1
- autogluon/tabular/models/catboost/catboost_utils.py +3 -1
- autogluon/tabular/models/ebm/ebm_model.py +8 -13
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +1 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +1 -0
- autogluon/tabular/models/fastainn/callbacks.py +20 -3
- autogluon/tabular/models/fastainn/hyperparameters/searchspaces.py +11 -1
- autogluon/tabular/models/fastainn/quantile_helpers.py +10 -2
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +65 -18
- autogluon/tabular/models/fasttext/fasttext_model.py +3 -1
- autogluon/tabular/models/image_prediction/image_predictor.py +7 -2
- autogluon/tabular/models/knn/knn_model.py +41 -8
- autogluon/tabular/models/lgb/callbacks.py +32 -9
- autogluon/tabular/models/lgb/hyperparameters/searchspaces.py +3 -1
- autogluon/tabular/models/lgb/lgb_model.py +150 -34
- autogluon/tabular/models/lgb/lgb_utils.py +12 -4
- autogluon/tabular/models/lr/hyperparameters/searchspaces.py +5 -1
- autogluon/tabular/models/lr/lr_model.py +40 -10
- autogluon/tabular/models/lr/lr_rapids_model.py +22 -13
- autogluon/tabular/models/mitra/_internal/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +36 -40
- autogluon/tabular/models/mitra/_internal/config/config_run.py +2 -14
- autogluon/tabular/models/mitra/_internal/config/enums.py +27 -26
- autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +14 -21
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +10 -12
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +17 -32
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +12 -27
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +16 -21
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +130 -111
- autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/data/collator.py +30 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +18 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +10 -7
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +70 -100
- autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/models/base.py +7 -10
- autogluon/tabular/models/mitra/_internal/models/embedding.py +46 -56
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +140 -120
- autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +3 -1
- autogluon/tabular/models/mitra/mitra_model.py +16 -11
- autogluon/tabular/models/mitra/sklearn_interface.py +178 -162
- autogluon/tabular/models/realmlp/realmlp_model.py +28 -15
- autogluon/tabular/models/rf/compilers/onnx.py +1 -1
- autogluon/tabular/models/rf/rf_model.py +45 -12
- autogluon/tabular/models/rf/rf_quantile.py +4 -2
- autogluon/tabular/models/tabdpt/tabdpt_model.py +8 -17
- autogluon/tabular/models/tabicl/tabicl_model.py +8 -1
- autogluon/tabular/models/tabm/_tabm_internal.py +6 -4
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +80 -127
- autogluon/tabular/models/tabm/tabm_model.py +8 -4
- autogluon/tabular/models/tabm/tabm_reference.py +53 -85
- autogluon/tabular/models/tabpfnmix/_internal/core/callbacks.py +7 -16
- autogluon/tabular/models/tabpfnmix/_internal/core/collator.py +16 -24
- autogluon/tabular/models/tabpfnmix/_internal/core/dataset_split.py +5 -7
- autogluon/tabular/models/tabpfnmix/_internal/core/enums.py +0 -2
- autogluon/tabular/models/tabpfnmix/_internal/core/get_loss.py +0 -1
- autogluon/tabular/models/tabpfnmix/_internal/core/get_optimizer.py +7 -18
- autogluon/tabular/models/tabpfnmix/_internal/core/get_scheduler.py +3 -14
- autogluon/tabular/models/tabpfnmix/_internal/core/trainer_finetune.py +79 -64
- autogluon/tabular/models/tabpfnmix/_internal/core/y_transformer.py +3 -5
- autogluon/tabular/models/tabpfnmix/_internal/data/dataset_finetune.py +17 -30
- autogluon/tabular/models/tabpfnmix/_internal/data/preprocessor.py +15 -35
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py +21 -38
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py +33 -51
- autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py +4 -4
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py +32 -12
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py +32 -13
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +55 -19
- autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +21 -48
- autogluon/tabular/models/tabprep/prep_mixin.py +34 -26
- autogluon/tabular/models/tabular_nn/compilers/onnx.py +36 -8
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +130 -36
- autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py +8 -4
- autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py +26 -5
- autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py +41 -24
- autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +33 -8
- autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py +21 -6
- autogluon/tabular/models/xgboost/callbacks.py +9 -3
- autogluon/tabular/models/xgboost/xgboost_model.py +59 -11
- autogluon/tabular/models/xt/xt_model.py +1 -0
- autogluon/tabular/predictor/interpretable_predictor.py +3 -1
- autogluon/tabular/predictor/predictor.py +409 -128
- autogluon/tabular/registry/__init__.py +1 -1
- autogluon/tabular/registry/_ag_model_registry.py +4 -5
- autogluon/tabular/registry/_model_registry.py +1 -0
- autogluon/tabular/testing/fit_helper.py +55 -15
- autogluon/tabular/testing/generate_datasets.py +1 -1
- autogluon/tabular/testing/model_fit_helper.py +10 -4
- autogluon/tabular/trainer/abstract_trainer.py +644 -230
- autogluon/tabular/trainer/auto_trainer.py +19 -8
- autogluon/tabular/trainer/model_presets/presets.py +33 -9
- autogluon/tabular/trainer/model_presets/presets_distill.py +16 -2
- autogluon/tabular/version.py +1 -1
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/METADATA +27 -27
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/RECORD +127 -135
- autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +0 -20
- autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +0 -40
- autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +0 -201
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +0 -1464
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +0 -747
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +0 -863
- autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +0 -106
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +0 -466
- /autogluon.tabular-1.5.1b20260105-py3.11-nspkg.pth → /autogluon.tabular-1.5.1b20260117-py3.11-nspkg.pth +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/WHEEL +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/licenses/LICENSE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/licenses/NOTICE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/namespace_packages.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/top_level.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260117.dist-info}/zip-safe +0 -0
autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py

@@ -47,13 +47,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
     ag.early_stop : int | str, default = "default"
         Specifies the early stopping rounds. Defaults to an adaptive strategy. Recommended to keep default.
     """
+
     ag_key = "NN_TORCH"
     ag_name = "NeuralNetTorch"
     ag_priority = 25
     seed_name = "seed_value"

     # Constants used throughout this class:
-    unique_category_str = np.nan  # string used to represent missing values and unknown categories for categorical features.
+    unique_category_str = (
+        np.nan
+    )  # string used to represent missing values and unknown categories for categorical features.

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -106,12 +109,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
             device = torch.device("cuda")
             logger.log(15, "Training on GPU (CUDA)")
             if num_gpus > 1:
-                logger.warning(f"{self.__class__.__name__} not yet able to use more than 1 GPU. 'num_gpus' is set to >1, but we will be using only 1 GPU.")
-        elif torch.backends.mps.is_available():
+                logger.warning(
+                    f"{self.__class__.__name__} not yet able to use more than 1 GPU. 'num_gpus' is set to >1, but we will be using only 1 GPU."
+                )
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
             device = torch.device("mps")
             logger.log(15, "Training on GPU (MPS - Apple Silicon)")
             if num_gpus > 1:
-                logger.warning(f"{self.__class__.__name__} on Apple Silicon can only use 1 GPU (MPS). 'num_gpus' is set to >1, but we will be using only 1 GPU.")
+                logger.warning(
+                    f"{self.__class__.__name__} on Apple Silicon can only use 1 GPU (MPS). 'num_gpus' is set to >1, but we will be using only 1 GPU."
+                )
         else:
             device = torch.device("cpu")
             logger.log(15, "Training on CPU")
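Beyond the rewrapped warnings, this hunk guards the MPS branch with `hasattr`, so torch builds that predate the `torch.backends.mps` attribute fall through to CPU instead of raising. A minimal sketch of the same CUDA → MPS → CPU cascade; the `pick_device` helper and its `num_gpus` argument are illustrative, not AutoGluon's API:

    import torch

    def pick_device(num_gpus: int) -> torch.device:
        """Select CUDA first, then Apple-Silicon MPS, then CPU (illustrative)."""
        if num_gpus > 0 and torch.cuda.is_available():
            return torch.device("cuda")
        # hasattr() protects torch builds without an mps backend attribute
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")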
@@ -126,7 +133,9 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         """ Sets dataset-adaptive default values to use for our neural network """
         if self.problem_type in [REGRESSION, QUANTILE]:
             if params["y_range"] is None:
-                params["y_range"] = infer_y_range(y_vals=train_dataset.data_list[train_dataset.label_index], y_range_extend=y_range_extend)
+                params["y_range"] = infer_y_range(
+                    y_vals=train_dataset.data_list[train_dataset.label_index], y_range_extend=y_range_extend
+                )
         return params

     def _get_default_loss_function(self):
@@ -143,7 +152,13 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
     def _prepare_params(params):
         params = params.copy()

-        processor_param_keys = {"proc.embed_min_categories", "proc.impute_strategy", "proc.max_category_levels", "proc.skew_threshold", "use_ngram_features"}
+        processor_param_keys = {
+            "proc.embed_min_categories",
+            "proc.impute_strategy",
+            "proc.max_category_levels",
+            "proc.skew_threshold",
+            "use_ngram_features",
+        }
         processor_kwargs = {k: v for k, v in params.items() if k in processor_param_keys}
         for key in processor_param_keys:
             params.pop(key, None)
@@ -201,14 +216,20 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         torch.manual_seed(seed_value)

         if sample_weight is not None:  # TODO: support
-            logger.log(15, f"sample_weight not yet supported for {self.__class__.__name__}, this model will ignore them in training.")
+            logger.log(
+                15,
+                f"sample_weight not yet supported for {self.__class__.__name__},"
+                " this model will ignore them in training.",
+            )

         if num_cpus is not None:
             self.num_dataloading_workers = max(1, int(num_cpus / 2.0))
         else:
             self.num_dataloading_workers = 1
         if self.num_dataloading_workers == 1:
-            self.num_dataloading_workers = 0  # TODO: verify 0 is typically faster and uses less memory than 1 in pytorch
+            self.num_dataloading_workers = (
+                0  # TODO: verify 0 is typically faster and uses less memory than 1 in pytorch
+            )
         self.num_dataloading_workers = 0  # TODO: >0 crashes on MacOS
         self.max_batch_size = params.pop("max_batch_size", 512)

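For context on the worker-count logic: in PyTorch, `DataLoader(num_workers=0)` loads batches synchronously in the main process, so falling back from 1 worker to 0 trades parallelism nobody benefits from for lower overhead, and the second assignment pins it to 0 because of the macOS crash noted in the TODO. A hedged sketch of the intent (the helper name is illustrative):

    def pick_num_dataloading_workers(num_cpus: int | None) -> int:
        workers = max(1, int(num_cpus / 2.0)) if num_cpus is not None else 1
        if workers == 1:
            workers = 0  # num_workers=0: load in the main process, no worker startup cost
        return workers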
@@ -298,7 +319,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         logging.debug("initialized")
         train_dataloader = train_dataset.build_loader(batch_size, self.num_dataloading_workers, is_test=False)

-        if isinstance(loss_kwargs.get("loss_function", "auto"), str) and loss_kwargs.get("loss_function", "auto") == "auto":
+        if (
+            isinstance(loss_kwargs.get("loss_function", "auto"), str)
+            and loss_kwargs.get("loss_function", "auto") == "auto"
+        ):
             loss_kwargs["loss_function"] = self._get_default_loss_function()
         if epochs_wo_improve is not None:
             early_stopping_method = SimpleES(patience=epochs_wo_improve)
@@ -404,13 +428,18 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

                     # v1 estimate is sensitive to fixed cost overhead at the start of training, such as torch initialization.
                     # v2 fixes this, but we keep both and take the min to avoid potential cases where v2 is inaccurate due to an overly slow batch.
-                    estimated_time_v1 = time_elapsed_epoch / update_cur * num_updates_per_epoch  # Less accurate than v2, but never underestimates time
-                    estimated_time_v2 = time_elapsed_epoch + time_elapsed_batch * (num_updates_per_epoch - update_cur)  # Less likely to overestimate time
+                    estimated_time_v1 = (
+                        time_elapsed_epoch / update_cur * num_updates_per_epoch
+                    )  # Less accurate than v2, but never underestimates time
+                    estimated_time_v2 = time_elapsed_epoch + time_elapsed_batch * (
+                        num_updates_per_epoch - update_cur
+                    )  # Less likely to overestimate time
                     estimated_time = min(estimated_time_v1, estimated_time_v2)
                     if estimated_time > time_limit:
                         logger.log(
                             30,
-                            f"\tNot enough time to train first epoch. (Time Required: {round(estimated_time, 2)}s, Time Left: {round(time_limit, 2)}s)",
+                            f"\tNot enough time to train first epoch. "
+                            f"(Time Required: {round(estimated_time, 2)}s, Time Left: {round(time_limit, 2)}s)",
                         )
                         raise TimeLimitExceeded
                 time_elapsed = time_cur - start_fit_time
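To see why the two estimates differ, here is a worked example with hypothetical numbers: 20 of 100 updates done in 10s total, with the latest batch taking 0.4s:

    time_elapsed_epoch, update_cur = 10.0, 20
    num_updates_per_epoch, time_elapsed_batch = 100, 0.4

    # v1 extrapolates the average update time; start-up overhead inflates it.
    estimated_time_v1 = time_elapsed_epoch / update_cur * num_updates_per_epoch  # 50.0s
    # v2 assumes the remaining updates run at the latest batch speed.
    estimated_time_v2 = time_elapsed_epoch + time_elapsed_batch * (num_updates_per_epoch - update_cur)  # 42.0s
    estimated_time = min(estimated_time_v1, estimated_time_v2)  # 42.0s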
@@ -421,7 +450,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
                         f"\tNot enough time to train first epoch. Stopped on Update {total_updates} (Epoch {epoch}))",
                     )
                     raise TimeLimitExceeded
-                logger.log(15, f"\tRan out of time, stopping training early. (Stopped on Update {total_updates} (Epoch {epoch}))")
+                logger.log(
+                    15,
+                    f"\tRan out of time, stopping training early. (Stopped on Update {total_updates} (Epoch {epoch}))",
+                )
                 do_update = False
                 break

@@ -464,10 +496,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
                     is_best = True
                     best_val_metric = val_metric
                     io_buffer = io.BytesIO()
-                    torch.save(self.model, io_buffer)
+                    torch.save(self.model.state_dict(), io_buffer)
                     best_epoch = epoch
                     best_val_update = total_updates
-                early_stop = early_stopping_method.update(cur_round=epoch-1, is_best=is_best)
+                early_stop = early_stopping_method.update(cur_round=epoch - 1, is_best=is_best)
                 if verbose_eval:
                     logger.log(
                         15,
@@ -514,10 +546,13 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

         # revert back to best model
         if val_dataset is not None:
-            logger.log(15, f"Best model found on Epoch {best_epoch} (Update {best_val_update}). Val {self.stopping_metric.name}: {best_val_metric}")
+            logger.log(
+                15,
+                f"Best model found on Epoch {best_epoch} (Update {best_val_update}). Val {self.stopping_metric.name}: {best_val_metric}",
+            )
             if io_buffer is not None:
                 io_buffer.seek(0)
-                self.model = torch.load(io_buffer)
+                self.model.load_state_dict(torch.load(io_buffer, weights_only=True))
         else:
             logger.log(15, f"Best model found on Epoch {best_epoch} (Update {best_val_update}).")
         self.params_trained["batch_size"] = batch_size
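The substantive change in this hunk and the matching save-side hunk above is the checkpoint format: instead of pickling the whole module with `torch.save(self.model, ...)` and reloading it with a plain `torch.load`, only the weights are serialized, and `weights_only=True` restricts unpickling to tensor data rather than arbitrary objects. A self-contained sketch of the pattern, with a toy model standing in for the trained network:

    import io
    import torch
    from torch import nn

    model = nn.Linear(4, 2)  # stand-in for the trained network

    # checkpoint only the weights of the best epoch into an in-memory buffer
    io_buffer = io.BytesIO()
    torch.save(model.state_dict(), io_buffer)

    # ... training continues and the weights change ...

    # revert to the best checkpoint; weights_only=True refuses arbitrary pickles
    io_buffer.seek(0)
    model.load_state_dict(torch.load(io_buffer, weights_only=True))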
@@ -530,7 +565,9 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         elif isinstance(ag_early_stop, str) and ag_early_stop == "default":
             early_stopping_method = self._get_early_stop_default()
         elif isinstance(ag_early_stop, (str, tuple, list)):
-            early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=ag_early_stop)
+            early_stopping_rounds = self._get_early_stopping_rounds(
+                num_rows_train=num_rows_train, strategy=ag_early_stop
+            )
             early_stopping_method = early_stopping_rounds[0](**early_stopping_rounds[1])
         elif isinstance(ag_early_stop, int):
             early_stopping_method = SimpleES(patience=ag_early_stop)
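The `ag.early_stop` dispatch accepts "default" (an adaptive strategy), a string/tuple/list strategy resolved via `_get_early_stopping_rounds`, or a plain int patience passed to `SimpleES`. Below is a minimal patience-based stopper equivalent in spirit to the int branch; this is an illustrative sketch, not AutoGluon's `SimpleES` implementation:

    class PatienceStopper:
        """Stop after `patience` consecutive rounds without improvement."""

        def __init__(self, patience: int):
            self.patience = patience
            self.rounds_without_improvement = 0

        def update(self, cur_round: int, is_best: bool) -> bool:
            # cur_round is accepted only for interface parity with the call site
            if is_best:
                self.rounds_without_improvement = 0
            else:
                self.rounds_without_improvement += 1
            return self.rounds_without_improvement >= self.patience  # True -> stop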
@@ -573,8 +610,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

             for metric in scorers:
                 train_metrics.append(self.score(X=train_dataset, y=y_train, metric=metric, _reset_threads=False))
-                val_metrics += [self.score(X=val_dataset, y=y_val, metric=metric, _reset_threads=False)] if val_dataset is not None else []
-                test_metrics += [self.score(X=test_dataset, y=y_test, metric=metric, _reset_threads=False)] if test_dataset is not None else []
+                val_metrics += (
+                    [self.score(X=val_dataset, y=y_val, metric=metric, _reset_threads=False)]
+                    if val_dataset is not None
+                    else []
+                )
+                test_metrics += (
+                    [self.score(X=test_dataset, y=y_test, metric=metric, _reset_threads=False)]
+                    if test_dataset is not None
+                    else []
+                )

                 if use_curve_metric_error:
                     train_metrics[-1] = metric.convert_score_to_error(train_metrics[-1])
@@ -585,8 +630,14 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

                 if (
                     not self._assert_valid_metric(metric=train_metrics[-1], best_epoch=best_epoch)
-                    or (val_dataset is not None and not self._assert_valid_metric(metric=val_metrics[-1], best_epoch=best_epoch))
-                    or (test_dataset is not None and not self._assert_valid_metric(metric=test_metrics[-1], best_epoch=best_epoch))
+                    or (
+                        val_dataset is not None
+                        and not self._assert_valid_metric(metric=val_metrics[-1], best_epoch=best_epoch)
+                    )
+                    or (
+                        test_dataset is not None
+                        and not self._assert_valid_metric(metric=test_metrics[-1], best_epoch=best_epoch)
+                    )
                 ):
                     return True

@@ -621,7 +672,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
                     "or NN weights may have diverged."
                 )
             else:
-                logger.warning(f"Warning: NaNs encountered in {self.__class__.__name__} training. Reverting model to last checkpoint without NaNs.")
+                logger.warning(
+                    f"Warning: NaNs encountered in {self.__class__.__name__} training. "
+                    "Reverting model to last checkpoint without NaNs."
+                )
             return False
         return True

@@ -657,7 +711,9 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         preds_dataset = np.concatenate(preds_dataset, 0)
         return preds_dataset

-    def _generate_dataset(self, X: pd.DataFrame | TabularTorchDataset, y: pd.Series, train_params: dict = {}, is_train: bool = False) -> TabularTorchDataset:
+    def _generate_dataset(
+        self, X: pd.DataFrame | TabularTorchDataset, y: pd.Series, train_params: dict = {}, is_train: bool = False
+    ) -> TabularTorchDataset:
         """
         Generate TabularTorchDataset from X and y.

@@ -721,7 +777,12 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         warnings.filterwarnings("ignore", module="sklearn.preprocessing")
         if labels is not None and len(labels) != len(df):
             raise ValueError("Number of examples in Dataframe does not match number of labels")
-        if self.processor is None or self._types_of_features is None or self.feature_arraycol_map is None or self.feature_type_map is None:
+        if (
+            self.processor is None
+            or self._types_of_features is None
+            or self.feature_arraycol_map is None
+            or self.feature_type_map is None
+        ):
             raise ValueError("Need to process training data before test data")
         if self.features_to_drop:
             drop_cols = [col for col in df.columns if col in self.features_to_drop]
@@ -732,7 +793,16 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         df = self.processor.transform(df)
         return TabularTorchDataset(df, self.feature_arraycol_map, self.feature_type_map, self.problem_type, labels)

-    def _process_train_data(self, df, impute_strategy, max_category_levels, skew_threshold, embed_min_categories, use_ngram_features, labels):
+    def _process_train_data(
+        self,
+        df,
+        impute_strategy,
+        max_category_levels,
+        skew_threshold,
+        embed_min_categories,
+        use_ngram_features,
+        labels,
+    ):
         from .tabular_torch_dataset import TabularTorchDataset

         # sklearn processing n_quantiles warning
@@ -744,13 +814,18 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

         # dict with keys: : 'continuous', 'skewed', 'onehot', 'embed', values = column-names of df
         self._types_of_features, df = self._get_types_of_features(
-            df, skew_threshold=skew_threshold, embed_min_categories=embed_min_categories, use_ngram_features=use_ngram_features
+            df,
+            skew_threshold=skew_threshold,
+            embed_min_categories=embed_min_categories,
+            use_ngram_features=use_ngram_features,
         )
         logger.log(15, "Tabular Neural Network treats features as the following types:")
         logger.log(15, json.dumps(self._types_of_features, indent=4))
         logger.log(15, "\n")
         if self.processor is not None:
-            Warning(f"Attempting to process training data for {self.__class__.__name__}, but previously already did this.")
+            Warning(
+                f"Attempting to process training data for {self.__class__.__name__}, but previously already did this."
+            )
         self.processor = create_preprocessor(
             impute_strategy=impute_strategy,
             max_category_levels=max_category_levels,
@@ -763,15 +838,22 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
         )
         df = self.processor.fit_transform(df)
         # OrderedDict of feature-name -> list of column-indices in df corresponding to this feature
-        self.feature_arraycol_map = get_feature_arraycol_map(processor=self.processor, max_category_levels=max_category_levels)
-        num_array_cols = np.sum([len(self.feature_arraycol_map[key]) for key in self.feature_arraycol_map])  # should match number of columns in processed array
+        self.feature_arraycol_map = get_feature_arraycol_map(
+            processor=self.processor, max_category_levels=max_category_levels
+        )
+        num_array_cols = np.sum(
+            [len(self.feature_arraycol_map[key]) for key in self.feature_arraycol_map]
+        )  # should match number of columns in processed array
         if num_array_cols != df.shape[1]:
             raise ValueError(
-                "Error during one-hot encoding data processing for neural network. Number of columns in df array does not match feature_arraycol_map."
+                "Error during one-hot encoding data processing for neural network. "
+                "Number of columns in df array does not match feature_arraycol_map."
             )

         # OrderedDict of feature-name -> feature_type string (options: 'vector', 'embed')
-        self.feature_type_map = get_feature_type_map(feature_arraycol_map=self.feature_arraycol_map, types_of_features=self._types_of_features)
+        self.feature_type_map = get_feature_type_map(
+            feature_arraycol_map=self.feature_arraycol_map, types_of_features=self._types_of_features
+        )
         return TabularTorchDataset(df, self.feature_arraycol_map, self.feature_type_map, self.problem_type, labels)

     def _init_optimizer(self, optimizer, learning_rate, weight_decay):
@@ -801,7 +883,13 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

     def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
         hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )

     @classmethod
     def _estimate_memory_usage_static(
@@ -877,7 +965,10 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):
             device = torch.device(original_device_type)

         if verbose and (original_device_type != device.type):
-            logger.log(15, f"Model is trained on {original_device_type}, but the device is not available - loading on {device.type}")
+            logger.log(
+                15,
+                f"Model is trained on {original_device_type}, but the device is not available - loading on {device.type}",
+            )

         model.device = device
         model.model = model.model.to(model.device)
@@ -949,9 +1040,12 @@ class TabularNeuralNetTorchModel(AbstractNeuralNetworkModel):

         input_types = kwargs.get("input_types", self._get_input_types(batch_size=self.max_batch_size))
         assert isinstance(self.processor, ColumnTransformer), (
-            f"unexpected processor type {type(self.processor)}, " "expecting processor type to be sklearn.compose._column_transformer.ColumnTransformer"
+            f"unexpected processor type {type(self.processor)}, "
+            "expecting processor type to be sklearn.compose._column_transformer.ColumnTransformer"
+        )
+        self.processor = self._compiler.compile(
+            model=(self.processor, self.model), path=self.path, input_types=input_types
         )
-        self.processor = self._compiler.compile(model=(self.processor, self.model), path=self.path, input_types=input_types)

     @classmethod
     def supported_problem_types(cls) -> list[str] | None:
autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py

@@ -205,7 +205,9 @@ class TabularTorchDataset(torch.utils.data.IterableDataset):
             feat_i = self.feature_groups["embed"][i]
             feat_i_data = self.get_feature_data(feat_i).flatten().tolist()
             num_categories_i = len(set(feat_i_data))  # number of categories for ith feature
-            num_categories_per_embedfeature[i] = num_categories_i + 1  # to account for unknown test-time categories
+            num_categories_per_embedfeature[i] = (
+                num_categories_i + 1
+            )  # to account for unknown test-time categories
         return num_categories_per_embedfeature

     def get_feature_data(self, feature):
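The `num_categories_i + 1` reserves one extra embedding row for categories never observed during training, so a test-time unknown can map to a dedicated index instead of breaking the lookup. A small sketch of why the `+ 1` matters when sizing `nn.Embedding` (variable names are illustrative):

    import torch
    from torch import nn

    train_values = ["a", "b", "c"]            # categories seen at training time
    num_categories = len(set(train_values))   # 3
    embed = nn.Embedding(num_embeddings=num_categories + 1, embedding_dim=8)

    unknown_index = num_categories            # index 3 reserved for unseen categories
    vec = embed(torch.tensor([unknown_index]))  # safe lookup for a test-time unknown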
@@ -231,14 +233,14 @@ class TabularTorchDataset(torch.utils.data.IterableDataset):
         dataobj_file = file_prefix + self.DATAOBJ_SUFFIX
         if not os.path.exists(os.path.dirname(dataobj_file)):
             os.makedirs(os.path.dirname(dataobj_file))
-        torch.save(self, dataobj_file)
+        torch.save(self, dataobj_file)  # nosec B614
         logger.debug("TabularPyTorchDataset Dataset saved to a file: \n %s" % dataobj_file)

     @classmethod
     def load(cls, file_prefix=""):
         """Additional naming changes will be appended to end of file_prefix (must contain full absolute path)"""
         dataobj_file = file_prefix + cls.DATAOBJ_SUFFIX
-        dataset: TabularTorchDataset = torch.load(dataobj_file)
+        dataset: TabularTorchDataset = torch.load(dataobj_file)  # nosec B614
         logger.debug("TabularNN Dataset loaded from a file: \n %s" % dataobj_file)
         return dataset

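The two `# nosec B614` annotations are the substantive change here: they suppress Bandit's B614 finding on `torch.save`/`torch.load` of pickled objects, acknowledging that full-object serialization is intentional for this locally written dataset cache (a `weights_only` load would reject a pickled `TabularTorchDataset`).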
@@ -256,5 +258,7 @@ class TabularTorchDataset(torch.utils.data.IterableDataset):
         self.shuffle = False if is_test else True
         self.drop_last = False if is_test else True
         generator = torch.Generator().manual_seed(torch.initial_seed()) if is_test else None
-        loader = torch.utils.data.DataLoader(self, num_workers=num_workers, batch_size=None, worker_init_fn=worker_init_fn, generator=generator)  # no collation
+        loader = torch.utils.data.DataLoader(
+            self, num_workers=num_workers, batch_size=None, worker_init_fn=worker_init_fn, generator=generator
+        )  # no collation
         return loader
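`batch_size=None` disables the DataLoader's automatic batching, which is what the "no collation" comment refers to: this `IterableDataset` yields already-batched arrays and the loader passes them through untouched. A minimal sketch of that contract with a toy dataset (illustrative, not the AutoGluon class):

    import torch

    class PreBatched(torch.utils.data.IterableDataset):
        def __iter__(self):
            # each yielded item is already a full batch; it must not be re-collated
            yield torch.zeros(512, 10), torch.zeros(512)

    loader = torch.utils.data.DataLoader(PreBatched(), batch_size=None, num_workers=0)
    X, y = next(iter(loader))  # shapes (512, 10) and (512,), passed through as-is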
autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py

@@ -16,7 +16,16 @@ class EmbedNet(nn.Module):
         y_range: Used specifically for regression. = None for classification.
     """

-    def __init__(self, problem_type, num_net_outputs=None, quantile_levels=None, train_dataset=None, architecture_desc=None, device=None, **kwargs):
+    def __init__(
+        self,
+        problem_type,
+        num_net_outputs=None,
+        quantile_levels=None,
+        train_dataset=None,
+        architecture_desc=None,
+        device=None,
+        **kwargs,
+    ):
         if (architecture_desc is None) and (train_dataset is None):
             raise ValueError("train_dataset cannot = None if architecture_desc=None")
         super().__init__()
@@ -54,7 +63,9 @@ class EmbedNet(nn.Module):
         if self.has_embed_features:
             self.embed_blocks = nn.ModuleList()
             for i in range(len(num_categs_per_feature)):
-                self.embed_blocks.append(nn.Embedding(num_embeddings=num_categs_per_feature[i], embedding_dim=embed_dims[i]))
+                self.embed_blocks.append(
+                    nn.Embedding(num_embeddings=num_categs_per_feature[i], embedding_dim=embed_dims[i])
+                )
                 input_size += embed_dims[i]

         # update input size
@@ -189,9 +200,17 @@ class EmbedNet(nn.Module):
             loss_data = torch.max(self.quantile_levels * error_data, (self.quantile_levels - 1) * error_data)
             return loss_data.mean()

-        loss_data = torch.where(torch.abs(error_data) < self.alpha, 0.5 * error_data * error_data, self.alpha * (torch.abs(error_data) - 0.5 * self.alpha))
+        loss_data = torch.where(
+            torch.abs(error_data) < self.alpha,
+            0.5 * error_data * error_data,
+            self.alpha * (torch.abs(error_data) - 0.5 * self.alpha),
+        )
         loss_data /= self.alpha
-        scale = torch.where(error_data >= 0, torch.ones_like(error_data) * self.quantile_levels, torch.ones_like(error_data) * (1 - self.quantile_levels))
+        scale = torch.where(
+            error_data >= 0,
+            torch.ones_like(error_data) * self.quantile_levels,
+            torch.ones_like(error_data) * (1 - self.quantile_levels),
+        )
         loss_data *= scale
         return loss_data.mean()

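The first branch above is the standard pinball (quantile) loss, elementwise `max(q * e, (q - 1) * e)` for error `e` at quantile level `q`; the rewrapped `torch.where` branch is a smoothed Huber-style variant scaled per quantile. A quick numeric check of the pinball form with illustrative values:

    import torch

    q = torch.tensor([0.1, 0.5, 0.9])  # quantile levels
    e = torch.tensor([2.0, 2.0, 2.0])  # same positive error at each level

    loss = torch.max(q * e, (q - 1) * e)
    print(loss)  # tensor([0.2000, 1.0000, 1.8000])
    # positive error costs q * e, so higher quantiles penalize it more;
    # negative error would cost (1 - q) * |e| instead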
@@ -226,7 +245,9 @@ class EmbedNet(nn.Module):
         predict_data = self(data_batch)
         target_data = data_batch[-1].to(self.device)
         if self.problem_type in [BINARY, MULTICLASS]:
-            target_data = target_data.type(torch.long)  # Windows default int type is int32. Need to explicit convert to Long.
+            target_data = target_data.type(
+                torch.long
+            )  # Windows default int type is int32. Need to explicit convert to Long.
         if self.problem_type == QUANTILE:
             return self.quantile_loss(predict_data, target_data, margin=gamma)
         if self.problem_type == SOFTCLASS:
autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py

@@ -6,9 +6,9 @@ Unknown categories are returned as None in inverse transforms. Always converts i

 import copy
 from numbers import Integral
-from packaging.version import parse as parse_version

 import numpy as np
+from packaging.version import parse as parse_version
 from scipy import sparse
 from sklearn import __version__ as _sklearn_version
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -136,7 +136,7 @@ def _encode_check_unknown(values, uniques, return_mask=False):
     diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
     if return_mask:
         if diff:
-            valid_mask = np.in1d(values, uniques)
+            valid_mask = np.isin(values, uniques)
         else:
             valid_mask = np.ones(len(values), dtype=bool)
     return diff, valid_mask
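Per the reconstruction above, the removed line used NumPy's deprecated `np.in1d` spelling; `np.isin` is the modern replacement with the same behavior here, an elementwise membership mask that flags unknown categories. For example:

    import numpy as np

    values = np.array(["a", "b", "z", "a"])
    uniques = np.array(["a", "b", "c"])

    valid_mask = np.isin(values, uniques)
    print(valid_mask)  # [ True  True False  True] -> "z" is an unknown category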
@@ -203,11 +203,11 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

         if self.categories != "auto":
             if len(self.categories) != n_features:
-                raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).")
+                raise ValueError("Shape mismatch: if categories is an array, it has to be of shape (n_features,).")

         if self.max_levels is not None:
             if not isinstance(self.max_levels, Integral) or self.max_levels <= 0:
-                raise ValueError("max_levels must be None or a strictly" " positive int, got {}.".format(self.max_levels))
+                raise ValueError("max_levels must be None or a strictly positive int, got {}.".format(self.max_levels))

         self.categories_ = []
         self.infrequent_indices_ = []
@@ -220,11 +220,11 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
                 if Xi.dtype != object:
                     if not np.all(np.sort(cats) == cats):
-                        raise ValueError("Unsorted categories are not" " supported for numerical categories")
+                        raise ValueError("Unsorted categories are not supported for numerical categories")
                 if handle_unknown == "error":
                     diff = _encode_check_unknown(Xi, cats)
                     if diff:
-                        msg = "Found unknown categories {0} in column {1}" " during fit".format(diff, i)
+                        msg = "Found unknown categories {0} in column {1} during fit".format(diff, i)
                         raise ValueError(msg)
             self.categories_.append(cats)

@@ -264,7 +264,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

             if not np.all(valid_mask):
                 if handle_unknown == "error":
-                    msg = "Found unknown categories {0} in column {1}" " during transform".format(diff, i)
+                    msg = "Found unknown categories {0} in column {1} during transform".format(diff, i)
                     raise ValueError(msg)
                 else:
                     # Set the problematic rows to an acceptable value and
@@ -325,7 +325,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):
             A Tags object containing all tag information.
         """
         # lazily import to avoid crashing if sklearn<1.6
-        from sklearn.utils import Tags, InputTags, TargetTags
+        from sklearn.utils import InputTags, Tags, TargetTags

         # Create the Tags object with appropriate settings
         tags = Tags(
@@ -433,13 +433,17 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):

     def _validate_keywords(self):
         if self.handle_unknown not in ("error", "ignore"):
-            msg = "handle_unknown should be either 'error' or 'ignore'," " got {0}.".format(self.handle_unknown)
+            msg = "handle_unknown should be either 'error' or 'ignore', got {0}.".format(self.handle_unknown)
             raise ValueError(msg)
         # If we have both dropped columns and ignored unknown
         # values, there will be ambiguous cells. This creates difficulties
         # in interpreting the model.
         if self.drop is not None and self.handle_unknown != "error":
-            raise ValueError("`handle_unknown` must be 'error' when the drop parameter is " "specified, as both would create categories that are all " "zero.")
+            raise ValueError(
+                "`handle_unknown` must be 'error' when the drop parameter is "
+                "specified, as both would create categories that are all "
+                "zero."
+            )

     def _compute_drop_idx(self):
         if self.drop is None:
@@ -451,20 +455,25 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
                 self.drop = np.asarray(self.drop, dtype=object)
                 droplen = len(self.drop)
             except (ValueError, TypeError):
-                msg = "Wrong input for parameter `drop`. Expected " "'first', None or array of objects, got {}"
+                msg = "Wrong input for parameter `drop`. Expected 'first', None or array of objects, got {}"
                 raise ValueError(msg.format(type(self.drop)))
             if droplen != len(self.categories_):
-                msg = "`drop` should have length equal to the number " "of features ({}), got {}"
+                msg = "`drop` should have length equal to the number of features ({}), got {}"
                 raise ValueError(msg.format(len(self.categories_), len(self.drop)))
             missing_drops = [(i, val) for i, val in enumerate(self.drop) if val not in self.categories_[i]]
             if any(missing_drops):
-                msg = ("The following categories were supposed to be "
-                       "dropped, but were not found in the training " "data.\n{}".format("\n".join(["Category: {}, Feature: {}".format(c, v) for c, v in missing_drops]))
+                msg = (
+                    "The following categories were supposed to be "
+                    "dropped, but were not found in the training "
+                    "data.\n{}".format("\n".join(["Category: {}, Feature: {}".format(c, v) for c, v in missing_drops]))
                 )
                 raise ValueError(msg)
-            return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in zip(self.drop, self.categories_)], dtype=np.int_)
+            return np.array(
+                [np.where(cat_list == val)[0][0] for (val, cat_list) in zip(self.drop, self.categories_)],
+                dtype=np.int_,
+            )
         else:
-            msg = "Wrong input for parameter `drop`. Expected " "'first', None or array of objects, got {}"
+            msg = "Wrong input for parameter `drop`. Expected 'first', None or array of objects, got {}"
             raise ValueError(msg.format(type(self.drop)))

     def _convert_cat_to_int(self, X):
@@ -497,12 +506,14 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
         # check if user wants to manually drop a feature that is
         # infrequent: this is not allowed
         if self.drop is not None and not isinstance(self.drop, str):
-            for feature_idx, (infrequent_indices, drop_idx) in enumerate(zip(self.infrequent_indices_, self.drop_idx_)):
+            for feature_idx, (infrequent_indices, drop_idx) in enumerate(
+                zip(self.infrequent_indices_, self.drop_idx_)
+            ):
                 if drop_idx in infrequent_indices:
                     raise ValueError(
-                        "Category {} of feature {} is infrequent and thus "
-                        "cannot be dropped. Use drop='infrequent' instead.".format(
-                            self.categories_[feature_idx][drop_idx], feature_idx)
+                        "Category {} of feature {} is infrequent and thus "
+                        "cannot be dropped. Use drop='infrequent' "
+                        "instead.".format(self.categories_[feature_idx][drop_idx], feature_idx)
                     )
         return self

@@ -614,7 +625,7 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
         n_transformed_features = sum(len(cats) - 1 for cats in self.categories_)

         # validate shape of passed X
-        msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
+        msg = "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
         if X.shape[1] != n_transformed_features:
             raise ValueError(msg.format(n_transformed_features, X.shape[1]))

@@ -686,7 +697,11 @@ class OneHotMergeRaresHandleUnknownEncoder(_BaseEncoder):
         if input_features is None:
             input_features = ["x%d" % i for i in range(len(cats))]
         elif len(input_features) != len(self.categories_):
-            raise ValueError("input_features should have length equal to number of features ({}), got {}".format(len(self.categories_), len(input_features)))
+            raise ValueError(
+                "input_features should have length equal to number of features ({}), got {}".format(
+                    len(self.categories_), len(input_features)
+                )
+            )

         feature_names = []
         for i in range(len(cats)):
@@ -788,7 +803,9 @@ class OrdinalMergeRaresHandleUnknownEncoder(_BaseEncoder):
         """
         X = self._label_encoder.transform(X)
         X_og_array = np.array(X)  # original X array before transform
-        X_int, _ = self._transform(X, handle_unknown="ignore")  # will contain zeros for 0th category as well as unknown values.
+        X_int, _ = self._transform(
+            X, handle_unknown="ignore"
+        )  # will contain zeros for 0th category as well as unknown values.

         for i in range(X_int.shape[1]):
             X_col_data = X_og_array[:, i]
@@ -822,7 +839,7 @@ class OrdinalMergeRaresHandleUnknownEncoder(_BaseEncoder):
         n_features = len(self.categories_)

         # validate shape of passed X
-        msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
+        msg = "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
         if X.shape[1] != n_features:
             raise ValueError(msg.format(n_features, X.shape[1]))