autogluon.tabular 1.3.2b20250710__py3-none-any.whl → 1.3.2b20250712__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. autogluon/tabular/models/__init__.py +1 -1
  2. autogluon/tabular/models/tabpfnv2/__init__.py +0 -0
  3. autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +20 -0
  4. autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +40 -0
  5. autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +201 -0
  6. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +1464 -0
  7. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +747 -0
  8. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +863 -0
  9. autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +106 -0
  10. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +376 -0
  11. autogluon/tabular/registry/_ag_model_registry.py +2 -2
  12. autogluon/tabular/version.py +1 -1
  13. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/METADATA +12 -14
  14. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/RECORD +21 -14
  15. autogluon/tabular/models/tabpfn/__init__.py +0 -1
  16. autogluon/tabular/models/tabpfn/tabpfn_model.py +0 -153
  17. /autogluon.tabular-1.3.2b20250710-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250712-py3.9-nspkg.pth +0 -0
  18. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/LICENSE +0 -0
  19. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/NOTICE +0 -0
  20. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/WHEEL +0 -0
  21. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/namespace_packages.txt +0 -0
  22. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/top_level.txt +0 -0
  23. {autogluon.tabular-1.3.2b20250710.dist-info → autogluon.tabular-1.3.2b20250712.dist-info}/zip-safe +0 -0
@@ -1,153 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import numpy as np
4
- import pandas as pd
5
-
6
- from autogluon.core.constants import BINARY, MULTICLASS
7
- from autogluon.core.models import AbstractModel
8
- from autogluon.core.utils import generate_train_test_split
9
- from autogluon.features.generators import LabelEncoderFeatureGenerator
10
-
11
-
12
- class TabPFNModel(AbstractModel):
13
- """
14
- AutoGluon model wrapper to the TabPFN model: https://github.com/automl/TabPFN
15
-
16
- Paper: "TabPFN: A Transformer That Solves Small Tabular Classification Problems in a Second"
17
- Authors: Noah Hollmann, Samuel Müller, Katharina Eggensperger, and Frank Hutter
18
-
19
- TabPFN is a viable model option when inference speed is not a concern,
20
- and the number of rows of training data is less than 10,000.
21
-
22
- Additionally, TabPFN is only available for classification tasks with up to 10 classes and 100 features.
23
-
24
- To use this model, `tabpfn` must be installed.
25
- To install TabPFN, you can run `pip install autogluon.tabular[tabpfn]` or `pip install tabpfn`.
26
- """
27
- ag_key = "TABPFN"
28
- ag_name = "TabPFN"
29
- ag_priority = 110
30
-
31
- def __init__(self, **kwargs):
32
- super().__init__(**kwargs)
33
- self._feature_generator = None
34
-
35
- def _fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
36
- from tabpfn import TabPFNClassifier
37
-
38
- ag_params = self._get_ag_params()
39
- sample_rows = ag_params.get("sample_rows")
40
- max_features = ag_params.get("max_features")
41
- max_classes = ag_params.get("max_classes")
42
- if max_classes is not None and self.num_classes > max_classes:
43
- # TODO: Move to earlier stage when problem_type is checked
44
- raise AssertionError(f"Max allowed classes for the model is {max_classes}, " f"but found {self.num_classes} classes.")
45
-
46
- # TODO: Make sample_rows generic
47
- if sample_rows is not None and len(X) > sample_rows:
48
- X, y = self._subsample_train(X=X, y=y, num_rows=sample_rows)
49
- X = self.preprocess(X)
50
- num_features = X.shape[1]
51
- # TODO: Make max_features generic
52
- if max_features is not None and num_features > max_features:
53
- raise AssertionError(f"Max allowed features for the model is {max_features}, " f"but found {num_features} features.")
54
- hyp = self._get_model_params()
55
- N_ensemble_configurations = hyp.get("N_ensemble_configurations")
56
- self.model = TabPFNClassifier(device="cpu", N_ensemble_configurations=N_ensemble_configurations).fit( # TODO: Add GPU option
57
- X, y, overwrite_warning=True
58
- )
59
-
60
- # TODO: Make this generic by creating a generic `preprocess_train` and putting this logic prior to `_preprocess`.
61
- def _subsample_train(self, X: pd.DataFrame, y: pd.Series, num_rows: int, random_state=0) -> (pd.DataFrame, pd.Series):
62
- num_rows_to_drop = len(X) - num_rows
63
- X, _, y, _ = generate_train_test_split(
64
- X=X,
65
- y=y,
66
- problem_type=self.problem_type,
67
- test_size=num_rows_to_drop,
68
- random_state=random_state,
69
- min_cls_count_train=1,
70
- )
71
- return X, y
72
-
73
- def _preprocess(self, X: pd.DataFrame, **kwargs) -> np.ndarray:
74
- """
75
- Converts categorical to label encoded integers
76
- Keeps missing values, as TabPFN automatically handles missing values internally.
77
- """
78
- X = super()._preprocess(X, **kwargs)
79
- if self._feature_generator is None:
80
- self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
81
- self._feature_generator.fit(X=X)
82
- if self._feature_generator.features_in:
83
- X = X.copy()
84
- X[self._feature_generator.features_in] = self._feature_generator.transform(X=X)
85
- X = X.to_numpy(dtype=np.float32)
86
- return X
87
-
88
- def _set_default_params(self):
89
- """
90
- By default, we only use 1 ensemble configurations to speed up inference times.
91
- Increase the value to improve model quality while linearly increasing inference time.
92
-
93
- Model quality improvement diminishes significantly beyond `N_ensemble_configurations=8`.
94
- """
95
- default_params = {
96
- "N_ensemble_configurations": 1,
97
- }
98
- for param, val in default_params.items():
99
- self._set_default_param_value(param, val)
100
-
101
- @classmethod
102
- def supported_problem_types(cls) -> list[str] | None:
103
- return ["binary", "multiclass"]
104
-
105
- def _get_default_auxiliary_params(self) -> dict:
106
- """
107
- TabPFN was originally learned on synthetic datasets with 1024 rows, and struggles to
108
- leverage additional rows effectively beyond a certain point.
109
-
110
- In the TabPFN paper, performance appeared to stagnate around 4000 rows of training data (Figure 10).
111
- Thus, we set `sample_rows=4096`, to only use that many rows of training data, even if more is available.
112
-
113
- TODO: TabPFN scales poorly on large datasets, so we set `max_rows=20000`.
114
- Not implemented yet, first move this logic to the trainer level to avoid `refit_full` edge-case crashes.
115
- TabPFN only works on datasets with at most 100 features, so we set `max_features=100`.
116
- TabPFN only works on datasets with at most 10 classes, so we set `max_classes=10`.
117
- """
118
- default_auxiliary_params = super()._get_default_auxiliary_params()
119
- default_auxiliary_params.update(
120
- {
121
- "sample_rows": 4096,
122
- # 'max_rows': 20000,
123
- "max_features": 100,
124
- "max_classes": 10,
125
- }
126
- )
127
- return default_auxiliary_params
128
-
129
- # FIXME: Enabling parallel bagging TabPFN creates a lot of warnings / potential failures from Ray
130
- # TODO: Consider not setting `max_sets=1`, and only setting it in the preset hyperparameter definition.
131
- @classmethod
132
- def _get_default_ag_args_ensemble(cls, **kwargs) -> dict:
133
- """
134
- Set max_sets to 1 when bagging, otherwise inference time could become extremely slow.
135
- Set fold_fitting_strategy to sequential_local, as parallel folding causing many warnings / potential errors from Ray.
136
- """
137
- default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs)
138
- extra_ag_args_ensemble = {
139
- "max_sets": 1,
140
- "fold_fitting_strategy": "sequential_local",
141
- }
142
- default_ag_args_ensemble.update(extra_ag_args_ensemble)
143
- return default_ag_args_ensemble
144
-
145
- def _ag_params(self) -> set:
146
- return {"sample_rows", "max_features", "max_classes"}
147
-
148
- def _more_tags(self) -> dict:
149
- """
150
- Because TabPFN doesn't use validation data for early stopping, it supports refit_full natively.
151
- """
152
- tags = {"can_refit_full": True}
153
- return tags