autogluon.tabular 1.5.1b20260105__py3-none-any.whl → 1.5.1b20260116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of autogluon.tabular might be problematic.
- autogluon/tabular/__init__.py +1 -0
- autogluon/tabular/configs/config_helper.py +18 -6
- autogluon/tabular/configs/feature_generator_presets.py +3 -1
- autogluon/tabular/configs/hyperparameter_configs.py +42 -9
- autogluon/tabular/configs/presets_configs.py +38 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +84 -14
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +48 -48
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +774 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +421 -1
- autogluon/tabular/experimental/_scikit_mixin.py +6 -2
- autogluon/tabular/experimental/_tabular_classifier.py +3 -1
- autogluon/tabular/experimental/_tabular_regressor.py +3 -1
- autogluon/tabular/experimental/plot_leaderboard.py +73 -19
- autogluon/tabular/learner/abstract_learner.py +160 -42
- autogluon/tabular/learner/default_learner.py +78 -22
- autogluon/tabular/models/__init__.py +2 -2
- autogluon/tabular/models/_utils/rapids_utils.py +3 -1
- autogluon/tabular/models/abstract/abstract_torch_model.py +2 -0
- autogluon/tabular/models/automm/automm_model.py +12 -3
- autogluon/tabular/models/automm/ft_transformer.py +5 -1
- autogluon/tabular/models/catboost/callbacks.py +2 -2
- autogluon/tabular/models/catboost/catboost_model.py +93 -29
- autogluon/tabular/models/catboost/catboost_softclass_utils.py +4 -1
- autogluon/tabular/models/catboost/catboost_utils.py +3 -1
- autogluon/tabular/models/ebm/ebm_model.py +8 -13
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +1 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +1 -0
- autogluon/tabular/models/fastainn/callbacks.py +20 -3
- autogluon/tabular/models/fastainn/hyperparameters/searchspaces.py +11 -1
- autogluon/tabular/models/fastainn/quantile_helpers.py +10 -2
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +65 -18
- autogluon/tabular/models/fasttext/fasttext_model.py +3 -1
- autogluon/tabular/models/image_prediction/image_predictor.py +7 -2
- autogluon/tabular/models/knn/knn_model.py +41 -8
- autogluon/tabular/models/lgb/callbacks.py +32 -9
- autogluon/tabular/models/lgb/hyperparameters/searchspaces.py +3 -1
- autogluon/tabular/models/lgb/lgb_model.py +150 -34
- autogluon/tabular/models/lgb/lgb_utils.py +12 -4
- autogluon/tabular/models/lr/hyperparameters/searchspaces.py +5 -1
- autogluon/tabular/models/lr/lr_model.py +40 -10
- autogluon/tabular/models/lr/lr_rapids_model.py +22 -13
- autogluon/tabular/models/mitra/_internal/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +36 -40
- autogluon/tabular/models/mitra/_internal/config/config_run.py +2 -14
- autogluon/tabular/models/mitra/_internal/config/enums.py +27 -26
- autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +14 -21
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +10 -12
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +17 -32
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +12 -27
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +16 -21
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +130 -111
- autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/data/collator.py +30 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +18 -26
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +10 -7
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +70 -100
- autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/models/base.py +7 -10
- autogluon/tabular/models/mitra/_internal/models/embedding.py +46 -56
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +140 -120
- autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -1
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +3 -1
- autogluon/tabular/models/mitra/mitra_model.py +16 -11
- autogluon/tabular/models/mitra/sklearn_interface.py +178 -162
- autogluon/tabular/models/realmlp/realmlp_model.py +28 -15
- autogluon/tabular/models/rf/compilers/onnx.py +1 -1
- autogluon/tabular/models/rf/rf_model.py +45 -12
- autogluon/tabular/models/rf/rf_quantile.py +4 -2
- autogluon/tabular/models/tabdpt/tabdpt_model.py +8 -17
- autogluon/tabular/models/tabicl/tabicl_model.py +8 -1
- autogluon/tabular/models/tabm/_tabm_internal.py +6 -4
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +80 -127
- autogluon/tabular/models/tabm/tabm_model.py +8 -4
- autogluon/tabular/models/tabm/tabm_reference.py +53 -85
- autogluon/tabular/models/tabpfnmix/_internal/core/callbacks.py +7 -16
- autogluon/tabular/models/tabpfnmix/_internal/core/collator.py +16 -24
- autogluon/tabular/models/tabpfnmix/_internal/core/dataset_split.py +5 -7
- autogluon/tabular/models/tabpfnmix/_internal/core/enums.py +0 -2
- autogluon/tabular/models/tabpfnmix/_internal/core/get_loss.py +0 -1
- autogluon/tabular/models/tabpfnmix/_internal/core/get_optimizer.py +7 -18
- autogluon/tabular/models/tabpfnmix/_internal/core/get_scheduler.py +3 -14
- autogluon/tabular/models/tabpfnmix/_internal/core/trainer_finetune.py +79 -64
- autogluon/tabular/models/tabpfnmix/_internal/core/y_transformer.py +3 -5
- autogluon/tabular/models/tabpfnmix/_internal/data/dataset_finetune.py +17 -30
- autogluon/tabular/models/tabpfnmix/_internal/data/preprocessor.py +15 -35
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py +21 -38
- autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py +33 -51
- autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py +4 -4
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py +32 -12
- autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py +32 -13
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +55 -19
- autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +21 -48
- autogluon/tabular/models/tabprep/prep_mixin.py +34 -26
- autogluon/tabular/models/tabular_nn/compilers/onnx.py +36 -8
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +130 -36
- autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py +8 -4
- autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py +26 -5
- autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py +41 -24
- autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +33 -8
- autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py +21 -6
- autogluon/tabular/models/xgboost/callbacks.py +9 -3
- autogluon/tabular/models/xgboost/xgboost_model.py +59 -11
- autogluon/tabular/models/xt/xt_model.py +1 -0
- autogluon/tabular/predictor/interpretable_predictor.py +3 -1
- autogluon/tabular/predictor/predictor.py +409 -128
- autogluon/tabular/registry/__init__.py +1 -1
- autogluon/tabular/registry/_ag_model_registry.py +4 -5
- autogluon/tabular/registry/_model_registry.py +1 -0
- autogluon/tabular/testing/fit_helper.py +55 -15
- autogluon/tabular/testing/generate_datasets.py +1 -1
- autogluon/tabular/testing/model_fit_helper.py +10 -4
- autogluon/tabular/trainer/abstract_trainer.py +644 -230
- autogluon/tabular/trainer/auto_trainer.py +19 -8
- autogluon/tabular/trainer/model_presets/presets.py +33 -9
- autogluon/tabular/trainer/model_presets/presets_distill.py +16 -2
- autogluon/tabular/version.py +1 -1
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/METADATA +26 -26
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/RECORD +127 -135
- autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +0 -20
- autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +0 -40
- autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +0 -201
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +0 -1464
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +0 -747
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +0 -863
- autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +0 -106
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +0 -466
- /autogluon.tabular-1.5.1b20260105-py3.11-nspkg.pth → /autogluon.tabular-1.5.1b20260116-py3.11-nspkg.pth +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/WHEEL +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/licenses/LICENSE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/licenses/NOTICE +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/namespace_packages.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/top_level.txt +0 -0
- {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/zip-safe +0 -0
autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py

@@ -4,13 +4,11 @@ import torch.nn as nn


 class FoundationEmbeddingX(torch.nn.Module):
-
     def __init__(
-
-
-
-
-
+        self,
+        dim: int,
+        n_features: int,
+    ) -> None:
         super().__init__()

         self.dim = dim
@@ -18,9 +16,7 @@ class FoundationEmbeddingX(torch.nn.Module):

         self.x_embedding = nn.Linear(n_features, dim)

-
     def forward(self, x_support: torch.Tensor, x_query__: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-
         batch_size = x_support.shape[0]
         n_obs_support = x_support.shape[1]
         n_obs_query__ = x_query__.shape[1]
@@ -32,53 +28,44 @@ class FoundationEmbeddingX(torch.nn.Module):


 class FoundationEmbeddingYFloat(torch.nn.Module):
-
     def __init__(
-
-
-
-
+        self,
+        dim: int,
+    ) -> None:
         super().__init__()

         self.dim = dim

         self.y_embedding = nn.Linear(1, dim)

-
     def forward(self, y_support: torch.Tensor, n_obs_query: int) -> tuple[torch.Tensor, torch.Tensor]:
-
         batch_size = y_support.shape[0]

         y_support = y_support.type(torch.float32)
-        y_support = einops.rearrange(y_support,
+        y_support = einops.rearrange(y_support, "b n -> b n 1")

         y_support = self.y_embedding(y_support)
         y_query = torch.zeros((batch_size, n_obs_query, self.dim), device=y_support.device, dtype=torch.float32)

         return y_support, y_query
-


 class FoundationEmbeddingYInteger(torch.nn.Module):
-
     def __init__(
-
-
-
-
-
+        self,
+        n_classes: int,
+        dim: int,
+    ) -> None:
         super().__init__()

         self.n_classes = n_classes
         self.dim = dim

         self.y_embedding = nn.Embedding(n_classes, dim)
-        self.y_padding = nn.Embedding(1, dim, padding_idx=0)
-        self.y_mask = nn.Embedding(1, dim)
-
+        self.y_padding = nn.Embedding(1, dim, padding_idx=0)  # padding is modeled as a separate class
+        self.y_mask = nn.Embedding(1, dim)  # masking is also modeled as a separate class

     def forward(self, y_support: torch.Tensor, n_obs_query: int) -> tuple[torch.Tensor, torch.Tensor]:
-
         batch_size = y_support.shape[0]
         n_obs_support = y_support.shape[1]

@@ -88,33 +75,29 @@ class FoundationEmbeddingYInteger(torch.nn.Module):
         y_support_pad = y_support == -100

         y_sup = torch.zeros((batch_size, n_obs_support, self.dim), device=y_support.device, dtype=torch.float32)
-        y_sup[
-        y_sup[~y_support_pad] = self.y_embedding(
+        y_sup[y_support_pad] = self.y_padding(y_support[y_support_pad] + 100)
+        y_sup[~y_support_pad] = self.y_embedding(y_support[~y_support_pad])

         y_query = torch.zeros((batch_size, n_obs_query), device=y_support.device, dtype=torch.int64)
         y_query = self.y_mask(y_query)

         return y_sup, y_query
-

-class FoundationObservationEmbedding(torch.nn.Module):

+class FoundationObservationEmbedding(torch.nn.Module):
     def __init__(self, dim: int) -> None:
-
         super().__init__()

         self.dim = dim
         self.max_dim = 2**16
         self.embedding = nn.Embedding(self.max_dim, dim)

-
     def forward(self, batch_size: int, n_obs: int) -> torch.Tensor:
+        assert n_obs <= self.max_dim, f"Number of observations is too large. Max is {self.max_dim}, got {n_obs}"

-
-
-        # Take a random embedding from the pool of embeddings
+        # Take a random embedding from the pool of embeddings
         weights = torch.ones((batch_size, self.max_dim), dtype=torch.float32, device=self.embedding.weight.device)
         indices = torch.multinomial(weights, num_samples=n_obs, replacement=False)
         x = self.embedding(indices)
-
-        return x
+
+        return x
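
In the FoundationEmbeddingYInteger hunk above, support labels equal to -100 are routed to a dedicated padding embedding and all other labels to the class embedding, now written as single-line index assignments. A standalone sketch of that pattern in plain PyTorch (class and variable names here are illustrative, not part of the package):

import torch
import torch.nn as nn


class PaddedLabelEmbedding(nn.Module):
    """Embed integer labels where -100 marks padded positions (the usual ignore_index convention)."""

    def __init__(self, n_classes: int, dim: int) -> None:
        super().__init__()
        self.class_embedding = nn.Embedding(n_classes, dim)
        # padding_idx=0 keeps the padding vector at zero, mirroring the diff above
        self.pad_embedding = nn.Embedding(1, dim, padding_idx=0)

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        is_pad = y == -100
        out = torch.zeros((*y.shape, self.class_embedding.embedding_dim), dtype=torch.float32, device=y.device)
        out[is_pad] = self.pad_embedding(y[is_pad] + 100)    # -100 -> index 0
        out[~is_pad] = self.class_embedding(y[~is_pad])      # real class ids
        return out


y = torch.tensor([[0, 2, -100], [1, -100, -100]])
emb = PaddedLabelEmbedding(n_classes=3, dim=4)
print(emb(y).shape)  # torch.Size([2, 3, 4])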
autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py

@@ -1,16 +1,14 @@
-
 import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from
+from huggingface_hub import PyTorchModelHubMixin

+from ...core.enums import Task
 from .embedding import FoundationEmbeddingX, FoundationEmbeddingYFloat, FoundationEmbeddingYInteger
-from huggingface_hub import PyTorchModelHubMixin


 class FoundationTransformer(nn.Module, PyTorchModelHubMixin):
-
     def __init__(
         self,
         n_features: int,
@@ -22,7 +20,6 @@ class FoundationTransformer(nn.Module, PyTorchModelHubMixin):
         y_as_float_embedding: bool,
         task: str = Task.CLASSIFICATION,
     ) -> None:
-
         super().__init__()

         self.n_features = n_features
@@ -44,36 +41,34 @@ class FoundationTransformer(nn.Module, PyTorchModelHubMixin):
         self.layers = nn.ModuleList([])

         for _ in range(n_layers):
-
             att = MultiheadAttention(dim, n_heads)

-            self.layers.append(
-
-
-
-
-
-
-
-
+            self.layers.append(
+                nn.ModuleDict(
+                    {
+                        "layer_norm1": nn.LayerNorm(dim),
+                        "attention": att,
+                        "layer_norm2": nn.LayerNorm(dim),
+                        "linear1": nn.Linear(dim, dim * 4),
+                        "linear2": nn.Linear(dim * 4, dim),
+                    }
+                )
+            )
+
+        self.final_layer1 = nn.Linear(dim, dim * 4)
         if self.task == Task.CLASSIFICATION:
-            self.final_layer2 = nn.Linear(dim*4, n_classes)
+            self.final_layer2 = nn.Linear(dim * 4, n_classes)
         elif self.task == Task.REGRESSION:
-            self.final_layer2 = nn.Linear(dim*4, 1)
+            self.final_layer2 = nn.Linear(dim * 4, 1)
         self.init_weights()

-
     def init_weights(self):
-
         for module_dict in self.layers:
-
             # module_dict['attention'].init_weights()
-            nn.init.zeros_(module_dict[
-            nn.init.zeros_(module_dict[
-
+            nn.init.zeros_(module_dict["linear2"].weight)
+            nn.init.zeros_(module_dict["linear2"].bias)

     def forward(self, x_support: torch.Tensor, y_support: torch.Tensor, x_query: torch.Tensor):
-
         """
         x_support is (batch_size, n_observations_support, n_features)
         y_support is (batch_size, n_observations_support)
@@ -106,38 +101,34 @@ class FoundationTransformer(nn.Module, PyTorchModelHubMixin):
         support = x_support + y_support
         query__ = x_query__ + y_query__

-        x, pack = einops.pack((support, query__),
-
-        for module_dict in self.layers:
+        x, pack = einops.pack((support, query__), "b * d")

+        for module_dict in self.layers:
             x_residual = x
-            support, query__ = einops.unpack(x, pack,
-            att_support = module_dict[
-            att_query__ = module_dict[
-            x = einops.pack((att_support, att_query__),
+            support, query__ = einops.unpack(x, pack, "b * d")
+            att_support = module_dict["attention"](support, support, support, key_padding_mask=padding_mask)
+            att_query__ = module_dict["attention"](query__, support, support, key_padding_mask=padding_mask)
+            x = einops.pack((att_support, att_query__), "b * d")[0]
             x = x_residual + x
-            x = module_dict[
+            x = module_dict["layer_norm1"](x)
             x_residual = x
-            x = module_dict[
+            x = module_dict["linear1"](x)
             x = torch.nn.functional.gelu(x)
-            x = module_dict[
+            x = module_dict["linear2"](x)
             x = x_residual + x
-            x = module_dict[
+            x = module_dict["layer_norm2"](x)

         x = self.final_layer1(x)
         x = F.gelu(x)
         x = self.final_layer2(x)

-        support, query__ = einops.unpack(x, pack,
+        support, query__ = einops.unpack(x, pack, "b * c")

         return query__


-
 class MultiheadAttention(torch.nn.Module):
-
     def __init__(self, dim: int, n_heads: int) -> None:
-
         super().__init__()

         self.use_flash_attention = False
@@ -146,21 +137,14 @@ class MultiheadAttention(torch.nn.Module):

         self.att = nn.MultiheadAttention(dim, n_heads, dropout=0.0, batch_first=True)

-
-
     def init_weights(self):
         pass
         # nn.init.zeros_(self.att.out_proj.weight)
         # nn.init.zeros_(self.att.out_proj.bias)

-
     def forward(
-
-
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_padding_mask: torch.Tensor
-    ) -> torch.Tensor:
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, key_padding_mask: torch.Tensor
+    ) -> torch.Tensor:
         """
         b = batch size
         n = number of samples (dataset size)
@@ -179,9 +163,7 @@ class MultiheadAttention(torch.nn.Module):
         return output


-
-
 class SwiGLU(nn.Module):
     def forward(self, x):
         x, gate = x.chunk(2, dim=-1)
-        return F.silu(gate) * x
+        return F.silu(gate) * x
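
Most of the rewritten forward lines above complete einops.pack / einops.unpack calls that were cut off in this view: support and query rows are packed along the sequence axis so they flow through the same attention and MLP blocks, then split apart again. A minimal round-trip sketch of that pattern (shapes chosen arbitrarily; assumes einops >= 0.6, which introduced pack/unpack):

import einops
import torch

# Support and query batches share the batch and feature dims but differ in length.
support = torch.randn(2, 5, 8)   # (batch, n_support, dim)
query = torch.randn(2, 3, 8)     # (batch, n_query, dim)

# Pack along the middle axis; `ps` records how to split the result later.
x, ps = einops.pack((support, query), "b * d")
print(x.shape)  # torch.Size([2, 8, 8])

# ... shared layers would process x here ...

support_out, query_out = einops.unpack(x, ps, "b * d")
print(support_out.shape, query_out.shape)  # torch.Size([2, 5, 8]) torch.Size([2, 3, 8])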
autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py

@@ -17,21 +17,21 @@ class PredictionMetrics:

     @classmethod
     def from_prediction(cls, y_pred: np.ndarray, y_true: np.ndarray, task: Task, metric: Scorer):
-
         loss, score, metrics = compute_metrics(y_pred, y_true, task, metric=metric)

         return PredictionMetrics(task=task, loss=loss, score=score, metrics=metrics)


 def compute_metrics(y_pred: np.ndarray, y_true: np.ndarray, task: Task, metric: Scorer) -> tuple[float, float, dict]:
-
     if task == Task.CLASSIFICATION:
         return compute_classification_metrics(y_pred, y_true, metric=metric)
     else:
         return compute_regression_metrics(y_pred, y_true, metric=metric)
-

-
+
+def compute_classification_metrics(
+    y_pred: np.ndarray, y_true: np.ndarray, metric: Scorer
+) -> tuple[float, float, dict]:
     # predictions are assumed to be log-probabilities

     if metric.needs_pred or metric.needs_class:
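
The prediction_metrics change is purely cosmetic: compute_classification_metrics is reflowed onto several lines, while the task-based dispatch stays the same. A rough sketch of that dispatch using sklearn metrics in place of AutoGluon's Scorer (the Task enum and compute_loss helper below are illustrative, not the package API):

from enum import Enum

import numpy as np
from sklearn.metrics import log_loss, mean_squared_error


class Task(str, Enum):
    CLASSIFICATION = "classification"
    REGRESSION = "regression"


def compute_loss(y_pred: np.ndarray, y_true: np.ndarray, task: Task) -> float:
    # Classification predictions are class probabilities; regression predictions are raw values.
    if task == Task.CLASSIFICATION:
        return log_loss(y_true, y_pred)
    return mean_squared_error(y_true, y_pred)


y_true = np.array([0, 1, 1])
y_proba = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
print(compute_loss(y_proba, y_true, Task.CLASSIFICATION))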
autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py

@@ -3,8 +3,8 @@ from __future__ import annotations
 from pathlib import Path

 import numpy as np
-from sklearn.base import BaseEstimator, ClassifierMixin
 import torch
+from sklearn.base import BaseEstimator, ClassifierMixin

 from .core.dataset_split import make_stratified_dataset_split
 from .core.trainer_finetune import TrainerFinetune
@@ -16,32 +16,52 @@ from .models.foundation.foundation_transformer import FoundationTransformer
 # TODO: To mitigate val overfitting, can fit multiple random seeds at same time and pick same epoch for all of them, track average performance on epoch.
 # TODO: Test shuffling the data and see if it makes TabPFNv2 worse, same with TabForestPFN
 class TabPFNMixClassifier(BaseEstimator, ClassifierMixin):
-    def __init__(
+    def __init__(
+        self,
+        n_classes,
+        cfg,
+        split_val,
+        model_path: str = None,
+        weights_path: str | Path = None,
+        stopping_metric=None,
+        use_best_epoch: bool = True,
+    ):
         if weights_path is not None:
             weights_path = str(Path(weights_path))

         if model_path is not None:
             model = FoundationTransformer.from_pretrained(model_path)
-            assert model.task == cfg.task,
+            assert model.task == cfg.task, (
+                f"The pretrained model '{model_path}' is for task {model.task}, but the problem type is for task {cfg.task}..."
+            )
         else:
             model = FoundationTransformer(
-                n_features=cfg.hyperparams[
-                n_classes=cfg.hyperparams[
-                dim=cfg.hyperparams[
-                n_layers=cfg.hyperparams[
-                n_heads=cfg.hyperparams[
-                attn_dropout=cfg.hyperparams[
-                y_as_float_embedding=cfg.hyperparams[
+                n_features=cfg.hyperparams["n_features"],
+                n_classes=cfg.hyperparams["n_classes"],
+                dim=cfg.hyperparams["dim"],
+                n_layers=cfg.hyperparams["n_layers"],
+                n_heads=cfg.hyperparams["n_heads"],
+                attn_dropout=cfg.hyperparams["attn_dropout"],
+                y_as_float_embedding=cfg.hyperparams["y_as_float_embedding"],
                 task=cfg.task,
             )
         if weights_path is not None:
             model.load_state_dict(torch.load(weights_path, weights_only=True))  # nosec B614

         self.split_val = split_val
-        self.trainer = TrainerFinetune(
+        self.trainer = TrainerFinetune(
+            cfg, model, n_classes=n_classes, stopping_metric=stopping_metric, use_best_epoch=use_best_epoch
+        )
         super().__init__()

-    def fit(
+    def fit(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        X_val: np.ndarray = None,
+        y_val: np.ndarray = None,
+        time_limit: float = None,
+    ):
         # FIXME: Should X and y be preprocessed for inference efficiency? Yes.
         self.X_ = X  # FIXME: Optimize storage of X and y? Is this redundant? Is X and y saving done multiple times during pickle?
         self.y_ = y
autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py

@@ -3,8 +3,8 @@ from __future__ import annotations
 from pathlib import Path

 import numpy as np
-from sklearn.base import BaseEstimator, RegressorMixin
 import torch
+from sklearn.base import BaseEstimator, RegressorMixin

 from .core.dataset_split import make_stratified_dataset_split
 from .core.trainer_finetune import TrainerFinetune
@@ -16,8 +16,16 @@ from .models.foundation.foundation_transformer import FoundationTransformer
 # TODO: To mitigate val overfitting, can fit multiple random seeds at same time and pick same epoch for all of them, track average performance on epoch.
 # TODO: Test shuffling the data and see if it makes TabPFNv2 worse, same with TabForestPFN
 class TabPFNMixRegressor(BaseEstimator, RegressorMixin):
-    def __init__(
-
+    def __init__(
+        self,
+        n_classes,
+        cfg,
+        split_val,
+        model_path: str = None,
+        weights_path: str | Path = None,
+        stopping_metric=None,
+        use_best_epoch: bool = True,
+    ):
         self.cfg = cfg

         if weights_path is not None:
@@ -25,26 +33,37 @@ class TabPFNMixRegressor(BaseEstimator, RegressorMixin):

         if model_path is not None:
             model = FoundationTransformer.from_pretrained(model_path)
-            assert model.task == cfg.task,
+            assert model.task == cfg.task, (
+                f"The pretrained model '{model_path}' is for task {model.task}, but the problem type is for task {cfg.task}..."
+            )
         else:
             model = FoundationTransformer(
-                n_features=cfg.hyperparams[
-                n_classes=cfg.hyperparams[
-                dim=cfg.hyperparams[
-                n_layers=cfg.hyperparams[
-                n_heads=cfg.hyperparams[
-                attn_dropout=cfg.hyperparams[
-                y_as_float_embedding=cfg.hyperparams[
+                n_features=cfg.hyperparams["n_features"],
+                n_classes=cfg.hyperparams["n_classes"],
+                dim=cfg.hyperparams["dim"],
+                n_layers=cfg.hyperparams["n_layers"],
+                n_heads=cfg.hyperparams["n_heads"],
+                attn_dropout=cfg.hyperparams["attn_dropout"],
+                y_as_float_embedding=cfg.hyperparams["y_as_float_embedding"],
                 task=cfg.task,
             )
         if weights_path is not None:
             model.load_state_dict(torch.load(weights_path, weights_only=True))  # nosec B614

         self.split_val = split_val
-        self.trainer = TrainerFinetune(
+        self.trainer = TrainerFinetune(
+            cfg, model, n_classes=n_classes, stopping_metric=stopping_metric, use_best_epoch=use_best_epoch
+        )
         super().__init__()

-    def fit(
+    def fit(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        X_val: np.ndarray = None,
+        y_val: np.ndarray = None,
+        time_limit: float = None,
+    ):
         # FIXME: Should X and y be preprocessed for inference efficiency? Yes.
         self.X_ = X  # FIXME: Optimize storage of X and y? Is this redundant? Is X and y saving done multiple times during pickle?
         self.y_ = y