autogluon.tabular 1.3.2b20250713__py3-none-any.whl → 1.3.2b20250715__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- autogluon/tabular/models/__init__.py +1 -0
- autogluon/tabular/models/catboost/catboost_model.py +9 -6
- autogluon/tabular/models/catboost/catboost_utils.py +10 -0
- autogluon/tabular/models/lgb/lgb_model.py +2 -1
- autogluon/tabular/models/mitra/__init__.py +0 -0
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +190 -0
- autogluon/tabular/models/mitra/_internal/config/config_run.py +32 -0
- autogluon/tabular/models/mitra/_internal/config/enums.py +145 -0
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +94 -0
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +55 -0
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +108 -0
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +67 -0
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +134 -0
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +367 -0
- autogluon/tabular/models/mitra/_internal/data/collator.py +46 -0
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +132 -0
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +53 -0
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +420 -0
- autogluon/tabular/models/mitra/_internal/models/base.py +21 -0
- autogluon/tabular/models/mitra/_internal/models/embedding.py +182 -0
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +667 -0
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +15 -0
- autogluon/tabular/models/mitra/mitra_model.py +214 -0
- autogluon/tabular/models/mitra/sklearn_interface.py +462 -0
- autogluon/tabular/registry/_ag_model_registry.py +2 -0
- autogluon/tabular/testing/fit_helper.py +2 -2
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/METADATA +21 -12
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/RECORD +36 -16
- /autogluon.tabular-1.3.2b20250713-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250715-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.3.2b20250713.dist-info → autogluon.tabular-1.3.2b20250715.dist-info}/zip-safe +0 -0
autogluon/tabular/models/mitra/_internal/data/collator.py (new file)
@@ -0,0 +1,46 @@
import torch


class CollatorWithPadding():

    def __init__(
        self,
        max_features: int,
        pad_to_max_features: bool,
    ) -> None:

        self.max_features = max_features
        self.pad_to_max_features = pad_to_max_features


    def __call__(self, batch: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:

        max_support_samples = max(dataset['x_support'].shape[0] for dataset in batch)
        max_query_samples = max(dataset['x_query'].shape[0] for dataset in batch)
        max_features = max(dataset['x_support'].shape[1] for dataset in batch)

        if self.pad_to_max_features:
            max_features = self.max_features

        batch_size = len(batch)

        tensor_dict = {
            'x_support': torch.zeros((batch_size, max_support_samples, max_features), dtype=batch[0]['x_support'].dtype),
            'y_support': torch.full((batch_size, max_support_samples), fill_value=-100, dtype=batch[0]['y_support'].dtype),
            'x_query': torch.zeros((batch_size, max_query_samples, max_features), dtype=batch[0]['x_query'].dtype),
            'y_query': torch.full((batch_size, max_query_samples), fill_value=-100, dtype=batch[0]['y_query'].dtype),
            'padding_features': torch.ones((batch_size, max_features), dtype=torch.bool),
            'padding_obs_support': torch.ones((batch_size, max_support_samples), dtype=torch.bool),
            'padding_obs_query': torch.ones((batch_size, max_query_samples), dtype=torch.bool),
        }

        for i, dataset in enumerate(batch):
            tensor_dict['x_support'][i, :dataset['x_support'].shape[0], :dataset['x_support'].shape[1]] = dataset['x_support']
            tensor_dict['y_support'][i, :dataset['y_support'].shape[0]] = dataset['y_support']
            tensor_dict['x_query'][i, :dataset['x_query'].shape[0], :dataset['x_support'].shape[1]] = dataset['x_query']
            tensor_dict['y_query'][i, :dataset['y_query'].shape[0]] = dataset['y_query']
            tensor_dict['padding_features'][i, :dataset['x_support'].shape[1]] = False
            tensor_dict['padding_obs_support'][i, :dataset['x_support'].shape[0]] = False
            tensor_dict['padding_obs_query'][i, :dataset['x_query'].shape[0]] = False

        return tensor_dict
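Usage sketch (editor's illustration, not part of the diff): two episodes with different numbers of rows and features are zero-padded to a common shape, with boolean masks marking the padding. The shapes and labels below are made up.

import torch

batch = [
    {'x_support': torch.randn(8, 4), 'y_support': torch.zeros(8, dtype=torch.long),
     'x_query': torch.randn(3, 4), 'y_query': torch.zeros(3, dtype=torch.long)},
    {'x_support': torch.randn(5, 6), 'y_support': torch.ones(5, dtype=torch.long),
     'x_query': torch.randn(2, 6), 'y_query': torch.ones(2, dtype=torch.long)},
]

collator = CollatorWithPadding(max_features=10, pad_to_max_features=False)
out = collator(batch)

print(out['x_support'].shape)         # torch.Size([2, 8, 6])
print(out['padding_obs_support'][1])  # last three rows of the second episode are padding (True)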
autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py (new file)
@@ -0,0 +1,132 @@
from typing import Optional

import numpy as np
import torch

from ..._internal.config.config_run import ConfigRun
from ..._internal.data.dataset_split import make_dataset_split
from ..._internal.config.enums import Task


class DatasetFinetune(torch.utils.data.Dataset):
    """
    The main goal of this class is to generate a dataset for fine-tuning.
    The input data are the full (x_support, y_support, x_query, y_query)
    But these arrays are too large to be pushed through the model at once.
    So here we split query the data into chunks if the query data is too large.
    If the support data is too large, we randomly sample from it.
    Furthermore, we transition from numpy to tensors.
    """

    def __init__(
        self,
        cfg: ConfigRun,
        x_support: np.ndarray,
        y_support: np.ndarray,
        x_query: np.ndarray,
        y_query: Optional[np.ndarray],
        max_samples_support: int,
        max_samples_query: int
    ):
        """
        :param: max_features: number of features the tab pfn model has been trained on
        """

        self.cfg = cfg

        self.x_support = x_support
        self.y_support = y_support
        self.x_query = x_query
        self.y_query = y_query

        if self.y_query is None:
            self.y_query = np.zeros((self.x_query.shape[0],)) - 1

        self.max_samples_support = max_samples_support
        self.max_samples_query = max_samples_query

        self.x_queries = self.split_in_chunks(self.x_query, max_samples_query)
        self.y_queries = self.split_in_chunks(self.y_query, max_samples_query)

        self.n_samples_support = self.x_support.shape[0]

        # We push the whole training data through the model, unless it is too large
        self.support_size = min(self.max_samples_support, self.n_samples_support)


    def __len__(self):
        return len(self.x_queries)

    def __getitem__(self, idx):

        support_indices = np.random.choice(
            self.n_samples_support,
            size=self.support_size,
            replace=False
        )

        x_support = self.x_support[support_indices]
        y_support = self.y_support[support_indices]

        x_support_tensor = torch.as_tensor(x_support)
        y_support_tensor = torch.as_tensor(y_support)
        x_query_tensor = torch.as_tensor(self.x_queries[idx])
        y_query_tensor = torch.as_tensor(self.y_queries[idx])

        return {
            'x_support': x_support_tensor,
            'y_support': y_support_tensor,
            'x_query': x_query_tensor,
            'y_query': y_query_tensor,
        }


    def split_in_chunks(self, x: np.ndarray, batch_size: int) -> list[np.ndarray]:
        """
        Splits the data into chunks of size batch_size
        """

        n_chunks = int(np.ceil(x.shape[0] / batch_size))
        x_chunks = []

        for i in range(n_chunks):
            x_chunks.append(x[i * batch_size: (i + 1) * batch_size])

        return x_chunks


def DatasetFinetuneGenerator(
    cfg: ConfigRun,
    x: np.ndarray,
    y: np.ndarray,
    task: Task,
    max_samples_support: int,
    max_samples_query: int
):
    """
    The dataset fine-tune generator is a generator that yields a dataset for fine-tuning.
    The idea is to split the training dataset into a support and query set.
    Every single iteration, the generator yields a different support and query set split.
    The dataset made always has exactly one batch.
    """

    while True:

        x_support, x_query, y_support, y_query = make_dataset_split(x=x, y=y, task=task, seed=cfg.seed)
        n_samples_support = x_support.shape[0]
        n_samples_query = x_query.shape[0]

        support_size = min(max_samples_support, n_samples_support)
        query_size = min(max_samples_query, n_samples_query)

        dataset_finetune = DatasetFinetune(
            cfg=cfg,
            x_support=x_support[:support_size],
            y_support=y_support[:support_size],
            x_query=x_query[:query_size],
            y_query=y_query[:query_size],
            max_samples_support=max_samples_support,
            max_samples_query=max_samples_query,
        )

        yield dataset_finetune
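A usage sketch of DatasetFinetune on toy data (editor's illustration, not part of the diff). The real caller passes a ConfigRun instance from config_run.py (listed above), but since cfg is only stored by the class, None is enough for illustration.

import numpy as np

x_train = np.random.rand(1000, 10).astype(np.float32)
y_train = np.random.randint(0, 3, size=1000)
x_test = np.random.rand(250, 10).astype(np.float32)

dataset = DatasetFinetune(
    cfg=None,                      # stand-in for a ConfigRun instance
    x_support=x_train, y_support=y_train,
    x_query=x_test, y_query=None,  # unlabeled query rows are filled with -1 internally
    max_samples_support=500,       # support rows are subsampled down to this size
    max_samples_query=100,         # query rows are split into chunks of this size
)

print(len(dataset))              # 3 chunks: 100 + 100 + 50 query rows
item = dataset[0]
print(item['x_support'].shape)   # torch.Size([500, 10])
print(item['x_query'].shape)     # torch.Size([100, 10])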
autogluon/tabular/models/mitra/_internal/data/dataset_split.py (new file)
@@ -0,0 +1,53 @@
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split

from ..._internal.config.enums import Task


def make_dataset_split(x: np.ndarray, y: np.ndarray, task: Task, seed: int) -> tuple[np.ndarray, ...]:
    # Splits the dataset into train and validation sets with ratio 80/20

    if task == Task.REGRESSION:
        return make_standard_dataset_split(x, y, seed=seed)

    size_of_smallest_class = np.min(np.bincount(y))

    if size_of_smallest_class >= 5:
        # stratification needs have at least 5 samples in each class if split is 80/20
        return make_stratified_dataset_split(x, y, seed=seed)
    else:
        return make_standard_dataset_split(x, y, seed=seed)


def make_stratified_dataset_split(x, y, n_splits=5, seed=0):

    # Stratify doesn't shuffle the data, so we shuffle it first
    permutation = np.random.permutation(len(y))
    x, y = x[permutation], y[permutation]

    min_samples_per_class = np.min(np.bincount(y))

    # Adjust n_splits based on both total samples and minimum samples per class
    n_samples = len(y)
    max_possible_splits = min(n_samples - 1, min_samples_per_class)
    n_splits = min(n_splits, max_possible_splits)

    # Ensure we have at least 2 splits if possible
    if n_samples >= 2 and min_samples_per_class >= 2:
        n_splits = max(2, n_splits)
    else:
        # If we can't do stratified splitting, fall back to standard split
        return make_standard_dataset_split(x, y, seed)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    indices = next(skf.split(x, y))
    x_t_train, x_t_valid = x[indices[0]], x[indices[1]]  # 80%, 20%
    y_t_train, y_t_valid = y[indices[0]], y[indices[1]]

    return x_t_train, x_t_valid, y_t_train, y_t_valid


def make_standard_dataset_split(x, y, seed):

    return train_test_split(
        x, y, test_size=0.2, random_state=seed,
    )
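A quick sketch of the 80/20 behaviour (editor's illustration, not part of the diff; Task is the enum from _internal/config/enums.py, which is listed above but not shown in this hunk):

import numpy as np

x = np.random.rand(100, 5)
y = np.tile([0, 1], 50)  # both classes occur 50 times, so the stratified path is taken

x_train, x_valid, y_train, y_valid = make_dataset_split(x, y, task=Task.CLASSIFICATION, seed=0)
print(x_train.shape, x_valid.shape)  # (80, 5) (20, 5)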
autogluon/tabular/models/mitra/_internal/data/preprocessor.py (new file)
@@ -0,0 +1,420 @@
from typing import Optional, Self

import random
import numpy as np
from loguru import logger
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from ..._internal.config.enums import Task


class NoneTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X


class Preprocessor():
    """
    This class is used to preprocess the data before it is pushed through the model.
    The preprocessor assures that the data has the right shape and is normalized,
    This way the model always gets the same input distribution,
    no matter whether the input data is synthetic or real.
    """

    def __init__(
        self,
        dim_embedding: Optional[int],  # Size of the feature embedding. For some models this is None, which means the embedding does not depend on the number of features
        n_classes: int,  # Actual number of classes in the dataset, assumed to be numbered 0, ..., n_classes - 1
        dim_output: int,  # Maximum number of classes the model has been trained on -> size of the output
        use_quantile_transformer: bool,
        use_feature_count_scaling: bool,
        use_random_transforms: bool,
        shuffle_classes: bool,
        shuffle_features: bool,
        random_mirror_regression: bool,
        random_mirror_x: bool,
        task: Task
    ):

        self.dim_embedding = dim_embedding
        self.n_classes = n_classes
        self.dim_output = dim_output
        self.use_quantile_transformer = use_quantile_transformer
        self.use_feature_count_scaling = use_feature_count_scaling
        self.use_random_transforms = use_random_transforms
        self.shuffle_classes = shuffle_classes
        self.shuffle_features = shuffle_features
        self.random_mirror_regression = random_mirror_regression
        self.random_mirror_x = random_mirror_x
        self.task = task

    def fit(self, X: np.ndarray, y: np.ndarray) -> Self:
        """
        X: np.ndarray [n_samples, n_features]
        y: np.ndarray [n_samples]
        """

        if self.task == Task.CLASSIFICATION:
            # We assume that y properly presents classes [0, 1, 2, ...] before passing to the preprocessor
            # If the test set has a class that is not in the training set, we will throw an error

            assert np.all(y < self.n_classes), "y contains class values that are not in the range of n_classes"

        self.compute_pre_nan_mean(X)
        X = self.impute_nan_features_with_mean(X)

        self.determine_which_features_are_singular(X)
        X = self.cutoff_singular_features(X, self.singular_features)

        self.determine_which_features_to_select(X, y)
        X = self.select_features(X)

        if self.use_quantile_transformer:
            # If use quantile transform is off, it means that the preprocessing will happen on the GPU.
            X = self.fit_transform_quantile_transformer(X)

            self.mean, self.std = self.calc_mean_std(X)
            X = self.normalize_by_mean_std(X, self.mean, self.std)

        if self.use_random_transforms:
            X = self.transform_tabpfn(X)

        if self.task == Task.CLASSIFICATION and self.shuffle_classes:
            self.determine_shuffle_class_order()

        if self.shuffle_features:
            self.determine_feature_order(X)

        if self.task == Task.REGRESSION:
            self.determine_mix_max_scale(y)

        if self.task == Task.REGRESSION and self.random_mirror_regression:
            self.determine_regression_mirror()

        if self.random_mirror_x:
            self.determine_mirror(X)

        X[np.isnan(X)] = 0
        X[np.isinf(X)] = 0

        return self


    def transform_X(self, X: np.ndarray):

        X = self.impute_nan_features_with_mean(X)
        X = self.cutoff_singular_features(X, self.singular_features)
        X = self.select_features(X)

        if self.use_quantile_transformer:
            # If use quantile transform is off, it means that the preprocessing will happen on the GPU.

            X = self.quantile_transformer.transform(X)

            X = self.normalize_by_mean_std(X, self.mean, self.std)

            if self.use_feature_count_scaling:
                X = self.normalize_by_feature_count(X)

        if self.use_random_transforms:
            X = self.random_transforms.transform(X)

        if self.shuffle_features:
            X = self.randomize_feature_order(X)

        if self.random_mirror_x:
            X = self.apply_random_mirror_x(X)

        X = X.astype(np.float32)

        X[np.isnan(X)] = 0
        X[np.isinf(X)] = 0

        return X


    def transform_tabpfn(self, X: np.ndarray):

        n_samples = X.shape[0]
        n_features = X.shape[1]

        use_config1 = random.random() < 0.5
        random_state = random.randint(0, 1000000)

        if use_config1:
            self.random_transforms = Pipeline([
                ('quantile', QuantileTransformer(
                    output_distribution="normal",
                    n_quantiles=max(n_samples // 10, 2),
                    random_state=random_state
                )),
                ('svd', FeatureUnion([
                    ('passthrough', NoneTransformer()),
                    ('svd', Pipeline([
                        ('standard', StandardScaler(with_mean=False)),
                        ('svd', TruncatedSVD(
                            algorithm="arpack",
                            n_components=max(1, min(n_samples // 10 + 1, n_features // 2)),
                            random_state=random_state
                        ))
                    ]))
                ]))
            ])
        else:
            self.random_transforms = ColumnTransformer([
                ('ordinal', OrdinalEncoder(
                    handle_unknown="use_encoded_value",
                    unknown_value=np.nan
                ), [])
            ], remainder='passthrough')

        return self.random_transforms.fit_transform(X)


    def transform_y(self, y: np.ndarray):

        if self.task == Task.CLASSIFICATION:
            # We assume that y properly presents classes [0, 1, 2, ...] before passing to the preprocessor
            # If the test set has a class that is not in the training set, we will throw an error
            assert np.all(y < self.n_classes), "y contains class values that are not in the range of n_classes"

        if self.task == Task.CLASSIFICATION and self.shuffle_classes:
            y = self.randomize_class_order(y)

        if self.task == Task.REGRESSION:
            y = self.normalize_y(y)

        if self.task == Task.REGRESSION and self.random_mirror_regression:
            y = self.apply_random_mirror_regression(y)

        match self.task:
            case Task.CLASSIFICATION:
                y = y.astype(np.int64)
            case Task.REGRESSION:
                y = y.astype(np.float32)

        return y


    def inverse_transform_y(self, y: np.ndarray):
        # Function used during the prediction to transform the model output back to the original space
        # For classification, y is assumed to be logits of shape [n_samples, n_classes]

        match self.task:
            case Task.CLASSIFICATION:
                y = self.extract_correct_classes(y)

                if self.shuffle_classes:
                    y = self.undo_randomize_class_order(y)

            case Task.REGRESSION:

                if self.random_mirror_regression:
                    y = self.apply_random_mirror_regression(y)

                y = self.undo_normalize_y(y)

        return y


    def fit_transform_quantile_transformer(self, X: np.ndarray) -> np.ndarray:

        n_obs, n_features = X.shape
        n_quantiles = min(n_obs, 1000)
        self.quantile_transformer = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='normal')
        X = self.quantile_transformer.fit_transform(X)

        return X


    def determine_which_features_are_singular(self, x: np.ndarray) -> None:

        self.singular_features = np.array([ len(np.unique(x_col)) for x_col in x.T ]) == 1


    def determine_which_features_to_select(self, x: np.ndarray, y: np.ndarray) -> None:

        if self.dim_embedding is None:
            # All features are selected
            return

        if x.shape[1] > self.dim_embedding:
            logger.info(f"Number of features is capped at {self.dim_embedding}, but the dataset has {x.shape[1]} features. A subset of {self.dim_embedding} are selected using SelectKBest")

            self.select_k_best = SelectKBest(k=self.dim_embedding)
            self.select_k_best.fit(x, y)


    def compute_pre_nan_mean(self, x: np.ndarray) -> None:
        """
        Computes the mean of the data before the NaNs are imputed
        """
        self.pre_nan_mean = np.nanmean(x, axis=0)


    def impute_nan_features_with_mean(self, x: np.ndarray) -> np.ndarray:

        inds = np.where(np.isnan(x))
        x[inds] = np.take(self.pre_nan_mean, inds[1])
        return x


    def select_features(self, x: np.ndarray) -> np.ndarray:

        if self.dim_embedding is None:
            # All features are selected
            return x

        if x.shape[1] > self.dim_embedding:
            x = self.select_k_best.transform(x)

        return x


    def cutoff_singular_features(self, x: np.ndarray, singular_features: np.ndarray) -> np.ndarray:

        if singular_features.any():
            x = x[:, ~singular_features]

        return x


    def calc_mean_std(self, x: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """
        Calculates the mean and std of the training data
        """
        mean = x.mean(axis=0)
        std = x.std(axis=0) + 1e-6
        return mean, std


    def normalize_by_mean_std(self, x: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
        """
        Normalizes the data by the mean and std
        """

        x = (x - mean) / std
        return x


    def normalize_by_feature_count(self, x: np.ndarray) -> np.ndarray:
        """
        An interesting way of normalization by the tabPFN paper
        """

        assert self.dim_embedding is not None, "dim_embedding must be set to use this feature count scaling"

        x = x * self.dim_embedding / x.shape[1]

        return x


    def extend_feature_dim_to_dim_embedding(self, x: np.ndarray, dim_embedding) -> np.ndarray:
        """
        Increases the number of features to the number of features the model has been trained on
        """

        assert self.dim_embedding is not None, "dim_embedding must be set to extend the feature dimension"

        added_zeros = np.zeros((x.shape[0], dim_embedding - x.shape[1]), dtype=np.float32)
        x = np.concatenate([x, added_zeros], axis=1)
        return x


    def determine_mix_max_scale(self, y: np.ndarray) -> None:
        self.y_min = y.min()
        self.y_max = y.max()
        assert self.y_min != self.y_max, "y_min and y_max are the same, cannot normalize, regression makes no sense"


    def normalize_y(self, y: np.ndarray) -> np.ndarray:
        y = (y - self.y_min) / (self.y_max - self.y_min)
        return y


    def undo_normalize_y(self, y: np.ndarray) -> np.ndarray:
        y = y * (self.y_max - self.y_min) + self.y_min
        return y


    def determine_regression_mirror(self) -> None:
        self.regression_mirror = np.random.choice([True, False], size=(1,)).item()


    def apply_random_mirror_regression(self, y: np.ndarray) -> np.ndarray:
        if self.regression_mirror:
            y = 1 - y
        return y


    def determine_mirror(self, x: np.ndarray) -> None:

        n_features = x.shape[1]
        self.mirror = np.random.choice([1, -1], size=(1, n_features))


    def apply_random_mirror_x(self, x: np.ndarray) -> np.ndarray:

        x = x * self.mirror
        return x


    def determine_shuffle_class_order(self) -> None:

        if self.shuffle_classes:
            self.new_shuffle_classes = np.random.permutation(self.n_classes)
        else:
            self.new_shuffle_classes = np.arange(self.n_classes)


    def randomize_class_order(self, y: np.ndarray) -> np.ndarray:

        mapping = { i: self.new_shuffle_classes[i] for i in range(self.n_classes) }
        y = np.array([mapping[i.item()] for i in y], dtype=np.int64)

        return y


    def undo_randomize_class_order(self, y_logits: np.ndarray) -> np.ndarray:
        """
        We assume y_logits has shape [n_samples, n_classes]
        """

        # mapping = {self.new_shuffle_classes[i]: i for i in range(self.n_classes)}
        mapping = {i: self.new_shuffle_classes[i] for i in range(self.n_classes)}
        y = np.concatenate([y_logits[:, mapping[i]:mapping[i]+1] for i in range(self.n_classes)], axis=1)

        return y


    def extract_correct_classes(self, y_logits: np.ndarray) -> np.ndarray:
        # Even though our network might be able to support 10 classes,
        # If the problem only has three classes, we should give three classes as output.
        # We assume y_logits has shape [n_samples, n_classes]
        y_logits = y_logits[:, :self.n_classes]
        return y_logits


    def determine_feature_order(self, x: np.ndarray) -> None:

        n_features = x.shape[1]
        self.new_feature_order = np.random.permutation(n_features)


    def randomize_feature_order(self, x: np.ndarray) -> np.ndarray:

        x = x[:, self.new_feature_order]

        return x
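A fit/transform round-trip sketch on toy data (editor's illustration, not part of the diff). Task is again the enum from the enums module, and the flag values below are arbitrary choices for the sketch rather than the defaults used elsewhere in the package.

import numpy as np

x_train = np.random.rand(200, 12).astype(np.float32)
y_train = np.random.randint(0, 3, size=200)

prep = Preprocessor(
    dim_embedding=None,            # keep every feature
    n_classes=3,
    dim_output=10,
    use_quantile_transformer=True,
    use_feature_count_scaling=False,
    use_random_transforms=False,
    shuffle_classes=False,
    shuffle_features=False,
    random_mirror_regression=False,
    random_mirror_x=True,
    task=Task.CLASSIFICATION,
)

prep.fit(x_train, y_train)
x_model = prep.transform_X(x_train.copy())   # impute, quantile-normalize, random mirror
y_model = prep.transform_y(y_train)          # int64 class labels

logits = np.random.rand(200, 10)             # stand-in for the model's raw output
logits_3 = prep.inverse_transform_y(logits)  # trimmed back to the 3 real classes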
autogluon/tabular/models/mitra/_internal/models/base.py (new file)
@@ -0,0 +1,21 @@
import torch
import torch.nn as nn
from abc import ABC, abstractmethod

class BaseModel(nn.Module, ABC):

    def __init__(self):
        super().__init__()

    def init_weights(self):
        """Initialize model weights."""
        pass

    @abstractmethod
    def forward(self,
                x_support: torch.Tensor,
                y_support: torch.Tensor,
                x_query: torch.Tensor,
                **kwargs):
        """Forward pass for the model."""
        pass
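To illustrate the interface this abstract base enforces, a hypothetical toy subclass (editor's sketch, not part of the diff; the real implementation added by this release is Tab2D in models/tab2d.py, listed above):

import torch
import torch.nn as nn

class TinyModel(BaseModel):

    def __init__(self, dim: int, n_classes: int):
        super().__init__()
        self.proj = nn.Linear(dim, n_classes)

    def forward(self, x_support, y_support, x_query, **kwargs):
        # Ignores the support set entirely; a real in-context model conditions on it.
        return self.proj(x_query)

model = TinyModel(dim=4, n_classes=3)
logits = model(torch.randn(2, 8, 4), torch.zeros(2, 8), torch.randn(2, 3, 4))
print(logits.shape)  # torch.Size([2, 3, 3])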