autogluon.tabular 1.3.2b20250610__py3-none-any.whl → 1.4.1b20251214__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. autogluon/tabular/configs/config_helper.py +1 -1
  2. autogluon/tabular/configs/hyperparameter_configs.py +2 -265
  3. autogluon/tabular/configs/pipeline_presets.py +130 -0
  4. autogluon/tabular/configs/presets_configs.py +51 -26
  5. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +0 -1
  6. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +310 -0
  7. autogluon/tabular/models/__init__.py +6 -1
  8. autogluon/tabular/models/_utils/rapids_utils.py +1 -1
  9. autogluon/tabular/models/automm/automm_model.py +2 -0
  10. autogluon/tabular/models/automm/ft_transformer.py +4 -1
  11. autogluon/tabular/models/catboost/callbacks.py +3 -2
  12. autogluon/tabular/models/catboost/catboost_model.py +15 -9
  13. autogluon/tabular/models/catboost/catboost_utils.py +17 -3
  14. autogluon/tabular/models/ebm/__init__.py +0 -0
  15. autogluon/tabular/models/ebm/ebm_model.py +259 -0
  16. autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
  17. autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
  18. autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
  19. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +7 -5
  20. autogluon/tabular/models/knn/knn_model.py +7 -3
  21. autogluon/tabular/models/lgb/lgb_model.py +60 -21
  22. autogluon/tabular/models/lr/lr_model.py +6 -1
  23. autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
  24. autogluon/tabular/models/lr/lr_rapids_model.py +45 -5
  25. autogluon/tabular/models/mitra/__init__.py +0 -0
  26. autogluon/tabular/models/mitra/_internal/__init__.py +1 -0
  27. autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -0
  28. autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +190 -0
  29. autogluon/tabular/models/mitra/_internal/config/config_run.py +32 -0
  30. autogluon/tabular/models/mitra/_internal/config/enums.py +162 -0
  31. autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -0
  32. autogluon/tabular/models/mitra/_internal/core/callbacks.py +94 -0
  33. autogluon/tabular/models/mitra/_internal/core/get_loss.py +54 -0
  34. autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +108 -0
  35. autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +67 -0
  36. autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +132 -0
  37. autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +373 -0
  38. autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -0
  39. autogluon/tabular/models/mitra/_internal/data/collator.py +46 -0
  40. autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +136 -0
  41. autogluon/tabular/models/mitra/_internal/data/dataset_split.py +57 -0
  42. autogluon/tabular/models/mitra/_internal/data/preprocessor.py +420 -0
  43. autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -0
  44. autogluon/tabular/models/mitra/_internal/models/base.py +21 -0
  45. autogluon/tabular/models/mitra/_internal/models/embedding.py +182 -0
  46. autogluon/tabular/models/mitra/_internal/models/tab2d.py +667 -0
  47. autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -0
  48. autogluon/tabular/models/mitra/_internal/utils/set_seed.py +15 -0
  49. autogluon/tabular/models/mitra/mitra_model.py +380 -0
  50. autogluon/tabular/models/mitra/sklearn_interface.py +494 -0
  51. autogluon/tabular/models/realmlp/__init__.py +0 -0
  52. autogluon/tabular/models/realmlp/realmlp_model.py +360 -0
  53. autogluon/tabular/models/rf/rf_model.py +11 -6
  54. autogluon/tabular/models/tabicl/__init__.py +0 -0
  55. autogluon/tabular/models/tabicl/tabicl_model.py +179 -0
  56. autogluon/tabular/models/tabm/__init__.py +0 -0
  57. autogluon/tabular/models/tabm/_tabm_internal.py +545 -0
  58. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +810 -0
  59. autogluon/tabular/models/tabm/tabm_model.py +356 -0
  60. autogluon/tabular/models/tabm/tabm_reference.py +631 -0
  61. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +13 -7
  62. autogluon/tabular/models/tabpfnv2/__init__.py +0 -0
  63. autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +20 -0
  64. autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +40 -0
  65. autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +201 -0
  66. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +1464 -0
  67. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +747 -0
  68. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +863 -0
  69. autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +106 -0
  70. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +388 -0
  71. autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
  72. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +5 -5
  73. autogluon/tabular/models/xgboost/xgboost_model.py +10 -3
  74. autogluon/tabular/predictor/predictor.py +147 -84
  75. autogluon/tabular/registry/_ag_model_registry.py +12 -2
  76. autogluon/tabular/testing/fit_helper.py +57 -27
  77. autogluon/tabular/testing/generate_datasets.py +7 -0
  78. autogluon/tabular/trainer/abstract_trainer.py +3 -1
  79. autogluon/tabular/trainer/model_presets/presets.py +10 -1
  80. autogluon/tabular/version.py +1 -1
  81. autogluon.tabular-1.4.1b20251214-py3.11-nspkg.pth +1 -0
  82. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/METADATA +112 -57
  83. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/RECORD +89 -40
  84. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/WHEEL +1 -1
  85. autogluon/tabular/models/tabpfn/__init__.py +0 -1
  86. autogluon/tabular/models/tabpfn/tabpfn_model.py +0 -153
  87. autogluon.tabular-1.3.2b20250610-py3.9-nspkg.pth +0 -1
  88. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/LICENSE +0 -0
  89. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/NOTICE +0 -0
  90. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/namespace_packages.txt +0 -0
  91. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/top_level.txt +0 -0
  92. {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/zip-safe +0 -0
autogluon/tabular/models/mitra/_internal/data/collator.py
@@ -0,0 +1,46 @@
+ import torch
+
+
+ class CollatorWithPadding():
+
+     def __init__(
+         self,
+         max_features: int,
+         pad_to_max_features: bool,
+     ) -> None:
+
+         self.max_features = max_features
+         self.pad_to_max_features = pad_to_max_features
+
+
+     def __call__(self, batch: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
+
+         max_support_samples = max(dataset['x_support'].shape[0] for dataset in batch)
+         max_query_samples = max(dataset['x_query'].shape[0] for dataset in batch)
+         max_features = max(dataset['x_support'].shape[1] for dataset in batch)
+
+         if self.pad_to_max_features:
+             max_features = self.max_features
+
+         batch_size = len(batch)
+
+         tensor_dict = {
+             'x_support': torch.zeros((batch_size, max_support_samples, max_features), dtype=batch[0]['x_support'].dtype),
+             'y_support': torch.full((batch_size, max_support_samples), fill_value=-100, dtype=batch[0]['y_support'].dtype),
+             'x_query': torch.zeros((batch_size, max_query_samples, max_features), dtype=batch[0]['x_query'].dtype),
+             'y_query': torch.full((batch_size, max_query_samples), fill_value=-100, dtype=batch[0]['y_query'].dtype),
+             'padding_features': torch.ones((batch_size, max_features), dtype=torch.bool),
+             'padding_obs_support': torch.ones((batch_size, max_support_samples), dtype=torch.bool),
+             'padding_obs_query': torch.ones((batch_size, max_query_samples), dtype=torch.bool),
+         }
+
+         for i, dataset in enumerate(batch):
+             tensor_dict['x_support'][i, :dataset['x_support'].shape[0], :dataset['x_support'].shape[1]] = dataset['x_support']
+             tensor_dict['y_support'][i, :dataset['y_support'].shape[0]] = dataset['y_support']
+             tensor_dict['x_query'][i, :dataset['x_query'].shape[0], :dataset['x_support'].shape[1]] = dataset['x_query']
+             tensor_dict['y_query'][i, :dataset['y_query'].shape[0]] = dataset['y_query']
+             tensor_dict['padding_features'][i, :dataset['x_support'].shape[1]] = False
+             tensor_dict['padding_obs_support'][i, :dataset['x_support'].shape[0]] = False
+             tensor_dict['padding_obs_query'][i, :dataset['x_query'].shape[0]] = False
+
+         return tensor_dict
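
For reference, a minimal sketch (not part of the diff) of how CollatorWithPadding might be called to batch variable-sized support/query episodes; the episode shapes and the max_features value below are illustrative assumptions, not values taken from AutoGluon.

import torch
from autogluon.tabular.models.mitra._internal.data.collator import CollatorWithPadding

# Two toy episodes with different numbers of rows and features (illustrative values).
episodes = [
    {'x_support': torch.randn(5, 3), 'y_support': torch.zeros(5, dtype=torch.long),
     'x_query': torch.randn(2, 3), 'y_query': torch.zeros(2, dtype=torch.long)},
    {'x_support': torch.randn(8, 2), 'y_support': torch.ones(8, dtype=torch.long),
     'x_query': torch.randn(4, 2), 'y_query': torch.ones(4, dtype=torch.long)},
]

collate = CollatorWithPadding(max_features=4, pad_to_max_features=True)
batch = collate(episodes)

print(batch['x_support'].shape)          # torch.Size([2, 8, 4]): padded to the largest episode and max_features
print(batch['padding_obs_support'][0])   # True marks padded support rows of the first episode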
autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py
@@ -0,0 +1,136 @@
+ from typing import Optional
+
+ import numpy as np
+ import torch
+
+ from ..._internal.config.config_run import ConfigRun
+ from ..._internal.data.dataset_split import make_dataset_split
+ from ..._internal.config.enums import Task
+
+
+ class DatasetFinetune(torch.utils.data.Dataset):
+     """
+     The main goal of this class is to generate a dataset for fine-tuning.
+     The input data are the full (x_support, y_support, x_query, y_query) arrays,
+     but these arrays may be too large to be pushed through the model at once.
+     So here we split the query data into chunks if it is too large.
+     If the support data is too large, we randomly sample from it.
+     Furthermore, we convert from numpy arrays to torch tensors.
+     """
+
+     def __init__(
+         self,
+         cfg: ConfigRun,
+         x_support: np.ndarray,
+         y_support: np.ndarray,
+         x_query: np.ndarray,
+         y_query: Optional[np.ndarray],
+         max_samples_support: int,
+         max_samples_query: int,
+         rng: np.random.RandomState,
+     ):
+         """
+         :param max_features: number of features the tab pfn model has been trained on
+         """
+
+         self.cfg = cfg
+         self.rng = rng
+
+         self.x_support = x_support
+         self.y_support = y_support
+         self.x_query = x_query
+         self.y_query = y_query
+
+         if self.y_query is None:
+             self.y_query = np.zeros((self.x_query.shape[0],)) - 1
+
+         self.max_samples_support = max_samples_support
+         self.max_samples_query = max_samples_query
+
+         self.x_queries = self.split_in_chunks(self.x_query, max_samples_query)
+         self.y_queries = self.split_in_chunks(self.y_query, max_samples_query)
+
+         self.n_samples_support = self.x_support.shape[0]
+
+         # We push the whole training data through the model, unless it is too large
+         self.support_size = min(self.max_samples_support, self.n_samples_support)
+
+
+     def __len__(self):
+         return len(self.x_queries)
+
+     def __getitem__(self, idx):
+
+         support_indices = self.rng.choice(
+             self.n_samples_support,
+             size=self.support_size,
+             replace=False
+         )
+
+         x_support = self.x_support[support_indices]
+         y_support = self.y_support[support_indices]
+
+         x_support_tensor = torch.as_tensor(x_support)
+         y_support_tensor = torch.as_tensor(y_support)
+         x_query_tensor = torch.as_tensor(self.x_queries[idx])
+         y_query_tensor = torch.as_tensor(self.y_queries[idx])
+
+         return {
+             'x_support': x_support_tensor,
+             'y_support': y_support_tensor,
+             'x_query': x_query_tensor,
+             'y_query': y_query_tensor,
+         }
+
+
+
+     def split_in_chunks(self, x: np.ndarray, batch_size: int) -> list[np.ndarray]:
+         """
+         Splits the data into chunks of size batch_size
+         """
+
+         n_chunks = int(np.ceil(x.shape[0] / batch_size))
+         x_chunks = []
+
+         for i in range(n_chunks):
+             x_chunks.append(x[i * batch_size: (i + 1) * batch_size])
+
+         return x_chunks
+
+ def DatasetFinetuneGenerator(
+     cfg: ConfigRun,
+     x: np.ndarray,
+     y: np.ndarray,
+     task: Task,
+     max_samples_support: int,
+     max_samples_query: int,
+     rng: np.random.RandomState,
+ ):
+     """
+     The dataset fine-tune generator is a generator that yields a dataset for fine-tuning.
+     The idea is to split the training dataset into a support set and a query set.
+     Every iteration, the generator yields a different support/query split.
+     The generated dataset always contains exactly one batch.
+     """
+
+     while True:
+
+         x_support, x_query, y_support, y_query = make_dataset_split(x=x, y=y, task=task, seed=rng)
+         n_samples_support = x_support.shape[0]
+         n_samples_query = x_query.shape[0]
+
+         support_size = min(max_samples_support, n_samples_support)
+         query_size = min(max_samples_query, n_samples_query)
+
+         dataset_finetune = DatasetFinetune(
+             cfg=cfg,
+             x_support=x_support[:support_size],
+             y_support=y_support[:support_size],
+             x_query=x_query[:query_size],
+             y_query=y_query[:query_size],
+             max_samples_support=max_samples_support,
+             max_samples_query=max_samples_query,
+             rng=rng,
+         )
+
+         yield dataset_finetune
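
A minimal sketch (not part of the diff) of DatasetFinetune's chunking behaviour. ConfigRun is an internal config class; since the code above only stores it, None is passed here purely for illustration, and all sizes are arbitrary.

import numpy as np
from autogluon.tabular.models.mitra._internal.data.dataset_finetune import DatasetFinetune

rng = np.random.RandomState(0)
x_train, y_train = rng.randn(500, 10), rng.randint(0, 2, size=500)
x_test = rng.randn(1200, 10)

dataset = DatasetFinetune(
    cfg=None,                       # a ConfigRun instance in real use; only stored by the code above
    x_support=x_train, y_support=y_train,
    x_query=x_test, y_query=None,   # unlabeled queries are filled with -1
    max_samples_support=256,        # support set is subsampled to at most 256 rows per item
    max_samples_query=512,          # query set is split into chunks of at most 512 rows
    rng=rng,
)

print(len(dataset))                 # 3, i.e. ceil(1200 / 512) query chunks
item = dataset[0]
print(item['x_support'].shape, item['x_query'].shape)  # torch.Size([256, 10]) torch.Size([512, 10])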
autogluon/tabular/models/mitra/_internal/data/dataset_split.py
@@ -0,0 +1,57 @@
+ from __future__ import annotations
+
+ import numpy as np
+ from sklearn.model_selection import StratifiedKFold, train_test_split
+
+ from ..._internal.config.enums import Task
+
+ def make_dataset_split(x: np.ndarray, y: np.ndarray, task: Task, seed: int) -> tuple[np.ndarray, ...]:
+     # Splits the dataset into train and validation sets with ratio 80/20
+
+     if task == Task.REGRESSION:
+         return make_standard_dataset_split(x, y, seed=seed)
+
+     size_of_smallest_class = np.min(np.bincount(y))
+
+     if size_of_smallest_class >= 5:
+         # stratification needs at least 5 samples in each class if the split is 80/20
+         return make_stratified_dataset_split(x, y, seed=seed)
+     else:
+         return make_standard_dataset_split(x, y, seed=seed)
+
+
+ def make_stratified_dataset_split(x, y, n_splits=5, seed=0):
+     if isinstance(seed, int):
+         seed = np.random.RandomState(seed)
+
+     # Stratify doesn't shuffle the data, so we shuffle it first
+     permutation = seed.permutation(len(y))
+     x, y = x[permutation], y[permutation]
+
+     min_samples_per_class = np.min(np.bincount(y))
+
+     # Adjust n_splits based on both total samples and minimum samples per class
+     n_samples = len(y)
+     max_possible_splits = min(n_samples - 1, min_samples_per_class)
+     n_splits = min(n_splits, max_possible_splits)
+
+     # Ensure we have at least 2 splits if possible
+     if n_samples >= 2 and min_samples_per_class >= 2:
+         n_splits = max(2, n_splits)
+     else:
+         # If we can't do stratified splitting, fall back to standard split
+         return make_standard_dataset_split(x, y, seed)
+
+     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
+     indices = next(skf.split(x, y))
+     x_t_train, x_t_valid = x[indices[0]], x[indices[1]] # 80%, 20%
+     y_t_train, y_t_valid = y[indices[0]], y[indices[1]]
+
+     return x_t_train, x_t_valid, y_t_train, y_t_valid
+
+
+ def make_standard_dataset_split(x, y, seed):
+
+     return train_test_split(
+         x, y, test_size=0.2, random_state=seed,
+     )
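
A minimal sketch (not part of the diff) of make_dataset_split on a toy classification problem; because every class has at least 5 samples, the stratified 80/20 path is taken. The data shapes are illustrative.

import numpy as np
from autogluon.tabular.models.mitra._internal.config.enums import Task
from autogluon.tabular.models.mitra._internal.data.dataset_split import make_dataset_split

x = np.random.randn(100, 4)
y = np.repeat([0, 1], 50)  # two classes, 50 samples each

x_train, x_valid, y_train, y_valid = make_dataset_split(x, y, task=Task.CLASSIFICATION, seed=0)
print(x_train.shape, x_valid.shape)  # (80, 4) (20, 4), i.e. an 80/20 stratified split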
autogluon/tabular/models/mitra/_internal/data/preprocessor.py
@@ -0,0 +1,420 @@
+ import random
+ from typing import Optional
+
+ import numpy as np
+ from loguru import logger
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.compose import ColumnTransformer
+ from sklearn.decomposition import TruncatedSVD
+ from sklearn.feature_selection import SelectKBest
+ from sklearn.pipeline import FeatureUnion, Pipeline
+ from sklearn.preprocessing import (OrdinalEncoder, QuantileTransformer,
+                                    StandardScaler)
+
+ from ..._internal.config.enums import Task
+
+
+ class NoneTransformer(BaseEstimator, TransformerMixin):
+     def fit(self, X, y=None):
+         return self
+     def transform(self, X):
+         return X
+
+ class Preprocessor():
+     """
+     This class is used to preprocess the data before it is pushed through the model.
+     The preprocessor ensures that the data has the right shape and is normalized.
+     This way the model always gets the same input distribution,
+     no matter whether the input data is synthetic or real.
+
+     """
+
+     def __init__(
+         self,
+         dim_embedding: Optional[int],   # Size of the feature embedding. For some models this is None, which means the embedding does not depend on the number of features
+         n_classes: int,                 # Actual number of classes in the dataset, assumed to be numbered 0, ..., n_classes - 1
+         dim_output: int,                # Maximum number of classes the model has been trained on -> size of the output
+         use_quantile_transformer: bool,
+         use_feature_count_scaling: bool,
+         use_random_transforms: bool,
+         shuffle_classes: bool,
+         shuffle_features: bool,
+         random_mirror_regression: bool,
+         random_mirror_x: bool,
+         task: Task
+     ):
+
+         self.dim_embedding = dim_embedding
+         self.n_classes = n_classes
+         self.dim_output = dim_output
+         self.use_quantile_transformer = use_quantile_transformer
+         self.use_feature_count_scaling = use_feature_count_scaling
+         self.use_random_transforms = use_random_transforms
+         self.shuffle_classes = shuffle_classes
+         self.shuffle_features = shuffle_features
+         self.random_mirror_regression = random_mirror_regression
+         self.random_mirror_x = random_mirror_x
+         self.task = task
+
+     def fit(self, X: np.ndarray, y: np.ndarray) -> "Preprocessor":
+         """
+         X: np.ndarray [n_samples, n_features]
+         y: np.ndarray [n_samples]
+         """
+
+         if self.task == Task.CLASSIFICATION:
+             # We assume that y properly represents classes [0, 1, 2, ...] before being passed to the preprocessor
+             # If the test set has a class that is not in the training set, we will throw an error
+
+             assert np.all(y < self.n_classes), "y contains class values that are not in the range of n_classes"
+
+         self.compute_pre_nan_mean(X)
+         X = self.impute_nan_features_with_mean(X)
+
+         self.determine_which_features_are_singular(X)
+         X = self.cutoff_singular_features(X, self.singular_features)
+
+         self.determine_which_features_to_select(X, y)
+         X = self.select_features(X)
+
+         if self.use_quantile_transformer:
+             # If the quantile transformer is off, the preprocessing happens on the GPU instead.
+             X = self.fit_transform_quantile_transformer(X)
+
+             self.mean, self.std = self.calc_mean_std(X)
+             X = self.normalize_by_mean_std(X, self.mean, self.std)
+
+         if self.use_random_transforms:
+             X = self.transform_tabpfn(X)
+
+         if self.task == Task.CLASSIFICATION and self.shuffle_classes:
+             self.determine_shuffle_class_order()
+
+         if self.shuffle_features:
+             self.determine_feature_order(X)
+
+         if self.task == Task.REGRESSION:
+             self.determine_mix_max_scale(y)
+
+         if self.task == Task.REGRESSION and self.random_mirror_regression:
+             self.determine_regression_mirror()
+
+         if self.random_mirror_x:
+             self.determine_mirror(X)
+
+         X[np.isnan(X)] = 0
+         X[np.isinf(X)] = 0
+
+         return self
+
+
+     def transform_X(self, X: np.ndarray):
+
+         X = self.impute_nan_features_with_mean(X)
+         X = self.cutoff_singular_features(X, self.singular_features)
+         X = self.select_features(X)
+
+         if self.use_quantile_transformer:
+             # If the quantile transformer is off, the preprocessing happens on the GPU instead.
+
+             X = self.quantile_transformer.transform(X)
+
+             X = self.normalize_by_mean_std(X, self.mean, self.std)
+
+             if self.use_feature_count_scaling:
+                 X = self.normalize_by_feature_count(X)
+
+         if self.use_random_transforms:
+             X = self.random_transforms.transform(X)
+
+         if self.shuffle_features:
+             X = self.randomize_feature_order(X)
+
+         if self.random_mirror_x:
+             X = self.apply_random_mirror_x(X)
+
+         X = X.astype(np.float32)
+
+         X[np.isnan(X)] = 0
+         X[np.isinf(X)] = 0
+
+         return X
+
+
+     def transform_tabpfn(self, X: np.ndarray):
+
+         n_samples = X.shape[0]
+         n_features = X.shape[1]
+
+         use_config1 = random.random() < 0.5
+         random_state = random.randint(0, 1000000)
+
+         if use_config1:
+             self.random_transforms = Pipeline([
+                 ('quantile', QuantileTransformer(
+                     output_distribution="normal",
+                     n_quantiles=max(n_samples // 10, 2),
+                     random_state=random_state
+                 )),
+                 ('svd', FeatureUnion([
+                     ('passthrough', NoneTransformer()),
+                     ('svd', Pipeline([
+                         ('standard', StandardScaler(with_mean=False)),
+                         ('svd', TruncatedSVD(
+                             algorithm="arpack",
+                             n_components=max(1, min(n_samples // 10 + 1, n_features // 2)),
+                             random_state=random_state
+                         ))
+                     ]))
+                 ]))
+             ])
+         else:
+             self.random_transforms = ColumnTransformer([
+                 ('ordinal', OrdinalEncoder(
+                     handle_unknown="use_encoded_value",
+                     unknown_value=np.nan
+                 ), [])
+             ], remainder='passthrough')
+
+         return self.random_transforms.fit_transform(X)
+
+
+     def transform_y(self, y: np.ndarray):
+
+         if self.task == Task.CLASSIFICATION:
+             # We assume that y properly represents classes [0, 1, 2, ...] before being passed to the preprocessor
+             # If the test set has a class that is not in the training set, we will throw an error
+             assert np.all(y < self.n_classes), "y contains class values that are not in the range of n_classes"
+
+         if self.task == Task.CLASSIFICATION and self.shuffle_classes:
+             y = self.randomize_class_order(y)
+
+         if self.task == Task.REGRESSION:
+             y = self.normalize_y(y)
+
+         if self.task == Task.REGRESSION and self.random_mirror_regression:
+             y = self.apply_random_mirror_regression(y)
+
+         if self.task == Task.CLASSIFICATION:
+             y = y.astype(np.int64)
+         elif self.task == Task.REGRESSION:
+             y = y.astype(np.float32)
+
+         return y
+
+
+     def inverse_transform_y(self, y: np.ndarray):
+         # Function used during prediction to transform the model output back to the original space
+         # For classification, y is assumed to be logits of shape [n_samples, n_classes]
+
+         if self.task == Task.CLASSIFICATION:
+             y = self.extract_correct_classes(y)
+
+             if self.shuffle_classes:
+                 y = self.undo_randomize_class_order(y)
+
+         elif self.task == Task.REGRESSION:
+
+             if self.random_mirror_regression:
+                 y = self.apply_random_mirror_regression(y)
+
+             y = self.undo_normalize_y(y)
+
+         return y
+
+
+
+     def fit_transform_quantile_transformer(self, X: np.ndarray) -> np.ndarray:
+
+         n_obs, n_features = X.shape
+         n_quantiles = min(n_obs, 1000)
+         self.quantile_transformer = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='normal')
+         X = self.quantile_transformer.fit_transform(X)
+
+         return X
+
+
+
+     def determine_which_features_are_singular(self, x: np.ndarray) -> None:
+
+         self.singular_features = np.array([ len(np.unique(x_col)) for x_col in x.T ]) == 1
+
+
+
+     def determine_which_features_to_select(self, x: np.ndarray, y: np.ndarray) -> None:
+
+         if self.dim_embedding is None:
+             # All features are selected
+             return
+
+         if x.shape[1] > self.dim_embedding:
+             logger.info(f"Number of features is capped at {self.dim_embedding}, but the dataset has {x.shape[1]} features. A subset of {self.dim_embedding} features is selected using SelectKBest")
+
+             self.select_k_best = SelectKBest(k=self.dim_embedding)
+             self.select_k_best.fit(x, y)
+
+
+     def compute_pre_nan_mean(self, x: np.ndarray) -> None:
+         """
+         Computes the mean of the data before the NaNs are imputed
+         """
+         self.pre_nan_mean = np.nanmean(x, axis=0)
+
+
+     def impute_nan_features_with_mean(self, x: np.ndarray) -> np.ndarray:
+
+         inds = np.where(np.isnan(x))
+         x[inds] = np.take(self.pre_nan_mean, inds[1])
+         return x
+
+
+     def select_features(self, x: np.ndarray) -> np.ndarray:
+
+         if self.dim_embedding is None:
+             # All features are selected
+             return x
+
+         if x.shape[1] > self.dim_embedding:
+             x = self.select_k_best.transform(x)
+
+         return x
+
+
+     def cutoff_singular_features(self, x: np.ndarray, singular_features: np.ndarray) -> np.ndarray:
+
+         if singular_features.any():
+             x = x[:, ~singular_features]
+
+         return x
+
+
+     def calc_mean_std(self, x: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+         """
+         Calculates the mean and std of the training data
+         """
+         mean = x.mean(axis=0)
+         std = x.std(axis=0) + 1e-6
+         return mean, std
+
+
+     def normalize_by_mean_std(self, x: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
+         """
+         Normalizes the data by the mean and std
+         """
+
+         x = (x - mean) / std
+         return x
+
+
+     def normalize_by_feature_count(self, x: np.ndarray) -> np.ndarray:
+         """
+         A way of normalization introduced by the TabPFN paper
+         """
+
+         assert self.dim_embedding is not None, "dim_embedding must be set to use this feature count scaling"
+
+         x = x * self.dim_embedding / x.shape[1]
+
+         return x
+
+
+
+     def extend_feature_dim_to_dim_embedding(self, x: np.ndarray, dim_embedding) -> np.ndarray:
+         """
+         Increases the number of features to the number of features the model has been trained on
+         """
+
+         assert self.dim_embedding is not None, "dim_embedding must be set to extend the feature dimension"
+
+         added_zeros = np.zeros((x.shape[0], dim_embedding - x.shape[1]), dtype=np.float32)
+         x = np.concatenate([x, added_zeros], axis=1)
+         return x
+
+
+     def determine_mix_max_scale(self, y: np.ndarray) -> None:
+         self.y_min = y.min()
+         self.y_max = y.max()
+         assert self.y_min != self.y_max, "y_min and y_max are the same, cannot normalize, regression makes no sense"
+
+
+     def normalize_y(self, y: np.ndarray) -> np.ndarray:
+         y = (y - self.y_min) / (self.y_max - self.y_min)
+         return y
+
+
+     def undo_normalize_y(self, y: np.ndarray) -> np.ndarray:
+         y = y * (self.y_max - self.y_min) + self.y_min
+         return y
+
+
+     def determine_regression_mirror(self) -> None:
+         self.regression_mirror = np.random.choice([True, False], size=(1,)).item()
+
+
+     def apply_random_mirror_regression(self, y: np.ndarray) -> np.ndarray:
+         if self.regression_mirror:
+             y = 1 - y
+         return y
+
+
+     def determine_mirror(self, x: np.ndarray) -> None:
+
+         n_features = x.shape[1]
+         self.mirror = np.random.choice([1, -1], size=(1, n_features))
+
+
+     def apply_random_mirror_x(self, x: np.ndarray) -> np.ndarray:
+
+         x = x * self.mirror
+         return x
+
+
+     def determine_shuffle_class_order(self) -> None:
+
+         if self.shuffle_classes:
+             self.new_shuffle_classes = np.random.permutation(self.n_classes)
+         else:
+             self.new_shuffle_classes = np.arange(self.n_classes)
+
+
+     def randomize_class_order(self, y: np.ndarray) -> np.ndarray:
+
+         mapping = { i: self.new_shuffle_classes[i] for i in range(self.n_classes) }
+         y = np.array([mapping[i.item()] for i in y], dtype=np.int64)
+
+         return y
+
+
+     def undo_randomize_class_order(self, y_logits: np.ndarray) -> np.ndarray:
+         """
+         We assume y_logits has shape [n_samples, n_classes]
+         """
+
+         # mapping = {self.new_shuffle_classes[i]: i for i in range(self.n_classes)}
+         mapping = {i: self.new_shuffle_classes[i] for i in range(self.n_classes)}
+         y = np.concatenate([y_logits[:, mapping[i]:mapping[i]+1] for i in range(self.n_classes)], axis=1)
+
+         return y
+
+
+     def extract_correct_classes(self, y_logits: np.ndarray) -> np.ndarray:
+         # Even though our network might be able to support 10 classes,
+         # if the problem only has three classes, we should give three classes as output.
+         # We assume y_logits has shape [n_samples, n_classes]
+         y_logits = y_logits[:, :self.n_classes]
+         return y_logits
+
+
+
+     def determine_feature_order(self, x: np.ndarray) -> None:
+
+         n_features = x.shape[1]
+         self.new_feature_order = np.random.permutation(n_features)
+
+
+
+     def randomize_feature_order(self, x: np.ndarray) -> np.ndarray:
+
+         x = x[:, self.new_feature_order]
+
+         return x
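
A minimal sketch (not part of the diff) of the Preprocessor's fit/transform flow on a toy classification problem, with the randomized augmentation switches turned off; the argument values below are illustrative, not the defaults MitraModel itself uses.

import numpy as np
from autogluon.tabular.models.mitra._internal.config.enums import Task
from autogluon.tabular.models.mitra._internal.data.preprocessor import Preprocessor

x = np.random.randn(200, 6).astype(np.float32)
y = np.random.randint(0, 3, size=200)

prep = Preprocessor(
    dim_embedding=None,             # None: keep all features, no SelectKBest cap
    n_classes=3,                    # classes actually present in this dataset
    dim_output=10,                  # max classes the network supports (illustrative)
    use_quantile_transformer=True,
    use_feature_count_scaling=False,
    use_random_transforms=False,
    shuffle_classes=False,
    shuffle_features=False,
    random_mirror_regression=False,
    random_mirror_x=False,
    task=Task.CLASSIFICATION,
)
prep.fit(x, y)
x_model = prep.transform_X(x)       # imputed, quantile-transformed, standardized features
y_model = prep.transform_y(y)       # int64 labels, order unchanged since shuffling is off
print(x_model.shape, x_model.dtype, y_model.dtype)  # (200, 6) float32 int64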
autogluon/tabular/models/mitra/_internal/models/__init__.py
@@ -0,0 +1 @@
+ # Model architecture modules for MitraModel
autogluon/tabular/models/mitra/_internal/models/base.py
@@ -0,0 +1,21 @@
+ import torch
+ import torch.nn as nn
+ from abc import ABC, abstractmethod
+
+ class BaseModel(nn.Module, ABC):
+
+     def __init__(self):
+         super().__init__()
+
+     def init_weights(self):
+         """Initialize model weights."""
+         pass
+
+     @abstractmethod
+     def forward(self,
+                 x_support: torch.Tensor,
+                 y_support: torch.Tensor,
+                 x_query: torch.Tensor,
+                 **kwargs):
+         """Forward pass for the model."""
+         pass
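
A minimal sketch (not part of the diff) of the forward() contract BaseModel imposes on Mitra's architectures: support features/labels plus query features go in, per-query predictions come out. The subclass below is a toy stand-in, not the real Tab2D model shipped in this release.

import torch
import torch.nn as nn
from autogluon.tabular.models.mitra._internal.models.base import BaseModel

class TinyInContextModel(BaseModel):
    def __init__(self, n_features: int, n_classes: int):
        super().__init__()
        self.head = nn.Linear(n_features, n_classes)

    def forward(self, x_support: torch.Tensor, y_support: torch.Tensor, x_query: torch.Tensor, **kwargs):
        # A real in-context model would condition on (x_support, y_support); this toy head ignores them.
        return self.head(x_query)

model = TinyInContextModel(n_features=4, n_classes=3)
logits = model(torch.randn(8, 4), torch.zeros(8, dtype=torch.long), torch.randn(2, 4))
print(logits.shape)  # torch.Size([2, 3])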