autogluon.tabular 1.3.2b20250709__py3-none-any.whl → 1.3.2b20250711__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/tabular/models/__init__.py +3 -0
- autogluon/tabular/models/catboost/callbacks.py +3 -2
- autogluon/tabular/models/catboost/catboost_model.py +2 -2
- autogluon/tabular/models/catboost/catboost_utils.py +7 -3
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +3 -3
- autogluon/tabular/models/lgb/lgb_model.py +2 -2
- autogluon/tabular/models/realmlp/__init__.py +0 -0
- autogluon/tabular/models/realmlp/realmlp_model.py +347 -0
- autogluon/tabular/models/rf/rf_model.py +2 -1
- autogluon/tabular/models/tabicl/__init__.py +0 -0
- autogluon/tabular/models/tabicl/tabicl_model.py +174 -0
- autogluon/tabular/models/tabm/__init__.py +0 -0
- autogluon/tabular/models/tabm/_tabm_internal.py +544 -0
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +807 -0
- autogluon/tabular/models/tabm/tabm_model.py +275 -0
- autogluon/tabular/models/tabm/tabm_reference.py +627 -0
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +3 -3
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +3 -3
- autogluon/tabular/models/xgboost/xgboost_model.py +2 -2
- autogluon/tabular/predictor/predictor.py +5 -3
- autogluon/tabular/registry/_ag_model_registry.py +6 -0
- autogluon/tabular/testing/fit_helper.py +27 -25
- autogluon/tabular/testing/generate_datasets.py +7 -0
- autogluon/tabular/trainer/abstract_trainer.py +1 -1
- autogluon/tabular/trainer/model_presets/presets.py +10 -1
- autogluon/tabular/version.py +1 -1
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/METADATA +21 -13
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/RECORD +35 -26
- /autogluon.tabular-1.3.2b20250709-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250711-py3.9-nspkg.pth +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/LICENSE +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/NOTICE +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/WHEEL +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/zip-safe +0 -0
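Of the changes above, the most substantial is the addition of three new model families (RealMLP, TabICL, TabM) plus the TabM support code, together with accompanying changes to `_ag_model_registry.py`, `presets.py`, and `predictor.py`. As a rough, hypothetical sketch of how the newly added models might be requested through the existing `TabularPredictor` API (the `"REALMLP"`, `"TABICL"`, and `"TABM"` hyperparameter keys are assumptions inferred from AutoGluon's usual registry naming, not something this diff confirms):

    # Hypothetical usage sketch; the hyperparameter keys below are assumptions
    # inferred from AutoGluon's registry naming convention, not confirmed by this diff.
    from autogluon.tabular import TabularDataset, TabularPredictor

    train_data = TabularDataset("train.csv")  # any labeled tabular dataset
    predictor = TabularPredictor(label="target").fit(
        train_data,
        hyperparameters={
            "REALMLP": {},  # autogluon/tabular/models/realmlp/realmlp_model.py
            "TABICL": {},   # autogluon/tabular/models/tabicl/tabicl_model.py
            "TABM": {},     # autogluon/tabular/models/tabm/tabm_model.py
        },
    )
    print(predictor.leaderboard(train_data))

The hunk below is the largest single addition in this diff: the numerical-embeddings module vendored for TabM.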
autogluon/tabular/models/tabm/rtdl_num_embeddings.py (new file, +807 lines)

@@ -0,0 +1,807 @@
# taken from https://github.com/yandex-research/rtdl-num-embeddings/blob/main/package/rtdl_num_embeddings.py
"""On Embeddings for Numerical Features in Tabular Deep Learning."""

__version__ = '0.0.12'

__all__ = [
    'LinearEmbeddings',
    'LinearReLUEmbeddings',
    'PeriodicEmbeddings',
    'PiecewiseLinearEmbeddings',
    'PiecewiseLinearEncoding',
    'compute_bins',
]

import math
import warnings
from typing import Any, Literal, Optional, Union

try:
    import sklearn.tree as sklearn_tree
except ImportError:
    sklearn_tree = None

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.parameter import Parameter

try:
    from tqdm import tqdm
except ImportError:
    tqdm = None


def _check_input_shape(x: Tensor, expected_n_features: int) -> None:
    if x.ndim < 1:
        raise ValueError(
            f'The input must have at least one dimension, however: {x.ndim=}'
        )
    if x.shape[-1] != expected_n_features:
        raise ValueError(
            'The last dimension of the input was expected to be'
            f' {expected_n_features}, however, {x.shape[-1]=}'
        )


class LinearEmbeddings(nn.Module):
    """Linear embeddings for continuous features.

    **Shape**

    - Input: `(*, n_features)`
    - Output: `(*, n_features, d_embedding)`

    **Examples**

    >>> batch_size = 2
    >>> n_cont_features = 3
    >>> x = torch.randn(batch_size, n_cont_features)
    >>> d_embedding = 4
    >>> m = LinearEmbeddings(n_cont_features, d_embedding)
    >>> m.get_output_shape()
    torch.Size([3, 4])
    >>> m(x).shape
    torch.Size([2, 3, 4])
    """

    def __init__(self, n_features: int, d_embedding: int) -> None:
        """
        Args:
            n_features: the number of continuous features.
            d_embedding: the embedding size.
        """
        if n_features <= 0:
            raise ValueError(f'n_features must be positive, however: {n_features=}')
        if d_embedding <= 0:
            raise ValueError(f'd_embedding must be positive, however: {d_embedding=}')

        super().__init__()
        self.weight = Parameter(torch.empty(n_features, d_embedding))
        self.bias = Parameter(torch.empty(n_features, d_embedding))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        d_rqsrt = self.weight.shape[1] ** -0.5
        nn.init.uniform_(self.weight, -d_rqsrt, d_rqsrt)
        nn.init.uniform_(self.bias, -d_rqsrt, d_rqsrt)

    def get_output_shape(self) -> torch.Size:
        """Get the output shape without the batch dimensions."""
        return self.weight.shape

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        _check_input_shape(x, self.weight.shape[0])
        return torch.addcmul(self.bias, self.weight, x[..., None])


class LinearReLUEmbeddings(nn.Module):
    """Simple non-linear embeddings for continuous features.

    **Shape**

    - Input: `(*, n_features)`
    - Output: `(*, n_features, d_embedding)`

    **Examples**

    >>> batch_size = 2
    >>> n_cont_features = 3
    >>> x = torch.randn(batch_size, n_cont_features)
    >>>
    >>> d_embedding = 32
    >>> m = LinearReLUEmbeddings(n_cont_features, d_embedding)
    >>> m.get_output_shape()
    torch.Size([3, 32])
    >>> m(x).shape
    torch.Size([2, 3, 32])
    """

    def __init__(self, n_features: int, d_embedding: int = 32) -> None:
        """
        Args:
            n_features: the number of continuous features.
            d_embedding: the embedding size.
        """
        super().__init__()
        self.linear = LinearEmbeddings(n_features, d_embedding)
        self.activation = nn.ReLU()

    def get_output_shape(self) -> torch.Size:
        """Get the output shape without the batch dimensions."""
        return self.linear.weight.shape

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = self.linear(x)
        x = self.activation(x)
        return x


class _Periodic(nn.Module):
    """
    NOTE: THIS MODULE SHOULD NOT BE USED DIRECTLY.

    Technically, this is a linear embedding without bias followed by
    the periodic activations. The scale of the initialization
    (defined by the `sigma` argument) plays an important role.
    """

    def __init__(self, n_features: int, k: int, sigma: float) -> None:
        if sigma <= 0.0:
            raise ValueError(f'sigma must be positive, however: {sigma=}')

        super().__init__()
        self._sigma = sigma
        self.weight = Parameter(torch.empty(n_features, k))
        self.reset_parameters()

    def reset_parameters(self):
        """Reset the parameters."""
        # NOTE[DIFF]
        # Here, extreme values (~0.3% probability) are explicitly avoided just in case.
        # In the paper, there was no protection from extreme values.
        bound = self._sigma * 3
        nn.init.trunc_normal_(self.weight, 0.0, self._sigma, a=-bound, b=bound)

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        _check_input_shape(x, self.weight.shape[0])
        x = 2 * math.pi * self.weight * x[..., None]
        x = torch.cat([torch.cos(x), torch.sin(x)], -1)
        return x


# _NLinear is a simplified copy of delu.nn.NLinear:
# https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html
class _NLinear(nn.Module):
    """N *separate* linear layers for N feature embeddings.

    In other words,
    each feature embedding is transformed by its own dedicated linear layer.
    """

    def __init__(
        self, n: int, in_features: int, out_features: int, bias: bool = True
    ) -> None:
        super().__init__()
        self.weight = Parameter(torch.empty(n, in_features, out_features))
        self.bias = Parameter(torch.empty(n, out_features)) if bias else None
        self.reset_parameters()

    def reset_parameters(self):
        """Reset the parameters."""
        d_in_rsqrt = self.weight.shape[-2] ** -0.5
        nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Do the forward pass."""
        if x.ndim != 3:
            raise ValueError(
                '_NLinear supports only inputs with exactly one batch dimension,'
                ' so `x` must have a shape like (BATCH_SIZE, N_FEATURES, D_EMBEDDING).'
            )
        assert x.shape[-(self.weight.ndim - 1) :] == self.weight.shape[:-1]

        x = x.transpose(0, 1)
        x = x @ self.weight
        x = x.transpose(0, 1)
        if self.bias is not None:
            x = x + self.bias
        return x


class PeriodicEmbeddings(nn.Module):
    """Embeddings for continuous features based on periodic activations.

    See README for details.

    **Shape**

    - Input: `(*, n_features)`
    - Output: `(*, n_features, d_embedding)`

    **Examples**

    >>> batch_size = 2
    >>> n_cont_features = 3
    >>> x = torch.randn(batch_size, n_cont_features)
    >>>
    >>> d_embedding = 24
    >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=False)
    >>> m.get_output_shape()
    torch.Size([3, 24])
    >>> m(x).shape
    torch.Size([2, 3, 24])
    >>>
    >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=True)
    >>> m.get_output_shape()
    torch.Size([3, 24])
    >>> m(x).shape
    torch.Size([2, 3, 24])
    >>>
    >>> # PL embeddings.
    >>> m = PeriodicEmbeddings(n_cont_features, d_embedding=8, activation=False, lite=False)
    >>> m.get_output_shape()
    torch.Size([3, 8])
    >>> m(x).shape
    torch.Size([2, 3, 8])
    """  # noqa: E501

    def __init__(
        self,
        n_features: int,
        d_embedding: int = 24,
        *,
        n_frequencies: int = 48,
        frequency_init_scale: float = 0.01,
        activation: bool = True,
        lite: bool,
    ) -> None:
        """
        Args:
            n_features: the number of features.
            d_embedding: the embedding size.
            n_frequencies: the number of frequencies for each feature.
                (denoted as "k" in Section 3.3 in the paper).
            frequency_init_scale: the initialization scale for the first linear layer
                (denoted as "sigma" in Section 3.3 in the paper).
                **This is an important hyperparameter**, see README for details.
            activation: if `False`, the ReLU activation is not applied.
                Must be `True` if ``lite=True``.
            lite: if True, the outer linear layer is shared between all features.
                See README for details.
        """
        super().__init__()
        self.periodic = _Periodic(n_features, n_frequencies, frequency_init_scale)
        self.linear: Union[nn.Linear, _NLinear]
        if lite:
            # NOTE[DIFF]
            # The lite variation was introduced in a different paper
            # (about the TabR model).
            if not activation:
                raise ValueError('lite=True is allowed only when activation=True')
            self.linear = nn.Linear(2 * n_frequencies, d_embedding)
        else:
            self.linear = _NLinear(n_features, 2 * n_frequencies, d_embedding)
        self.activation = nn.ReLU() if activation else None

    def get_output_shape(self) -> torch.Size:
        """Get the output shape without the batch dimensions."""
        n_features = self.periodic.weight.shape[0]
        d_embedding = (
            self.linear.weight.shape[0]
            if isinstance(self.linear, nn.Linear)
            else self.linear.weight.shape[-1]
        )
        return torch.Size((n_features, d_embedding))

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = self.periodic(x)
        x = self.linear(x)
        if self.activation is not None:
            x = self.activation(x)
        return x


def _check_bins(bins: list[Tensor]) -> None:
    if not bins:
        raise ValueError('The list of bins must not be empty')
    for i, feature_bins in enumerate(bins):
        if not isinstance(feature_bins, Tensor):
            raise ValueError(
                'bins must be a list of PyTorch tensors. '
                f'However, for {i=}: {type(bins[i])=}'
            )
        if feature_bins.ndim != 1:
            raise ValueError(
                'Each item of the bin list must have exactly one dimension.'
                f' However, for {i=}: {bins[i].ndim=}'
            )
        if len(feature_bins) < 2:
            raise ValueError(
                'All features must have at least two bin edges.'
                f' However, for {i=}: {len(bins[i])=}'
            )
        if not feature_bins.isfinite().all():
            raise ValueError(
                'Bin edges must not contain nan/inf/-inf.'
                f' However, this is not true for the {i}-th feature'
            )
        if (feature_bins[:-1] >= feature_bins[1:]).any():
            raise ValueError(
                'Bin edges must be sorted.'
                f' However, for the {i}-th feature, the bin edges are not sorted'
            )
        # Commented out due to spamming warnings.
        # if len(feature_bins) == 2:
        #     warnings.warn(
        #         f'The {i}-th feature has just two bin edges, which means only one bin.'
        #         ' Strictly speaking, using a single bin for the'
        #         ' piecewise-linear encoding should not break anything,'
        #         ' but it is the same as using sklearn.preprocessing.MinMaxScaler'
        #     )


def compute_bins(
    X: torch.Tensor,
    n_bins: int = 48,
    *,
    tree_kwargs: Optional[dict[str, Any]] = None,
    y: Optional[Tensor] = None,
    regression: Optional[bool] = None,
    verbose: bool = False,
) -> list[Tensor]:
    """Compute the bin boundaries for `PiecewiseLinearEncoding` and `PiecewiseLinearEmbeddings`.

    **Usage**

    Compute bins using quantiles (Section 3.2.1 in the paper):

    >>> X_train = torch.randn(10000, 2)
    >>> bins = compute_bins(X_train)

    Compute bins using decision trees (Section 3.2.2 in the paper):

    >>> X_train = torch.randn(10000, 2)
    >>> y_train = torch.randn(len(X_train))
    >>> bins = compute_bins(
    ...     X_train,
    ...     y=y_train,
    ...     regression=True,
    ...     tree_kwargs={'min_samples_leaf': 64, 'min_impurity_decrease': 1e-4},
    ... )

    Args:
        X: the training features.
        n_bins: the number of bins.
        tree_kwargs: keyword arguments for `sklearn.tree.DecisionTreeRegressor`
            (if ``regression=True``) or `sklearn.tree.DecisionTreeClassifier`
            (if ``regression=False``).
            NOTE: requires ``scikit-learn>=1.0,>2`` to be installed.
        y: the training labels (must be provided if ``tree`` is not None).
        regression: whether the labels are regression labels
            (must be provided if ``tree`` is not None).
        verbose: if True and ``tree_kwargs`` is not None, then ``tqdm``
            (must be installed) will report the progress while fitting trees.

    Returns:
        A list of bin edges for all features. For one feature:

        - the maximum possible number of bin edges is ``n_bins + 1``.
        - the minimum possible number of bin edges is ``1``.
    """  # noqa: E501
    if not isinstance(X, Tensor):
        raise ValueError(f'X must be a PyTorch tensor, however: {type(X)=}')
    if X.ndim != 2:
        raise ValueError(f'X must have exactly two dimensions, however: {X.ndim=}')
    if X.shape[0] < 2:
        raise ValueError(f'X must have at least two rows, however: {X.shape[0]=}')
    if X.shape[1] < 1:
        raise ValueError(f'X must have at least one column, however: {X.shape[1]=}')
    if not X.isfinite().all():
        raise ValueError('X must not contain nan/inf/-inf.')
    if (X == X[0]).all(dim=0).any():
        raise ValueError(
            'All columns of X must have at least two distinct values.'
            ' However, X contains columns with just one distinct value.'
        )
    if n_bins <= 1 or n_bins >= len(X):
        raise ValueError(
            'n_bins must be more than 1, but less than len(X), however:'
            f' {n_bins=}, {len(X)=}'
        )

    if tree_kwargs is None:
        if y is not None or regression is not None or verbose:
            raise ValueError(
                'If tree_kwargs is None, then y must be None, regression must be None'
                ' and verbose must be False'
            )

        _upper = 2**24  # 16_777_216
        if len(X) > _upper:
            warnings.warn(
                f'Computing quantile-based bins for more than {_upper} objects'
                ' may not be possible due to the limitation of PyTorch'
                ' (for details, see https://github.com/pytorch/pytorch/issues/64947;'
                ' if that issue is successfully resolved, this warning may be irrelevant).'  # noqa
                ' As a workaround, subsample the data, i.e. instead of'
                '\ncompute_bins(X, ...)'
                '\ndo'
                '\ncompute_bins(X[torch.randperm(len(X), device=X.device)[:16_777_216]], ...)'  # noqa
                '\nOn CUDA, the computation can still fail with OOM even after'
                ' subsampling. If this is the case, try passing features by groups:'
                '\nbins = sum('
                '\n    compute_bins(X[:, idx], ...)'
                '\n    for idx in torch.arange(len(X), device=X.device).split(group_size),'  # noqa
                '\n    start=[]'
                '\n)'
                '\nAnother option is to perform the computation on CPU:'
                '\ncompute_bins(X.cpu(), ...)'
            )
        del _upper

        # NOTE[DIFF]
        # The code below is more correct than the original implementation,
        # because the original implementation contains an unintentional divergence
        # from what is written in the paper. That divergence affected only the
        # quantile-based embeddings, but not the tree-based embeddings.
        # For historical reference, here is the original, less correct, implementation:
        # https://github.com/yandex-research/tabular-dl-num-embeddings/blob/c1d9eb63c0685b51d7e1bc081cdce6ffdb8886a8/bin/train4.py#L612C30-L612C30
        # (explanation: limiting the number of quantiles by the number of distinct
        # values is NOT the same as removing identical quantiles after computing them).
        bins = [
            q.unique()
            for q in torch.quantile(
                X, torch.linspace(0.0, 1.0, n_bins + 1).to(X), dim=0
            ).T
        ]
        _check_bins(bins)
        return bins

    else:
        if sklearn_tree is None:
            raise RuntimeError(
                'The scikit-learn package is missing.'
                ' See README.md for installation instructions'
            )
        if y is None or regression is None:
            raise ValueError(
                'If tree_kwargs is not None, then y and regression must not be None'
            )
        if y.ndim != 1:
            raise ValueError(f'y must have exactly one dimension, however: {y.ndim=}')
        if len(y) != len(X):
            raise ValueError(
                f'len(y) must be equal to len(X), however: {len(y)=}, {len(X)=}'
            )
        if y is None or regression is None:
            raise ValueError(
                'If tree_kwargs is not None, then y and regression must not be None'
            )
        if 'max_leaf_nodes' in tree_kwargs:
            raise ValueError(
                'tree_kwargs must not contain the key "max_leaf_nodes"'
                ' (it will be set to n_bins automatically).'
            )

        if verbose:
            if tqdm is None:
                raise ImportError('If verbose is True, tqdm must be installed')
            tqdm_ = tqdm
        else:
            tqdm_ = lambda x: x  # noqa: E731

        if X.device.type != 'cpu' or y.device.type != 'cpu':
            warnings.warn(
                'Computing tree-based bins involves the conversion of the input PyTorch'
                ' tensors to NumPy arrays. The provided PyTorch tensors are not'
                ' located on CPU, so the conversion has some overhead.',
                UserWarning,
            )
        X_numpy = X.cpu().numpy()
        y_numpy = y.cpu().numpy()
        bins = []
        for column in tqdm_(X_numpy.T):
            feature_bin_edges = [float(column.min()), float(column.max())]
            tree = (
                (
                    sklearn_tree.DecisionTreeRegressor
                    if regression
                    else sklearn_tree.DecisionTreeClassifier
                )(max_leaf_nodes=n_bins, **tree_kwargs)
                .fit(column.reshape(-1, 1), y_numpy)
                .tree_
            )
            for node_id in range(tree.node_count):
                # The following condition is True only for split nodes. Source:
                # https://scikit-learn.org/1.0/auto_examples/tree/plot_unveil_tree_structure.html#tree-structure
                if tree.children_left[node_id] != tree.children_right[node_id]:
                    feature_bin_edges.append(float(tree.threshold[node_id]))
            bins.append(torch.as_tensor(feature_bin_edges).unique())
        _check_bins(bins)
        return [x.to(device=X.device, dtype=X.dtype) for x in bins]


class _PiecewiseLinearEncodingImpl(nn.Module):
    """Piecewise-linear encoding.

    NOTE: THIS CLASS SHOULD NOT BE USED DIRECTLY.
    In particular, this class does *not* add any positional information
    to feature encodings. Thus, for Transformer-like models,
    `PiecewiseLinearEmbeddings` is the only valid option.

    Note:
        This is the *encoding* module, not the *embedding* module,
        so it only implements Equation 1 (Figure 1) from the paper,
        and does not have trainable parameters.

    **Shape**

    * Input: ``(*, n_features)``
    * Output: ``(*, n_features, max_n_bins)``,
      where ``max_n_bins`` is the maximum number of bins over all features:
      ``max_n_bins = max(len(b) - 1 for b in bins)``.

    To understand the output structure,
    consider a feature with the number of bins ``n_bins``.
    Formally, its piecewise-linear encoding is a vector of the size ``n_bins``
    that looks as follows::

        x_ple = [1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0]

    However, this class will instead produce a vector of the size ``max_n_bins``::

        x_ple_actual = [*x_ple[:-1], *zeros(max_n_bins - n_bins), x_ple[-1]]

    In other words:

    * The last encoding component is **always** located in the end,
      even if ``n_bins == 1`` (i.e. even if it is the only component).
    * The leading ``n_bins - 1`` components are located in the beginning.
    * Everything in-between is always set to zeros (like "padding", but in the middle).

    This implementation is *significantly* faster than the original one.
    It relies on two key observations:

    * The piecewise-linear encoding is just
      a non-trainable linear transformation followed by a clamp-based activation.
      Pseudocode: `PiecewiseLinearEncoding(x) = Activation(Linear(x))`.
      The parameters of the linear transformation are defined by the bin edges.
    * Aligning the *last* encoding channel across all features
      allows applying the aforementioned activation simultaneously to all features
      without the loop over features.
    """

    weight: Tensor
    """The weight of the linear transformation mentioned in the class docstring."""

    bias: Tensor
    """The bias of the linear transformation mentioned in the class docstring."""

    single_bin_mask: Optional[Tensor]
    """The indicators of the features with only one bin."""

    mask: Optional[Tensor]
    """The indicators of the "valid" (i.e. "non-padding") part of the encoding."""

    def __init__(self, bins: list[Tensor]) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
        """
        assert len(bins) > 0
        super().__init__()

        n_features = len(bins)
        n_bins = [len(x) - 1 for x in bins]
        max_n_bins = max(n_bins)

        self.register_buffer('weight', torch.zeros(n_features, max_n_bins))
        self.register_buffer('bias', torch.zeros(n_features, max_n_bins))

        single_bin_mask = torch.tensor(n_bins) == 1
        self.register_buffer(
            'single_bin_mask', single_bin_mask if single_bin_mask.any() else None
        )

        self.register_buffer(
            'mask',
            # The mask is needed if features have different number of bins.
            None
            if all(len(x) == len(bins[0]) for x in bins)
            else torch.row_stack(
                [
                    torch.cat(
                        [
                            # The number of bins for this feature, minus 1:
                            torch.ones((len(x) - 1) - 1, dtype=torch.bool),
                            # Unused components (always zeros):
                            torch.zeros(max_n_bins - (len(x) - 1), dtype=torch.bool),
                            # The last bin:
                            torch.ones(1, dtype=torch.bool),
                        ]
                    )
                    # x is a tensor containing the bin bounds for a given feature.
                    for x in bins
                ]
            ),
        )

        for i, bin_edges in enumerate(bins):
            # Formally, the piecewise-linear encoding of one feature looks as follows:
            # `[1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0]`
            # The linear transformation based on the weight and bias defined below
            # implements the expression in the middle before the clipping to [0, 1].
            # Note that the actual encoding layout produced by this class
            # is slightly different. See the docstring of this class for details.
            bin_width = bin_edges.diff()
            w = 1.0 / bin_width
            b = -bin_edges[:-1] / bin_width
            # The last encoding component:
            self.weight[i, -1] = w[-1]
            self.bias[i, -1] = b[-1]
            # The leading encoding components:
            self.weight[i, : n_bins[i] - 1] = w[:-1]
            self.bias[i, : n_bins[i] - 1] = b[:-1]
            # All in-between components will always be zeros,
            # because the weight and bias are initialized with zeros.

    def get_max_n_bins(self) -> int:
        return self.weight.shape[-1]

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = torch.addcmul(self.bias, self.weight, x[..., None])
        if x.shape[-1] > 1:
            x = torch.cat(
                [
                    x[..., :1].clamp_max(1.0),
                    x[..., 1:-1].clamp(0.0, 1.0),
                    (
                        x[..., -1:].clamp_min(0.0)
                        if self.single_bin_mask is None
                        else torch.where(
                            # For features with only one bin,
                            # the whole "piecewise-linear" encoding effectively behaves
                            # like min-max scaling
                            # (assuming that the edges of the single bin
                            # are the minimum and maximum feature values).
                            self.single_bin_mask[..., None],
                            x[..., -1:],
                            x[..., -1:].clamp_min(0.0),
                        )
                    ),
                ],
                dim=-1,
            )
        return x


class PiecewiseLinearEncoding(nn.Module):
    """Piecewise-linear encoding.

    See README for detailed explanation.

    **Shape**

    - Input: ``(*, n_features)``
    - Output: ``(*, total_n_bins)``,
      where ``total_n_bins`` is the total number of bins for all features:
      ``total_n_bins = sum(len(b) - 1 for b in bins)``.

    Technically, the output of this module is the flattened output
    of `_PiecewiseLinearEncoding` with all "padding" values removed.
    """

    def __init__(self, bins: list[Tensor]) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
        """
        super().__init__()
        self.impl = _PiecewiseLinearEncodingImpl(bins)

    def get_output_shape(self) -> torch.Size:
        """Get the output shape without the batch dimensions."""
        total_n_bins = (
            self.impl.weight.shape.numel()
            if self.impl.mask is None
            else int(self.impl.mask.long().sum().cpu().item())
        )
        return torch.Size((total_n_bins,))

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        x = self.impl(x)
        return x.flatten(-2) if self.impl.mask is None else x[:, self.impl.mask]


class PiecewiseLinearEmbeddings(nn.Module):
    """Piecewise-linear embeddings.

    **Shape**

    - Input: ``(batch_size, n_features)``
    - Output: ``(batch_size, n_features, d_embedding)``
    """

    def __init__(
        self,
        bins: list[Tensor],
        d_embedding: int,
        *,
        activation: bool,
        version: Literal[None, 'A', 'B'] = None,
    ) -> None:
        """
        Args:
            bins: the bins computed by `compute_bins`.
            d_embedding: the embedding size.
            activation: if True, the ReLU activation is additionally applied in the end.
            version: the preset for various implementation details, such as
                parametrization and initialization. See README for details.
        """
        if d_embedding <= 0:
            raise ValueError(
                f'd_embedding must be a positive integer, however: {d_embedding=}'
            )
        _check_bins(bins)
        if version is None:
            warnings.warn(
                'The `version` argument is not provided, so version="A" will be used'
                ' for backward compatibility.'
                ' See README for recommendations regarding `version`.'
                ' In future, omitting this argument will result in an exception.'
            )
            version = 'A'

        super().__init__()
        n_features = len(bins)
        # NOTE[DIFF]
        # version="B" was introduced in a different paper (about the TabM model).
        is_version_B = version == 'B'

        self.linear0 = (
            LinearEmbeddings(n_features, d_embedding) if is_version_B else None
        )
        self.impl = _PiecewiseLinearEncodingImpl(bins)
        self.linear = _NLinear(
            len(bins),
            self.impl.get_max_n_bins(),
            d_embedding,
            # For the version "B", the bias is already present in self.linear0.
            bias=not is_version_B,
        )
        if is_version_B:
            # Because of the following line, at initialization,
            # the whole embedding behaves like a linear embedding.
            # The piecewise-linear component is incrementally learnt during training.
            nn.init.zeros_(self.linear.weight)
        self.activation = nn.ReLU() if activation else None

    def get_output_shape(self) -> torch.Size:
        """Get the output shape without the batch dimensions."""
        n_features = self.linear.weight.shape[0]
        d_embedding = self.linear.weight.shape[2]
        return torch.Size((n_features, d_embedding))

    def forward(self, x: Tensor) -> Tensor:
        """Do the forward pass."""
        if x.ndim != 2:
            raise ValueError(
                'For now, only inputs with exactly one batch dimension are supported.'
            )

        x_linear = None if self.linear0 is None else self.linear0(x)

        x_ple = self.impl(x)
        x_ple = self.linear(x_ple)
        if self.activation is not None:
            x_ple = self.activation(x_ple)
        return x_ple if x_linear is None else x_linear + x_ple
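For orientation, here is a minimal end-to-end sketch of how the vendored pieces above compose: quantile bins from `compute_bins` feed `PiecewiseLinearEmbeddings`, whose per-feature embeddings can then be flattened into an ordinary linear head. The example is not part of the wheel; it only combines the docstring examples shown above, and the import path follows the file's location in this package (the TabM model code, e.g. `tabm_model.py`, is what actually wires this up inside AutoGluon).

    # Minimal sketch: quantile bins -> piecewise-linear embeddings -> a tiny head.
    # Not shipped in the wheel; it only exercises the vendored module shown above.
    import torch
    import torch.nn as nn

    from autogluon.tabular.models.tabm.rtdl_num_embeddings import (
        PiecewiseLinearEmbeddings,
        compute_bins,
    )

    n_objects, n_cont_features, d_embedding = 256, 5, 16
    X_train = torch.randn(n_objects, n_cont_features)

    bins = compute_bins(X_train, n_bins=8)  # quantile-based bins (Section 3.2.1)
    embeddings = PiecewiseLinearEmbeddings(
        bins,
        d_embedding,
        activation=False,
        version='B',  # the TabM-style preset, per the NOTE[DIFF] comment above
    )
    head = nn.Sequential(
        nn.Flatten(-2),  # (batch, n_features, d_embedding) -> (batch, n_features * d_embedding)
        nn.Linear(n_cont_features * d_embedding, 1),
    )
    y_pred = head(embeddings(X_train))  # shape: (256, 1)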