autogluon.tabular 1.3.2b20250709__py3-none-any.whl → 1.3.2b20250711__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. autogluon/tabular/models/__init__.py +3 -0
  2. autogluon/tabular/models/catboost/callbacks.py +3 -2
  3. autogluon/tabular/models/catboost/catboost_model.py +2 -2
  4. autogluon/tabular/models/catboost/catboost_utils.py +7 -3
  5. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +3 -3
  6. autogluon/tabular/models/lgb/lgb_model.py +2 -2
  7. autogluon/tabular/models/realmlp/__init__.py +0 -0
  8. autogluon/tabular/models/realmlp/realmlp_model.py +347 -0
  9. autogluon/tabular/models/rf/rf_model.py +2 -1
  10. autogluon/tabular/models/tabicl/__init__.py +0 -0
  11. autogluon/tabular/models/tabicl/tabicl_model.py +174 -0
  12. autogluon/tabular/models/tabm/__init__.py +0 -0
  13. autogluon/tabular/models/tabm/_tabm_internal.py +544 -0
  14. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +807 -0
  15. autogluon/tabular/models/tabm/tabm_model.py +275 -0
  16. autogluon/tabular/models/tabm/tabm_reference.py +627 -0
  17. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +3 -3
  18. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +3 -3
  19. autogluon/tabular/models/xgboost/xgboost_model.py +2 -2
  20. autogluon/tabular/predictor/predictor.py +5 -3
  21. autogluon/tabular/registry/_ag_model_registry.py +6 -0
  22. autogluon/tabular/testing/fit_helper.py +27 -25
  23. autogluon/tabular/testing/generate_datasets.py +7 -0
  24. autogluon/tabular/trainer/abstract_trainer.py +1 -1
  25. autogluon/tabular/trainer/model_presets/presets.py +10 -1
  26. autogluon/tabular/version.py +1 -1
  27. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/METADATA +21 -13
  28. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/RECORD +35 -26
  29. /autogluon.tabular-1.3.2b20250709-py3.9-nspkg.pth → /autogluon.tabular-1.3.2b20250711-py3.9-nspkg.pth +0 -0
  30. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/LICENSE +0 -0
  31. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/NOTICE +0 -0
  32. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/WHEEL +0 -0
  33. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/namespace_packages.txt +0 -0
  34. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/top_level.txt +0 -0
  35. {autogluon.tabular-1.3.2b20250709.dist-info → autogluon.tabular-1.3.2b20250711.dist-info}/zip-safe +0 -0
autogluon/tabular/models/tabm/rtdl_num_embeddings.py
@@ -0,0 +1,807 @@
+ # taken from https://github.com/yandex-research/rtdl-num-embeddings/blob/main/package/rtdl_num_embeddings.py
+ """On Embeddings for Numerical Features in Tabular Deep Learning."""
+
+ __version__ = '0.0.12'
+
+ __all__ = [
+     'LinearEmbeddings',
+     'LinearReLUEmbeddings',
+     'PeriodicEmbeddings',
+     'PiecewiseLinearEmbeddings',
+     'PiecewiseLinearEncoding',
+     'compute_bins',
+ ]
+
+ import math
+ import warnings
+ from typing import Any, Literal, Optional, Union
+
+ try:
+     import sklearn.tree as sklearn_tree
+ except ImportError:
+     sklearn_tree = None
+
+ import torch
+ import torch.nn as nn
+ from torch import Tensor
+ from torch.nn.parameter import Parameter
+
+ try:
+     from tqdm import tqdm
+ except ImportError:
+     tqdm = None
+
+
+ def _check_input_shape(x: Tensor, expected_n_features: int) -> None:
+     if x.ndim < 1:
+         raise ValueError(
+             f'The input must have at least one dimension, however: {x.ndim=}'
+         )
+     if x.shape[-1] != expected_n_features:
+         raise ValueError(
+             'The last dimension of the input was expected to be'
+             f' {expected_n_features}, however, {x.shape[-1]=}'
+         )
+
+
+ class LinearEmbeddings(nn.Module):
+     """Linear embeddings for continuous features.
+
+     **Shape**
+
+     - Input: `(*, n_features)`
+     - Output: `(*, n_features, d_embedding)`
+
+     **Examples**
+
+     >>> batch_size = 2
+     >>> n_cont_features = 3
+     >>> x = torch.randn(batch_size, n_cont_features)
+     >>> d_embedding = 4
+     >>> m = LinearEmbeddings(n_cont_features, d_embedding)
+     >>> m.get_output_shape()
+     torch.Size([3, 4])
+     >>> m(x).shape
+     torch.Size([2, 3, 4])
+     """
+
+     def __init__(self, n_features: int, d_embedding: int) -> None:
+         """
+         Args:
+             n_features: the number of continuous features.
+             d_embedding: the embedding size.
+         """
+         if n_features <= 0:
+             raise ValueError(f'n_features must be positive, however: {n_features=}')
+         if d_embedding <= 0:
+             raise ValueError(f'd_embedding must be positive, however: {d_embedding=}')
+
+         super().__init__()
+         self.weight = Parameter(torch.empty(n_features, d_embedding))
+         self.bias = Parameter(torch.empty(n_features, d_embedding))
+         self.reset_parameters()
+
+     def reset_parameters(self) -> None:
+         d_rsqrt = self.weight.shape[1] ** -0.5
+         nn.init.uniform_(self.weight, -d_rsqrt, d_rsqrt)
+         nn.init.uniform_(self.bias, -d_rsqrt, d_rsqrt)
+
+     def get_output_shape(self) -> torch.Size:
+         """Get the output shape without the batch dimensions."""
+         return self.weight.shape
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         _check_input_shape(x, self.weight.shape[0])
+         return torch.addcmul(self.bias, self.weight, x[..., None])
+
+
+ class LinearReLUEmbeddings(nn.Module):
+     """Simple non-linear embeddings for continuous features.
+
+     **Shape**
+
+     - Input: `(*, n_features)`
+     - Output: `(*, n_features, d_embedding)`
+
+     **Examples**
+
+     >>> batch_size = 2
+     >>> n_cont_features = 3
+     >>> x = torch.randn(batch_size, n_cont_features)
+     >>>
+     >>> d_embedding = 32
+     >>> m = LinearReLUEmbeddings(n_cont_features, d_embedding)
+     >>> m.get_output_shape()
+     torch.Size([3, 32])
+     >>> m(x).shape
+     torch.Size([2, 3, 32])
+     """
+
+     def __init__(self, n_features: int, d_embedding: int = 32) -> None:
+         """
+         Args:
+             n_features: the number of continuous features.
+             d_embedding: the embedding size.
+         """
+         super().__init__()
+         self.linear = LinearEmbeddings(n_features, d_embedding)
+         self.activation = nn.ReLU()
+
+     def get_output_shape(self) -> torch.Size:
+         """Get the output shape without the batch dimensions."""
+         return self.linear.weight.shape
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         x = self.linear(x)
+         x = self.activation(x)
+         return x
+
+
+ class _Periodic(nn.Module):
+     """
+     NOTE: THIS MODULE SHOULD NOT BE USED DIRECTLY.
+
+     Technically, this is a linear embedding without bias followed by
+     the periodic activations. The scale of the initialization
+     (defined by the `sigma` argument) plays an important role.
+     """
+
+     def __init__(self, n_features: int, k: int, sigma: float) -> None:
+         if sigma <= 0.0:
+             raise ValueError(f'sigma must be positive, however: {sigma=}')
+
+         super().__init__()
+         self._sigma = sigma
+         self.weight = Parameter(torch.empty(n_features, k))
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         """Reset the parameters."""
+         # NOTE[DIFF]
+         # Here, extreme values (~0.3% probability) are explicitly avoided just in case.
+         # In the paper, there was no protection from extreme values.
+         bound = self._sigma * 3
+         nn.init.trunc_normal_(self.weight, 0.0, self._sigma, a=-bound, b=bound)
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         _check_input_shape(x, self.weight.shape[0])
+         x = 2 * math.pi * self.weight * x[..., None]
+         x = torch.cat([torch.cos(x), torch.sin(x)], -1)
+         return x
+
+
+ # _NLinear is a simplified copy of delu.nn.NLinear:
+ # https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html
+ class _NLinear(nn.Module):
+     """N *separate* linear layers for N feature embeddings.
+
+     In other words,
+     each feature embedding is transformed by its own dedicated linear layer.
+     """
+
+     def __init__(
+         self, n: int, in_features: int, out_features: int, bias: bool = True
+     ) -> None:
+         super().__init__()
+         self.weight = Parameter(torch.empty(n, in_features, out_features))
+         self.bias = Parameter(torch.empty(n, out_features)) if bias else None
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         """Reset the parameters."""
+         d_in_rsqrt = self.weight.shape[-2] ** -0.5
+         nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt)
+         if self.bias is not None:
+             nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Do the forward pass."""
+         if x.ndim != 3:
+             raise ValueError(
+                 '_NLinear supports only inputs with exactly one batch dimension,'
+                 ' so `x` must have a shape like (BATCH_SIZE, N_FEATURES, D_EMBEDDING).'
+             )
+         assert x.shape[-(self.weight.ndim - 1) :] == self.weight.shape[:-1]
+
+         x = x.transpose(0, 1)
+         x = x @ self.weight
+         x = x.transpose(0, 1)
+         if self.bias is not None:
+             x = x + self.bias
+         return x
+
+
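For intuition, the transpose/matmul sequence in `_NLinear.forward` applies one independent linear map per feature embedding. A minimal sketch of that equivalence (illustrative only, not part of the packaged file; shapes are chosen arbitrarily):

    import torch

    n, d_in, d_out, batch = 3, 4, 5, 2
    weight = torch.randn(n, d_in, d_out)   # one (d_in -> d_out) matrix per feature
    bias = torch.randn(n, d_out)
    x = torch.randn(batch, n, d_in)

    # What _NLinear.forward computes, written as an explicit per-feature loop:
    out_loop = torch.stack([x[:, i] @ weight[i] + bias[i] for i in range(n)], dim=1)
    # The same result as a single batched contraction:
    out_einsum = torch.einsum('bnd,ndo->bno', x, weight) + bias
    assert torch.allclose(out_loop, out_einsum, atol=1e-6)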
+ class PeriodicEmbeddings(nn.Module):
+     """Embeddings for continuous features based on periodic activations.
+
+     See README for details.
+
+     **Shape**
+
+     - Input: `(*, n_features)`
+     - Output: `(*, n_features, d_embedding)`
+
+     **Examples**
+
+     >>> batch_size = 2
+     >>> n_cont_features = 3
+     >>> x = torch.randn(batch_size, n_cont_features)
+     >>>
+     >>> d_embedding = 24
+     >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=False)
+     >>> m.get_output_shape()
+     torch.Size([3, 24])
+     >>> m(x).shape
+     torch.Size([2, 3, 24])
+     >>>
+     >>> m = PeriodicEmbeddings(n_cont_features, d_embedding, lite=True)
+     >>> m.get_output_shape()
+     torch.Size([3, 24])
+     >>> m(x).shape
+     torch.Size([2, 3, 24])
+     >>>
+     >>> # PL embeddings.
+     >>> m = PeriodicEmbeddings(n_cont_features, d_embedding=8, activation=False, lite=False)
+     >>> m.get_output_shape()
+     torch.Size([3, 8])
+     >>> m(x).shape
+     torch.Size([2, 3, 8])
+     """  # noqa: E501
+
+     def __init__(
+         self,
+         n_features: int,
+         d_embedding: int = 24,
+         *,
+         n_frequencies: int = 48,
+         frequency_init_scale: float = 0.01,
+         activation: bool = True,
+         lite: bool,
+     ) -> None:
+         """
+         Args:
+             n_features: the number of features.
+             d_embedding: the embedding size.
+             n_frequencies: the number of frequencies for each feature
+                 (denoted as "k" in Section 3.3 in the paper).
+             frequency_init_scale: the initialization scale for the first linear layer
+                 (denoted as "sigma" in Section 3.3 in the paper).
+                 **This is an important hyperparameter**, see README for details.
+             activation: if `False`, the ReLU activation is not applied.
+                 Must be `True` if ``lite=True``.
+             lite: if True, the outer linear layer is shared between all features.
+                 See README for details.
+         """
+         super().__init__()
+         self.periodic = _Periodic(n_features, n_frequencies, frequency_init_scale)
+         self.linear: Union[nn.Linear, _NLinear]
+         if lite:
+             # NOTE[DIFF]
+             # The lite variation was introduced in a different paper
+             # (about the TabR model).
+             if not activation:
+                 raise ValueError('lite=True is allowed only when activation=True')
+             self.linear = nn.Linear(2 * n_frequencies, d_embedding)
+         else:
+             self.linear = _NLinear(n_features, 2 * n_frequencies, d_embedding)
+         self.activation = nn.ReLU() if activation else None
+
+     def get_output_shape(self) -> torch.Size:
+         """Get the output shape without the batch dimensions."""
+         n_features = self.periodic.weight.shape[0]
+         d_embedding = (
+             self.linear.weight.shape[0]
+             if isinstance(self.linear, nn.Linear)
+             else self.linear.weight.shape[-1]
+         )
+         return torch.Size((n_features, d_embedding))
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         x = self.periodic(x)
+         x = self.linear(x)
+         if self.activation is not None:
+             x = self.activation(x)
+         return x
+
+
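The embedding modules above all return a three-dimensional tensor of shape `(batch_size, n_features, d_embedding)`, which a downstream backbone typically flattens. A minimal sketch of wiring `PeriodicEmbeddings` into a small MLP (illustrative only, not part of the packaged file; layer sizes are arbitrary):

    import torch
    import torch.nn as nn

    n_features, d_embedding = 3, 24
    model = nn.Sequential(
        PeriodicEmbeddings(n_features, d_embedding, lite=False),  # (batch, 3, 24)
        nn.Flatten(),                                             # (batch, 72)
        nn.Linear(n_features * d_embedding, 64),
        nn.ReLU(),
        nn.Linear(64, 1),                                         # e.g. one regression target
    )
    x = torch.randn(8, n_features)
    print(model(x).shape)  # torch.Size([8, 1])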
+ def _check_bins(bins: list[Tensor]) -> None:
+     if not bins:
+         raise ValueError('The list of bins must not be empty')
+     for i, feature_bins in enumerate(bins):
+         if not isinstance(feature_bins, Tensor):
+             raise ValueError(
+                 'bins must be a list of PyTorch tensors. '
+                 f'However, for {i=}: {type(bins[i])=}'
+             )
+         if feature_bins.ndim != 1:
+             raise ValueError(
+                 'Each item of the bin list must have exactly one dimension.'
+                 f' However, for {i=}: {bins[i].ndim=}'
+             )
+         if len(feature_bins) < 2:
+             raise ValueError(
+                 'All features must have at least two bin edges.'
+                 f' However, for {i=}: {len(bins[i])=}'
+             )
+         if not feature_bins.isfinite().all():
+             raise ValueError(
+                 'Bin edges must not contain nan/inf/-inf.'
+                 f' However, this is not true for the {i}-th feature'
+             )
+         if (feature_bins[:-1] >= feature_bins[1:]).any():
+             raise ValueError(
+                 'Bin edges must be sorted.'
+                 f' However, for the {i}-th feature, the bin edges are not sorted'
+             )
+         # Commented out due to spamming warnings.
+         # if len(feature_bins) == 2:
+         #     warnings.warn(
+         #         f'The {i}-th feature has just two bin edges, which means only one bin.'
+         #         ' Strictly speaking, using a single bin for the'
+         #         ' piecewise-linear encoding should not break anything,'
+         #         ' but it is the same as using sklearn.preprocessing.MinMaxScaler'
+         #     )
+
+
+ def compute_bins(
+     X: torch.Tensor,
+     n_bins: int = 48,
+     *,
+     tree_kwargs: Optional[dict[str, Any]] = None,
+     y: Optional[Tensor] = None,
+     regression: Optional[bool] = None,
+     verbose: bool = False,
+ ) -> list[Tensor]:
+     """Compute the bin boundaries for `PiecewiseLinearEncoding` and `PiecewiseLinearEmbeddings`.
+
+     **Usage**
+
+     Compute bins using quantiles (Section 3.2.1 in the paper):
+
+     >>> X_train = torch.randn(10000, 2)
+     >>> bins = compute_bins(X_train)
+
+     Compute bins using decision trees (Section 3.2.2 in the paper):
+
+     >>> X_train = torch.randn(10000, 2)
+     >>> y_train = torch.randn(len(X_train))
+     >>> bins = compute_bins(
+     ...     X_train,
+     ...     y=y_train,
+     ...     regression=True,
+     ...     tree_kwargs={'min_samples_leaf': 64, 'min_impurity_decrease': 1e-4},
+     ... )
+
+     Args:
+         X: the training features.
+         n_bins: the number of bins.
+         tree_kwargs: keyword arguments for `sklearn.tree.DecisionTreeRegressor`
+             (if ``regression=True``) or `sklearn.tree.DecisionTreeClassifier`
+             (if ``regression=False``).
+             NOTE: requires ``scikit-learn>=1.0,>2`` to be installed.
+         y: the training labels (must be provided if ``tree_kwargs`` is not None).
+         regression: whether the labels are regression labels
+             (must be provided if ``tree_kwargs`` is not None).
+         verbose: if True and ``tree_kwargs`` is not None, then ``tqdm``
+             (must be installed) will report the progress while fitting trees.
+
+     Returns:
+         A list of bin edges for all features. For one feature:
+
+         - the maximum possible number of bin edges is ``n_bins + 1``.
+         - the minimum possible number of bin edges is ``2``.
+     """  # noqa: E501
+     if not isinstance(X, Tensor):
+         raise ValueError(f'X must be a PyTorch tensor, however: {type(X)=}')
+     if X.ndim != 2:
+         raise ValueError(f'X must have exactly two dimensions, however: {X.ndim=}')
+     if X.shape[0] < 2:
+         raise ValueError(f'X must have at least two rows, however: {X.shape[0]=}')
+     if X.shape[1] < 1:
+         raise ValueError(f'X must have at least one column, however: {X.shape[1]=}')
+     if not X.isfinite().all():
+         raise ValueError('X must not contain nan/inf/-inf.')
+     if (X == X[0]).all(dim=0).any():
+         raise ValueError(
+             'All columns of X must have at least two distinct values.'
+             ' However, X contains columns with just one distinct value.'
+         )
+     if n_bins <= 1 or n_bins >= len(X):
+         raise ValueError(
+             'n_bins must be more than 1, but less than len(X), however:'
+             f' {n_bins=}, {len(X)=}'
+         )
+
+     if tree_kwargs is None:
+         if y is not None or regression is not None or verbose:
+             raise ValueError(
+                 'If tree_kwargs is None, then y must be None, regression must be None'
+                 ' and verbose must be False'
+             )
+
+         _upper = 2**24  # 16_777_216
+         if len(X) > _upper:
+             warnings.warn(
+                 f'Computing quantile-based bins for more than {_upper} objects'
+                 ' may not be possible due to the limitation of PyTorch'
+                 ' (for details, see https://github.com/pytorch/pytorch/issues/64947;'
+                 ' if that issue is successfully resolved, this warning may be irrelevant).'  # noqa
+                 ' As a workaround, subsample the data, i.e. instead of'
+                 '\ncompute_bins(X, ...)'
+                 '\ndo'
+                 '\ncompute_bins(X[torch.randperm(len(X), device=X.device)[:16_777_216]], ...)'  # noqa
+                 '\nOn CUDA, the computation can still fail with OOM even after'
+                 ' subsampling. If this is the case, try passing features by groups:'
+                 '\nbins = sum('
+                 '\n    compute_bins(X[:, idx], ...)'
+                 '\n    for idx in torch.arange(X.shape[1], device=X.device).split(group_size),'  # noqa
+                 '\n    start=[]'
+                 '\n)'
+                 '\nAnother option is to perform the computation on CPU:'
+                 '\ncompute_bins(X.cpu(), ...)'
+             )
+         del _upper
+
+         # NOTE[DIFF]
+         # The code below is more correct than the original implementation,
+         # because the original implementation contains an unintentional divergence
+         # from what is written in the paper. That divergence affected only the
+         # quantile-based embeddings, but not the tree-based embeddings.
+         # For historical reference, here is the original, less correct, implementation:
+         # https://github.com/yandex-research/tabular-dl-num-embeddings/blob/c1d9eb63c0685b51d7e1bc081cdce6ffdb8886a8/bin/train4.py#L612C30-L612C30
+         # (explanation: limiting the number of quantiles by the number of distinct
+         # values is NOT the same as removing identical quantiles after computing them).
+         bins = [
+             q.unique()
+             for q in torch.quantile(
+                 X, torch.linspace(0.0, 1.0, n_bins + 1).to(X), dim=0
+             ).T
+         ]
+         _check_bins(bins)
+         return bins
+
+     else:
+         if sklearn_tree is None:
+             raise RuntimeError(
+                 'The scikit-learn package is missing.'
+                 ' See README.md for installation instructions'
+             )
+         if y is None or regression is None:
+             raise ValueError(
+                 'If tree_kwargs is not None, then y and regression must not be None'
+             )
+         if y.ndim != 1:
+             raise ValueError(f'y must have exactly one dimension, however: {y.ndim=}')
+         if len(y) != len(X):
+             raise ValueError(
+                 f'len(y) must be equal to len(X), however: {len(y)=}, {len(X)=}'
+             )
+         if 'max_leaf_nodes' in tree_kwargs:
+             raise ValueError(
+                 'tree_kwargs must not contain the key "max_leaf_nodes"'
+                 ' (it will be set to n_bins automatically).'
+             )
+
+         if verbose:
+             if tqdm is None:
+                 raise ImportError('If verbose is True, tqdm must be installed')
+             tqdm_ = tqdm
+         else:
+             tqdm_ = lambda x: x  # noqa: E731
+
+         if X.device.type != 'cpu' or y.device.type != 'cpu':
+             warnings.warn(
+                 'Computing tree-based bins involves the conversion of the input PyTorch'
+                 ' tensors to NumPy arrays. The provided PyTorch tensors are not'
+                 ' located on CPU, so the conversion has some overhead.',
+                 UserWarning,
+             )
+         X_numpy = X.cpu().numpy()
+         y_numpy = y.cpu().numpy()
+         bins = []
+         for column in tqdm_(X_numpy.T):
+             feature_bin_edges = [float(column.min()), float(column.max())]
+             tree = (
+                 (
+                     sklearn_tree.DecisionTreeRegressor
+                     if regression
+                     else sklearn_tree.DecisionTreeClassifier
+                 )(max_leaf_nodes=n_bins, **tree_kwargs)
+                 .fit(column.reshape(-1, 1), y_numpy)
+                 .tree_
+             )
+             for node_id in range(tree.node_count):
+                 # The following condition is True only for split nodes. Source:
+                 # https://scikit-learn.org/1.0/auto_examples/tree/plot_unveil_tree_structure.html#tree-structure
+                 if tree.children_left[node_id] != tree.children_right[node_id]:
+                     feature_bin_edges.append(float(tree.threshold[node_id]))
+             bins.append(torch.as_tensor(feature_bin_edges).unique())
+         _check_bins(bins)
+         return [x.to(device=X.device, dtype=X.dtype) for x in bins]
+
+
+ class _PiecewiseLinearEncodingImpl(nn.Module):
+     """Piecewise-linear encoding.
+
+     NOTE: THIS CLASS SHOULD NOT BE USED DIRECTLY.
+     In particular, this class does *not* add any positional information
+     to feature encodings. Thus, for Transformer-like models,
+     `PiecewiseLinearEmbeddings` is the only valid option.
+
+     Note:
+         This is the *encoding* module, not the *embedding* module,
+         so it only implements Equation 1 (Figure 1) from the paper,
+         and does not have trainable parameters.
+
+     **Shape**
+
+     * Input: ``(*, n_features)``
+     * Output: ``(*, n_features, max_n_bins)``,
+       where ``max_n_bins`` is the maximum number of bins over all features:
+       ``max_n_bins = max(len(b) - 1 for b in bins)``.
+
+     To understand the output structure,
+     consider a feature with the number of bins ``n_bins``.
+     Formally, its piecewise-linear encoding is a vector of the size ``n_bins``
+     that looks as follows::
+
+         x_ple = [1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0]
+
+     However, this class will instead produce a vector of the size ``max_n_bins``::
+
+         x_ple_actual = [*x_ple[:-1], *zeros(max_n_bins - n_bins), x_ple[-1]]
+
+     In other words:
+
+     * The last encoding component is **always** located in the end,
+       even if ``n_bins == 1`` (i.e. even if it is the only component).
+     * The leading ``n_bins - 1`` components are located in the beginning.
+     * Everything in-between is always set to zeros (like "padding", but in the middle).
+
+     This implementation is *significantly* faster than the original one.
+     It relies on two key observations:
+
+     * The piecewise-linear encoding is just
+       a non-trainable linear transformation followed by a clamp-based activation.
+       Pseudocode: `PiecewiseLinearEncoding(x) = Activation(Linear(x))`.
+       The parameters of the linear transformation are defined by the bin edges.
+     * Aligning the *last* encoding channel across all features
+       allows applying the aforementioned activation simultaneously to all features
+       without the loop over features.
+     """
+
+     weight: Tensor
+     """The weight of the linear transformation mentioned in the class docstring."""
+
+     bias: Tensor
+     """The bias of the linear transformation mentioned in the class docstring."""
+
+     single_bin_mask: Optional[Tensor]
+     """The indicators of the features with only one bin."""
+
+     mask: Optional[Tensor]
+     """The indicators of the "valid" (i.e. "non-padding") part of the encoding."""
+
+     def __init__(self, bins: list[Tensor]) -> None:
+         """
+         Args:
+             bins: the bins computed by `compute_bins`.
+         """
+         assert len(bins) > 0
+         super().__init__()
+
+         n_features = len(bins)
+         n_bins = [len(x) - 1 for x in bins]
+         max_n_bins = max(n_bins)
+
+         self.register_buffer('weight', torch.zeros(n_features, max_n_bins))
+         self.register_buffer('bias', torch.zeros(n_features, max_n_bins))
+
+         single_bin_mask = torch.tensor(n_bins) == 1
+         self.register_buffer(
+             'single_bin_mask', single_bin_mask if single_bin_mask.any() else None
+         )
+
+         self.register_buffer(
+             'mask',
+             # The mask is needed if features have different number of bins.
+             None
+             if all(len(x) == len(bins[0]) for x in bins)
+             else torch.row_stack(
+                 [
+                     torch.cat(
+                         [
+                             # The number of bins for this feature, minus 1:
+                             torch.ones((len(x) - 1) - 1, dtype=torch.bool),
+                             # Unused components (always zeros):
+                             torch.zeros(max_n_bins - (len(x) - 1), dtype=torch.bool),
+                             # The last bin:
+                             torch.ones(1, dtype=torch.bool),
+                         ]
+                     )
+                     # x is a tensor containing the bin bounds for a given feature.
+                     for x in bins
+                 ]
+             ),
+         )
+
+         for i, bin_edges in enumerate(bins):
+             # Formally, the piecewise-linear encoding of one feature looks as follows:
+             # `[1, ..., 1, (x - this_bin_left_edge) / this_bin_width, 0, ..., 0]`
+             # The linear transformation based on the weight and bias defined below
+             # implements the expression in the middle before the clipping to [0, 1].
+             # Note that the actual encoding layout produced by this class
+             # is slightly different. See the docstring of this class for details.
+             bin_width = bin_edges.diff()
+             w = 1.0 / bin_width
+             b = -bin_edges[:-1] / bin_width
+             # The last encoding component:
+             self.weight[i, -1] = w[-1]
+             self.bias[i, -1] = b[-1]
+             # The leading encoding components:
+             self.weight[i, : n_bins[i] - 1] = w[:-1]
+             self.bias[i, : n_bins[i] - 1] = b[:-1]
+             # All in-between components will always be zeros,
+             # because the weight and bias are initialized with zeros.
+
+     def get_max_n_bins(self) -> int:
+         return self.weight.shape[-1]
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         x = torch.addcmul(self.bias, self.weight, x[..., None])
+         if x.shape[-1] > 1:
+             x = torch.cat(
+                 [
+                     x[..., :1].clamp_max(1.0),
+                     x[..., 1:-1].clamp(0.0, 1.0),
+                     (
+                         x[..., -1:].clamp_min(0.0)
+                         if self.single_bin_mask is None
+                         else torch.where(
+                             # For features with only one bin,
+                             # the whole "piecewise-linear" encoding effectively behaves
+                             # like min-max scaling
+                             # (assuming that the edges of the single bin
+                             # are the minimum and maximum feature values).
+                             self.single_bin_mask[..., None],
+                             x[..., -1:],
+                             x[..., -1:].clamp_min(0.0),
+                         )
+                     ),
+                 ],
+                 dim=-1,
+             )
+         return x
+
+
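To make the layout described in the class docstring concrete, here is a tiny worked example (illustrative only, not part of the packaged file; `_PiecewiseLinearEncodingImpl` is instantiated directly just to show the raw, padded encoding):

    import torch

    # Two hypothetical features: the first with bin edges [0, 1, 3] (two bins),
    # the second with bin edges [-1, 1] (a single bin), so max_n_bins = 2.
    bins = [torch.tensor([0.0, 1.0, 3.0]), torch.tensor([-1.0, 1.0])]
    enc = _PiecewiseLinearEncodingImpl(bins)
    x = torch.tensor([[2.0, 0.0]])
    # Feature 0: 2.0 is halfway through its second bin [1, 3]      -> [1.0, 0.5]
    # Feature 1: one bin, so its only component (0.0 - (-1.0)) / 2.0 = 0.5
    #            is stored in the aligned last position            -> [0.0, 0.5]
    print(enc(x))  # tensor([[[1.0, 0.5], [0.0, 0.5]]])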
+ class PiecewiseLinearEncoding(nn.Module):
+     """Piecewise-linear encoding.
+
+     See README for detailed explanation.
+
+     **Shape**
+
+     - Input: ``(*, n_features)``
+     - Output: ``(*, total_n_bins)``,
+       where ``total_n_bins`` is the total number of bins for all features:
+       ``total_n_bins = sum(len(b) - 1 for b in bins)``.
+
+     Technically, the output of this module is the flattened output
+     of `_PiecewiseLinearEncodingImpl` with all "padding" values removed.
+     """
+
+     def __init__(self, bins: list[Tensor]) -> None:
+         """
+         Args:
+             bins: the bins computed by `compute_bins`.
+         """
+         super().__init__()
+         self.impl = _PiecewiseLinearEncodingImpl(bins)
+
+     def get_output_shape(self) -> torch.Size:
+         """Get the output shape without the batch dimensions."""
+         total_n_bins = (
+             self.impl.weight.shape.numel()
+             if self.impl.mask is None
+             else int(self.impl.mask.long().sum().cpu().item())
+         )
+         return torch.Size((total_n_bins,))
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         x = self.impl(x)
+         return x.flatten(-2) if self.impl.mask is None else x[:, self.impl.mask]
+
+
+ class PiecewiseLinearEmbeddings(nn.Module):
+     """Piecewise-linear embeddings.
+
+     **Shape**
+
+     - Input: ``(batch_size, n_features)``
+     - Output: ``(batch_size, n_features, d_embedding)``
+     """
+
+     def __init__(
+         self,
+         bins: list[Tensor],
+         d_embedding: int,
+         *,
+         activation: bool,
+         version: Literal[None, 'A', 'B'] = None,
+     ) -> None:
+         """
+         Args:
+             bins: the bins computed by `compute_bins`.
+             d_embedding: the embedding size.
+             activation: if True, the ReLU activation is additionally applied at the end.
+             version: the preset for various implementation details, such as
+                 parametrization and initialization. See README for details.
+         """
+         if d_embedding <= 0:
+             raise ValueError(
+                 f'd_embedding must be a positive integer, however: {d_embedding=}'
+             )
+         _check_bins(bins)
+         if version is None:
+             warnings.warn(
+                 'The `version` argument is not provided, so version="A" will be used'
+                 ' for backward compatibility.'
+                 ' See README for recommendations regarding `version`.'
+                 ' In future, omitting this argument will result in an exception.'
+             )
+             version = 'A'
+
+         super().__init__()
+         n_features = len(bins)
+         # NOTE[DIFF]
+         # version="B" was introduced in a different paper (about the TabM model).
+         is_version_B = version == 'B'
+
+         self.linear0 = (
+             LinearEmbeddings(n_features, d_embedding) if is_version_B else None
+         )
+         self.impl = _PiecewiseLinearEncodingImpl(bins)
+         self.linear = _NLinear(
+             len(bins),
+             self.impl.get_max_n_bins(),
+             d_embedding,
+             # For the version "B", the bias is already present in self.linear0.
+             bias=not is_version_B,
+         )
+         if is_version_B:
+             # Because of the following line, at initialization,
+             # the whole embedding behaves like a linear embedding.
+             # The piecewise-linear component is incrementally learnt during training.
+             nn.init.zeros_(self.linear.weight)
+         self.activation = nn.ReLU() if activation else None
+
+     def get_output_shape(self) -> torch.Size:
+         """Get the output shape without the batch dimensions."""
+         n_features = self.linear.weight.shape[0]
+         d_embedding = self.linear.weight.shape[2]
+         return torch.Size((n_features, d_embedding))
+
+     def forward(self, x: Tensor) -> Tensor:
+         """Do the forward pass."""
+         if x.ndim != 2:
+             raise ValueError(
+                 'For now, only inputs with exactly one batch dimension are supported.'
+             )
+
+         x_linear = None if self.linear0 is None else self.linear0(x)
+
+         x_ple = self.impl(x)
+         x_ple = self.linear(x_ple)
+         if self.activation is not None:
+             x_ple = self.activation(x_ple)
+         return x_ple if x_linear is None else x_linear + x_ple
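Putting the pieces together, the intended flow is: compute bins on the training features, build the embedding module, and feed the flattened embeddings to a backbone. A minimal end-to-end sketch (illustrative only; the actual integration with TabM lives in the tabm_model.py and _tabm_internal.py files added in this same release):

    import torch
    import torch.nn as nn

    X_train = torch.randn(1000, 5)                 # hypothetical training features
    bins = compute_bins(X_train, n_bins=16)        # quantile-based bins
    emb = PiecewiseLinearEmbeddings(bins, d_embedding=16, activation=False, version='B')

    n_features, d_embedding = emb.get_output_shape()
    model = nn.Sequential(
        emb,                                       # (batch, 5, 16)
        nn.Flatten(),                              # (batch, 80)
        nn.Linear(n_features * d_embedding, 64),
        nn.ReLU(),
        nn.Linear(64, 1),
    )
    print(model(X_train[:8]).shape)                # torch.Size([8, 1])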