tabstar 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. tabstar-0.1.0/PKG-INFO +161 -0
  2. tabstar-0.1.0/README.md +145 -0
  3. tabstar-0.1.0/pyproject.toml +26 -0
  4. tabstar-0.1.0/setup.cfg +4 -0
  5. tabstar-0.1.0/src/tabstar/__init__.py +1 -0
  6. tabstar-0.1.0/src/tabstar/arch/__init__.py +0 -0
  7. tabstar-0.1.0/src/tabstar/arch/arch.py +69 -0
  8. tabstar-0.1.0/src/tabstar/arch/config.py +37 -0
  9. tabstar-0.1.0/src/tabstar/arch/fusion.py +41 -0
  10. tabstar-0.1.0/src/tabstar/arch/interaction.py +25 -0
  11. tabstar-0.1.0/src/tabstar/arch/prediction.py +19 -0
  12. tabstar-0.1.0/src/tabstar/constants.py +1 -0
  13. tabstar-0.1.0/src/tabstar/preprocessing/__init__.py +0 -0
  14. tabstar-0.1.0/src/tabstar/preprocessing/binning.py +58 -0
  15. tabstar-0.1.0/src/tabstar/preprocessing/dates.py +44 -0
  16. tabstar-0.1.0/src/tabstar/preprocessing/detection.py +35 -0
  17. tabstar-0.1.0/src/tabstar/preprocessing/feat_types.py +45 -0
  18. tabstar-0.1.0/src/tabstar/preprocessing/nulls.py +39 -0
  19. tabstar-0.1.0/src/tabstar/preprocessing/scaler.py +24 -0
  20. tabstar-0.1.0/src/tabstar/preprocessing/sparse.py +18 -0
  21. tabstar-0.1.0/src/tabstar/preprocessing/splits.py +32 -0
  22. tabstar-0.1.0/src/tabstar/preprocessing/target.py +33 -0
  23. tabstar-0.1.0/src/tabstar/preprocessing/texts.py +30 -0
  24. tabstar-0.1.0/src/tabstar/preprocessing/verbalize.py +31 -0
  25. tabstar-0.1.0/src/tabstar/tabstar_model.py +106 -0
  26. tabstar-0.1.0/src/tabstar/tabstar_verbalizer.py +102 -0
  27. tabstar-0.1.0/src/tabstar/training/__init__.py +0 -0
  28. tabstar-0.1.0/src/tabstar/training/dataloader.py +46 -0
  29. tabstar-0.1.0/src/tabstar/training/devices.py +22 -0
  30. tabstar-0.1.0/src/tabstar/training/early_stopping.py +23 -0
  31. tabstar-0.1.0/src/tabstar/training/lora.py +34 -0
  32. tabstar-0.1.0/src/tabstar/training/metrics.py +49 -0
  33. tabstar-0.1.0/src/tabstar/training/optimizer.py +18 -0
  34. tabstar-0.1.0/src/tabstar/training/trainer.py +145 -0
  35. tabstar-0.1.0/src/tabstar/training/utils.py +8 -0
  36. tabstar-0.1.0/src/tabstar.egg-info/PKG-INFO +161 -0
  37. tabstar-0.1.0/src/tabstar.egg-info/SOURCES.txt +44 -0
  38. tabstar-0.1.0/src/tabstar.egg-info/dependency_links.txt +1 -0
  39. tabstar-0.1.0/src/tabstar.egg-info/requires.txt +8 -0
  40. tabstar-0.1.0/src/tabstar.egg-info/top_level.txt +1 -0
  41. tabstar-0.1.0/test/test_curation.py +11 -0
  42. tabstar-0.1.0/test/test_datetimes.py +49 -0
  43. tabstar-0.1.0/test/test_feature_types.py +8 -0
  44. tabstar-0.1.0/test/test_layers_unfreeze.py +8 -0
  45. tabstar-0.1.0/test/test_null_detections.py +17 -0
  46. tabstar-0.1.0/test/test_numerical_verbalization.py +8 -0
tabstar-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,161 @@
1
+ Metadata-Version: 2.4
2
+ Name: tabstar
3
+ Version: 0.1.0
4
+ Summary: TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations
5
+ Author-email: Alan Arazi <alanarazi7@gmail.com>
6
+ License: MIT
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas>=2.2.2
10
+ Requires-Dist: peft
11
+ Requires-Dist: scikit-learn
12
+ Requires-Dist: skrub
13
+ Requires-Dist: torch>=2.6.0
14
+ Requires-Dist: tqdm
15
+ Requires-Dist: transformers>=4.49.0
16
+
17
+ <img src="src/tabstar/resources/tabstar_logo.png" alt="TabSTAR Logo" width="50%">
18
+
19
+ **Welcome to the TabSTAR repository! 👋**
20
+ You can use it in two modes: production mode for fitting TabSTAR on your own dataset, and research mode to pretrain TabSTAR and replicate our work in the paper.
21
+
22
+ 🚧 The repository is under construction: Any bugs or feature requests? Please open an issue! 🚧
23
+
24
+ ---
25
+
26
+ ### 📚 Resources
27
+
28
+ * **Paper**: [TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations](https://arxiv.org/abs/2505.18125)
29
+ * **Project Website**: [TabSTAR](https://eilamshapira.com/TabSTAR/)
30
+
31
+ <img src="src/tabstar/resources/tabstar_arch.png" alt="TabSTAR Logo" width="200%">
32
+
33
+ ---
34
+
35
+ ## Production Mode
36
+
37
+ Use this mode if you want to fit a pretrained TabSTAR model to your own dataset.
38
+ (Note that currently we still don't support reloading that model for later use, but this is coming soon! 🔜)
39
+
40
+ ### Installation
41
+
42
+ ```bash
43
+ source init.sh
44
+ ```
45
+
46
+ ### Inference Example
47
+
48
+ TabSTAR uses the sklearn API, and it is as simple as this:
49
+
50
+ ```python
51
+ import pandas as pd
52
+ from sklearn.metrics import classification_report
53
+ from sklearn.model_selection import train_test_split
54
+
55
+ from tabstar.tabstar_model import TabSTARClassifier
56
+
57
+ x = pd.read_csv("src/tabstar/resources/imdb.csv")
58
+ y = x.pop('Genre_is_Drama')
59
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
60
+ tabstar = TabSTARClassifier()
61
+ tabstar.fit(x_train, y_train)
62
+ y_pred = tabstar.predict(x_test)
63
+ print(classification_report(y_test, y_pred))
64
+ ```
65
+
66
+ Below is a template you can use to quickly get started with TabSTAR in production mode.
67
+
68
+ ```python
69
+ from pandas import DataFrame, Series
70
+ from sklearn.model_selection import train_test_split
71
+
72
+ from tabstar.tabstar_model import TabSTARClassifier, TabSTARRegressor
73
+
74
+ # --- USER-PROVIDED INPUTS ---
75
+ x_train = None # TODO: load your feature DataFrame here
76
+ y_train = None # TODO: load your target Series here
77
+ is_cls = None # TODO: True for classification, False for regression
78
+ x_test = None # TODO Optional: load your test feature DataFrame (or leave as None)
79
+ y_test = None # TODO Optional: load your test target Series (or leave as None)
80
+ # -----------------------------
81
+
82
+ # Sanity checks
83
+ assert isinstance(x_train, DataFrame), "x should be a pandas DataFrame"
84
+ assert isinstance(y_train, Series), "y should be a pandas Series"
85
+ assert isinstance(is_cls, bool), "is_cls should be a boolean indicating classification or regression"
86
+
87
+ if x_test is None:
88
+ assert y_test is None, "If x_test is None, y_test must also be None"
89
+ x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)
90
+
91
+ assert isinstance(x_test, DataFrame), "x_test should be a pandas DataFrame"
92
+ assert isinstance(y_test, Series), "y_test should be a pandas Series"
93
+
94
+ tabstar_cls = TabSTARClassifier if is_cls else TabSTARRegressor
95
+ tabstar = tabstar_cls()
96
+ tabstar.fit(x_train, y_train)
97
+ y_pred = tabstar.predict(x_test)
98
+ ```
99
+
100
+ ---
101
+
102
+ ## Research Mode
103
+
104
+ Use this section when you want to pretrain, finetune, or run baselines on TabSTAR. It assumes you are actively working on model development, experimenting with different datasets, or comparing against other methods.
105
+
106
+ ### Prerequisites
107
+
108
+ After cloning the repo, run:
109
+
110
+ ```bash
111
+ source init.sh
112
+ ```
113
+
114
+ This will install all necessary dependencies, set up your environment, and download any example data needed to get started.
115
+
116
+ ### Pretraining
117
+
118
+ To pretrain TabSTAR on a specified number of datasets:
119
+
120
+ ```bash
121
+ python do_pretrain.py --n_datasets=256
122
+ ```
123
+
124
+ `--n_datasets` determines how many datasets to use for pretraining. You can reduce this number for quick debugging, but note this will harm downstream performance.
125
+
126
+ ### Finetuning
127
+
128
+ Once pretraining finishes, note the printed `<PRETRAINED_EXP>` identifier. Then run:
129
+
130
+ ```bash
131
+ python do_finetune.py --pretrain_exp=<PRETRAINED_EXP> --dataset_id=46655
132
+ ```
133
+
134
+ `--dataset_id` is an ID for the downstream task you want to evaluate yourself on. Only the 400 datasets in the paper are supported.
135
+
136
+ ### Baseline Comparison
137
+
138
+ If you want to compare TabSTAR against a classic baseline (e.g., random forest):
139
+
140
+ ```bash
141
+ python do_baseline.py --model=rf --dataset_id=46655
142
+ ```
143
+
144
+ You can also try other model names supported by `do_baseline.py` (check the script for details).
145
+
146
+ ### License
147
+
148
+ This work is licensed under the [Creative Commons Attribution 4.0 International License (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).
149
+
150
+ ### Citation
151
+
152
+ If you use TabSTAR in your research, please cite:
153
+
154
+ ```bibtex
155
+ @article{arazi2025tabstarf,
156
+ title = {TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations},
157
+ author = {Alan Arazi and Eilam Shapira and Roi Reichart},
158
+ journal = {arXiv preprint arXiv:2505.18125},
159
+ year = {2025},
160
+ }
161
+ ```
@@ -0,0 +1,145 @@
1
+ <img src="src/tabstar/resources/tabstar_logo.png" alt="TabSTAR Logo" width="50%">
2
+
3
+ **Welcome to the TabSTAR repository! 👋**
4
+ You can use it in two modes: production mode for fitting TabSTAR on your own dataset, and research mode to pretrain TabSTAR and replicate our work in the paper.
5
+
6
+ 🚧 The repository is under construction: Any bugs or feature requests? Please open an issue! 🚧
7
+
8
+ ---
9
+
10
+ ### 📚 Resources
11
+
12
+ * **Paper**: [TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations](https://arxiv.org/abs/2505.18125)
13
+ * **Project Website**: [TabSTAR](https://eilamshapira.com/TabSTAR/)
14
+
15
+ <img src="src/tabstar/resources/tabstar_arch.png" alt="TabSTAR Logo" width="200%">
16
+
17
+ ---
18
+
19
+ ## Production Mode
20
+
21
+ Use this mode if you want to fit a pretrained TabSTAR model to your own dataset.
22
+ (Note that currently we still don't support reloading that model for later use, but this is coming soon! 🔜)
23
+
24
+ ### Installation
25
+
26
+ ```bash
27
+ source init.sh
28
+ ```
29
+
30
+ ### Inference Example
31
+
32
+ TabSTAR uses the sklearn API, and it is as simple as this:
33
+
34
+ ```python
35
+ import pandas as pd
36
+ from sklearn.metrics import classification_report
37
+ from sklearn.model_selection import train_test_split
38
+
39
+ from tabstar.tabstar_model import TabSTARClassifier
40
+
41
+ x = pd.read_csv("src/tabstar/resources/imdb.csv")
42
+ y = x.pop('Genre_is_Drama')
43
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
44
+ tabstar = TabSTARClassifier()
45
+ tabstar.fit(x_train, y_train)
46
+ y_pred = tabstar.predict(x_test)
47
+ print(classification_report(y_test, y_pred))
48
+ ```
49
+
50
+ Below is a template you can use to quickly get started with TabSTAR in production mode.
51
+
52
+ ```python
53
+ from pandas import DataFrame, Series
54
+ from sklearn.model_selection import train_test_split
55
+
56
+ from tabstar.tabstar_model import TabSTARClassifier, TabSTARRegressor
57
+
58
+ # --- USER-PROVIDED INPUTS ---
59
+ x_train = None # TODO: load your feature DataFrame here
60
+ y_train = None # TODO: load your target Series here
61
+ is_cls = None # TODO: True for classification, False for regression
62
+ x_test = None # TODO Optional: load your test feature DataFrame (or leave as None)
63
+ y_test = None # TODO Optional: load your test target Series (or leave as None)
64
+ # -----------------------------
65
+
66
+ # Sanity checks
67
+ assert isinstance(x_train, DataFrame), "x should be a pandas DataFrame"
68
+ assert isinstance(y_train, Series), "y should be a pandas Series"
69
+ assert isinstance(is_cls, bool), "is_cls should be a boolean indicating classification or regression"
70
+
71
+ if x_test is None:
72
+ assert y_test is None, "If x_test is None, y_test must also be None"
73
+ x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)
74
+
75
+ assert isinstance(x_test, DataFrame), "x_test should be a pandas DataFrame"
76
+ assert isinstance(y_test, Series), "y_test should be a pandas Series"
77
+
78
+ tabstar_cls = TabSTARClassifier if is_cls else TabSTARRegressor
79
+ tabstar = tabstar_cls()
80
+ tabstar.fit(x_train, y_train)
81
+ y_pred = tabstar.predict(x_test)
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Research Mode
87
+
88
+ Use this section when you want to pretrain, finetune, or run baselines on TabSTAR. It assumes you are actively working on model development, experimenting with different datasets, or comparing against other methods.
89
+
90
+ ### Prerequisites
91
+
92
+ After cloning the repo, run:
93
+
94
+ ```bash
95
+ source init.sh
96
+ ```
97
+
98
+ This will install all necessary dependencies, set up your environment, and download any example data needed to get started.
99
+
100
+ ### Pretraining
101
+
102
+ To pretrain TabSTAR on a specified number of datasets:
103
+
104
+ ```bash
105
+ python do_pretrain.py --n_datasets=256
106
+ ```
107
+
108
+ `--n_datasets` determines how many datasets to use for pretraining. You can reduce this number for quick debugging, but note this will harm downstream performance.
109
+
110
+ ### Finetuning
111
+
112
+ Once pretraining finishes, note the printed `<PRETRAINED_EXP>` identifier. Then run:
113
+
114
+ ```bash
115
+ python do_finetune.py --pretrain_exp=<PRETRAINED_EXP> --dataset_id=46655
116
+ ```
117
+
118
+ `--dataset_id` is an ID for the downstream task you want to evaluate yourself on. Only the 400 datasets in the paper are supported.
119
+
120
+ ### Baseline Comparison
121
+
122
+ If you want to compare TabSTAR against a classic baseline (e.g., random forest):
123
+
124
+ ```bash
125
+ python do_baseline.py --model=rf --dataset_id=46655
126
+ ```
127
+
128
+ You can also try other model names supported by `do_baseline.py` (check the script for details).
129
+
130
+ ### License
131
+
132
+ This work is licensed under the [Creative Commons Attribution 4.0 International License (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).
133
+
134
+ ### Citation
135
+
136
+ If you use TabSTAR in your research, please cite:
137
+
138
+ ```bibtex
139
+ @article{arazi2025tabstarf,
140
+ title = {TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations},
141
+ author = {Alan Arazi and Eilam Shapira and Roi Reichart},
142
+ journal = {arXiv preprint arXiv:2505.18125},
143
+ year = {2025},
144
+ }
145
+ ```
@@ -0,0 +1,26 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tabstar"
7
+ version = "0.1.0"
8
+ description = "TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations"
9
+ readme = "README.md"
10
+ authors = [
11
+ { name = "Alan Arazi", email = "alanarazi7@gmail.com" }
12
+ ]
13
+ license = { text = "MIT" }
14
+ dependencies = [
15
+ "numpy",
16
+ "pandas>=2.2.2",
17
+ "peft",
18
+ "scikit-learn",
19
+ "skrub",
20
+ "torch>=2.6.0",
21
+ "tqdm",
22
+ "transformers>=4.49.0"
23
+ ]
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch import Tensor
4
+ from transformers import AutoTokenizer, AutoModel, PreTrainedModel
5
+
6
+ from tabstar.arch.config import TabStarConfig, E5_SMALL
7
+ from tabstar.arch.interaction import InteractionEncoder
8
+ from tabstar.arch.fusion import NumericalFusion
9
+ from tabstar.arch.prediction import PredictionHead
10
+ from tabstar.training.devices import clear_cuda_cache
11
+
12
+
13
class TabStarModel(PreTrainedModel):
    """TabSTAR model: encodes verbalized tabular features with a pretrained text
    encoder, fuses in raw numerical values, runs a tabular interaction encoder,
    and scores the target tokens with a regression or classification head."""

    config_class = TabStarConfig

    def __init__(self, config: TabStarConfig):
        super().__init__(config)
        self.text_encoder = AutoModel.from_pretrained(E5_SMALL)
        self.tokenizer = AutoTokenizer.from_pretrained(E5_SMALL)
        self.numerical_fusion = NumericalFusion()
        self.tabular_encoder = InteractionEncoder()
        self.cls_head = PredictionHead()
        self.reg_head = PredictionHead()
        self.post_init()

    def forward(self, x_txt: np.ndarray, x_num: np.ndarray, d_output: int) -> Tensor:
        """Return target scores of shape (batch, d_output).

        x_txt: array of verbalized feature texts, shape (batch, seq).
        x_num: matching array of numerical values, shape (batch, seq).
        d_output: number of target tokens; 1 means regression.
        """
        textual_embeddings = self.get_textual_embedding(x_txt)
        embeddings = self.numerical_fusion(textual_embeddings=textual_embeddings, x_num=x_num)
        encoded = self.tabular_encoder(embeddings)
        # The first d_output positions of the sequence are the target tokens.
        target_tokens = encoded[:, :d_output]
        if d_output == 1:
            target_scores = self.reg_head(target_tokens)
        else:
            target_scores = self.cls_head(target_tokens)
        target_scores = target_scores.squeeze(dim=-1)
        assert tuple(target_scores.shape) == (x_txt.shape[0], d_output)
        return target_scores

    def get_textual_embedding(self, x_txt: np.ndarray) -> Tensor:
        """Embed the texts, halving the batch size on CUDA OOM until it fits.

        Bug fix: the loop condition was `while text_batch_size > 1`, so after an
        OOM at batch size 2 it halved to 1 and immediately raised — batch size 1
        was never actually attempted, despite the error message claiming it was.
        """
        text_batch_size = 128
        while text_batch_size >= 1:
            try:
                return self.get_textual_embedding_in_batches(x_txt, text_batch_size=text_batch_size)
            except torch.cuda.OutOfMemoryError:
                if text_batch_size == 1:
                    break
                text_batch_size //= 2
                clear_cuda_cache()
                print(f"🤯 Reducing batch size to {text_batch_size} due to OOM")
        raise RuntimeError("🤯 OOM even with batch size 1!")

    def get_textual_embedding_in_batches(self, x_txt: np.ndarray, text_batch_size: int) -> Tensor:
        """Embed each *unique* text once, then scatter results back to the
        original (batch, seq) layout to avoid re-encoding duplicates."""
        # Get unique texts and mapping indices
        unique_texts, inverse_indices = np.unique(x_txt, return_inverse=True)
        num_unique_texts = len(unique_texts)
        embeddings = []
        for i in range(0, num_unique_texts, text_batch_size):
            batch_texts = unique_texts[i:i + text_batch_size].tolist()
            inputs = self.tokenizer(batch_texts, padding=True, return_tensors='pt', truncation=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            outputs = self.text_encoder(**inputs)
            # Take the [CLS] token representation
            embeddings.append(outputs.last_hidden_state[:, 0, :])
        embeddings = torch.cat(embeddings, dim=0)
        inverse_indices = torch.tensor(inverse_indices, dtype=torch.long, device=embeddings.device)
        # Map the unique embeddings back to the original positions and reshape to match the original dimension
        batch_size, seq_len = x_txt.shape
        embeddings = embeddings[inverse_indices].view(batch_size, seq_len, -1)
        if not tuple(embeddings.shape) == (batch_size, seq_len, self.config.d_model):
            raise RuntimeError(f"Unexpected embedding shape: {embeddings.shape}")
        return embeddings
@@ -0,0 +1,37 @@
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ D_MODEL = 384
5
+ E5_SMALL = 'intfloat/e5-small-v2'
6
+
7
+ GLOBAL_BATCH_SIZE = 128
8
+ BATCH_SIZE = 64
9
+ WEIGHT_DECAY = 0.001
10
+ LORA_LR = 0.001
11
+ LORA_R = 32
12
+
13
class TabStarConfig(PretrainedConfig):
    """Hyperparameter container for TabSTAR: LoRA rank, optimizer settings,
    and the micro/macro batch sizes used for gradient accumulation."""

    model_type = "tabstar"

    def __init__(
        self,
        r: int = LORA_R,
        lr: float = LORA_LR,
        weight_decay: float = WEIGHT_DECAY,
        macro_batch_size: int = GLOBAL_BATCH_SIZE,
        batch_size: int = BATCH_SIZE,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # LoRA rank and optimizer hyperparameters.
        self.r = r
        self.lr = lr
        self.weight_decay = weight_decay
        # The macro batch is reached by accumulating gradients over micro batches.
        self.macro_batch_size = macro_batch_size
        self.batch_size = batch_size
        assert self.batch_size <= self.macro_batch_size, "Batch size cannot be larger than macro batch size"

    @property
    def accumulation_steps(self) -> int:
        """Number of micro-batches accumulated per optimizer step."""
        steps, remainder = divmod(self.macro_batch_size, self.batch_size)
        # The macro batch must be an exact multiple of the micro batch.
        assert remainder == 0
        return steps
@@ -0,0 +1,41 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch import nn, Tensor
4
+
5
+ from tabstar.arch.config import D_MODEL
6
+
7
+
8
class NumericalFusion(nn.Module):
    """Fuses each feature's textual embedding with its numerical value.

    Every (text embedding, embedded scalar) pair forms a length-2 sequence that
    is passed through a single transformer layer; the two outputs are averaged.
    """

    def __init__(self):
        super().__init__()
        # Projects a raw scalar into the model dimension.
        self.scalar_embedder = nn.Sequential(
            nn.Linear(1, D_MODEL * 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(D_MODEL * 2, D_MODEL)
        )
        # Attends over the (text, number) pair of each feature.
        self.fusion_block = nn.TransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=2,
            dim_feedforward=D_MODEL * 4,
            dropout=0.1,
            activation='relu',
            batch_first=True,
            norm_first=True
        )

    def forward(self, textual_embeddings: Tensor, x_num: np.ndarray) -> Tensor:
        """Return fused embeddings with the same shape as `textual_embeddings`."""
        n_rows, n_tokens, dim = textual_embeddings.shape
        numbers = torch.tensor(x_num, dtype=textual_embeddings.dtype, device=textual_embeddings.device)
        value_embeddings = self.scalar_embedder(numbers.unsqueeze(-1))
        assert value_embeddings.shape == textual_embeddings.shape
        pairs = torch.stack([textual_embeddings, value_embeddings], dim=2)
        assert pairs.shape == (n_rows, n_tokens, 2, dim)
        # Flatten so every (text, number) pair becomes its own length-2 sequence.
        fused = self.fusion_block(pairs.view(n_rows * n_tokens, 2, dim))
        # Average the pair outputs back into one embedding per feature.
        fused = fused.view(n_rows, n_tokens, 2, dim).mean(dim=2)
        assert fused.shape == textual_embeddings.shape
        return fused
@@ -0,0 +1,25 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from tabstar.arch.config import D_MODEL
5
+
6
+
7
class InteractionEncoder(nn.Module):
    """Stack of transformer encoder layers modelling interactions between
    the per-feature embeddings of a tabular example."""

    def __init__(self, num_layers: int = 6, d_model: int = D_MODEL, num_heads_factor: int = 64,
                 ffn_d_hidden_multiplier: int = 4, dropout: float = 0.1):
        super().__init__()
        # Head count scales with the model width (e.g. 384 // 64 = 6 heads).
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=d_model // num_heads_factor,
            dim_feedforward=d_model * ffn_d_hidden_multiplier,
            dropout=dropout,
            activation='relu',
            batch_first=True,
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers, enable_nested_tensor=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the encoder stack; the input shape is preserved."""
        return self.encoder(x)
@@ -0,0 +1,19 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from tabstar.arch.config import D_MODEL
5
+
6
+
7
class PredictionHead(nn.Module):
    """Two-layer MLP mapping a token embedding to a single scalar score."""

    def __init__(self, input_size: int = D_MODEL):
        super().__init__()
        # Hidden layer is 4x the input width, mirroring a transformer FFN.
        self.layers = nn.Sequential(
            nn.Linear(input_size, input_size * 4),
            nn.ReLU(),
            nn.Linear(input_size * 4, 1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return a score of shape (..., 1) for each input embedding."""
        return self.layers(x)
@@ -0,0 +1 @@
1
+ SEED = 1306
File without changes
@@ -0,0 +1,58 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+ from pandas import Series
5
+ from sklearn.preprocessing import QuantileTransformer
6
+
7
+ from tabstar.preprocessing.nulls import get_invalid_indices, MISSING_VALUE
8
+
9
+ VERBALIZED_QUANTILE_BINS = 10
10
+
11
def fit_numerical_bins(s: Series) -> QuantileTransformer:
    """Fit a uniform quantile transformer on the non-null values of `s`."""
    values = s.copy().dropna()
    # n_quantiles may not exceed the number of samples; subsample is set high
    # enough to effectively disable subsampling.
    scaler = QuantileTransformer(
        output_distribution='uniform',
        n_quantiles=min(1000, len(values)),
        subsample=1000000000,
        random_state=0,
    )
    scaler.fit(values.values.reshape(-1, 1))
    return scaler
19
+
20
+
21
def transform_numerical_bins(s: Series, scaler: QuantileTransformer) -> Series:
    """Map each value of `s` to a verbalized quantile-bin label.

    Invalid/missing entries are overwritten with MISSING_VALUE after binning,
    so np.digitize never needs to handle them specially.
    """
    missing_positions = get_invalid_indices(s)
    quantile_levels = np.linspace(0, 1, VERBALIZED_QUANTILE_BINS + 1)
    boundaries = scaler.inverse_transform(quantile_levels.reshape(-1, 1)).flatten()
    assert len(boundaries) == VERBALIZED_QUANTILE_BINS + 1
    labels = verbalize_bins(boundaries)
    # Bin index 0 means below the lowest boundary; len(boundaries) means above.
    assignments = np.digitize(s, boundaries)
    texts = [labels[bin_idx] for bin_idx in assignments]
    for pos in missing_positions:
        texts[pos] = MISSING_VALUE
    return Series(texts, index=s.index, name=s.name)
33
+
34
+
35
def verbalize_bins(boundaries: np.ndarray) -> List[str]:
    """Turn ascending quantile boundaries into human-readable bin labels.

    Returns VERBALIZED_QUANTILE_BINS + 2 labels: a "lower than" label for
    values below the fitted range, one label per inner bin, and a "higher
    than" label for values above the range — matching np.digitize output.

    Bug fixes:
    - The extreme labels used min()/max() on the *formatted strings*, which
      compares lexicographically (e.g. "10" < "2"); the boundaries from the
      quantile transform are sorted, so the first/last entries are correct.
    - The quantile step was `i * VERBALIZED_QUANTILE_BINS`, which is only
      right because 100 / 10 happens to equal 10; compute the step explicitly.
    """
    # TODO: this can become a bit ugly with high-precision numbers, or relatively-discrete numerical values
    boundaries = [format_float(b) for b in boundaries]
    first = f"Lower than {boundaries[0]} (Quantile 0%)"
    last = f"Higher than {boundaries[-1]} (Quantile 100%)"
    step = 100 // VERBALIZED_QUANTILE_BINS
    bins = []
    for i, b in enumerate(boundaries[:-1]):
        r = f"{b} to {boundaries[i + 1]}"
        q = f"(Quantile {i * step} - {(i + 1) * step}%)"
        bins.append(f"{r} {q}")
    assert len(bins) == VERBALIZED_QUANTILE_BINS == len(boundaries) - 1
    return [first] + bins + [last]
50
+
51
+
52
def format_float(num: float) -> str:
    """Render a float rounded to 4 decimal places, without trailing zeros.

    Whole numbers are shown without a decimal point (e.g. 3.0 -> "3").
    """
    value = round(num, 4)
    if value.is_integer():
        return str(int(value))
    # Fixed-point render, then drop trailing zeros and a dangling dot.
    text = f"{value:.4f}"
    return text.rstrip("0").rstrip(".")
@@ -0,0 +1,44 @@
1
+ from typing import Any, Dict
2
+
3
+ import pandas as pd
4
+ from pandas import Series, DataFrame
5
+ from pandas.core.dtypes.common import is_datetime64_any_dtype
6
+ from skrub import DatetimeEncoder
7
+
8
+
9
def transform_date_features(x: DataFrame, date_transformers: Dict[str, DatetimeEncoder]) -> DataFrame:
    """Replace each fitted datetime column of `x` with its encoded features.

    Encoded columns are appended at the end of the frame in encoder order.
    """
    for column, encoder in date_transformers.items():
        parsed = series_to_dt(s=x[column])
        encoded = encoder.transform(parsed)
        x = pd.concat([x.drop(columns=[column]), encoded], axis=1)
    return x
16
+
17
def fit_date_encoders(x: DataFrame) -> Dict[str, DatetimeEncoder]:
    """Fit a skrub DatetimeEncoder for every datetime-typed column of `x`."""
    encoders: Dict[str, DatetimeEncoder] = {}
    for col, dtype in x.dtypes.items():
        if not is_datetime64_any_dtype(dtype):
            continue
        name = str(col)
        encoder = DatetimeEncoder(add_weekday=True, add_total_seconds=True)
        encoder.fit(series_to_dt(s=x[name]))
        encoders[name] = encoder
    return encoders
26
+
27
def series_to_dt(s: Series) -> Series:
    """Coerce a series of (possibly dirty) date values into naive datetimes.

    Unparseable entries become NaT via errors='coerce'.
    """
    # TODO: do we want to handle missing values here?
    cleaned = s.apply(_clean_dirty_date)
    parsed = pd.to_datetime(cleaned, errors='coerce')
    return parsed.apply(_remove_timezone)
33
+
34
+
35
+ def _remove_timezone(dt):
36
+ if pd.notnull(dt) and getattr(dt, 'tzinfo', None) is not None:
37
+ return dt.tz_localize(None)
38
+ return dt
39
+
40
+
41
+ def _clean_dirty_date(s: Any) -> Any:
42
+ if isinstance(s, str):
43
+ s = s.replace('"', "")
44
+ return s
@@ -0,0 +1,35 @@
1
+ from typing import Any
2
+
3
+ from pandas import Series
4
+
5
+ from tabstar.preprocessing.nulls import get_valid_values
6
+
7
+ MAX_NUMERIC_FOR_CATEGORICAL = 50
8
+
9
+
10
def is_mostly_numerical(s: Series) -> bool:
    """Decide whether a column should be treated as numerical.

    A column counts as numerical when it has many distinct valid values
    (more than MAX_NUMERIC_FOR_CATEGORICAL) and at most one distinct
    non-numeric value (e.g. a single missing-value marker).
    """
    unique_values = set(get_valid_values(s))
    if len(unique_values) <= MAX_NUMERIC_FOR_CATEGORICAL:
        return False
    non_numeric = [v for v in unique_values if not is_numeric(v)]
    return len(non_numeric) <= 1
20
+
21
+
22
+
23
def is_numeric(f: Any) -> bool:
    """Return True if `f` represents a number.

    Strings are only considered numeric when they are pure digits, so text
    columns that merely contain occasional numbers are not misclassified.
    As a last resort any other type is probed with float().

    Bug fixes: the failure message printed the value twice ("from type {f}")
    instead of the value's type, and a TypeError (e.g. for a list or dict)
    escaped the handler and crashed the caller.
    """
    if f is None:
        return False
    if isinstance(f, str):
        return f.isdigit()
    if isinstance(f, (int, float,)):
        return True
    try:
        float(f)
        return True
    except (ValueError, TypeError) as e:
        print(f"{type(e).__name__}: {f} from type {type(f).__name__} cannot be converted to float")
        return False