tabstar 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tabstar-0.1.0/PKG-INFO +161 -0
- tabstar-0.1.0/README.md +145 -0
- tabstar-0.1.0/pyproject.toml +26 -0
- tabstar-0.1.0/setup.cfg +4 -0
- tabstar-0.1.0/src/tabstar/__init__.py +1 -0
- tabstar-0.1.0/src/tabstar/arch/__init__.py +0 -0
- tabstar-0.1.0/src/tabstar/arch/arch.py +69 -0
- tabstar-0.1.0/src/tabstar/arch/config.py +37 -0
- tabstar-0.1.0/src/tabstar/arch/fusion.py +41 -0
- tabstar-0.1.0/src/tabstar/arch/interaction.py +25 -0
- tabstar-0.1.0/src/tabstar/arch/prediction.py +19 -0
- tabstar-0.1.0/src/tabstar/constants.py +1 -0
- tabstar-0.1.0/src/tabstar/preprocessing/__init__.py +0 -0
- tabstar-0.1.0/src/tabstar/preprocessing/binning.py +58 -0
- tabstar-0.1.0/src/tabstar/preprocessing/dates.py +44 -0
- tabstar-0.1.0/src/tabstar/preprocessing/detection.py +35 -0
- tabstar-0.1.0/src/tabstar/preprocessing/feat_types.py +45 -0
- tabstar-0.1.0/src/tabstar/preprocessing/nulls.py +39 -0
- tabstar-0.1.0/src/tabstar/preprocessing/scaler.py +24 -0
- tabstar-0.1.0/src/tabstar/preprocessing/sparse.py +18 -0
- tabstar-0.1.0/src/tabstar/preprocessing/splits.py +32 -0
- tabstar-0.1.0/src/tabstar/preprocessing/target.py +33 -0
- tabstar-0.1.0/src/tabstar/preprocessing/texts.py +30 -0
- tabstar-0.1.0/src/tabstar/preprocessing/verbalize.py +31 -0
- tabstar-0.1.0/src/tabstar/tabstar_model.py +106 -0
- tabstar-0.1.0/src/tabstar/tabstar_verbalizer.py +102 -0
- tabstar-0.1.0/src/tabstar/training/__init__.py +0 -0
- tabstar-0.1.0/src/tabstar/training/dataloader.py +46 -0
- tabstar-0.1.0/src/tabstar/training/devices.py +22 -0
- tabstar-0.1.0/src/tabstar/training/early_stopping.py +23 -0
- tabstar-0.1.0/src/tabstar/training/lora.py +34 -0
- tabstar-0.1.0/src/tabstar/training/metrics.py +49 -0
- tabstar-0.1.0/src/tabstar/training/optimizer.py +18 -0
- tabstar-0.1.0/src/tabstar/training/trainer.py +145 -0
- tabstar-0.1.0/src/tabstar/training/utils.py +8 -0
- tabstar-0.1.0/src/tabstar.egg-info/PKG-INFO +161 -0
- tabstar-0.1.0/src/tabstar.egg-info/SOURCES.txt +44 -0
- tabstar-0.1.0/src/tabstar.egg-info/dependency_links.txt +1 -0
- tabstar-0.1.0/src/tabstar.egg-info/requires.txt +8 -0
- tabstar-0.1.0/src/tabstar.egg-info/top_level.txt +1 -0
- tabstar-0.1.0/test/test_curation.py +11 -0
- tabstar-0.1.0/test/test_datetimes.py +49 -0
- tabstar-0.1.0/test/test_feature_types.py +8 -0
- tabstar-0.1.0/test/test_layers_unfreeze.py +8 -0
- tabstar-0.1.0/test/test_null_detections.py +17 -0
- tabstar-0.1.0/test/test_numerical_verbalization.py +8 -0
tabstar-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tabstar
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations
|
|
5
|
+
Author-email: Alan Arazi <alanarazi7@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: pandas>=2.2.2
|
|
10
|
+
Requires-Dist: peft
|
|
11
|
+
Requires-Dist: scikit-learn
|
|
12
|
+
Requires-Dist: skrub
|
|
13
|
+
Requires-Dist: torch>=2.6.0
|
|
14
|
+
Requires-Dist: tqdm
|
|
15
|
+
Requires-Dist: transformers>=4.49.0
|
|
16
|
+
|
|
17
|
+
<img src="src/tabstar/resources/tabstar_logo.png" alt="TabSTAR Logo" width="50%">
|
|
18
|
+
|
|
19
|
+
**Welcome to the TabSTAR repository! 👋**
|
|
20
|
+
You can use it in two modes: production mode for fitting TabSTAR on your own dataset, and research mode to pretrain TabSTAR and replicate our work in the paper.
|
|
21
|
+
|
|
22
|
+
🚧 The repository is under construction: Any bugs or feature requests? Please open an issue! 🚧
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
### 📚 Resources
|
|
27
|
+
|
|
28
|
+
* **Paper**: [TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations](https://arxiv.org/abs/2505.18125)
|
|
29
|
+
* **Project Website**: [TabSTAR](https://eilamshapira.com/TabSTAR/)
|
|
30
|
+
|
|
31
|
+
<img src="src/tabstar/resources/tabstar_arch.png" alt="TabSTAR Logo" width="200%">
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Production Mode
|
|
36
|
+
|
|
37
|
+
Use this mode if you want to fit a pretrained TabSTAR model to your own dataset.
|
|
38
|
+
(Note that currently we still don't support reloading that model for later use, but this is coming soon! 🔜)
|
|
39
|
+
|
|
40
|
+
### Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
source init.sh
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Inference Example
|
|
47
|
+
|
|
48
|
+
TabSTAR uses the sklearn API, and it is as simple as this:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
import pandas as pd
|
|
52
|
+
from sklearn.metrics import classification_report
|
|
53
|
+
from sklearn.model_selection import train_test_split
|
|
54
|
+
|
|
55
|
+
from tabstar.tabstar_model import TabSTARClassifier
|
|
56
|
+
|
|
57
|
+
x = pd.read_csv("src/tabstar/resources/imdb.csv")
|
|
58
|
+
y = x.pop('Genre_is_Drama')
|
|
59
|
+
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
|
|
60
|
+
tabstar = TabSTARClassifier()
|
|
61
|
+
tabstar.fit(x_train, y_train)
|
|
62
|
+
y_pred = tabstar.predict(x_test)
|
|
63
|
+
print(classification_report(y_test, y_pred))
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Below is a template you can use to quickly get started with TabSTAR in production mode.
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from pandas import DataFrame, Series
|
|
70
|
+
from sklearn.model_selection import train_test_split
|
|
71
|
+
|
|
72
|
+
from tabstar.tabstar_model import TabSTARClassifier, TabSTARRegressor
|
|
73
|
+
|
|
74
|
+
# --- USER-PROVIDED INPUTS ---
|
|
75
|
+
x_train = None # TODO: load your feature DataFrame here
|
|
76
|
+
y_train = None # TODO: load your target Series here
|
|
77
|
+
is_cls = None # TODO: True for classification, False for regression
|
|
78
|
+
x_test = None # TODO Optional: load your test feature DataFrame (or leave as None)
|
|
79
|
+
y_test = None # TODO Optional: load your test target Series (or leave as None)
|
|
80
|
+
# -----------------------------
|
|
81
|
+
|
|
82
|
+
# Sanity checks
|
|
83
|
+
assert isinstance(x_train, DataFrame), "x should be a pandas DataFrame"
|
|
84
|
+
assert isinstance(y_train, Series), "y should be a pandas Series"
|
|
85
|
+
assert isinstance(is_cls, bool), "is_cls should be a boolean indicating classification or regression"
|
|
86
|
+
|
|
87
|
+
if x_test is None:
|
|
88
|
+
assert y_test is None, "If x_test is None, y_test must also be None"
|
|
89
|
+
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)
|
|
90
|
+
|
|
91
|
+
assert isinstance(x_test, DataFrame), "x_test should be a pandas DataFrame"
|
|
92
|
+
assert isinstance(y_test, Series), "y_test should be a pandas Series"
|
|
93
|
+
|
|
94
|
+
tabstar_cls = TabSTARClassifier if is_cls else TabSTARRegressor
|
|
95
|
+
tabstar = tabstar_cls()
|
|
96
|
+
tabstar.fit(x_train, y_train)
|
|
97
|
+
y_pred = tabstar.predict(x_test)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Research Mode
|
|
103
|
+
|
|
104
|
+
Use this section when you want to pretrain, finetune, or run baselines on TabSTAR. It assumes you are actively working on model development, experimenting with different datasets, or comparing against other methods.
|
|
105
|
+
|
|
106
|
+
### Prerequisites
|
|
107
|
+
|
|
108
|
+
After cloning the repo, run:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
source init.sh
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
This will install all necessary dependencies, set up your environment, and download any example data needed to get started.
|
|
115
|
+
|
|
116
|
+
### Pretraining
|
|
117
|
+
|
|
118
|
+
To pretrain TabSTAR on a specified number of datasets:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
python do_pretrain.py --n_datasets=256
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
`--n_datasets` determines how many datasets to use for pretraining. You can reduce this number for quick debugging, but note this will harm downstream performance.
|
|
125
|
+
|
|
126
|
+
### Finetuning
|
|
127
|
+
|
|
128
|
+
Once pretraining finishes, note the printed `<PRETRAINED_EXP>` identifier. Then run:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
python do_finetune.py --pretrain_exp=<PRETRAINED_EXP> --dataset_id=46655
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
`--dataset_id` is an ID for the downstream task you want to evaluate yourself on. Only the 400 datasets in the paper are supported.
|
|
135
|
+
|
|
136
|
+
### Baseline Comparison
|
|
137
|
+
|
|
138
|
+
If you want to compare TabSTAR against a classic baseline (e.g., random forest):
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
python do_baseline.py --model=rf --dataset_id=46655
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
You can also try the other models supported by `do_baseline.py` (check the script for details).
|
|
145
|
+
|
|
146
|
+
### License
|
|
147
|
+
|
|
148
|
+
This work is licensed under the [Creative Commons Attribution 4.0 International License (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).
|
|
149
|
+
|
|
150
|
+
### Citation
|
|
151
|
+
|
|
152
|
+
If you use TabSTAR in your research, please cite:
|
|
153
|
+
|
|
154
|
+
```bibtex
|
|
155
|
+
@article{arazi2025tabstarf,
|
|
156
|
+
title = {TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations},
|
|
157
|
+
author = {Alan Arazi and Eilam Shapira and Roi Reichart},
|
|
158
|
+
journal = {arXiv preprint arXiv:2505.18125},
|
|
159
|
+
year = {2025},
|
|
160
|
+
}
|
|
161
|
+
```
|
tabstar-0.1.0/README.md
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
<img src="src/tabstar/resources/tabstar_logo.png" alt="TabSTAR Logo" width="50%">
|
|
2
|
+
|
|
3
|
+
**Welcome to the TabSTAR repository! 👋**
|
|
4
|
+
You can use it in two modes: production mode for fitting TabSTAR on your own dataset, and research mode to pretrain TabSTAR and replicate our work in the paper.
|
|
5
|
+
|
|
6
|
+
🚧 The repository is under construction: Any bugs or feature requests? Please open an issue! 🚧
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
### 📚 Resources
|
|
11
|
+
|
|
12
|
+
* **Paper**: [TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations](https://arxiv.org/abs/2505.18125)
|
|
13
|
+
* **Project Website**: [TabSTAR](https://eilamshapira.com/TabSTAR/)
|
|
14
|
+
|
|
15
|
+
<img src="src/tabstar/resources/tabstar_arch.png" alt="TabSTAR Logo" width="200%">
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Production Mode
|
|
20
|
+
|
|
21
|
+
Use this mode if you want to fit a pretrained TabSTAR model to your own dataset.
|
|
22
|
+
(Note that currently we still don't support reloading that model for later use, but this is coming soon! 🔜)
|
|
23
|
+
|
|
24
|
+
### Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
source init.sh
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Inference Example
|
|
31
|
+
|
|
32
|
+
TabSTAR uses the sklearn API, and it is as simple as this:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import pandas as pd
|
|
36
|
+
from sklearn.metrics import classification_report
|
|
37
|
+
from sklearn.model_selection import train_test_split
|
|
38
|
+
|
|
39
|
+
from tabstar.tabstar_model import TabSTARClassifier
|
|
40
|
+
|
|
41
|
+
x = pd.read_csv("src/tabstar/resources/imdb.csv")
|
|
42
|
+
y = x.pop('Genre_is_Drama')
|
|
43
|
+
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
|
|
44
|
+
tabstar = TabSTARClassifier()
|
|
45
|
+
tabstar.fit(x_train, y_train)
|
|
46
|
+
y_pred = tabstar.predict(x_test)
|
|
47
|
+
print(classification_report(y_test, y_pred))
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Below is a template you can use to quickly get started with TabSTAR in production mode.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from pandas import DataFrame, Series
|
|
54
|
+
from sklearn.model_selection import train_test_split
|
|
55
|
+
|
|
56
|
+
from tabstar.tabstar_model import TabSTARClassifier, TabSTARRegressor
|
|
57
|
+
|
|
58
|
+
# --- USER-PROVIDED INPUTS ---
|
|
59
|
+
x_train = None # TODO: load your feature DataFrame here
|
|
60
|
+
y_train = None # TODO: load your target Series here
|
|
61
|
+
is_cls = None # TODO: True for classification, False for regression
|
|
62
|
+
x_test = None # TODO Optional: load your test feature DataFrame (or leave as None)
|
|
63
|
+
y_test = None # TODO Optional: load your test target Series (or leave as None)
|
|
64
|
+
# -----------------------------
|
|
65
|
+
|
|
66
|
+
# Sanity checks
|
|
67
|
+
assert isinstance(x_train, DataFrame), "x should be a pandas DataFrame"
|
|
68
|
+
assert isinstance(y_train, Series), "y should be a pandas Series"
|
|
69
|
+
assert isinstance(is_cls, bool), "is_cls should be a boolean indicating classification or regression"
|
|
70
|
+
|
|
71
|
+
if x_test is None:
|
|
72
|
+
assert y_test is None, "If x_test is None, y_test must also be None"
|
|
73
|
+
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)
|
|
74
|
+
|
|
75
|
+
assert isinstance(x_test, DataFrame), "x_test should be a pandas DataFrame"
|
|
76
|
+
assert isinstance(y_test, Series), "y_test should be a pandas Series"
|
|
77
|
+
|
|
78
|
+
tabstar_cls = TabSTARClassifier if is_cls else TabSTARRegressor
|
|
79
|
+
tabstar = tabstar_cls()
|
|
80
|
+
tabstar.fit(x_train, y_train)
|
|
81
|
+
y_pred = tabstar.predict(x_test)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Research Mode
|
|
87
|
+
|
|
88
|
+
Use this section when you want to pretrain, finetune, or run baselines on TabSTAR. It assumes you are actively working on model development, experimenting with different datasets, or comparing against other methods.
|
|
89
|
+
|
|
90
|
+
### Prerequisites
|
|
91
|
+
|
|
92
|
+
After cloning the repo, run:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
source init.sh
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
This will install all necessary dependencies, set up your environment, and download any example data needed to get started.
|
|
99
|
+
|
|
100
|
+
### Pretraining
|
|
101
|
+
|
|
102
|
+
To pretrain TabSTAR on a specified number of datasets:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
python do_pretrain.py --n_datasets=256
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
`--n_datasets` determines how many datasets to use for pretraining. You can reduce this number for quick debugging, but note this will harm downstream performance.
|
|
109
|
+
|
|
110
|
+
### Finetuning
|
|
111
|
+
|
|
112
|
+
Once pretraining finishes, note the printed `<PRETRAINED_EXP>` identifier. Then run:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
python do_finetune.py --pretrain_exp=<PRETRAINED_EXP> --dataset_id=46655
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
`--dataset_id` is an ID for the downstream task you want to evaluate yourself on. Only the 400 datasets in the paper are supported.
|
|
119
|
+
|
|
120
|
+
### Baseline Comparison
|
|
121
|
+
|
|
122
|
+
If you want to compare TabSTAR against a classic baseline (e.g., random forest):
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
python do_baseline.py --model=rf --dataset_id=46655
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
You can also try the other models supported by `do_baseline.py` (check the script for details).
|
|
129
|
+
|
|
130
|
+
### License
|
|
131
|
+
|
|
132
|
+
This work is licensed under the [Creative Commons Attribution 4.0 International License (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/).
|
|
133
|
+
|
|
134
|
+
### Citation
|
|
135
|
+
|
|
136
|
+
If you use TabSTAR in your research, please cite:
|
|
137
|
+
|
|
138
|
+
```bibtex
|
|
139
|
+
@article{arazi2025tabstarf,
|
|
140
|
+
title = {TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations},
|
|
141
|
+
author = {Alan Arazi and Eilam Shapira and Roi Reichart},
|
|
142
|
+
journal = {arXiv preprint arXiv:2505.18125},
|
|
143
|
+
year = {2025},
|
|
144
|
+
}
|
|
145
|
+
```
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tabstar"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "TabSTAR: A Foundation Tabular Model With Semantically Target-Aware Representations"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name = "Alan Arazi", email = "alanarazi7@gmail.com" }
|
|
12
|
+
]
|
|
13
|
+
license = { text = "MIT" }
|
|
14
|
+
dependencies = [
|
|
15
|
+
"numpy",
|
|
16
|
+
"pandas>=2.2.2",
|
|
17
|
+
"peft",
|
|
18
|
+
"scikit-learn",
|
|
19
|
+
"skrub",
|
|
20
|
+
"torch>=2.6.0",
|
|
21
|
+
"tqdm",
|
|
22
|
+
"transformers>=4.49.0"
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[tool.setuptools.packages.find]
|
|
26
|
+
where = ["src"]
|
tabstar-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch
|
|
3
|
+
from torch import Tensor
|
|
4
|
+
from transformers import AutoTokenizer, AutoModel, PreTrainedModel
|
|
5
|
+
|
|
6
|
+
from tabstar.arch.config import TabStarConfig, E5_SMALL
|
|
7
|
+
from tabstar.arch.interaction import InteractionEncoder
|
|
8
|
+
from tabstar.arch.fusion import NumericalFusion
|
|
9
|
+
from tabstar.arch.prediction import PredictionHead
|
|
10
|
+
from tabstar.training.devices import clear_cuda_cache
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TabStarModel(PreTrainedModel):
    """TabSTAR backbone: a text encoder fused with numerical cell values.

    Each cell is verbalized to text, embedded with an E5 encoder, fused with
    its numerical value, and passed through a tabular interaction encoder.
    The first ``d_output`` tokens are read out as target scores.
    """
    config_class = TabStarConfig

    def __init__(self, config: TabStarConfig):
        super().__init__(config)
        self.text_encoder = AutoModel.from_pretrained(E5_SMALL)
        self.tokenizer = AutoTokenizer.from_pretrained(E5_SMALL)
        self.numerical_fusion = NumericalFusion()
        self.tabular_encoder = InteractionEncoder()
        # Separate heads: classification scores vs. regression value.
        self.cls_head = PredictionHead()
        self.reg_head = PredictionHead()
        self.post_init()

    def forward(self, x_txt: np.ndarray, x_num: np.ndarray, d_output: int) -> Tensor:
        """Score the first ``d_output`` target tokens for a batch.

        :param x_txt: (batch, seq) array of verbalized cell texts.
        :param x_num: (batch, seq) array of numerical values aligned with x_txt.
        :param d_output: 1 for regression, otherwise the number of classes.
        :return: tensor of shape (batch, d_output).
        """
        textual_embeddings = self.get_textual_embedding(x_txt)
        embeddings = self.numerical_fusion(textual_embeddings=textual_embeddings, x_num=x_num)
        encoded = self.tabular_encoder(embeddings)
        target_tokens = encoded[:, :d_output]
        if d_output == 1:
            target_scores = self.reg_head(target_tokens)
        else:
            target_scores = self.cls_head(target_tokens)
        # Heads emit (batch, d_output, 1); drop the trailing singleton dim.
        target_scores = target_scores.squeeze(dim=-1)
        assert tuple(target_scores.shape) == (x_txt.shape[0], d_output)
        return target_scores

    def get_textual_embedding(self, x_txt: np.ndarray) -> Tensor:
        """Embed texts, halving the batch size on CUDA OOM until one succeeds.

        Bug fix: the previous loop condition (``while text_batch_size > 1``)
        gave up before ever attempting a batch size of 1, despite the error
        message claiming it had been tried.
        """
        text_batch_size = 128
        while True:
            try:
                return self.get_textual_embedding_in_batches(x_txt, text_batch_size=text_batch_size)
            except torch.cuda.OutOfMemoryError:
                if text_batch_size <= 1:
                    raise RuntimeError("🤯 OOM even with batch size 1!")
                text_batch_size //= 2
                clear_cuda_cache()
                print(f"🤯 Reducing batch size to {text_batch_size} due to OOM")

    def get_textual_embedding_in_batches(self, x_txt: np.ndarray, text_batch_size: int) -> Tensor:
        """Embed only the unique texts, then scatter them back to (batch, seq) layout."""
        # Deduplicate: tabular data repeats identical cell texts many times.
        unique_texts, inverse_indices = np.unique(x_txt, return_inverse=True)
        num_unique_texts = len(unique_texts)
        embeddings = []
        for i in range(0, num_unique_texts, text_batch_size):
            batch_texts = unique_texts[i:i + text_batch_size].tolist()
            inputs = self.tokenizer(batch_texts, padding=True, return_tensors='pt', truncation=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            outputs = self.text_encoder(**inputs)
            # Take the [CLS] token representation
            embeddings.append(outputs.last_hidden_state[:, 0, :])
        embeddings = torch.cat(embeddings, dim=0)
        inverse_indices = torch.tensor(inverse_indices, dtype=torch.long, device=embeddings.device)
        # Map the unique embeddings back to the original positions and reshape to match the original dimension
        batch_size, seq_len = x_txt.shape
        embeddings = embeddings[inverse_indices].view(batch_size, seq_len, -1)
        if not tuple(embeddings.shape) == (batch_size, seq_len, self.config.d_model):
            raise RuntimeError(f"Unexpected embedding shape: {embeddings.shape}")
        return embeddings
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from transformers import PretrainedConfig
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
D_MODEL = 384
E5_SMALL = 'intfloat/e5-small-v2'

GLOBAL_BATCH_SIZE = 128
BATCH_SIZE = 64
WEIGHT_DECAY = 0.001
LORA_LR = 0.001
LORA_R = 32

class TabStarConfig(PretrainedConfig):
    """Hyperparameter container for TabSTAR training (LoRA rank, LR, batching)."""
    model_type = "tabstar"

    def __init__(
        self,
        r: int = LORA_R,
        lr: float = LORA_LR,
        weight_decay: float = WEIGHT_DECAY,
        macro_batch_size: int = GLOBAL_BATCH_SIZE,
        batch_size: int = BATCH_SIZE,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.r = r
        self.lr = lr
        self.weight_decay = weight_decay
        self.macro_batch_size = macro_batch_size
        self.batch_size = batch_size
        assert self.batch_size <= self.macro_batch_size, "Batch size cannot be larger than macro batch size"

    @property
    def accumulation_steps(self) -> int:
        # Gradient accumulation: number of micro-batches per optimizer step.
        # The macro batch must be an exact multiple of the micro batch.
        steps, remainder = divmod(self.macro_batch_size, self.batch_size)
        assert not remainder
        return steps
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch
|
|
3
|
+
from torch import nn, Tensor
|
|
4
|
+
|
|
5
|
+
from tabstar.arch.config import D_MODEL
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NumericalFusion(nn.Module):
    """Fuse each token's textual embedding with an embedding of its numeric value."""

    def __init__(self):
        super().__init__()
        # Small MLP projecting a raw scalar into the model dimension.
        self.scalar_embedder = nn.Sequential(
            nn.Linear(1, D_MODEL * 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(D_MODEL * 2, D_MODEL),
        )
        # One transformer layer attending over the (text, number) pair per token.
        self.fusion_block = nn.TransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=2,
            dim_feedforward=D_MODEL * 4,
            dropout=0.1,
            activation='relu',
            batch_first=True,
            norm_first=True,
        )

    def forward(self, textual_embeddings: Tensor, x_num: np.ndarray) -> Tensor:
        """Return fused embeddings with the same shape as ``textual_embeddings``."""
        n_rows, n_tokens, dim = textual_embeddings.shape
        numeric = torch.tensor(x_num, dtype=textual_embeddings.dtype, device=textual_embeddings.device)
        numeric_embeddings = self.scalar_embedder(numeric.unsqueeze(-1))
        assert numeric_embeddings.shape == textual_embeddings.shape
        pairs = torch.stack([textual_embeddings, numeric_embeddings], dim=2)
        assert pairs.shape == (n_rows, n_tokens, 2, dim)
        # Flatten so every (text, number) pair is its own length-2 sequence.
        fused = self.fusion_block(pairs.view(n_rows * n_tokens, 2, dim))
        # Restore the (batch, seq) layout and average the pair into one vector.
        fused = fused.view(n_rows, n_tokens, 2, dim).mean(dim=2)
        assert fused.shape == textual_embeddings.shape
        return fused
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.nn as nn
|
|
3
|
+
|
|
4
|
+
from tabstar.arch.config import D_MODEL
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class InteractionEncoder(nn.Module):
    """Stack of transformer encoder layers modeling cross-feature interactions."""

    def __init__(self, num_layers: int = 6, d_model: int = D_MODEL, num_heads_factor: int = 64,
                 ffn_d_hidden_multiplier: int = 4, dropout: float = 0.1):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            # Head count scales with width, e.g. 384 // 64 = 6 heads.
            nhead=d_model // num_heads_factor,
            dim_feedforward=d_model * ffn_d_hidden_multiplier,
            dropout=dropout,
            activation='relu',
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers, enable_nested_tensor=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the encoder stack; input shape is preserved."""
        return self.encoder(x)
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.nn as nn
|
|
3
|
+
|
|
4
|
+
from tabstar.arch.config import D_MODEL
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PredictionHead(nn.Module):
    """Two-layer MLP mapping a token embedding to a single scalar score."""

    def __init__(self, input_size: int = D_MODEL):
        super().__init__()
        expanded = input_size * 4
        self.layers = nn.Sequential(
            nn.Linear(input_size, expanded),
            nn.ReLU(),
            nn.Linear(expanded, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return scores of shape (..., 1) for input of shape (..., input_size)."""
        return self.layers(x)
|
@@ -0,0 +1 @@
|
|
|
1
|
+
SEED = 1306
|
|
File without changes
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from pandas import Series
|
|
5
|
+
from sklearn.preprocessing import QuantileTransformer
|
|
6
|
+
|
|
7
|
+
from tabstar.preprocessing.nulls import get_invalid_indices, MISSING_VALUE
|
|
8
|
+
|
|
9
|
+
VERBALIZED_QUANTILE_BINS = 10
|
|
10
|
+
|
|
11
|
+
def fit_numerical_bins(s: Series) -> QuantileTransformer:
    """Fit a uniform quantile transformer on the non-null values of ``s``."""
    non_null = s.dropna()
    transformer = QuantileTransformer(
        output_distribution='uniform',
        # Cap the quantile grid at the sample size for small columns.
        n_quantiles=min(1000, len(non_null)),
        # Effectively disables subsampling: fit on all rows.
        subsample=1000000000,
        random_state=0,
    )
    transformer.fit(non_null.to_numpy().reshape(-1, 1))
    return transformer
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def transform_numerical_bins(s: Series, scaler: QuantileTransformer) -> Series:
    """Verbalize a numerical series into quantile-bin description strings.

    Each value is mapped to a human-readable label (see ``verbalize_bins``);
    invalid entries are replaced by ``MISSING_VALUE``.
    """
    invalid_indices = get_invalid_indices(s)
    # Recover the raw-value boundaries of the uniform quantile grid.
    quantile_levels = np.linspace(0, 1, VERBALIZED_QUANTILE_BINS + 1)
    boundaries = scaler.inverse_transform(quantile_levels.reshape(-1, 1)).flatten()
    assert len(boundaries) == VERBALIZED_QUANTILE_BINS + 1
    verbalized_bins = verbalize_bins(boundaries)
    # digitize yields indices 0..len(boundaries); verbalize_bins returns
    # len(boundaries) + 1 labels (underflow + bins + overflow), so every
    # index is covered.
    bin_index = np.digitize(s, boundaries)
    verbalized = [verbalized_bins[i] for i in bin_index]
    # NOTE(review): assumes get_invalid_indices returns *positional* indices,
    # since ``verbalized`` is a plain list — confirm against its implementation.
    for idx in invalid_indices:
        verbalized[idx] = MISSING_VALUE
    s = Series(verbalized, index=s.index, name=s.name)
    return s
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def verbalize_bins(boundaries: np.ndarray) -> List[str]:
    """Turn monotonically increasing quantile boundaries into readable bin labels.

    For N+1 boundaries, returns N+2 labels: an underflow label, N interval
    labels annotated with their quantile range, and an overflow label —
    matching the 0..N+1 indices produced by ``np.digitize``.

    Bug fixes vs. the previous version:
    - The under/overflow labels used ``min``/``max`` over *formatted strings*,
      which compares lexicographically (e.g. "9" > "10"), so they could name
      the wrong boundary. Since the boundaries come from an inverse quantile
      transform they are sorted, so the first/last entries are the extremes.
    - The quantile percentages used a step equal to the bin count, which was
      only correct because there happened to be exactly 10 bins; the step is
      now derived from the actual number of bins.
    """
    # TODO: this can become a bit ugly with high-precision numbers, or relatively-discrete numerical values
    labels = [format_float(b) for b in boundaries]
    n_bins = len(labels) - 1
    first = f"Lower than {labels[0]} (Quantile 0%)"
    last = f"Higher than {labels[-1]} (Quantile 100%)"
    bins = []
    for i in range(n_bins):
        low = i * 100 // n_bins
        high = (i + 1) * 100 // n_bins
        bins.append(f"{labels[i]} to {labels[i + 1]} (Quantile {low} - {high}%)")
    assert len(bins) == n_bins
    return [first] + bins + [last]


def format_float(num: float) -> str:
    """Format a float with at most 4 decimal places, dropping trailing zeros.

    Integral values are rendered without a decimal point (e.g. 2.0 -> "2").
    """
    rounded = round(num, 4)
    if rounded.is_integer():
        return str(int(rounded))
    formatted = f"{rounded:.4f}"
    formatted = formatted.rstrip("0").rstrip(".")
    return formatted
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from typing import Any, Dict
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pandas import Series, DataFrame
|
|
5
|
+
from pandas.core.dtypes.common import is_datetime64_any_dtype
|
|
6
|
+
from skrub import DatetimeEncoder
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def transform_date_features(x: DataFrame, date_transformers: Dict[str, DatetimeEncoder]) -> DataFrame:
    """Replace every fitted date column of ``x`` with its encoded datetime features."""
    for column, encoder in date_transformers.items():
        parsed = series_to_dt(s=x[column])
        encoded = encoder.transform(parsed)
        # Drop the raw column and append the derived feature columns.
        x = pd.concat([x.drop(columns=[column]), encoded], axis=1)
    return x
|
16
|
+
|
|
17
|
+
def fit_date_encoders(x: DataFrame) -> Dict[str, DatetimeEncoder]:
    """Fit a skrub DatetimeEncoder for every datetime-typed column of ``x``."""
    encoders = {}
    for col, dtype in x.dtypes.items():
        if not is_datetime64_any_dtype(dtype):
            continue
        col = str(col)
        encoder = DatetimeEncoder(add_weekday=True, add_total_seconds=True)
        encoder.fit(series_to_dt(s=x[col]))
        encoders[col] = encoder
    return encoders
|
26
|
+
|
|
27
|
+
def series_to_dt(s: Series) -> Series:
    """Coerce raw date values into timezone-naive datetimes; unparseable entries become NaT."""
    # TODO: do we want to handle missing values here?
    cleaned = s.apply(_clean_dirty_date)
    parsed = pd.to_datetime(cleaned, errors='coerce')
    return parsed.apply(_remove_timezone)
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _remove_timezone(dt):
|
|
36
|
+
if pd.notnull(dt) and getattr(dt, 'tzinfo', None) is not None:
|
|
37
|
+
return dt.tz_localize(None)
|
|
38
|
+
return dt
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _clean_dirty_date(s: Any) -> Any:
|
|
42
|
+
if isinstance(s, str):
|
|
43
|
+
s = s.replace('"', "")
|
|
44
|
+
return s
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from pandas import Series
|
|
4
|
+
|
|
5
|
+
from tabstar.preprocessing.nulls import get_valid_values
|
|
6
|
+
|
|
7
|
+
MAX_NUMERIC_FOR_CATEGORICAL = 50
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def is_mostly_numerical(s: Series) -> bool:
    """Heuristic: a column is numerical if it has many distinct values,
    at most one of which is non-numeric."""
    unique_values = set(get_valid_values(s))
    if len(unique_values) <= MAX_NUMERIC_FOR_CATEGORICAL:
        # Few distinct values: better modeled as categorical.
        return False
    non_numeric_count = sum(1 for v in unique_values if not is_numeric(v))
    return non_numeric_count <= 1
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def is_numeric(f: Any) -> bool:
    """Return True if ``f`` represents a numeric value.

    Strings are numeric only when they are pure digits (so "3.14" and "-2"
    are deliberately rejected); ints and floats are always numeric; anything
    else is numeric iff ``float()`` accepts it.

    Bug fixes: the failure message previously printed the value twice instead
    of its type, and objects ``float()`` cannot cast (e.g. lists) raised an
    uncaught TypeError instead of returning False.
    """
    if f is None:
        return False
    if isinstance(f, str):
        return f.isdigit()
    if isinstance(f, (int, float)):
        return True
    try:
        float(f)
        return True
    except (ValueError, TypeError):
        print(f"ValueError: {f} from type {type(f)} cannot be converted to float")
        return False
|