evoforest-tab 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ """evoforest_tab: EvoForest-Tab -- an evolved universal tabular feature map + closed-form ridge head.
2
+
3
+ An interpretable, training-free, local in-context learner competitive with tabular foundation
4
+ models. See README. Main entry points:
5
+
6
+ from evoforest_tab import EvoForestTabClassifier, EvoForestTabRegressor
7
+ """
8
+ from .estimator import TabMapClassifier, TabMapRegressor
9
+ from ._genome import load_genome, build_phi, seed_variant, DEFAULT_GENOME
10
+ from ._channels import build_channels
11
+ from .combine import StackedTabularEnsemble
12
+ from ._module import EvoForestTabModule
13
+ from .inductive import (
14
+ EvoForestTabInductiveClassifier, EvoForestTabInductiveRegressor, EvoForestTabTransformer,
15
+ TabMapInductiveClassifier, TabMapInductiveRegressor, TabMapTransformer,
16
+ )
17
+
18
# Paper-facing names: the EvoForestTab* estimators are the same classes as the
# original TabMap* ones, which stay importable as aliases.
EvoForestTabClassifier = TabMapClassifier
EvoForestTabRegressor = TabMapRegressor

# Public API of the package, one name per line.
__all__ = [
    "EvoForestTabClassifier",
    "EvoForestTabRegressor",
    "TabMapClassifier",
    "TabMapRegressor",
    "build_channels",
    "build_phi",
    "load_genome",
    "seed_variant",
    "DEFAULT_GENOME",
    "StackedTabularEnsemble",
    "EvoForestTabModule",
    "EvoForestTabInductiveClassifier",
    "EvoForestTabInductiveRegressor",
    "EvoForestTabTransformer",
    "TabMapInductiveClassifier",
    "TabMapInductiveRegressor",
    "TabMapTransformer",
]
__version__ = "0.1.0"
@@ -0,0 +1,122 @@
1
+ """Channel construction: raw table rows -> the input channels the feature map reads.
2
+
3
+ Faithful to the development pipeline. All channels are UNSUPERVISED and TRANSDUCTIVE (computed over the
4
+ pooled support+query rows, label-free) so the map is leakage-safe in the in-context (support->query)
5
+ setting. Categoricals are ordinal-encoded; missing values are nan-safe; columns are padded/capped to Dmax.
6
+ """
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ DMAX_DEFAULT = 100
13
+
14
+
15
+ def _nan_col_zscore(X: torch.Tensor) -> torch.Tensor:
16
+ """Per-column z-score (nan-safe); nan -> 0 after standardizing."""
17
+ nan = torch.isnan(X)
18
+ Xf = torch.where(nan, torch.zeros_like(X), X)
19
+ cnt = (~nan).sum(0).clamp(min=1).to(X.dtype)
20
+ mean = Xf.sum(0) / cnt
21
+ var = (torch.where(nan, torch.zeros_like(X), (X - mean) ** 2)).sum(0) / cnt
22
+ z = (X - mean) / (var.sqrt() + 1e-6)
23
+ return torch.where(nan, torch.zeros_like(z), z)
24
+
25
+
26
+ def _col_rankgauss(X: torch.Tensor) -> torch.Tensor:
27
+ """Per-column rank -> ~N(0,1) (nan ranked last, then zeroed). Transductive, label-free."""
28
+ N = X.shape[0]
29
+ filled = torch.where(torch.isnan(X), torch.full_like(X, float("inf")), X)
30
+ ranks = filled.argsort(0).argsort(0).to(X.dtype)
31
+ u = (ranks + 0.5) / N
32
+ z = math.sqrt(2.0) * torch.erfinv((2 * u - 1).clamp(-1 + 1e-6, 1 - 1e-6))
33
+ return torch.where(torch.isnan(X), torch.zeros_like(z), z)
34
+
35
+
36
+ def _col_freq(Xm: np.ndarray) -> np.ndarray:
37
+ """Per-column count/frequency encoding: each cell -> fraction of rows sharing its value."""
38
+ n, D = Xm.shape
39
+ Fr = np.zeros_like(Xm, dtype=np.float64)
40
+ for j in range(D):
41
+ _, inv, counts = np.unique(Xm[:, j], return_inverse=True, return_counts=True)
42
+ Fr[:, j] = counts[inv] / n
43
+ return Fr
44
+
45
+
46
+ def _pad(X: torch.Tensor, Dmax: int) -> torch.Tensor:
47
+ if X.shape[1] >= Dmax:
48
+ return X[:, :Dmax]
49
+ return torch.cat([X, torch.zeros(X.shape[0], Dmax - X.shape[1], dtype=X.dtype)], dim=1)
50
+
51
+
52
+ def _to_ordinal(X, cat_features):
53
+ """Coerce a 2-D array (numpy object/float or pandas DataFrame) to a float matrix with categoricals
54
+ ordinal-encoded (nan preserved), returning (Xm float64, cat_mask bool)."""
55
+ try:
56
+ import pandas as pd
57
+ is_df = isinstance(X, pd.DataFrame)
58
+ except ImportError:
59
+ is_df = False
60
+ if is_df:
61
+ cols, cat = [], []
62
+ for ci, c in enumerate(X.columns):
63
+ s = X[c]
64
+ auto_cat = str(s.dtype) in ("category", "object", "bool")
65
+ user_cat = cat_features is not None and (ci in cat_features or c in cat_features)
66
+ if auto_cat or user_cat:
67
+ codes = s.astype("category").cat.codes.to_numpy().astype(np.float64)
68
+ codes[codes < 0] = np.nan
69
+ cols.append(codes); cat.append(True)
70
+ else:
71
+ cols.append(s.to_numpy(dtype=np.float64)); cat.append(False)
72
+ return np.column_stack(cols), np.array(cat, dtype=bool)
73
+ Xm = np.asarray(X, dtype=object)
74
+ n, D = Xm.shape
75
+ out = np.zeros((n, D), dtype=np.float64)
76
+ cat = np.zeros(D, dtype=bool)
77
+ for j in range(D):
78
+ col = Xm[:, j]
79
+ user_cat = cat_features is not None and j in cat_features
80
+ is_numeric = np.issubdtype(np.asarray(col).dtype, np.number)
81
+ try:
82
+ fcol = col.astype(np.float64)
83
+ numeric_ok = True
84
+ except (ValueError, TypeError):
85
+ numeric_ok = False
86
+ if user_cat or not numeric_ok or (not is_numeric):
87
+ uniq = {v: i for i, v in enumerate(sorted(set(map(str, col))))}
88
+ out[:, j] = np.array([uniq[str(v)] for v in col], dtype=np.float64)
89
+ cat[j] = True
90
+ else:
91
+ out[:, j] = fcol
92
+ return out, cat
93
+
94
+
95
def build_channels(X, cat_features=None, Dmax: int = DMAX_DEFAULT, device="cpu"):
    """Turn raw rows ``X`` (n, d) into the channel dict read by the feature map.

    Parameters
    ----------
    X : numpy array or pandas DataFrame holding the POOLED support+query rows.
    cat_features : indices (or, for DataFrames, names) of categorical columns;
        when None, categoricals are detected from the column dtype by ``_to_ordinal``.
    Dmax : number of columns every channel is padded / capped to.
    device : torch device the returned tensors are placed on.

    Returns
    -------
    dict with keys ``x`` (z-scored values), ``xrank`` (rank-gauss values),
    ``fmask`` (1 for real columns, 0 for padding), ``is_cat`` (categorical flag per
    column), ``x_freq`` (z-scored frequency encoding) and ``Dmax``.

    Raises
    ------
    ValueError if every column is constant after imputation.
    """
    values, cat_mask = _to_ordinal(X, cat_features)

    # Median-impute missing cells, then drop constant columns (matches the dev pipeline).
    col_median = np.nanmedian(values, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(values))
    values[nan_rows, nan_cols] = col_median[nan_cols]
    varying = values.std(0) > 1e-9
    values, cat_mask = values[:, varying], cat_mask[varying]
    if values.shape[1] == 0:
        raise ValueError("no non-constant columns after preprocessing")

    # Too many columns: keep the Dmax columns with the largest standard deviation.
    if values.shape[1] > Dmax:
        widest = np.argsort(-values.std(0))[:Dmax]
        values, cat_mask = values[:, widest], cat_mask[widest]

    n_rows, n_cols = values.shape
    used = min(n_cols, Dmax)
    dense = torch.from_numpy(values).float().to(device)
    x = _pad(_nan_col_zscore(dense), Dmax)
    xrank = _pad(_col_rankgauss(dense), Dmax)
    freq = torch.from_numpy(_col_freq(values)).float().to(device)
    x_freq = _pad(_nan_col_zscore(freq), Dmax)

    feature_mask = torch.zeros(n_rows, Dmax, device=device)
    feature_mask[:, :used] = 1.0
    cat_channel = torch.zeros(n_rows, Dmax, device=device)
    # The per-column flag vector broadcasts over all rows.
    cat_channel[:, :used] = torch.from_numpy(cat_mask[:used].astype(np.float32)).to(device)

    return {
        "x": x,
        "xrank": xrank,
        "fmask": feature_mask,
        "is_cat": cat_channel,
        "x_freq": x_freq,
        "Dmax": Dmax,
    }
@@ -0,0 +1,64 @@
1
+ """Evaluate the evolved genome (a list of feature-lambdas) on the input channels to produce the
2
+ feature matrix Phi. The genome is the deployment artifact; each lambda is a small, inspectable
3
+ expression over the channels with fixed seeded random projections."""
4
+ import math
5
+ import os
6
+ import re
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import yaml
12
+
13
# Namespace handed to eval() when genome lambda sources are compiled.
_G = dict(torch=torch, F=F, np=np, math=math)
# Directory of this module; the default genome file is looked up next to it.
_HERE = os.path.dirname(os.path.abspath(__file__))
DEFAULT_GENOME = os.path.join(_HERE, "champion.yaml")
16
+
17
+
18
def load_genome(path: str = DEFAULT_GENOME) -> dict:
    """Parse a genome YAML file and return its contents.

    The file is opened in a ``with`` block so the handle is closed deterministically
    (a bare ``open(path)`` leaves it to the garbage collector), and it is read as
    UTF-8 rather than the platform's locale encoding.
    """
    with open(path, encoding="utf-8") as fh:
        return yaml.safe_load(fh)
20
+
21
+
22
def seed_variant(genome: dict, offset: int) -> dict:
    """Return a shallow copy of ``genome`` whose seeds are shifted by ``offset``.

    Every ``manual_seed(N)`` occurring in the ``output`` sources is rewritten to
    ``manual_seed(N + offset)``: the same architecture with a different
    random-feature draw (used for the variance-reducing ensemble). The input
    genome is not modified.
    """
    def _shift(match):
        return f"manual_seed({int(match.group(1)) + offset})"

    shifted_sources = [re.sub(r"manual_seed\((\d+)\)", _shift, src) for src in genome["output"]]
    variant = dict(genome)
    variant["output"] = shifted_sources
    return variant
29
+
30
+
31
def build_phi(genome: dict, channels: dict) -> torch.Tensor:
    """Evaluate the genome's output lambdas on ``channels`` and return normalized features.

    Each source string in ``genome["output"]`` is compiled with ``eval`` and called;
    a lambda parameter named ``input`` receives the channel dict, one named
    ``globals`` receives the evaluated ``@globals`` entries, any other parameter
    receives None. Outputs with more than one column are split per column; other
    outputs are squeezed and kept only if they are a length-n vector. Non-finite
    values are zeroed and (near-)constant columns are dropped. The surviving
    columns are stacked into an (n, K) matrix and standardized per column.

    Raises ``ValueError`` when no usable column remains.
    """
    # NOTE(security): genome sources go through eval(); only load genomes from trusted files.
    global_sources = genome.get("@globals", {}) or {}
    gg = {name: eval(src, _G) for name, src in global_sources.items()}
    n = channels["x"].shape[0]

    def _bind(arg_name):
        if arg_name == "input":
            return channels
        if arg_name == "globals":
            return gg
        return None

    # Phase 1: compile and call every lambda before any post-processing happens.
    signals = []
    for src in genome["output"]:
        fn = eval(src, _G)
        code = fn.__code__
        arg_names = code.co_varnames[:code.co_argcount]
        signals.append(fn(*[_bind(a) for a in arg_names]))

    # Phase 2: split into columns, sanitize, and drop degenerate columns.
    kept = []
    for sig in signals:
        sig = (sig if torch.is_tensor(sig) else torch.as_tensor(sig)).float()
        if sig.dim() >= 2 and sig.shape[0] == n and sig.shape[1] > 1:
            candidates = [sig[:, c] for c in range(sig.shape[1])]
        else:
            flat = sig.squeeze()
            if flat.dim() != 1 or flat.shape[0] != n:
                continue
            candidates = [flat]
        for col in candidates:
            col = torch.where(torch.isfinite(col), col, torch.zeros_like(col))
            # Written as `not (std < eps)` on purpose: a NaN std keeps the column.
            if not (col.std() < 1e-12):
                kept.append(col)

    if not kept:
        raise ValueError("genome produced no usable feature columns")
    Phi = torch.stack(kept, dim=1)
    center = Phi.mean(0, keepdim=True)
    spread = Phi.std(0, keepdim=True).clamp(min=1e-12)
    return (Phi - center) / spread
@@ -0,0 +1,152 @@
1
+ """Compile the evolved genome (champion.yaml) into a self-contained ``torch.nn.Module``.
2
+
3
+ The genome is the *architecture spec* produced by EvoForest search. In the reference path each family's
4
+ seeded random projection is re-drawn on every forward via ``torch.randn(..., generator=manual_seed(N))``
5
+ inside an ``eval``'d lambda. That is fine for a frozen feature map but (a) cannot be fine-tuned, (b) re-draws
6
+ on every call, and (c) is not a saveable artifact.
7
+
8
+ ``EvoForestTabModule`` fixes all three: it materializes every seeded ``randn``/``rand`` draw **once** as a
9
+ frozen ``nn.Parameter`` (``requires_grad=False`` by default), so the module is a self-contained, saveable,
10
+ HuggingFace-publishable checkpoint whose ``forward`` is byte-identical to the reference evaluation -- and the
11
+ random features become fine-tunable simply by calling :meth:`unfreeze_random_features`. The deterministic
12
+ families (rank/stats/frequency) carry no parameters and are reproduced exactly.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import math
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn.functional as F
21
+ from torch import nn
22
+
23
+ from ._genome import load_genome, DEFAULT_GENOME
24
+
25
+
26
+ class _SeededWeightBank(nn.Module):
27
+ """Materialize seeded ``torch.randn``/``torch.rand`` draws as frozen Parameters, keyed by (op, seed, shape)."""
28
+
29
+ def __init__(self) -> None:
30
+ super().__init__()
31
+ self.weights = nn.ParameterDict()
32
+
33
+ @staticmethod
34
+ def _key(op: str, shape, seed: int) -> str:
35
+ return f"{op}__seed{int(seed)}__" + "x".join(str(int(s)) for s in shape)
36
+
37
+ def get(self, op: str, shape, seed: int, device=None) -> torch.Tensor:
38
+ key = self._key(op, shape, seed)
39
+ if key not in self.weights:
40
+ gen = torch.Generator().manual_seed(int(seed)) # CPU draw == reference on CPU
41
+ draw = (torch.randn if op == "randn" else torch.rand)(*shape, generator=gen)
42
+ self.weights[key] = nn.Parameter(draw, requires_grad=False) # frozen by default
43
+ w = self.weights[key]
44
+ return w.to(device) if device is not None else w
45
+
46
+
47
+ class _TorchProxy:
48
+ """A stand-in for the ``torch`` namespace inside the genome lambdas: seeded ``randn``/``rand`` are routed
49
+ to the materialized weight bank; every other attribute delegates to real ``torch``."""
50
+
51
+ def __init__(self, bank: _SeededWeightBank) -> None:
52
+ object.__setattr__(self, "_bank", bank)
53
+
54
+ def __getattr__(self, name):
55
+ return getattr(torch, name)
56
+
57
+ def randn(self, *shape, generator=None, device=None, dtype=None):
58
+ if generator is None:
59
+ return torch.randn(*shape, device=device, dtype=dtype)
60
+ return self._bank.get("randn", shape, generator.initial_seed(), device)
61
+
62
+ def rand(self, *shape, generator=None, device=None, dtype=None):
63
+ if generator is None:
64
+ return torch.rand(*shape, device=device, dtype=dtype)
65
+ return self._bank.get("rand", shape, generator.initial_seed(), device)
66
+
67
+
68
class EvoForestTabModule(nn.Module):
    """The evolved EvoForest-Tab feature map as a saveable, fine-tunable ``nn.Module``.

    Parameters
    ----------
    genome : dict | None
        Parsed genome; when None the default genome file is loaded.
    dmax : int
        Padded channel width used to build the dummy channels that materialize the
        random-projection weights.

    Notes
    -----
    ``forward(channels)`` takes the channel dict produced by ``build_channels`` and
    returns the normalized ``(n, K)`` feature matrix, applying the same column
    splitting, sanitizing and standardization steps as ``build_phi``. Seeded random
    draws are served from ``self.bank`` and are frozen by default; call
    :meth:`unfreeze_random_features` to make them trainable.
    """

    def __init__(self, genome: dict | None = None, dmax: int = 100) -> None:
        super().__init__()
        if genome is None:
            genome = load_genome(DEFAULT_GENOME)
        self.output_src = list(genome["output"])
        self.dmax = dmax
        self.bank = _SeededWeightBank()
        self._proxy = _TorchProxy(self.bank)
        # NOTE(security): genome sources go through eval(); only use trusted genomes.
        # The lambdas see the proxy under the name `torch`, so seeded draws hit the bank.
        namespace = {"torch": self._proxy, "F": F, "np": np, "math": math}
        self._fns = [eval(src, namespace) for src in self.output_src]
        with torch.no_grad():  # populate the bank eagerly
            self._materialize()

    # ------------------------------------------------------------------ build / fine-tune controls
    def _materialize(self) -> None:
        """Run every lambda once on dummy channels so its seeded draws get banked."""
        probe = self._dummy_channels()
        for fn in self._fns:
            try:
                fn(probe)
            except Exception:  # noqa: BLE001 - dummy may degenerate post-draw; weights are banked
                pass

    def _dummy_channels(self) -> dict:
        """Return a 4-row random channel dict of width ``self.dmax``."""
        noise = torch.randn(4, self.dmax)
        channels = {"x": noise, "xrank": noise.clone(), "x_freq": noise.clone()}
        channels["fmask"] = torch.ones(4, self.dmax)
        channels["is_cat"] = torch.zeros(4, self.dmax)
        channels["Dmax"] = self.dmax
        return channels

    def _set_bank_trainable(self, flag: bool) -> "EvoForestTabModule":
        """Set ``requires_grad`` on every banked weight and return ``self``."""
        for weight in self.bank.weights.values():
            weight.requires_grad_(flag)
        return self

    def unfreeze_random_features(self) -> "EvoForestTabModule":
        """Make all banked random-feature weights trainable."""
        return self._set_bank_trainable(True)

    def freeze_random_features(self) -> "EvoForestTabModule":
        """Freeze all banked random-feature weights again."""
        return self._set_bank_trainable(False)

    @property
    def n_random_parameters(self) -> int:
        """Total number of scalar entries across the banked weights."""
        return sum(weight.numel() for weight in self.bank.weights.values())

    # ------------------------------------------------------------------ forward (mirrors build_phi exactly)
    def forward(self, channels: dict) -> torch.Tensor:
        """Map a channel dict to the standardized ``(n, K)`` feature matrix.

        Raises ``ValueError`` when no lambda yields a usable column.
        """
        n = channels["x"].shape[0]
        kept = []
        for fn in self._fns:
            sig = fn(channels)
            sig = (sig if torch.is_tensor(sig) else torch.as_tensor(sig)).float()
            if sig.dim() >= 2 and sig.shape[0] == n and sig.shape[1] > 1:
                candidates = [sig[:, c] for c in range(sig.shape[1])]
            else:
                flat = sig.squeeze()
                if flat.dim() != 1 or flat.shape[0] != n:
                    continue
                candidates = [flat]
            for col in candidates:
                col = torch.where(torch.isfinite(col), col, torch.zeros_like(col))
                # `not (std < eps)` on purpose: a NaN std keeps the column.
                if not (col.std() < 1e-12):
                    kept.append(col)
        if not kept:
            raise ValueError("genome produced no usable feature columns")
        phi = torch.stack(kept, dim=1)
        center = phi.mean(0, keepdim=True)
        spread = phi.std(0, keepdim=True).clamp(min=1e-12)
        return (phi - center) / spread
@@ -0,0 +1,45 @@
1
+ """Closed-form ridge head with Bayesian evidence-maximized regularization (MacKay/Tipping).
2
+
3
+ The head is fit on the support features and applied to the query features in a single SVD solve --
4
+ no gradient descent, no per-dataset hyperparameter grid. lambda is set by evidence maximization,
5
+ which is markedly more stable than a leave-one-out grid in the few-shot (K>>n) regime.
6
+ """
7
+ import torch
8
+
9
# Lower bound applied to lambda on every evidence iteration.
LAM_FLOOR = 1e-2


def evidence_lambda(U, S, Y, n_iter: int = 60) -> float:
    """Estimate the ridge strength ``lambda = alpha / beta`` by evidence maximization.

    Reuses the support SVD factors ``U`` and ``S``; ``alpha`` and ``beta`` are shared
    across the columns of ``Y`` (shape (n, M)). Starting from lambda = 1, the update
    is iterated at most ``n_iter`` times and stops once the relative change drops
    below 1e-3. Every iterate is clamped to ``[LAM_FLOOR, 1e8]``. Returns a Python float.
    """
    eig = S ** 2
    coef = U.transpose(0, 1) @ Y
    n, M = Y.shape
    energy_total = (Y ** 2).sum()
    energy_in_span = (coef ** 2).sum()
    lam = torch.tensor(1.0, dtype=S.dtype, device=S.device)
    for _ in range(n_iter):
        shrink = eig / (eig + lam)
        gain = S / (eig + lam)
        wsq = ((gain.unsqueeze(1) * coef) ** 2).sum()
        # Residual = part of Y outside span(U) + the shrunken part inside it.
        rss = (energy_total - energy_in_span) + (((1 - shrink).unsqueeze(1) * coef) ** 2).sum()
        gamma = shrink.sum()
        alpha = (M * gamma) / (wsq + 1e-12)
        beta = (M * n - M * gamma) / (rss + 1e-12)
        previous = lam
        lam = (alpha / (beta + 1e-12)).clamp(min=LAM_FLOOR, max=1e8)
        if (lam - previous).abs() < 1e-3 * previous:
            break
    return float(lam)
34
+
35
+
36
def ridge_scores(Phi_s, Y, Phi_q, lam=None):
    """Fit a closed-form ridge on the support features and score the query features.

    Both feature matrices are standardized with the support mean/std (std clamped
    at 1e-8). The ridge solution is obtained from one thin SVD of the standardized
    support matrix; when ``lam`` is None it is chosen by ``evidence_lambda``.
    Returns the query scores, shape (nq, M).
    """
    center = Phi_s.mean(0, keepdim=True)
    scale = Phi_s.std(0, keepdim=True).clamp(min=1e-8)
    support = (Phi_s - center) / scale
    query = (Phi_q - center) / scale
    U, S, Vt = torch.linalg.svd(support, full_matrices=False)
    projected = U.transpose(0, 1) @ Y
    if lam is None:
        lam = evidence_lambda(U, S, Y)
    # W = V diag(S / (S^2 + lam)) U^T Y
    shrink = (S / (S ** 2 + lam)).unsqueeze(1)
    weights = Vt.transpose(0, 1) @ (shrink * projected)
    return query @ weights
@@ -0,0 +1,77 @@
1
+ output:
2
+ - 'lambda input: (lambda x, fm, c: torch.stack([(x * fm).sum(1) / c, ((x * x * fm).sum(1)
3
+ / c - ((x * fm).sum(1) / c) ** 2).clamp(min=0).sqrt(), x.masked_fill(fm == 0, float(''-inf'')).amax(1),
4
+ x.masked_fill(fm == 0, float(''inf'')).amin(1), c / x.shape[1]], dim=1))(input[''x''],
5
+ input[''fmask''], input[''fmask''].sum(1).clamp(min=1)) #name: output_0 qi_mean=None
6
+ qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None age=69'
7
+ - 'lambda input: (lambda x, fm: (lambda A: torch.exp(-(((x.unsqueeze(1) - A.unsqueeze(0))
8
+ ** 2) * fm.unsqueeze(1)).sum(2) / fm.sum(1, keepdim=True).clamp(min=1)))(torch.randn(8,
9
+ x.shape[1], generator=torch.Generator(device=x.device).manual_seed(209), device=x.device)))(input[''x''],
10
+ input[''fmask'']) #name: output_1 qi_mean=None qi_std=None qi_max=None qd_mean=None
11
+ qd_std=None qd_max=None age=69'
12
+ - 'lambda input: (lambda x, s: torch.cat([torch.relu(x @ torch.randn(x.shape[1], 8,
13
+ generator=torch.Generator(device=x.device).manual_seed(321), device=x.device) /
14
+ s), (x @ torch.randn(x.shape[1], 8, generator=torch.Generator(device=x.device).manual_seed(322),
15
+ device=x.device) / s) ** 2], dim=1))(input[''xrank''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
16
+ output_2 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
17
+ age=66'
18
+ - 'lambda input: (lambda x, s: torch.tanh(x @ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(501),
19
+ device=x.device) / s))(input[''xrank''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
20
+ output_3 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
21
+ age=65'
22
+ - 'lambda input: (lambda x, s: torch.tanh((torch.sign(x) * torch.log1p(x.abs())) @
23
+ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(701),
24
+ device=x.device) / s))(input[''x''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
25
+ output_4 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
26
+ age=62'
27
+ - 'lambda input: (lambda x, s: torch.tanh((torch.sign(x) * x.abs().sqrt()) @ torch.randn(x.shape[1],
28
+ 16, generator=torch.Generator(device=x.device).manual_seed(801), device=x.device)
29
+ / s))(input[''x''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
30
+ output_5 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
31
+ age=60'
32
+ - 'lambda input: (lambda x, s: torch.sigmoid(8.0 * (x @ torch.randn(x.shape[1], 16,
33
+ generator=torch.Generator(device=x.device).manual_seed(951), device=x.device) /
34
+ s)))(input[''xrank''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
35
+ output_6 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
36
+ age=55'
37
+ - 'lambda input: (lambda x, ic, ct: torch.stack([(x*ic).sum(1)/ic.sum(1).clamp(min=1),
38
+ (x*ct).sum(1)/ct.sum(1).clamp(min=1), (x*x*ic).sum(1)/ic.sum(1).clamp(min=1), (x*x*ct).sum(1)/ct.sum(1).clamp(min=1)],
39
+ dim=1))(input[''x''], input[''is_cat''], input[''fmask'']-input[''is_cat'']) #name:
40
+ output_7 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
41
+ age=50'
42
+ - 'lambda input: (lambda x, ic: torch.tanh((x*ic) @ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(1221),
43
+ device=x.device) / ic.sum(1,keepdim=True).clamp(min=1).sqrt()))(input[''xrank''],
44
+ input[''is_cat'']) #name: output_8 qi_mean=None qi_std=None qi_max=None qd_mean=None
45
+ qd_std=None qd_max=None age=48'
46
+ - 'lambda input: (lambda x, s: torch.tanh(x @ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(1501),
47
+ device=x.device) / s))(input[''x_freq''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
48
+ output_9 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
49
+ age=43'
50
+ - 'lambda input: torch.tanh((input[''xrank'']*input[''x_freq'']) @ torch.randn(input[''x''].shape[1],
51
+ 16, generator=torch.Generator(device=input[''x''].device).manual_seed(1711), device=input[''x''].device)
52
+ / input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name: output_10 qi_mean=None
53
+ qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None age=37'
54
+ - 'lambda input: torch.tanh(((torch.sign(input[''x''])*input[''x''].abs().sqrt())*input[''xrank''])
55
+ @ torch.randn(input[''x''].shape[1], 16, generator=torch.Generator(device=input[''x''].device).manual_seed(1901),
56
+ device=input[''x''].device) / input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
57
+ output_11 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
58
+ age=35'
59
+ - 'lambda input: (lambda W,b: torch.cos(input[''xrank''] @ W / input[''fmask''].sum(1,
60
+ keepdim=True).clamp(min=1).sqrt() + b))(torch.randn(input[''x''].shape[1], 16, generator=torch.Generator(device=input[''x''].device).manual_seed(207),
61
+ device=input[''x''].device), torch.rand(16, generator=torch.Generator(device=input[''x''].device).manual_seed(208),
62
+ device=input[''x''].device)*6.2831853) #name: output_12 qi_mean=None qi_std=None
63
+ qi_max=None qd_mean=None qd_std=None qd_max=None age=30'
64
+ - 'lambda input: torch.tanh(input[''x''] @ torch.randn(input[''x''].shape[1], 16,
65
+ generator=torch.Generator(device=input[''x''].device).manual_seed(401), device=input[''x''].device)
66
+ / input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name: output_13 qi_mean=None
67
+ qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None age=30'
68
+ - 'lambda input: (lambda W,b: torch.cos(input[''x''] @ W / input[''fmask''].sum(1,
69
+ keepdim=True).clamp(min=1).sqrt() + b))(torch.randn(input[''x''].shape[1], 24, generator=torch.Generator(device=input[''x''].device).manual_seed(201),
70
+ device=input[''x''].device), torch.rand(24, generator=torch.Generator(device=input[''x''].device).manual_seed(202),
71
+ device=input[''x''].device)*6.2831853) #name: output_14 qi_mean=None qi_std=None
72
+ qi_max=None qd_mean=None qd_std=None qd_max=None age=28'
73
+ - 'lambda input: (lambda x, fm: (lambda C: (torch.exp(-((x.unsqueeze(2) - C) ** 2))
74
+ * fm.unsqueeze(2)).sum(1) / fm.sum(1, keepdim=True).clamp(min=1))(torch.linspace(-2.0,
75
+ 2.0, 8, device=x.device).reshape(1, 1, 8)))(input[''xrank''], input[''fmask'']) #name:
76
+ output_15 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
77
+ age=12'
@@ -0,0 +1,165 @@
1
+ """Combine TabMap with a tabular foundation model (e.g. TabPFN) into a single predictor.
2
+
3
+ Implements the paper's complementarity result: an evolved closed-form map and a pretrained transformer
4
+ capture different structure (one tends to win classification, the other regression), so combining them
5
+ beats either alone. Three methods, in increasing adaptivity:
6
+
7
+ - ``blend`` : fixed 50/50 average of the base models' outputs (naive; only helps when matched).
8
+ - ``compwt`` : weight each base by its support-cross-validated competence (uses support labels only; no meta-learner).
9
+ - ``meta`` : a learned ridge head over the base models' out-of-fold support predictions (most robust).
10
+
11
+ All combination is leakage-safe and *in-context*: weights/head are fit on the SUPPORT set via out-of-fold
12
+ predictions (no query labels). Base models are any in-context estimators with the scikit-learn surface
13
+ (``fit(X, y)`` + ``predict``/``predict_proba``), e.g. ``TabMapClassifier`` and TabPFN's client.
14
+
15
+ Example
16
+ -------
17
+ >>> from evoforest_tab import TabMapClassifier, StackedTabularEnsemble
18
+ >>> from tabpfn_client import TabPFNClassifier
19
+ >>> ens = StackedTabularEnsemble([TabMapClassifier(), TabPFNClassifier()], task="classification",
20
+ ... method="meta").fit(X_support, y_support)
21
+ >>> proba = ens.predict_proba(X_query)
22
+ """
23
+ import numpy as np
24
+
25
try:
    from sklearn.base import clone
except ImportError:
    def clone(e):  # minimal fallback
        """Build a fresh instance of ``e``'s class from ``e.get_params()``.

        Objects without ``get_params`` are re-created with no constructor arguments.
        """
        if hasattr(e, "get_params"):
            params = e.get_params()
        else:
            params = {}
        return e.__class__(**params)
30
+
31
+
32
+ def _ridge_fit_predict(Xtr, Ytr, Xte, lam=1.0):
33
+ """Tiny closed-form ridge meta-head (numpy) with an intercept. Inputs are model outputs (probas /
34
+ predictions, already on a comparable bounded scale), so we \emph{center} rather than divide by the
35
+ per-column std---dividing by a near-zero std on a degenerate (near-constant) base output otherwise
36
+ explodes the fit on imbalanced datasets."""
37
+ mu = Xtr.mean(0, keepdims=True); ybar = Ytr.mean(0, keepdims=True)
38
+ Xc, Xqc = Xtr - mu, Xte - mu
39
+ A = Xc.T @ Xc + lam * np.eye(Xc.shape[1])
40
+ W = np.linalg.solve(A, Xc.T @ (Ytr - ybar))
41
+ return Xqc @ W + ybar
42
+
43
+
44
class StackedTabularEnsemble:
    """Stack/blend in-context tabular base models (e.g. TabMap + TabPFN).

    Parameters
    ----------
    base_models : list of estimators (in-context; sklearn surface). For classification each must expose
        ``predict_proba``; for regression, ``predict``.
    task : {"classification", "regression"}. Any value other than "classification" is treated as
        regression.
    method : {"meta", "compwt", "blend"} (default "meta").
    n_splits : folds for the support out-of-fold predictions (default 2).
    ridge_lambda : ridge penalty of the meta head (default 1.0).
    random_state : seed of the fold permutation (default 0).

    Notes
    -----
    ``fit`` stores the support set and computes out-of-fold predictions for each base model. Every
    ``predict`` / ``predict_proba`` call re-fits a clone of each base model on the full support set
    before scoring the query rows.
    """

    _METHODS = ("meta", "compwt", "blend")

    def __init__(self, base_models, task="classification", method="meta", n_splits=2, ridge_lambda=1.0,
                 random_state=0):
        self.base_models = list(base_models)
        self.task = task
        self.method = method
        self.n_splits = n_splits
        self.ridge_lambda = ridge_lambda
        self.random_state = random_state

    # ---- helpers --------------------------------------------------------
    def _is_clf(self):
        return self.task == "classification"

    def _base_out(self, model, Xq):
        """Return one base model's output for ``Xq``.

        Classification: an (n, n_classes_) probability matrix whose columns are aligned to the
        ensemble's class order via ``model.classes_`` (columns for classes the model did not see stay
        0). Regression: a flat prediction vector.
        """
        if self._is_clf():
            p = np.asarray(model.predict_proba(Xq))
            P = np.zeros((p.shape[0], self.n_classes_))
            cls = getattr(model, "classes_", np.arange(p.shape[1]))
            for j, c in enumerate(cls):
                # Unknown label: fall back to the positional column if it exists.
                idx = self._cls_index.get(c, j if j < self.n_classes_ else None)
                if idx is not None:
                    P[:, idx] = p[:, j]
            return P
        return np.asarray(model.predict(Xq)).reshape(-1)

    def _competence(self, oof, y):
        """Score out-of-fold predictions in [0, 1].

        Classification: accuracy gain over the majority-class rate, normalized. Regression: R^2.
        Negative scores are clamped to 0.
        """
        if self._is_clf():
            acc = (oof.argmax(1) == y).mean()
            maj = np.bincount(y, minlength=self.n_classes_).argmax()
            base = (y == maj).mean()
            return max((acc - base) / (1 - base + 1e-8), 0.0)
        ss_res = ((y - oof) ** 2).sum()
        ss_tot = ((y - y.mean()) ** 2).sum() + 1e-8
        return max(1.0 - ss_res / ss_tot, 0.0)

    # ---- fit/predict ----------------------------------------------------
    def fit(self, X, y):
        """Store the support set and fit the combination weights / meta-head inputs.

        Raises
        ------
        ValueError if ``method`` is not one of "meta"/"compwt"/"blend" or ``base_models`` is empty.
        """
        # Fail early: an unknown method would otherwise surface only at predict time as an
        # AttributeError, and an empty model list as a ZeroDivisionError below.
        if self.method not in self._METHODS:
            raise ValueError(f"method must be one of {self._METHODS}, got {self.method!r}")
        if not self.base_models:
            raise ValueError("base_models must contain at least one estimator")

        X = np.asarray(X) if not hasattr(X, "iloc") else X
        y = np.asarray(y)
        n = len(y)
        if self._is_clf():
            self.classes_, y_idx = np.unique(y, return_inverse=True)
            self.n_classes_ = len(self.classes_)
            self._cls_index = {c: i for i, c in enumerate(self.classes_)}
            y_work = y_idx
        else:
            y_work = y.astype(float)
        self._X, self._y = X, y_work

        # out-of-fold support predictions for each base model
        rng = np.random.RandomState(self.random_state)
        folds = np.array_split(rng.permutation(n), self.n_splits)
        oofs = []
        for model in self.base_models:
            oof = np.zeros((n, self.n_classes_)) if self._is_clf() else np.zeros(n)
            for te in folds:
                if len(te) == 0:
                    # n_splits > n yields empty folds; there is nothing to predict for them.
                    continue
                tr = np.setdiff1d(np.arange(n), te)
                if self._is_clf() and len(np.unique(y_work[tr])) < 2:
                    continue  # cannot fit a classifier on a single-class fold
                m = clone(model)
                Xtr = X.iloc[tr] if hasattr(X, "iloc") else X[tr]
                Xte = X.iloc[te] if hasattr(X, "iloc") else X[te]
                m.fit(Xtr, (self.classes_[y_work[tr]] if self._is_clf() else y_work[tr]))
                oof[te] = self._base_out(m, Xte)
            oofs.append(oof)
        self._oofs = oofs

        # competence weights (normalized); fall back to uniform (= 50/50) if no model is competent,
        # so a degenerate support (e.g. heavy imbalance -> all competences clamp to 0) never yields a
        # zero prediction.
        comps = [self._competence(o, y_work) for o in oofs]
        s = sum(comps)
        self._weights = [c / s for c in comps] if s > 1e-6 else [1.0 / len(comps)] * len(comps)

        if self.method == "meta":
            H = np.concatenate(oofs, 1) if self._is_clf() else np.stack(oofs, 1)
            if self._is_clf():
                Y = np.eye(self.n_classes_)[y_work]
                # Kept for state compatibility; prediction refits from _Htr/_Ytr instead.
                self._head = ("clf", H.mean(0), H.std(0), Y)
            self._Htr = H
            self._Ytr = (np.eye(self.n_classes_)[y_work] if self._is_clf()
                         else (y_work - y_work.mean())[:, None])
            self._ymean = (0.0 if self._is_clf() else y_work.mean())
        return self

    def _query_outs(self, Xq):
        """Refit a clone of every base model on the full support set and score ``Xq``."""
        outs = []
        for model in self.base_models:
            m = clone(model)
            m.fit(self._X, (self.classes_[self._y] if self._is_clf() else self._y))
            outs.append(self._base_out(m, Xq))
        return outs

    def _combine(self, outs):
        """Merge the base outputs according to ``self.method``."""
        if self.method == "blend":
            w = [1.0 / len(outs)] * len(outs)
            return sum(wi * o for wi, o in zip(w, outs))
        if self.method == "compwt":
            return sum(wi * o for wi, o in zip(self._weights, outs))
        # meta: learned ridge head over concatenated/stacked base outputs
        Hq = np.concatenate(outs, 1) if self._is_clf() else np.stack(outs, 1)
        pred = _ridge_fit_predict(self._Htr, self._Ytr, Hq, self.ridge_lambda)
        return pred if self._is_clf() else pred[:, 0] + self._ymean

    def predict_proba(self, Xq):
        """Return row-normalized class probabilities (classification only).

        Raises ``AssertionError`` for a regression ensemble. The check is an explicit ``raise`` so
        it is still enforced when Python runs with ``-O``.
        """
        if not self._is_clf():
            raise AssertionError("predict_proba is classification-only")
        P = self._combine(self._query_outs(Xq))
        P = np.clip(P, 1e-9, None)
        return P / P.sum(1, keepdims=True)

    def predict(self, Xq):
        """Return class labels (classification) or combined predictions (regression)."""
        outs = self._query_outs(Xq)
        if self._is_clf():
            return self.classes_[self._combine(outs).argmax(1)]
        return self._combine(outs)
@@ -0,0 +1,108 @@
1
+ """scikit-learn-style estimators for the evolved universal tabular feature map + closed-form ridge head.
2
+
3
+ An in-context learner: ``fit`` stores the labeled support rows; ``predict`` builds the (transductive,
4
+ label-free) channels over the pooled support+query rows, maps them through the evolved feature map,
5
+ and solves a Bayesian ridge in closed form -- no gradient descent, no per-dataset tuning. An optional
6
+ K-member ensemble averages decorrelated random-feature draws (variance reduction toward the kernel
7
+ limit). Free, local, CPU-friendly, and interpretable.
8
+
9
+ Example
10
+ -------
11
+ >>> from evoforest_tab import TabMapClassifier
12
+ >>> clf = TabMapClassifier(n_estimators=6).fit(X_support, y_support)
13
+ >>> proba = clf.predict_proba(X_query)
14
+ """
15
+ import numpy as np
16
+ import torch
17
+
18
+ from ._channels import DMAX_DEFAULT, build_channels
19
+ from ._genome import DEFAULT_GENOME, build_phi, load_genome, seed_variant
20
+ from ._ridge import ridge_scores
21
+
22
try:
    from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
except ImportError:  # sklearn optional
    # Inert stand-ins so the estimators below can still be defined without scikit-learn.
    class BaseEstimator:  # type: ignore
        """Placeholder base class used when scikit-learn is not installed."""

        def get_params(self, deep=True):
            """Return an empty parameter dict (no parameter introspection in the fallback)."""
            return dict()

    class ClassifierMixin:  # type: ignore
        """Empty placeholder for ``sklearn.base.ClassifierMixin``."""

    class RegressorMixin:  # type: ignore
        """Empty placeholder for ``sklearn.base.RegressorMixin``."""
32
+
33
+
34
class _TabMapBase(BaseEstimator):
    """Shared storage and feature plumbing for the transductive TabMap estimators.

    Parameters
    ----------
    n_estimators : int
        Number of genome variants evaluated at predict time: the base genome plus
        ``n_estimators - 1`` seed variants.
    genome_path : str
        Path handed to ``load_genome``.
    cat_features : optional
        Forwarded unchanged to ``build_channels``.
    Dmax : int
        Forwarded unchanged to ``build_channels``.
    device : str
        Forwarded to ``build_channels``; subclasses also place their targets on it.
    """

    def __init__(self, n_estimators: int = 6, genome_path: str = DEFAULT_GENOME,
                 cat_features=None, Dmax: int = DMAX_DEFAULT, device: str = "cpu"):
        self.n_estimators = n_estimators
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.device = device

    def _genomes(self):
        """Return the base genome followed by its seed variants (seeds 1000, 2000, ...)."""
        base = load_genome(self.genome_path)
        return [base] + [seed_variant(base, 1000 * k) for k in range(1, self.n_estimators)]

    def _fit(self, X, y):
        """Store the labeled support set; nothing is trained here.

        DataFrame-like inputs (anything with ``.iloc``) are kept as-is, everything else is
        converted with ``np.asarray``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows. ``_phi_split`` separates
            support from query rows purely by ``len(y)``, so a mismatch would silently assign
            rows to the wrong side.
        """
        support = X if hasattr(X, "iloc") else np.asarray(X)
        labels = np.asarray(y)
        if len(support) != len(labels):
            raise ValueError(
                f"X and y have inconsistent numbers of rows: {len(support)} != {len(labels)}")
        self._X = support
        self._y = labels
        self._n_support = len(labels)
        return self

    def _phi_split(self, X_query):
        """Pool support+query, build channels + Phi per genome variant, yield (Phi_s, Phi_q).

        The channels are built once over the pooled rows; each genome variant then produces a
        ``Phi`` whose first ``self._n_support`` rows belong to the support set and whose remaining
        rows belong to ``X_query``.
        """
        if hasattr(self._X, "iloc"):
            import pandas as pd
            # A non-DataFrame query is wrapped with the support's column labels before pooling.
            if hasattr(X_query, "iloc"):
                query_frame = X_query
            else:
                query_frame = pd.DataFrame(np.asarray(X_query), columns=self._X.columns)
            X_all = pd.concat([self._X, query_frame], axis=0, ignore_index=True)
        else:
            X_all = np.vstack([np.asarray(self._X), np.asarray(X_query)])
        channels = build_channels(X_all, cat_features=self.cat_features, Dmax=self.Dmax,
                                  device=self.device)
        ns = self._n_support
        for g in self._genomes():
            Phi = build_phi(g, channels)
            yield Phi[:ns], Phi[ns:]
66
+
67
+
68
class TabMapClassifier(_TabMapBase, ClassifierMixin):
    """Transductive classifier: one-hot targets -> ridge scores -> softmax, averaged over the
    genome variants yielded by ``_phi_split``."""

    def fit(self, X, y):
        """Store the support rows and encode the labels as indices into ``classes_``."""
        self._fit(X, y)
        labels, codes = np.unique(self._y, return_inverse=True)
        self.classes_ = labels
        self._y_idx = codes
        return self

    def predict_proba(self, X_query):
        """Return the mean softmax probability over all genome variants, shape (n_query, n_classes)."""
        n_classes = len(self.classes_)
        # One-hot encode the support labels on the configured device.
        rows = torch.arange(self._n_support)
        cols = torch.from_numpy(self._y_idx).long()
        onehot = torch.zeros(self._n_support, n_classes, device=self.device)
        onehot[rows, cols] = 1.0

        member_probs = [
            torch.softmax(ridge_scores(Phi_s, onehot, Phi_q), dim=1)
            for Phi_s, Phi_q in self._phi_split(X_query)
        ]
        total = member_probs[0]
        for member in member_probs[1:]:
            total = total + member
        return (total / len(member_probs)).cpu().numpy()

    def predict(self, X_query):
        """Return the most probable class label for each query row."""
        winners = self.predict_proba(X_query).argmax(1)
        return self.classes_[winners]
91
+
92
+
93
class TabMapRegressor(_TabMapBase, RegressorMixin):
    """Transductive regressor: ridge on mean-centered targets, averaged over the genome variants
    yielded by ``_phi_split``."""

    def fit(self, X, y):
        """Store the support rows; targets exposing ``astype`` are cast to float first."""
        target = y.astype(float) if hasattr(y, "astype") else y
        self._fit(X, target)
        return self

    def predict(self, X_query):
        """Return the ensemble-averaged prediction for each query row as a NumPy array."""
        support_y = torch.from_numpy(np.asarray(self._y, dtype=np.float64)).float().to(self.device)
        center = support_y.mean()
        residual = (support_y - center).unsqueeze(1)

        # The ridge head is fitted on centered targets; the mean is added back per member.
        member_preds = [
            ridge_scores(Phi_s, residual, Phi_q).squeeze(1) + center
            for Phi_s, Phi_q in self._phi_split(X_query)
        ]
        total = member_preds[0]
        for member in member_preds[1:]:
            total = total + member
        return (total / len(member_preds)).cpu().numpy()
@@ -0,0 +1,274 @@
1
+ """Inductive EvoForest-Tab: a standard fit->transform/predict feature map (fits channel statistics on
2
+ TRAINING data, applies the *fixed* maps to new rows), as opposed to the transductive/in-context tabmap
3
+ estimators whose predict() pools support+query. The inductive version satisfies the ordinary
4
+ scikit-learn contract (and therefore `check_estimator`), so it can serve as the basis for the
5
+ scikit-learn-contrib estimator, the skrub `TabMapEncoder` (a TransformerMixin), and a pytorch-frame /
6
+ AutoGluon featurizer. It reuses the SAME evolved genome (the 16 families) -- only the channels are now
7
+ computed inductively.
8
+
9
+ Channels (all fit on train, applied to new data):
10
+ x = per-column standardize (StandardScaler)
11
+ xrank = per-column rank-gauss (QuantileTransformer, output_distribution='normal')
12
+ x_freq = per-column count/frequency encode (value->train-frequency map; unseen -> 0), standardized
13
+ is_cat = categorical mask (from cat_features / fit-time dtype)
14
+ fmask = feature-presence mask
15
+ Padded/capped to Dmax columns (top-variance selection fit on train).
16
+ """
17
+ import math
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
23
+ from sklearn.linear_model import RidgeCV, RidgeClassifierCV
24
+ import numpy as _np
25
+ from sklearn.preprocessing import QuantileTransformer, StandardScaler
26
+ from sklearn.utils.validation import check_is_fitted
27
+
28
+ # reuse the evolved genome (the 16 families) from this package
29
+ from ._genome import load_genome, seed_variant, DEFAULT_GENOME
30
+
31
+ _G = {"torch": torch, "F": F, "np": np, "math": math}
32
+ DMAX = 100
33
+
34
+
35
def _raw_phi(genome, channels):
    """Evaluate the genome lambdas on the channel dict and stack ALL per-row feature columns.

    No per-batch drop/standardize step is applied here (the transductive ``build_phi`` applies one,
    which would make the output width and scale batch-dependent). Inductive selection and
    standardization are applied by the caller using statistics stored at fit time.

    Parameters
    ----------
    genome : dict
        ``genome["output"]`` is a list of lambda source strings; the optional ``"@globals"`` entry
        maps names to expression source strings. A lambda parameter named ``input`` receives
        ``channels``, one named ``globals`` receives the evaluated ``@globals`` dict, and any
        other parameter receives ``None``.
    channels : dict
        Channel tensors; the row count ``n`` is read from ``channels["x"].shape[0]``.

    Returns
    -------
    numpy.ndarray
        ``(n, K_raw)`` un-normalized features in a fixed order, with non-finite entries set to 0.
        A lambda result that is neither ``(n, K>1)`` nor reducible to a length-``n`` vector
        contributes no column.

    Notes
    -----
    SECURITY: the genome strings are executed with ``eval``. Only load genome files from a
    trusted source.
    """
    # NOTE(security): eval() of genome-supplied source -- see the docstring.
    shared = {name: eval(src, _G) for name, src in (genome.get("@globals", {}) or {}).items()}
    n = channels["x"].shape[0]
    cols = []
    for lam in genome["output"]:
        fn = eval(lam, _G)
        code = fn.__code__
        arg_names = code.co_varnames[:code.co_argcount]
        args = [channels if a == "input" else shared if a == "globals" else None
                for a in arg_names]
        sig = fn(*args)
        if not torch.is_tensor(sig):
            sig = torch.as_tensor(sig)
        sig = sig.float()
        if sig.dim() >= 2 and sig.shape[0] == n and sig.shape[1] > 1:
            sub = [sig[:, c] for c in range(sig.shape[1])]
        else:
            flat = sig.squeeze()
            if n == 1 and sig.dim() >= 1 and sig.numel() == 1:
                # For a single-row batch squeeze() collapses a (1,) or (1, 1) feature column to
                # a 0-dim scalar, which would drop the column and change the output width
                # relative to fit time. Restore the length-1 vector. True 0-dim results are
                # still skipped, exactly as they are for n > 1.
                flat = sig.reshape(1)
            sub = [flat] if (flat.dim() == 1 and flat.shape[0] == n) else []
        for col in sub:
            cols.append(torch.where(torch.isfinite(col), col, torch.zeros_like(col)))
    return torch.stack(cols, dim=1).cpu().numpy()  # (n, K_raw), un-normalized, fixed order
57
+
58
+
59
def _check_no_inf(Xm):
    """Return ``Xm`` unchanged unless it contains +/-inf, in which case raise ``ValueError``.

    NaN passes through (it is median-imputed later and the estimators declare allow_nan=True);
    infinities are rejected so the estimator passes sklearn's check_estimators_nan_inf for
    allow_nan estimators.
    """
    has_inf = np.isinf(Xm).any()
    if not has_inf:
        return Xm
    raise ValueError("Input contains infinity or a value too large for dtype('float64').")
65
+
66
+
67
def _as_float_matrix(X, cat_cols):
    """Coerce ``X`` to a float64 ndarray, ordinal-encoding categorical DataFrame columns.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Input rows. Sparse matrices are rejected.
    cat_cols : iterable or None
        Columns to force-treat as categorical. For a DataFrame an entry may be a positional index
        or a column label; for any other input the entries are used as positional indices.

    Returns
    -------
    (Xm, cat_mask, cat_maps)
        ``Xm`` is the float64 matrix, ``cat_mask`` a boolean array with one flag per column, and
        ``cat_maps`` maps the positional index of each categorical DataFrame column to its
        ``{value: code}`` dict (empty for non-DataFrame input, whose values are not re-encoded).
        Values missing from a column's map become NaN.

    Raises
    ------
    TypeError
        If ``X`` is a scipy sparse matrix.
    ValueError
        If the resulting matrix contains +/-inf (see ``_check_no_inf``).
    """
    import pandas as pd
    import scipy.sparse as sp
    if sp.issparse(X):
        raise TypeError("EvoForest-Tab does not support sparse input. Densify it first "
                        "(e.g. X.toarray()) or set a sparse-aware preprocessor upstream.")
    if isinstance(X, pd.DataFrame):
        cols = list(X.columns)

        def is_categorical(i, c):
            dtype = X[c].dtype
            # pandas string dtypes print as "string"/"str" rather than "object"; without this
            # check such columns would reach to_numpy(float64) below and fail on text values.
            by_dtype = (str(dtype) in ("category", "object", "bool")
                        or isinstance(dtype, pd.StringDtype))
            by_request = cat_cols is not None and (i in cat_cols or c in cat_cols)
            return by_dtype or by_request

        cat = [is_categorical(i, c) for i, c in enumerate(cols)]
        arrs, maps = [], {}
        for i, c in enumerate(cols):
            if cat[i]:
                cats = list(pd.Categorical(X[c]).categories)
                maps[i] = {v: k for k, v in enumerate(cats)}
                arrs.append(X[c].map(maps[i]).to_numpy(dtype=np.float64))
            else:
                arrs.append(X[c].to_numpy(dtype=np.float64))
        return _check_no_inf(np.column_stack(arrs)), np.array(cat), maps
    Xm = np.asarray(X, dtype=np.float64)
    cat = np.zeros(Xm.shape[1], bool)
    if cat_cols is not None:
        for i in cat_cols:
            cat[i] = True
    return _check_no_inf(Xm), cat, {}
94
+
95
+
96
class TabMapTransformer(TransformerMixin, BaseEstimator):
    """Inductive EvoForest-Tab feature map: fit channel maps on train, transform any rows -> phi (n, K).

    Parameters: ``genome_path`` (the evolved genome), ``cat_features`` (indices/names; else auto by dtype),
    ``Dmax`` (feature width cap), ``n_quantiles``.

    Fitted attributes set by ``fit``: ``n_features_in_``, ``cat_maps_`` (training value->code maps
    for categorical DataFrame columns), ``medians_``, ``keep_idx_``, ``cat_kept_``, ``scaler_x_``,
    ``qt_``, ``freq_maps_``, ``scaler_f_``, ``genome_``, ``phi_keep_``, ``phi_mu_``, ``phi_sd_``,
    ``n_output_features_``.
    """

    def __init__(self, genome_path=DEFAULT_GENOME, cat_features=None, Dmax=DMAX, n_quantiles=256):
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.n_quantiles = n_quantiles

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True  # we median-impute NaN (but reject inf)
        tags.transformer_tags.preserves_dtype = []  # output is always float32/64, not input dtype
        return tags

    def _validate(self, X, *, reset):
        """Validate ``X`` and set (``reset=True``) or check (``reset=False``) ``n_features_in_``.

        Numeric inputs go through sklearn's check_array (enforces 2D, rejects complex/1D, allows
        NaN) so the estimator passes check_estimator. A pandas DataFrame carrying categorical,
        object, bool or string columns -- or any DataFrame when ``cat_features`` is given -- is
        passed through untouched, because check_array cannot convert such columns to float.

        Raises ``ValueError`` when the column count differs from the one seen at fit time.
        """
        import pandas as pd
        from sklearn.utils.validation import check_array
        is_df = isinstance(X, pd.DataFrame)
        has_cat = is_df and any(
            str(d) in ("object", "category", "bool") or isinstance(d, pd.StringDtype)
            for d in X.dtypes)
        if has_cat or (is_df and self.cat_features is not None):
            Xv, n = X, X.shape[1]
        else:
            Xv = check_array(X, dtype=np.float64, ensure_2d=True, ensure_all_finite="allow-nan",
                             ensure_min_samples=2 if reset else 1, estimator=self)
            n = Xv.shape[1]
        if reset:
            self.n_features_in_ = n
        elif n != self.n_features_in_:
            raise ValueError(f"X has {n} features, but {type(self).__name__} "
                             f"is expecting {self.n_features_in_} features as input.")
        return Xv

    def fit(self, X, y=None):
        """Fit every channel map and the phi column selection/standardization on ``X``.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        X = self._validate(X, reset=True)
        Xm, cat, self.cat_maps_ = _as_float_matrix(X, self.cat_features)
        # median-impute (store train medians), drop constant cols, cap to Dmax by variance
        self.medians_ = np.nanmedian(Xm, axis=0)
        Xi = np.where(np.isnan(Xm), self.medians_, Xm)
        keep = Xi.std(0) > 1e-9
        idx = np.where(keep)[0]
        if len(idx) > self.Dmax:
            idx = idx[np.argsort(-Xi[:, idx].std(0))[:self.Dmax]]
        self.keep_idx_ = idx
        self.cat_kept_ = cat[idx]
        Xk = Xi[:, idx]
        # fit the per-column channel maps
        self.scaler_x_ = StandardScaler().fit(Xk)
        self.qt_ = QuantileTransformer(output_distribution="normal",
                                       n_quantiles=min(self.n_quantiles, len(Xk)),
                                       subsample=10 ** 9).fit(Xk)
        self.freq_maps_ = [{v: c / len(Xk) for v, c in zip(*np.unique(Xk[:, j], return_counts=True))}
                           for j in range(Xk.shape[1])]
        # (named ``freq`` rather than ``F`` so the module-level torch.nn.functional alias is not shadowed)
        freq = self._freq(Xk)
        self.scaler_f_ = StandardScaler().fit(freq)
        self.genome_ = load_genome(self.genome_path)
        # fix the phi columns + standardization on TRAIN (so transform output is inductive: same width
        # and scale for any batch, including a single row)
        raw = _raw_phi(self.genome_, self._channels(X))
        sd = raw.std(0)
        self.phi_keep_ = np.where(sd > 1e-9)[0]
        self.phi_mu_ = raw[:, self.phi_keep_].mean(0)
        self.phi_sd_ = sd[self.phi_keep_].clip(min=1e-9)
        self.n_output_features_ = len(self.phi_keep_)
        return self

    def _freq(self, Xk):
        """Replace each value by its training frequency for that column (values unseen at fit -> 0)."""
        F = np.zeros_like(Xk)
        for j in range(Xk.shape[1]):
            m = self.freq_maps_[j]
            F[:, j] = [m.get(v, 0.0) for v in Xk[:, j]]
        return F

    def _encode(self, X):
        """Coerce ``X`` to a float64 matrix using the category codes learned at fit time.

        For a DataFrame, every column that has an entry in ``cat_maps_`` (keyed by column position)
        is mapped through that training dict, so a given category receives the same code in every
        batch and categories unseen at fit become NaN (imputed by the caller). Re-deriving the
        codes from each batch would make them depend on which categories the batch happens to
        contain. Inputs without stored maps, and non-DataFrame inputs, fall back to
        ``_as_float_matrix``.
        """
        import pandas as pd
        maps = getattr(self, "cat_maps_", None)
        if not maps or not isinstance(X, pd.DataFrame):
            Xm, _, _ = _as_float_matrix(X, self.cat_features)
            return Xm
        arrs = []
        for i, c in enumerate(X.columns):
            col = X[c]
            if i in maps:
                # Plain dict lookup on object values: independent of this batch's own
                # category set / extension dtype.
                col = col.astype(object).map(maps[i])
            arrs.append(col.to_numpy(dtype=np.float64))
        return _check_no_inf(np.column_stack(arrs))

    def _channels(self, X):
        """Build the channel dict (``x``, ``xrank``, ``x_freq``, ``fmask``, ``is_cat``, ``Dmax``)
        for ``X`` using only statistics stored at fit time; every array is padded/cut to ``Dmax``
        columns and converted to a float32 tensor with NaN replaced by 0."""
        Xm = self._encode(X)
        # impute with the train medians, then select the columns kept at fit time
        Xi = np.where(np.isnan(Xm), self.medians_[: Xm.shape[1]], Xm)
        Xk = Xi[:, self.keep_idx_]
        n, D = Xk.shape
        Dmax = self.Dmax

        def pad(a):
            return np.pad(a, ((0, 0), (0, max(0, Dmax - a.shape[1]))))[:, :Dmax]

        x = pad(self.scaler_x_.transform(Xk))
        xrank = pad(self.qt_.transform(Xk))
        xfreq = pad(self.scaler_f_.transform(self._freq(Xk)))
        width = min(D, Dmax)
        fmask = np.zeros((n, Dmax))
        fmask[:, :width] = 1.0
        iscat = np.zeros((n, Dmax))
        iscat[:, :width] = self.cat_kept_[:width].astype(float)

        def t(a):
            return torch.from_numpy(np.nan_to_num(a)).float()

        return {"x": t(x), "xrank": t(xrank), "x_freq": t(xfreq), "fmask": t(fmask),
                "is_cat": t(iscat), "Dmax": Dmax}

    def transform(self, X):
        """Map rows to the standardized phi features selected at fit time, shape (n, n_output_features_)."""
        check_is_fitted(self, "phi_keep_")
        X = self._validate(X, reset=False)
        raw = _raw_phi(self.genome_, self._channels(X))[:, self.phi_keep_]
        return (raw - self.phi_mu_) / self.phi_sd_

    def get_feature_names_out(self, input_features=None):
        """Return the output names ``evoforest_0 .. evoforest_{K-1}`` (``input_features`` is ignored)."""
        check_is_fitted(self, "n_output_features_")
        return np.array([f"evoforest_{i}" for i in range(self.n_output_features_)])
202
+
203
+
204
class TabMapInductiveClassifier(ClassifierMixin, BaseEstimator):
    """Inductive classifier: EvoForest-Tab feature map followed by a ridge classification head.

    Follows the ordinary fit -> predict contract. For the support -> query in-context setting use
    the transductive ``TabMapClassifier`` instead; use this one as a drop-in sklearn estimator.

    Note: ``alpha`` is stored as a parameter, but the head fitted in ``fit`` selects its
    regularization from a fixed grid (``logspace(-2, 4, 13)``) and does not read it.
    """

    def __init__(self, genome_path=DEFAULT_GENOME, cat_features=None, Dmax=DMAX, alpha=1.0):
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.alpha = alpha

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        return tags

    def fit(self, X, y):
        """Fit the feature map on ``X`` and a ``RidgeClassifierCV`` head on the mapped features."""
        if y is None:
            raise ValueError(f"{type(self).__name__} requires y to be passed, but the target y is None.")
        y = np.asarray(y)  # de-wrap _NotAnArray before np.unique
        featurizer = TabMapTransformer(self.genome_path, self.cat_features, self.Dmax)
        self._tf = featurizer.fit(X)
        self.classes_ = np.unique(y)
        self.n_features_in_ = self._tf.n_features_in_
        phi = self._tf.transform(X)
        self._head = RidgeClassifierCV(alphas=_np.logspace(-2, 4, 13)).fit(phi, y)
        return self

    def predict(self, X):
        """Return the head's predicted label for each row of ``X``."""
        check_is_fitted(self, "_head")
        phi = self._tf.transform(X)
        return self._head.predict(phi)

    def predict_proba(self, X):
        """Uncalibrated probabilities derived from the ridge decision function: a sigmoid for the
        binary case (1-D decision values), a row-wise softmax otherwise."""
        check_is_fitted(self, "_head")
        scores = self._head.decision_function(self._tf.transform(X))
        if scores.ndim != 1:
            # multiclass: softmax with the row maximum subtracted first
            weights = np.exp(scores - scores.max(1, keepdims=True))
            return weights / weights.sum(1, keepdims=True)
        positive = 1.0 / (1.0 + np.exp(-scores))
        return np.column_stack([1.0 - positive, positive])
244
+
245
+
246
class TabMapInductiveRegressor(RegressorMixin, BaseEstimator):
    """Inductive regressor: EvoForest-Tab feature map followed by a ``RidgeCV`` head.

    Note: ``alpha`` is stored as a parameter, but the head fitted in ``fit`` selects its
    regularization from a fixed grid (``logspace(-2, 4, 13)``) and does not read it.
    """

    def __init__(self, genome_path=DEFAULT_GENOME, cat_features=None, Dmax=DMAX, alpha=1.0):
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.alpha = alpha

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        return tags

    def fit(self, X, y):
        """Fit the feature map on ``X`` and the ridge head on the mapped features."""
        from sklearn.utils.validation import column_or_1d
        y = column_or_1d(y, warn=True)  # warn + ravel on (n,1) y -> check_supervised_y_2d
        featurizer = TabMapTransformer(self.genome_path, self.cat_features, self.Dmax)
        self._tf = featurizer.fit(X)
        self.n_features_in_ = self._tf.n_features_in_
        phi = self._tf.transform(X)
        self._head = RidgeCV(alphas=_np.logspace(-2, 4, 13)).fit(phi, y)
        return self

    def predict(self, X):
        """Return the head's prediction for each row of ``X``."""
        check_is_fitted(self, "_head")
        phi = self._tf.transform(X)
        return self._head.predict(phi)
269
+
270
+
271
# Paper-facing EvoForestTab* names; the TabMap* classes above stay available under their own names.
EvoForestTabInductiveRegressor = TabMapInductiveRegressor
EvoForestTabInductiveClassifier = TabMapInductiveClassifier
EvoForestTabTransformer = TabMapTransformer
@@ -0,0 +1,119 @@
1
+ Metadata-Version: 2.4
2
+ Name: evoforest-tab
3
+ Version: 0.1.0
4
+ Summary: Evolved universal tabular feature map + closed-form ridge: an interpretable, training-free, local in-context learner for tabular data.
5
+ License: MIT
6
+ Keywords: tabular,in-context-learning,feature-map,tabpfn,ridge,automl
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: torch>=1.13
11
+ Requires-Dist: numpy>=1.21
12
+ Requires-Dist: pyyaml>=5.4
13
+ Provides-Extra: sklearn
14
+ Requires-Dist: scikit-learn>=1.0; extra == "sklearn"
15
+ Provides-Extra: examples
16
+ Requires-Dist: scikit-learn>=1.0; extra == "examples"
17
+ Requires-Dist: pandas>=1.3; extra == "examples"
18
+ Dynamic: license-file
19
+
20
+ # tabmap — EvoForest-Tab: an evolved universal tabular feature map
21
+
22
+ `tabmap` is the reference implementation of **EvoForest-Tab** (the EvoForest computation-search framework specialized to tabular data).
23
+
24
+ `tabmap` is an interpretable, training-free, **local** in-context learner for tabular data: an
25
+ evolved universal feature map `φ: row → ℝᴷ` (16 transform families over rank-gauss, count-encoding,
26
+ and categorical-mask channels) paired with a per-dataset **closed-form Bayesian-ridge head**. Given a
27
+ labeled *support* set and an unlabeled *query* set, it predicts in a single SVD solve — no gradient
28
+ descent, no per-dataset tuning, no GPU. It is competitive with gradient boosting and with the
29
+ published **TabPFN-v2** tabular foundation model, while remaining free to run and fully inspectable.
30
+
31
+ This repository accompanies the paper *"Evolving a Universal Tabular Feature Map: Interpretable,
32
+ Closed-Form In-Context Learning Competitive with Tabular Foundation Models"* and is **stand-alone**:
33
+ the deployment pipeline (feature map + ridge) depends only on `torch`, `numpy`, and `pyyaml`.
34
+
35
+ ## Install
36
+ ```bash
37
+ pip install -e . # editable; or: pip install .
38
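+ # deps: torch, numpy, pyyaml; scikit-learn is also needed in practice (evoforest_tab.inductive imports it unconditionally and the package __init__ imports that module), plus pandas for DataFrame inputs and the examples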
39
+ ```
40
+
41
+ ## Usage (scikit-learn style)
42
+ ```python
43
+ from evoforest_tab import TabMapClassifier, TabMapRegressor
44
+
45
+ clf = TabMapClassifier(n_estimators=6).fit(X_support, y_support) # X: ndarray or DataFrame
46
+ proba = clf.predict_proba(X_query) # in-context: query needed to fit φ channels
47
+ pred = clf.predict(X_query)
48
+
49
+ reg = TabMapRegressor(n_estimators=6).fit(X_support, y_support)
50
+ yhat = reg.predict(X_query)
51
+ ```
52
+ Notes:
53
+ - It is an **in-context** learner: `predict` builds the (label-free, transductive) channels over the
54
+ pooled support+query rows, so the query rows are needed at prediction time (as with TabPFN).
55
+ - `n_estimators` is the random-feature ensemble size (averaged decorrelated seed-variants of `φ`);
56
+ `n_estimators=1` is the single map, `6` is the paper default (variance reduction toward the kernel limit).
57
+ - `cat_features=[...]` marks categorical columns (indices or DataFrame names); omitted → auto-detected.
58
+ - No class-count ceiling (unlike TabPFN-v2's ≤10 classes); runs on CPU in milliseconds.
59
+
60
+ ## What's inside
61
+ ```
62
+ evoforest_tab/
63
+ _channels.py raw rows -> input channels (col-z, rank-gauss, count-encoding, categorical mask), nan-safe
64
+ _genome.py evaluate the evolved genome (champion.yaml) -> feature matrix Phi; seed-variants for the ensemble
65
+ _ridge.py closed-form Bayesian-ridge head (evidence-maximized lambda), single SVD solve
66
+ estimator.py TabMapClassifier / TabMapRegressor (sklearn API) + K-seed ensemble
67
+ champion.yaml the evolved 16-family genome (the deployment artifact)
68
+ examples/quickstart.py
69
+ reproduce/ scripts + cached TabPFN-v2 predictions to reproduce the paper's experiments
70
+ tests/
71
+ ```
72
+
73
+ ## Reproducing the paper
74
+ See [`reproduce/README.md`](reproduce/README.md). The cached TabPFN-v2 cloud predictions are included
75
+ so the head-to-head and routing experiments reproduce **without** any API key.
76
+
77
+ ## Contributing this method upstream
78
+ `tabmap` is designed to drop into the tabular ML ecosystem. Best integration targets (most aligned first):
79
+
80
+ | Repo | Why it fits | Integration |
81
+ |---|---|---|
82
+ | **PriorLabs/tabpfn-extensions** | community extensions around TabPFN; our method is a free/local **complementary** in-context learner and a natural **cost-aware router** companion (route hard datasets to TabPFN, the rest to `tabmap`) | add as an extension module + a routing utility (`sklearn`-compatible) |
83
+ | **scikit-learn-contrib** | `TabMapClassifier`/`TabMapRegressor` already follow the estimator API | publish as a standalone `scikit-learn-contrib` project |
84
+ | **skrub** (ex dirty-cat) | tabular feature engineering / encoders; our channels (rank-gauss, count-encoding) + `φ` are a drop-in `TransformerMixin` featurizer | contribute `TabMapEncoder` (transform-only) |
85
+ | **pyg-team/pytorch-frame** | deep tabular; `φ` is a fixed featurizer usable as an input stem | add as an `encoder`/`stype` transform |
86
+ | **autogluon / TabArena** | leaderboard model implementations | submit `tabmap` as a model for the TabArena living benchmark |
87
+
88
+ The estimator's sklearn-compatible surface (`fit`/`predict`/`predict_proba`, `get_params`) is the
89
+ contribution-ready API; the transform-only `build_channels`+`build_phi` path serves the encoder use-cases.
90
+
91
+
92
+ ## Combining with a foundation model (e.g. TabPFN)
93
+ `StackedTabularEnsemble` combines TabMap with any in-context base model (such as TabPFN's client) into a
94
+ single, stronger predictor -- the paper's complementarity result (our map tends to win classification,
95
+ TabPFN regression; combining beats either alone). Three methods: `blend` (50/50), `compwt`
96
+ (label-free, weight each model by its support-cross-validated competence), `meta` (a learned ridge head
97
+ over the models' out-of-fold support predictions; most robust). All are leakage-safe and in-context
98
+ (weights/head fit on support, no query labels).
99
+
100
+ ```python
101
+ from evoforest_tab import TabMapClassifier, StackedTabularEnsemble
102
+ from tabpfn_client import TabPFNClassifier # or any sklearn-surface in-context model
103
+
104
+ ens = StackedTabularEnsemble(
105
+ [TabMapClassifier(n_estimators=6), TabPFNClassifier()],
106
+ task="classification", method="meta", # "meta" | "compwt" | "blend"
107
+ ).fit(X_support, y_support)
108
+ proba = ens.predict_proba(X_query)
109
+ ```
110
+ The learned head (`meta`) is robust whether the two models are evenly matched or one dominates; the
111
+ label-free `compwt` is a close, deployable second with no meta-learner. See `examples/combine_tabpfn.py`.
112
+
113
+ ## Citation
114
+ If you use this library, please cite the accompanying paper *"Evolving a Universal Tabular Feature Map:
115
+ Interpretable, Closed-Form In-Context Learning Competitive with Tabular Foundation Models."* (anonymized
116
+ for review; see `../tabular_paper/`).
117
+
118
+ ## License
119
+ MIT (see `LICENSE`).
@@ -0,0 +1,14 @@
1
+ evoforest_tab/__init__.py,sha256=hc7UzSxYLKk_L3PYiGlPXlrZJ05-ABKaJIJs_xd3hR4,1413
2
+ evoforest_tab/_channels.py,sha256=f_kp4gHARc7R1q-4WX3lRMBsfav1cFbIS2lz15WRxLc,5181
3
+ evoforest_tab/_genome.py,sha256=GlgktMKeZsDI5sxz7IrI3wzjViOVCnuv12vR4XqpDtM,2454
4
+ evoforest_tab/_module.py,sha256=x90Yg2z5RYnwsFQcH17GkIVwOpUu0P2remP5P594E2o,6750
5
+ evoforest_tab/_ridge.py,sha256=5kWanlIIc72zM7OnzVwyedgYkf5jAWUOg0mL9bhnWJk,1902
6
+ evoforest_tab/champion.yaml,sha256=lXCukeBH_EEKQZium6pgkptlkvg7HevJxvkXWDutDeo,6353
7
+ evoforest_tab/combine.py,sha256=gqjHn50rLlcowiPsJ0T_bfXELfCvF7G21BUT8strM-c,7683
8
+ evoforest_tab/estimator.py,sha256=uDoLhklSbZml1P0I9qcqiqtk1i7ORnWUT0OKcHXcSs8,4214
9
+ evoforest_tab/inductive.py,sha256=-GbZc9yFJ9PhtVKxczRKoBi09f4VqrArOUqUPCnhRdE,13173
10
+ evoforest_tab-0.1.0.dist-info/licenses/LICENSE,sha256=mPdFwLq00cOE_zAvx0jTNZ2Jy2Bl0D1RMu9JmYTXpyQ,1075
11
+ evoforest_tab-0.1.0.dist-info/METADATA,sha256=RELHHMnOhLC7aaZTD-OQLTX4GTwanql-IRL4ultoK4c,6709
12
+ evoforest_tab-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
13
+ evoforest_tab-0.1.0.dist-info/top_level.txt,sha256=RNTgLpP5y1b6xVDmwFLAxtrGm1AIBcaT_umBfEA67x4,14
14
+ evoforest_tab-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 the tabmap authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ evoforest_tab