evoforest-tab 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evoforest_tab/__init__.py +28 -0
- evoforest_tab/_channels.py +122 -0
- evoforest_tab/_genome.py +64 -0
- evoforest_tab/_module.py +152 -0
- evoforest_tab/_ridge.py +45 -0
- evoforest_tab/champion.yaml +77 -0
- evoforest_tab/combine.py +165 -0
- evoforest_tab/estimator.py +108 -0
- evoforest_tab/inductive.py +274 -0
- evoforest_tab-0.1.0.dist-info/METADATA +119 -0
- evoforest_tab-0.1.0.dist-info/RECORD +14 -0
- evoforest_tab-0.1.0.dist-info/WHEEL +5 -0
- evoforest_tab-0.1.0.dist-info/licenses/LICENSE +21 -0
- evoforest_tab-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""evoforest_tab: EvoForest-Tab -- an evolved universal tabular feature map + closed-form ridge head.
|
|
2
|
+
|
|
3
|
+
An interpretable, training-free, local in-context learner competitive with tabular foundation
|
|
4
|
+
models. See README. Main entry points:
|
|
5
|
+
|
|
6
|
+
from evoforest_tab import EvoForestTabClassifier, EvoForestTabRegressor
|
|
7
|
+
"""
|
|
8
|
+
from .estimator import TabMapClassifier, TabMapRegressor
|
|
9
|
+
from ._genome import load_genome, build_phi, seed_variant, DEFAULT_GENOME
|
|
10
|
+
from ._channels import build_channels
|
|
11
|
+
from .combine import StackedTabularEnsemble
|
|
12
|
+
from ._module import EvoForestTabModule
|
|
13
|
+
from .inductive import (
|
|
14
|
+
EvoForestTabInductiveClassifier, EvoForestTabInductiveRegressor, EvoForestTabTransformer,
|
|
15
|
+
TabMapInductiveClassifier, TabMapInductiveRegressor, TabMapTransformer,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# brand-consistent names matching the paper (the TabMap* names remain as aliases)
|
|
19
|
+
EvoForestTabClassifier = TabMapClassifier
|
|
20
|
+
EvoForestTabRegressor = TabMapRegressor
|
|
21
|
+
|
|
22
|
+
__all__ = ["EvoForestTabClassifier", "EvoForestTabRegressor",
|
|
23
|
+
"TabMapClassifier", "TabMapRegressor", "build_channels",
|
|
24
|
+
"build_phi", "load_genome", "seed_variant", "DEFAULT_GENOME",
|
|
25
|
+
"StackedTabularEnsemble", "EvoForestTabModule",
|
|
26
|
+
"EvoForestTabInductiveClassifier", "EvoForestTabInductiveRegressor", "EvoForestTabTransformer",
|
|
27
|
+
"TabMapInductiveClassifier", "TabMapInductiveRegressor", "TabMapTransformer"]
|
|
28
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Channel construction: raw table rows -> the input channels the feature map reads.
|
|
2
|
+
|
|
3
|
+
Faithful to the development pipeline. All channels are UNSUPERVISED and TRANSDUCTIVE (computed over the
|
|
4
|
+
pooled support+query rows, label-free) so the map is leakage-safe in the in-context (support->query)
|
|
5
|
+
setting. Categoricals are ordinal-encoded; missing values are nan-safe; columns are padded/capped to Dmax.
|
|
6
|
+
"""
|
|
7
|
+
import math
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import torch
|
|
11
|
+
|
|
12
|
+
DMAX_DEFAULT = 100
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _nan_col_zscore(X: torch.Tensor) -> torch.Tensor:
|
|
16
|
+
"""Per-column z-score (nan-safe); nan -> 0 after standardizing."""
|
|
17
|
+
nan = torch.isnan(X)
|
|
18
|
+
Xf = torch.where(nan, torch.zeros_like(X), X)
|
|
19
|
+
cnt = (~nan).sum(0).clamp(min=1).to(X.dtype)
|
|
20
|
+
mean = Xf.sum(0) / cnt
|
|
21
|
+
var = (torch.where(nan, torch.zeros_like(X), (X - mean) ** 2)).sum(0) / cnt
|
|
22
|
+
z = (X - mean) / (var.sqrt() + 1e-6)
|
|
23
|
+
return torch.where(nan, torch.zeros_like(z), z)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _col_rankgauss(X: torch.Tensor) -> torch.Tensor:
|
|
27
|
+
"""Per-column rank -> ~N(0,1) (nan ranked last, then zeroed). Transductive, label-free."""
|
|
28
|
+
N = X.shape[0]
|
|
29
|
+
filled = torch.where(torch.isnan(X), torch.full_like(X, float("inf")), X)
|
|
30
|
+
ranks = filled.argsort(0).argsort(0).to(X.dtype)
|
|
31
|
+
u = (ranks + 0.5) / N
|
|
32
|
+
z = math.sqrt(2.0) * torch.erfinv((2 * u - 1).clamp(-1 + 1e-6, 1 - 1e-6))
|
|
33
|
+
return torch.where(torch.isnan(X), torch.zeros_like(z), z)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _col_freq(Xm: np.ndarray) -> np.ndarray:
|
|
37
|
+
"""Per-column count/frequency encoding: each cell -> fraction of rows sharing its value."""
|
|
38
|
+
n, D = Xm.shape
|
|
39
|
+
Fr = np.zeros_like(Xm, dtype=np.float64)
|
|
40
|
+
for j in range(D):
|
|
41
|
+
_, inv, counts = np.unique(Xm[:, j], return_inverse=True, return_counts=True)
|
|
42
|
+
Fr[:, j] = counts[inv] / n
|
|
43
|
+
return Fr
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _pad(X: torch.Tensor, Dmax: int) -> torch.Tensor:
|
|
47
|
+
if X.shape[1] >= Dmax:
|
|
48
|
+
return X[:, :Dmax]
|
|
49
|
+
return torch.cat([X, torch.zeros(X.shape[0], Dmax - X.shape[1], dtype=X.dtype)], dim=1)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _to_ordinal(X, cat_features):
|
|
53
|
+
"""Coerce a 2-D array (numpy object/float or pandas DataFrame) to a float matrix with categoricals
|
|
54
|
+
ordinal-encoded (nan preserved), returning (Xm float64, cat_mask bool)."""
|
|
55
|
+
try:
|
|
56
|
+
import pandas as pd
|
|
57
|
+
is_df = isinstance(X, pd.DataFrame)
|
|
58
|
+
except ImportError:
|
|
59
|
+
is_df = False
|
|
60
|
+
if is_df:
|
|
61
|
+
cols, cat = [], []
|
|
62
|
+
for ci, c in enumerate(X.columns):
|
|
63
|
+
s = X[c]
|
|
64
|
+
auto_cat = str(s.dtype) in ("category", "object", "bool")
|
|
65
|
+
user_cat = cat_features is not None and (ci in cat_features or c in cat_features)
|
|
66
|
+
if auto_cat or user_cat:
|
|
67
|
+
codes = s.astype("category").cat.codes.to_numpy().astype(np.float64)
|
|
68
|
+
codes[codes < 0] = np.nan
|
|
69
|
+
cols.append(codes); cat.append(True)
|
|
70
|
+
else:
|
|
71
|
+
cols.append(s.to_numpy(dtype=np.float64)); cat.append(False)
|
|
72
|
+
return np.column_stack(cols), np.array(cat, dtype=bool)
|
|
73
|
+
Xm = np.asarray(X, dtype=object)
|
|
74
|
+
n, D = Xm.shape
|
|
75
|
+
out = np.zeros((n, D), dtype=np.float64)
|
|
76
|
+
cat = np.zeros(D, dtype=bool)
|
|
77
|
+
for j in range(D):
|
|
78
|
+
col = Xm[:, j]
|
|
79
|
+
user_cat = cat_features is not None and j in cat_features
|
|
80
|
+
is_numeric = np.issubdtype(np.asarray(col).dtype, np.number)
|
|
81
|
+
try:
|
|
82
|
+
fcol = col.astype(np.float64)
|
|
83
|
+
numeric_ok = True
|
|
84
|
+
except (ValueError, TypeError):
|
|
85
|
+
numeric_ok = False
|
|
86
|
+
if user_cat or not numeric_ok or (not is_numeric):
|
|
87
|
+
uniq = {v: i for i, v in enumerate(sorted(set(map(str, col))))}
|
|
88
|
+
out[:, j] = np.array([uniq[str(v)] for v in col], dtype=np.float64)
|
|
89
|
+
cat[j] = True
|
|
90
|
+
else:
|
|
91
|
+
out[:, j] = fcol
|
|
92
|
+
return out, cat
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def build_channels(X, cat_features=None, Dmax: int = DMAX_DEFAULT, device="cpu"):
    """Turn raw rows ``X`` (n, d) into the channel dict consumed by the feature map.

    Parameters
    ----------
    X : numpy array or pandas DataFrame
        The pooled support+query rows.
    cat_features : indices (or names) of categorical columns, or None
        Passed through to ``_to_ordinal``, which also auto-detects categoricals.
    Dmax : int
        Channel width; columns are capped to ``Dmax`` and zero-padded up to it.
    device : torch device spec for the returned tensors.

    Returns
    -------
    dict with tensors ``x`` (z-scored), ``xrank`` (rank-gauss), ``fmask`` (1 on
    real columns), ``is_cat`` (1 on categorical columns), ``x_freq`` (z-scored
    frequency encoding), each of shape (n, Dmax), plus the integer ``Dmax``.

    Raises
    ------
    ValueError
        If no non-constant column is left after imputation.
    """
    Xm, cat = _to_ordinal(X, cat_features)

    # Median-impute missing cells, column by column.
    medians = np.nanmedian(Xm, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(Xm))
    Xm[nan_rows, nan_cols] = medians[nan_cols]

    # Drop (near-)constant columns.
    informative = Xm.std(0) > 1e-9
    Xm = Xm[:, informative]
    cat = cat[informative]
    if Xm.shape[1] == 0:
        raise ValueError("no non-constant columns after preprocessing")

    # Keep only the Dmax highest-variance columns when there are too many.
    if Xm.shape[1] > Dmax:
        widest = np.argsort(-Xm.std(0))[:Dmax]
        Xm = Xm[:, widest]
        cat = cat[widest]

    n, D = Xm.shape
    used = min(D, Dmax)
    Xt = torch.from_numpy(Xm).float().to(device)
    freq_t = torch.from_numpy(_col_freq(Xm)).float().to(device)

    x = _pad(_nan_col_zscore(Xt), Dmax)
    xrank = _pad(_col_rankgauss(Xt), Dmax)
    x_freq = _pad(_nan_col_zscore(freq_t), Dmax)

    fmask = torch.zeros(n, Dmax, device=device)
    fmask[:, :used] = 1.0
    is_cat = torch.zeros(n, Dmax, device=device)
    is_cat[:, :used] = torch.from_numpy(cat[:used].astype(np.float32)).to(device)

    return {"x": x, "xrank": xrank, "fmask": fmask, "is_cat": is_cat, "x_freq": x_freq, "Dmax": Dmax}
|
evoforest_tab/_genome.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Evaluate the evolved genome (a list of feature-lambdas) on the input channels to produce the
|
|
2
|
+
feature matrix Phi. The genome is the deployment artifact; each lambda is a small, inspectable
|
|
3
|
+
expression over the channels with fixed seeded random projections."""
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import torch
|
|
10
|
+
import torch.nn.functional as F
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
_G = {"torch": torch, "F": F, "np": np, "math": math}
|
|
14
|
+
_HERE = os.path.dirname(os.path.abspath(__file__))
|
|
15
|
+
DEFAULT_GENOME = os.path.join(_HERE, "champion.yaml")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_genome(path: str = DEFAULT_GENOME) -> dict:
    """Parse the genome YAML file at ``path`` and return its content.

    The file is opened in a ``with`` block so the handle is closed
    deterministically, and decoded as UTF-8 rather than the platform's locale
    encoding. ``yaml.safe_load`` is used, so only plain YAML data is constructed.
    """
    with open(path, encoding="utf-8") as fh:
        return yaml.safe_load(fh)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def seed_variant(genome: dict, offset: int) -> dict:
    """Return a shallow copy of ``genome`` with every seed shifted by ``offset``.

    Each ``manual_seed(N)`` occurrence in the ``"output"`` lambda sources is
    rewritten to ``manual_seed(N + offset)``. The input dict and its
    ``"output"`` list are left untouched; other keys are shared with the copy.
    """
    seed_call = re.compile(r"manual_seed\((\d+)\)")

    def _shift(match):
        return f"manual_seed({int(match.group(1)) + offset})"

    shifted_sources = [seed_call.sub(_shift, source) for source in genome["output"]]
    variant = dict(genome)
    variant["output"] = shifted_sources
    return variant
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def build_phi(genome: dict, channels: dict) -> torch.Tensor:
    """Evaluate the genome's output lambdas on ``channels`` and return a normalized (n, K) matrix.

    Each source string in ``genome["output"]`` is ``eval``'d into a callable.
    A parameter named ``input`` receives the channel dict, one named
    ``globals`` receives the evaluated ``"@globals"`` entries, and any other
    parameter receives ``None``. Results with several columns are split per
    column; results that do not reduce to a length-n vector are skipped, as are
    (near-)constant columns. Non-finite values are replaced by 0. The stacked
    columns are finally z-scored.

    Raises
    ------
    ValueError
        If no usable feature column is produced.
    """
    # NOTE(security): the genome strings are executed with ``eval``; only load
    # genomes from a trusted source.
    global_exprs = genome.get("@globals", {}) or {}
    gg = {name: eval(expr, _G) for name, expr in global_exprs.items()}
    n = channels["x"].shape[0]
    bindings = {"input": channels, "globals": gg}

    signals = []
    for source in genome["output"]:
        fn = eval(source, _G)
        code = fn.__code__
        param_names = code.co_varnames[:code.co_argcount]
        signals.append(fn(*[bindings.get(p) for p in param_names]))

    cols = []
    for sig in signals:
        tensor = sig if torch.is_tensor(sig) else torch.as_tensor(sig)
        tensor = tensor.float()
        if tensor.dim() >= 2 and tensor.shape[0] == n and tensor.shape[1] > 1:
            candidates = list(tensor.unbind(1))
        else:
            flat = tensor.squeeze()
            if flat.dim() != 1 or flat.shape[0] != n:
                continue
            candidates = [flat]
        for col in candidates:
            col = torch.where(torch.isfinite(col), col, torch.zeros_like(col))
            if col.std() >= 1e-12:
                cols.append(col)

    if not cols:
        raise ValueError("genome produced no usable feature columns")

    Phi = torch.stack(cols, dim=1)
    center = Phi.mean(0, keepdim=True)
    spread = Phi.std(0, keepdim=True).clamp(min=1e-12)
    return (Phi - center) / spread
|
evoforest_tab/_module.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Compile the evolved genome (champion.yaml) into a self-contained ``torch.nn.Module``.
|
|
2
|
+
|
|
3
|
+
The genome is the *architecture spec* produced by EvoForest search. In the reference path each family's
|
|
4
|
+
seeded random projection is re-drawn on every forward via ``torch.randn(..., generator=manual_seed(N))``
|
|
5
|
+
inside an ``eval``'d lambda. That is fine for a frozen feature map but (a) cannot be fine-tuned, (b) re-draws
|
|
6
|
+
on every call, and (c) is not a saveable artifact.
|
|
7
|
+
|
|
8
|
+
``EvoForestTabModule`` fixes all three: it materializes every seeded ``randn``/``rand`` draw **once** as a
|
|
9
|
+
frozen ``nn.Parameter`` (``requires_grad=False`` by default), so the module is a self-contained, saveable,
|
|
10
|
+
HuggingFace-publishable checkpoint whose ``forward`` is byte-identical to the reference evaluation -- and the
|
|
11
|
+
random features become fine-tunable simply by calling :meth:`unfreeze_random_features`. The deterministic
|
|
12
|
+
families (rank/stats/frequency) carry no parameters and are reproduced exactly.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import math
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import torch
|
|
20
|
+
import torch.nn.functional as F
|
|
21
|
+
from torch import nn
|
|
22
|
+
|
|
23
|
+
from ._genome import load_genome, DEFAULT_GENOME
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class _SeededWeightBank(nn.Module):
|
|
27
|
+
"""Materialize seeded ``torch.randn``/``torch.rand`` draws as frozen Parameters, keyed by (op, seed, shape)."""
|
|
28
|
+
|
|
29
|
+
def __init__(self) -> None:
|
|
30
|
+
super().__init__()
|
|
31
|
+
self.weights = nn.ParameterDict()
|
|
32
|
+
|
|
33
|
+
@staticmethod
|
|
34
|
+
def _key(op: str, shape, seed: int) -> str:
|
|
35
|
+
return f"{op}__seed{int(seed)}__" + "x".join(str(int(s)) for s in shape)
|
|
36
|
+
|
|
37
|
+
def get(self, op: str, shape, seed: int, device=None) -> torch.Tensor:
|
|
38
|
+
key = self._key(op, shape, seed)
|
|
39
|
+
if key not in self.weights:
|
|
40
|
+
gen = torch.Generator().manual_seed(int(seed)) # CPU draw == reference on CPU
|
|
41
|
+
draw = (torch.randn if op == "randn" else torch.rand)(*shape, generator=gen)
|
|
42
|
+
self.weights[key] = nn.Parameter(draw, requires_grad=False) # frozen by default
|
|
43
|
+
w = self.weights[key]
|
|
44
|
+
return w.to(device) if device is not None else w
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class _TorchProxy:
|
|
48
|
+
"""A stand-in for the ``torch`` namespace inside the genome lambdas: seeded ``randn``/``rand`` are routed
|
|
49
|
+
to the materialized weight bank; every other attribute delegates to real ``torch``."""
|
|
50
|
+
|
|
51
|
+
def __init__(self, bank: _SeededWeightBank) -> None:
|
|
52
|
+
object.__setattr__(self, "_bank", bank)
|
|
53
|
+
|
|
54
|
+
def __getattr__(self, name):
|
|
55
|
+
return getattr(torch, name)
|
|
56
|
+
|
|
57
|
+
def randn(self, *shape, generator=None, device=None, dtype=None):
|
|
58
|
+
if generator is None:
|
|
59
|
+
return torch.randn(*shape, device=device, dtype=dtype)
|
|
60
|
+
return self._bank.get("randn", shape, generator.initial_seed(), device)
|
|
61
|
+
|
|
62
|
+
def rand(self, *shape, generator=None, device=None, dtype=None):
|
|
63
|
+
if generator is None:
|
|
64
|
+
return torch.rand(*shape, device=device, dtype=dtype)
|
|
65
|
+
return self._bank.get("rand", shape, generator.initial_seed(), device)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class EvoForestTabModule(nn.Module):
    """The genome's feature map wrapped as a ``torch.nn.Module``.

    The genome lambdas are evaluated against a ``_TorchProxy`` so that their
    seeded ``randn``/``rand`` draws are stored once in a ``_SeededWeightBank``
    instead of being re-drawn on every call. The bank is filled eagerly at
    construction time by running every lambda on dummy channels of width
    ``dmax``.

    Parameters
    ----------
    genome : dict | None
        Parsed genome; when None the default genome file is loaded.
    dmax : int
        Channel width used for the dummy forward that materializes the bank.

    Notes
    -----
    ``forward(channels)`` applies the same column selection and z-scoring as
    ``build_phi``. Banked weights start frozen; use
    :meth:`unfreeze_random_features` / :meth:`freeze_random_features` to toggle
    ``requires_grad`` on them.
    """

    def __init__(self, genome: dict | None = None, dmax: int = 100) -> None:
        super().__init__()
        if genome is None:
            genome = load_genome(DEFAULT_GENOME)
        self.output_src = list(genome["output"])
        self.dmax = dmax
        self.bank = _SeededWeightBank()
        self._proxy = _TorchProxy(self.bank)
        # The lambdas see the proxy under the name ``torch``.
        namespace = {"torch": self._proxy, "F": F, "np": np, "math": math}
        self._fns = [eval(source, namespace) for source in self.output_src]
        with torch.no_grad():
            self._materialize()

    # ------------------------------------------------------------------ build / fine-tune controls
    def _materialize(self) -> None:
        """Run every lambda once on dummy channels so its seeded draws get banked."""
        probe = self._dummy_channels()
        for fn in self._fns:
            try:
                fn(probe)
            except Exception:  # noqa: BLE001 - best effort: a failure on the dummy input is ignored
                pass

    def _dummy_channels(self) -> dict:
        """Return a 4-row channel dict of width ``dmax`` with random values."""
        noise = torch.randn(4, self.dmax)
        return {
            "x": noise,
            "xrank": noise.clone(),
            "x_freq": noise.clone(),
            "fmask": torch.ones(4, self.dmax),
            "is_cat": torch.zeros(4, self.dmax),
            "Dmax": self.dmax,
        }

    def _set_trainable(self, flag: bool) -> "EvoForestTabModule":
        """Set ``requires_grad`` on every banked weight and return ``self``."""
        for weight in self.bank.weights.values():
            weight.requires_grad_(flag)
        return self

    def unfreeze_random_features(self) -> "EvoForestTabModule":
        """Make the banked random-feature weights trainable."""
        return self._set_trainable(True)

    def freeze_random_features(self) -> "EvoForestTabModule":
        """Freeze the banked random-feature weights again."""
        return self._set_trainable(False)

    @property
    def n_random_parameters(self) -> int:
        """Total number of scalar entries across all banked weights."""
        total = 0
        for weight in self.bank.weights.values():
            total += weight.numel()
        return total

    # ------------------------------------------------------------------ forward
    @staticmethod
    def _candidate_columns(sig, n: int) -> list:
        """Split one lambda output into length-``n`` column candidates (possibly none)."""
        tensor = sig if torch.is_tensor(sig) else torch.as_tensor(sig)
        tensor = tensor.float()
        if tensor.dim() >= 2 and tensor.shape[0] == n and tensor.shape[1] > 1:
            return list(tensor.unbind(1))
        flat = tensor.squeeze()
        if flat.dim() == 1 and flat.shape[0] == n:
            return [flat]
        return []

    def forward(self, channels: dict) -> torch.Tensor:
        """Evaluate all lambdas on ``channels`` and return the z-scored (n, K) matrix.

        Non-finite values become 0 and (near-)constant columns are dropped.
        Raises ``ValueError`` when no usable column remains.
        """
        n = channels["x"].shape[0]
        kept = []
        for fn in self._fns:
            for col in self._candidate_columns(fn(channels), n):
                col = torch.where(torch.isfinite(col), col, torch.zeros_like(col))
                if col.std() >= 1e-12:
                    kept.append(col)
        if not kept:
            raise ValueError("genome produced no usable feature columns")
        phi = torch.stack(kept, dim=1)
        center = phi.mean(0, keepdim=True)
        spread = phi.std(0, keepdim=True).clamp(min=1e-12)
        return (phi - center) / spread
|
evoforest_tab/_ridge.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Closed-form ridge head with Bayesian evidence-maximized regularization (MacKay/Tipping).
|
|
2
|
+
|
|
3
|
+
The head is fit on the support features and applied to the query features in a single SVD solve --
|
|
4
|
+
no gradient descent, no per-dataset hyperparameter grid. lambda is set by evidence maximization,
|
|
5
|
+
which is markedly more stable than a leave-one-out grid in the few-shot (K>>n) regime.
|
|
6
|
+
"""
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
LAM_FLOOR = 1e-2
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def evidence_lambda(U, S, Y, n_iter: int = 60) -> float:
    """Fixed-point estimate of the ridge penalty ``lambda = alpha / beta``.

    Reuses the thin SVD factors ``U`` and ``S`` of the support features.
    ``alpha`` and ``beta`` are shared across the columns of ``Y``. Each
    iterate is clamped to ``[LAM_FLOOR, 1e8]``; iteration stops after
    ``n_iter`` rounds or once the relative change drops below 1e-3.

    Returns the final lambda as a Python float.
    """
    eig = S ** 2
    UtY = U.transpose(0, 1) @ Y
    n, M = Y.shape
    y_energy = (Y ** 2).sum()
    y_in_span = (UtY ** 2).sum()
    lam = torch.tensor(1.0, dtype=S.dtype, device=S.device)
    for _ in range(n_iter):
        h = eig / (eig + lam)      # per-direction shrinkage factors
        d = S / (eig + lam)        # maps U^T Y to weight coordinates
        wsq = ((d.unsqueeze(1) * UtY) ** 2).sum()
        # Residual = part of Y outside span(U) + shrunk part inside it.
        rss = (y_energy - y_in_span) + (((1 - h).unsqueeze(1) * UtY) ** 2).sum()
        gamma = h.sum()            # effective number of parameters
        alpha = (M * gamma) / (wsq + 1e-12)
        beta = (M * n - M * gamma) / (rss + 1e-12)
        previous = lam
        lam = (alpha / (beta + 1e-12)).clamp(min=LAM_FLOOR, max=1e8)
        if (lam - previous).abs() < 1e-3 * previous:
            break
    return float(lam)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def ridge_scores(Phi_s, Y, Phi_q, lam=None):
    """Fit a closed-form ridge head on the support set and score the query set.

    Both feature matrices are standardized with the support mean and standard
    deviation (floored at 1e-8). The ridge weights are obtained from a thin SVD
    of the standardized support features. When ``lam`` is None it is chosen by
    ``evidence_lambda``.

    Returns the query scores ``Phi_q_standardized @ W`` of shape (nq, M).
    """
    center = Phi_s.mean(0, keepdim=True)
    scale = Phi_s.std(0, keepdim=True).clamp(min=1e-8)
    support = (Phi_s - center) / scale
    query = (Phi_q - center) / scale

    U, S, Vt = torch.linalg.svd(support, full_matrices=False)
    UtY = U.transpose(0, 1) @ Y
    if lam is None:
        lam = evidence_lambda(U, S, Y)

    # W = V diag(S / (S^2 + lam)) U^T Y
    shrink = S / (S ** 2 + lam)
    W = Vt.transpose(0, 1) @ (shrink.unsqueeze(1) * UtY)
    return query @ W
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
output:
|
|
2
|
+
- 'lambda input: (lambda x, fm, c: torch.stack([(x * fm).sum(1) / c, ((x * x * fm).sum(1)
|
|
3
|
+
/ c - ((x * fm).sum(1) / c) ** 2).clamp(min=0).sqrt(), x.masked_fill(fm == 0, float(''-inf'')).amax(1),
|
|
4
|
+
x.masked_fill(fm == 0, float(''inf'')).amin(1), c / x.shape[1]], dim=1))(input[''x''],
|
|
5
|
+
input[''fmask''], input[''fmask''].sum(1).clamp(min=1)) #name: output_0 qi_mean=None
|
|
6
|
+
qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None age=69'
|
|
7
|
+
- 'lambda input: (lambda x, fm: (lambda A: torch.exp(-(((x.unsqueeze(1) - A.unsqueeze(0))
|
|
8
|
+
** 2) * fm.unsqueeze(1)).sum(2) / fm.sum(1, keepdim=True).clamp(min=1)))(torch.randn(8,
|
|
9
|
+
x.shape[1], generator=torch.Generator(device=x.device).manual_seed(209), device=x.device)))(input[''x''],
|
|
10
|
+
input[''fmask'']) #name: output_1 qi_mean=None qi_std=None qi_max=None qd_mean=None
|
|
11
|
+
qd_std=None qd_max=None age=69'
|
|
12
|
+
- 'lambda input: (lambda x, s: torch.cat([torch.relu(x @ torch.randn(x.shape[1], 8,
|
|
13
|
+
generator=torch.Generator(device=x.device).manual_seed(321), device=x.device) /
|
|
14
|
+
s), (x @ torch.randn(x.shape[1], 8, generator=torch.Generator(device=x.device).manual_seed(322),
|
|
15
|
+
device=x.device) / s) ** 2], dim=1))(input[''xrank''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
16
|
+
output_2 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
17
|
+
age=66'
|
|
18
|
+
- 'lambda input: (lambda x, s: torch.tanh(x @ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(501),
|
|
19
|
+
device=x.device) / s))(input[''xrank''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
20
|
+
output_3 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
21
|
+
age=65'
|
|
22
|
+
- 'lambda input: (lambda x, s: torch.tanh((torch.sign(x) * torch.log1p(x.abs())) @
|
|
23
|
+
torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(701),
|
|
24
|
+
device=x.device) / s))(input[''x''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
25
|
+
output_4 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
26
|
+
age=62'
|
|
27
|
+
- 'lambda input: (lambda x, s: torch.tanh((torch.sign(x) * x.abs().sqrt()) @ torch.randn(x.shape[1],
|
|
28
|
+
16, generator=torch.Generator(device=x.device).manual_seed(801), device=x.device)
|
|
29
|
+
/ s))(input[''x''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
30
|
+
output_5 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
31
|
+
age=60'
|
|
32
|
+
- 'lambda input: (lambda x, s: torch.sigmoid(8.0 * (x @ torch.randn(x.shape[1], 16,
|
|
33
|
+
generator=torch.Generator(device=x.device).manual_seed(951), device=x.device) /
|
|
34
|
+
s)))(input[''xrank''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
35
|
+
output_6 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
36
|
+
age=55'
|
|
37
|
+
- 'lambda input: (lambda x, ic, ct: torch.stack([(x*ic).sum(1)/ic.sum(1).clamp(min=1),
|
|
38
|
+
(x*ct).sum(1)/ct.sum(1).clamp(min=1), (x*x*ic).sum(1)/ic.sum(1).clamp(min=1), (x*x*ct).sum(1)/ct.sum(1).clamp(min=1)],
|
|
39
|
+
dim=1))(input[''x''], input[''is_cat''], input[''fmask'']-input[''is_cat'']) #name:
|
|
40
|
+
output_7 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
41
|
+
age=50'
|
|
42
|
+
- 'lambda input: (lambda x, ic: torch.tanh((x*ic) @ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(1221),
|
|
43
|
+
device=x.device) / ic.sum(1,keepdim=True).clamp(min=1).sqrt()))(input[''xrank''],
|
|
44
|
+
input[''is_cat'']) #name: output_8 qi_mean=None qi_std=None qi_max=None qd_mean=None
|
|
45
|
+
qd_std=None qd_max=None age=48'
|
|
46
|
+
- 'lambda input: (lambda x, s: torch.tanh(x @ torch.randn(x.shape[1], 16, generator=torch.Generator(device=x.device).manual_seed(1501),
|
|
47
|
+
device=x.device) / s))(input[''x_freq''], input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
48
|
+
output_9 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
49
|
+
age=43'
|
|
50
|
+
- 'lambda input: torch.tanh((input[''xrank'']*input[''x_freq'']) @ torch.randn(input[''x''].shape[1],
|
|
51
|
+
16, generator=torch.Generator(device=input[''x''].device).manual_seed(1711), device=input[''x''].device)
|
|
52
|
+
/ input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name: output_10 qi_mean=None
|
|
53
|
+
qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None age=37'
|
|
54
|
+
- 'lambda input: torch.tanh(((torch.sign(input[''x''])*input[''x''].abs().sqrt())*input[''xrank''])
|
|
55
|
+
@ torch.randn(input[''x''].shape[1], 16, generator=torch.Generator(device=input[''x''].device).manual_seed(1901),
|
|
56
|
+
device=input[''x''].device) / input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name:
|
|
57
|
+
output_11 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
58
|
+
age=35'
|
|
59
|
+
- 'lambda input: (lambda W,b: torch.cos(input[''xrank''] @ W / input[''fmask''].sum(1,
|
|
60
|
+
keepdim=True).clamp(min=1).sqrt() + b))(torch.randn(input[''x''].shape[1], 16, generator=torch.Generator(device=input[''x''].device).manual_seed(207),
|
|
61
|
+
device=input[''x''].device), torch.rand(16, generator=torch.Generator(device=input[''x''].device).manual_seed(208),
|
|
62
|
+
device=input[''x''].device)*6.2831853) #name: output_12 qi_mean=None qi_std=None
|
|
63
|
+
qi_max=None qd_mean=None qd_std=None qd_max=None age=30'
|
|
64
|
+
- 'lambda input: torch.tanh(input[''x''] @ torch.randn(input[''x''].shape[1], 16,
|
|
65
|
+
generator=torch.Generator(device=input[''x''].device).manual_seed(401), device=input[''x''].device)
|
|
66
|
+
/ input[''fmask''].sum(1, keepdim=True).clamp(min=1).sqrt()) #name: output_13 qi_mean=None
|
|
67
|
+
qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None age=30'
|
|
68
|
+
- 'lambda input: (lambda W,b: torch.cos(input[''x''] @ W / input[''fmask''].sum(1,
|
|
69
|
+
keepdim=True).clamp(min=1).sqrt() + b))(torch.randn(input[''x''].shape[1], 24, generator=torch.Generator(device=input[''x''].device).manual_seed(201),
|
|
70
|
+
device=input[''x''].device), torch.rand(24, generator=torch.Generator(device=input[''x''].device).manual_seed(202),
|
|
71
|
+
device=input[''x''].device)*6.2831853) #name: output_14 qi_mean=None qi_std=None
|
|
72
|
+
qi_max=None qd_mean=None qd_std=None qd_max=None age=28'
|
|
73
|
+
- 'lambda input: (lambda x, fm: (lambda C: (torch.exp(-((x.unsqueeze(2) - C) ** 2))
|
|
74
|
+
* fm.unsqueeze(2)).sum(1) / fm.sum(1, keepdim=True).clamp(min=1))(torch.linspace(-2.0,
|
|
75
|
+
2.0, 8, device=x.device).reshape(1, 1, 8)))(input[''xrank''], input[''fmask'']) #name:
|
|
76
|
+
output_15 qi_mean=None qi_std=None qi_max=None qd_mean=None qd_std=None qd_max=None
|
|
77
|
+
age=12'
|
evoforest_tab/combine.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Combine TabMap with a tabular foundation model (e.g. TabPFN) into a single predictor.
|
|
2
|
+
|
|
3
|
+
Implements the paper's complementarity result: an evolved closed-form map and a pretrained transformer
|
|
4
|
+
capture different structure (one tends to win classification, the other regression), so combining them
|
|
5
|
+
beats either alone. Three methods, in increasing adaptivity:
|
|
6
|
+
|
|
7
|
+
- ``blend`` : fixed 50/50 average of the base models' outputs (naive; only helps when matched).
|
|
8
|
+
- ``compwt`` : weight each base by its support-cross-validated competence (label-free, no meta-learner).
|
|
9
|
+
- ``meta`` : a learned ridge head over the base models' out-of-fold support predictions (most robust).
|
|
10
|
+
|
|
11
|
+
All combination is leakage-safe and *in-context*: weights/head are fit on the SUPPORT set via out-of-fold
|
|
12
|
+
predictions (no query labels). Base models are any in-context estimators with the scikit-learn surface
|
|
13
|
+
(``fit(X, y)`` + ``predict``/``predict_proba``), e.g. ``TabMapClassifier`` and TabPFN's client.
|
|
14
|
+
|
|
15
|
+
Example
|
|
16
|
+
-------
|
|
17
|
+
>>> from evoforest_tab import TabMapClassifier, StackedTabularEnsemble
|
|
18
|
+
>>> from tabpfn_client import TabPFNClassifier
|
|
19
|
+
>>> ens = StackedTabularEnsemble([TabMapClassifier(), TabPFNClassifier()], task="classification",
|
|
20
|
+
... method="meta").fit(X_support, y_support)
|
|
21
|
+
>>> proba = ens.predict_proba(X_query)
|
|
22
|
+
"""
|
|
23
|
+
import numpy as np
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from sklearn.base import clone
|
|
27
|
+
except ImportError:
|
|
28
|
+
def clone(e): # minimal fallback
|
|
29
|
+
return e.__class__(**(e.get_params() if hasattr(e, "get_params") else {}))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _ridge_fit_predict(Xtr, Ytr, Xte, lam=1.0):
|
|
33
|
+
"""Tiny closed-form ridge meta-head (numpy) with an intercept. Inputs are model outputs (probas /
|
|
34
|
+
predictions, already on a comparable bounded scale), so we \emph{center} rather than divide by the
|
|
35
|
+
per-column std---dividing by a near-zero std on a degenerate (near-constant) base output otherwise
|
|
36
|
+
explodes the fit on imbalanced datasets."""
|
|
37
|
+
mu = Xtr.mean(0, keepdims=True); ybar = Ytr.mean(0, keepdims=True)
|
|
38
|
+
Xc, Xqc = Xtr - mu, Xte - mu
|
|
39
|
+
A = Xc.T @ Xc + lam * np.eye(Xc.shape[1])
|
|
40
|
+
W = np.linalg.solve(A, Xc.T @ (Ytr - ybar))
|
|
41
|
+
return Xqc @ W + ybar
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class StackedTabularEnsemble:
    """Stack/blend in-context tabular base models (e.g. TabMap + TabPFN).

    Parameters
    ----------
    base_models : list of estimators (in-context; sklearn surface). For classification each must expose
        ``predict_proba``; for regression, ``predict``.
    task : {"classification", "regression"}.
    method : {"meta", "compwt", "blend"} (default "meta").
        ``blend`` averages the base outputs uniformly, ``compwt`` weights them by their out-of-fold
        competence on the support set, ``meta`` fits a ridge head on the out-of-fold base outputs.
    n_splits : folds for the support out-of-fold predictions (default 2).
    ridge_lambda : ridge penalty of the ``meta`` head (default 1.0).
    random_state : seed of the permutation used to build the folds (default 0).
    """

    _METHODS = ("meta", "compwt", "blend")

    def __init__(self, base_models, task="classification", method="meta", n_splits=2, ridge_lambda=1.0,
                 random_state=0):
        self.base_models = list(base_models)
        self.task = task
        self.method = method
        self.n_splits = n_splits
        self.ridge_lambda = ridge_lambda
        self.random_state = random_state

    # ---- helpers --------------------------------------------------------
    def _is_clf(self):
        """Return True when the ensemble is configured for classification."""
        return self.task == "classification"

    def _base_out(self, model, Xq):
        """Return one base model's output on ``Xq`` in the ensemble's layout.

        Regression: a flat vector of predictions. Classification: a ``(rows, n_classes_)`` matrix
        whose columns follow ``self.classes_``; classes the model did not output stay at zero.
        """
        if not self._is_clf():
            return np.asarray(model.predict(Xq)).reshape(-1)
        p = np.asarray(model.predict_proba(Xq))
        P = np.zeros((p.shape[0], self.n_classes_))
        cls = getattr(model, "classes_", None)
        if cls is None:
            # No label information on the model: its columns can only be matched by position.
            # (Looking positions up as if they were labels would send two columns to the same
            # slot whenever the labels are integers that are not 0-based.)
            width = min(p.shape[1], self.n_classes_)
            P[:, :width] = p[:, :width]
            return P
        for j, c in enumerate(cls):
            # Known label -> its column; unknown label -> same position, if that position exists.
            idx = self._cls_index.get(c, j if j < self.n_classes_ else None)
            if idx is not None:
                P[:, idx] = p[:, j]
        return P

    def _competence(self, oof, y):
        """Score out-of-fold outputs against ``y``, clamped at 0.

        Classification: accuracy gain over always predicting the majority class, normalised by
        the headroom above that baseline. Regression: R^2.
        """
        if self._is_clf():
            acc = (oof.argmax(1) == y).mean()
            maj = np.bincount(y, minlength=self.n_classes_).argmax()
            base = (y == maj).mean()
            return max((acc - base) / (1 - base + 1e-8), 0.0)
        ss_res = ((y - oof) ** 2).sum()
        ss_tot = ((y - y.mean()) ** 2).sum() + 1e-8
        return max(1.0 - ss_res / ss_tot, 0.0)

    # ---- fit/predict ----------------------------------------------------
    def fit(self, X, y):
        """Store the support set and fit the combination weights / meta-head on it.

        Each base model is cloned and fitted on every training split to produce out-of-fold
        outputs for the support rows; those drive the competence weights and, for
        ``method="meta"``, the training matrix of the ridge head.

        Raises
        ------
        ValueError
            If ``base_models`` is empty or ``method`` is not one of the supported names.
        """
        if not self.base_models:
            raise ValueError("base_models must contain at least one estimator.")
        if self.method not in self._METHODS:
            raise ValueError(f"method must be one of {self._METHODS}, got {self.method!r}.")
        X = np.asarray(X) if not hasattr(X, "iloc") else X
        y = np.asarray(y)
        n = len(y)
        clf = self._is_clf()
        if clf:
            self.classes_, y_idx = np.unique(y, return_inverse=True)
            self.n_classes_ = len(self.classes_)
            self._cls_index = {c: i for i, c in enumerate(self.classes_)}
            y_work = y_idx
        else:
            y_work = y.astype(float)
        self._X, self._y = X, y_work
        # out-of-fold support predictions for each base model; the splits are shared by all models
        rng = np.random.RandomState(self.random_state)
        folds = np.array_split(rng.permutation(n), self.n_splits)
        all_rows = np.arange(n)
        splits = [(np.setdiff1d(all_rows, te), te) for te in folds]
        oofs = []
        for model in self.base_models:
            oof = np.zeros((n, self.n_classes_)) if clf else np.zeros(n)
            for tr, te in splits:
                if len(te) == 0 or len(tr) == 0:
                    # n_splits > n leaves empty folds, n_splits == 1 leaves nothing to train on
                    continue
                if clf and len(np.unique(y_work[tr])) < 2:
                    # a single-class training split cannot be fitted; its rows keep zero outputs
                    continue
                m = clone(model)
                Xtr = X.iloc[tr] if hasattr(X, "iloc") else X[tr]
                Xte = X.iloc[te] if hasattr(X, "iloc") else X[te]
                m.fit(Xtr, (self.classes_[y_work[tr]] if clf else y_work[tr]))
                oof[te] = self._base_out(m, Xte)
            oofs.append(oof)
        self._oofs = oofs
        # competence weights (normalized); fall back to uniform if no model is competent,
        # so a degenerate support (e.g. heavy imbalance -> all competences clamp to 0) never yields a
        # zero prediction.
        comps = [self._competence(o, y_work) for o in oofs]
        s = sum(comps)
        self._weights = [c / s for c in comps] if s > 1e-6 else [1.0 / len(comps)] * len(comps)
        if self.method == "meta":
            H = np.concatenate(oofs, 1) if clf else np.stack(oofs, 1)
            if clf:
                Y = np.eye(self.n_classes_)[y_work]
                # not read anywhere in this class; kept so the fitted attributes stay unchanged
                self._head = ("clf", H.mean(0), H.std(0), Y)
            else:
                Y = (y_work - y_work.mean())[:, None]
            # the ridge head itself is solved at predict time from these two matrices
            self._Htr = H
            self._Ytr = Y
            self._ymean = 0.0 if clf else y_work.mean()
        return self

    def _query_outs(self, Xq):
        """Refit a clone of every base model on the full support set and return its output on ``Xq``."""
        outs = []
        for model in self.base_models:
            m = clone(model)
            m.fit(self._X, (self.classes_[self._y] if self._is_clf() else self._y))
            outs.append(self._base_out(m, Xq))
        return outs

    def _combine(self, outs):
        """Merge the per-model query outputs according to ``self.method``."""
        if self.method == "blend":
            w = [1.0 / len(outs)] * len(outs)
            return sum(wi * o for wi, o in zip(w, outs))
        if self.method == "compwt":
            return sum(wi * o for wi, o in zip(self._weights, outs))
        # meta: learned ridge head over concatenated/stacked base outputs
        Hq = np.concatenate(outs, 1) if self._is_clf() else np.stack(outs, 1)
        pred = _ridge_fit_predict(self._Htr, self._Ytr, Hq, self.ridge_lambda)
        # regression targets were centered in fit(); add the support mean back
        return pred if self._is_clf() else pred[:, 0] + self._ymean

    def predict_proba(self, Xq):
        """Return class probabilities for ``Xq`` (classification only), rows normalised to sum to 1."""
        assert self._is_clf(), "predict_proba is classification-only"
        P = self._combine(self._query_outs(Xq))
        # the ridge head can output values <= 0; clip before normalising
        P = np.clip(P, 1e-9, None)
        return P / P.sum(1, keepdims=True)

    def predict(self, Xq):
        """Return predicted labels (classification) or predicted values (regression) for ``Xq``."""
        outs = self._query_outs(Xq)
        if self._is_clf():
            return self.classes_[self._combine(outs).argmax(1)]
        return self._combine(outs)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""scikit-learn-style estimators for the evolved universal tabular feature map + closed-form ridge head.
|
|
2
|
+
|
|
3
|
+
An in-context learner: ``fit`` stores the labeled support rows; ``predict`` builds the (transductive,
|
|
4
|
+
label-free) channels over the pooled support+query rows, maps them through the evolved feature map,
|
|
5
|
+
and solves a Bayesian ridge in closed form -- no gradient descent, no per-dataset tuning. An optional
|
|
6
|
+
K-member ensemble averages decorrelated random-feature draws (variance reduction toward the kernel
|
|
7
|
+
limit). Free, local, CPU-friendly, and interpretable.
|
|
8
|
+
|
|
9
|
+
Example
|
|
10
|
+
-------
|
|
11
|
+
>>> from evoforest_tab import TabMapClassifier
|
|
12
|
+
>>> clf = TabMapClassifier(n_estimators=6).fit(X_support, y_support)
|
|
13
|
+
>>> proba = clf.predict_proba(X_query)
|
|
14
|
+
"""
|
|
15
|
+
import numpy as np
|
|
16
|
+
import torch
|
|
17
|
+
|
|
18
|
+
from ._channels import DMAX_DEFAULT, build_channels
|
|
19
|
+
from ._genome import DEFAULT_GENOME, build_phi, load_genome, seed_variant
|
|
20
|
+
from ._ridge import ridge_scores
|
|
21
|
+
|
|
22
|
+
try:
    from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
except ImportError:  # sklearn optional
    class BaseEstimator:  # type: ignore
        """Minimal stand-in for ``sklearn.base.BaseEstimator`` used when scikit-learn is absent."""

        def get_params(self, deep=True):
            """Return the constructor arguments currently stored on the instance.

            The names are read from the signature of ``__init__`` and the values from the
            same-named attributes, so ``type(est)(**est.get_params())`` rebuilds an estimator
            with the same settings. ``deep`` is accepted for signature compatibility only.
            """
            import inspect

            params = {}
            for name, spec in inspect.signature(type(self).__init__).parameters.items():
                if name == "self" or spec.kind in (spec.VAR_POSITIONAL, spec.VAR_KEYWORD):
                    continue
                if hasattr(self, name):
                    params[name] = getattr(self, name)
            return params

    class ClassifierMixin:  # type: ignore
        """Empty stand-in for ``sklearn.base.ClassifierMixin``."""

    class RegressorMixin:  # type: ignore
        """Empty stand-in for ``sklearn.base.RegressorMixin``."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class _TabMapBase(BaseEstimator):
    """Shared state and feature-map plumbing of the transductive TabMap estimators."""

    def __init__(self, n_estimators: int = 6, genome_path: str = DEFAULT_GENOME,
                 cat_features=None, Dmax: int = DMAX_DEFAULT, device: str = "cpu"):
        self.n_estimators = n_estimators
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.device = device

    def _genomes(self):
        """Return the loaded genome followed by ``n_estimators - 1`` seed variants of it."""
        base = load_genome(self.genome_path)
        members = [base]
        members.extend(seed_variant(base, 1000 * k) for k in range(1, self.n_estimators))
        return members

    def _fit(self, X, y):
        """Remember the labeled support rows; nothing is computed until prediction time."""
        self._X = X if hasattr(X, "iloc") else np.asarray(X)
        self._y = np.asarray(y)
        self._n_support = len(self._y)
        return self

    def _phi_split(self, X_query):
        """Pool support+query, build channels + Phi per genome variant, yield (Phi_s, Phi_q)."""
        support = self._X
        if hasattr(support, "iloc"):
            import pandas as pd
            if hasattr(X_query, "iloc"):
                query = X_query
            else:
                # wrap an array-like query so it lines up with the support frame's columns
                query = pd.DataFrame(np.asarray(X_query), columns=support.columns)
            pooled = pd.concat([support, query], axis=0, ignore_index=True)
        else:
            pooled = np.vstack([np.asarray(support), np.asarray(X_query)])
        channels = build_channels(pooled, cat_features=self.cat_features, Dmax=self.Dmax, device=self.device)
        n_support = self._n_support
        for genome in self._genomes():
            phi = build_phi(genome, channels)
            # support rows come first in the pooled matrix, query rows after them
            yield phi[:n_support], phi[n_support:]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TabMapClassifier(_TabMapBase, ClassifierMixin):
    """In-context classifier: one-hot ridge scores per genome variant, softmaxed and averaged."""

    def fit(self, X, y):
        """Store the support set and index its labels against the sorted unique classes."""
        self._fit(X, y)
        classes, inverse = np.unique(self._y, return_inverse=True)
        self.classes_ = classes
        self._y_idx = inverse
        return self

    def predict_proba(self, X_query):
        """Return the ensemble-averaged class probabilities for ``X_query`` as a numpy array."""
        n_classes = len(self.classes_)
        n_support = self._n_support
        labels = torch.from_numpy(self._y_idx).long()
        # one-hot encode the support labels as the ridge targets
        onehot = torch.zeros(n_support, n_classes, device=self.device)
        onehot[torch.arange(n_support), labels] = 1.0
        total = None
        members = 0
        for phi_s, phi_q in self._phi_split(X_query):
            member = torch.softmax(ridge_scores(phi_s, onehot, phi_q), dim=1)
            total = member if total is None else total + member
            members += 1
        return (total / members).cpu().numpy()

    def predict(self, X_query):
        """Return the most probable class label for every query row."""
        winners = self.predict_proba(X_query).argmax(1)
        return self.classes_[winners]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class TabMapRegressor(_TabMapBase, RegressorMixin):
    """In-context regressor: ridge on mean-centered targets per genome variant, averaged."""

    def fit(self, X, y):
        """Store the support set, casting ``y`` to float when it supports ``astype``."""
        target = y.astype(float) if hasattr(y, "astype") else y
        self._fit(X, target)
        return self

    def predict(self, X_query):
        """Return the ensemble-averaged predictions for ``X_query`` as a numpy array."""
        y_t = torch.from_numpy(np.asarray(self._y, dtype=np.float64)).float().to(self.device)
        offset = y_t.mean()
        # the ridge head is fitted on centered targets; the mean is added back per member
        centered = (y_t - offset).unsqueeze(1)
        total = None
        members = 0
        for phi_s, phi_q in self._phi_split(X_query):
            member = ridge_scores(phi_s, centered, phi_q).squeeze(1) + offset
            total = member if total is None else total + member
            members += 1
        return (total / members).cpu().numpy()
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Inductive EvoForest-Tab: a standard fit->transform/predict feature map (fits channel statistics on
|
|
2
|
+
TRAINING data, applies the *fixed* maps to new rows), as opposed to the transductive/in-context tabmap
|
|
3
|
+
estimators whose predict() pools support+query. The inductive version satisfies the ordinary
|
|
4
|
+
scikit-learn contract (and therefore `check_estimator`), so it can serve as the basis for the
|
|
5
|
+
scikit-learn-contrib estimator, the skrub `TabMapEncoder` (a TransformerMixin), and a pytorch-frame /
|
|
6
|
+
AutoGluon featurizer. It reuses the SAME evolved genome (the 16 families) -- only the channels are now
|
|
7
|
+
computed inductively.
|
|
8
|
+
|
|
9
|
+
Channels (all fit on train, applied to new data):
|
|
10
|
+
x = per-column standardize (StandardScaler)
|
|
11
|
+
xrank = per-column rank-gauss (QuantileTransformer, output_distribution='normal')
|
|
12
|
+
x_freq = per-column count/frequency encode (value->train-frequency map; unseen -> 0), standardized
|
|
13
|
+
is_cat = categorical mask (from cat_features / fit-time dtype)
|
|
14
|
+
fmask = feature-presence mask
|
|
15
|
+
Padded/capped to Dmax columns (top-variance selection fit on train).
|
|
16
|
+
"""
|
|
17
|
+
import math
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import torch
|
|
21
|
+
import torch.nn.functional as F
|
|
22
|
+
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
|
|
23
|
+
from sklearn.linear_model import RidgeCV, RidgeClassifierCV
|
|
24
|
+
import numpy as _np
|
|
25
|
+
from sklearn.preprocessing import QuantileTransformer, StandardScaler
|
|
26
|
+
from sklearn.utils.validation import check_is_fitted
|
|
27
|
+
|
|
28
|
+
# reuse the evolved genome (the 16 families) from this package
|
|
29
|
+
from ._genome import load_genome, seed_variant, DEFAULT_GENOME
|
|
30
|
+
|
|
31
|
+
# names made visible to the genome source strings when _raw_phi evaluates them with eval()
_G = {"torch": torch, "F": F, "np": np, "math": math}
# default value of the `Dmax` constructor argument of the estimators in this module
DMAX = 100
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _raw_phi(genome, channels):
    """Evaluate the genome lambdas on the channel dict and stack ALL feature columns, WITHOUT the
    per-batch drop/standardize that the transductive build_phi applies (which would make the output
    width and scale batch-dependent). Inductive selection + standardization is applied by the caller
    using statistics stored at fit time.

    ``genome`` is a mapping with an ``"output"`` list of source strings and an optional
    ``"@globals"`` mapping of source strings; ``channels`` is the dict handed to each callable.
    Returns a numpy array with one row per row of ``channels["x"]``.

    NOTE(review): every genome string is executed with ``eval``, so a genome file should only be
    loaded from a trusted source.
    """
    # evaluate the genome-level globals once; they go to callables that declare a `globals` argument
    gg = {k: eval(v, _G) for k, v in (genome.get("@globals", {}) or {}).items()}
    n = channels["x"].shape[0]
    cols = []
    for lam in genome["output"]:
        # build the callable and read its positional argument names to decide what to pass
        fn = eval(lam, _G); na = fn.__code__.co_argcount; an = fn.__code__.co_varnames[:na]
        # `input` receives the channels, `globals` the evaluated globals, any other name None
        sig = fn(*[channels if a == "input" else gg if a == "globals" else None for a in an])
        if not torch.is_tensor(sig):
            sig = torch.as_tensor(sig)
        sig = sig.float()
        if sig.dim() >= 2 and sig.shape[0] == n and sig.shape[1] > 1:
            # multi-column result: every column along dim 1 becomes its own feature
            sub = [sig[:, c] for c in range(sig.shape[1])]
        else:
            # otherwise keep the result only if it squeezes to one value per row
            sig = sig.squeeze()
            sub = [sig] if (sig.dim() == 1 and sig.shape[0] == n) else []
        for col in sub:
            # replace non-finite entries (nan / +-inf) with 0
            cols.append(torch.where(torch.isfinite(col), col, torch.zeros_like(col)))
    return torch.stack(cols, dim=1).cpu().numpy()  # (n, K_raw), un-normalized, fixed order
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _check_no_inf(Xm):
    """Return ``Xm`` unchanged unless it holds +/-inf, in which case raise ``ValueError``.

    NaN passes through (it is median-imputed later and the estimators declare allow_nan=True);
    infinities are rejected so the estimator passes sklearn's check_estimators_nan_inf for
    allow_nan estimators.
    """
    has_inf = np.isinf(Xm).any()
    if not has_inf:
        return Xm
    raise ValueError("Input contains infinity or a value too large for dtype('float64').")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _as_float_matrix(X, cat_cols):
    """Coerce ``X`` to a float64 ndarray, ordinal-encoding categorical columns.

    Returns ``(Xm, cat_mask, cat_maps)``: the float matrix, a boolean per-column categorical mask,
    and ``{column position: {category: code}}`` for the DataFrame columns that were encoded (empty
    for non-DataFrame input). A DataFrame column is categorical when its dtype is
    category/object/bool or when its position or name is listed in ``cat_cols``; for other inputs
    ``cat_cols`` is used as a list of positions.

    Raises ``TypeError`` for scipy sparse input and ``ValueError`` when the result contains +/-inf.
    """
    import pandas as pd
    import scipy.sparse as sp

    if sp.issparse(X):
        raise TypeError("EvoForest-Tab does not support sparse input. Densify it first "
                        "(e.g. X.toarray()) or set a sparse-aware preprocessor upstream.")

    if not isinstance(X, pd.DataFrame):
        dense = np.asarray(X, dtype=np.float64)
        mask = np.zeros(dense.shape[1], bool)
        if cat_cols is not None:
            for position in cat_cols:
                mask[position] = True
        return _check_no_inf(dense), mask, {}

    names = list(X.columns)
    # decide the categorical flag of every column before encoding anything
    flags = []
    for position, name in enumerate(names):
        by_dtype = str(X[name].dtype) in ("category", "object", "bool")
        by_request = cat_cols is not None and (position in cat_cols or name in cat_cols)
        flags.append(by_dtype or by_request)

    encoded = []
    code_maps = {}
    for position, name in enumerate(names):
        series = X[name]
        if not flags[position]:
            encoded.append(series.to_numpy(dtype=np.float64))
            continue
        # code = rank of the value among the column's categories
        categories = list(pd.Categorical(series).categories)
        code_maps[position] = {value: code for code, value in enumerate(categories)}
        encoded.append(series.map(code_maps[position]).to_numpy(dtype=np.float64))
    return _check_no_inf(np.column_stack(encoded)), np.array(flags), code_maps
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class TabMapTransformer(TransformerMixin, BaseEstimator):
    """Inductive EvoForest-Tab feature map: fit channel maps on train, transform any rows -> phi (n, K).

    Parameters: ``genome_path`` (the evolved genome), ``cat_features`` (indices/names; else auto by dtype),
    ``Dmax`` (feature width cap), ``n_quantiles``.

    Everything learned in ``fit`` (category codes, medians, kept columns, scalers, frequency maps,
    kept phi columns and their mean/std) is stored on the instance and re-applied unchanged in
    ``transform``.
    """

    def __init__(self, genome_path=DEFAULT_GENOME, cat_features=None, Dmax=DMAX, n_quantiles=256):
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.n_quantiles = n_quantiles

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True            # we median-impute NaN (but reject inf)
        tags.transformer_tags.preserves_dtype = []  # output is always float32/64, not input dtype
        return tags

    def _validate(self, X, *, reset):
        """Numeric inputs go through sklearn's check_array (enforces 2D, rejects complex/1D, allows NaN,
        sets/checks n_features_in_) so the estimator passes check_estimator. A pandas DataFrame carrying
        categorical/object columns takes the bespoke path (check_array would reject object dtype)."""
        import pandas as pd
        from sklearn.utils.validation import check_array
        is_df = isinstance(X, pd.DataFrame)
        has_cat = is_df and any(str(d) in ("object", "category", "bool") for d in X.dtypes)
        if has_cat or (is_df and self.cat_features is not None):
            Xv, n = X, X.shape[1]
        else:
            Xv = check_array(X, dtype=np.float64, ensure_2d=True, ensure_all_finite="allow-nan",
                             ensure_min_samples=2 if reset else 1, estimator=self)
            n = Xv.shape[1]
        if reset:
            self.n_features_in_ = n
        elif n != self.n_features_in_:
            raise ValueError(f"X has {n} features, but {type(self).__name__} "
                             f"is expecting {self.n_features_in_} features as input.")
        return Xv

    def fit(self, X, y=None):
        """Learn the channel maps and the phi column selection/standardization from ``X``."""
        X = self._validate(X, reset=True)
        Xm, cat, self.cat_maps_ = _as_float_matrix(X, self.cat_features)
        # median-impute (store train medians), drop constant cols, cap to Dmax by variance
        self.medians_ = np.nanmedian(Xm, axis=0)
        Xi = np.where(np.isnan(Xm), self.medians_, Xm)
        keep = Xi.std(0) > 1e-9
        idx = np.where(keep)[0]
        if len(idx) > self.Dmax:
            idx = idx[np.argsort(-Xi[:, idx].std(0))[:self.Dmax]]
        self.keep_idx_ = idx
        self.cat_kept_ = cat[idx]
        Xk = Xi[:, idx]
        # fit the per-column channel maps
        self.scaler_x_ = StandardScaler().fit(Xk)
        self.qt_ = QuantileTransformer(output_distribution="normal",
                                       n_quantiles=min(self.n_quantiles, len(Xk)), subsample=10 ** 9).fit(Xk)
        self.freq_maps_ = [{v: c / len(Xk) for v, c in zip(*np.unique(Xk[:, j], return_counts=True))}
                           for j in range(Xk.shape[1])]
        freq = self._freq(Xk)
        self.scaler_f_ = StandardScaler().fit(freq)
        self.genome_ = load_genome(self.genome_path)
        # fix the phi columns + standardization on TRAIN (so transform output is inductive: same width
        # and scale for any batch, including a single row)
        raw = _raw_phi(self.genome_, self._channels(X))
        sd = raw.std(0)
        self.phi_keep_ = np.where(sd > 1e-9)[0]
        self.phi_mu_ = raw[:, self.phi_keep_].mean(0)
        self.phi_sd_ = sd[self.phi_keep_].clip(min=1e-9)
        self.n_output_features_ = len(self.phi_keep_)
        return self

    def _encode(self, X):
        """Coerce ``X`` to a float matrix using the category codes learned in ``fit``.

        Columns that were ordinal-encoded at fit time are mapped through the stored
        ``cat_maps_``; a value that was not seen in training becomes NaN (and is then
        median-imputed by the caller). Deriving the codes from the incoming batch instead would
        give the same category different codes in different batches.
        """
        import pandas as pd
        if not (isinstance(X, pd.DataFrame) and self.cat_maps_):
            # no stored category codes apply: plain numeric coercion
            return _as_float_matrix(X, self.cat_features)[0]
        arrs = []
        for i, c in enumerate(X.columns):
            codes = self.cat_maps_.get(i)
            if codes is None:
                arrs.append(X[c].to_numpy(dtype=np.float64))
            else:
                # object dtype makes the lookup purely value-based, whatever the batch's own dtype
                arrs.append(X[c].astype(object).map(codes).to_numpy(dtype=np.float64))
        return _check_no_inf(np.column_stack(arrs))

    def _freq(self, Xk):
        """Replace every value by its training frequency in that column (unseen values -> 0)."""
        out = np.zeros_like(Xk)
        for j in range(Xk.shape[1]):
            m = self.freq_maps_[j]
            out[:, j] = [m.get(v, 0.0) for v in Xk[:, j]]
        return out

    def _channels(self, X):
        """Build the channel dict for ``X`` from the maps stored at fit time."""
        Xm = self._encode(X)
        # impute with the train medians, then keep the columns selected at fit time
        Xi = np.where(np.isnan(Xm), self.medians_[: Xm.shape[1]], Xm)
        Xk = Xi[:, self.keep_idx_]
        n, D = Xk.shape
        Dmax = self.Dmax

        def pad(a):
            # zero-pad on the right up to Dmax columns (and never return more than Dmax)
            return np.pad(a, ((0, 0), (0, max(0, Dmax - a.shape[1]))))[:, :Dmax]

        x = pad(self.scaler_x_.transform(Xk))
        xrank = pad(self.qt_.transform(Xk))
        xfreq = pad(self.scaler_f_.transform(self._freq(Xk)))
        width = min(D, Dmax)
        fmask = np.zeros((n, Dmax))
        fmask[:, :width] = 1.0
        iscat = np.zeros((n, Dmax))
        iscat[:, :width] = self.cat_kept_[:width].astype(float)

        def t(a):
            return torch.from_numpy(np.nan_to_num(a)).float()

        return {"x": t(x), "xrank": t(xrank), "x_freq": t(xfreq), "fmask": t(fmask), "is_cat": t(iscat),
                "Dmax": Dmax}

    def transform(self, X):
        """Map rows of ``X`` to the standardized phi features selected at fit time."""
        check_is_fitted(self, "phi_keep_")
        X = self._validate(X, reset=False)
        raw = _raw_phi(self.genome_, self._channels(X))[:, self.phi_keep_]
        return (raw - self.phi_mu_) / self.phi_sd_

    def get_feature_names_out(self, input_features=None):
        """Return the output column names ``evoforest_0 .. evoforest_{K-1}``."""
        check_is_fitted(self, "n_output_features_")
        return np.array([f"evoforest_{i}" for i in range(self.n_output_features_)])
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class TabMapInductiveClassifier(ClassifierMixin, BaseEstimator):
    """Inductive classifier: EvoForest-Tab feature map -> ridge head. Standard fit->predict (passes the
    ordinary sklearn contract). Use the transductive ``evoforest_tab.TabMapClassifier`` for the
    support->query in-context setting; use this for a drop-in sklearn estimator.

    NOTE: ``alpha`` is stored but not read by this class; the head picks its penalty by
    cross-validation over a fixed grid.
    """

    def __init__(self, genome_path=DEFAULT_GENOME, cat_features=None, Dmax=DMAX, alpha=1.0):
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.alpha = alpha

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        return tags

    def fit(self, X, y):
        """Fit the feature map on ``X`` and a ``RidgeClassifierCV`` head on the mapped features."""
        if y is None:
            raise ValueError(f"{type(self).__name__} requires y to be passed, but the target y is None.")
        y = np.asarray(y)  # de-wrap _NotAnArray before np.unique
        feature_map = TabMapTransformer(self.genome_path, self.cat_features, self.Dmax)
        self._tf = feature_map.fit(X)
        self.classes_ = np.unique(y)
        self.n_features_in_ = self._tf.n_features_in_
        head = RidgeClassifierCV(alphas=_np.logspace(-2, 4, 13))
        self._head = head.fit(self._tf.transform(X), y)
        return self

    def predict(self, X):
        """Return the head's predicted labels for the mapped rows of ``X``."""
        check_is_fitted(self, "_head")
        features = self._tf.transform(X)
        return self._head.predict(features)

    def predict_proba(self, X):
        """Calibration-free probabilities from the ridge decision function (sigmoid for binary, softmax
        for multiclass) -- RidgeClassifierCV exposes only decision_function."""
        check_is_fitted(self, "_head")
        scores = self._head.decision_function(self._tf.transform(X))
        if scores.ndim != 1:
            # multiclass: softmax, shifted by the row maximum for numerical stability
            shifted = np.exp(scores - scores.max(1, keepdims=True))
            return shifted / shifted.sum(1, keepdims=True)
        # binary: a single score per row
        positive = 1.0 / (1.0 + np.exp(-scores))
        return np.column_stack([1.0 - positive, positive])
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class TabMapInductiveRegressor(RegressorMixin, BaseEstimator):
    """Inductive regressor: EvoForest-Tab feature map -> ``RidgeCV`` head (standard fit->predict).

    NOTE: ``alpha`` is stored but not read by this class; the head picks its penalty by
    cross-validation over a fixed grid.
    """

    def __init__(self, genome_path=DEFAULT_GENOME, cat_features=None, Dmax=DMAX, alpha=1.0):
        self.genome_path = genome_path
        self.cat_features = cat_features
        self.Dmax = Dmax
        self.alpha = alpha

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        return tags

    def fit(self, X, y):
        """Fit the feature map on ``X`` and a ``RidgeCV`` head on the mapped features."""
        from sklearn.utils.validation import column_or_1d
        y = column_or_1d(y, warn=True)  # warn + ravel on (n,1) y -> check_supervised_y_2d
        feature_map = TabMapTransformer(self.genome_path, self.cat_features, self.Dmax)
        self._tf = feature_map.fit(X)
        self.n_features_in_ = self._tf.n_features_in_
        head = RidgeCV(alphas=_np.logspace(-2, 4, 13))
        self._head = head.fit(self._tf.transform(X), y)
        return self

    def predict(self, X):
        """Return the head's predictions for the mapped rows of ``X``."""
        check_is_fitted(self, "_head")
        features = self._tf.transform(X)
        return self._head.predict(features)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# brand-consistent names matching the paper (TabMap* retained as aliases);
# each name below is bound to the same class object as its TabMap* counterpart
EvoForestTabTransformer = TabMapTransformer
EvoForestTabInductiveClassifier = TabMapInductiveClassifier
EvoForestTabInductiveRegressor = TabMapInductiveRegressor
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evoforest-tab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evolved universal tabular feature map + closed-form ridge: an interpretable, training-free, local in-context learner for tabular data.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: tabular,in-context-learning,feature-map,tabpfn,ridge,automl
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: torch>=1.13
|
|
11
|
+
Requires-Dist: numpy>=1.21
|
|
12
|
+
Requires-Dist: pyyaml>=5.4
|
|
13
|
+
Provides-Extra: sklearn
|
|
14
|
+
Requires-Dist: scikit-learn>=1.0; extra == "sklearn"
|
|
15
|
+
Provides-Extra: examples
|
|
16
|
+
Requires-Dist: scikit-learn>=1.0; extra == "examples"
|
|
17
|
+
Requires-Dist: pandas>=1.3; extra == "examples"
|
|
18
|
+
Dynamic: license-file
|
|
19
|
+
|
|
20
|
+
# tabmap — EvoForest-Tab: an evolved universal tabular feature map
|
|
21
|
+
|
|
22
|
+
`tabmap` (distributed as `evoforest-tab`, imported as `evoforest_tab`) is the reference implementation of **EvoForest-Tab** (the EvoForest computation-search framework specialized to tabular data).
|
|
23
|
+
|
|
24
|
+
`tabmap` is an interpretable, training-free, **local** in-context learner for tabular data: an
|
|
25
|
+
evolved universal feature map `φ: row → ℝᴷ` (16 transform families over rank-gauss, count-encoding,
|
|
26
|
+
and categorical-mask channels) paired with a per-dataset **closed-form Bayesian-ridge head**. Given a
|
|
27
|
+
labeled *support* set and an unlabeled *query* set, it predicts in a single SVD solve — no gradient
|
|
28
|
+
descent, no per-dataset tuning, no GPU. It is competitive with gradient boosting and with the
|
|
29
|
+
published **TabPFN-v2** tabular foundation model, while remaining free to run and fully inspectable.
|
|
30
|
+
|
|
31
|
+
This repository accompanies the paper *"Evolving a Universal Tabular Feature Map: Interpretable,
|
|
32
|
+
Closed-Form In-Context Learning Competitive with Tabular Foundation Models"* and is **stand-alone**:
|
|
33
|
+
the deployment pipeline (feature map + ridge) depends only on `torch`, `numpy`, and `pyyaml`.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
```bash
|
|
37
|
+
pip install -e . # editable; or: pip install .
|
|
38
|
+
# deps: torch, numpy, pyyaml; scikit-learn is also needed (imported by inductive.py, and used for the estimator base classes & examples)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage (scikit-learn style)
|
|
42
|
+
```python
|
|
43
|
+
from evoforest_tab import TabMapClassifier, TabMapRegressor
|
|
44
|
+
|
|
45
|
+
clf = TabMapClassifier(n_estimators=6).fit(X_support, y_support) # X: ndarray or DataFrame
|
|
46
|
+
proba = clf.predict_proba(X_query) # in-context: query needed to fit φ channels
|
|
47
|
+
pred = clf.predict(X_query)
|
|
48
|
+
|
|
49
|
+
reg = TabMapRegressor(n_estimators=6).fit(X_support, y_support)
|
|
50
|
+
yhat = reg.predict(X_query)
|
|
51
|
+
```
|
|
52
|
+
Notes:
|
|
53
|
+
- It is an **in-context** learner: `predict` builds the (label-free, transductive) channels over the
|
|
54
|
+
pooled support+query rows, so the query rows are needed at prediction time (as with TabPFN).
|
|
55
|
+
- `n_estimators` is the random-feature ensemble size (averaged decorrelated seed-variants of `φ`);
|
|
56
|
+
`n_estimators=1` is the single map, `6` is the paper default (variance reduction toward the kernel limit).
|
|
57
|
+
- `cat_features=[...]` marks categorical columns (indices or DataFrame names); omitted → auto-detected.
|
|
58
|
+
- No class-count ceiling (unlike TabPFN-v2's ≤10 classes); runs on CPU in milliseconds.
|
|
59
|
+
|
|
60
|
+
## What's inside
|
|
61
|
+
```
|
|
62
|
+
tabmap/
|
|
63
|
+
_channels.py raw rows -> input channels (col-z, rank-gauss, count-encoding, categorical mask), nan-safe
|
|
64
|
+
_genome.py evaluate the evolved genome (champion.yaml) -> feature matrix Phi; seed-variants for the ensemble
|
|
65
|
+
_ridge.py closed-form Bayesian-ridge head (evidence-maximized lambda), single SVD solve
|
|
66
|
+
estimator.py TabMapClassifier / TabMapRegressor (sklearn API) + K-seed ensemble
|
|
67
|
+
champion.yaml the evolved 16-family genome (the deployment artifact)
|
|
68
|
+
examples/quickstart.py
|
|
69
|
+
reproduce/ scripts + cached TabPFN-v2 predictions to reproduce the paper's experiments
|
|
70
|
+
tests/
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Reproducing the paper
|
|
74
|
+
See [`reproduce/README.md`](reproduce/README.md). The cached TabPFN-v2 cloud predictions are included
|
|
75
|
+
so the head-to-head and routing experiments reproduce **without** any API key.
|
|
76
|
+
|
|
77
|
+
## Contributing this method upstream
|
|
78
|
+
`tabmap` is designed to drop into the tabular ML ecosystem. Best integration targets (most aligned first):
|
|
79
|
+
|
|
80
|
+
| Repo | Why it fits | Integration |
|
|
81
|
+
|---|---|---|
|
|
82
|
+
| **PriorLabs/tabpfn-extensions** | community extensions around TabPFN; our method is a free/local **complementary** in-context learner and a natural **cost-aware router** companion (route hard datasets to TabPFN, the rest to `tabmap`) | add as an extension module + a routing utility (`sklearn`-compatible) |
|
|
83
|
+
| **scikit-learn-contrib** | `TabMapClassifier`/`TabMapRegressor` already follow the estimator API | publish as a standalone `scikit-learn-contrib` project |
|
|
84
|
+
| **skrub** (ex dirty-cat) | tabular feature engineering / encoders; our channels (rank-gauss, count-encoding) + `φ` are a drop-in `TransformerMixin` featurizer | contribute `TabMapEncoder` (transform-only) |
|
|
85
|
+
| **pyg-team/pytorch-frame** | deep tabular; `φ` is a fixed featurizer usable as an input stem | add as an `encoder`/`stype` transform |
|
|
86
|
+
| **autogluon / TabArena** | leaderboard model implementations | submit `tabmap` as a model for the TabArena living benchmark |
|
|
87
|
+
|
|
88
|
+
The estimator's sklearn-compatible surface (`fit`/`predict`/`predict_proba`, `get_params`) is the
|
|
89
|
+
contribution-ready API; the transform-only `build_channels`+`build_phi` path serves the encoder use-cases.
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## Combining with a foundation model (e.g. TabPFN)
|
|
93
|
+
`StackedTabularEnsemble` combines TabMap with any in-context base model (such as TabPFN's client) into a
|
|
94
|
+
single, stronger predictor -- the paper's complementarity result (our map tends to win classification,
|
|
95
|
+
TabPFN regression; combining beats either alone). Three methods: `blend` (uniform average; 50/50 for two models), `compwt`
|
|
96
|
+
(uses no query labels; weights each model by its support-cross-validated competence), `meta` (a learned ridge head
|
|
97
|
+
over the models' out-of-fold support predictions; most robust). All are leakage-safe and in-context
|
|
98
|
+
(weights/head fit on support, no query labels).
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from evoforest_tab import TabMapClassifier, StackedTabularEnsemble
|
|
102
|
+
from tabpfn_client import TabPFNClassifier # or any sklearn-surface in-context model
|
|
103
|
+
|
|
104
|
+
ens = StackedTabularEnsemble(
|
|
105
|
+
[TabMapClassifier(n_estimators=6), TabPFNClassifier()],
|
|
106
|
+
task="classification", method="meta", # "meta" | "compwt" | "blend"
|
|
107
|
+
).fit(X_support, y_support)
|
|
108
|
+
proba = ens.predict_proba(X_query)
|
|
109
|
+
```
|
|
110
|
+
The learned head (`meta`) is robust whether the two models are evenly matched or one dominates; the
|
|
111
|
+
query-label-free `compwt` is a close, deployable second with no meta-learner. See `examples/combine_tabpfn.py`.
|
|
112
|
+
|
|
113
|
+
## Citation
|
|
114
|
+
If you use this library, please cite the accompanying paper *"Evolving a Universal Tabular Feature Map:
|
|
115
|
+
Interpretable, Closed-Form In-Context Learning Competitive with Tabular Foundation Models."* (anonymized
|
|
116
|
+
for review; see `../tabular_paper/`).
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
MIT (see `LICENSE`).
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
evoforest_tab/__init__.py,sha256=hc7UzSxYLKk_L3PYiGlPXlrZJ05-ABKaJIJs_xd3hR4,1413
|
|
2
|
+
evoforest_tab/_channels.py,sha256=f_kp4gHARc7R1q-4WX3lRMBsfav1cFbIS2lz15WRxLc,5181
|
|
3
|
+
evoforest_tab/_genome.py,sha256=GlgktMKeZsDI5sxz7IrI3wzjViOVCnuv12vR4XqpDtM,2454
|
|
4
|
+
evoforest_tab/_module.py,sha256=x90Yg2z5RYnwsFQcH17GkIVwOpUu0P2remP5P594E2o,6750
|
|
5
|
+
evoforest_tab/_ridge.py,sha256=5kWanlIIc72zM7OnzVwyedgYkf5jAWUOg0mL9bhnWJk,1902
|
|
6
|
+
evoforest_tab/champion.yaml,sha256=lXCukeBH_EEKQZium6pgkptlkvg7HevJxvkXWDutDeo,6353
|
|
7
|
+
evoforest_tab/combine.py,sha256=gqjHn50rLlcowiPsJ0T_bfXELfCvF7G21BUT8strM-c,7683
|
|
8
|
+
evoforest_tab/estimator.py,sha256=uDoLhklSbZml1P0I9qcqiqtk1i7ORnWUT0OKcHXcSs8,4214
|
|
9
|
+
evoforest_tab/inductive.py,sha256=-GbZc9yFJ9PhtVKxczRKoBi09f4VqrArOUqUPCnhRdE,13173
|
|
10
|
+
evoforest_tab-0.1.0.dist-info/licenses/LICENSE,sha256=mPdFwLq00cOE_zAvx0jTNZ2Jy2Bl0D1RMu9JmYTXpyQ,1075
|
|
11
|
+
evoforest_tab-0.1.0.dist-info/METADATA,sha256=RELHHMnOhLC7aaZTD-OQLTX4GTwanql-IRL4ultoK4c,6709
|
|
12
|
+
evoforest_tab-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
13
|
+
evoforest_tab-0.1.0.dist-info/top_level.txt,sha256=RNTgLpP5y1b6xVDmwFLAxtrGm1AIBcaT_umBfEA67x4,14
|
|
14
|
+
evoforest_tab-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 the tabmap authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
evoforest_tab
|