evoforest-tab 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evoforest_tab-0.1.0/LICENSE +21 -0
- evoforest_tab-0.1.0/PKG-INFO +119 -0
- evoforest_tab-0.1.0/README.md +100 -0
- evoforest_tab-0.1.0/evoforest_tab/__init__.py +28 -0
- evoforest_tab-0.1.0/evoforest_tab/_channels.py +122 -0
- evoforest_tab-0.1.0/evoforest_tab/_genome.py +64 -0
- evoforest_tab-0.1.0/evoforest_tab/_module.py +152 -0
- evoforest_tab-0.1.0/evoforest_tab/_ridge.py +45 -0
- evoforest_tab-0.1.0/evoforest_tab/champion.yaml +77 -0
- evoforest_tab-0.1.0/evoforest_tab/combine.py +165 -0
- evoforest_tab-0.1.0/evoforest_tab/estimator.py +108 -0
- evoforest_tab-0.1.0/evoforest_tab/inductive.py +274 -0
- evoforest_tab-0.1.0/evoforest_tab.egg-info/PKG-INFO +119 -0
- evoforest_tab-0.1.0/evoforest_tab.egg-info/SOURCES.txt +19 -0
- evoforest_tab-0.1.0/evoforest_tab.egg-info/dependency_links.txt +1 -0
- evoforest_tab-0.1.0/evoforest_tab.egg-info/requires.txt +10 -0
- evoforest_tab-0.1.0/evoforest_tab.egg-info/top_level.txt +1 -0
- evoforest_tab-0.1.0/pyproject.toml +23 -0
- evoforest_tab-0.1.0/setup.cfg +4 -0
- evoforest_tab-0.1.0/tests/test_combine.py +46 -0
- evoforest_tab-0.1.0/tests/test_estimator.py +32 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 the tabmap authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evoforest-tab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evolved universal tabular feature map + closed-form ridge: an interpretable, training-free, local in-context learner for tabular data.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: tabular,in-context-learning,feature-map,tabpfn,ridge,automl
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: torch>=1.13
|
|
11
|
+
Requires-Dist: numpy>=1.21
|
|
12
|
+
Requires-Dist: pyyaml>=5.4
|
|
13
|
+
Provides-Extra: sklearn
|
|
14
|
+
Requires-Dist: scikit-learn>=1.0; extra == "sklearn"
|
|
15
|
+
Provides-Extra: examples
|
|
16
|
+
Requires-Dist: scikit-learn>=1.0; extra == "examples"
|
|
17
|
+
Requires-Dist: pandas>=1.3; extra == "examples"
|
|
18
|
+
Dynamic: license-file
|
|
19
|
+
|
|
20
|
+
# tabmap — EvoForest-Tab: an evolved universal tabular feature map
|
|
21
|
+
|
|
22
|
+
`tabmap` is the reference implementation of **EvoForest-Tab** (the EvoForest computation-search framework specialized to tabular data).
|
|
23
|
+
|
|
24
|
+
`tabmap` is an interpretable, training-free, **local** in-context learner for tabular data: an
|
|
25
|
+
evolved universal feature map `φ: row → ℝᴷ` (16 transform families over rank-gauss, count-encoding,
|
|
26
|
+
and categorical-mask channels) paired with a per-dataset **closed-form Bayesian-ridge head**. Given a
|
|
27
|
+
labeled *support* set and an unlabeled *query* set, it predicts in a single SVD solve — no gradient
|
|
28
|
+
descent, no per-dataset tuning, no GPU. It is competitive with gradient boosting and with the
|
|
29
|
+
published **TabPFN-v2** tabular foundation model, while remaining free to run and fully inspectable.
|
|
30
|
+
|
|
31
|
+
This repository accompanies the paper *"Evolving a Universal Tabular Feature Map: Interpretable,
|
|
32
|
+
Closed-Form In-Context Learning Competitive with Tabular Foundation Models"* and is **stand-alone**:
|
|
33
|
+
the deployment pipeline (feature map + ridge) depends only on `torch`, `numpy`, and `pyyaml`.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
```bash
|
|
37
|
+
pip install -e . # editable; or: pip install .
|
|
38
|
+
# deps: torch, numpy, pyyaml (+ scikit-learn for the estimator base classes & examples)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage (scikit-learn style)
|
|
42
|
+
```python
|
|
43
|
+
from evoforest_tab import TabMapClassifier, TabMapRegressor
|
|
44
|
+
|
|
45
|
+
clf = TabMapClassifier(n_estimators=6).fit(X_support, y_support) # X: ndarray or DataFrame
|
|
46
|
+
proba = clf.predict_proba(X_query) # in-context: query needed to fit φ channels
|
|
47
|
+
pred = clf.predict(X_query)
|
|
48
|
+
|
|
49
|
+
reg = TabMapRegressor(n_estimators=6).fit(X_support, y_support)
|
|
50
|
+
yhat = reg.predict(X_query)
|
|
51
|
+
```
|
|
52
|
+
Notes:
|
|
53
|
+
- It is an **in-context** learner: `predict` builds the (label-free, transductive) channels over the
|
|
54
|
+
pooled support+query rows, so the query rows are needed at prediction time (as with TabPFN).
|
|
55
|
+
- `n_estimators` is the random-feature ensemble size (averaged decorrelated seed-variants of `φ`);
|
|
56
|
+
`n_estimators=1` is the single map, `6` is the paper default (variance reduction toward the kernel limit).
|
|
57
|
+
- `cat_features=[...]` marks categorical columns (indices or DataFrame names); omitted → auto-detected.
|
|
58
|
+
- No class-count ceiling (unlike TabPFN-v2's ≤10 classes); runs on CPU in milliseconds.
|
|
59
|
+
|
|
60
|
+
## What's inside
|
|
61
|
+
```
|
|
62
|
+
tabmap/
|
|
63
|
+
_channels.py raw rows -> input channels (col-z, rank-gauss, count-encoding, categorical mask), nan-safe
|
|
64
|
+
_genome.py evaluate the evolved genome (champion.yaml) -> feature matrix Phi; seed-variants for the ensemble
|
|
65
|
+
_ridge.py closed-form Bayesian-ridge head (evidence-maximized lambda), single SVD solve
|
|
66
|
+
estimator.py TabMapClassifier / TabMapRegressor (sklearn API) + K-seed ensemble
|
|
67
|
+
champion.yaml the evolved 16-family genome (the deployment artifact)
|
|
68
|
+
examples/quickstart.py
|
|
69
|
+
reproduce/ scripts + cached TabPFN-v2 predictions to reproduce the paper's experiments
|
|
70
|
+
tests/
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Reproducing the paper
|
|
74
|
+
See [`reproduce/README.md`](reproduce/README.md). The cached TabPFN-v2 cloud predictions are included
|
|
75
|
+
so the head-to-head and routing experiments reproduce **without** any API key.
|
|
76
|
+
|
|
77
|
+
## Contributing this method upstream
|
|
78
|
+
`tabmap` is designed to drop into the tabular ML ecosystem. Best integration targets (most aligned first):
|
|
79
|
+
|
|
80
|
+
| Repo | Why it fits | Integration |
|
|
81
|
+
|---|---|---|
|
|
82
|
+
| **PriorLabs/tabpfn-extensions** | community extensions around TabPFN; our method is a free/local **complementary** in-context learner and a natural **cost-aware router** companion (route hard datasets to TabPFN, the rest to `tabmap`) | add as an extension module + a routing utility (`sklearn`-compatible) |
|
|
83
|
+
| **scikit-learn-contrib** | `TabMapClassifier`/`TabMapRegressor` already follow the estimator API | publish as a standalone `scikit-learn-contrib` project |
|
|
84
|
+
| **skrub** (ex dirty-cat) | tabular feature engineering / encoders; our channels (rank-gauss, count-encoding) + `φ` are a drop-in `TransformerMixin` featurizer | contribute `TabMapEncoder` (transform-only) |
|
|
85
|
+
| **pyg-team/pytorch-frame** | deep tabular; `φ` is a fixed featurizer usable as an input stem | add as an `encoder`/`stype` transform |
|
|
86
|
+
| **autogluon / TabArena** | leaderboard model implementations | submit `tabmap` as a model for the TabArena living benchmark |
|
|
87
|
+
|
|
88
|
+
The estimator's sklearn-compatible surface (`fit`/`predict`/`predict_proba`, `get_params`) is the
|
|
89
|
+
contribution-ready API; the transform-only `build_channels`+`build_phi` path serves the encoder use-cases.
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## Combining with a foundation model (e.g. TabPFN)
|
|
93
|
+
`StackedTabularEnsemble` combines TabMap with any in-context base model (such as TabPFN's client) into a
|
|
94
|
+
single, stronger predictor -- the paper's complementarity result (our map tends to win classification,
|
|
95
|
+
TabPFN regression; combining beats either alone). Three methods: `blend` (50/50), `compwt`
|
|
96
|
+
(label-free, weight each model by its support-cross-validated competence), `meta` (a learned ridge head
|
|
97
|
+
over the models' out-of-fold support predictions; most robust). All are leakage-safe and in-context
|
|
98
|
+
(weights/head fit on support, no query labels).
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from evoforest_tab import TabMapClassifier, StackedTabularEnsemble
|
|
102
|
+
from tabpfn_client import TabPFNClassifier # or any sklearn-surface in-context model
|
|
103
|
+
|
|
104
|
+
ens = StackedTabularEnsemble(
|
|
105
|
+
[TabMapClassifier(n_estimators=6), TabPFNClassifier()],
|
|
106
|
+
task="classification", method="meta", # "meta" | "compwt" | "blend"
|
|
107
|
+
).fit(X_support, y_support)
|
|
108
|
+
proba = ens.predict_proba(X_query)
|
|
109
|
+
```
|
|
110
|
+
The learned head (`meta`) is robust whether the two models are evenly matched or one dominates; the
|
|
111
|
+
label-free `compwt` is a close, deployable second with no meta-learner. See `examples/combine_tabpfn.py`.
|
|
112
|
+
|
|
113
|
+
## Citation
|
|
114
|
+
If you use this library, please cite the accompanying paper *"Evolving a Universal Tabular Feature Map:
|
|
115
|
+
Interpretable, Closed-Form In-Context Learning Competitive with Tabular Foundation Models."* (anonymized
|
|
116
|
+
for review; see `../tabular_paper/`).
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
MIT (see `LICENSE`).
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# tabmap — EvoForest-Tab: an evolved universal tabular feature map
|
|
2
|
+
|
|
3
|
+
`tabmap` is the reference implementation of **EvoForest-Tab** (the EvoForest computation-search framework specialized to tabular data).
|
|
4
|
+
|
|
5
|
+
`tabmap` is an interpretable, training-free, **local** in-context learner for tabular data: an
|
|
6
|
+
evolved universal feature map `φ: row → ℝᴷ` (16 transform families over rank-gauss, count-encoding,
|
|
7
|
+
and categorical-mask channels) paired with a per-dataset **closed-form Bayesian-ridge head**. Given a
|
|
8
|
+
labeled *support* set and an unlabeled *query* set, it predicts in a single SVD solve — no gradient
|
|
9
|
+
descent, no per-dataset tuning, no GPU. It is competitive with gradient boosting and with the
|
|
10
|
+
published **TabPFN-v2** tabular foundation model, while remaining free to run and fully inspectable.
|
|
11
|
+
|
|
12
|
+
This repository accompanies the paper *"Evolving a Universal Tabular Feature Map: Interpretable,
|
|
13
|
+
Closed-Form In-Context Learning Competitive with Tabular Foundation Models"* and is **stand-alone**:
|
|
14
|
+
the deployment pipeline (feature map + ridge) depends only on `torch`, `numpy`, and `pyyaml`.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
```bash
|
|
18
|
+
pip install -e . # editable; or: pip install .
|
|
19
|
+
# deps: torch, numpy, pyyaml (+ scikit-learn for the estimator base classes & examples)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage (scikit-learn style)
|
|
23
|
+
```python
|
|
24
|
+
from evoforest_tab import TabMapClassifier, TabMapRegressor
|
|
25
|
+
|
|
26
|
+
clf = TabMapClassifier(n_estimators=6).fit(X_support, y_support) # X: ndarray or DataFrame
|
|
27
|
+
proba = clf.predict_proba(X_query) # in-context: query needed to fit φ channels
|
|
28
|
+
pred = clf.predict(X_query)
|
|
29
|
+
|
|
30
|
+
reg = TabMapRegressor(n_estimators=6).fit(X_support, y_support)
|
|
31
|
+
yhat = reg.predict(X_query)
|
|
32
|
+
```
|
|
33
|
+
Notes:
|
|
34
|
+
- It is an **in-context** learner: `predict` builds the (label-free, transductive) channels over the
|
|
35
|
+
pooled support+query rows, so the query rows are needed at prediction time (as with TabPFN).
|
|
36
|
+
- `n_estimators` is the random-feature ensemble size (averaged decorrelated seed-variants of `φ`);
|
|
37
|
+
`n_estimators=1` is the single map, `6` is the paper default (variance reduction toward the kernel limit).
|
|
38
|
+
- `cat_features=[...]` marks categorical columns (indices or DataFrame names); omitted → auto-detected.
|
|
39
|
+
- No class-count ceiling (unlike TabPFN-v2's ≤10 classes); runs on CPU in milliseconds.
|
|
40
|
+
|
|
41
|
+
## What's inside
|
|
42
|
+
```
|
|
43
|
+
tabmap/
|
|
44
|
+
_channels.py raw rows -> input channels (col-z, rank-gauss, count-encoding, categorical mask), nan-safe
|
|
45
|
+
_genome.py evaluate the evolved genome (champion.yaml) -> feature matrix Phi; seed-variants for the ensemble
|
|
46
|
+
_ridge.py closed-form Bayesian-ridge head (evidence-maximized lambda), single SVD solve
|
|
47
|
+
estimator.py TabMapClassifier / TabMapRegressor (sklearn API) + K-seed ensemble
|
|
48
|
+
champion.yaml the evolved 16-family genome (the deployment artifact)
|
|
49
|
+
examples/quickstart.py
|
|
50
|
+
reproduce/ scripts + cached TabPFN-v2 predictions to reproduce the paper's experiments
|
|
51
|
+
tests/
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Reproducing the paper
|
|
55
|
+
See [`reproduce/README.md`](reproduce/README.md). The cached TabPFN-v2 cloud predictions are included
|
|
56
|
+
so the head-to-head and routing experiments reproduce **without** any API key.
|
|
57
|
+
|
|
58
|
+
## Contributing this method upstream
|
|
59
|
+
`tabmap` is designed to drop into the tabular ML ecosystem. Best integration targets (most aligned first):
|
|
60
|
+
|
|
61
|
+
| Repo | Why it fits | Integration |
|
|
62
|
+
|---|---|---|
|
|
63
|
+
| **PriorLabs/tabpfn-extensions** | community extensions around TabPFN; our method is a free/local **complementary** in-context learner and a natural **cost-aware router** companion (route hard datasets to TabPFN, the rest to `tabmap`) | add as an extension module + a routing utility (`sklearn`-compatible) |
|
|
64
|
+
| **scikit-learn-contrib** | `TabMapClassifier`/`TabMapRegressor` already follow the estimator API | publish as a standalone `scikit-learn-contrib` project |
|
|
65
|
+
| **skrub** (ex dirty-cat) | tabular feature engineering / encoders; our channels (rank-gauss, count-encoding) + `φ` are a drop-in `TransformerMixin` featurizer | contribute `TabMapEncoder` (transform-only) |
|
|
66
|
+
| **pyg-team/pytorch-frame** | deep tabular; `φ` is a fixed featurizer usable as an input stem | add as an `encoder`/`stype` transform |
|
|
67
|
+
| **autogluon / TabArena** | leaderboard model implementations | submit `tabmap` as a model for the TabArena living benchmark |
|
|
68
|
+
|
|
69
|
+
The estimator's sklearn-compatible surface (`fit`/`predict`/`predict_proba`, `get_params`) is the
|
|
70
|
+
contribution-ready API; the transform-only `build_channels`+`build_phi` path serves the encoder use-cases.
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
## Combining with a foundation model (e.g. TabPFN)
|
|
74
|
+
`StackedTabularEnsemble` combines TabMap with any in-context base model (such as TabPFN's client) into a
|
|
75
|
+
single, stronger predictor -- the paper's complementarity result (our map tends to win classification,
|
|
76
|
+
TabPFN regression; combining beats either alone). Three methods: `blend` (50/50), `compwt`
|
|
77
|
+
(label-free, weight each model by its support-cross-validated competence), `meta` (a learned ridge head
|
|
78
|
+
over the models' out-of-fold support predictions; most robust). All are leakage-safe and in-context
|
|
79
|
+
(weights/head fit on support, no query labels).
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from evoforest_tab import TabMapClassifier, StackedTabularEnsemble
|
|
83
|
+
from tabpfn_client import TabPFNClassifier # or any sklearn-surface in-context model
|
|
84
|
+
|
|
85
|
+
ens = StackedTabularEnsemble(
|
|
86
|
+
[TabMapClassifier(n_estimators=6), TabPFNClassifier()],
|
|
87
|
+
task="classification", method="meta", # "meta" | "compwt" | "blend"
|
|
88
|
+
).fit(X_support, y_support)
|
|
89
|
+
proba = ens.predict_proba(X_query)
|
|
90
|
+
```
|
|
91
|
+
The learned head (`meta`) is robust whether the two models are evenly matched or one dominates; the
|
|
92
|
+
label-free `compwt` is a close, deployable second with no meta-learner. See `examples/combine_tabpfn.py`.
|
|
93
|
+
|
|
94
|
+
## Citation
|
|
95
|
+
If you use this library, please cite the accompanying paper *"Evolving a Universal Tabular Feature Map:
|
|
96
|
+
Interpretable, Closed-Form In-Context Learning Competitive with Tabular Foundation Models."* (anonymized
|
|
97
|
+
for review; see `../tabular_paper/`).
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
MIT (see `LICENSE`).
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""evoforest_tab: EvoForest-Tab -- an evolved universal tabular feature map + closed-form ridge head.
|
|
2
|
+
|
|
3
|
+
An interpretable, training-free, local in-context learner competitive with tabular foundation
|
|
4
|
+
models. See README. Main entry points:
|
|
5
|
+
|
|
6
|
+
from evoforest_tab import EvoForestTabClassifier, EvoForestTabRegressor
|
|
7
|
+
"""
|
|
8
|
+
from .estimator import TabMapClassifier, TabMapRegressor
|
|
9
|
+
from ._genome import load_genome, build_phi, seed_variant, DEFAULT_GENOME
|
|
10
|
+
from ._channels import build_channels
|
|
11
|
+
from .combine import StackedTabularEnsemble
|
|
12
|
+
from ._module import EvoForestTabModule
|
|
13
|
+
from .inductive import (
|
|
14
|
+
EvoForestTabInductiveClassifier, EvoForestTabInductiveRegressor, EvoForestTabTransformer,
|
|
15
|
+
TabMapInductiveClassifier, TabMapInductiveRegressor, TabMapTransformer,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Paper-facing EvoForestTab* names. They are bound to the very same classes as the
# TabMap* names, so both spellings stay usable.
EvoForestTabClassifier = TabMapClassifier
EvoForestTabRegressor = TabMapRegressor

# Explicit public API of the package (one entry per line for easier diffs).
__all__ = [
    "EvoForestTabClassifier",
    "EvoForestTabRegressor",
    "TabMapClassifier",
    "TabMapRegressor",
    "build_channels",
    "build_phi",
    "load_genome",
    "seed_variant",
    "DEFAULT_GENOME",
    "StackedTabularEnsemble",
    "EvoForestTabModule",
    "EvoForestTabInductiveClassifier",
    "EvoForestTabInductiveRegressor",
    "EvoForestTabTransformer",
    "TabMapInductiveClassifier",
    "TabMapInductiveRegressor",
    "TabMapTransformer",
]
__version__ = "0.1.0"
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Channel construction: raw table rows -> the input channels the feature map reads.
|
|
2
|
+
|
|
3
|
+
Faithful to the development pipeline. All channels are UNSUPERVISED and TRANSDUCTIVE (computed over the
|
|
4
|
+
pooled support+query rows, label-free) so the map is leakage-safe in the in-context (support->query)
|
|
5
|
+
setting. Categoricals are ordinal-encoded; missing values are nan-safe; columns are padded/capped to Dmax.
|
|
6
|
+
"""
|
|
7
|
+
import math
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import torch
|
|
11
|
+
|
|
12
|
+
DMAX_DEFAULT = 100
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _nan_col_zscore(X: torch.Tensor) -> torch.Tensor:
    """Standardize each column of ``X`` while ignoring NaN cells.

    The mean and (population) variance of every column are computed over its
    non-NaN entries only. Cells that were NaN in the input are returned as 0.
    """
    missing = torch.isnan(X)
    zeros = torch.zeros_like(X)
    # clamp so a column made only of NaN divides by 1 instead of 0
    n_valid = (~missing).sum(0).clamp(min=1).to(X.dtype)
    col_mean = torch.where(missing, zeros, X).sum(0) / n_valid
    sq_dev = torch.where(missing, zeros, (X - col_mean) ** 2)
    col_std = (sq_dev.sum(0) / n_valid).sqrt()
    # the 1e-6 keeps constant columns finite
    scaled = (X - col_mean) / (col_std + 1e-6)
    return torch.where(missing, torch.zeros_like(scaled), scaled)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _col_rankgauss(X: torch.Tensor) -> torch.Tensor:
    """Map each column to approximately N(0, 1) through its within-column ranks.

    NaN cells are ranked last (they are replaced by +inf before sorting) and are
    set to 0 in the output. No labels are used.
    """
    missing = torch.isnan(X)
    n_rows = X.shape[0]
    # +inf sorts after every finite value, so missing cells take the top ranks
    sortable = torch.where(missing, torch.full_like(X, float("inf")), X)
    order = sortable.argsort(0)
    ranks = order.argsort(0).to(X.dtype)  # argsort of argsort == 0-based rank
    quantile = (ranks + 0.5) / n_rows
    # keep erfinv away from +/-1, where it diverges
    centered = (2 * quantile - 1).clamp(-1 + 1e-6, 1 - 1e-6)
    gauss = math.sqrt(2.0) * torch.erfinv(centered)
    return torch.where(missing, torch.zeros_like(gauss), gauss)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _col_freq(Xm: np.ndarray) -> np.ndarray:
    """Frequency-encode every column of a 2-D array.

    Each cell is replaced by the fraction of rows that hold the same value in
    that column. The result is a float64 array with the shape of ``Xm``.
    """
    n_rows, _ = Xm.shape
    freq = np.zeros_like(Xm, dtype=np.float64)
    # iterating the transpose walks the columns one by one
    for j, column in enumerate(Xm.T):
        _, inverse, counts = np.unique(column, return_inverse=True, return_counts=True)
        freq[:, j] = counts[inverse] / n_rows
    return freq
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _pad(X: torch.Tensor, Dmax: int) -> torch.Tensor:
    """Return ``X`` with exactly ``Dmax`` columns.

    Wider inputs are truncated to their first ``Dmax`` columns; narrower inputs
    are right-padded with zeros. The padding is allocated with the dtype *and
    device* of ``X`` so that tensors living on an accelerator can be padded
    without a device-mismatch error in ``torch.cat``.
    """
    n_cols = X.shape[1]
    if n_cols >= Dmax:
        return X[:, :Dmax]
    # new_zeros inherits dtype and device from X
    filler = X.new_zeros((X.shape[0], Dmax - n_cols))
    return torch.cat([X, filler], dim=1)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _to_ordinal(X, cat_features):
    """Coerce a 2-D table to a float matrix with categoricals ordinal-encoded.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n, d)
        Raw rows. Non-DataFrame input is viewed as an ``object`` array.
    cat_features : collection or None
        Column indices (and, for DataFrames, column names) to force to
        categorical. ``None`` means rely on auto-detection only.

    Returns
    -------
    (Xm, cat_mask)
        ``Xm`` is float64 of shape (n, d) with missing values kept as NaN;
        ``cat_mask`` is a bool vector marking the ordinal-encoded columns.

    Raises
    ------
    ValueError
        If a non-DataFrame input is not 2-D.

    Notes
    -----
    DataFrame columns are categorical when their dtype is category/object/bool
    or any other non-numeric dtype (e.g. the pandas string dtype), or when the
    user lists them. Array columns are categorical when the user lists them or
    when any non-missing cell is not a real number (bools count as
    non-numeric, matching the DataFrame rule). Category codes follow the
    sorted order of ``str(value)``.
    """
    try:
        import pandas as pd
        is_df = isinstance(X, pd.DataFrame)
    except ImportError:
        is_df = False
    if is_df:
        cols, cat = [], []
        for ci, c in enumerate(X.columns):
            s = X[c]
            # Anything that is not a plain numeric dtype cannot go through
            # to_numpy(float64) below, so it is encoded as categorical.
            auto_cat = (
                str(s.dtype) in ("category", "object", "bool")
                or pd.api.types.is_bool_dtype(s.dtype)
                or not pd.api.types.is_numeric_dtype(s.dtype)
            )
            user_cat = cat_features is not None and (ci in cat_features or c in cat_features)
            if auto_cat or user_cat:
                codes = s.astype("category").cat.codes.to_numpy().astype(np.float64)
                codes[codes < 0] = np.nan  # pandas uses code -1 for missing
                cols.append(codes)
                cat.append(True)
            else:
                cols.append(s.to_numpy(dtype=np.float64))
                cat.append(False)
        return np.column_stack(cols), np.array(cat, dtype=bool)

    Xm = np.asarray(X, dtype=object)
    if Xm.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_rows, n_cols); got an array with {Xm.ndim} dimension(s)"
        )
    n, D = Xm.shape
    out = np.zeros((n, D), dtype=np.float64)
    cat = np.zeros(D, dtype=bool)
    for j in range(D):
        col = Xm[:, j]
        user_cat = cat_features is not None and j in cat_features
        # A column sliced from an object array always has dtype=object, so the
        # numeric test has to look at the cell values, not at the dtype.
        is_numeric = all(_is_missing(v) or _is_real_number(v) for v in col)
        fcol = None
        if is_numeric and not user_cat:
            try:
                fcol = np.array(
                    [np.nan if _is_missing(v) else float(v) for v in col],
                    dtype=np.float64,
                )
            except (ValueError, TypeError, OverflowError):
                fcol = None  # not representable as float -> treat as categorical
        if fcol is None:
            present = sorted({str(v) for v in col if not _is_missing(v)})
            vocab = {label: code for code, label in enumerate(present)}
            # missing cells stay NaN instead of becoming their own category
            out[:, j] = [np.nan if _is_missing(v) else vocab[str(v)] for v in col]
            cat[j] = True
        else:
            out[:, j] = fcol
    return out, cat


def _is_missing(v) -> bool:
    """Return True for ``None`` and for floating-point NaN."""
    return v is None or (isinstance(v, (float, np.floating)) and v != v)


def _is_real_number(v) -> bool:
    """Return True for int/float scalars (Python or NumPy), excluding booleans."""
    if isinstance(v, (bool, np.bool_)):
        return False
    return isinstance(v, (int, float, np.integer, np.floating))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def build_channels(X, cat_features=None, Dmax: int = DMAX_DEFAULT, device="cpu"):
    """Turn raw rows ``X`` (n, d) into the channel dict consumed by the feature map.

    X: numpy array or pandas DataFrame (the POOLED support+query rows).
    cat_features: indices (or names) of categorical columns; if None, the
        detection done by ``_to_ordinal`` decides.
    Dmax: channel width; columns are capped (highest std first) or zero-padded to it.
    device: torch device on which the returned tensors are created.

    Returns a dict with float tensors of shape (n, Dmax):
      "x"      per-column z-score of the imputed values
      "xrank"  per-column rank-gauss transform
      "x_freq" z-scored frequency encoding
      "fmask"  1.0 on real (non-padding) columns
      "is_cat" 1.0 on categorical columns
    plus the integer "Dmax".

    Raises ValueError when every column is constant after imputation.
    """
    table, cat_flags = _to_ordinal(X, cat_features)

    # median-impute missing cells column by column
    col_median = np.nanmedian(table, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(table))
    table[nan_rows, nan_cols] = col_median[nan_cols]

    # drop constant columns
    varying = table.std(0) > 1e-9
    table = table[:, varying]
    cat_flags = cat_flags[varying]
    if table.shape[1] == 0:
        raise ValueError("no non-constant columns after preprocessing")

    # too wide: keep the Dmax columns with the largest standard deviation
    if table.shape[1] > Dmax:
        widest = np.argsort(-table.std(0))[:Dmax]
        table = table[:, widest]
        cat_flags = cat_flags[widest]

    n_rows, n_cols = table.shape
    used = min(n_cols, Dmax)
    values = torch.from_numpy(table).float().to(device)
    freq = torch.from_numpy(_col_freq(table)).float().to(device)

    x = _pad(_nan_col_zscore(values), Dmax)
    xrank = _pad(_col_rankgauss(values), Dmax)
    x_freq = _pad(_nan_col_zscore(freq), Dmax)

    fmask = torch.zeros(n_rows, Dmax, device=device)
    fmask[:, :used] = 1.0
    is_cat = torch.zeros(n_rows, Dmax, device=device)
    is_cat[:, :used] = torch.from_numpy(cat_flags[:used].astype(np.float32)).to(device)

    return {"x": x, "xrank": xrank, "fmask": fmask, "is_cat": is_cat, "x_freq": x_freq, "Dmax": Dmax}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Evaluate the evolved genome (a list of feature-lambdas) on the input channels to produce the
|
|
2
|
+
feature matrix Phi. The genome is the deployment artifact; each lambda is a small, inspectable
|
|
3
|
+
expression over the channels with fixed seeded random projections."""
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import torch
|
|
10
|
+
import torch.nn.functional as F
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
# Namespace in which the genome's lambda source strings are evaluated.
_G = dict(torch=torch, F=F, np=np, math=math)
# Directory of this module; the default genome file sits next to it.
_HERE = os.path.dirname(os.path.abspath(__file__))
DEFAULT_GENOME = os.path.join(_HERE, "champion.yaml")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_genome(path: str = DEFAULT_GENOME) -> dict:
    """Parse a genome YAML file and return its content.

    The file is read as UTF-8 and closed deterministically by the ``with``
    block. ``yaml.safe_load`` is used, so only plain YAML types are built.
    """
    with open(path, encoding="utf-8") as fh:
        return yaml.safe_load(fh)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def seed_variant(genome: dict, offset: int) -> dict:
    """Return a copy of ``genome`` whose every ``manual_seed(N)`` is ``manual_seed(N + offset)``.

    Only the ``"output"`` lambda strings are rewritten; the input dict is left
    untouched and all other keys are carried over by a shallow copy.
    """
    def _shift(match):
        return f"manual_seed({int(match.group(1)) + offset})"

    shifted = []
    for src in genome["output"]:
        shifted.append(re.sub(r"manual_seed\((\d+)\)", _shift, src))
    variant = dict(genome)
    variant["output"] = shifted
    return variant
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def build_phi(genome: dict, channels: dict) -> torch.Tensor:
    """Evaluate the genome's output lambdas on ``channels`` and return a standardized (n, K) matrix.

    Every string in ``genome["output"]`` is evaluated to a callable and invoked
    with ``channels`` for a parameter named ``input``, the evaluated
    ``"@globals"`` dict for a parameter named ``globals``, and ``None`` for any
    other parameter. Results with one row per sample contribute their columns;
    results of any other shape are skipped. Non-finite entries are zeroed and
    (near-)constant columns are dropped before per-column standardization.

    Raises ValueError if no usable column remains.
    """
    # NOTE(security): genome strings are executed with eval(); only use genomes
    # from a trusted source.
    global_specs = genome.get("@globals", {}) or {}
    gg = {name: eval(src, _G) for name, src in global_specs.items()}
    n_rows = channels["x"].shape[0]

    # Phase 1: run every lambda, choosing its arguments from its parameter names.
    raw_signals = []
    for src in genome["output"]:
        fn = eval(src, _G)
        code = fn.__code__
        call_args = []
        for arg_name in code.co_varnames[:code.co_argcount]:
            if arg_name == "input":
                call_args.append(channels)
            elif arg_name == "globals":
                call_args.append(gg)
            else:
                call_args.append(None)
        raw_signals.append(fn(*call_args))

    # Phase 2: flatten the signals into individual per-row feature columns.
    kept = []
    for sig in raw_signals:
        sig = (sig if torch.is_tensor(sig) else torch.as_tensor(sig)).float()
        is_matrix = sig.dim() >= 2 and sig.shape[0] == n_rows and sig.shape[1] > 1
        if is_matrix:
            candidates = [sig[:, c] for c in range(sig.shape[1])]
        else:
            flat = sig.squeeze()
            if flat.dim() != 1 or flat.shape[0] != n_rows:
                continue  # not one value per row -> unusable
            candidates = [flat]
        for col in candidates:
            col = torch.where(torch.isfinite(col), col, torch.zeros_like(col))
            if col.std() < 1e-12:
                continue  # constant column carries no information
            kept.append(col)

    if not kept:
        raise ValueError("genome produced no usable feature columns")

    Phi = torch.stack(kept, dim=1)
    center = Phi.mean(0, keepdim=True)
    scale = Phi.std(0, keepdim=True).clamp(min=1e-12)
    return (Phi - center) / scale
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Compile the evolved genome (champion.yaml) into a self-contained ``torch.nn.Module``.
|
|
2
|
+
|
|
3
|
+
The genome is the *architecture spec* produced by EvoForest search. In the reference path each family's
|
|
4
|
+
seeded random projection is re-drawn on every forward via ``torch.randn(..., generator=manual_seed(N))``
|
|
5
|
+
inside an ``eval``'d lambda. That is fine for a frozen feature map but (a) cannot be fine-tuned, (b) re-draws
|
|
6
|
+
on every call, and (c) is not a saveable artifact.
|
|
7
|
+
|
|
8
|
+
``EvoForestTabModule`` fixes all three: it materializes every seeded ``randn``/``rand`` draw **once** as a
|
|
9
|
+
frozen ``nn.Parameter`` (``requires_grad=False`` by default), so the module is a self-contained, saveable,
|
|
10
|
+
HuggingFace-publishable checkpoint whose ``forward`` is byte-identical to the reference evaluation -- and the
|
|
11
|
+
random features become fine-tunable simply by calling :meth:`unfreeze_random_features`. The deterministic
|
|
12
|
+
families (rank/stats/frequency) carry no parameters and are reproduced exactly.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import math
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import torch
|
|
20
|
+
import torch.nn.functional as F
|
|
21
|
+
from torch import nn
|
|
22
|
+
|
|
23
|
+
from ._genome import load_genome, DEFAULT_GENOME
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class _SeededWeightBank(nn.Module):
    """Materialize seeded ``torch.randn``/``torch.rand`` draws as frozen Parameters, keyed by (op, seed, shape).

    A draw is generated once, from a fresh CPU generator seeded with ``seed``,
    and then served from ``self.weights`` on every later request with the same
    key. The shape may be given either as separate ints (``(2, 3)``) or as the
    single-sequence form that ``torch.randn((2, 3))`` produces when forwarded
    through ``*shape`` (``((2, 3),)``); both spellings map to the same entry.
    """

    def __init__(self) -> None:
        super().__init__()
        self.weights = nn.ParameterDict()

    @staticmethod
    def _normalize_shape(shape) -> tuple:
        """Flatten ``((a, b),)`` to ``(a, b)`` and coerce every dim to ``int``."""
        if len(shape) == 1 and isinstance(shape[0], (tuple, list)):
            shape = shape[0]  # torch.Size is a tuple subclass, so it is covered too
        return tuple(int(s) for s in shape)

    @staticmethod
    def _key(op: str, shape, seed: int) -> str:
        """Build the ParameterDict key, e.g. ``randn__seed7__100x32``."""
        dims = _SeededWeightBank._normalize_shape(shape)
        return f"{op}__seed{int(seed)}__" + "x".join(str(d) for d in dims)

    def get(self, op: str, shape, seed: int, device=None) -> torch.Tensor:
        """Return the banked draw for ``(op, shape, seed)``, creating it on first use.

        ``op == "randn"`` selects ``torch.randn``; any other value selects
        ``torch.rand``. When ``device`` is given, the result of ``.to(device)``
        is returned instead of the stored Parameter itself.
        """
        dims = self._normalize_shape(shape)
        key = self._key(op, dims, seed)
        if key not in self.weights:
            gen = torch.Generator().manual_seed(int(seed))  # CPU draw == reference on CPU
            sampler = torch.randn if op == "randn" else torch.rand
            draw = sampler(dims, generator=gen)
            self.weights[key] = nn.Parameter(draw, requires_grad=False)  # frozen by default
        w = self.weights[key]
        return w.to(device) if device is not None else w
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class _TorchProxy:
    """Drop-in replacement for the ``torch`` name inside genome lambdas.

    ``randn``/``rand`` calls that carry a ``generator`` are answered from the
    weight bank, keyed by the generator's initial seed; calls without one, and
    every other attribute, are forwarded to the real ``torch`` module.
    """

    def __init__(self, bank: "_SeededWeightBank") -> None:
        # bypass any attribute hooks: store the bank directly on the instance
        object.__setattr__(self, "_bank", bank)

    def __getattr__(self, name):
        # only reached for names not found on the proxy itself
        return getattr(torch, name)

    def randn(self, *shape, generator=None, device=None, dtype=None):
        if generator is not None:
            return self._bank.get("randn", shape, generator.initial_seed(), device)
        return torch.randn(*shape, device=device, dtype=dtype)

    def rand(self, *shape, generator=None, device=None, dtype=None):
        if generator is not None:
            return self._bank.get("rand", shape, generator.initial_seed(), device)
        return torch.rand(*shape, device=device, dtype=dtype)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class EvoForestTabModule(nn.Module):
|
|
69
|
+
"""Evolved EvoForest-Tab feature map phi as a compiled, fine-tunable, saveable ``nn.Module``.
|
|
70
|
+
|
|
71
|
+
Parameters
|
|
72
|
+
----------
|
|
73
|
+
genome : dict | None
|
|
74
|
+
The parsed genome (defaults to the released champion).
|
|
75
|
+
dmax : int
|
|
76
|
+
Padded channel width the module is built for (random-projection shapes depend only on this).
|
|
77
|
+
|
|
78
|
+
Notes
|
|
79
|
+
-----
|
|
80
|
+
``forward(channels)`` takes the channel dict from :func:`evoforest_tab.build_channels` and returns the
|
|
81
|
+
normalized ``(n, K)`` feature matrix, byte-identical to :func:`evoforest_tab.build_phi`. Random-feature
|
|
82
|
+
weights are frozen by default; call :meth:`unfreeze_random_features` to fine-tune them (e.g. at BSC), then
|
|
83
|
+
``state_dict()`` is a publishable derivative checkpoint.
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
def __init__(self, genome: dict | None = None, dmax: int = 100) -> None:
|
|
87
|
+
super().__init__()
|
|
88
|
+
genome = genome if genome is not None else load_genome(DEFAULT_GENOME)
|
|
89
|
+
self.output_src = list(genome["output"])
|
|
90
|
+
self.dmax = dmax
|
|
91
|
+
self.bank = _SeededWeightBank()
|
|
92
|
+
self._proxy = _TorchProxy(self.bank)
|
|
93
|
+
env = {"torch": self._proxy, "F": F, "np": np, "math": math}
|
|
94
|
+
self._fns = [eval(src, env) for src in self.output_src] # proxy-bound lambdas
|
|
95
|
+
with torch.no_grad(): # eager-materialize the bank
|
|
96
|
+
self._materialize()
|
|
97
|
+
|
|
98
|
+
# ------------------------------------------------------------------ build / fine-tune controls
|
|
99
|
+
def _materialize(self) -> None:
|
|
100
|
+
dummy = self._dummy_channels()
|
|
101
|
+
for fn in self._fns:
|
|
102
|
+
try:
|
|
103
|
+
fn(dummy) # fires the seeded randn/rand -> banks the weights
|
|
104
|
+
except Exception: # noqa: BLE001 - dummy may degenerate post-draw; weights are banked
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
def _dummy_channels(self) -> dict:
|
|
108
|
+
z = torch.randn(4, self.dmax)
|
|
109
|
+
return {"x": z, "xrank": z.clone(), "x_freq": z.clone(),
|
|
110
|
+
"fmask": torch.ones(4, self.dmax), "is_cat": torch.zeros(4, self.dmax), "Dmax": self.dmax}
|
|
111
|
+
|
|
112
|
+
def unfreeze_random_features(self) -> "EvoForestTabModule":
|
|
113
|
+
for p in self.bank.weights.values():
|
|
114
|
+
p.requires_grad_(True)
|
|
115
|
+
return self
|
|
116
|
+
|
|
117
|
+
def freeze_random_features(self) -> "EvoForestTabModule":
|
|
118
|
+
for p in self.bank.weights.values():
|
|
119
|
+
p.requires_grad_(False)
|
|
120
|
+
return self
|
|
121
|
+
|
|
122
|
+
@property
|
|
123
|
+
def n_random_parameters(self) -> int:
|
|
124
|
+
return sum(p.numel() for p in self.bank.weights.values())
|
|
125
|
+
|
|
126
|
+
# ------------------------------------------------------------------ forward (mirrors build_phi exactly)
|
|
127
|
+
def forward(self, channels: dict) -> torch.Tensor:
|
|
128
|
+
n = channels["x"].shape[0]
|
|
129
|
+
cols = []
|
|
130
|
+
for fn in self._fns:
|
|
131
|
+
sig = fn(channels)
|
|
132
|
+
if not torch.is_tensor(sig):
|
|
133
|
+
sig = torch.as_tensor(sig)
|
|
134
|
+
sig = sig.float()
|
|
135
|
+
if sig.dim() >= 2 and sig.shape[0] == n and sig.shape[1] > 1:
|
|
136
|
+
sub = [sig[:, c] for c in range(sig.shape[1])]
|
|
137
|
+
else:
|
|
138
|
+
sig = sig.squeeze()
|
|
139
|
+
if sig.dim() != 1 or sig.shape[0] != n:
|
|
140
|
+
continue
|
|
141
|
+
sub = [sig]
|
|
142
|
+
for col in sub:
|
|
143
|
+
col = torch.where(torch.isfinite(col), col, torch.zeros_like(col))
|
|
144
|
+
if col.std() < 1e-12:
|
|
145
|
+
continue
|
|
146
|
+
cols.append(col)
|
|
147
|
+
if not cols:
|
|
148
|
+
raise ValueError("genome produced no usable feature columns")
|
|
149
|
+
phi = torch.stack(cols, dim=1)
|
|
150
|
+
mu = phi.mean(0, keepdim=True)
|
|
151
|
+
sigma = phi.std(0, keepdim=True).clamp(min=1e-12)
|
|
152
|
+
return (phi - mu) / sigma
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Closed-form ridge head with Bayesian evidence-maximized regularization (MacKay/Tipping).
|
|
2
|
+
|
|
3
|
+
The head is fit on the support features and applied to the query features in a single SVD solve --
|
|
4
|
+
no gradient descent, no per-dataset hyperparameter grid. lambda is set by evidence maximization,
|
|
5
|
+
which is markedly more stable than a leave-one-out grid in the few-shot (K>>n) regime.
|
|
6
|
+
"""
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
LAM_FLOOR = 1e-2


def evidence_lambda(U, S, Y, n_iter: int = 60) -> float:
    """Bayesian-ridge ``lambda = alpha / beta`` by evidence maximization, reusing the support SVD.

    ``alpha`` (weight precision) and ``beta`` (noise precision) are shared across the
    columns of ``Y``. The result is clamped to ``[LAM_FLOOR, 1e8]``; the floor matters
    in the K > n interpolation regime.

    Parameters
    ----------
    U, S
        Left singular vectors and singular values of the standardized support features.
    Y
        Targets, ``(n, M)``; a 1-D ``(n,)`` target is treated as a single column.
    n_iter : int
        Maximum number of fixed-point updates. Iteration stops early once the
        relative change in lambda drops below ``1e-3``.

    Returns
    -------
    float
        The selected regularization strength.
    """
    if Y.dim() == 1:
        Y = Y.unsqueeze(1)
    n, M = Y.shape
    s2 = S ** 2
    UtY = U.transpose(0, 1) @ Y
    # Energy of Y outside span(U). It is non-negative mathematically, but float
    # cancellation can push it below zero when U spans (nearly) all of Y, which
    # would flip the sign of beta; clamp it.
    resid = ((Y ** 2).sum() - (UtY ** 2).sum()).clamp(min=0)
    lam = torch.tensor(1.0, dtype=S.dtype, device=S.device)
    for _ in range(n_iter):
        denom = s2 + lam
        h = s2 / denom  # per-direction shrinkage factors
        d = S / denom
        wsq = ((d.unsqueeze(1) * UtY) ** 2).sum()  # squared norm of the ridge weights
        rss = resid + (((1 - h).unsqueeze(1) * UtY) ** 2).sum()  # residual sum of squares
        gamma = h.sum()  # effective number of parameters
        alpha = (M * gamma) / (wsq + 1e-12)
        beta = (M * n - M * gamma) / (rss + 1e-12)
        lam_new = (alpha / (beta + 1e-12)).clamp(min=LAM_FLOOR, max=1e8)
        converged = (lam_new - lam).abs() < 1e-3 * lam
        lam = lam_new
        if converged:
            break
    return float(lam)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def ridge_scores(Phi_s, Y, Phi_q, lam=None):
    """Standardize by support stats, solve ridge in closed form, return query scores.

    Parameters
    ----------
    Phi_s
        Support features, one row per support example.
    Y
        Support targets, ``(n, M)``; a 1-D ``(n,)`` target is treated as one column.
    Phi_q
        Query features with the same columns as ``Phi_s``.
    lam : float | None
        Ridge strength. When ``None`` it is chosen by :func:`evidence_lambda`.

    Returns
    -------
    Query scores of shape ``(nq, M)``, or ``(nq,)`` when ``Y`` was 1-D.
    """
    # A 1-D target would otherwise broadcast against the (r, 1) shrinkage
    # column below and silently yield an (r, r) block instead of one column.
    single_target = Y.dim() == 1
    if single_target:
        Y = Y.unsqueeze(1)

    # Both sets are standardized with the support statistics only.
    mu = Phi_s.mean(0, keepdim=True)
    sd = Phi_s.std(0, keepdim=True).clamp(min=1e-8)
    Phi_s = (Phi_s - mu) / sd
    Phi_q = (Phi_q - mu) / sd

    U, S, Vt = torch.linalg.svd(Phi_s, full_matrices=False)
    UY = U.transpose(0, 1) @ Y
    if lam is None:
        lam = evidence_lambda(U, S, Y)
    # Ridge solution through the SVD: W = V diag(S / (S^2 + lam)) U^T Y.
    W = Vt.transpose(0, 1) @ ((S / (S ** 2 + lam)).unsqueeze(1) * UY)
    scores = Phi_q @ W
    return scores.squeeze(1) if single_target else scores
|