nullbic 0.1.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nullbic/__init__.py +45 -0
- nullbic/_bindings.py +119 -0
- nullbic/_dll/nullbic.dll +0 -0
- nullbic/cli.py +52 -0
- nullbic/core.py +315 -0
- nullbic/py.typed +0 -0
- nullbic-0.1.0.dist-info/METADATA +198 -0
- nullbic-0.1.0.dist-info/RECORD +12 -0
- nullbic-0.1.0.dist-info/WHEEL +5 -0
- nullbic-0.1.0.dist-info/entry_points.txt +2 -0
- nullbic-0.1.0.dist-info/licenses/LICENSE +21 -0
- nullbic-0.1.0.dist-info/top_level.txt +1 -0
nullbic/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""nullbic — Symbolic regression with automatic null-baseline ΔBIC.
|
|
2
|
+
|
|
3
|
+
Find formulas AND prove they're not noise.
|
|
4
|
+
|
|
5
|
+
Quick start
|
|
6
|
+
-----------
|
|
7
|
+
|
|
8
|
+
>>> from nullbic import Dataset, discover
|
|
9
|
+
>>> ds = Dataset.from_csv("data.csv", target="y")
|
|
10
|
+
>>> report = discover(ds, n_generations=40)
|
|
11
|
+
>>> print(report.summary())
|
|
12
|
+
>>> report.is_real_signal()
|
|
13
|
+
True
|
|
14
|
+
|
|
15
|
+
Pandas
|
|
16
|
+
------
|
|
17
|
+
|
|
18
|
+
>>> from nullbic import Dataset, discover
|
|
19
|
+
>>> import pandas as pd
|
|
20
|
+
>>> df = pd.read_csv("data.csv")
|
|
21
|
+
>>> ds = Dataset.from_pandas(df, target="y")
|
|
22
|
+
>>> report = discover(ds)
|
|
23
|
+
|
|
24
|
+
The report always carries ΔBIC vs F=const, F=linear, and a shuffled-target
|
|
25
|
+
control. Verdict is one of NOISE / WEAK / STRONG.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from .core import (
|
|
29
|
+
Dataset,
|
|
30
|
+
DiscoverConfig,
|
|
31
|
+
DiscoverReport,
|
|
32
|
+
Verdict,
|
|
33
|
+
discover,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"Dataset",
|
|
40
|
+
"DiscoverConfig",
|
|
41
|
+
"DiscoverReport",
|
|
42
|
+
"Verdict",
|
|
43
|
+
"discover",
|
|
44
|
+
"__version__",
|
|
45
|
+
]
|
nullbic/_bindings.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""ctypes bindings to the nullbic Rust DLL. Private — use `nullbic.core` instead."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import ctypes
|
|
5
|
+
import os
|
|
6
|
+
import platform
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
# DLL location
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
def _find_dll() -> Path:
    """Return the path of the bundled native library for this platform.

    Looks in the ``_dll`` directory next to this module for the file names
    that apply to the running operating system and returns the first one
    that exists on disk.

    Raises:
        RuntimeError: if none of the candidate files exists (this includes
            platforms for which no candidate names are defined).
    """
    dll_dir = Path(__file__).resolve().parent / "_dll"
    system = platform.system()
    # Candidate file names, in lookup order, keyed by platform.system().
    names_by_system = {
        "Windows": ["nullbic.dll"],
        "Linux": ["libnullbic.so", "nullbic.so"],
        "Darwin": ["libnullbic.dylib", "nullbic.dylib"],
    }
    for name in names_by_system.get(system, []):
        candidate = dll_dir / name
        if candidate.exists():
            return candidate
    raise RuntimeError(
        f"nullbic: no shared library found for {system} in {dll_dir}. "
        f"Build the Rust crate and copy the artifact into {dll_dir}."
    )
|
|
33
|
+
|
|
34
|
+
# Locate and load the shared library once, at import time.
_DLL_PATH = _find_dll()
_lib = ctypes.CDLL(str(_DLL_PATH))


# ---------------------------------------------------------------------------
# ctypes signatures
# ---------------------------------------------------------------------------
# Handles and strings are declared as c_void_p so the raw address is kept
# (a c_char_p restype would be converted to bytes and the address lost,
# making it impossible to hand the buffer back to nullbic_string_free).

# Dataset
_lib.nullbic_dataset_new.restype = ctypes.c_void_p
_lib.nullbic_dataset_new.argtypes = [ctypes.c_char_p]

_lib.nullbic_dataset_add_feature.restype = ctypes.c_int
_lib.nullbic_dataset_add_feature.argtypes = [ctypes.c_void_p, ctypes.c_char_p]

_lib.nullbic_dataset_push_row.restype = ctypes.c_int
_lib.nullbic_dataset_push_row.argtypes = [
    ctypes.c_void_p,
    ctypes.POINTER(ctypes.c_double),
    ctypes.c_double,
]

_lib.nullbic_dataset_n_rows.restype = ctypes.c_size_t
_lib.nullbic_dataset_n_rows.argtypes = [ctypes.c_void_p]

# No result is ever read from the *_free functions; without an explicit
# restype ctypes would assume a C int return value.
_lib.nullbic_dataset_free.restype = None
_lib.nullbic_dataset_free.argtypes = [ctypes.c_void_p]

# Discover
_lib.nullbic_discover.restype = ctypes.c_void_p
_lib.nullbic_discover.argtypes = [
    ctypes.c_void_p,
    ctypes.c_uint,
    ctypes.c_size_t,
    ctypes.c_uint,
    ctypes.c_uint64,
    ctypes.c_double,
]

# Report accessors
_lib.nullbic_report_formula.restype = ctypes.c_void_p
_lib.nullbic_report_formula.argtypes = [ctypes.c_void_p]

_lib.nullbic_report_pretty.restype = ctypes.c_void_p
_lib.nullbic_report_pretty.argtypes = [ctypes.c_void_p]

_lib.nullbic_report_json.restype = ctypes.c_void_p
_lib.nullbic_report_json.argtypes = [ctypes.c_void_p]

_lib.nullbic_report_verdict.restype = ctypes.c_int
_lib.nullbic_report_verdict.argtypes = [ctypes.c_void_p]

# Every scalar accessor shares one signature: double f(report*).
for _accessor_name in (
    "nullbic_report_mse_train",
    "nullbic_report_mse_test",
    "nullbic_report_bic",
    "nullbic_report_delta_bic_const",
    "nullbic_report_delta_bic_linear",
    "nullbic_report_z_vs_shuffled",
):
    _accessor = getattr(_lib, _accessor_name)
    _accessor.restype = ctypes.c_double
    _accessor.argtypes = [ctypes.c_void_p]
# Do not leave the loop temporaries behind as module attributes.
del _accessor_name, _accessor

_lib.nullbic_report_free.restype = None
_lib.nullbic_report_free.argtypes = [ctypes.c_void_p]
_lib.nullbic_string_free.restype = None
_lib.nullbic_string_free.argtypes = [ctypes.c_void_p]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# Helpers
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
def _take_string(ptr: int) -> str:
    """Turn a native C-string pointer into a ``str`` and release the buffer.

    A null/zero pointer yields ``""``. Otherwise the NUL-terminated bytes
    are copied, decoded as UTF-8 (undecodable bytes are replaced), and the
    pointer is handed to ``nullbic_string_free``.
    """
    if ptr:
        raw = ctypes.string_at(ptr)
        text = raw.decode("utf-8", errors="replace")
        # The bytes were copied above, so the native buffer can go now.
        _lib.nullbic_string_free(ptr)
        return text
    return ""
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def lib():
    """Expose the ctypes library object that this module loaded at import.

    Meant for advanced or debugging use.
    """
    handle = _lib
    return handle
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
__all__ = ["lib", "_take_string"]
|
nullbic/_dll/nullbic.dll
ADDED
|
Binary file
|
nullbic/cli.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""nullbic CLI — `nullbic data.csv target_col`."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .core import Dataset, DiscoverConfig, Verdict, discover
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments, run one discovery, and print the report.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        ``1`` when the verdict is ``NOISE``, otherwise ``0``.
    """
    p = argparse.ArgumentParser(
        prog="nullbic",
        description="Symbolic regression with auto-falsification ΔBIC.",
    )
    p.add_argument("csv", type=Path, help="CSV file with a header row")
    p.add_argument("target", type=str, help="Name of the column to predict")
    p.add_argument("--gens", type=int, default=40, dest="n_generations")
    p.add_argument("--pop", type=int, default=200, dest="pop_size")
    p.add_argument("--depth", type=int, default=4, dest="max_depth")
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--test-frac", type=float, default=0.2, dest="test_frac")
    p.add_argument("--json", type=Path, default=None, help="Also write JSON report here")
    args = p.parse_args(argv)

    # The context manager releases the native dataset as soon as the run is
    # over, instead of leaving it to garbage collection.
    with Dataset.from_csv(args.csv, args.target) as ds:
        sys.stderr.write(
            f"loaded {len(ds)} rows, {len(ds.feature_names)} features → "
            f"target '{ds.target_name}'\n"
        )

        cfg = DiscoverConfig(
            n_generations=args.n_generations,
            pop_size=args.pop_size,
            max_depth=args.max_depth,
            seed=args.seed,
            test_frac=args.test_frac,
        )
        rep = discover(ds, cfg)

    print(rep.summary())
    print(f"\nverdict : {rep.verdict.name}")
    print(f"n_train = {rep.n_train} n_test = {rep.n_test} wall = {rep.wall_clock_ms} ms")

    if args.json:
        # The report text was decoded from UTF-8; write it back as UTF-8
        # rather than in the platform's locale encoding.
        args.json.write_text(rep.raw_json, encoding="utf-8")
        sys.stderr.write(f"wrote {args.json}\n")

    return 0 if rep.verdict != Verdict.NOISE else 1
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
if __name__ == "__main__":
|
|
52
|
+
raise SystemExit(main())
|
nullbic/core.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""Public Python API for nullbic — symbolic regression with auto-falsification."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import ctypes
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from enum import IntEnum
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Iterable, Mapping, Optional, Sequence, Union
|
|
10
|
+
|
|
11
|
+
from ._bindings import _take_string, lib
|
|
12
|
+
|
|
13
|
+
_lib = lib()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Verdict(IntEnum):
    """Outcome category attached to every discovery report.

    `discover` builds the member from the integer returned by the native
    ``nullbic_report_verdict`` accessor. The per-member notes below restate
    the author's descriptions; the exact thresholds are applied in the
    native library and cannot be confirmed from this file.
    """

    # Doesn't meaningfully beat the F=const or F=linear baselines.
    # Don't trust the formula.
    NOISE = 0

    # Beats F=const but not F=linear; the formula adds non-linearity worth
    # a marginal look.
    WEAK = 1

    # Beats F=const AND F=linear by >= 10 BIC AND is more than 2 sigma below
    # the shuffled-target distribution.
    STRONG = 2
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class DiscoverReport:
    """Result of a single `discover` call: the formula plus its baseline metrics."""

    formula: str
    verdict: Verdict
    bic: float
    mse_train: float
    mse_test: float
    delta_bic_const: float
    delta_bic_linear: float
    z_vs_shuffled: float
    # The remaining fields are optional extras with neutral defaults.
    n_train: int = 0
    n_test: int = 0
    wall_clock_ms: int = 0
    raw_json: str = ""

    def is_real_signal(self) -> bool:
        """Report whether this run was classified as ``Verdict.STRONG``."""
        return Verdict.STRONG == self.verdict

    def summary(self) -> str:
        """Render formula, verdict, MSE, BIC and the baseline deltas as text."""
        lines = [
            f"formula : {self.formula}",
            f"verdict : {self.verdict.name}",
            f"MSE : train={self.mse_train:.6g} test={self.mse_test:.6g}",
            f"BIC : {self.bic:.2f}",
            # The three baseline comparisons share one output line.
            f"ΔBIC : vs const = {self.delta_bic_const:+.2f} "
            f"vs linear = {self.delta_bic_linear:+.2f} "
            f"z vs shuffled = {self.z_vs_shuffled:+.2f}",
        ]
        return "\n".join(lines)

    def __str__(self) -> str:
        """Same text as `summary`."""
        return self.summary()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class DiscoverConfig:
    """Search settings that `discover` forwards to the native library.

    Per the original author, the defaults are sensible for tabular data of
    up to about 10k rows.
    """

    # Each value is converted to a fixed-width C type inside `discover`.
    n_generations: int = 40  # passed as unsigned int
    pop_size: int = 200  # passed as size_t
    max_depth: int = 4  # passed as unsigned int
    seed: int = 42  # passed as 64-bit unsigned int
    test_frac: float = 0.2  # passed as double
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# Dataset wrapper
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
class Dataset:
|
|
82
|
+
"""Rust-backed tabular dataset. Use the `from_*` constructors."""
|
|
83
|
+
|
|
84
|
+
def __init__(self, target_name: str):
|
|
85
|
+
ptr = _lib.nullbic_dataset_new(target_name.encode("utf-8"))
|
|
86
|
+
if not ptr:
|
|
87
|
+
raise MemoryError("nullbic_dataset_new returned NULL")
|
|
88
|
+
self._ptr = ctypes.c_void_p(ptr)
|
|
89
|
+
self._target_name = target_name
|
|
90
|
+
self._feature_names: list[str] = []
|
|
91
|
+
|
|
92
|
+
def add_feature(self, name: str) -> None:
|
|
93
|
+
"""Register a feature name. Must be called for every column before `push_row`."""
|
|
94
|
+
rc = _lib.nullbic_dataset_add_feature(self._ptr, name.encode("utf-8"))
|
|
95
|
+
if rc != 0:
|
|
96
|
+
raise ValueError(f"could not add feature '{name}'")
|
|
97
|
+
self._feature_names.append(name)
|
|
98
|
+
|
|
99
|
+
def push_row(self, features: Sequence[float], target: float) -> None:
|
|
100
|
+
"""Push one row. `features` order must match `add_feature` order."""
|
|
101
|
+
if len(features) != len(self._feature_names):
|
|
102
|
+
raise ValueError(
|
|
103
|
+
f"row has {len(features)} features, expected {len(self._feature_names)}"
|
|
104
|
+
)
|
|
105
|
+
arr = (ctypes.c_double * len(features))(*features)
|
|
106
|
+
rc = _lib.nullbic_dataset_push_row(self._ptr, arr, float(target))
|
|
107
|
+
if rc != 0:
|
|
108
|
+
raise ValueError("nullbic_dataset_push_row failed")
|
|
109
|
+
|
|
110
|
+
@property
|
|
111
|
+
def n_rows(self) -> int:
|
|
112
|
+
return int(_lib.nullbic_dataset_n_rows(self._ptr))
|
|
113
|
+
|
|
114
|
+
@property
|
|
115
|
+
def feature_names(self) -> list[str]:
|
|
116
|
+
return list(self._feature_names)
|
|
117
|
+
|
|
118
|
+
@property
|
|
119
|
+
def target_name(self) -> str:
|
|
120
|
+
return self._target_name
|
|
121
|
+
|
|
122
|
+
def __len__(self) -> int:
|
|
123
|
+
return self.n_rows
|
|
124
|
+
|
|
125
|
+
def __repr__(self) -> str:
|
|
126
|
+
return (
|
|
127
|
+
f"Dataset(target={self._target_name!r}, n={self.n_rows}, "
|
|
128
|
+
f"features={self._feature_names})"
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# ------- constructors -------
|
|
132
|
+
|
|
133
|
+
@classmethod
|
|
134
|
+
def from_records(
|
|
135
|
+
cls,
|
|
136
|
+
records: Iterable[Mapping[str, Any]],
|
|
137
|
+
target: str,
|
|
138
|
+
feature_names: Optional[Sequence[str]] = None,
|
|
139
|
+
) -> "Dataset":
|
|
140
|
+
"""Build from an iterable of dicts. Non-numeric values are dropped."""
|
|
141
|
+
recs = list(records)
|
|
142
|
+
if not recs:
|
|
143
|
+
raise ValueError("empty records")
|
|
144
|
+
if feature_names is None:
|
|
145
|
+
keys = list(recs[0].keys())
|
|
146
|
+
feature_names = [k for k in keys if k != target]
|
|
147
|
+
ds = cls(target)
|
|
148
|
+
for f in feature_names:
|
|
149
|
+
ds.add_feature(f)
|
|
150
|
+
n_rejected = 0
|
|
151
|
+
for r in recs:
|
|
152
|
+
try:
|
|
153
|
+
t = float(r[target])
|
|
154
|
+
row = [float(r.get(f, 0.0)) for f in feature_names]
|
|
155
|
+
except (TypeError, ValueError, KeyError):
|
|
156
|
+
n_rejected += 1
|
|
157
|
+
continue
|
|
158
|
+
ds.push_row(row, t)
|
|
159
|
+
if ds.n_rows == 0:
|
|
160
|
+
raise ValueError(f"all {n_rejected} rows rejected (non-numeric data?)")
|
|
161
|
+
return ds
|
|
162
|
+
|
|
163
|
+
@classmethod
|
|
164
|
+
def from_pandas(cls, df: "pandas.DataFrame", target: str) -> "Dataset":
|
|
165
|
+
"""Build from a pandas DataFrame. Non-numeric columns are skipped."""
|
|
166
|
+
try:
|
|
167
|
+
import pandas as pd # noqa: F401
|
|
168
|
+
except ImportError as e:
|
|
169
|
+
raise ImportError("pandas is not installed; `pip install nullbic[pandas]`") from e
|
|
170
|
+
if target not in df.columns:
|
|
171
|
+
raise ValueError(f"target column '{target}' not in DataFrame")
|
|
172
|
+
numeric = df.select_dtypes(include="number").copy()
|
|
173
|
+
if target not in numeric.columns:
|
|
174
|
+
raise ValueError(f"target column '{target}' is not numeric")
|
|
175
|
+
features = [c for c in numeric.columns if c != target]
|
|
176
|
+
ds = cls(target)
|
|
177
|
+
for f in features:
|
|
178
|
+
ds.add_feature(f)
|
|
179
|
+
for _, row in numeric.iterrows():
|
|
180
|
+
ds.push_row([float(row[f]) for f in features], float(row[target]))
|
|
181
|
+
return ds
|
|
182
|
+
|
|
183
|
+
@classmethod
|
|
184
|
+
def from_csv(cls, path: Union[str, Path], target: str) -> "Dataset":
|
|
185
|
+
"""Build from a CSV file. First row must be the header."""
|
|
186
|
+
path = Path(path)
|
|
187
|
+
try:
|
|
188
|
+
import pandas as pd
|
|
189
|
+
df = pd.read_csv(path)
|
|
190
|
+
return cls.from_pandas(df, target)
|
|
191
|
+
except ImportError:
|
|
192
|
+
pass
|
|
193
|
+
# Fallback: stdlib csv
|
|
194
|
+
import csv as _csv
|
|
195
|
+
|
|
196
|
+
with open(path, newline="") as fh:
|
|
197
|
+
reader = _csv.DictReader(fh)
|
|
198
|
+
rows = list(reader)
|
|
199
|
+
# Coerce numeric where possible
|
|
200
|
+
def coerce(v):
|
|
201
|
+
try:
|
|
202
|
+
return float(v)
|
|
203
|
+
except (TypeError, ValueError):
|
|
204
|
+
return None
|
|
205
|
+
clean = []
|
|
206
|
+
for r in rows:
|
|
207
|
+
r2 = {k: coerce(v) for k, v in r.items()}
|
|
208
|
+
if r2.get(target) is not None:
|
|
209
|
+
clean.append({k: (v if v is not None else 0.0) for k, v in r2.items()})
|
|
210
|
+
feature_names = [k for k in clean[0].keys() if k != target] if clean else []
|
|
211
|
+
return cls.from_records(clean, target, feature_names)
|
|
212
|
+
|
|
213
|
+
# ------- cleanup -------
|
|
214
|
+
|
|
215
|
+
def close(self) -> None:
|
|
216
|
+
if getattr(self, "_ptr", None):
|
|
217
|
+
_lib.nullbic_dataset_free(self._ptr)
|
|
218
|
+
self._ptr = ctypes.c_void_p(0)
|
|
219
|
+
|
|
220
|
+
def __del__(self):
|
|
221
|
+
try:
|
|
222
|
+
self.close()
|
|
223
|
+
except Exception:
|
|
224
|
+
pass
|
|
225
|
+
|
|
226
|
+
def __enter__(self):
|
|
227
|
+
return self
|
|
228
|
+
|
|
229
|
+
def __exit__(self, *_a):
|
|
230
|
+
self.close()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
# Discover
|
|
235
|
+
# ---------------------------------------------------------------------------
|
|
236
|
+
|
|
237
|
+
def discover(
    dataset: Dataset,
    config: Optional[DiscoverConfig] = None,
    **kwargs: Any,
) -> DiscoverReport:
    """Run symbolic regression with auto-falsification.

    Parameters
    ----------
    dataset : Dataset
        The data to fit. Must not have been closed.
    config : DiscoverConfig, optional
        GA configuration. If None, defaults are used (40 gens, 200 pop, depth 4).
        The object passed in is never modified.
    **kwargs
        Override any DiscoverConfig field inline (`seed=7`, `n_generations=100`, etc.).
        Overrides are applied to a private copy of `config`.

    Returns
    -------
    DiscoverReport
        Includes formula, verdict, BIC, and all baseline ΔBIC values.

    Raises
    ------
    TypeError
        If a keyword override does not name an existing config field.
    ValueError
        If an integer setting is negative or `dataset` is closed.
    RuntimeError
        If the native search returns NULL.
    """
    import copy

    cfg = config or DiscoverConfig()
    if kwargs:
        # Validate every name before touching anything, then work on a copy
        # so the caller's config object is left exactly as it was passed in.
        unknown = [k for k in kwargs if not hasattr(cfg, k)]
        if unknown:
            raise TypeError(f"unknown config field: {unknown[0]}")
        cfg = copy.copy(cfg)
        for k, v in kwargs.items():
            setattr(cfg, k, v)

    # These become unsigned C integers below; ctypes would silently wrap a
    # negative value into a huge positive one instead of rejecting it.
    for name in ("n_generations", "pop_size", "max_depth", "seed"):
        value = getattr(cfg, name)
        if value < 0:
            raise ValueError(f"{name} must be non-negative, got {value}")

    ds_ptr = getattr(dataset, "_ptr", None)
    if not ds_ptr:
        # Never pass a NULL handle into the native library.
        raise ValueError("dataset is closed")

    rep_ptr = _lib.nullbic_discover(
        ds_ptr,
        ctypes.c_uint(cfg.n_generations),
        ctypes.c_size_t(cfg.pop_size),
        ctypes.c_uint(cfg.max_depth),
        ctypes.c_uint64(cfg.seed),
        ctypes.c_double(cfg.test_frac),
    )
    if not rep_ptr:
        raise RuntimeError("nullbic_discover returned NULL")
    rep_ptr = ctypes.c_void_p(rep_ptr)

    try:
        formula = _take_string(_lib.nullbic_report_formula(rep_ptr))
        raw_json = _take_string(_lib.nullbic_report_json(rep_ptr))
        verdict = Verdict(_lib.nullbic_report_verdict(rep_ptr))
        bic = _lib.nullbic_report_bic(rep_ptr)
        mse_tr = _lib.nullbic_report_mse_train(rep_ptr)
        mse_te = _lib.nullbic_report_mse_test(rep_ptr)
        d_const = _lib.nullbic_report_delta_bic_const(rep_ptr)
        d_lin = _lib.nullbic_report_delta_bic_linear(rep_ptr)
        z_sh = _lib.nullbic_report_z_vs_shuffled(rep_ptr)

        # The counters only exist in the JSON payload. They are optional
        # extras, so an empty, malformed or unexpectedly shaped payload
        # falls back to zeros rather than failing the whole run.
        try:
            obj = json.loads(raw_json)
            n_train, n_test, wall = (
                int(obj.get(key, 0)) for key in ("n_train", "n_test", "wall_clock_ms")
            )
        except (ValueError, TypeError, AttributeError):
            n_train = n_test = wall = 0

        return DiscoverReport(
            formula=formula,
            verdict=verdict,
            bic=bic,
            mse_train=mse_tr,
            mse_test=mse_te,
            delta_bic_const=d_const,
            delta_bic_linear=d_lin,
            z_vs_shuffled=z_sh,
            n_train=n_train,
            n_test=n_test,
            wall_clock_ms=wall,
            raw_json=raw_json,
        )
    finally:
        # The report handle is freed on success and on any accessor failure.
        _lib.nullbic_report_free(rep_ptr)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
__all__ = ["Dataset", "DiscoverConfig", "DiscoverReport", "Verdict", "discover"]
|
nullbic/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nullbic
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Symbolic regression with automatic null-baseline ΔBIC. Find formulas AND prove they're not noise.
|
|
5
|
+
Author: nullbic contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/glogwa68/nullbic
|
|
8
|
+
Project-URL: Repository, https://github.com/glogwa68/nullbic
|
|
9
|
+
Project-URL: Issues, https://github.com/glogwa68/nullbic/issues
|
|
10
|
+
Keywords: symbolic-regression,machine-learning,interpretability,BIC,auto-falsification
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Rust
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
20
|
+
Classifier: Development Status :: 4 - Beta
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Provides-Extra: pandas
|
|
25
|
+
Requires-Dist: pandas>=1.3; extra == "pandas"
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pandas>=1.3; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
31
|
+
Requires-Dist: build; extra == "dev"
|
|
32
|
+
Requires-Dist: twine; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# nullbic
|
|
36
|
+
|
|
37
|
+
**Symbolic regression with automatic null-baseline ΔBIC.**
|
|
38
|
+
Find formulas AND prove they're not noise.
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from nullbic import Dataset, discover
|
|
42
|
+
|
|
43
|
+
ds = Dataset.from_csv("data.csv", target="y")
|
|
44
|
+
report = discover(ds, n_generations=40)
|
|
45
|
+
|
|
46
|
+
print(report.summary())
|
|
47
|
+
print("real signal?", report.is_real_signal())
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Why this exists
|
|
51
|
+
|
|
52
|
+
Existing symbolic-regression tools (PySR, Eureqa, `gplearn`,
|
|
53
|
+
`SymbolicRegression.jl`) all give you a formula. **None of them, by default,
|
|
54
|
+
tell you whether that formula meaningfully beats a constant model or a
|
|
55
|
+
trivial linear fit.** Most "discoveries" published with these tools are
|
|
56
|
+
indistinguishable from noise, and the user has no easy way to tell.
|
|
57
|
+
|
|
58
|
+
`nullbic` ships those baselines as a first-class output:
|
|
59
|
+
|
|
60
|
+
- **ΔBIC vs F = const** (the mean predictor)
|
|
61
|
+
- **ΔBIC vs F = linear** (best OLS on all features)
|
|
62
|
+
- **z-score vs shuffled-target distribution** (30 shuffles by default)
|
|
63
|
+
- **`Verdict.STRONG` / `WEAK` / `NOISE`** assigned automatically
|
|
64
|
+
|
|
65
|
+
If the discovered formula doesn't beat all three baselines, the verdict
|
|
66
|
+
downgrades. No silent overfit.
|
|
67
|
+
|
|
68
|
+
## Install
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
git clone https://github.com/glogwa68/nullbic
|
|
72
|
+
cd nullbic
|
|
73
|
+
pip install -e .
|
|
74
|
+
# optional: pandas integration
|
|
75
|
+
pip install -e ".[pandas]"
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
The Rust core ships as a pre-built shared library inside the package
|
|
79
|
+
(`nullbic/_dll/nullbic.dll` on Windows; `.so` / `.dylib` on Linux / macOS).
|
|
80
|
+
No Rust toolchain needed at install time.
|
|
81
|
+
|
|
82
|
+
## Usage
|
|
83
|
+
|
|
84
|
+
### From a pandas DataFrame
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
import pandas as pd
|
|
88
|
+
from nullbic import Dataset, discover
|
|
89
|
+
|
|
90
|
+
df = pd.read_csv("merge_results.csv")
|
|
91
|
+
ds = Dataset.from_pandas(df, target="cosine_sim")
|
|
92
|
+
rep = discover(ds, n_generations=40, pop_size=200, max_depth=4)
|
|
93
|
+
|
|
94
|
+
if rep.is_real_signal():
|
|
95
|
+
print("Formula:", rep.formula)
|
|
96
|
+
print(f"ΔBIC vs const : {rep.delta_bic_const:+.1f}")
|
|
97
|
+
print(f"ΔBIC vs linear: {rep.delta_bic_linear:+.1f}")
|
|
98
|
+
print(f"z vs shuffled : {rep.z_vs_shuffled:+.2f}")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### From a CSV
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from nullbic import Dataset, discover
|
|
105
|
+
|
|
106
|
+
ds = Dataset.from_csv("data.csv", target="y")
|
|
107
|
+
print(discover(ds).summary())
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### From a CLI
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
nullbic data.csv y --gens=40 --pop=200 --depth=4
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Exit code is `0` if the verdict is `STRONG` or `WEAK`, `1` if `NOISE` —
|
|
117
|
+
handy in CI/CD signal-validation pipelines.
|
|
118
|
+
|
|
119
|
+
## Three real use cases
|
|
120
|
+
|
|
121
|
+
### 1. Kaggle / tabular insight
|
|
122
|
+
|
|
123
|
+
Point at any cleaned dataset. Get a formula. **Get the proof it's not in
|
|
124
|
+
the noise.**
|
|
125
|
+
|
|
126
|
+
### 2. Black-box surrogate
|
|
127
|
+
|
|
128
|
+
Approximate an XGBoost / NN model with a symbolic surrogate; the verdict
|
|
129
|
+
tells you when the surrogate is meaningful vs cosmetic.
|
|
130
|
+
|
|
131
|
+
### 3. Empirical-law audit
|
|
132
|
+
|
|
133
|
+
Feed in a paper's claimed empirical relationship. The verdict says whether
|
|
134
|
+
the relationship really beats a linear baseline on the given data.
|
|
135
|
+
|
|
136
|
+
## How the verdict is assigned
|
|
137
|
+
|
|
138
|
+
| Verdict | Criteria |
|
|
139
|
+
|---------|----------|
|
|
140
|
+
| `STRONG` | ΔBIC vs const < −10 **AND** ΔBIC vs linear < −10 **AND** z vs shuffled < −2 |
|
|
141
|
+
| `WEAK` | Beats const but not all three thresholds |
|
|
142
|
+
| `NOISE` | Doesn't beat const → formula is not extracting any signal |
|
|
143
|
+
|
|
144
|
+
These thresholds match standard model-selection conventions (Kass &
|
|
145
|
+
Raftery 1995 / Schwarz 1978).
|
|
146
|
+
|
|
147
|
+
## Architecture
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
nullbic (Python package)
|
|
151
|
+
└── core.py → public API (Dataset, discover, …)
|
|
152
|
+
└── _bindings.py → ctypes layer (private)
|
|
153
|
+
└── _dll/nullbic.dll → Rust shared library (pre-built)
|
|
154
|
+
|
|
155
|
+
nullbic-core (Rust crate, ~600 LOC)
|
|
156
|
+
└── dataset → tabular rows + train/test split + shuffled
|
|
157
|
+
└── expr → expression trees over named features
|
|
158
|
+
└── optimizer → single-level GA, niching, hyper-mutation
|
|
159
|
+
└── baselines → F=const + F=linear (OLS) + shuffled-target
|
|
160
|
+
└── c_api → extern "C" entry points
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Performance
|
|
164
|
+
|
|
165
|
+
Typical run on 500 rows × 10 features, 40 generations, pop 200:
|
|
166
|
+
|
|
167
|
+
- ~30–80 ms wall-clock on one core
|
|
168
|
+
- ~150 MB peak RAM
|
|
169
|
+
- Deterministic for a given seed
|
|
170
|
+
|
|
171
|
+
The GA is parallel via Rayon; throughput scales near-linearly with cores.
|
|
172
|
+
|
|
173
|
+
## Limitations and honest caveats
|
|
174
|
+
|
|
175
|
+
- The GA is intentionally simple. PySR is more sophisticated when raw
|
|
176
|
+
accuracy is the only goal. **`nullbic` trades a few percent of accuracy
|
|
177
|
+
for the auto-falsification report.**
|
|
178
|
+
- The "linear baseline" is plain OLS with a tiny ridge for numerical
|
|
179
|
+
safety; it's not feature-engineered. If you're comparing to a serious
|
|
180
|
+
linear model, replace `delta_bic_linear` with your own ΔBIC.
|
|
181
|
+
- 580 rows is comfortable. Below ~50 rows, results are unreliable — the
|
|
182
|
+
shuffled-target distribution is too noisy to anchor the z-score.
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT. See `LICENSE`.
|
|
187
|
+
|
|
188
|
+
## Citation
|
|
189
|
+
|
|
190
|
+
If `nullbic` contributes to a paper, please cite it as:
|
|
191
|
+
|
|
192
|
+
> nullbic: symbolic regression with auto-falsification ΔBIC. 2026.
|
|
193
|
+
|
|
194
|
+
## Related
|
|
195
|
+
|
|
196
|
+
- [PySR](https://github.com/MilesCranmer/PySR) — sophisticated symbolic regression
|
|
197
|
+
- [gplearn](https://gplearn.readthedocs.io/) — sklearn-compatible GP
|
|
198
|
+
- [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl) — Julia
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
nullbic/__init__.py,sha256=epkVh-RseHaD02R3JvqtbYUEqWhcA9iA2wUD8cpcOus,949
|
|
2
|
+
nullbic/_bindings.py,sha256=jkgDS8p51G1gK4qcTPBpUoIZaXqr2wFbilCVDOdXV0c,3566
|
|
3
|
+
nullbic/cli.py,sha256=MG9iOQHm60uxfVVaFsRXksY9fPDRsPHbsBLSV_hnzqs,1797
|
|
4
|
+
nullbic/core.py,sha256=RrxLTfg9gfSQ4oBnHpqKgjiQtCh2_n2wDJFVqJosF1c,10344
|
|
5
|
+
nullbic/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
nullbic/_dll/nullbic.dll,sha256=Fh6oDvDamhuh_1NNkgsufy3EKy2XU_Jebgnu_hvzjMk,1670041
|
|
7
|
+
nullbic-0.1.0.dist-info/licenses/LICENSE,sha256=_ZS3-iHxrwL9N6dVMm1NRwTmIQs4xxM7GaK2UZYaNlc,1077
|
|
8
|
+
nullbic-0.1.0.dist-info/METADATA,sha256=5Ql_iR7-kqJdTAi6oQU5O33V3wETxYYewOBPvaxNzJE,6672
|
|
9
|
+
nullbic-0.1.0.dist-info/WHEEL,sha256=QR8DNjG6Lr6bNErJWJgF4dP2dJ2N7NpY-BWly1OvcTM,97
|
|
10
|
+
nullbic-0.1.0.dist-info/entry_points.txt,sha256=WcH5B4HG89HHz-8FSz5J6tT6Y_nHfWXjm_4zc73aGb8,45
|
|
11
|
+
nullbic-0.1.0.dist-info/top_level.txt,sha256=uU9_0EFoX7A_k1SXyug-1dovYwz5085UWdiOITgTYLc,8
|
|
12
|
+
nullbic-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 nullbic contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nullbic
|