nullbic 0.1.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nullbic/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ """nullbic — Symbolic regression with automatic null-baseline ΔBIC.
2
+
3
+ Find formulas AND prove they're not noise.
4
+
5
+ Quick start
6
+ -----------
7
+
8
+ >>> from nullbic import Dataset, discover
9
+ >>> ds = Dataset.from_csv("data.csv", target="y")
10
+ >>> report = discover(ds, n_generations=40)
11
+ >>> print(report.summary())
12
+ >>> report.is_real_signal()
13
+ True
14
+
15
+ Pandas
16
+ ------
17
+
18
+ >>> from nullbic import Dataset, discover
19
+ >>> import pandas as pd
20
+ >>> df = pd.read_csv("data.csv")
21
+ >>> ds = Dataset.from_pandas(df, target="y")
22
+ >>> report = discover(ds)
23
+
24
+ The report always carries ΔBIC vs F=const, F=linear, and a shuffled-target
25
+ control. Verdict is one of NOISE / WEAK / STRONG.
26
+ """
27
+
28
+ from .core import (
29
+ Dataset,
30
+ DiscoverConfig,
31
+ DiscoverReport,
32
+ Verdict,
33
+ discover,
34
+ )
35
+
36
+ __version__ = "0.1.0"
37
+
38
+ __all__ = [
39
+ "Dataset",
40
+ "DiscoverConfig",
41
+ "DiscoverReport",
42
+ "Verdict",
43
+ "discover",
44
+ "__version__",
45
+ ]
nullbic/_bindings.py ADDED
@@ -0,0 +1,119 @@
1
+ """ctypes bindings to the nullbic Rust DLL. Private — use `nullbic.core` instead."""
2
+ from __future__ import annotations
3
+
4
+ import ctypes
5
+ import os
6
+ import platform
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # DLL location
12
+ # ---------------------------------------------------------------------------
13
+
14
+ def _find_dll() -> Path:
15
+ """Locate the bundled DLL/SO/dylib for the current platform."""
16
+ here = Path(__file__).resolve().parent / "_dll"
17
+ candidates = []
18
+ sysname = platform.system()
19
+ if sysname == "Windows":
20
+ candidates = ["nullbic.dll"]
21
+ elif sysname == "Linux":
22
+ candidates = ["libnullbic.so", "nullbic.so"]
23
+ elif sysname == "Darwin":
24
+ candidates = ["libnullbic.dylib", "nullbic.dylib"]
25
+ for c in candidates:
26
+ p = here / c
27
+ if p.exists():
28
+ return p
29
+ raise RuntimeError(
30
+ f"nullbic: no shared library found for {sysname} in {here}. "
31
+ f"Build the Rust crate and copy the artifact into {here}."
32
+ )
33
+
34
# Load the shared library once, at import time; every binding below hangs off it.
_DLL_PATH = _find_dll()
_lib = ctypes.CDLL(str(_DLL_PATH))


# ---------------------------------------------------------------------------
# ctypes signatures
# ---------------------------------------------------------------------------

def _bind(name, restype, *argtypes):
    """Declare the return type and argument types of exported function *name*.

    ``restype=None`` marks a function whose result is not used (``void``);
    without it ctypes would assume the function returns a C ``int``.
    """
    func = getattr(_lib, name)
    func.restype = restype
    func.argtypes = list(argtypes)


# Dataset
# Handles are declared as c_void_p so they travel as opaque addresses.
_bind("nullbic_dataset_new", ctypes.c_void_p, ctypes.c_char_p)
_bind("nullbic_dataset_add_feature", ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p)
_bind(
    "nullbic_dataset_push_row",
    ctypes.c_int,
    ctypes.c_void_p,
    ctypes.POINTER(ctypes.c_double),
    ctypes.c_double,
)
_bind("nullbic_dataset_n_rows", ctypes.c_size_t, ctypes.c_void_p)
_bind("nullbic_dataset_free", None, ctypes.c_void_p)

# Discover
_bind(
    "nullbic_discover",
    ctypes.c_void_p,
    ctypes.c_void_p,
    ctypes.c_uint,
    ctypes.c_size_t,
    ctypes.c_uint,
    ctypes.c_uint64,
    ctypes.c_double,
)

# Report accessors
# String accessors return c_void_p rather than c_char_p: ctypes would convert a
# c_char_p result to bytes and drop the address, which `_take_string` needs in
# order to hand the buffer back to `nullbic_string_free`.
for _name in (
    "nullbic_report_formula",
    "nullbic_report_pretty",
    "nullbic_report_json",
):
    _bind(_name, ctypes.c_void_p, ctypes.c_void_p)

_bind("nullbic_report_verdict", ctypes.c_int, ctypes.c_void_p)

for _name in (
    "nullbic_report_mse_train",
    "nullbic_report_mse_test",
    "nullbic_report_bic",
    "nullbic_report_delta_bic_const",
    "nullbic_report_delta_bic_linear",
    "nullbic_report_z_vs_shuffled",
):
    _bind(_name, ctypes.c_double, ctypes.c_void_p)
del _name  # keep the loop temporary out of the module namespace

_bind("nullbic_report_free", None, ctypes.c_void_p)
_bind("nullbic_string_free", None, ctypes.c_void_p)
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Helpers
103
+ # ---------------------------------------------------------------------------
104
+
105
+ def _take_string(ptr: int) -> str:
106
+ """Copy a Rust-allocated C string into a Python str and free the Rust buf."""
107
+ if not ptr:
108
+ return ""
109
+ s = ctypes.string_at(ptr).decode("utf-8", errors="replace")
110
+ _lib.nullbic_string_free(ptr)
111
+ return s
112
+
113
+
114
def lib():
    """Expose the module-level ctypes handle of the loaded shared library.

    Intended for advanced or debugging use.
    """
    handle = _lib
    return handle
117
+
118
+
119
+ __all__ = ["lib", "_take_string"]
Binary file
nullbic/cli.py ADDED
@@ -0,0 +1,52 @@
1
+ """nullbic CLI — `nullbic data.csv target_col`."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from .core import Dataset, DiscoverConfig, Verdict, discover
9
+
10
+
11
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``nullbic`` command."""
    p = argparse.ArgumentParser(
        prog="nullbic",
        description="Symbolic regression with auto-falsification ΔBIC.",
    )
    p.add_argument("csv", type=Path, help="CSV file with a header row")
    p.add_argument("target", type=str, help="Name of the column to predict")
    p.add_argument("--gens", type=int, default=40, dest="n_generations")
    p.add_argument("--pop", type=int, default=200, dest="pop_size")
    p.add_argument("--depth", type=int, default=4, dest="max_depth")
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--test-frac", type=float, default=0.2, dest="test_frac")
    p.add_argument("--json", type=Path, default=None, help="Also write JSON report here")
    return p


def main(argv: list[str] | None = None) -> int:
    """Run one discovery from the command line and print the report.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        ``1`` when the verdict is ``NOISE``, ``0`` otherwise.

    The load message and the "wrote ..." notice go to stderr; the report goes
    to stdout. With ``--json`` the raw JSON report is also written to the
    given path.
    """
    args = _build_parser().parse_args(argv)

    # The context manager releases the native dataset as soon as the run is done.
    with Dataset.from_csv(args.csv, args.target) as ds:
        sys.stderr.write(
            f"loaded {len(ds)} rows, {len(ds.feature_names)} features → "
            f"target '{ds.target_name}'\n"
        )
        cfg = DiscoverConfig(
            n_generations=args.n_generations,
            pop_size=args.pop_size,
            max_depth=args.max_depth,
            seed=args.seed,
            test_frac=args.test_frac,
        )
        rep = discover(ds, cfg)

    print(rep.summary())
    print(f"\nverdict : {rep.verdict.name}")
    print(f"n_train = {rep.n_train} n_test = {rep.n_test} wall = {rep.wall_clock_ms} ms")

    if args.json:
        # Write UTF-8 explicitly: the locale default (e.g. cp1252 on Windows)
        # cannot represent every character the report may contain.
        args.json.write_text(rep.raw_json, encoding="utf-8")
        sys.stderr.write(f"wrote {args.json}\n")

    return 0 if rep.verdict != Verdict.NOISE else 1
49
+
50
+
51
+ if __name__ == "__main__":
52
+ raise SystemExit(main())
nullbic/core.py ADDED
@@ -0,0 +1,315 @@
1
+ """Public Python API for nullbic — symbolic regression with auto-falsification."""
2
+ from __future__ import annotations
3
+
4
+ import ctypes
5
+ import json
6
+ from dataclasses import dataclass, field
7
+ from enum import IntEnum
8
+ from pathlib import Path
9
+ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
10
+
11
+ from ._bindings import _take_string, lib
12
+
13
+ _lib = lib()
14
+
15
+
16
class Verdict(IntEnum):
    """Three-level falsification outcome attached to every discovery run.

    The integer values are the codes that `discover` converts from the native
    ``nullbic_report_verdict`` accessor.
    """

    NOISE = 0
    """No meaningful improvement over the F=const or F=linear baselines; the formula should not be trusted."""

    WEAK = 1
    """Improves on F=const but not on F=linear; the non-linearity found deserves at most a marginal look."""

    STRONG = 2
    """Improves on both F=const and F=linear by ≥10 BIC and sits > 2σ below the shuffled-target distribution."""
27
+
28
+
29
@dataclass
class DiscoverReport:
    """Outcome of a single discovery run, including its baseline comparisons."""

    # Discovered expression, as text.
    formula: str
    # Falsification verdict for the run.
    verdict: Verdict
    # BIC of the discovered formula.
    bic: float
    # Mean squared error on the train and test splits.
    mse_train: float
    mse_test: float
    # ΔBIC against the constant and the linear baselines.
    delta_bic_const: float
    delta_bic_linear: float
    # z-score against the shuffled-target control.
    z_vs_shuffled: float
    # Split sizes and run time; 0 when not provided.
    n_train: int = 0
    n_test: int = 0
    wall_clock_ms: int = 0
    # Unparsed JSON report; empty when not provided.
    raw_json: str = ""

    def is_real_signal(self) -> bool:
        """Return True only for a STRONG verdict."""
        return self.verdict == Verdict.STRONG

    def summary(self) -> str:
        """Render the report as a five-line text block."""
        delta_line = (
            f"ΔBIC : vs const = {self.delta_bic_const:+.2f} "
            f"vs linear = {self.delta_bic_linear:+.2f} "
            f"z vs shuffled = {self.z_vs_shuffled:+.2f}"
        )
        lines = [
            f"formula : {self.formula}",
            f"verdict : {self.verdict.name}",
            f"MSE : train={self.mse_train:.6g} test={self.mse_test:.6g}",
            f"BIC : {self.bic:.2f}",
            delta_line,
        ]
        return "\n".join(lines)

    def __str__(self) -> str:
        return self.summary()
64
+
65
+
66
@dataclass
class DiscoverConfig:
    """Settings for the genetic search. The defaults target tabular data of up to about 10k rows."""

    # Number of generations to evolve.
    n_generations: int = 40
    # Population size.
    pop_size: int = 200
    # Maximum depth setting for the search.
    max_depth: int = 4
    # Random seed.
    seed: int = 42
    # Test-split fraction.
    test_frac: float = 0.2
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # Dataset wrapper
79
+ # ---------------------------------------------------------------------------
80
+
81
+ class Dataset:
82
+ """Rust-backed tabular dataset. Use the `from_*` constructors."""
83
+
84
+ def __init__(self, target_name: str):
85
+ ptr = _lib.nullbic_dataset_new(target_name.encode("utf-8"))
86
+ if not ptr:
87
+ raise MemoryError("nullbic_dataset_new returned NULL")
88
+ self._ptr = ctypes.c_void_p(ptr)
89
+ self._target_name = target_name
90
+ self._feature_names: list[str] = []
91
+
92
+ def add_feature(self, name: str) -> None:
93
+ """Register a feature name. Must be called for every column before `push_row`."""
94
+ rc = _lib.nullbic_dataset_add_feature(self._ptr, name.encode("utf-8"))
95
+ if rc != 0:
96
+ raise ValueError(f"could not add feature '{name}'")
97
+ self._feature_names.append(name)
98
+
99
+ def push_row(self, features: Sequence[float], target: float) -> None:
100
+ """Push one row. `features` order must match `add_feature` order."""
101
+ if len(features) != len(self._feature_names):
102
+ raise ValueError(
103
+ f"row has {len(features)} features, expected {len(self._feature_names)}"
104
+ )
105
+ arr = (ctypes.c_double * len(features))(*features)
106
+ rc = _lib.nullbic_dataset_push_row(self._ptr, arr, float(target))
107
+ if rc != 0:
108
+ raise ValueError("nullbic_dataset_push_row failed")
109
+
110
+ @property
111
+ def n_rows(self) -> int:
112
+ return int(_lib.nullbic_dataset_n_rows(self._ptr))
113
+
114
+ @property
115
+ def feature_names(self) -> list[str]:
116
+ return list(self._feature_names)
117
+
118
+ @property
119
+ def target_name(self) -> str:
120
+ return self._target_name
121
+
122
+ def __len__(self) -> int:
123
+ return self.n_rows
124
+
125
+ def __repr__(self) -> str:
126
+ return (
127
+ f"Dataset(target={self._target_name!r}, n={self.n_rows}, "
128
+ f"features={self._feature_names})"
129
+ )
130
+
131
+ # ------- constructors -------
132
+
133
+ @classmethod
134
+ def from_records(
135
+ cls,
136
+ records: Iterable[Mapping[str, Any]],
137
+ target: str,
138
+ feature_names: Optional[Sequence[str]] = None,
139
+ ) -> "Dataset":
140
+ """Build from an iterable of dicts. Non-numeric values are dropped."""
141
+ recs = list(records)
142
+ if not recs:
143
+ raise ValueError("empty records")
144
+ if feature_names is None:
145
+ keys = list(recs[0].keys())
146
+ feature_names = [k for k in keys if k != target]
147
+ ds = cls(target)
148
+ for f in feature_names:
149
+ ds.add_feature(f)
150
+ n_rejected = 0
151
+ for r in recs:
152
+ try:
153
+ t = float(r[target])
154
+ row = [float(r.get(f, 0.0)) for f in feature_names]
155
+ except (TypeError, ValueError, KeyError):
156
+ n_rejected += 1
157
+ continue
158
+ ds.push_row(row, t)
159
+ if ds.n_rows == 0:
160
+ raise ValueError(f"all {n_rejected} rows rejected (non-numeric data?)")
161
+ return ds
162
+
163
+ @classmethod
164
+ def from_pandas(cls, df: "pandas.DataFrame", target: str) -> "Dataset":
165
+ """Build from a pandas DataFrame. Non-numeric columns are skipped."""
166
+ try:
167
+ import pandas as pd # noqa: F401
168
+ except ImportError as e:
169
+ raise ImportError("pandas is not installed; `pip install nullbic[pandas]`") from e
170
+ if target not in df.columns:
171
+ raise ValueError(f"target column '{target}' not in DataFrame")
172
+ numeric = df.select_dtypes(include="number").copy()
173
+ if target not in numeric.columns:
174
+ raise ValueError(f"target column '{target}' is not numeric")
175
+ features = [c for c in numeric.columns if c != target]
176
+ ds = cls(target)
177
+ for f in features:
178
+ ds.add_feature(f)
179
+ for _, row in numeric.iterrows():
180
+ ds.push_row([float(row[f]) for f in features], float(row[target]))
181
+ return ds
182
+
183
+ @classmethod
184
+ def from_csv(cls, path: Union[str, Path], target: str) -> "Dataset":
185
+ """Build from a CSV file. First row must be the header."""
186
+ path = Path(path)
187
+ try:
188
+ import pandas as pd
189
+ df = pd.read_csv(path)
190
+ return cls.from_pandas(df, target)
191
+ except ImportError:
192
+ pass
193
+ # Fallback: stdlib csv
194
+ import csv as _csv
195
+
196
+ with open(path, newline="") as fh:
197
+ reader = _csv.DictReader(fh)
198
+ rows = list(reader)
199
+ # Coerce numeric where possible
200
+ def coerce(v):
201
+ try:
202
+ return float(v)
203
+ except (TypeError, ValueError):
204
+ return None
205
+ clean = []
206
+ for r in rows:
207
+ r2 = {k: coerce(v) for k, v in r.items()}
208
+ if r2.get(target) is not None:
209
+ clean.append({k: (v if v is not None else 0.0) for k, v in r2.items()})
210
+ feature_names = [k for k in clean[0].keys() if k != target] if clean else []
211
+ return cls.from_records(clean, target, feature_names)
212
+
213
+ # ------- cleanup -------
214
+
215
+ def close(self) -> None:
216
+ if getattr(self, "_ptr", None):
217
+ _lib.nullbic_dataset_free(self._ptr)
218
+ self._ptr = ctypes.c_void_p(0)
219
+
220
+ def __del__(self):
221
+ try:
222
+ self.close()
223
+ except Exception:
224
+ pass
225
+
226
+ def __enter__(self):
227
+ return self
228
+
229
+ def __exit__(self, *_a):
230
+ self.close()
231
+
232
+
233
+ # ---------------------------------------------------------------------------
234
+ # Discover
235
+ # ---------------------------------------------------------------------------
236
+
237
def discover(
    dataset: Dataset,
    config: Optional[DiscoverConfig] = None,
    **kwargs: Any,
) -> DiscoverReport:
    """Run symbolic regression with auto-falsification.

    Parameters
    ----------
    dataset : Dataset
        The data to fit. Must not be closed.
    config : DiscoverConfig, optional
        GA configuration. If None, defaults are used (40 gens, 200 pop, depth 4).
        The object passed in is never modified.
    **kwargs
        Override any DiscoverConfig field inline (`seed=7`, `n_generations=100`, etc.).
        Overrides are applied to a copy of `config`.

    Returns
    -------
    DiscoverReport
        Includes formula, verdict, BIC, and all baseline ΔBIC values.

    Raises
    ------
    TypeError
        If a keyword does not name a config field.
    ValueError
        If an integer setting is negative or too large for its C type, or if
        `dataset` has been closed.
    RuntimeError
        If the native call returns NULL.
    """
    import copy  # local import: not among this module's top-level imports

    cfg = config if config is not None else DiscoverConfig()
    if kwargs:
        for k in kwargs:
            if not hasattr(cfg, k):
                raise TypeError(f"unknown config field: {k}")
        # Apply overrides to a copy so the caller's config is left untouched.
        cfg = copy.copy(cfg)
        for k, v in kwargs.items():
            setattr(cfg, k, v)

    # ctypes integer types do no overflow checking: a negative or oversized
    # value would silently wrap around, so reject it here instead.
    unsigned_fields = (
        ("n_generations", ctypes.c_uint),
        ("pop_size", ctypes.c_size_t),
        ("max_depth", ctypes.c_uint),
        ("seed", ctypes.c_uint64),
    )
    for name, ctype in unsigned_fields:
        value = getattr(cfg, name)
        limit = (1 << (8 * ctypes.sizeof(ctype))) - 1
        if not 0 <= value <= limit:
            raise ValueError(f"{name} must be in [0, {limit}], got {value!r}")

    if not dataset._ptr:
        # Never hand a NULL dataset handle to native code.
        raise ValueError("dataset is closed")

    rep_ptr = _lib.nullbic_discover(
        dataset._ptr,
        ctypes.c_uint(cfg.n_generations),
        ctypes.c_size_t(cfg.pop_size),
        ctypes.c_uint(cfg.max_depth),
        ctypes.c_uint64(cfg.seed),
        ctypes.c_double(cfg.test_frac),
    )
    if not rep_ptr:
        raise RuntimeError("nullbic_discover returned NULL")
    rep_ptr = ctypes.c_void_p(rep_ptr)

    # Everything is copied out of the native report before it is freed.
    try:
        formula = _take_string(_lib.nullbic_report_formula(rep_ptr))
        raw_json = _take_string(_lib.nullbic_report_json(rep_ptr))
        verdict = Verdict(_lib.nullbic_report_verdict(rep_ptr))
        bic = _lib.nullbic_report_bic(rep_ptr)
        mse_tr = _lib.nullbic_report_mse_train(rep_ptr)
        mse_te = _lib.nullbic_report_mse_test(rep_ptr)
        d_const = _lib.nullbic_report_delta_bic_const(rep_ptr)
        d_lin = _lib.nullbic_report_delta_bic_linear(rep_ptr)
        z_sh = _lib.nullbic_report_z_vs_shuffled(rep_ptr)

        # Split sizes and timing are only available through the JSON payload.
        # They are best-effort: malformed JSON, a non-object payload or
        # non-integer values leave the remaining counters at 0.
        n_train = n_test = wall = 0
        if raw_json:
            try:
                obj = json.loads(raw_json)
                n_train = int(obj.get("n_train", 0))
                n_test = int(obj.get("n_test", 0))
                wall = int(obj.get("wall_clock_ms", 0))
            except (ValueError, TypeError, AttributeError):
                # json.JSONDecodeError is a ValueError subclass.
                pass

        return DiscoverReport(
            formula=formula,
            verdict=verdict,
            bic=bic,
            mse_train=mse_tr,
            mse_test=mse_te,
            delta_bic_const=d_const,
            delta_bic_linear=d_lin,
            z_vs_shuffled=z_sh,
            n_train=n_train,
            n_test=n_test,
            wall_clock_ms=wall,
            raw_json=raw_json,
        )
    finally:
        _lib.nullbic_report_free(rep_ptr)
313
+
314
+
315
+ __all__ = ["Dataset", "DiscoverConfig", "DiscoverReport", "Verdict", "discover"]
nullbic/py.typed ADDED
File without changes
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: nullbic
3
+ Version: 0.1.0
4
+ Summary: Symbolic regression with automatic null-baseline ΔBIC. Find formulas AND prove they're not noise.
5
+ Author: nullbic contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/glogwa68/nullbic
8
+ Project-URL: Repository, https://github.com/glogwa68/nullbic
9
+ Project-URL: Issues, https://github.com/glogwa68/nullbic/issues
10
+ Keywords: symbolic-regression,machine-learning,interpretability,BIC,auto-falsification
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Rust
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
20
+ Classifier: Development Status :: 4 - Beta
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Provides-Extra: pandas
25
+ Requires-Dist: pandas>=1.3; extra == "pandas"
26
+ Provides-Extra: test
27
+ Requires-Dist: pytest>=7; extra == "test"
28
+ Provides-Extra: dev
29
+ Requires-Dist: pandas>=1.3; extra == "dev"
30
+ Requires-Dist: pytest>=7; extra == "dev"
31
+ Requires-Dist: build; extra == "dev"
32
+ Requires-Dist: twine; extra == "dev"
33
+ Dynamic: license-file
34
+
35
+ # nullbic
36
+
37
+ **Symbolic regression with automatic null-baseline ΔBIC.**
38
+ Find formulas AND prove they're not noise.
39
+
40
+ ```python
41
+ from nullbic import Dataset, discover
42
+
43
+ ds = Dataset.from_csv("data.csv", target="y")
44
+ report = discover(ds, n_generations=40)
45
+
46
+ print(report.summary())
47
+ print("real signal?", report.is_real_signal())
48
+ ```
49
+
50
+ ## Why this exists
51
+
52
+ Existing symbolic-regression tools (PySR, Eureqa, `gplearn`,
53
+ `SymbolicRegression.jl`) all give you a formula. **None of them, by default,
54
+ tell you whether that formula meaningfully beats a constant model or a
55
+ trivial linear fit.** Most "discoveries" published with these tools are
56
+ indistinguishable from noise, and the user has no easy way to tell.
57
+
58
+ `nullbic` ships those baselines as a first-class output:
59
+
60
+ - **ΔBIC vs F = const** (the mean predictor)
61
+ - **ΔBIC vs F = linear** (best OLS on all features)
62
+ - **z-score vs shuffled-target distribution** (30 shuffles by default)
63
+ - **`Verdict.STRONG` / `WEAK` / `NOISE`** assigned automatically
64
+
65
+ If the discovered formula doesn't beat all three baselines, the verdict
66
+ downgrades. No silent overfit.
67
+
68
+ ## Install
69
+
70
+ ```bash
71
+ git clone https://github.com/glogwa68/nullbic
72
+ cd nullbic
73
+ pip install -e .
74
+ # optional: pandas integration
75
+ pip install -e ".[pandas]"
76
+ ```
77
+
78
+ The Rust core ships as a pre-built shared library inside the package
79
+ (`nullbic/_dll/nullbic.dll` on Windows; `.so` / `.dylib` on Linux / macOS).
80
+ No Rust toolchain needed at install time.
81
+
82
+ ## Usage
83
+
84
+ ### From a pandas DataFrame
85
+
86
+ ```python
87
+ import pandas as pd
88
+ from nullbic import Dataset, discover
89
+
90
+ df = pd.read_csv("merge_results.csv")
91
+ ds = Dataset.from_pandas(df, target="cosine_sim")
92
+ rep = discover(ds, n_generations=40, pop_size=200, max_depth=4)
93
+
94
+ if rep.is_real_signal():
95
+ print("Formula:", rep.formula)
96
+ print(f"ΔBIC vs const : {rep.delta_bic_const:+.1f}")
97
+ print(f"ΔBIC vs linear: {rep.delta_bic_linear:+.1f}")
98
+ print(f"z vs shuffled : {rep.z_vs_shuffled:+.2f}")
99
+ ```
100
+
101
+ ### From a CSV
102
+
103
+ ```python
104
+ from nullbic import Dataset, discover
105
+
106
+ ds = Dataset.from_csv("data.csv", target="y")
107
+ print(discover(ds).summary())
108
+ ```
109
+
110
+ ### From a CLI
111
+
112
+ ```bash
113
+ nullbic data.csv y --gens=40 --pop=200 --depth=4
114
+ ```
115
+
116
+ Exit code is `0` if the verdict is `STRONG` or `WEAK`, `1` if `NOISE` —
117
+ handy in CI/CD signal-validation pipelines.
118
+
119
+ ## Three real use cases
120
+
121
+ ### 1. Kaggle / tabular insight
122
+
123
+ Point at any cleaned dataset. Get a formula. **Get the proof it's not in
124
+ the noise.**
125
+
126
+ ### 2. Black-box surrogate
127
+
128
+ Approximate an XGBoost / NN model with a symbolic surrogate; the verdict
129
+ tells you when the surrogate is meaningful vs cosmetic.
130
+
131
+ ### 3. Empirical-law audit
132
+
133
+ Feed in a paper's claimed empirical relationship. The verdict says whether
134
+ the relationship really beats a linear baseline on the given data.
135
+
136
+ ## How the verdict is assigned
137
+
138
+ | Verdict | Criteria |
139
+ |---------|----------|
140
+ | `STRONG` | ΔBIC vs const < −10 **AND** ΔBIC vs linear < −10 **AND** z vs shuffled < −2 |
141
+ | `WEAK` | Beats const but not all three thresholds |
142
+ | `NOISE` | Doesn't beat const → formula is not extracting any signal |
143
+
144
+ The ΔBIC threshold of 10 follows the "very strong evidence" convention of
145
+ Kass & Raftery (1995); the BIC itself is due to Schwarz (1978).
146
+
147
+ ## Architecture
148
+
149
+ ```
150
+ nullbic (Python package)
151
+ └── core.py → public API (Dataset, discover, …)
152
+ └── _bindings.py → ctypes layer (private)
153
+ └── _dll/nullbic.dll → Rust shared library (pre-built)
154
+
155
+ nullbic-core (Rust crate, ~600 LOC)
156
+ └── dataset → tabular rows + train/test split + shuffled
157
+ └── expr → expression trees over named features
158
+ └── optimizer → single-level GA, niching, hyper-mutation
159
+ └── baselines → F=const + F=linear (OLS) + shuffled-target
160
+ └── c_api → extern "C" entry points
161
+ ```
162
+
163
+ ## Performance
164
+
165
+ Typical run on 500 rows × 10 features, 40 generations, pop 200:
166
+
167
+ - ~30–80 ms wall-clock on one core
168
+ - ~150 MB peak RAM
169
+ - Deterministic for a given seed
170
+
171
+ The GA is parallel via Rayon; throughput scales near-linearly with cores.
172
+
173
+ ## Limitations and honest caveats
174
+
175
+ - The GA is intentionally simple. PySR is more sophisticated when raw
176
+ accuracy is the only goal. **`nullbic` trades a few percent of accuracy
177
+ for the auto-falsification report.**
178
+ - The "linear baseline" is plain OLS with a tiny ridge for numerical
179
+ safety; it's not feature-engineered. If you're comparing to a serious
180
+ linear model, replace `delta_bic_linear` with your own ΔBIC.
181
+ - 580 rows is comfortable. Below ~50 rows, results are unreliable — the
182
+ shuffled-target distribution is too noisy to anchor the z-score.
183
+
184
+ ## License
185
+
186
+ MIT. See `LICENSE`.
187
+
188
+ ## Citation
189
+
190
+ If `nullbic` contributes to a paper, please cite it as:
191
+
192
+ > nullbic: symbolic regression with auto-falsification ΔBIC. 2026.
193
+
194
+ ## Related
195
+
196
+ - [PySR](https://github.com/MilesCranmer/PySR) — sophisticated symbolic regression
197
+ - [gplearn](https://gplearn.readthedocs.io/) — sklearn-compatible GP
198
+ - [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl) — Julia
@@ -0,0 +1,12 @@
1
+ nullbic/__init__.py,sha256=epkVh-RseHaD02R3JvqtbYUEqWhcA9iA2wUD8cpcOus,949
2
+ nullbic/_bindings.py,sha256=jkgDS8p51G1gK4qcTPBpUoIZaXqr2wFbilCVDOdXV0c,3566
3
+ nullbic/cli.py,sha256=MG9iOQHm60uxfVVaFsRXksY9fPDRsPHbsBLSV_hnzqs,1797
4
+ nullbic/core.py,sha256=RrxLTfg9gfSQ4oBnHpqKgjiQtCh2_n2wDJFVqJosF1c,10344
5
+ nullbic/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ nullbic/_dll/nullbic.dll,sha256=Fh6oDvDamhuh_1NNkgsufy3EKy2XU_Jebgnu_hvzjMk,1670041
7
+ nullbic-0.1.0.dist-info/licenses/LICENSE,sha256=_ZS3-iHxrwL9N6dVMm1NRwTmIQs4xxM7GaK2UZYaNlc,1077
8
+ nullbic-0.1.0.dist-info/METADATA,sha256=5Ql_iR7-kqJdTAi6oQU5O33V3wETxYYewOBPvaxNzJE,6672
9
+ nullbic-0.1.0.dist-info/WHEEL,sha256=QR8DNjG6Lr6bNErJWJgF4dP2dJ2N7NpY-BWly1OvcTM,97
10
+ nullbic-0.1.0.dist-info/entry_points.txt,sha256=WcH5B4HG89HHz-8FSz5J6tT6Y_nHfWXjm_4zc73aGb8,45
11
+ nullbic-0.1.0.dist-info/top_level.txt,sha256=uU9_0EFoX7A_k1SXyug-1dovYwz5085UWdiOITgTYLc,8
12
+ nullbic-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-win_amd64
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ nullbic = nullbic.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 nullbic contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ nullbic