spectraldiag 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
cmake_minimum_required(VERSION 3.18)
project(spectraldiag LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Optimise for speed, but keep IEEE-754 semantics: -ffast-math lets the
# compiler assume NaN/Inf never occur and reorder floating-point arithmetic,
# which can silently change numerical results, so it is deliberately not used.
if(MSVC)
  add_compile_options(/O2 /EHsc)
else()
  add_compile_options(-O3)
endif()

# Development.Module (available since CMake 3.18) is all that an extension
# module needs. The plain "Development" component additionally requires the
# embedding artifacts (Development.Embed), which extension-only build
# environments frequently do not ship.
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)

# Build src/pymodule.cpp as the loadable extension module `_core`.
Python_add_library(_core MODULE src/pymodule.cpp)
target_include_directories(_core PRIVATE src)

# Place the built module inside the `spectraldiag` package directory.
install(TARGETS _core DESTINATION spectraldiag)
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 spectraldiag contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include src *.cpp *.hpp *.h
4
+ recursive-include python *.py
5
+ include CMakeLists.txt
6
+ include pyproject.toml
@@ -0,0 +1,154 @@
1
+ Metadata-Version: 2.4
2
+ Name: spectraldiag
3
+ Version: 0.1.0
4
+ Summary: Mathematically rigorous ML model diagnostics: stationarity verdict, effective dimension, barrier certificate
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/drozdisme/spectraldiag
7
+ Project-URL: Repository, https://github.com/drozdisme/spectraldiag
8
+ Keywords: machine-learning,scaling-laws,diagnostics,benchmarking,spectral
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: C++
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.9
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Provides-Extra: torch
19
+ Requires-Dist: torch>=2.0; extra == "torch"
20
+ Provides-Extra: hf
21
+ Requires-Dist: transformers>=4.30; extra == "hf"
22
+ Requires-Dist: torch>=2.0; extra == "hf"
23
+ Provides-Extra: lightning
24
+ Requires-Dist: lightning>=2.0; extra == "lightning"
25
+ Requires-Dist: torch>=2.0; extra == "lightning"
26
+ Provides-Extra: data
27
+ Requires-Dist: numpy>=1.24; extra == "data"
28
+ Requires-Dist: scipy>=1.10; extra == "data"
29
+ Requires-Dist: scikit-learn>=1.3; extra == "data"
30
+ Provides-Extra: all
31
+ Requires-Dist: torch>=2.0; extra == "all"
32
+ Requires-Dist: transformers>=4.30; extra == "all"
33
+ Requires-Dist: lightning>=2.0; extra == "all"
34
+ Requires-Dist: numpy>=1.24; extra == "all"
35
+ Requires-Dist: scipy>=1.10; extra == "all"
36
+ Requires-Dist: scikit-learn>=1.3; extra == "all"
37
+ Dynamic: license-file
38
+
39
+ # spectraldiag
40
+
41
+ **Mathematically rigorous ML model diagnostics.**
42
+
43
+ Answers not "what happened" (that's W&B) but **"why it happened and what to do"** — with mathematical proof, not empirical guessing.
44
+
45
+ ```bash
46
+ pip install spectraldiag
47
+ ```
48
+
49
+ ---
50
+
51
+ ## Three core functions
52
+
53
+ ### `stationarity_verdict(ntk_eigs, target_coeffs)`
54
+
55
+ Is your model's feature learning done, or still evolving?
56
+
57
+ ```python
58
+ from spectraldiag import stationarity_verdict
59
+
60
+ result = stationarity_verdict(ntk_eigs, target_coeffs)
61
+ print(result.reason)
62
+ # STATIONARY. Source exponent r_hat=0.491 (±0.031) is consistent with r=0.5
63
+ # (self-organised criticality). Model is pinned to the Sobolev minimax barrier
64
+ # β₀=0.556. Additional data will improve loss at rate D^{-0.556} — no more
65
+ # than 44% further gain possible without compositional restructuring.
66
+ ```
67
+
68
+ **What it computes:** fits the source exponent `r` from the empirical NTK spectrum. `r ≈ 0.5` means your model has self-organised to the critical attractor — it's stationary, permanently bounded by `β₀ = 2s/(2s+d*)`.
69
+
70
+ ### `effective_dimension(laplacian_eigs, approx_errors, model_sizes)`
71
+
72
+ Does your data have compositional structure your model could exploit?
73
+
74
+ ```python
75
+ from spectraldiag import effective_dimension
76
+
77
+ result = effective_dimension(laplacian_eigs, approx_errors, model_sizes)
78
+ print(result.verdict)
79
+ # COMPOSITIONAL STRUCTURE DETECTED. Data intrinsic dimension d*=8.2 but
80
+ # effective task dimension d_loc=2.1. Compositional approximation exponent
81
+ # α=1.19 vs Sobolev baseline α=0.30 — 3.9× compression gain available.
82
+ ```
83
+
84
+ **What it computes:** estimates `d*` from the graph-Laplacian spectrum of your data, `d_loc` from the model-side approximation exponent. If `d_loc < d*`, genuine compositional structure exists — and the phase transition theorem says emergence is real.
85
+
86
+ ### `barrier_certificate(d_star, d_loc, s, current_loss, current_N, current_D)`
87
+
88
+ Where is your model relative to the theoretical ceiling?
89
+
90
+ ```python
91
+ from spectraldiag import barrier_certificate
92
+
93
+ result = barrier_certificate(
94
+ d_star=8.0, d_loc=2.0, s=1.25,
95
+ current_loss=0.42, current_N=1e8, current_D=1e11
96
+ )
97
+ print(result.verdict)
98
+ # BARRIER CERTIFICATE. Theoretical ceiling β₀=0.238. With compositional
99
+ # structure (d_loc=2.0), barrier rises to β'=0.556 — 2.3× faster data
100
+ # scaling. Training budget D=1e11 has NOT passed the crossover D_cross≈...
101
+ ```
102
+
103
+ ---
104
+
105
+ ## One-line integration
106
+
107
+ ```python
108
+ from spectraldiag.callbacks import make_hf_callback
109
+
110
+ trainer = Trainer(
111
+ ...,
112
+ callbacks=[make_hf_callback(eval_data=(X_val, y_val))]
113
+ )
114
+ ```
115
+
116
+ Works with HuggingFace Trainer and PyTorch Lightning out of the box.
117
+
118
+ ---
119
+
120
+ ## Graph-Laplacian protocol (for real data)
121
+
122
+ ```python
123
+ from spectraldiag.graph_lap import graph_laplacian_eigs, estimate_d_star, double_dimension_consistency
124
+
125
+ eig_vals, eig_vecs = graph_laplacian_eigs(X_data, knn=10)
126
+ d_star = estimate_d_star(eig_vals)
127
+
128
+ consistency = double_dimension_consistency(d_star_data=d_star, d_loc_model=d_loc_from_model)
129
+ print(consistency["verdict"])
130
+ ```
131
+
132
+ ---
133
+
134
+ ## Mathematical foundation
135
+
136
+ This library implements the three-paper programme:
137
+
138
+ - **TR** — *Boundaries of Stationary Feature Learning*: the Sobolev minimax barrier `β₀`, self-organised criticality `r=½`, approximation exponent `α=2s/d*`
139
+ - **AB** — *Foundations of a Theory of Composable Abstractions*: defect as projection, effective dimension `d_loc`, subspace gap as the order parameter
140
+ - **BM** — *Spectral Scaling Benchmark*: the decisive-test protocol, source exponent measurement, graph-Laplacian intrinsic dimension estimation
141
+
142
+ The decisive invariant: `d_loc < d*` ⟺ emergence is real ⟺ the phase transition theorem applies.
143
+
144
+ ---
145
+
146
+ ## Build from source
147
+
148
+ ```bash
149
+ pip install -e ".[all]"
150
+ ```
151
+
152
+ Requires only a C++17 compiler. The C++ core is a standard CPython
153
+ extension built automatically by setuptools — no CMake, no pybind11,
154
+ no extra build dependencies.
@@ -0,0 +1,116 @@
1
+ # spectraldiag
2
+
3
+ **Mathematically rigorous ML model diagnostics.**
4
+
5
+ Answers not "what happened" (that's W&B) but **"why it happened and what to do"** — with mathematical proof, not empirical guessing.
6
+
7
+ ```bash
8
+ pip install spectraldiag
9
+ ```
10
+
11
+ ---
12
+
13
+ ## Three core functions
14
+
15
+ ### `stationarity_verdict(ntk_eigs, target_coeffs)`
16
+
17
+ Is your model's feature learning done, or still evolving?
18
+
19
+ ```python
20
+ from spectraldiag import stationarity_verdict
21
+
22
+ result = stationarity_verdict(ntk_eigs, target_coeffs)
23
+ print(result.reason)
24
+ # STATIONARY. Source exponent r_hat=0.491 (±0.031) is consistent with r=0.5
25
+ # (self-organised criticality). Model is pinned to the Sobolev minimax barrier
26
+ # β₀=0.556. Additional data will improve loss at rate D^{-0.556} — no more
27
+ # than 44% further gain possible without compositional restructuring.
28
+ ```
29
+
30
+ **What it computes:** fits the source exponent `r` from the empirical NTK spectrum. `r ≈ 0.5` means your model has self-organised to the critical attractor — it's stationary, permanently bounded by `β₀ = 2s/(2s+d*)`.
31
+
32
+ ### `effective_dimension(laplacian_eigs, approx_errors, model_sizes)`
33
+
34
+ Does your data have compositional structure your model could exploit?
35
+
36
+ ```python
37
+ from spectraldiag import effective_dimension
38
+
39
+ result = effective_dimension(laplacian_eigs, approx_errors, model_sizes)
40
+ print(result.verdict)
41
+ # COMPOSITIONAL STRUCTURE DETECTED. Data intrinsic dimension d*=8.2 but
42
+ # effective task dimension d_loc=2.1. Compositional approximation exponent
43
+ # α=1.19 vs Sobolev baseline α=0.30 — 3.9× compression gain available.
44
+ ```
45
+
46
+ **What it computes:** estimates `d*` from the graph-Laplacian spectrum of your data, `d_loc` from the model-side approximation exponent. If `d_loc < d*`, genuine compositional structure exists — and the phase transition theorem says emergence is real.
47
+
48
+ ### `barrier_certificate(d_star, d_loc, s, current_loss, current_N, current_D)`
49
+
50
+ Where is your model relative to the theoretical ceiling?
51
+
52
+ ```python
53
+ from spectraldiag import barrier_certificate
54
+
55
+ result = barrier_certificate(
56
+ d_star=8.0, d_loc=2.0, s=1.25,
57
+ current_loss=0.42, current_N=1e8, current_D=1e11
58
+ )
59
+ print(result.verdict)
60
+ # BARRIER CERTIFICATE. Theoretical ceiling β₀=0.238. With compositional
61
+ # structure (d_loc=2.0), barrier rises to β'=0.556 — 2.3× faster data
62
+ # scaling. Training budget D=1e11 has NOT passed the crossover D_cross≈...
63
+ ```
64
+
65
+ ---
66
+
67
+ ## One-line integration
68
+
69
+ ```python
70
+ from spectraldiag.callbacks import make_hf_callback
71
+
72
+ trainer = Trainer(
73
+ ...,
74
+ callbacks=[make_hf_callback(eval_data=(X_val, y_val))]
75
+ )
76
+ ```
77
+
78
+ Works with HuggingFace Trainer and PyTorch Lightning out of the box.
79
+
80
+ ---
81
+
82
+ ## Graph-Laplacian protocol (for real data)
83
+
84
+ ```python
85
+ from spectraldiag.graph_lap import graph_laplacian_eigs, estimate_d_star, double_dimension_consistency
86
+
87
+ eig_vals, eig_vecs = graph_laplacian_eigs(X_data, knn=10)
88
+ d_star = estimate_d_star(eig_vals)
89
+
90
+ consistency = double_dimension_consistency(d_star_data=d_star, d_loc_model=d_loc_from_model)
91
+ print(consistency["verdict"])
92
+ ```
93
+
94
+ ---
95
+
96
+ ## Mathematical foundation
97
+
98
+ This library implements the three-paper programme:
99
+
100
+ - **TR** — *Boundaries of Stationary Feature Learning*: the Sobolev minimax barrier `β₀`, self-organised criticality `r=½`, approximation exponent `α=2s/d*`
101
+ - **AB** — *Foundations of a Theory of Composable Abstractions*: defect as projection, effective dimension `d_loc`, subspace gap as the order parameter
102
+ - **BM** — *Spectral Scaling Benchmark*: the decisive-test protocol, source exponent measurement, graph-Laplacian intrinsic dimension estimation
103
+
104
+ The decisive invariant: `d_loc < d*` ⟺ emergence is real ⟺ the phase transition theorem applies.
105
+
106
+ ---
107
+
108
+ ## Build from source
109
+
110
+ ```bash
111
+ pip install -e ".[all]"
112
+ ```
113
+
114
+ Requires only a C++17 compiler. The C++ core is a standard CPython
115
+ extension built automatically by setuptools — no CMake, no pybind11,
116
+ no extra build dependencies.
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "spectraldiag"
7
+ version = "0.1.0"
8
+ description = "Mathematically rigorous ML model diagnostics: stationarity verdict, effective dimension, barrier certificate"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.9"
12
+ keywords = ["machine-learning", "scaling-laws", "diagnostics", "benchmarking", "spectral"]
13
+ classifiers = [
14
+ "Development Status :: 3 - Alpha",
15
+ "Intended Audience :: Science/Research",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: C++",
19
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
20
+ ]
21
+ dependencies = []
22
+
23
+ [project.optional-dependencies]
24
+ torch = ["torch>=2.0"]
25
+ hf = ["transformers>=4.30", "torch>=2.0"]
26
+ lightning = ["lightning>=2.0", "torch>=2.0"]
27
+ data = ["numpy>=1.24", "scipy>=1.10", "scikit-learn>=1.3"]
28
+ all = ["torch>=2.0", "transformers>=4.30", "lightning>=2.0",
29
+ "numpy>=1.24", "scipy>=1.10", "scikit-learn>=1.3"]
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/drozdisme/spectraldiag"
33
+ Repository = "https://github.com/drozdisme/spectraldiag"
34
+
35
+ [tool.setuptools]
36
+ package-dir = {"" = "python"}
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["python"]
@@ -0,0 +1,148 @@
1
+ from __future__ import annotations
2
+
3
+ from . import _core
4
+
5
+ __version__ = "0.1.0"
6
+ __all__ = [
7
+ "stationarity_verdict", "effective_dimension", "barrier_certificate",
8
+ "snr_gate", "alignment_signal", "warmstart_economy", "SpectralDiagnostics",
9
+ ]
10
+
11
+
12
def _flatten_nested(seq):
    """Yield the leaves of arbitrarily nested lists/tuples, in order."""
    for item in seq:
        if isinstance(item, (list, tuple)):
            yield from _flatten_nested(item)
        else:
            yield item


def _to_list(x):
    """Coerce *x* into a flat Python list.

    Args:
        x: ``None``, a NumPy array, any object exposing ``tolist()``, or a
            plain iterable of values convertible with ``float()``.

    Returns:
        ``[]`` for ``None``; the flattened elements for NumPy arrays and
        other ``tolist()`` providers; otherwise ``[float(v) for v in x]``.

    Raises:
        TypeError: if *x* has no ``tolist()`` and is not iterable.
        ValueError: if an element of a plain iterable cannot be converted
            by ``float()``.
    """
    if x is None:
        return []

    # NumPy is optional; only take the ndarray fast path when it is installed.
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None and isinstance(x, np.ndarray):
        return x.flatten().tolist()

    if hasattr(x, "tolist"):
        data = x.tolist()
        # tolist() mirrors the object's shape: nested lists for >= 2-D input
        # and a bare scalar for 0-d input. Normalise both so the result is
        # always a flat list, exactly like the NumPy branch above.
        if isinstance(data, (list, tuple)):
            return list(_flatten_nested(data))
        return [data]

    return [float(v) for v in x]
24
+
25
+
26
def _to_mat(x):
    """Coerce *x* into a list of rows (a list of lists).

    Args:
        x: ``None``, a NumPy array, any object exposing ``tolist()``, or a
            list/tuple that is either a single row of numbers or a sequence
            of rows.

    Returns:
        ``[]`` for ``None``, for empty input, and for input that is not
        sequence-like. A 1-D input becomes a single-row matrix. For
        list/tuple data every element is converted with ``float()``; NumPy
        arrays are returned via ``tolist()`` unchanged.
    """
    if x is None:
        return []

    # NumPy is optional; only take the ndarray path when it is installed.
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None and isinstance(x, np.ndarray):
        if x.ndim == 1:
            return [x.tolist()]
        return x.tolist()

    # Other array-likes that expose tolist() used to fall through to the
    # final `return []`, silently discarding the caller's data. Convert
    # them to nested lists and let the list handling below shape the result.
    if not isinstance(x, (list, tuple)) and hasattr(x, "tolist"):
        x = x.tolist()

    if isinstance(x, (list, tuple)) and len(x) > 0:
        if isinstance(x[0], (list, tuple)):
            return [[float(v) for v in row] for row in x]
        return [[float(v) for v in x]]
    return []
42
+
43
+
44
class _Result:
    """Attribute-access wrapper around a result dict from the C++ core.

    Every key of the wrapped dict can be read as an attribute
    (``result.key``); unknown names raise ``AttributeError``. The wrapped
    dict is stored in the single slot ``_d``.
    """
    __slots__ = ("_d",)

    def __init__(self, d):
        object.__setattr__(self, "_d", d)

    def __getattr__(self, name):
        # __getattr__ only runs after normal lookup has failed. If the name
        # being looked up is "_d" itself, the slot is not populated yet (as
        # happens while copy/pickle rebuild an instance without calling
        # __init__). Reading self._d here would re-enter __getattr__ forever
        # and end in RecursionError, so fail fast instead.
        if name == "_d":
            raise AttributeError(name)
        try:
            return self._d[name]
        except KeyError:
            # The KeyError is an implementation detail; hide it from callers.
            raise AttributeError(name) from None

    def __repr__(self):
        # String values of 40 characters or more are left out to keep the
        # repr on one readable line.
        keys = ", ".join(f"{k}={v!r}" for k, v in self._d.items()
                         if not isinstance(v, str) or len(v) < 40)
        return f"<{type(self).__name__} {keys}>"

    def to_dict(self):
        """Return a shallow copy of the wrapped result dict."""
        return dict(self._d)
64
+
65
+
66
class StatResult(_Result):
    """Result wrapper returned by ``stationarity_verdict``."""


class DimResult(_Result):
    """Result wrapper returned by ``effective_dimension``."""


class BarrierResult(_Result):
    """Result wrapper returned by ``barrier_certificate``."""


class SNRResult(_Result):
    """Result wrapper returned by ``snr_gate``."""


class AlignResult(_Result):
    """Result wrapper returned by ``alignment_signal``."""


class WarmstartResult(_Result):
    """Result wrapper returned by ``warmstart_economy``."""
72
+
73
+
74
def stationarity_verdict(ntk_eigs, target_coeffs=None, s: float = -1.0) -> StatResult:
    """Run the C++ core's ``stationarity_verdict`` and wrap its result.

    Args:
        ntk_eigs: Eigenvalues; converted with ``_to_list`` before the call.
        target_coeffs: Optional coefficients; ``None`` is sent as ``[]``.
        s: Forwarded as a float. The default ``-1.0`` is presumably a
            "not provided" sentinel -- confirm against the C++ core.

    Returns:
        A ``StatResult`` exposing the core's result dict as attributes.
    """
    core_fn = _core.stationarity_verdict
    eig_values = _to_list(ntk_eigs)
    coeff_values = _to_list(target_coeffs)
    raw = core_fn(eig_values, coeff_values, float(s))
    return StatResult(raw)
77
+
78
+
79
def effective_dimension(laplacian_eigs, approx_errors=None, model_sizes=None) -> DimResult:
    """Run the C++ core's ``effective_dimension`` and wrap its result.

    Args:
        laplacian_eigs: Eigenvalues; converted with ``_to_list``.
        approx_errors: Optional values; ``None`` is sent as ``[]``.
        model_sizes: Optional values; ``None`` is sent as ``[]``.

    Returns:
        A ``DimResult`` exposing the core's result dict as attributes.
    """
    core_fn = _core.effective_dimension
    eig_values = _to_list(laplacian_eigs)
    errors = _to_list(approx_errors)
    sizes = _to_list(model_sizes)
    raw = core_fn(eig_values, errors, sizes)
    return DimResult(raw)
82
+
83
+
84
def barrier_certificate(d_star: float, d_loc: float, s: float,
                        current_loss: float, current_N: float,
                        current_D: float) -> BarrierResult:
    """Run the C++ core's ``barrier_certificate`` and wrap its result.

    All six arguments are converted with ``float()`` and forwarded in the
    order they are declared.

    Returns:
        A ``BarrierResult`` exposing the core's result dict as attributes.
    """
    core_fn = _core.barrier_certificate
    dims = (float(d_star), float(d_loc), float(s))
    state = (float(current_loss), float(current_N), float(current_D))
    raw = core_fn(*dims, *state)
    return BarrierResult(raw)
90
+
91
+
92
def snr_gate(alpha: float, noise_var: float, signal_var: float) -> SNRResult:
    """Run the C++ core's ``snr_gate`` and wrap its result.

    The three arguments are converted with ``float()`` and forwarded in
    declaration order.

    Returns:
        An ``SNRResult`` exposing the core's result dict as attributes.
    """
    core_fn = _core.snr_gate
    raw = core_fn(float(alpha), float(noise_var), float(signal_var))
    return SNRResult(raw)
94
+
95
+
96
def alignment_signal(subspace_dirs) -> AlignResult:
    """Run the C++ core's ``alignment_signal`` and wrap its result.

    Args:
        subspace_dirs: Converted to a list of rows with ``_to_mat`` before
            being handed to the core.

    Returns:
        An ``AlignResult`` exposing the core's result dict as attributes.
    """
    core_fn = _core.alignment_signal
    rows = _to_mat(subspace_dirs)
    raw = core_fn(rows)
    return AlignResult(raw)
98
+
99
+
100
def warmstart_economy(source_eigs, target_eigs,
                      spectral_floor: float = 0.1) -> WarmstartResult:
    """Run the C++ core's ``warmstart_economy`` and wrap its result.

    Args:
        source_eigs: Converted with ``_to_list``.
        target_eigs: Converted with ``_to_list``.
        spectral_floor: Forwarded as a float; defaults to ``0.1``.

    Returns:
        A ``WarmstartResult`` exposing the core's result dict as attributes.
    """
    core_fn = _core.warmstart_economy
    source_values = _to_list(source_eigs)
    target_values = _to_list(target_eigs)
    raw = core_fn(source_values, target_values, float(spectral_floor))
    return WarmstartResult(raw)
104
+
105
+
106
class SpectralDiagnostics:
    """Stateful convenience wrapper around the module-level diagnostics.

    Holds ``d_star`` and ``s``, caches the spectra supplied through
    ``fit_ntk`` / ``fit_data``, and, when ``verbose`` is true, prints each
    result's text as it is produced.
    """

    def __init__(self, d_star: float = 2.0, s: float = 1.25, verbose: bool = True):
        """Store the configuration and start with empty cached spectra."""
        self.d_star, self.s, self.verbose = d_star, s, verbose
        self._ntk_eigs = []
        self._target_coeffs = []
        self._lap_eigs = []

    def fit_ntk(self, eigs, target_coeffs=None):
        """Cache *eigs* (and *target_coeffs* when given); return ``self``."""
        self._ntk_eigs = _to_list(eigs)
        # NOTE(review): when target_coeffs is omitted, coefficients from an
        # earlier fit_ntk() call are kept -- confirm this is intended.
        if target_coeffs is None:
            return self
        self._target_coeffs = _to_list(target_coeffs)
        return self

    def fit_data(self, lap_eigs):
        """Cache *lap_eigs* for a later ``dim`` call; return ``self``."""
        self._lap_eigs = _to_list(lap_eigs)
        return self

    def verdict(self, s: float = -1.0) -> StatResult:
        """Call ``stationarity_verdict`` on the cached NTK data.

        A positive *s* overrides the instance's ``s``; any other value
        falls back to ``self.s``.

        Raises:
            ValueError: if no NTK eigenvalues have been cached.
        """
        if not self._ntk_eigs:
            raise ValueError("Call fit_ntk() first.")
        if s > 0:
            chosen_s = s
        else:
            chosen_s = self.s
        report = stationarity_verdict(self._ntk_eigs, self._target_coeffs, chosen_s)
        if self.verbose:
            print(report.reason)
        return report

    def dim(self, approx_errors=None, model_sizes=None) -> DimResult:
        """Call ``effective_dimension`` on the cached Laplacian eigenvalues.

        Raises:
            ValueError: if no Laplacian eigenvalues have been cached.
        """
        if not self._lap_eigs:
            raise ValueError("Call fit_data() first.")
        report = effective_dimension(self._lap_eigs, approx_errors, model_sizes)
        if self.verbose:
            print(report.verdict)
        return report

    def barrier(self, d_loc: float, current_loss: float,
                current_N: float, current_D: float) -> BarrierResult:
        """Call ``barrier_certificate`` with the stored ``d_star`` and ``s``."""
        report = barrier_certificate(
            self.d_star, d_loc, self.s,
            current_loss, current_N, current_D,
        )
        if self.verbose:
            print(report.verdict)
        return report