natc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natc-0.1.0/LICENSE +22 -0
- natc-0.1.0/PKG-INFO +108 -0
- natc-0.1.0/README.md +62 -0
- natc-0.1.0/natc/__init__.py +7 -0
- natc-0.1.0/natc/attention/__init__.py +123 -0
- natc-0.1.0/natc/benchmark/__init__.py +135 -0
- natc-0.1.0/natc/benchmark/cli.py +30 -0
- natc-0.1.0/natc/cache/__init__.py +217 -0
- natc-0.1.0/natc/capsules/__init__.py +160 -0
- natc-0.1.0/natc/compiler/__init__.py +135 -0
- natc-0.1.0/natc/config.py +45 -0
- natc-0.1.0/natc/cpu/__init__.py +80 -0
- natc-0.1.0/natc/dna/__init__.py +295 -0
- natc-0.1.0/natc/fractal/__init__.py +110 -0
- natc-0.1.0/natc/integrations/__init__.py +16 -0
- natc-0.1.0/natc/integrations/huggingface.py +61 -0
- natc-0.1.0/natc/model.py +229 -0
- natc-0.1.0/natc/py.typed +1 -0
- natc-0.1.0/natc/synthesis/__init__.py +176 -0
- natc-0.1.0/natc/utils.py +140 -0
- natc-0.1.0/natc.egg-info/PKG-INFO +108 -0
- natc-0.1.0/natc.egg-info/SOURCES.txt +28 -0
- natc-0.1.0/natc.egg-info/dependency_links.txt +1 -0
- natc-0.1.0/natc.egg-info/entry_points.txt +2 -0
- natc-0.1.0/natc.egg-info/requires.txt +27 -0
- natc-0.1.0/natc.egg-info/top_level.txt +1 -0
- natc-0.1.0/pyproject.toml +85 -0
- natc-0.1.0/setup.cfg +4 -0
- natc-0.1.0/tests/test_dna.py +31 -0
- natc-0.1.0/tests/test_runtime_modules.py +95 -0
natc-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 NATC Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
natc-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: natc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: NeuroSymbolic Adaptive Tensor Compression for CPU-first dynamic inference.
|
|
5
|
+
Author: NATC Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Jatinverma0786/NATC
|
|
8
|
+
Project-URL: Documentation, https://github.com/Jatinverma0786/NATC#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/Jatinverma0786/NATC
|
|
10
|
+
Keywords: tensor-compression,inference,llm,sparse-attention,neural-cache
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: torch>=2.1
|
|
24
|
+
Requires-Dist: transformers>=4.40
|
|
25
|
+
Requires-Dist: accelerate>=0.28
|
|
26
|
+
Requires-Dist: sentence-transformers>=2.6
|
|
27
|
+
Requires-Dist: faiss-cpu>=1.8
|
|
28
|
+
Requires-Dist: numba>=0.59
|
|
29
|
+
Requires-Dist: onnxruntime>=1.17
|
|
30
|
+
Requires-Dist: safetensors>=0.4
|
|
31
|
+
Requires-Dist: scipy>=1.10
|
|
32
|
+
Provides-Extra: openvino
|
|
33
|
+
Requires-Dist: openvino>=2024.0; extra == "openvino"
|
|
34
|
+
Provides-Extra: triton
|
|
35
|
+
Requires-Dist: triton>=2.3; extra == "triton"
|
|
36
|
+
Provides-Extra: llama-cpp
|
|
37
|
+
Requires-Dist: llama-cpp-python>=0.2; extra == "llama-cpp"
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
42
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
43
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
44
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# NATC
|
|
48
|
+
|
|
49
|
+
NATC, short for NeuroSymbolic Adaptive Tensor Compression, is a Python framework for
|
|
50
|
+
experimenting with CPU-first dynamic inference architecture:
|
|
51
|
+
|
|
52
|
+
- latent Knowledge DNA encoding for tensors and model state dictionaries
|
|
53
|
+
- on-demand weight reconstruction and sparse materialization
|
|
54
|
+
- prompt-routed reasoning capsules
|
|
55
|
+
- predictive sparse attention routing
|
|
56
|
+
- persistent neural fragment caching
|
|
57
|
+
- recursive fractal tensor storage
|
|
58
|
+
- prompt compilation into execution plans
|
|
59
|
+
- CPU-oriented kernels and benchmark utilities
|
|
60
|
+
|
|
61
|
+
Install from a local checkout:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install -e ".[dev]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Public API:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from natc import NATCModel
|
|
71
|
+
|
|
72
|
+
model = NATCModel.from_pretrained("distilgpt2")
|
|
73
|
+
model.enable_dna()
|
|
74
|
+
model.enable_capsules()
|
|
75
|
+
model.enable_sparse_attention()
|
|
76
|
+
model.enable_cache()
|
|
77
|
+
model.enable_cpu_acceleration()
|
|
78
|
+
|
|
79
|
+
print(model.generate("Explain quantum mechanics"))
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
The HuggingFace backend is loaded lazily. If a remote model cannot be downloaded in
|
|
83
|
+
the current environment, NATC falls back to a deterministic local text backend so the
|
|
84
|
+
compression, routing, caching, and compiler pipeline remains testable offline.
|
|
85
|
+
|
|
86
|
+
## Quick Encoder Example
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
import numpy as np
|
|
90
|
+
from natc.dna import encode_model, decode_model
|
|
91
|
+
|
|
92
|
+
state = {"linear.weight": np.random.default_rng(0).normal(size=(64, 32))}
|
|
93
|
+
dna = encode_model(state, rank=8)
|
|
94
|
+
reconstructed = decode_model(dna)
|
|
95
|
+
|
|
96
|
+
print(dna.compression_ratio())
|
|
97
|
+
print(reconstructed["linear.weight"].shape)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Benchmark
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
natc-benchmark --layers 4 --rows 128 --cols 128 --rank 16
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The benchmark reports compression ratio, memory saved, synthetic tokens/sec, cache hit
|
|
107
|
+
ratio, latency, throughput, and CPU efficiency in JSON.
|
|
108
|
+
|
natc-0.1.0/README.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# NATC
|
|
2
|
+
|
|
3
|
+
NATC, short for NeuroSymbolic Adaptive Tensor Compression, is a Python framework for
|
|
4
|
+
experimenting with CPU-first dynamic inference architecture:
|
|
5
|
+
|
|
6
|
+
- latent Knowledge DNA encoding for tensors and model state dictionaries
|
|
7
|
+
- on-demand weight reconstruction and sparse materialization
|
|
8
|
+
- prompt-routed reasoning capsules
|
|
9
|
+
- predictive sparse attention routing
|
|
10
|
+
- persistent neural fragment caching
|
|
11
|
+
- recursive fractal tensor storage
|
|
12
|
+
- prompt compilation into execution plans
|
|
13
|
+
- CPU-oriented kernels and benchmark utilities
|
|
14
|
+
|
|
15
|
+
Install from a local checkout:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install -e ".[dev]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Public API:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from natc import NATCModel
|
|
25
|
+
|
|
26
|
+
model = NATCModel.from_pretrained("distilgpt2")
|
|
27
|
+
model.enable_dna()
|
|
28
|
+
model.enable_capsules()
|
|
29
|
+
model.enable_sparse_attention()
|
|
30
|
+
model.enable_cache()
|
|
31
|
+
model.enable_cpu_acceleration()
|
|
32
|
+
|
|
33
|
+
print(model.generate("Explain quantum mechanics"))
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The HuggingFace backend is loaded lazily. If a remote model cannot be downloaded in
|
|
37
|
+
the current environment, NATC falls back to a deterministic local text backend so the
|
|
38
|
+
compression, routing, caching, and compiler pipeline remains testable offline.
|
|
39
|
+
|
|
40
|
+
## Quick Encoder Example
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import numpy as np
|
|
44
|
+
from natc.dna import encode_model, decode_model
|
|
45
|
+
|
|
46
|
+
state = {"linear.weight": np.random.default_rng(0).normal(size=(64, 32))}
|
|
47
|
+
dna = encode_model(state, rank=8)
|
|
48
|
+
reconstructed = decode_model(dna)
|
|
49
|
+
|
|
50
|
+
print(dna.compression_ratio())
|
|
51
|
+
print(reconstructed["linear.weight"].shape)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Benchmark
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
natc-benchmark --layers 4 --rows 128 --cols 128 --rank 16
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
The benchmark reports compression ratio, memory saved, synthetic tokens/sec, cache hit
|
|
61
|
+
ratio, latency, throughput, and CPU efficiency in JSON.
|
|
62
|
+
|
natc-0.1.0/natc/attention/__init__.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Predictive sparse attention routing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from natc.utils import text_embedding
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(slots=True)
|
|
13
|
+
class SparseAttentionResult:
|
|
14
|
+
"""Sparse attention output and its selected graph."""
|
|
15
|
+
|
|
16
|
+
output: np.ndarray
|
|
17
|
+
weights: np.ndarray
|
|
18
|
+
adjacency: np.ndarray
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SparseGraphBuilder:
    """Build top-k token adjacency graphs from similarity scores."""

    def build(self, scores: np.ndarray, *, top_k: int) -> np.ndarray:
        """Return a boolean mask marking the ``top_k`` largest scores per row.

        Args:
            scores: 2D score matrix (anything ``np.asarray`` accepts). Each
                row is ranked independently.
            top_k: Number of entries to keep per row. Values outside
                ``[1, scores.shape[1]]`` are clamped into that range.

        Returns:
            Boolean array with the same shape as ``scores``. Each row has
            exactly ``top_k`` (after clamping) ``True`` entries; a matrix
            with no columns yields an all-``False`` (empty) mask.

        Raises:
            ValueError: If ``scores`` is not two-dimensional.
        """
        scores = np.asarray(scores)
        if scores.ndim != 2:
            raise ValueError("scores must be a 2D matrix")

        adjacency = np.zeros_like(scores, dtype=bool)
        columns = scores.shape[1]
        if columns == 0:
            # Nothing to select; argpartition would reject kth=-1 on an
            # empty axis, so return the empty mask directly.
            return adjacency

        top_k = max(1, min(top_k, columns))
        # argpartition places the top_k largest entries of every row in the
        # last top_k slots (unordered), which is all a mask needs.
        indices = np.argpartition(scores, -top_k, axis=1)[:, -top_k:]
        rows = np.arange(scores.shape[0])[:, None]
        adjacency[rows, indices] = True
        return adjacency
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class AttentionRouter:
    """Predict likely attention paths before dense attention execution."""

    def __init__(self, *, top_k: int = 32) -> None:
        """Store the default per-row edge budget and create the graph builder."""
        self.top_k = top_k
        self.graph_builder = SparseGraphBuilder()

    def route_tokens(self, tokens: list[str], *, top_k: int | None = None) -> np.ndarray:
        """Build a token-to-token adjacency mask from text embeddings.

        Each token is embedded with ``text_embedding(token, dimensions=64)``
        and the pairwise dot products are handed to the graph builder. An
        empty token list yields an empty ``(0, 0)`` boolean mask. A falsy
        ``top_k`` (``None`` or ``0``) falls back to ``self.top_k``.
        """
        if not tokens:
            return np.zeros((0, 0), dtype=bool)
        vectors = [text_embedding(token, dimensions=64) for token in tokens]
        stacked = np.stack(vectors)
        similarity = stacked @ stacked.T
        budget = top_k or self.top_k
        return self.graph_builder.build(similarity, top_k=budget)

    def route_embeddings(self, embeddings: np.ndarray, *, top_k: int | None = None) -> np.ndarray:
        """Build an adjacency mask from precomputed embeddings.

        The input is coerced with ``_as_2d``; pairwise dot products are
        divided by ``sqrt(width)`` (never less than 1) before the top-k
        selection. A falsy ``top_k`` falls back to ``self.top_k``.
        """
        matrix = _as_2d(embeddings)
        scale = max(1.0, float(matrix.shape[-1]) ** 0.5)
        similarity = matrix @ matrix.T / scale
        budget = top_k or self.top_k
        return self.graph_builder.build(similarity, top_k=budget)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class PredictiveAttention:
    """Compute sparse scaled dot-product attention using predicted top-k edges."""

    def __init__(self, *, top_k: int = 32) -> None:
        """Create the router that holds the default ``top_k`` and graph builder."""
        self.router = AttentionRouter(top_k=top_k)

    def __call__(
        self,
        query: np.ndarray,
        key: np.ndarray,
        value: np.ndarray,
        *,
        top_k: int | None = None,
    ) -> SparseAttentionResult:
        """Call-style alias for :meth:`forward`."""
        return self.forward(query, key, value, top_k=top_k)

    def forward(
        self,
        query: np.ndarray,
        key: np.ndarray,
        value: np.ndarray,
        *,
        top_k: int | None = None,
    ) -> SparseAttentionResult:
        """Attend each query row to its ``top_k`` highest-scoring keys.

        Args:
            query: 1D or 2D array of query vectors (coerced by ``_as_2d``).
            key: 1D or 2D array of key vectors; last dimension must match
                ``query``.
            value: 1D or 2D array of values; row count must match ``key``.
            top_k: Keys kept per query. A falsy value (``None`` or ``0``)
                falls back to the router's default.

        Returns:
            A ``SparseAttentionResult`` with float32 ``output`` and
            ``weights`` plus the boolean ``adjacency`` mask.

        Raises:
            ValueError: If the query/key widths differ, the key/value row
                counts differ, or an input is rejected by ``_as_2d``.
        """
        query = _as_2d(query)
        key = _as_2d(key)
        value = _as_2d(value)
        if query.shape[-1] != key.shape[-1]:
            raise ValueError("query and key dimensions must match")
        if key.shape[0] != value.shape[0]:
            raise ValueError("key and value token counts must match")

        # Standard 1/sqrt(d) scaling; the divisor is floored at 1.
        scale = max(1.0, float(query.shape[-1]) ** 0.5)
        scores = query @ key.T / scale
        # Use the router's own builder so a builder replaced on the router is
        # honoured here as well, instead of a fresh throwaway instance.
        adjacency = self.router.graph_builder.build(
            scores, top_k=top_k or self.router.top_k
        )
        # Dropped edges become -inf so the softmax gives them zero weight.
        masked_scores = np.where(adjacency, scores, -np.inf)
        weights = _softmax(masked_scores, axis=1)
        output = weights @ value
        return SparseAttentionResult(
            output=output.astype(np.float32, copy=False),
            weights=weights.astype(np.float32, copy=False),
            adjacency=adjacency,
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _softmax(values: np.ndarray, *, axis: int) -> np.ndarray:
    """Softmax along ``axis`` that gives non-finite entries zero weight.

    Masked positions are expected to hold ``-inf``. NaN and ``+inf`` entries
    are treated the same way, so one bad score cannot turn the remaining
    entries of its row into NaN.

    Args:
        values: Score array.
        axis: Axis along which the weights are normalized.

    Returns:
        Array of the same shape whose finite positions sum to 1 along
        ``axis``. A slice with no finite entries is returned as all zeros.
    """
    values = np.asarray(values)
    finite = np.isfinite(values)
    masked = np.where(finite, values, -np.inf)
    # Shift by the largest finite entry for numerical stability. A fully
    # masked slice has maximum -inf; shifting by 0 instead avoids the
    # (-inf) - (-inf) = NaN that would otherwise be computed and warned about.
    maximum = np.max(masked, axis=axis, keepdims=True)
    maximum = np.where(np.isfinite(maximum), maximum, 0.0)
    shifted = np.exp(masked - maximum)
    denom = shifted.sum(axis=axis, keepdims=True)
    # Fully masked slices sum to zero; divide by 1 so they stay all-zero.
    denom[denom == 0.0] = 1.0
    return shifted / denom
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _as_2d(array: np.ndarray) -> np.ndarray:
    """Coerce an attention input to a float32 matrix.

    A 1D vector becomes a single-row matrix and a 2D input is passed through
    after the dtype conversion. Any other rank raises ``ValueError``.
    """
    matrix = np.asarray(array, dtype=np.float32)
    rank = matrix.ndim
    if rank == 2:
        return matrix
    if rank == 1:
        return matrix.reshape(1, -1)
    raise ValueError("attention arrays must be 1D or 2D")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
__all__ = [
|
|
118
|
+
"AttentionRouter",
|
|
119
|
+
"PredictiveAttention",
|
|
120
|
+
"SparseAttentionResult",
|
|
121
|
+
"SparseGraphBuilder",
|
|
122
|
+
]
|
|
123
|
+
|
natc-0.1.0/natc/benchmark/__init__.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Benchmark suite for NATC."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import tempfile
|
|
7
|
+
import time
|
|
8
|
+
import tracemalloc
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
from natc.cache import NeuralCache
|
|
15
|
+
from natc.dna import decode_model, encode_model
|
|
16
|
+
from natc.model import NATCModel
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(slots=True)
|
|
20
|
+
class BenchmarkResult:
|
|
21
|
+
"""NATC benchmark metrics."""
|
|
22
|
+
|
|
23
|
+
compression_ratio: float
|
|
24
|
+
memory_saved: float
|
|
25
|
+
speedup: float
|
|
26
|
+
cpu_efficiency: float
|
|
27
|
+
tokens_per_sec: float
|
|
28
|
+
cache_hit_ratio: float
|
|
29
|
+
latency: float
|
|
30
|
+
throughput: float
|
|
31
|
+
|
|
32
|
+
def to_dict(self) -> dict[str, float]:
|
|
33
|
+
return {
|
|
34
|
+
"compression_ratio": self.compression_ratio,
|
|
35
|
+
"memory_saved": self.memory_saved,
|
|
36
|
+
"speedup": self.speedup,
|
|
37
|
+
"cpu_efficiency": self.cpu_efficiency,
|
|
38
|
+
"tokens_per_sec": self.tokens_per_sec,
|
|
39
|
+
"cache_hit_ratio": self.cache_hit_ratio,
|
|
40
|
+
"latency": self.latency,
|
|
41
|
+
"throughput": self.throughput,
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
def to_json(self) -> str:
|
|
45
|
+
return json.dumps(self.to_dict(), indent=2, sort_keys=True)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class BenchmarkRunner:
    """Run synthetic compression and inference benchmarks."""

    def __init__(
        self,
        *,
        layers: int = 4,
        rows: int = 128,
        cols: int = 128,
        rank: int = 16,
        seed: int = 13,
    ) -> None:
        """Store the synthetic workload shape.

        Args:
            layers: Number of weight matrices to generate.
            rows: Row count of every weight matrix.
            cols: Column count of every weight matrix.
            rank: Rank of the shared low-rank base and of the encoding.
            seed: Seed for the NumPy random generator.
        """
        self.layers = layers
        self.rows = rows
        self.cols = cols
        self.rank = rank
        self.seed = seed

    def run(self) -> BenchmarkResult:
        """Execute the benchmark and return the collected metrics.

        Times a dense pass over the synthetic weights, then an
        encode/decode/dense pass, probes a temporary ``NeuralCache``, and
        finally times one ``NATCModel.generate`` call.
        """
        state = self._synthetic_state()

        # Only start (and later stop) the tracer when nobody else is using
        # it, and always release it even if a measured step raises.
        owns_tracer = not tracemalloc.is_tracing()
        if owns_tracer:
            tracemalloc.start()
        try:
            start = time.perf_counter()
            # Bound to a name so the dense results stay allocated while the
            # memory counters are sampled, as in the original measurement.
            dense_outputs = _dense_reference(state)  # noqa: F841
            dense_latency = time.perf_counter() - start
            dense_current, dense_peak = tracemalloc.get_traced_memory()

            start = time.perf_counter()
            dna = encode_model(state, rank=self.rank)
            reconstructed = decode_model(dna)
            compressed_outputs = _dense_reference(reconstructed)
            compressed_latency = time.perf_counter() - start
            compressed_current, compressed_peak = tracemalloc.get_traced_memory()
        finally:
            if owns_tracer:
                tracemalloc.stop()

        with tempfile.TemporaryDirectory() as tmpdir:
            cache = NeuralCache(tmpdir)
            cache.put("python pattern", {"value": "cached"})
            # One lookup with the stored key and one with a near-miss key.
            cache.get("python pattern")
            cache.get("python patterns")
            cache_hit_ratio = cache.hit_ratio()

        model = NATCModel.from_state_dict(state, rank=self.rank)
        generated_start = time.perf_counter()
        output = model.generate("Explain a Python matrix multiplication pattern", max_new_tokens=32)
        generated_latency = max(time.perf_counter() - generated_start, 1e-9)
        token_count = max(1, len(output.split()))

        # NOTE(review): tracemalloc's peak is cumulative since tracing began,
        # so ``compressed_peak - dense_peak`` is the growth of the high-water
        # mark, not the footprint of the compressed phase alone.
        dense_memory = max(dense_peak, dense_current, 1)
        compressed_memory = max(compressed_peak - dense_peak, compressed_current, 1)
        memory_saved = max(0.0, 1.0 - (compressed_memory / dense_memory))
        # Latencies are floored at 1 ns to keep the ratios finite.
        speedup = dense_latency / max(compressed_latency, 1e-9)
        throughput = len(compressed_outputs) / max(compressed_latency, 1e-9)
        cpu_efficiency = min(1.0, speedup / max(1.0, self.rank))

        return BenchmarkResult(
            compression_ratio=dna.compression_ratio(),
            memory_saved=memory_saved,
            speedup=speedup,
            cpu_efficiency=cpu_efficiency,
            tokens_per_sec=token_count / generated_latency,
            cache_hit_ratio=cache_hit_ratio,
            latency=compressed_latency,
            throughput=throughput,
        )

    def _synthetic_state(self) -> dict[str, np.ndarray]:
        """Generate ``layers`` float32 matrices: a shared low-rank base plus small noise.

        Keys are ``layers.<index>.weight``; the result is deterministic for a
        given ``seed``.
        """
        rng = np.random.default_rng(self.seed)
        base_left = rng.normal(size=(self.rows, self.rank)).astype(np.float32)
        base_right = rng.normal(size=(self.rank, self.cols)).astype(np.float32)
        # The low-rank product is the same for every layer; compute it once.
        base = base_left @ base_right
        state: dict[str, np.ndarray] = {}
        for layer in range(self.layers):
            noise = rng.normal(scale=0.01, size=(self.rows, self.cols)).astype(np.float32)
            state[f"layers.{layer}.weight"] = base + noise
        return state
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def run_benchmark(**kwargs: Any) -> dict[str, float]:
    """Run one benchmark and return its metrics as a plain dict.

    All keyword arguments are forwarded unchanged to ``BenchmarkRunner``.
    """
    runner = BenchmarkRunner(**kwargs)
    result = runner.run()
    return result.to_dict()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _dense_reference(state: dict[str, np.ndarray]) -> list[np.ndarray]:
    """Multiply every weight matrix by an all-ones float32 column vector.

    Args:
        state: Mapping of names to 2D weight matrices. Matrices may have
            different column counts.

    Returns:
        One ``(rows, 1)`` product per entry, in the mapping's iteration
        order. An empty mapping yields an empty list.
    """
    # One ones-vector per distinct width, so same-shaped layers share it and
    # mixed-width states still multiply cleanly.
    ones_by_width: dict[int, np.ndarray] = {}
    outputs: list[np.ndarray] = []
    for weight in state.values():
        width = weight.shape[1]
        vector = ones_by_width.get(width)
        if vector is None:
            vector = np.ones((width, 1), dtype=np.float32)
            ones_by_width[width] = vector
        outputs.append(weight @ vector)
    return outputs
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
__all__ = ["BenchmarkResult", "BenchmarkRunner", "run_benchmark"]
|
|
135
|
+
|
natc-0.1.0/natc/benchmark/cli.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Command-line benchmark runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
from natc.benchmark import BenchmarkRunner
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main() -> None:
    """Parse the benchmark options from the command line, run the suite, and print the JSON report."""
    parser = argparse.ArgumentParser(description="Run NATC benchmark suite.")
    # Every option is an integer flag whose name matches a BenchmarkRunner keyword.
    integer_options = (
        ("--layers", 4),
        ("--rows", 128),
        ("--cols", 128),
        ("--rank", 16),
        ("--seed", 13),
    )
    for flag, default in integer_options:
        parser.add_argument(flag, type=int, default=default)
    options = vars(parser.parse_args())
    runner = BenchmarkRunner(**options)
    report = runner.run()
    print(report.to_json())
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
|
|
29
|
+
main()
|
|
30
|
+
|