aneforge 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aneforge/__init__.py +136 -0
- aneforge/_blob.py +112 -0
- aneforge/_bridges/__init__.py +3 -0
- aneforge/_bridges/_netplist.py +3241 -0
- aneforge/_bridges/ane_cost_volume_fused.py +97 -0
- aneforge/_bridges/ane_cross_correlation_fused.py +88 -0
- aneforge/_bridges/ane_cross_product_fused.py +71 -0
- aneforge/_bridges/ane_dynamic_slice_fused.py +65 -0
- aneforge/_bridges/ane_fps_fused.py +93 -0
- aneforge/_bridges/ane_input_view_fused.py +69 -0
- aneforge/_bridges/ane_radius_search_fused.py +91 -0
- aneforge/_bridges/ane_rank_fused.py +255 -0
- aneforge/_bridges/ane_rearrange_fused.py +315 -0
- aneforge/_bridges/ane_sdpa_fused.py +261 -0
- aneforge/_bridges/ane_structural_fused.py +111 -0
- aneforge/_bridges/lrn_fused.py +137 -0
- aneforge/_bridges/minmax_norm_fused.py +136 -0
- aneforge/_bridges/scaled_elementwise_fused.py +125 -0
- aneforge/_capabilities.py +1033 -0
- aneforge/_circuit.py +76 -0
- aneforge/_compile.py +1735 -0
- aneforge/_cost.py +859 -0
- aneforge/_invokers/README.md +22 -0
- aneforge/_invokers/layer_invoker.mm +562 -0
- aneforge/_invokers/persistent_worker.mm +488 -0
- aneforge/_invokers/rank_invoker.mm +406 -0
- aneforge/_invokers/sdpa_invoker.mm +520 -0
- aneforge/_lib/ane_e5rt_dispatch.mm +1026 -0
- aneforge/_lib/build.sh +12 -0
- aneforge/_lib/e5rt_api.h +133 -0
- aneforge/_netplist_worker.py +326 -0
- aneforge/_op_catalog.py +272 -0
- aneforge/_optimize.py +886 -0
- aneforge/_paired.py +223 -0
- aneforge/_rewrite.py +333 -0
- aneforge/_runtime.py +347 -0
- aneforge/_targets.py +462 -0
- aneforge/ane_cost_model.json +1954 -0
- aneforge/autograd.py +1444 -0
- aneforge/build.py +81 -0
- aneforge/costmodel_curves.json +1482 -0
- aneforge/dsp.py +650 -0
- aneforge/einsum.py +385 -0
- aneforge/fft.py +590 -0
- aneforge/full_mil_vocabulary_sweep.json +1662 -0
- aneforge/graph.py +1193 -0
- aneforge/linalg.py +1060 -0
- aneforge/models.py +238 -0
- aneforge/special.py +440 -0
- aneforge/streaming.py +112 -0
- aneforge-0.1.0.dist-info/METADATA +220 -0
- aneforge-0.1.0.dist-info/RECORD +54 -0
- aneforge-0.1.0.dist-info/WHEEL +4 -0
- aneforge-0.1.0.dist-info/licenses/LICENSE +31 -0
aneforge/__init__.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""aneforge — a clean graph→compile→run frontend for the Apple Neural Engine.
|
|
2
|
+
|
|
3
|
+
Build a small tensor graph, `compile` it into ONE fused e5rt program, and run it
|
|
4
|
+
on the ANE. Fusing is the point: the ANE penalises many tiny dispatches, so a whole
|
|
5
|
+
subgraph becomes a single program. Weights pack automatically into one BLOBFILE —
|
|
6
|
+
fp16, or per-channel int8 *streamed* (dequantised during the tile DMA) when
|
|
7
|
+
`int8=True`.
|
|
8
|
+
|
|
9
|
+
import aneforge as af
|
|
10
|
+
|
|
11
|
+
x = af.input((1, 3, 32, 32))
|
|
12
|
+
h = af.conv(x, W1, pad=1).relu()
|
|
13
|
+
h = af.conv(h, W2, pad=1).relu()
|
|
14
|
+
y = h.mean((2, 3)).reshape(1, C) @ Wfc
|
|
15
|
+
net = af.compile(y, int8=True) # one fused ANE program
|
|
16
|
+
net = af.compile(y, compress="int4") # 4-bit LUT weights, accuracy-gated
|
|
17
|
+
out = net(image) # run on the ANE
|
|
18
|
+
|
|
19
|
+
Op surface:
|
|
20
|
+
- linear algebra: conv, conv_transpose; matmul/linear via `@`; bmm
|
|
21
|
+
- dynamic_conv: conv with a RUNTIME-tensor weight (hypernetworks / per-sample kernels;
|
|
22
|
+
native ANE dynamic kernel, batch-1 only)
|
|
23
|
+
- activations: relu/silu/gelu/sigmoid/tanh/exp/log/sqrt/rsqrt/abs/square/
|
|
24
|
+
sin/cos/erf/softplus/relu6/elu/leaky_relu/clip
|
|
25
|
+
- arithmetic: add/sub/mul/div(`/`)/maximum/minimum/pow
|
|
26
|
+
- reductions/norms: mean/sum/amax/amin, softmax, l2_norm, rms_norm/layer_norm/
|
|
27
|
+
group_norm/batch_norm
|
|
28
|
+
- spatial/shape: max_pool/avg_pool, upsample, concat, reshape/transpose,
|
|
29
|
+
pixel_shuffle/pixel_unshuffle
|
|
30
|
+
- nn helpers: mha, cross_attention, geglu
|
|
31
|
+
|
|
32
|
+
Two op routes. Most ops are FUSED e5rt-MIL: they lower to MIL and fuse into ONE
|
|
33
|
+
program (no graph cut). A second family are NETPLIST-BRIDGE ops — native Path-A
|
|
34
|
+
hardware layers Apple's MIL frontend never emits (sdpa, argmax/topk/sort,
|
|
35
|
+
cross_product/cross_correlation/cost_volume, fps/radius_search, minmax_norm/lrn,
|
|
36
|
+
the space/channel/batch rearranges, flatten/input_view/dynamic_slice/
|
|
37
|
+
scaled_elementwise). Each bridge op CUTS the graph: surrounding regions run as
|
|
38
|
+
e5rt programs, the bridge node runs as a separate native sub-program (sub-ms via
|
|
39
|
+
the A2 persistent worker), and `compile` returns a SegmentedModel.
|
|
40
|
+
|
|
41
|
+
Image input: `af.image_input(shape, scale=1/255, bias=0.0)` declares a uint8 input
|
|
42
|
+
port and dequantises it on the engine (`cast -> scale -> bias`), so raw camera /
|
|
43
|
+
decoded-video bytes feed the model directly (host skips the float-convert/repack);
|
|
44
|
+
`scale`/`bias` are scalar or per-channel (length-C, broadcast over NCHW).
|
|
45
|
+
|
|
46
|
+
Pretrained loaders: `af.load("…/all-MiniLM-L6-v2")` (sentence encoder),
|
|
47
|
+
`af.load_resnet18()` (ImageNet classifier).
|
|
48
|
+
|
|
49
|
+
Design rules: compute is fp16 only (fp32/int32/bf16
|
|
50
|
+
rejected); reductions/matmuls use a WIDE (fp32-class) accumulator fed by radix-4
|
|
51
|
+
fp16-rounded input tiles — representable sums are near-exact (a sum/dot of 16384 ones is
|
|
52
|
+
bit-exact, where naive fp16 would stall at ~2048), and a +1 survives next to a 16000
|
|
53
|
+
partial that an fp16 running sum would swallow. The fp16 limit is at the products and the
|
|
54
|
+
I/O cast, not the running sum, so cancellation-heavy reductions still lose precision;
|
|
55
|
+
`int8=True` streams weights at half the bytes. `compress=`
|
|
56
|
+
chooses weight encoding: None (fp16, default), 'int8' (per-channel), 'int4'
|
|
57
|
+
(LUT palettization, per-tensor, with an accuracy-gated fallback to int8/fp16 set by
|
|
58
|
+
`compress_atol`), 'sparse' (unstructured bitmask, emitted when the weight is >=50%
|
|
59
|
+
zeros, else fp16), or 'auto' (per-weight: sparse if sparse, else int4 if accurate,
|
|
60
|
+
else int8, else fp16). `int8=True` is the alias for `compress='int8'`. Wraps the
|
|
61
|
+
unentitled Espresso `e5rt` runtime only — no CoreML, no entitlement.
|
|
62
|
+
|
|
63
|
+
aneforge also has a tiny reverse-mode autograd (`autograd.py`): `af.parameter` /
|
|
64
|
+
`af.backward` / `af.mse` / `af.SGD` / `af.Trainer` train a small model with the
|
|
65
|
+
forward and backward passes compiled and run on the ANE. It also does
|
|
66
|
+
classification: `af.softmax_cross_entropy` (analytic fp16-stable on-ANE gradient) +
|
|
67
|
+
`af.Adam` train a 784->128->10 MLP on MNIST to ~97% test accuracy.
|
|
68
|
+
`Trainer(..., device_optimizer=True)` additionally runs the OPTIMIZER STEP on the
|
|
69
|
+
ANE (SGD/Adam update as graph ops), so all training tensor-math is on the engine;
|
|
70
|
+
the host only computes the scalar lr_t and shuttles state/grads (the host<->device
|
|
71
|
+
state round-trip remains). See examples/train_mnist_mlp.py.
|
|
72
|
+
|
|
73
|
+
Layout: graph.py (Tensor + ops), _compile.py (per-op emit registry + compile),
|
|
74
|
+
_blob.py (weight packing), autograd.py (on-ANE autograd), models.py (pretrained loaders).
|
|
75
|
+
"""
|
|
76
|
+
__version__ = "0.1.0"
|
|
77
|
+
|
|
78
|
+
from ._op_catalog import (OP_CATALOG, op_info, device_status, is_native, ops_on,
|
|
79
|
+
min_native_family, walled_everywhere, categories)
|
|
80
|
+
from .graph import (Tensor, affine, batch_norm, batch_to_space, channel_to_space, concat,
|
|
81
|
+
conv, conv_transpose, crop, dynamic_conv, cross_attention, cross_correlation,
|
|
82
|
+
cross_product, cost_volume, depth_to_space, dynamic_slice, einsum_native,
|
|
83
|
+
flatten, fps, gather, geglu, image_input, input, input_view, instance_norm,
|
|
84
|
+
local_response_norm, lrn, maximum, minimum, mha, minmax_norm,
|
|
85
|
+
pixel_shuffle, pixel_unshuffle, radius_search, resize_bilinear,
|
|
86
|
+
resize_nearest_neighbor, scaled_elementwise, sdpa, select, space_to_batch,
|
|
87
|
+
space_to_channel, space_to_depth, split, sort, stack, topk,
|
|
88
|
+
upsample_bilinear, where)
|
|
89
|
+
from ._compile import (Model, SegmentedModel, compile, PrecisionWarning,
|
|
90
|
+
CrossChipFP16Warning, DispatchFloorWarning)
|
|
91
|
+
from ._paired import Paired, paired
|
|
92
|
+
from ._optimize import tune, tune_precision
|
|
93
|
+
from ._cost import estimate, estimate_provenance, precision_risk, project_peak
|
|
94
|
+
from ._circuit import CompileBackoffError, reset as reset_compile_breaker
|
|
95
|
+
from ._rewrite import reduce_sum_to_matmul, paired_subtract
|
|
96
|
+
from .autograd import (Adam, adam_step, backward, backward_from, conv2d, conv_param,
|
|
97
|
+
mse, parameter, SGD, softmax_cross_entropy, Trainer, UnrolledTrainer)
|
|
98
|
+
from .streaming import CheckpointedStack
|
|
99
|
+
from .models import Encoder, Vision, load, load_resnet18, conv_block, cifar_cnn, group_norm_train
|
|
100
|
+
|
|
101
|
+
__all__ = [
|
|
102
|
+
"Tensor", "Model", "SegmentedModel", "PrecisionWarning", "CrossChipFP16Warning",
|
|
103
|
+
"DispatchFloorWarning",
|
|
104
|
+
"input", "image_input", "conv", "conv_transpose", "dynamic_conv", "concat",
|
|
105
|
+
"batch_norm", "maximum", "minimum", "mha", "cross_attention", "geglu", "sdpa",
|
|
106
|
+
"pixel_shuffle", "pixel_unshuffle", "topk", "sort", "cross_product",
|
|
107
|
+
"cross_correlation", "cost_volume", "fps", "radius_search", "minmax_norm", "lrn",
|
|
108
|
+
"space_to_channel", "channel_to_space", "space_to_batch", "batch_to_space",
|
|
109
|
+
"flatten", "input_view", "dynamic_slice", "scaled_elementwise",
|
|
110
|
+
"stack", "split", "select", "where", "OP_CATALOG", "op_info", "device_status", "is_native", "ops_on", "min_native_family", "walled_everywhere", "categories", "gather", "instance_norm", "local_response_norm", "einsum_native",
|
|
111
|
+
"space_to_depth", "depth_to_space", "crop", "resize_nearest_neighbor",
|
|
112
|
+
"resize_bilinear", "upsample_bilinear", "affine",
|
|
113
|
+
"compile", "tune", "tune_precision", "estimate", "estimate_provenance",
|
|
114
|
+
"precision_risk", "project_peak",
|
|
115
|
+
"CompileBackoffError", "reset_compile_breaker",
|
|
116
|
+
"reduce_sum_to_matmul", "paired_subtract",
|
|
117
|
+
"load", "load_resnet18", "Encoder", "Vision", "conv_block", "cifar_cnn", "group_norm_train",
|
|
118
|
+
"Paired", "paired",
|
|
119
|
+
"parameter", "backward", "backward_from", "mse", "SGD", "Adam",
|
|
120
|
+
"softmax_cross_entropy", "Trainer", "UnrolledTrainer", "adam_step",
|
|
121
|
+
"conv_param", "conv2d", "CheckpointedStack",
|
|
122
|
+
"fft", "linalg", "special", "einsum", "dsp",
|
|
123
|
+
]
|
|
124
|
+
|
|
125
|
+
# Applied-math submodules (import for discoverability: af.fft / af.linalg / af.special /
|
|
126
|
+
# af.dsp). Each is self-contained over the public ops.
|
|
127
|
+
from . import fft as fft
|
|
128
|
+
from . import linalg as linalg
|
|
129
|
+
from . import special as special
|
|
130
|
+
from . import einsum as einsum
|
|
131
|
+
from . import dsp as dsp
|
|
132
|
+
|
|
133
|
+
# `af.einsum(...)` is the general decomposer, directly callable. The package attribute
|
|
134
|
+
# shadows the submodule of the same name; `import aneforge.einsum` and
|
|
135
|
+
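# `from aneforge.einsum import ...` still resolve to the module via sys.modules; only attribute access on the package (`aneforge.einsum`, which `import aneforge.einsum as m` also uses) yields the function.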
|
|
136
|
+
from .einsum import einsum # noqa: F811
|
aneforge/_blob.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Weight packing for aneforge: one BLOBFILE holding every constant the program
|
|
2
|
+
needs (fp16, or per-channel int8 for streaming). The container is a 64-byte header
|
|
3
|
+
followed, per blob, by a 64-byte descriptor (magic 0xDEADBEEF, dtype code, length,
|
|
4
|
+
data offset) and the raw bytes. dtype codes: 1 = fp16, 4 = int8, 9 = uint1 (sparse
|
|
5
|
+
mask), 11 = uint4 (LUT)."""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import struct
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
_HEADER = 64
|
|
13
|
+
_DESCRIPTOR = 64
|
|
14
|
+
FP16, INT8 = 1, 4
|
|
15
|
+
UINT1 = 9 # BLOBFILE container dtype code for the 1-bit sparse mask (Task 8 spike;
|
|
16
|
+
# distinct from the MIL-proto DataType enum). FP16=1, INT8=4, UINT4=11.
|
|
17
|
+
UINT4 = 11 # BLOBFILE container dtype code for packed 4-bit LUT indices (Task 0 spike:
|
|
18
|
+
# read from CoreML's own weight.bin; distinct from the MIL-proto DataType
|
|
19
|
+
# enum). FP16=1, INT8=4 already exist.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def fp16_bytes(a: np.ndarray) -> bytes:
    """Cast `a` to float16 and return its raw bytes in C (row-major) order.

    The element order of the result follows the logical row-major layout of
    `a`, whatever its in-memory layout is.
    """
    half = a.astype(np.float16)
    return half.tobytes(order="C")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def quantize_per_row(W: np.ndarray) -> tuple[bytes, bytes]:
    """Symmetric int8 quantisation of an [OUT, IN] weight, one scale per row.

    Returns `(int8 bytes, fp16 scale bytes)`. Each row is scaled so that its
    largest magnitude maps to 127; the zero point is 0, so the weight is
    reconstructed as `int8 * scale[:, None]`.
    """
    weights = W.astype(np.float32)
    row_peak = np.abs(weights).max(axis=1, keepdims=True)
    # The 1e-8 floor keeps an all-zero row from dividing by zero below.
    scale = np.clip(row_peak / 127.0, 1e-8, None)
    quantised = np.round(weights / scale).clip(-127, 127).astype(np.int8)
    data_payload = np.ascontiguousarray(quantised).tobytes()
    scale_payload = fp16_bytes(scale[:, 0])
    return data_payload, scale_payload
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _nearest_centroid(values: np.ndarray, centroids: np.ndarray,
                      chunk: int = 1 << 16) -> np.ndarray:
    """Return, for every element of 1-D `values`, the uint8 index of its closest centroid.

    The distance table is built `chunk` elements at a time so the temporary is
    at most `chunk x len(centroids)` instead of `len(values) x len(centroids)`.
    Ties resolve to the lowest index (argmin), exactly as an unchunked
    `np.abs(values[:, None] - centroids[None, :]).argmin(axis=1)` would.
    """
    nearest = np.empty(values.shape[0], dtype=np.uint8)
    for start in range(0, values.shape[0], chunk):
        stop = start + chunk
        dist = np.abs(values[start:stop, None] - centroids[None, :])
        nearest[start:stop] = dist.argmin(axis=1)
    return nearest


def palettize_lut4(W: np.ndarray) -> tuple[bytes, bytes]:
    """Per-tensor 4-bit LUT palettization of an [OUT, IN] weight.

    Returns `(packed-index bytes, fp16 codebook bytes)`. The 16 centroids come
    from deterministic Lloyd iterations (at most 20) seeded from the midpoints
    of 16 equal-mass quantile bins; no RNG is involved, so the output is
    byte-stable. Indices are packed two per byte, low nibble first, row by row
    (an odd row length is padded with index 0). Reconstruction is
    `codebook[index]` (matches the on-device constexpr_lut_to_dense).

    The assignment step runs in bounded-size chunks and the refinement stops as
    soon as the assignment no longer changes; both are bit-identical to running
    all 20 iterations on the full distance table, because an unchanged
    assignment reproduces the same centroids on every later iteration.
    """
    flat = W.reshape(-1).astype(np.float32)
    edges = np.quantile(flat, np.linspace(0.0, 1.0, 17))
    centroids = ((edges[:-1] + edges[1:]) / 2.0).astype(np.float32)

    idx = _nearest_centroid(flat, centroids)
    for _ in range(20):
        for k in range(16):
            sel = flat[idx == k]
            if sel.size:  # an empty cluster keeps its previous centroid
                centroids[k] = sel.mean()
        updated = _nearest_centroid(flat, centroids)
        converged = np.array_equal(updated, idx)
        idx = updated
        if converged:
            break

    idx = idx.reshape(W.shape[0], -1)
    if idx.shape[1] % 2:  # pad odd row length with 0
        idx = np.pad(idx, ((0, 0), (0, 1)))
    # Indices are < 16, so the high-nibble shift cannot overflow uint8.
    packed = (idx[:, 0::2] | (idx[:, 1::2] << 4)).astype(np.uint8)
    return np.ascontiguousarray(packed).tobytes(), fp16_bytes(centroids)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def quantize_blockwise(W: np.ndarray, block_size: int = 32) -> tuple[bytes, bytes, int]:
    """Per-block symmetric int8 quantisation of an [OUT, IN] weight.

    Returns `(int8 data bytes, fp16 scale bytes, nblocks)`. The inner dim is
    split into `nblocks` contiguous blocks of equal width, each (row, block)
    with its own scale, so the scale tensor is [OUT, nblocks]. This is the
    layout the on-device `constexpr_blockwise_shift_scale(data=.., scale=..)`
    reconstructs as `data.reshape(OUT, nblocks, width) * scale[:, :, None]`
    (zero offset). Finer blocks track local weight scale, giving lower error
    than a single coarse scale.

    The block width must divide IN, so `block_size` is lowered to the largest
    divisor of IN that is <= `block_size` (a `block_size` larger than IN ends
    up as one block per row).

    NOTE(review): when IN has no divisor in [2, block_size] (e.g. IN is a
    prime larger than `block_size`) the width degenerates to 1, i.e. one fp16
    scale per element and `nblocks == IN` -- not a single coarse scale. The
    scale blob is then twice the size of the int8 data; confirm callers are
    happy with that rather than a per-row fallback.

    Raises:
        ValueError: if `block_size` is < 1, or `W` is not 2-D.
    """
    bs = int(block_size)
    if bs < 1:
        # Previously surfaced as a bare ZeroDivisionError (block_size == 0).
        raise ValueError(f"block_size must be >= 1, got {block_size!r}")
    W = W.astype(np.float32)
    OUT, IN = W.shape
    while bs > 1 and IN % bs:
        bs -= 1
    nblocks = IN // bs
    Wb = W.reshape(OUT, nblocks, bs)
    # The 1e-8 floor keeps an all-zero block from dividing by zero below.
    scale = np.clip(np.abs(Wb).max(axis=2) / 127.0, 1e-8, None)  # [OUT, nblocks]
    q = np.round(Wb / scale[:, :, None]).clip(-127, 127).astype(np.int8).reshape(OUT, IN)
    return np.ascontiguousarray(q).tobytes(), fp16_bytes(scale), nblocks
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def sparsify(W: np.ndarray) -> tuple[bytes, bytes]:
    """Bitmask sparse encoding of an [OUT, IN] weight.

    Returns `(packed 1-bit mask bytes, fp16 nonzero-value bytes)`, both in
    row-major (C) order. A mask bit of 1 marks a kept (nonzero) element; bits
    are LSB-first within each byte (element i -> bit i % 8 of byte i // 8).
    Reconstruction scatters the stored values into the set positions (matches
    constexpr_sparse_to_dense). The kept values are only subject to fp16
    rounding.
    """
    values = W.reshape(-1)
    keep = values != 0.0
    bitmask = np.packbits(keep.astype(np.uint8), bitorder="little")
    mask_payload = np.ascontiguousarray(bitmask).tobytes()
    value_payload = fp16_bytes(values[keep])
    return mask_payload, value_payload
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class BlobWriter:
    """Accumulates weight payloads and serialises them into one BLOBFILE.

    `add` returns each blob's descriptor offset (the value a MIL
    `BLOBFILE(offset=...)` reference uses); `build` emits the container: a
    64-byte header followed, per blob, by a 64-byte descriptor and the raw
    payload bytes.

    NOTE(review): blobs are packed back-to-back with no alignment padding, so a
    payload whose length is not a multiple of 64 leaves the next descriptor
    unaligned -- confirm the consumer does not require 64-byte alignment.
    """

    def __init__(self) -> None:
        self._items: list[tuple[bytes, int]] = []
        # Offset at which the next descriptor will be written. Tracked
        # incrementally so `add` does not have to re-sum every stored payload.
        self._next_offset: int = _HEADER

    def add(self, payload: bytes, code: int) -> int:
        """Queue `payload` with dtype `code`; return its descriptor offset."""
        size = len(payload)  # evaluated first so a bad payload is never queued
        offset = self._next_offset
        self._items.append((payload, code))
        self._next_offset = offset + _DESCRIPTOR + size
        return offset

    def build(self) -> bytes:
        """Return the complete container as bytes (does not modify the writer)."""
        # Header: blob count, then the constant 2 (presumably the container
        # format version -- confirm), zero-padded to 64 bytes.
        parts = [struct.pack("<II", len(self._items), 2) + b"\0" * 56]
        cursor = _HEADER
        for payload, code in self._items:
            data_offset = cursor + _DESCRIPTOR
            # Descriptor: magic, dtype code, payload length, absolute data
            # offset; zero-padded to 64 bytes.
            parts.append(struct.pack("<IIQQ", 0xDEADBEEF, code, len(payload), data_offset) + b"\0" * 40)
            parts.append(payload)
            cursor = data_offset + len(payload)
        return b"".join(parts)
|