aneforge 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aneforge-0.1.0/.githooks/pre-commit +35 -0
- aneforge-0.1.0/.github/workflows/ci.yml +103 -0
- aneforge-0.1.0/.github/workflows/release.yml +70 -0
- aneforge-0.1.0/.gitignore +111 -0
- aneforge-0.1.0/.readthedocs.yaml +15 -0
- aneforge-0.1.0/CITATION.cff +12 -0
- aneforge-0.1.0/CONTRIBUTING.md +53 -0
- aneforge-0.1.0/LICENSE +31 -0
- aneforge-0.1.0/PKG-INFO +220 -0
- aneforge-0.1.0/README.md +183 -0
- aneforge-0.1.0/SECURITY.md +50 -0
- aneforge-0.1.0/aneforge/__init__.py +136 -0
- aneforge-0.1.0/aneforge/_blob.py +112 -0
- aneforge-0.1.0/aneforge/_bridges/__init__.py +3 -0
- aneforge-0.1.0/aneforge/_bridges/_netplist.py +3241 -0
- aneforge-0.1.0/aneforge/_bridges/ane_cost_volume_fused.py +97 -0
- aneforge-0.1.0/aneforge/_bridges/ane_cross_correlation_fused.py +88 -0
- aneforge-0.1.0/aneforge/_bridges/ane_cross_product_fused.py +71 -0
- aneforge-0.1.0/aneforge/_bridges/ane_dynamic_slice_fused.py +65 -0
- aneforge-0.1.0/aneforge/_bridges/ane_fps_fused.py +93 -0
- aneforge-0.1.0/aneforge/_bridges/ane_input_view_fused.py +69 -0
- aneforge-0.1.0/aneforge/_bridges/ane_radius_search_fused.py +91 -0
- aneforge-0.1.0/aneforge/_bridges/ane_rank_fused.py +255 -0
- aneforge-0.1.0/aneforge/_bridges/ane_rearrange_fused.py +315 -0
- aneforge-0.1.0/aneforge/_bridges/ane_sdpa_fused.py +261 -0
- aneforge-0.1.0/aneforge/_bridges/ane_structural_fused.py +111 -0
- aneforge-0.1.0/aneforge/_bridges/lrn_fused.py +137 -0
- aneforge-0.1.0/aneforge/_bridges/minmax_norm_fused.py +136 -0
- aneforge-0.1.0/aneforge/_bridges/scaled_elementwise_fused.py +125 -0
- aneforge-0.1.0/aneforge/_capabilities.py +1033 -0
- aneforge-0.1.0/aneforge/_circuit.py +76 -0
- aneforge-0.1.0/aneforge/_compile.py +1735 -0
- aneforge-0.1.0/aneforge/_cost.py +859 -0
- aneforge-0.1.0/aneforge/_invokers/README.md +22 -0
- aneforge-0.1.0/aneforge/_invokers/layer_invoker.mm +562 -0
- aneforge-0.1.0/aneforge/_invokers/persistent_worker.mm +488 -0
- aneforge-0.1.0/aneforge/_invokers/rank_invoker.mm +406 -0
- aneforge-0.1.0/aneforge/_invokers/sdpa_invoker.mm +520 -0
- aneforge-0.1.0/aneforge/_lib/ane_e5rt_dispatch.mm +1026 -0
- aneforge-0.1.0/aneforge/_lib/build.sh +12 -0
- aneforge-0.1.0/aneforge/_lib/e5rt_api.h +133 -0
- aneforge-0.1.0/aneforge/_netplist_worker.py +326 -0
- aneforge-0.1.0/aneforge/_op_catalog.py +272 -0
- aneforge-0.1.0/aneforge/_optimize.py +886 -0
- aneforge-0.1.0/aneforge/_paired.py +223 -0
- aneforge-0.1.0/aneforge/_rewrite.py +333 -0
- aneforge-0.1.0/aneforge/_runtime.py +347 -0
- aneforge-0.1.0/aneforge/_targets.py +462 -0
- aneforge-0.1.0/aneforge/ane_cost_model.json +1954 -0
- aneforge-0.1.0/aneforge/autograd.py +1444 -0
- aneforge-0.1.0/aneforge/build.py +81 -0
- aneforge-0.1.0/aneforge/costmodel_curves.json +1482 -0
- aneforge-0.1.0/aneforge/dsp.py +650 -0
- aneforge-0.1.0/aneforge/einsum.py +385 -0
- aneforge-0.1.0/aneforge/fft.py +590 -0
- aneforge-0.1.0/aneforge/full_mil_vocabulary_sweep.json +1662 -0
- aneforge-0.1.0/aneforge/graph.py +1193 -0
- aneforge-0.1.0/aneforge/linalg.py +1060 -0
- aneforge-0.1.0/aneforge/models.py +238 -0
- aneforge-0.1.0/aneforge/special.py +440 -0
- aneforge-0.1.0/aneforge/streaming.py +112 -0
- aneforge-0.1.0/bench/README.md +31 -0
- aneforge-0.1.0/bench/below_ridge_fusion.py +229 -0
- aneforge-0.1.0/bench/compress_speedup_bench.py +112 -0
- aneforge-0.1.0/bench/cross_path_compress_bench.py +119 -0
- aneforge-0.1.0/bench/decode_int8_accuracy.py +117 -0
- aneforge-0.1.0/bench/decode_measurement.py +511 -0
- aneforge-0.1.0/bench/device_bandwidth_roofline.py +513 -0
- aneforge-0.1.0/bench/device_compare.py +640 -0
- aneforge-0.1.0/bench/device_compare_wattcomplete.py +810 -0
- aneforge-0.1.0/bench/device_saturation_sweep.py +428 -0
- aneforge-0.1.0/bench/device_serving_sweep.py +564 -0
- aneforge-0.1.0/bench/encoder_serving_crosspath.py +316 -0
- aneforge-0.1.0/bench/fused_gpu_baseline.py +403 -0
- aneforge-0.1.0/bench/gemv_bandwidth_sweep.py +271 -0
- aneforge-0.1.0/bench/model_int4_bench.py +312 -0
- aneforge-0.1.0/bench/real_models_fp16.py +219 -0
- aneforge-0.1.0/bench/results/below_ridge_fusion.json +94 -0
- aneforge-0.1.0/bench/results/compress_speedup_bench.json +503 -0
- aneforge-0.1.0/bench/results/cross_path_compress_bench.json +152 -0
- aneforge-0.1.0/bench/results/decode_int8_accuracy.json +32 -0
- aneforge-0.1.0/bench/results/decode_measurement_results.json +485 -0
- aneforge-0.1.0/bench/results/device_bandwidth_roofline_results.json +1410 -0
- aneforge-0.1.0/bench/results/device_compare_wattcomplete_results.json +1124 -0
- aneforge-0.1.0/bench/results/device_compare_wattcomplete_results_M1.json +964 -0
- aneforge-0.1.0/bench/results/device_compare_wattcomplete_results_M2.json +1124 -0
- aneforge-0.1.0/bench/results/device_compare_wattcomplete_results_M5.json +1124 -0
- aneforge-0.1.0/bench/results/device_saturation_sweep_results.json +708 -0
- aneforge-0.1.0/bench/results/device_serving_sweep_results.json +1838 -0
- aneforge-0.1.0/bench/results/encoder_serving_crosspath.json +245 -0
- aneforge-0.1.0/bench/results/fused_gpu_baseline_results.json +301 -0
- aneforge-0.1.0/bench/results/gemv_bandwidth_sweep_results.json +364 -0
- aneforge-0.1.0/bench/results/model_int4_bench.json +158 -0
- aneforge-0.1.0/bench/results/real_models_fp16_results.json +89 -0
- aneforge-0.1.0/bench/results/roofline_analysis_results.json +615 -0
- aneforge-0.1.0/bench/roofline_analysis.py +611 -0
- aneforge-0.1.0/docs/aneforge-api.md +322 -0
- aneforge-0.1.0/docs/api/compile.md +25 -0
- aneforge-0.1.0/docs/api/graph.md +6 -0
- aneforge-0.1.0/docs/api/index.md +12 -0
- aneforge-0.1.0/docs/api/math.md +25 -0
- aneforge-0.1.0/docs/api/models.md +5 -0
- aneforge-0.1.0/docs/api/training.md +10 -0
- aneforge-0.1.0/docs/assets/demo.png +0 -0
- aneforge-0.1.0/docs/assets/demo.tape +35 -0
- aneforge-0.1.0/docs/assets/fluid_vorticity.png +0 -0
- aneforge-0.1.0/docs/capabilities.json +2174 -0
- aneforge-0.1.0/docs/capabilities.md +283 -0
- aneforge-0.1.0/docs/cross-chip.md +258 -0
- aneforge-0.1.0/docs/development.md +104 -0
- aneforge-0.1.0/docs/dispatch.md +161 -0
- aneforge-0.1.0/docs/e5rt-dispatch-reference.md +335 -0
- aneforge-0.1.0/docs/faq.md +259 -0
- aneforge-0.1.0/docs/gen_op_catalog.py +60 -0
- aneforge-0.1.0/docs/getting-started.md +131 -0
- aneforge-0.1.0/docs/glossary.md +203 -0
- aneforge-0.1.0/docs/index.md +56 -0
- aneforge-0.1.0/docs/mil-primer.md +317 -0
- aneforge-0.1.0/docs/op-catalog.md +248 -0
- aneforge-0.1.0/docs/reproducibility.md +88 -0
- aneforge-0.1.0/docs/requirements.txt +6 -0
- aneforge-0.1.0/docs/roadmap.md +303 -0
- aneforge-0.1.0/docs/stylesheets/extra.css +15 -0
- aneforge-0.1.0/docs/training.md +353 -0
- aneforge-0.1.0/examples/README.md +111 -0
- aneforge-0.1.0/examples/_common.py +79 -0
- aneforge-0.1.0/examples/autotune.py +271 -0
- aneforge-0.1.0/examples/benchmarks/README.md +12 -0
- aneforge-0.1.0/examples/benchmarks/bench_encoder_batched.py +102 -0
- aneforge-0.1.0/examples/benchmarks/bench_encoder_gpu.py +66 -0
- aneforge-0.1.0/examples/benchmarks/rank_worker_bench.py +98 -0
- aneforge-0.1.0/examples/benchmarks/sdpa_worker_bench.py +93 -0
- aneforge-0.1.0/examples/benchmarks/topk_worker_bench.py +95 -0
- aneforge-0.1.0/examples/cifar_data.py +44 -0
- aneforge-0.1.0/examples/compress_weights.py +145 -0
- aneforge-0.1.0/examples/data/mnist_subset.npz +0 -0
- aneforge-0.1.0/examples/demo.py +153 -0
- aneforge-0.1.0/examples/demos/README.md +71 -0
- aneforge-0.1.0/examples/demos/ane_vs_gpu_cpu.py +63 -0
- aneforge-0.1.0/examples/demos/batching_amortization.py +42 -0
- aneforge-0.1.0/examples/demos/capability_surface.py +40 -0
- aneforge-0.1.0/examples/demos/chaining_depth.py +49 -0
- aneforge-0.1.0/examples/demos/cross_chip_cost_model.py +53 -0
- aneforge-0.1.0/examples/demos/dispatch_no_coreml.py +46 -0
- aneforge-0.1.0/examples/demos/entitlement_boundary.py +39 -0
- aneforge-0.1.0/examples/demos/execution_model_floor.py +46 -0
- aneforge-0.1.0/examples/demos/hidden_layers.py +51 -0
- aneforge-0.1.0/examples/demos/llm_attention_kvcache.py +59 -0
- aneforge-0.1.0/examples/demos/mil_dialect.py +37 -0
- aneforge-0.1.0/examples/demos/numerical_scientific.py +47 -0
- aneforge-0.1.0/examples/demos/numerics_fp16.py +49 -0
- aneforge-0.1.0/examples/demos/optimization_autotune.py +53 -0
- aneforge-0.1.0/examples/demos/pitfalls_limits.py +53 -0
- aneforge-0.1.0/examples/demos/power_efficiency.py +53 -0
- aneforge-0.1.0/examples/demos/resident_state.py +44 -0
- aneforge-0.1.0/examples/demos/roofline_bandwidth.py +44 -0
- aneforge-0.1.0/examples/demos/roofline_compute.py +45 -0
- aneforge-0.1.0/examples/demos/single_in_flight.py +55 -0
- aneforge-0.1.0/examples/demos/training_on_ane.py +55 -0
- aneforge-0.1.0/examples/demos/vision_conv_encoder.py +63 -0
- aneforge-0.1.0/examples/demos/weights_compression.py +54 -0
- aneforge-0.1.0/examples/demos/what_the_ane_is.py +41 -0
- aneforge-0.1.0/examples/demos/zero_copy_io.py +56 -0
- aneforge-0.1.0/examples/eigenvalues_svd.py +66 -0
- aneforge-0.1.0/examples/factorize.py +39 -0
- aneforge-0.1.0/examples/fft.py +37 -0
- aneforge-0.1.0/examples/fluid_vorticity.py +287 -0
- aneforge-0.1.0/examples/gpt_generate_ane.py +168 -0
- aneforge-0.1.0/examples/gpt_multilayer_resident.py +116 -0
- aneforge-0.1.0/examples/heat_equation.py +142 -0
- aneforge-0.1.0/examples/llama_block_causal.py +79 -0
- aneforge-0.1.0/examples/make_mnist_subset.py +63 -0
- aneforge-0.1.0/examples/native_geometry.py +78 -0
- aneforge-0.1.0/examples/native_norms.py +59 -0
- aneforge-0.1.0/examples/native_pixel_ops.py +126 -0
- aneforge-0.1.0/examples/native_ranking.py +51 -0
- aneforge-0.1.0/examples/nbody.py +109 -0
- aneforge-0.1.0/examples/paired_fp16.py +244 -0
- aneforge-0.1.0/examples/pointcloud.py +88 -0
- aneforge-0.1.0/examples/poisson_spectral.py +101 -0
- aneforge-0.1.0/examples/quickstart.py +92 -0
- aneforge-0.1.0/examples/resnet18.py +38 -0
- aneforge-0.1.0/examples/sd15.py +504 -0
- aneforge-0.1.0/examples/sd_unet.py +105 -0
- aneforge-0.1.0/examples/sd_vae.py +98 -0
- aneforge-0.1.0/examples/sdpa.py +65 -0
- aneforge-0.1.0/examples/sentence_embeddings.py +61 -0
- aneforge-0.1.0/examples/solve_linear_systems.py +40 -0
- aneforge-0.1.0/examples/spectral_analysis.py +142 -0
- aneforge-0.1.0/examples/superres_espcn.py +82 -0
- aneforge-0.1.0/examples/train_charlm.py +110 -0
- aneforge-0.1.0/examples/train_charlm_corpus.py +120 -0
- aneforge-0.1.0/examples/train_charlm_deep.py +137 -0
- aneforge-0.1.0/examples/train_cifar_cnn.py +116 -0
- aneforge-0.1.0/examples/train_llama_block.py +64 -0
- aneforge-0.1.0/examples/train_mnist_cnn.py +67 -0
- aneforge-0.1.0/examples/train_mnist_mlp.py +65 -0
- aneforge-0.1.0/examples/train_transformer.py +105 -0
- aneforge-0.1.0/examples/train_transformer_prenorm.py +66 -0
- aneforge-0.1.0/examples/vit.py +259 -0
- aneforge-0.1.0/mkdocs.yml +100 -0
- aneforge-0.1.0/pyproject.toml +110 -0
- aneforge-0.1.0/scripts/reproduce.sh +126 -0
- aneforge-0.1.0/tests/_corpus.py +162 -0
- aneforge-0.1.0/tests/conftest.py +23 -0
- aneforge-0.1.0/tests/op_smoketest.py +99 -0
- aneforge-0.1.0/tests/run_corpus.py +62 -0
- aneforge-0.1.0/tests/test_autograd.py +1349 -0
- aneforge-0.1.0/tests/test_blas.py +347 -0
- aneforge-0.1.0/tests/test_broad.py +155 -0
- aneforge-0.1.0/tests/test_builder_guards.py +55 -0
- aneforge-0.1.0/tests/test_compile_breaker.py +114 -0
- aneforge-0.1.0/tests/test_compile_targets.py +105 -0
- aneforge-0.1.0/tests/test_compress.py +528 -0
- aneforge-0.1.0/tests/test_conv_int8.py +139 -0
- aneforge-0.1.0/tests/test_corners.py +280 -0
- aneforge-0.1.0/tests/test_cost_model_analytic.py +208 -0
- aneforge-0.1.0/tests/test_cross_chip_ops.py +130 -0
- aneforge-0.1.0/tests/test_cross_compile.py +81 -0
- aneforge-0.1.0/tests/test_cross_compile_matrix.py +68 -0
- aneforge-0.1.0/tests/test_decoder_block.py +119 -0
- aneforge-0.1.0/tests/test_dispatch_floor_warning.py +41 -0
- aneforge-0.1.0/tests/test_dynamic_conv.py +66 -0
- aneforge-0.1.0/tests/test_fft2.py +80 -0
- aneforge-0.1.0/tests/test_fp16_cross_chip.py +120 -0
- aneforge-0.1.0/tests/test_group_norm_tiling.py +83 -0
- aneforge-0.1.0/tests/test_image_input.py +137 -0
- aneforge-0.1.0/tests/test_lapack.py +205 -0
- aneforge-0.1.0/tests/test_linalg.py +231 -0
- aneforge-0.1.0/tests/test_multilayer_resident.py +14 -0
- aneforge-0.1.0/tests/test_new_ops.py +119 -0
- aneforge-0.1.0/tests/test_nn_blocks.py +446 -0
- aneforge-0.1.0/tests/test_numerical.py +707 -0
- aneforge-0.1.0/tests/test_op_catalog.py +40 -0
- aneforge-0.1.0/tests/test_op_coverage.py +169 -0
- aneforge-0.1.0/tests/test_pde_ode.py +866 -0
- aneforge-0.1.0/tests/test_routes.py +172 -0
- aneforge-0.1.0/tests/test_sdpa_causal.py +80 -0
- aneforge-0.1.0/tests/test_shapes.py +152 -0
- aneforge-0.1.0/tests/test_special_trig.py +62 -0
- aneforge-0.1.0/tests/test_spectral_sci.py +656 -0
- aneforge-0.1.0/tests/test_streaming.py +104 -0
- aneforge-0.1.0/tests/test_synthetic.py +361 -0
- aneforge-0.1.0/tests/test_targets.py +268 -0
- aneforge-0.1.0/tests/test_train_cifar.py +140 -0
- aneforge-0.1.0/tests/test_tune_guards.py +230 -0
- aneforge-0.1.0/tests/test_vjp_sweep.py +69 -0
- aneforge-0.1.0/tests/test_zero_copy_io.py +54 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/bin/sh
|
|
2
|
+
# Pre-commit gate for ANEForge: the off-hardware CI checks, run before every commit so
|
|
3
|
+
# broken code cannot land. Mirrors the ruff and compileall steps of .github/workflows/ci.yml:
|
|
4
|
+
# 1. ruff check (lint)
|
|
5
|
+
# 2. compileall aneforge (every module byte-compiles)
|
|
6
|
+
# The on-device corpus (tests/run_corpus.py) and pytest suites need ANE hardware, so they
|
|
7
|
+
# are NOT run here (they gate separately; see docs/development.md).
|
|
8
|
+
#
|
|
9
|
+
# Enable once per clone (git does not auto-trust committed hooks):
|
|
10
|
+
# git config core.hooksPath .githooks
|
|
11
|
+
# Bypass a single commit with: git commit --no-verify
|
|
12
|
+
set -e
|
|
13
|
+
cd "$(git rev-parse --show-toplevel)"
|
|
14
|
+
|
|
15
|
+
if [ -x .venv/bin/ruff ]; then RUFF=.venv/bin/ruff
|
|
16
|
+
elif command -v ruff >/dev/null 2>&1; then RUFF=ruff
|
|
17
|
+
else echo "pre-commit: ruff not found - run: pip install -e '.[dev]'" >&2; exit 1; fi
|
|
18
|
+
|
|
19
|
+
if [ -x .venv/bin/python ]; then PY=.venv/bin/python
|
|
20
|
+
elif command -v python3 >/dev/null 2>&1; then PY=python3
|
|
21
|
+
else echo "pre-commit: python3 not found" >&2; exit 1; fi
|
|
22
|
+
|
|
23
|
+
echo "pre-commit: ruff check"
|
|
24
|
+
if ! "$RUFF" check; then
|
|
25
|
+
echo "pre-commit: BLOCKED - ruff check failed. Fix (or '$RUFF check --fix'), or bypass with 'git commit --no-verify'." >&2
|
|
26
|
+
exit 1
|
|
27
|
+
fi
|
|
28
|
+
|
|
29
|
+
echo "pre-commit: compileall aneforge"
|
|
30
|
+
if ! "$PY" -m compileall -q aneforge; then
|
|
31
|
+
echo "pre-commit: BLOCKED - a module in aneforge/ failed to byte-compile." >&2
|
|
32
|
+
exit 1
|
|
33
|
+
fi
|
|
34
|
+
|
|
35
|
+
echo "pre-commit: OK"
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
# ANEForge dispatches to the Apple Neural Engine, which GitHub's hosted runners cannot
|
|
4
|
+
# reach: their macOS images are Apple Silicon but virtualized, and the ANE is not passed
|
|
5
|
+
# through to the guest (the same reason CoreML falls back to CPU in a VM). So the on-device
|
|
6
|
+
# correctness corpus (tests/run_corpus.py) is run by hand on a Mac with an ANE, e.g.
|
|
7
|
+
# `bash scripts/reproduce.sh gates`. This workflow covers everything CI *can* verify:
|
|
8
|
+
# lint, packaging, a multi-version compile + import matrix, the hardware-free unit tests,
|
|
9
|
+
# the documentation build, and — on a hosted macOS runner — that the Objective-C++
|
|
10
|
+
# dispatch dylib actually compiles (which the Linux jobs cannot do).
|
|
11
|
+
|
|
12
|
+
on:
|
|
13
|
+
push:
|
|
14
|
+
branches: [main]
|
|
15
|
+
pull_request:
|
|
16
|
+
|
|
17
|
+
concurrency:
|
|
18
|
+
group: ci-${{ github.ref }}
|
|
19
|
+
cancel-in-progress: true
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
lint:
|
|
23
|
+
name: lint (ruff)
|
|
24
|
+
runs-on: ubuntu-latest
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v6
|
|
27
|
+
- uses: actions/setup-python@v6
|
|
28
|
+
with:
|
|
29
|
+
python-version: "3.12"
|
|
30
|
+
- run: pip install ruff
|
|
31
|
+
- run: ruff check
|
|
32
|
+
|
|
33
|
+
build:
|
|
34
|
+
name: package (build + twine)
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/checkout@v6
|
|
38
|
+
- uses: actions/setup-python@v6
|
|
39
|
+
with:
|
|
40
|
+
python-version: "3.12"
|
|
41
|
+
- run: pip install build twine
|
|
42
|
+
- run: python -m build
|
|
43
|
+
- run: twine check dist/*
|
|
44
|
+
|
|
45
|
+
compile-import:
|
|
46
|
+
name: compile + import (py ${{ matrix.python }})
|
|
47
|
+
runs-on: ubuntu-latest
|
|
48
|
+
strategy:
|
|
49
|
+
fail-fast: false
|
|
50
|
+
matrix:
|
|
51
|
+
python: ["3.10", "3.11", "3.12", "3.13"]
|
|
52
|
+
steps:
|
|
53
|
+
- uses: actions/checkout@v6
|
|
54
|
+
- uses: actions/setup-python@v6
|
|
55
|
+
with:
|
|
56
|
+
python-version: ${{ matrix.python }}
|
|
57
|
+
# Every module byte-compiles (no import, so no dylib/ANE needed).
|
|
58
|
+
- run: python -m compileall -q aneforge
|
|
59
|
+
# The package imports with only NumPy and builds a graph; compile/dispatch is lazy
|
|
60
|
+
# and needs the ANE, so it is exercised on-device, not here.
|
|
61
|
+
- run: pip install -e .
|
|
62
|
+
- run: python -c "import aneforge as af; print('aneforge', af.__version__); af.input((1, 3, 8, 8))"
|
|
63
|
+
|
|
64
|
+
unit-tests:
|
|
65
|
+
name: off-device unit tests
|
|
66
|
+
runs-on: ubuntu-latest
|
|
67
|
+
steps:
|
|
68
|
+
- uses: actions/checkout@v6
|
|
69
|
+
- uses: actions/setup-python@v6
|
|
70
|
+
with:
|
|
71
|
+
python-version: "3.12"
|
|
72
|
+
- run: pip install -e ".[dev]"
|
|
73
|
+
# The compile-backoff rate-limiter is a pure unit test (fake clock, no ANE).
|
|
74
|
+
- run: pytest tests/test_compile_breaker.py -q
|
|
75
|
+
|
|
76
|
+
docs:
|
|
77
|
+
name: docs (mkdocs build)
|
|
78
|
+
runs-on: ubuntu-latest
|
|
79
|
+
steps:
|
|
80
|
+
- uses: actions/checkout@v6
|
|
81
|
+
- uses: actions/setup-python@v6
|
|
82
|
+
with:
|
|
83
|
+
python-version: "3.12"
|
|
84
|
+
- run: pip install -r docs/requirements.txt
|
|
85
|
+
- run: mkdocs build --strict
|
|
86
|
+
|
|
87
|
+
macos:
|
|
88
|
+
name: macOS build + smoke (Apple Silicon, no ANE)
|
|
89
|
+
runs-on: macos-14
|
|
90
|
+
steps:
|
|
91
|
+
- uses: actions/checkout@v6
|
|
92
|
+
- uses: actions/setup-python@v6
|
|
93
|
+
with:
|
|
94
|
+
python-version: "3.12"
|
|
95
|
+
# The Objective-C++ dispatch shim links Apple frameworks and only builds on macOS;
|
|
96
|
+
# this proves it compiles. The hosted runner has no ANE, so it is never dispatched
|
|
97
|
+
# (the dylib loads lazily on first dispatch, which does not happen here).
|
|
98
|
+
- run: sh aneforge/_lib/build.sh
|
|
99
|
+
- run: python -m compileall -q aneforge
|
|
100
|
+
- run: pip install -e ".[dev]"
|
|
101
|
+
# Import + graph build on macOS, and the hardware-free unit test.
|
|
102
|
+
- run: python -c "import aneforge as af; print('aneforge', af.__version__); af.input((1, 3, 8, 8))"
|
|
103
|
+
- run: pytest tests/test_compile_breaker.py -q
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Build and publish aneforge to PyPI via Trusted Publishing (OIDC, no stored token).
|
|
2
|
+
#
|
|
3
|
+
# One-time setup on PyPI (https://pypi.org/manage/account/publishing/): add a pending
|
|
4
|
+
# publisher for project "aneforge", owner "sbryngelson", repo "ANEForge", workflow
|
|
5
|
+
# "release.yml", environment "pypi". Repeat on https://test.pypi.org with environment
|
|
6
|
+
# "testpypi" if you want the dry-run path.
|
|
7
|
+
#
|
|
8
|
+
# Publish to PyPI: push a tag matching v* (e.g. `git tag v0.1.0 && git push --tags`).
|
|
9
|
+
# Dry-run to TestPyPI: Actions tab -> Release -> Run workflow -> target = testpypi.
|
|
10
|
+
name: Release
|
|
11
|
+
|
|
12
|
+
on:
|
|
13
|
+
push:
|
|
14
|
+
tags: ["v*"]
|
|
15
|
+
workflow_dispatch:
|
|
16
|
+
inputs:
|
|
17
|
+
target:
|
|
18
|
+
description: "Publish target"
|
|
19
|
+
type: choice
|
|
20
|
+
options: [testpypi, pypi]
|
|
21
|
+
default: testpypi
|
|
22
|
+
|
|
23
|
+
permissions:
|
|
24
|
+
contents: read
|
|
25
|
+
|
|
26
|
+
jobs:
|
|
27
|
+
build:
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
steps:
|
|
30
|
+
- uses: actions/checkout@v6
|
|
31
|
+
- uses: actions/setup-python@v6
|
|
32
|
+
with:
|
|
33
|
+
python-version: "3.12"
|
|
34
|
+
- run: python -m pip install --upgrade build
|
|
35
|
+
- run: python -m build
|
|
36
|
+
- run: python -m pip install --upgrade twine && python -m twine check dist/*
|
|
37
|
+
- uses: actions/upload-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
name: dist
|
|
40
|
+
path: dist/
|
|
41
|
+
|
|
42
|
+
publish-testpypi:
|
|
43
|
+
needs: build
|
|
44
|
+
if: github.event_name == 'workflow_dispatch' && inputs.target == 'testpypi'
|
|
45
|
+
runs-on: ubuntu-latest
|
|
46
|
+
environment: testpypi
|
|
47
|
+
permissions:
|
|
48
|
+
id-token: write
|
|
49
|
+
steps:
|
|
50
|
+
- uses: actions/download-artifact@v4
|
|
51
|
+
with:
|
|
52
|
+
name: dist
|
|
53
|
+
path: dist/
|
|
54
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
55
|
+
with:
|
|
56
|
+
repository-url: https://test.pypi.org/legacy/
|
|
57
|
+
|
|
58
|
+
publish-pypi:
|
|
59
|
+
needs: build
|
|
60
|
+
if: github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.target == 'pypi')
|
|
61
|
+
runs-on: ubuntu-latest
|
|
62
|
+
environment: pypi
|
|
63
|
+
permissions:
|
|
64
|
+
id-token: write
|
|
65
|
+
steps:
|
|
66
|
+
- uses: actions/download-artifact@v4
|
|
67
|
+
with:
|
|
68
|
+
name: dist
|
|
69
|
+
path: dist/
|
|
70
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# =====================================================================
|
|
2
|
+
# ANEForge .gitignore
|
|
3
|
+
#
|
|
4
|
+
# Source only. Everything generated (compiled binaries, traces, model
|
|
5
|
+
# dumps, runtime sweeps) stays local and is not tracked.
|
|
6
|
+
# =====================================================================
|
|
7
|
+
|
|
8
|
+
# Generated artifacts: runtime outputs, traces, model dumps.
|
|
9
|
+
ane_artifacts/
|
|
10
|
+
|
|
11
|
+
# Build outputs: per-machine compiled invoker/probe binaries, dSYM bundles, cache.
|
|
12
|
+
# Invokers rebuild from aneforge/_invokers/; the dispatch dylib from aneforge/_lib/.
|
|
13
|
+
ane_build/
|
|
14
|
+
|
|
15
|
+
# The package's bundled dispatch dylib is built on the target Mac (sh
|
|
16
|
+
# aneforge/_lib/build.sh); we ship the source, not the per-machine binary.
|
|
17
|
+
aneforge/_lib/*.dylib
|
|
18
|
+
|
|
19
|
+
# Local RE/paper test tooling (fuzzers, capability census + its sweep data).
|
|
20
|
+
# Untracked on purpose: not user-facing proof, not collected by pytest, not shipped.
|
|
21
|
+
utils/
|
|
22
|
+
|
|
23
|
+
# Python build artifacts (python -m build / setuptools).
|
|
24
|
+
build/
|
|
25
|
+
dist/
|
|
26
|
+
|
|
27
|
+
# =====================================================================
|
|
28
|
+
# macOS / OS junk
|
|
29
|
+
# =====================================================================
|
|
30
|
+
.DS_Store
|
|
31
|
+
._.DS_Store
|
|
32
|
+
**/.DS_Store
|
|
33
|
+
|
|
34
|
+
# =====================================================================
|
|
35
|
+
# Python
|
|
36
|
+
# =====================================================================
|
|
37
|
+
__pycache__/
|
|
38
|
+
*.py[cod]
|
|
39
|
+
*$py.class
|
|
40
|
+
*.so
|
|
41
|
+
.Python
|
|
42
|
+
*.egg-info/
|
|
43
|
+
.eggs/
|
|
44
|
+
.pytest_cache/
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.ruff_cache/
|
|
47
|
+
*.venv/
|
|
48
|
+
venv/
|
|
49
|
+
.venv/
|
|
50
|
+
|
|
51
|
+
# =====================================================================
|
|
52
|
+
# Editors
|
|
53
|
+
# =====================================================================
|
|
54
|
+
*.swp
|
|
55
|
+
*.swo
|
|
56
|
+
*.swn
|
|
57
|
+
.vscode/
|
|
58
|
+
.idea/
|
|
59
|
+
.cursor/
|
|
60
|
+
|
|
61
|
+
# =====================================================================
|
|
62
|
+
# Local scratch / temp output the fuzzer or probes may drop into the repo
|
|
63
|
+
# =====================================================================
|
|
64
|
+
/tmp_*/
|
|
65
|
+
/scratch_*/
|
|
66
|
+
*.tmp
|
|
67
|
+
|
|
68
|
+
# =====================================================================
|
|
69
|
+
# Local AI-tool session files
|
|
70
|
+
# =====================================================================
|
|
71
|
+
.codex/
|
|
72
|
+
.claude/
|
|
73
|
+
|
|
74
|
+
# =====================================================================
|
|
75
|
+
# Secrets — defensive, none expected in this tree
|
|
76
|
+
# =====================================================================
|
|
77
|
+
*.env
|
|
78
|
+
*.env.local
|
|
79
|
+
.env.*
|
|
80
|
+
credentials.json
|
|
81
|
+
*_credentials.json
|
|
82
|
+
*.pem
|
|
83
|
+
*.key
|
|
84
|
+
auth.json
|
|
85
|
+
|
|
86
|
+
# =====================================================================
|
|
87
|
+
# Example outputs and on-disk caches (regenerated on demand)
|
|
88
|
+
# =====================================================================
|
|
89
|
+
.aneforge_cache/
|
|
90
|
+
examples/sd15_out.png
|
|
91
|
+
|
|
92
|
+
# vhs intermediate; the shipped README asset is demo.png (APNG), see demo.tape.
|
|
93
|
+
docs/assets/demo.gif
|
|
94
|
+
|
|
95
|
+
# LaTeX figure build artifacts
|
|
96
|
+
docs/figures/*.aux
|
|
97
|
+
docs/figures/*.log
|
|
98
|
+
docs/figures/*.synctex.gz
|
|
99
|
+
|
|
100
|
+
# Full-MNIST loader cache (fetched, never committed)
|
|
101
|
+
examples/data/mnist_cache/
|
|
102
|
+
|
|
103
|
+
# Debug matrix dumps from the solver probes
|
|
104
|
+
.dbg_slmat/
|
|
105
|
+
|
|
106
|
+
# CIFAR-10 dataset downloaded by examples/cifar_data.py (large, regenerated on demand)
|
|
107
|
+
examples/data/cifar10/
|
|
108
|
+
examples/data/cifar_cnn.npz
|
|
109
|
+
|
|
110
|
+
# MkDocs build output
|
|
111
|
+
site/
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Read the Docs build configuration for the ANEForge package documentation.
|
|
2
|
+
# https://docs.readthedocs.io/en/stable/config-file/v2.html
|
|
3
|
+
version: 2
|
|
4
|
+
|
|
5
|
+
build:
|
|
6
|
+
os: ubuntu-24.04
|
|
7
|
+
tools:
|
|
8
|
+
python: "3.12"
|
|
9
|
+
|
|
10
|
+
mkdocs:
|
|
11
|
+
configuration: mkdocs.yml
|
|
12
|
+
|
|
13
|
+
python:
|
|
14
|
+
install:
|
|
15
|
+
- requirements: docs/requirements.txt
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use this software, please cite it as below."
|
|
3
|
+
title: "ANEForge: a direct, CoreML-free Apple Neural Engine backend"
|
|
4
|
+
type: software
|
|
5
|
+
authors:
|
|
6
|
+
- family-names: Bryngelson
|
|
7
|
+
given-names: "Spencer H."
|
|
8
|
+
affiliation: "Georgia Institute of Technology"
|
|
9
|
+
repository-code: "https://github.com/sbryngelson/ANEForge"
|
|
10
|
+
license: MIT
|
|
11
|
+
version: "0.1.0"
|
|
12
|
+
date-released: "2026-06-10"
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Build, test, and "adding an operator" details are in
|
|
4
|
+
[`docs/development.md`](docs/development.md). This is the short version.
|
|
5
|
+
|
|
6
|
+
## Bug reports
|
|
7
|
+
|
|
8
|
+
Behavior on the ANE is per-chip and per-OS, so include:
|
|
9
|
+
|
|
10
|
+
- The chip (M1 through M5, or the A-series equivalent) and the macOS version.
|
|
11
|
+
- A minimal graph that reproduces it: ops, shapes, dtypes.
|
|
12
|
+
- Expected versus actual: numbers, traceback, or compile error.
|
|
13
|
+
|
|
14
|
+
Two things are expected rather than bugs: a graph that compiles for one family
|
|
15
|
+
but overflows a dimension cap on another (caps are per family, see
|
|
16
|
+
[`docs/capabilities.md`](docs/capabilities.md)), and small CPU/ANE divergence at
|
|
17
|
+
boundary fp16 values. A wrong result well inside the fp16 range is a real bug.
|
|
18
|
+
|
|
19
|
+
Report security issues privately, not in a public issue: see
|
|
20
|
+
[`SECURITY.md`](SECURITY.md).
|
|
21
|
+
|
|
22
|
+
## Changes
|
|
23
|
+
|
|
24
|
+
Operator-coverage gaps are the place to start: anything in
|
|
25
|
+
[`docs/capabilities.md`](docs/capabilities.md) not yet covered, via the four-step
|
|
26
|
+
path in [`docs/development.md`](docs/development.md#adding-an-operator). Open an
|
|
27
|
+
issue first for larger or architectural changes.
|
|
28
|
+
|
|
29
|
+
## Setup
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
pip install -e ".[dev]" # ruff + pytest
|
|
33
|
+
sh aneforge/_lib/build.sh # build the dispatch dylib (needs a Mac)
|
|
34
|
+
git config core.hooksPath .githooks # off-hardware pre-commit checks
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
The corpus is the gate and must pass before a change lands:
|
|
38
|
+
|
|
39
|
+
```sh
|
|
40
|
+
KMP_DUPLICATE_LIB_OK=TRUE PYTHONPATH=. python3 tests/run_corpus.py
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Most tests need a real ANE, so CI runs only the off-hardware checks. Run the
|
|
44
|
+
corpus and the pytest suite on your Mac before opening a pull request.
|
|
45
|
+
|
|
46
|
+
## Style
|
|
47
|
+
|
|
48
|
+
Python 3.10+, linted with `ruff`. Match the surrounding packed style; do not
|
|
49
|
+
reformat existing code.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
Contributions are licensed under the [MIT License](LICENSE).
|
aneforge-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Spencer H. Bryngelson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
NOTE ON SCOPE. ANEForge is an independent research project. It dispatches to the
|
|
26
|
+
Apple Neural Engine through private, undocumented system symbols (the e5rt
|
|
27
|
+
runtime) and is not an Apple product, is not endorsed by Apple, and relies on no
|
|
28
|
+
Apple API contract. "Apple", "Apple Neural Engine", and related marks belong to
|
|
29
|
+
Apple Inc. The MIT grant above covers this project's own source only; it confers
|
|
30
|
+
no rights in Apple software, and the private interfaces it calls may change or
|
|
31
|
+
break without notice. Use is intended for research and interoperability.
|
aneforge-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aneforge
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Direct Apple Neural Engine (ANE) backend. A CoreML-free Python frontend that compiles operator graphs into a single fused e5rt program and dispatches them to the ANE.
|
|
5
|
+
Project-URL: Homepage, https://github.com/sbryngelson/ANEForge
|
|
6
|
+
Project-URL: Repository, https://github.com/sbryngelson/ANEForge
|
|
7
|
+
Project-URL: Documentation, https://aneforge.readthedocs.io
|
|
8
|
+
Project-URL: Issues, https://github.com/sbryngelson/ANEForge/issues
|
|
9
|
+
Author-email: Spencer Bryngelson <shb@gatech.edu>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ane,apple-neural-engine,apple-silicon,e5rt,espresso,inference,machine-learning
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: numpy
|
|
25
|
+
Provides-Extra: bench
|
|
26
|
+
Requires-Dist: mlx>=0.31; extra == 'bench'
|
|
27
|
+
Requires-Dist: torch; extra == 'bench'
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest-forked; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
32
|
+
Provides-Extra: models
|
|
33
|
+
Requires-Dist: torch; extra == 'models'
|
|
34
|
+
Requires-Dist: torchvision; extra == 'models'
|
|
35
|
+
Requires-Dist: transformers; extra == 'models'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# ANEForge
|
|
39
|
+
|
|
40
|
+
[CI status](https://github.com/sbryngelson/ANEForge/actions/workflows/ci.yml)
|
|
41
|
+
[License: MIT](LICENSE)
|
|
42
|
+
[Installation](#install)
|
|
43
|
+
|
|
44
|
+
**Train and run neural networks directly on the Apple Neural Engine, from
|
|
45
|
+
Python, with no CoreML.**
|
|
46
|
+
|
|
47
|
+
<p align="center">
|
|
48
|
+
<img src="docs/assets/demo.png" width="680"
|
|
49
|
+
alt="A small transformer trains from scratch and generates text live on the Apple Neural Engine">
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<sub>A transformer training from scratch on the engine (forward, backward, and
|
|
54
|
+
Adam), then completing a prompt. Reproduce with <a href="examples/demo.py"><code>python examples/demo.py</code></a>.</sub>
|
|
55
|
+
</p>
|
|
56
|
+
|
|
57
|
+
Apple exposes the Neural Engine only through CoreML, and only for inference.
|
|
58
|
+
CoreML decides whether your model lands on the engine or quietly falls back to the
|
|
59
|
+
CPU or GPU, and it gives you no way to train there. ANEForge skips it: it compiles
|
|
60
|
+
a tensor graph into one ANE program and dispatches that program through the same
|
|
61
|
+
private `aned` stack that CoreML, MPSGraph, and Espresso use internally. From there:
|
|
62
|
+
|
|
63
|
+
- **Training runs on the engine.** The forward pass, the backward pass, and the
|
|
64
|
+
Adam update all compile to ANE programs. A CNN trains from scratch on CIFAR-10 to
|
|
65
|
+
71% accuracy, on a chip Apple ships for inference only.
|
|
66
|
+
- **Hardware layers CoreML can't reach.** `af.sdpa` drives the engine's
|
|
67
|
+
fused-attention layer directly, the one Apple's compiler decomposes and never
|
|
68
|
+
emits; 18 other native layers (`argmax`, `topk`, `sort`, geometry) come the same way.
|
|
69
|
+
- **The engine, never a fallback.** A pretrained ResNet-18 runs end to end in
|
|
70
|
+
0.33 ms, matching the reference to cosine 1.0000, at a fraction of the GPU's
|
|
71
|
+
energy (table below).
|
|
72
|
+
- **Cross-compilation for chips you don't own.** Lower and gate a graph for any of
|
|
73
|
+
28 ANE targets (M1-M5) from one machine, and estimate its latency without running it.
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import aneforge as af
|
|
77
|
+
|
|
78
|
+
x = af.input((1, 3, 32, 32)) # a lazy graph input
|
|
79
|
+
y = af.conv(x, W, pad=1).relu().mean((2, 3))
|
|
80
|
+
net = af.compile(y, compress="int8") # graph -> one fused ANE program
|
|
81
|
+
out = net(image) # callable; runs on ANE silicon
|
|
82
|
+
|
|
83
|
+
# ...or load a pretrained model
|
|
84
|
+
enc = af.load(".../all-MiniLM-L6-v2") # MiniLM sentence encoder
|
|
85
|
+
vec = enc(tokens) # on-device, cosine 1.0000 vs reference
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
A graph is built from 58 fused operators plus 19 native bridge operators, lowered
|
|
89
|
+
into one program and reused across calls, near a 70 us dispatch floor.
|
|
90
|
+
|
|
91
|
+
> **Status:** research project on Apple Silicon / macOS, verified on M5 Pro and M1
|
|
92
|
+
> Max. Relies on private framework symbols that may change without notice. Not
|
|
93
|
+
> affiliated with Apple.
|
|
94
|
+
|
|
95
|
+
## Install
|
|
96
|
+
|
|
97
|
+
Apple Silicon Mac, macOS 14+, Xcode command-line tools, Python 3.10+.
|
|
98
|
+
|
|
99
|
+
```sh
|
|
100
|
+
git clone https://github.com/sbryngelson/ANEForge.git
|
|
101
|
+
cd ANEForge
|
|
102
|
+
pip install -e . # core dependency is just NumPy
|
|
103
|
+
PYTHONPATH=. python3 tests/op_smoketest.py # compile + run each op on the ANE
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The `e5rt` dispatch shim links Apple frameworks, so it compiles from source on your
|
|
107
|
+
Mac. That happens automatically the first time you dispatch to the ANE; build it
|
|
108
|
+
ahead of time with `python -m aneforge.build` if you prefer.
|
|
109
|
+
|
|
110
|
+
Optional extras: `pip install -e ".[models]"` (torch / torchvision / transformers
|
|
111
|
+
for the pretrained loaders) and `".[bench]"` (mlx / torch for the GPU-comparison
|
|
112
|
+
tools). Then browse [`examples/`](examples/), starting with
|
|
113
|
+
[`examples/quickstart.py`](examples/quickstart.py).
|
|
114
|
+
|
|
115
|
+
## How it compares
|
|
116
|
+
|
|
117
|
+
| | On the ANE | No CoreML | Trains on it |
|
|
118
|
+
| --------------------- | :---------------: | :-------: | :----------: |
|
|
119
|
+
| CoreML / coremltools | scheduler chooses | -- | no |
|
|
120
|
+
| MLX, PyTorch (MPS) | no (GPU) | yes | on the GPU |
|
|
121
|
+
| **ANEForge** | **yes (direct)** | **yes** | **yes** |
|
|
122
|
+
|
|
123
|
+
CoreML is the only public door to the engine, and CoreML, not you, decides whether to
|
|
124
|
+
use it. ANEForge compiles to the engine directly, from an ordinary user process,
|
|
125
|
+
with no entitlement and without disabling system integrity protection.
|
|
126
|
+
|
|
127
|
+
## Measured
|
|
128
|
+
|
|
129
|
+
Single input, fp16, on an M5 Pro. The GPU baseline is PyTorch on Metal (MPS) at
|
|
130
|
+
fp16; energy is whole-package, read with `powermetrics`.
|
|
131
|
+
|
|
132
|
+
| Pretrained model | ANE | GPU (fp16) | ANE energy | GPU energy |
|
|
133
|
+
| ---------------- | ------: | ---------: | ---------: | ---------: |
|
|
134
|
+
| ResNet-18 | 0.33 ms | 2.03 ms | 2.2 mJ | 35 mJ |
|
|
135
|
+
| MiniLM encoder | 0.53 ms | 1.92 ms | 2.4 mJ | 21 mJ |
|
|
136
|
+
| ViT-B/16 | 18.3 ms | 15.9 ms | 75 mJ | 612 mJ |
|
|
137
|
+
|
|
138
|
+
The engine is faster on the convolutional and encoder workloads and 8 to 16x more
|
|
139
|
+
energy efficient on all three, even on ViT-B/16 where the GPU edges it on latency.
|
|
140
|
+
Reproduce with
|
|
141
|
+
[`bench/device_compare_wattcomplete.py`](bench/device_compare_wattcomplete.py)
|
|
142
|
+
and [`bench/real_models_fp16.py`](bench/real_models_fp16.py); the full per-workload
|
|
143
|
+
device map (16 classes, measured on M1 / M2 / M5) is in
|
|
144
|
+
[`bench/results/`](bench/results/).
|
|
145
|
+
|
|
146
|
+
## A fluid simulation on the Neural Engine
|
|
147
|
+
|
|
148
|
+
<p align="center">
|
|
149
|
+
<img src="docs/assets/fluid_vorticity.png" width="400"
|
|
150
|
+
alt="A passive dye shaped as the word ANEForge stirred into glowing filaments by a fluid simulation on the Apple Neural Engine">
|
|
151
|
+
</p>
|
|
152
|
+
|
|
153
|
+
A passive dye is painted as the word ANEForge, and a 2-D incompressible
|
|
154
|
+
Navier-Stokes flow (pseudo-spectral) stirs it into thin glowing filaments. Every
|
|
155
|
+
Fourier transform in the 2,200-step loop runs on the ANE, and the whole
|
|
156
|
+
simulation costs about 9 J at the measured 1.48 W rail. Reproduce with
|
|
157
|
+
[`python examples/fluid_vorticity.py`](examples/fluid_vorticity.py).
|
|
158
|
+
|
|
159
|
+
## What it does
|
|
160
|
+
|
|
161
|
+
- **Graph -> compile -> run.** 58 fused operators (conv/pool, `matmul`/`bmm`/`einsum`,
|
|
162
|
+
activations, reductions, norms, softmax, attention, shape/geometry) into one
|
|
163
|
+
program with int8/int4/fp16 weights, plus a bridge route for 19 native ops the
|
|
164
|
+
public toolchain never emits.
|
|
165
|
+
- **Streaming weight compression.** int8, int4-LUT, or sparse weights streamed from
|
|
166
|
+
the engine's dequant path (~4x smaller for int4), accuracy-gated.
|
|
167
|
+
- **On-device uint8 image input,** dequantized in-graph, so raw camera or video
|
|
168
|
+
bytes feed the model directly.
|
|
169
|
+
- **Resident state.** KV-cache and optimizer state kept on the engine across steps
|
|
170
|
+
via buffer aliasing (`share_buffer`).
|
|
171
|
+
- **Accuracy-preserving optimizer.** `af.tune` measures equivalent lowerings on the
|
|
172
|
+
engine and returns the lossless pick.
|
|
173
|
+
- **Linear algebra and spectral methods.** `aneforge.linalg` and `aneforge.fft` as
|
|
174
|
+
static-dataflow graphs.
|
|
175
|
+
|
|
176
|
+
## What runs
|
|
177
|
+
|
|
178
|
+
Pretrained models, each fused into one ANE program:
|
|
179
|
+
|
|
180
|
+
| Model | Task | Fidelity vs reference |
|
|
181
|
+
| ------------------ | -------------------------- | ----------------------- |
|
|
182
|
+
| ResNet-18 | ImageNet classification | cosine 1.0000 |
|
|
183
|
+
| ViT-B/16 | vision transformer encoder | cosine 1.0000 |
|
|
184
|
+
| all-MiniLM-L6-v2 | sentence embedding | cosine 1.0000 |
|
|
185
|
+
| ESPCN | super-resolution | runs end to end |
|
|
186
|
+
| Stable Diffusion 1.5 | U-Net + VAE (per component) | U-Net 1.5%, VAE 4.4% rel. |
|
|
187
|
+
|
|
188
|
+
Trained from scratch on the engine: an MLP, a CNN (CIFAR-10 to 71%), a transformer
|
|
189
|
+
block, a LLaMA-style block, and a character language model. Operator coverage is
|
|
190
|
+
tracked op by op across M1 to M5 in the [op catalog](docs/op-catalog.md), the
|
|
191
|
+
exhaustive native-MIL-op x device table; [capabilities](docs/capabilities.md) has
|
|
192
|
+
the dtype matrix and the known limits.
|
|
193
|
+
|
|
194
|
+
## Verify
|
|
195
|
+
|
|
196
|
+
The correctness corpus compiles and runs every op and kernel on the ANE, and is
|
|
197
|
+
the project's reproducibility gate:
|
|
198
|
+
|
|
199
|
+
```sh
|
|
200
|
+
KMP_DUPLICATE_LIB_OK=TRUE PYTHONPATH=. python3 tests/run_corpus.py
|
|
201
|
+
KMP_DUPLICATE_LIB_OK=TRUE PYTHONPATH=. python3 -m pytest tests/ -q
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Documentation
|
|
205
|
+
|
|
206
|
+
The manual lives in [`docs/`](docs/) (MkDocs; `pip install -r docs/requirements.txt`,
|
|
207
|
+
then `mkdocs serve`), starting at [`docs/index.md`](docs/index.md). The API is
|
|
208
|
+
documented in the module docstrings, and runnable usage is in [`examples/`](examples/).
|
|
209
|
+
|
|
210
|
+
## Contributing
|
|
211
|
+
|
|
212
|
+
[`CONTRIBUTING.md`](CONTRIBUTING.md) has the bug-report checklist (include your
|
|
213
|
+
chip and macOS version), the development setup, and where to start. Report security
|
|
214
|
+
issues privately per [`SECURITY.md`](SECURITY.md).
|
|
215
|
+
|
|
216
|
+
## License
|
|
217
|
+
|
|
218
|
+
[MIT](LICENSE). The Apple Neural Engine is proprietary hardware, and the framework
|
|
219
|
+
symbols this project calls are private, undocumented, and may change at any time.
|
|
220
|
+
Nothing here is endorsed by, or constitutes an API contract from, Apple.
|