emx-onnx-cgen 0.3.8 (py3-none-any.whl) → 0.4.1.dev0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. emx_onnx_cgen/_build_info.py +1 -1
  2. emx_onnx_cgen/_version.py +2 -2
  3. emx_onnx_cgen/cli.py +1025 -162
  4. emx_onnx_cgen/codegen/__init__.py +2 -0
  5. emx_onnx_cgen/codegen/c_emitter.py +2081 -458
  6. emx_onnx_cgen/compiler.py +157 -75
  7. emx_onnx_cgen/determinism.py +39 -0
  8. emx_onnx_cgen/ir/context.py +25 -15
  9. emx_onnx_cgen/ir/model.py +1 -0
  10. emx_onnx_cgen/ir/op_base.py +32 -7
  11. emx_onnx_cgen/ir/ops/__init__.py +20 -0
  12. emx_onnx_cgen/ir/ops/elementwise.py +138 -22
  13. emx_onnx_cgen/ir/ops/misc.py +95 -0
  14. emx_onnx_cgen/ir/ops/nn.py +361 -38
  15. emx_onnx_cgen/ir/ops/reduce.py +1 -16
  16. emx_onnx_cgen/lowering/__init__.py +9 -0
  17. emx_onnx_cgen/lowering/arg_reduce.py +0 -4
  18. emx_onnx_cgen/lowering/average_pool.py +157 -27
  19. emx_onnx_cgen/lowering/bernoulli.py +73 -0
  20. emx_onnx_cgen/lowering/common.py +48 -0
  21. emx_onnx_cgen/lowering/concat.py +41 -7
  22. emx_onnx_cgen/lowering/conv.py +19 -8
  23. emx_onnx_cgen/lowering/conv_integer.py +103 -0
  24. emx_onnx_cgen/lowering/dequantize_linear.py +128 -0
  25. emx_onnx_cgen/lowering/elementwise.py +140 -43
  26. emx_onnx_cgen/lowering/gather.py +11 -2
  27. emx_onnx_cgen/lowering/gemm.py +7 -124
  28. emx_onnx_cgen/lowering/global_max_pool.py +0 -5
  29. emx_onnx_cgen/lowering/gru.py +323 -0
  30. emx_onnx_cgen/lowering/hamming_window.py +104 -0
  31. emx_onnx_cgen/lowering/hardmax.py +1 -37
  32. emx_onnx_cgen/lowering/identity.py +7 -6
  33. emx_onnx_cgen/lowering/logsoftmax.py +1 -35
  34. emx_onnx_cgen/lowering/lp_pool.py +15 -4
  35. emx_onnx_cgen/lowering/matmul.py +3 -105
  36. emx_onnx_cgen/lowering/optional_has_element.py +28 -0
  37. emx_onnx_cgen/lowering/qlinear_mul.py +116 -0
  38. emx_onnx_cgen/lowering/reduce.py +0 -5
  39. emx_onnx_cgen/lowering/reshape.py +7 -16
  40. emx_onnx_cgen/lowering/shape.py +14 -8
  41. emx_onnx_cgen/lowering/slice.py +14 -4
  42. emx_onnx_cgen/lowering/softmax.py +1 -35
  43. emx_onnx_cgen/lowering/split.py +37 -3
  44. emx_onnx_cgen/lowering/tfidf_vectorizer.py +199 -0
  45. emx_onnx_cgen/lowering/tile.py +38 -1
  46. emx_onnx_cgen/lowering/topk.py +1 -5
  47. emx_onnx_cgen/lowering/transpose.py +9 -3
  48. emx_onnx_cgen/lowering/unsqueeze.py +11 -16
  49. emx_onnx_cgen/lowering/upsample.py +151 -0
  50. emx_onnx_cgen/lowering/variadic.py +1 -1
  51. emx_onnx_cgen/lowering/where.py +0 -5
  52. emx_onnx_cgen/onnx_import.py +578 -14
  53. emx_onnx_cgen/ops.py +3 -0
  54. emx_onnx_cgen/templates/adagrad_op.c.j2 +16 -0
  55. emx_onnx_cgen/templates/arg_reduce_op.c.j2 +18 -0
  56. emx_onnx_cgen/templates/attention_op.c.j2 +189 -0
  57. emx_onnx_cgen/templates/average_pool_op.c.j2 +126 -0
  58. emx_onnx_cgen/templates/batch_norm_op.c.j2 +11 -0
  59. emx_onnx_cgen/templates/bernoulli_op.c.j2 +34 -0
  60. emx_onnx_cgen/templates/binary_op.c.j2 +9 -0
  61. emx_onnx_cgen/templates/cast_op.c.j2 +9 -0
  62. emx_onnx_cgen/templates/clip_op.c.j2 +14 -0
  63. emx_onnx_cgen/templates/concat_op.c.j2 +28 -0
  64. emx_onnx_cgen/templates/constant_of_shape_op.c.j2 +10 -0
  65. emx_onnx_cgen/templates/conv_integer_op.c.j2 +34 -0
  66. emx_onnx_cgen/templates/conv_op.c.j2 +32 -0
  67. emx_onnx_cgen/templates/conv_transpose_op.c.j2 +43 -0
  68. emx_onnx_cgen/templates/cumsum_op.c.j2 +51 -0
  69. emx_onnx_cgen/templates/depth_to_space_op.c.j2 +26 -0
  70. emx_onnx_cgen/templates/dequantize_linear_op.c.j2 +10 -0
  71. emx_onnx_cgen/templates/einsum_op.c.j2 +55 -0
  72. emx_onnx_cgen/templates/expand_op.c.j2 +14 -0
  73. emx_onnx_cgen/templates/eye_like_op.c.j2 +27 -0
  74. emx_onnx_cgen/templates/gather_elements_op.c.j2 +13 -0
  75. emx_onnx_cgen/templates/gather_nd_op.c.j2 +29 -0
  76. emx_onnx_cgen/templates/gather_op.c.j2 +13 -0
  77. emx_onnx_cgen/templates/gemm_op.c.j2 +35 -0
  78. emx_onnx_cgen/templates/grid_sample_op.c.j2 +184 -0
  79. emx_onnx_cgen/templates/group_normalization_op.c.j2 +46 -0
  80. emx_onnx_cgen/templates/gru_op.c.j2 +152 -0
  81. emx_onnx_cgen/templates/hamming_window_op.c.j2 +12 -0
  82. emx_onnx_cgen/templates/hardmax_op.c.j2 +24 -0
  83. emx_onnx_cgen/templates/identity_op.c.j2 +9 -0
  84. emx_onnx_cgen/templates/instance_normalization_op.c.j2 +35 -0
  85. emx_onnx_cgen/templates/layer_normalization_op.c.j2 +65 -0
  86. emx_onnx_cgen/templates/logsoftmax_op.c.j2 +27 -0
  87. emx_onnx_cgen/templates/lp_normalization_op.c.j2 +27 -0
  88. emx_onnx_cgen/templates/lp_pool_op.c.j2 +24 -0
  89. emx_onnx_cgen/templates/lrn_op.c.j2 +20 -0
  90. emx_onnx_cgen/templates/lstm_op.c.j2 +175 -0
  91. emx_onnx_cgen/templates/matmul_op.c.j2 +13 -0
  92. emx_onnx_cgen/templates/maxpool_op.c.j2 +118 -0
  93. emx_onnx_cgen/templates/mean_variance_normalization_op.c.j2 +34 -0
  94. emx_onnx_cgen/templates/multi_input_op.c.j2 +15 -0
  95. emx_onnx_cgen/templates/negative_log_likelihood_loss_op.c.j2 +54 -0
  96. emx_onnx_cgen/templates/nonmax_suppression_op.c.j2 +179 -0
  97. emx_onnx_cgen/templates/nonzero_op.c.j2 +15 -0
  98. emx_onnx_cgen/templates/one_hot_op.c.j2 +25 -0
  99. emx_onnx_cgen/templates/optional_has_element_op.c.j2 +4 -0
  100. emx_onnx_cgen/templates/pad_op.c.j2 +80 -0
  101. emx_onnx_cgen/templates/qlinear_matmul_op.c.j2 +33 -0
  102. emx_onnx_cgen/templates/qlinear_mul_op.c.j2 +18 -0
  103. emx_onnx_cgen/templates/quantize_linear_op.c.j2 +13 -0
  104. emx_onnx_cgen/templates/range_op.c.j2 +8 -0
  105. emx_onnx_cgen/templates/reduce_op.c.j2 +28 -0
  106. emx_onnx_cgen/templates/reduce_op_dynamic.c.j2 +77 -0
  107. emx_onnx_cgen/templates/reshape_op.c.j2 +18 -0
  108. emx_onnx_cgen/templates/resize_op.c.j2 +277 -0
  109. emx_onnx_cgen/templates/rms_normalization_op.c.j2 +28 -0
  110. emx_onnx_cgen/templates/rotary_embedding_op.c.j2 +66 -0
  111. emx_onnx_cgen/templates/scatter_nd_op.c.j2 +52 -0
  112. emx_onnx_cgen/templates/shape_op.c.j2 +6 -0
  113. emx_onnx_cgen/templates/size_op.c.j2 +4 -0
  114. emx_onnx_cgen/templates/slice_op.c.j2 +9 -0
  115. emx_onnx_cgen/templates/slice_op_dynamic.c.j2 +70 -0
  116. emx_onnx_cgen/templates/softmax_cross_entropy_loss_op.c.j2 +105 -0
  117. emx_onnx_cgen/templates/softmax_op.c.j2 +26 -0
  118. emx_onnx_cgen/templates/space_to_depth_op.c.j2 +22 -0
  119. emx_onnx_cgen/templates/split_op.c.j2 +18 -0
  120. emx_onnx_cgen/templates/tensor_scatter_op.c.j2 +44 -0
  121. emx_onnx_cgen/templates/testbench.c.j2 +161 -0
  122. emx_onnx_cgen/templates/tfidf_vectorizer_op.c.j2 +144 -0
  123. emx_onnx_cgen/templates/tile_op.c.j2 +14 -0
  124. emx_onnx_cgen/templates/topk_op.c.j2 +50 -0
  125. emx_onnx_cgen/templates/transpose_op.c.j2 +9 -0
  126. emx_onnx_cgen/templates/trilu_op.c.j2 +33 -0
  127. emx_onnx_cgen/templates/unary_op.c.j2 +23 -0
  128. emx_onnx_cgen/templates/where_op.c.j2 +9 -0
  129. emx_onnx_cgen/verification.py +45 -5
  130. {emx_onnx_cgen-0.3.8.dist-info → emx_onnx_cgen-0.4.1.dev0.dist-info}/METADATA +33 -15
  131. emx_onnx_cgen-0.4.1.dev0.dist-info/RECORD +190 -0
  132. {emx_onnx_cgen-0.3.8.dist-info → emx_onnx_cgen-0.4.1.dev0.dist-info}/WHEEL +1 -1
  133. emx_onnx_cgen/runtime/__init__.py +0 -1
  134. emx_onnx_cgen/runtime/evaluator.py +0 -2955
  135. emx_onnx_cgen-0.3.8.dist-info/RECORD +0 -107
  136. {emx_onnx_cgen-0.3.8.dist-info → emx_onnx_cgen-0.4.1.dev0.dist-info}/entry_points.txt +0 -0
  137. {emx_onnx_cgen-0.3.8.dist-info → emx_onnx_cgen-0.4.1.dev0.dist-info}/top_level.txt +0 -0
emx_onnx_cgen/cli.py CHANGED
@@ -7,15 +7,16 @@ import logging
7
7
  import os
8
8
  import shlex
9
9
  import shutil
10
+ import signal
10
11
  import subprocess
11
12
  import sys
12
13
  import tempfile
13
14
  import time
14
- import signal
15
- from pathlib import Path
16
15
  from dataclasses import dataclass
17
- from typing import TYPE_CHECKING, Mapping, Sequence
16
+ from pathlib import Path
17
+ from typing import Any, Mapping, Sequence, TextIO
18
18
 
19
+ import numpy as np
19
20
  import onnx
20
21
  from onnx import numpy_helper
21
22
 
@@ -23,25 +24,220 @@ from ._build_info import BUILD_DATE, GIT_VERSION
23
24
  from .compiler import Compiler, CompilerOptions
24
25
  from .errors import CodegenError, ShapeInferenceError, UnsupportedOpError
25
26
  from .onnx_import import import_onnx
27
+ from .determinism import deterministic_reference_runtime
26
28
  from .onnxruntime_utils import make_deterministic_session_options
27
29
  from .testbench import decode_testbench_array
28
- from .verification import format_success_message, max_ulp_diff
30
+ from .verification import format_success_message, worst_ulp_diff
29
31
 
30
32
  LOGGER = logging.getLogger(__name__)
31
-
32
- if TYPE_CHECKING:
33
- import numpy as np
33
+ _NONDETERMINISTIC_OPERATORS = {"Bernoulli"}
34
34
 
35
35
 
36
36
  @dataclass(frozen=True)
37
37
  class CliResult:
38
38
  exit_code: int
39
39
  command_line: str
40
- error: str | None = None
41
- success_message: str | None = None
40
+ result: str | None = None
42
41
  generated: str | None = None
43
42
  data_source: str | None = None
44
43
  operators: list[str] | None = None
44
+ opset_version: int | None = None
45
+ generated_checksum: str | None = None
46
+
47
+
48
+ @dataclass(frozen=True)
49
+ class _WorstDiff:
50
+ output_name: str
51
+ node_name: str | None
52
+ index: tuple[int, ...]
53
+ got: float
54
+ reference: float
55
+ ulp: int
56
+
57
+
58
+ @dataclass(frozen=True)
59
+ class _WorstAbsDiff:
60
+ output_name: str
61
+ node_name: str | None
62
+ index: tuple[int, ...]
63
+ got: object
64
+ reference: object
65
+ abs_diff: float | int
66
+
67
+
68
+ class _VerifyReporter:
69
+ def __init__(
70
+ self,
71
+ stream: TextIO | None = None,
72
+ *,
73
+ color_mode: str = "auto",
74
+ ) -> None:
75
+ self._stream = stream or sys.stdout
76
+ self._use_color = self._should_use_color(color_mode)
77
+
78
+ def _should_use_color(self, color_mode: str) -> bool:
79
+ if color_mode == "always":
80
+ return True
81
+ if color_mode == "never":
82
+ return False
83
+ if not hasattr(self._stream, "isatty"):
84
+ return False
85
+ return bool(self._stream.isatty())
86
+
87
+ def _color(self, text: str, code: str) -> str:
88
+ if not self._use_color:
89
+ return text
90
+ return f"\x1b[{code}m{text}\x1b[0m"
91
+
92
+ def start_step(self, label: str) -> float:
93
+ print(f"{label} ...", end=" ", file=self._stream, flush=True)
94
+ return time.perf_counter()
95
+
96
+ def step_ok(self, started_at: float) -> None:
97
+ duration = time.perf_counter() - started_at
98
+ ok = self._color("OK", "32")
99
+ dim = self._color(f"({duration:.3f}s)", "90")
100
+ print(f"{ok} {dim}", file=self._stream)
101
+
102
+ def step_ok_simple(self) -> None:
103
+ ok = self._color("OK", "32")
104
+ print(ok, file=self._stream)
105
+
106
+ def step_ok_detail(self, detail: str) -> None:
107
+ ok = self._color("OK", "32")
108
+ dim = self._color(f"({detail})", "90")
109
+ print(f"{ok} {dim}", file=self._stream)
110
+
111
+ def step_fail(self, reason: str) -> None:
112
+ fail = self._color("FAIL", "31")
113
+ print(f"{fail} ({reason})", file=self._stream)
114
+
115
+ def note(self, message: str) -> None:
116
+ label = self._color("Note:", "33")
117
+ print(f"{label} {message}", file=self._stream)
118
+
119
+ def info(self, message: str) -> None:
120
+ print(message, file=self._stream)
121
+
122
+ def result(self, message: str, *, ok: bool) -> None:
123
+ colored = self._color(message, "32" if ok else "31")
124
+ print(f"Result: {colored}", file=self._stream)
125
+
126
+
127
+ class _NullVerifyReporter(_VerifyReporter):
128
+ def __init__(self) -> None:
129
+ super().__init__(stream=sys.stdout, color_mode="never")
130
+
131
+ def start_step(self, label: str) -> float:
132
+ return time.perf_counter()
133
+
134
+ def step_ok(self, started_at: float) -> None:
135
+ return None
136
+
137
+ def step_ok_simple(self) -> None:
138
+ return None
139
+
140
+ def step_ok_detail(self, detail: str) -> None:
141
+ return None
142
+
143
+ def step_fail(self, reason: str) -> None:
144
+ return None
145
+
146
+ def note(self, message: str) -> None:
147
+ return None
148
+
149
+ def info(self, message: str) -> None:
150
+ return None
151
+
152
+ def result(self, message: str, *, ok: bool) -> None:
153
+ return None
154
+
155
+
156
+ def _format_artifact_size(size_bytes: int) -> str:
157
+ if size_bytes < 1024:
158
+ return f"{size_bytes} bytes"
159
+ return f"{size_bytes / 1024:.1f} KiB"
160
+
161
+
162
+ def _report_generated_artifacts(
163
+ reporter: _VerifyReporter,
164
+ *,
165
+ artifacts: Sequence[tuple[str, int]],
166
+ ) -> None:
167
+ for name, size_bytes in artifacts:
168
+ reporter.info(f" {name} ({_format_artifact_size(size_bytes)})")
169
+
170
+
171
+ def _worst_ulp_diff(
172
+ actual: "np.ndarray", expected: "np.ndarray"
173
+ ) -> tuple[int, tuple[tuple[int, ...], float, float] | None]:
174
+ if actual.shape != expected.shape:
175
+ raise ValueError(
176
+ f"Shape mismatch for ULP calculation: {actual.shape} vs {expected.shape}"
177
+ )
178
+ if not np.issubdtype(expected.dtype, np.floating):
179
+ return 0, None
180
+ if actual.size == 0:
181
+ return 0, None
182
+ dtype = expected.dtype
183
+ actual_cast = actual.astype(dtype, copy=False)
184
+ expected_cast = expected.astype(dtype, copy=False)
185
+ max_diff = 0
186
+ worst: tuple[tuple[int, ...], float, float] | None = None
187
+ iterator = np.nditer(
188
+ [actual_cast, expected_cast], flags=["refs_ok", "multi_index"]
189
+ )
190
+ for actual_value, expected_value in iterator:
191
+ actual_scalar = float(actual_value[()])
192
+ expected_scalar = float(expected_value[()])
193
+ diff = ulp_intdiff_float(actual_value[()], expected_value[()])
194
+ if diff > max_diff:
195
+ max_diff = diff
196
+ worst = (
197
+ iterator.multi_index,
198
+ actual_scalar,
199
+ expected_scalar,
200
+ )
201
+ return max_diff, worst
202
+
203
+
204
+ def _worst_abs_diff(
205
+ actual: "np.ndarray", expected: "np.ndarray"
206
+ ) -> tuple[float | int, tuple[tuple[int, ...], object, object] | None]:
207
+ if actual.shape != expected.shape:
208
+ raise ValueError(
209
+ f"Shape mismatch for diff calculation: {actual.shape} vs {expected.shape}"
210
+ )
211
+ if actual.size == 0:
212
+ return 0, None
213
+ dtype = expected.dtype
214
+ actual_cast = actual.astype(dtype, copy=False)
215
+ expected_cast = expected.astype(dtype, copy=False)
216
+ max_diff: float | int = 0
217
+ worst: tuple[tuple[int, ...], object, object] | None = None
218
+ iterator = np.nditer(
219
+ [actual_cast, expected_cast], flags=["refs_ok", "multi_index"]
220
+ )
221
+ for actual_value, expected_value in iterator:
222
+ actual_scalar = actual_value[()]
223
+ expected_scalar = expected_value[()]
224
+ if actual_scalar == expected_scalar:
225
+ continue
226
+ try:
227
+ if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.bool_):
228
+ diff: float | int = abs(int(actual_scalar) - int(expected_scalar))
229
+ else:
230
+ diff = float(abs(actual_scalar - expected_scalar))
231
+ except Exception:
232
+ diff = 1
233
+ if diff > max_diff:
234
+ max_diff = diff
235
+ worst = (
236
+ iterator.multi_index,
237
+ actual_scalar,
238
+ expected_scalar,
239
+ )
240
+ return max_diff, worst
45
241
 
46
242
 
47
243
  def run_cli_command(
@@ -56,18 +252,26 @@ def run_cli_command(
56
252
  parser = _build_parser()
57
253
  args = parser.parse_args(parse_argv)
58
254
  args.command_line = _format_command_line(raw_argv)
255
+ _apply_base_dir(args, parser)
59
256
 
60
257
  try:
61
258
  if args.command != "compile":
62
- success_message, error, operators = _verify_model(
63
- args, include_build_details=False
259
+ (
260
+ success_message,
261
+ error,
262
+ operators,
263
+ opset_version,
264
+ generated_checksum,
265
+ ) = _verify_model(
266
+ args, include_build_details=False, reporter=_NullVerifyReporter()
64
267
  )
65
268
  return CliResult(
66
269
  exit_code=0 if error is None else 1,
67
270
  command_line=args.command_line,
68
- error=error,
69
- success_message=success_message,
271
+ result=error or success_message,
70
272
  operators=operators,
273
+ opset_version=opset_version,
274
+ generated_checksum=generated_checksum,
71
275
  )
72
276
  generated, data_source, error = _compile_model(
73
277
  args, testbench_inputs=testbench_inputs
@@ -76,12 +280,12 @@ def run_cli_command(
76
280
  return CliResult(
77
281
  exit_code=1,
78
282
  command_line=args.command_line,
79
- error=error,
283
+ result=error,
80
284
  )
81
285
  return CliResult(
82
286
  exit_code=0,
83
287
  command_line=args.command_line,
84
- success_message="",
288
+ result="",
85
289
  generated=generated,
86
290
  data_source=data_source,
87
291
  )
@@ -90,7 +294,7 @@ def run_cli_command(
90
294
  return CliResult(
91
295
  exit_code=1,
92
296
  command_line=args.command_line,
93
- error=str(exc),
297
+ result=str(exc),
94
298
  )
95
299
 
96
300
 
@@ -102,6 +306,24 @@ def _build_parser() -> argparse.ArgumentParser:
102
306
  parser = argparse.ArgumentParser(prog="emx-onnx-cgen", description=description)
103
307
  subparsers = parser.add_subparsers(dest="command", required=True)
104
308
 
309
+ def add_color_flag(subparser: argparse.ArgumentParser) -> None:
310
+ subparser.add_argument(
311
+ "--color",
312
+ choices=("auto", "always", "never"),
313
+ default="auto",
314
+ help=(
315
+ "Colorize CLI output (default: auto; options: auto, always, never)"
316
+ ),
317
+ )
318
+
319
+ def add_verbose_flag(subparser: argparse.ArgumentParser) -> None:
320
+ subparser.add_argument(
321
+ "--verbose",
322
+ "-v",
323
+ action="store_true",
324
+ help="Enable verbose logging (includes codegen timing).",
325
+ )
326
+
105
327
  def add_restrict_flags(subparser: argparse.ArgumentParser) -> None:
106
328
  restrict_group = subparser.add_mutually_exclusive_group()
107
329
  restrict_group.add_argument(
@@ -118,9 +340,47 @@ def _build_parser() -> argparse.ArgumentParser:
118
340
  )
119
341
  subparser.set_defaults(restrict_arrays=True)
120
342
 
343
+ def add_fp32_accumulation_strategy_flag(
344
+ subparser: argparse.ArgumentParser,
345
+ ) -> None:
346
+ subparser.add_argument(
347
+ "--fp32-accumulation-strategy",
348
+ choices=("simple", "fp64"),
349
+ default="simple",
350
+ help=(
351
+ "Accumulation strategy for float32 inputs "
352
+ "(simple uses float32, fp64 uses double; default: simple)"
353
+ ),
354
+ )
355
+
356
+ def add_fp16_accumulation_strategy_flag(
357
+ subparser: argparse.ArgumentParser,
358
+ ) -> None:
359
+ subparser.add_argument(
360
+ "--fp16-accumulation-strategy",
361
+ choices=("simple", "fp32"),
362
+ default="fp32",
363
+ help=(
364
+ "Accumulation strategy for float16 inputs "
365
+ "(simple uses float16, fp32 uses float; default: fp32)"
366
+ ),
367
+ )
368
+
121
369
  compile_parser = subparsers.add_parser(
122
370
  "compile", help="Compile an ONNX model into C source"
123
371
  )
372
+ add_color_flag(compile_parser)
373
+ add_verbose_flag(compile_parser)
374
+ compile_parser.add_argument(
375
+ "--model-base-dir",
376
+ "-B",
377
+ type=Path,
378
+ default=None,
379
+ help=(
380
+ "Base directory for resolving the model path "
381
+ "(example: tool --model-base-dir /data model.onnx)"
382
+ ),
383
+ )
124
384
  compile_parser.add_argument("model", type=Path, help="Path to the ONNX model")
125
385
  compile_parser.add_argument(
126
386
  "output",
@@ -132,12 +392,6 @@ def _build_parser() -> argparse.ArgumentParser:
132
392
  "e.g., model.onnx -> model.c)"
133
393
  ),
134
394
  )
135
- compile_parser.add_argument(
136
- "--template-dir",
137
- type=Path,
138
- default=Path("templates"),
139
- help="Template directory (default: templates)",
140
- )
141
395
  compile_parser.add_argument(
142
396
  "--model-name",
143
397
  type=str,
@@ -167,9 +421,10 @@ def _build_parser() -> argparse.ArgumentParser:
167
421
  ),
168
422
  )
169
423
  compile_parser.add_argument(
170
- "--large-temp-threshold-bytes",
424
+ "--large-temp-threshold",
171
425
  type=int,
172
426
  default=1024,
427
+ dest="large_temp_threshold_bytes",
173
428
  help=(
174
429
  "Mark temporary buffers larger than this threshold as static "
175
430
  "(default: 1024)"
@@ -178,25 +433,33 @@ def _build_parser() -> argparse.ArgumentParser:
178
433
  compile_parser.add_argument(
179
434
  "--large-weight-threshold",
180
435
  type=int,
181
- default=1024 * 1024,
436
+ default=100 * 1024,
182
437
  help=(
183
- "Store weights larger than this element count in a binary file "
184
- "(default: 1048576; set to 0 to disable)"
438
+ "Store weights in a binary file once the cumulative byte size "
439
+ "exceeds this threshold (default: 102400; set to 0 to disable)"
185
440
  ),
186
441
  )
187
442
  add_restrict_flags(compile_parser)
443
+ add_fp32_accumulation_strategy_flag(compile_parser)
444
+ add_fp16_accumulation_strategy_flag(compile_parser)
188
445
 
189
446
  verify_parser = subparsers.add_parser(
190
447
  "verify",
191
448
  help="Compile an ONNX model and verify outputs against ONNX Runtime",
192
449
  )
193
- verify_parser.add_argument("model", type=Path, help="Path to the ONNX model")
450
+ add_color_flag(verify_parser)
451
+ add_verbose_flag(verify_parser)
194
452
  verify_parser.add_argument(
195
- "--template-dir",
453
+ "--model-base-dir",
454
+ "-B",
196
455
  type=Path,
197
- default=Path("templates"),
198
- help="Template directory (default: templates)",
456
+ default=None,
457
+ help=(
458
+ "Base directory for resolving the model and test data paths "
459
+ "(example: tool --model-base-dir /data model.onnx --test-data-dir inputs)"
460
+ ),
199
461
  )
462
+ verify_parser.add_argument("model", type=Path, help="Path to the ONNX model")
200
463
  verify_parser.add_argument(
201
464
  "--model-name",
202
465
  type=str,
@@ -219,9 +482,10 @@ def _build_parser() -> argparse.ArgumentParser:
219
482
  ),
220
483
  )
221
484
  verify_parser.add_argument(
222
- "--large-temp-threshold-bytes",
485
+ "--large-temp-threshold",
223
486
  type=int,
224
487
  default=1024,
488
+ dest="large_temp_threshold_bytes",
225
489
  help=(
226
490
  "Mark temporary buffers larger than this threshold as static "
227
491
  "(default: 1024)"
@@ -230,10 +494,10 @@ def _build_parser() -> argparse.ArgumentParser:
230
494
  verify_parser.add_argument(
231
495
  "--large-weight-threshold",
232
496
  type=int,
233
- default=1024,
497
+ default=100 * 1024,
234
498
  help=(
235
- "Store weights larger than this element count in a binary file "
236
- "(default: 1024)"
499
+ "Store weights in a binary file once the cumulative byte size "
500
+ "exceeds this threshold (default: 102400)"
237
501
  ),
238
502
  )
239
503
  verify_parser.add_argument(
@@ -245,30 +509,100 @@ def _build_parser() -> argparse.ArgumentParser:
245
509
  "(default: use random testbench inputs)"
246
510
  ),
247
511
  )
512
+ verify_parser.add_argument(
513
+ "--temp-dir-root",
514
+ type=Path,
515
+ default=None,
516
+ help=(
517
+ "Root directory in which to create a temporary verification "
518
+ "directory (default: system temp dir)"
519
+ ),
520
+ )
521
+ verify_parser.add_argument(
522
+ "--temp-dir",
523
+ type=Path,
524
+ default=None,
525
+ help=(
526
+ "Exact directory to use for temporary verification files "
527
+ "(default: create a temporary directory)"
528
+ ),
529
+ )
530
+ verify_parser.add_argument(
531
+ "--keep-temp-dir",
532
+ action="store_true",
533
+ help="Keep the temporary verification directory (default: delete it)",
534
+ )
248
535
  verify_parser.add_argument(
249
536
  "--max-ulp",
250
537
  type=int,
251
538
  default=100,
252
539
  help="Maximum allowed ULP difference for floating outputs (default: 100)",
253
540
  )
541
+ verify_parser.add_argument(
542
+ "--atol-eps",
543
+ type=float,
544
+ default=1.0,
545
+ help=(
546
+ "Absolute tolerance as a multiple of machine epsilon for ULP checks "
547
+ "(default: 1.0)"
548
+ ),
549
+ )
254
550
  verify_parser.add_argument(
255
551
  "--runtime",
256
552
  choices=("onnxruntime", "onnx-reference"),
257
- default="onnx-reference",
553
+ default="onnxruntime",
258
554
  help=(
259
- "Runtime backend for verification (default: onnx-reference; "
555
+ "Runtime backend for verification (default: onnxruntime; "
260
556
  "options: onnxruntime, onnx-reference)"
261
557
  ),
262
558
  )
559
+ verify_parser.add_argument(
560
+ "--expected-checksum",
561
+ type=str,
562
+ default=None,
563
+ help=(
564
+ "Expected generated C checksum (sha256). When it matches the "
565
+ "computed checksum, verification exits early with CHECKSUM."
566
+ ),
567
+ )
263
568
  add_restrict_flags(verify_parser)
569
+ add_fp32_accumulation_strategy_flag(verify_parser)
570
+ add_fp16_accumulation_strategy_flag(verify_parser)
264
571
  return parser
265
572
 
266
573
 
574
+ def _resolve_with_base_dir(base_dir: Path, path: Path) -> Path:
575
+ if path.is_absolute():
576
+ return path
577
+ return Path(os.path.normpath(os.path.join(base_dir, path)))
578
+
579
+
580
+ def _apply_base_dir(
581
+ args: argparse.Namespace, parser: argparse.ArgumentParser
582
+ ) -> None:
583
+ model_base_dir: Path | None = args.model_base_dir
584
+ if model_base_dir is None:
585
+ return
586
+ if not model_base_dir.exists() or not model_base_dir.is_dir():
587
+ parser.error(
588
+ f"--model-base-dir {model_base_dir} does not exist or is not a directory"
589
+ )
590
+ path_fields = ("model", "test_data_dir")
591
+ for field in path_fields:
592
+ value = getattr(args, field, None)
593
+ if value is None:
594
+ continue
595
+ if not isinstance(value, Path):
596
+ continue
597
+ setattr(args, field, _resolve_with_base_dir(model_base_dir, value))
598
+
599
+
267
600
  def main(argv: Sequence[str] | None = None) -> int:
268
601
  logging.basicConfig(level=logging.INFO)
269
602
  parser = _build_parser()
270
603
  args = parser.parse_args(argv)
271
604
  args.command_line = _format_command_line(argv)
605
+ _apply_base_dir(args, parser)
272
606
 
273
607
  if args.command == "compile":
274
608
  return _handle_compile(args)
@@ -279,27 +613,28 @@ def main(argv: Sequence[str] | None = None) -> int:
279
613
 
280
614
 
281
615
  def _handle_compile(args: argparse.Namespace) -> int:
616
+ reporter = _VerifyReporter(color_mode=args.color)
282
617
  model_path: Path = args.model
283
618
  output_path: Path = args.output or model_path.with_suffix(".c")
284
619
  model_name = args.model_name or "model"
285
- generated, data_source, weight_data, error = _compile_model(args)
620
+ generated, data_source, weight_data, error = _compile_model(
621
+ args, reporter=reporter
622
+ )
286
623
  if error:
287
- LOGGER.error("Failed to compile %s: %s", model_path, error)
624
+ reporter.info("")
625
+ reporter.result(error, ok=False)
288
626
  return 1
289
627
 
290
628
  output_path.parent.mkdir(parents=True, exist_ok=True)
291
629
  output_path.write_text(generated or "", encoding="utf-8")
292
- LOGGER.info("Wrote C source to %s", output_path)
293
630
  if data_source is not None:
294
631
  data_path = output_path.with_name(
295
632
  f"{output_path.stem}_data{output_path.suffix}"
296
633
  )
297
634
  data_path.write_text(data_source, encoding="utf-8")
298
- LOGGER.info("Wrote data source to %s", data_path)
299
635
  if weight_data is not None:
300
636
  weights_path = output_path.with_name(f"{model_name}.bin")
301
637
  weights_path.write_bytes(weight_data)
302
- LOGGER.info("Wrote weights binary to %s", weights_path)
303
638
  return 0
304
639
 
305
640
 
@@ -307,23 +642,50 @@ def _compile_model(
307
642
  args: argparse.Namespace,
308
643
  *,
309
644
  testbench_inputs: Mapping[str, "np.ndarray"] | None = None,
645
+ reporter: _VerifyReporter | None = None,
310
646
  ) -> tuple[str | None, str | None, bytes | None, str | None]:
311
647
  model_path: Path = args.model
312
648
  model_name = args.model_name or "model"
649
+ active_reporter = reporter or _NullVerifyReporter()
650
+ load_started = active_reporter.start_step(
651
+ f"Loading model {model_path.name}"
652
+ )
653
+ timings: dict[str, float] = {}
654
+ try:
655
+ model, model_checksum = _load_model_and_checksum(model_path)
656
+ active_reporter.step_ok(load_started)
657
+ except OSError as exc:
658
+ active_reporter.step_fail(str(exc))
659
+ return None, None, None, str(exc)
660
+ operators = _collect_model_operators(model)
661
+ opset_version = _model_opset_version(model)
662
+ _report_model_details(
663
+ active_reporter,
664
+ model_path=model_path,
665
+ model_checksum=model_checksum,
666
+ operators=operators,
667
+ opset_version=opset_version,
668
+ node_count=len(model.graph.node),
669
+ initializer_count=len(model.graph.initializer),
670
+ input_count=len(model.graph.input),
671
+ output_count=len(model.graph.output),
672
+ )
673
+ active_reporter.info("")
674
+ codegen_started = active_reporter.start_step("Generating C code")
313
675
  try:
314
- model_checksum = _model_checksum(model_path)
315
- model = onnx.load_model(model_path)
316
676
  options = CompilerOptions(
317
- template_dir=args.template_dir,
318
677
  model_name=model_name,
319
678
  emit_testbench=args.emit_testbench,
320
679
  command_line=args.command_line,
321
680
  model_checksum=model_checksum,
322
681
  restrict_arrays=args.restrict_arrays,
682
+ fp32_accumulation_strategy=args.fp32_accumulation_strategy,
683
+ fp16_accumulation_strategy=args.fp16_accumulation_strategy,
323
684
  truncate_weights_after=args.truncate_weights_after,
324
685
  large_temp_threshold_bytes=args.large_temp_threshold_bytes,
325
686
  large_weight_threshold=args.large_weight_threshold,
326
687
  testbench_inputs=testbench_inputs,
688
+ timings=timings,
327
689
  )
328
690
  compiler = Compiler(options)
329
691
  if args.emit_data_file:
@@ -333,8 +695,26 @@ def _compile_model(
333
695
  else:
334
696
  generated, weight_data = compiler.compile_with_weight_data(model)
335
697
  data_source = None
336
- except (OSError, CodegenError, ShapeInferenceError, UnsupportedOpError) as exc:
698
+ active_reporter.step_ok(codegen_started)
699
+ if args.verbose:
700
+ _report_codegen_timings(active_reporter, timings=timings)
701
+ except (CodegenError, ShapeInferenceError, UnsupportedOpError) as exc:
702
+ active_reporter.step_fail(str(exc))
337
703
  return None, None, None, str(exc)
704
+ output_path: Path = args.output or model_path.with_suffix(".c")
705
+ artifacts = [(str(output_path), len(generated.encode("utf-8")))]
706
+ if data_source is not None:
707
+ data_path = output_path.with_name(
708
+ f"{output_path.stem}_data{output_path.suffix}"
709
+ )
710
+ artifacts.append((str(data_path), len(data_source.encode("utf-8"))))
711
+ if weight_data is not None:
712
+ weights_path = output_path.with_name(f"{model_name}.bin")
713
+ artifacts.append((str(weights_path), len(weight_data)))
714
+ _report_generated_artifacts(active_reporter, artifacts=artifacts)
715
+ active_reporter.info(
716
+ f" Generated checksum (sha256): {_generated_checksum(generated)}"
717
+ )
338
718
  return generated, data_source, weight_data, None
339
719
 
340
720
 
@@ -363,21 +743,27 @@ def _resolve_compiler(cc: str | None, prefer_ccache: bool = False) -> list[str]
363
743
  if env_cc:
364
744
  return resolve_tokens(shlex.split(env_cc))
365
745
  for candidate in ("cc", "gcc", "clang"):
366
- resolved = shutil.which(candidate)
367
- if resolved:
368
- return maybe_prefix_ccache([resolved])
746
+ if shutil.which(candidate):
747
+ return maybe_prefix_ccache([candidate])
369
748
  return None
370
749
 
371
750
 
372
751
  def _handle_verify(args: argparse.Namespace) -> int:
373
- success_message, error, _operators = _verify_model(
374
- args, include_build_details=True
375
- )
752
+ reporter = _VerifyReporter(color_mode=args.color)
753
+ (
754
+ success_message,
755
+ error,
756
+ _operators,
757
+ _opset_version,
758
+ generated_checksum,
759
+ ) = _verify_model(args, include_build_details=True, reporter=reporter)
376
760
  if error is not None:
377
- LOGGER.error("Verification failed: %s", error)
761
+ reporter.info("")
762
+ reporter.result(error, ok=False)
378
763
  return 1
379
764
  if success_message:
380
- LOGGER.info("%s", success_message)
765
+ reporter.info("")
766
+ reporter.result(success_message, ok=True)
381
767
  return 0
382
768
 
383
769
 
@@ -385,12 +771,9 @@ def _verify_model(
385
771
  args: argparse.Namespace,
386
772
  *,
387
773
  include_build_details: bool,
388
- ) -> tuple[str | None, str | None, list[str]]:
389
- import numpy as np
390
-
391
- def log_step(step: str, started_at: float) -> None:
392
- duration = time.perf_counter() - started_at
393
- LOGGER.info("verify step %s: %.3fs", step, duration)
774
+ reporter: _VerifyReporter | None = None,
775
+ ) -> tuple[str | None, str | None, list[str], int | None, str | None]:
776
+ active_reporter = reporter or _NullVerifyReporter()
394
777
 
395
778
  def describe_exit_code(returncode: int) -> str:
396
779
  if returncode >= 0:
@@ -404,54 +787,176 @@ def _verify_model(
404
787
 
405
788
  model_path: Path = args.model
406
789
  model_name = args.model_name or "model"
407
- model_checksum = _model_checksum(model_path)
790
+ model, model_checksum = _load_model_and_checksum(model_path)
408
791
  compiler_cmd = _resolve_compiler(args.cc, prefer_ccache=False)
409
792
  if compiler_cmd is None:
410
793
  return (
411
794
  None,
412
795
  "No C compiler found (set --cc or CC environment variable).",
413
796
  [],
797
+ None,
798
+ None,
414
799
  )
800
+ temp_dir_root: Path | None = args.temp_dir_root
801
+ explicit_temp_dir: Path | None = args.temp_dir
802
+ if temp_dir_root is not None and explicit_temp_dir is not None:
803
+ return (
804
+ None,
805
+ "Cannot set both --temp-dir-root and --temp-dir.",
806
+ operators,
807
+ opset_version,
808
+ generated_checksum,
809
+ )
810
+ if temp_dir_root is not None:
811
+ if temp_dir_root.exists() and not temp_dir_root.is_dir():
812
+ return (
813
+ None,
814
+ f"Verification temp dir root is not a directory: {temp_dir_root}",
815
+ operators,
816
+ opset_version,
817
+ generated_checksum,
818
+ )
819
+ temp_dir_root.mkdir(parents=True, exist_ok=True)
820
+ if explicit_temp_dir is not None:
821
+ if explicit_temp_dir.exists() and not explicit_temp_dir.is_dir():
822
+ return (
823
+ None,
824
+ f"Verification temp dir is not a directory: {explicit_temp_dir}",
825
+ operators,
826
+ opset_version,
827
+ generated_checksum,
828
+ )
829
+ temp_dir: tempfile.TemporaryDirectory | None = None
830
+ cleanup_created_dir = False
831
+ if explicit_temp_dir is not None:
832
+ temp_path = explicit_temp_dir
833
+ if not temp_path.exists():
834
+ temp_path.mkdir(parents=True, exist_ok=True)
835
+ cleanup_created_dir = not args.keep_temp_dir
836
+ elif args.keep_temp_dir:
837
+ temp_path = Path(
838
+ tempfile.mkdtemp(
839
+ dir=str(temp_dir_root) if temp_dir_root is not None else None
840
+ )
841
+ )
842
+ else:
843
+ temp_dir = tempfile.TemporaryDirectory(
844
+ dir=str(temp_dir_root) if temp_dir_root is not None else None
845
+ )
846
+ temp_path = Path(temp_dir.name)
847
+ keep_label = (
848
+ "--keep-temp-dir set" if args.keep_temp_dir else "--keep-temp-dir not set"
849
+ )
850
+ active_reporter.note(
851
+ f"Using temporary folder [{keep_label}]: {temp_path}"
852
+ )
853
+ active_reporter.info("")
854
+ load_started = active_reporter.start_step(f"Loading model {model_path.name}")
415
855
  try:
416
- model = onnx.load_model(model_path)
856
+ model, model_checksum = _load_model_and_checksum(model_path)
417
857
  except OSError as exc:
418
- return None, str(exc), []
858
+ active_reporter.step_fail(str(exc))
859
+ return None, str(exc), [], None, None
860
+ active_reporter.step_ok(load_started)
419
861
 
420
862
  operators = _collect_model_operators(model)
421
- operators_display = ", ".join(operators) if operators else "(none)"
422
- LOGGER.info("verify operators: %s", operators_display)
863
+ opset_version = _model_opset_version(model)
864
+ _report_model_details(
865
+ active_reporter,
866
+ model_path=model_path,
867
+ model_checksum=model_checksum,
868
+ operators=operators,
869
+ opset_version=opset_version,
870
+ node_count=len(model.graph.node),
871
+ initializer_count=len(model.graph.initializer),
872
+ input_count=len(model.graph.input),
873
+ output_count=len(model.graph.output),
874
+ )
423
875
 
876
+ timings: dict[str, float] = {}
424
877
  try:
425
- testbench_inputs = _load_test_data_inputs(model, args.test_data_dir)
878
+ active_reporter.info("")
879
+ codegen_started = active_reporter.start_step("Generating C code")
880
+ testbench_inputs, testbench_optional_inputs = _load_test_data_inputs(
881
+ model, args.test_data_dir
882
+ )
883
+ testbench_outputs = _load_test_data_outputs(model, args.test_data_dir)
426
884
  options = CompilerOptions(
427
- template_dir=args.template_dir,
428
885
  model_name=model_name,
429
886
  emit_testbench=True,
430
- command_line=args.command_line,
887
+ command_line=None,
431
888
  model_checksum=model_checksum,
432
889
  restrict_arrays=args.restrict_arrays,
890
+ fp32_accumulation_strategy=args.fp32_accumulation_strategy,
891
+ fp16_accumulation_strategy=args.fp16_accumulation_strategy,
433
892
  truncate_weights_after=args.truncate_weights_after,
434
893
  large_temp_threshold_bytes=args.large_temp_threshold_bytes,
435
894
  large_weight_threshold=args.large_weight_threshold,
436
895
  testbench_inputs=testbench_inputs,
896
+ testbench_optional_inputs=testbench_optional_inputs,
897
+ timings=timings,
437
898
  )
438
899
  compiler = Compiler(options)
439
- codegen_started = time.perf_counter()
440
900
  generated, weight_data = compiler.compile_with_weight_data(model)
441
- log_step("codegen", codegen_started)
901
+ active_reporter.step_ok(codegen_started)
902
+ if args.verbose:
903
+ _report_codegen_timings(active_reporter, timings=timings)
904
+ artifacts = [("model.c", len(generated.encode("utf-8")))]
905
+ if weight_data is not None:
906
+ artifacts.append((f"{model_name}.bin", len(weight_data)))
907
+ _report_generated_artifacts(active_reporter, artifacts=artifacts)
442
908
  except (CodegenError, ShapeInferenceError, UnsupportedOpError) as exc:
443
- return None, str(exc), operators
909
+ active_reporter.step_fail(str(exc))
910
+ return None, str(exc), operators, opset_version, None
911
+ generated_checksum = _generated_checksum(generated)
912
+ active_reporter.info(f" Generated checksum (sha256): {generated_checksum}")
913
+ expected_checksum = args.expected_checksum
914
+ if expected_checksum and expected_checksum == generated_checksum:
915
+ return "CHECKSUM", None, operators, opset_version, generated_checksum
444
916
 
445
917
  try:
446
918
  graph = import_onnx(model)
447
919
  output_dtypes = {value.name: value.type.dtype for value in graph.outputs}
448
920
  input_dtypes = {value.name: value.type.dtype for value in graph.inputs}
449
921
  except (KeyError, UnsupportedOpError, ShapeInferenceError) as exc:
450
- return None, f"Failed to resolve model dtype: {exc}", operators
922
+ return (
923
+ None,
924
+ f"Failed to resolve model dtype: {exc}",
925
+ operators,
926
+ opset_version,
927
+ None,
928
+ )
451
929
 
452
- with tempfile.TemporaryDirectory() as temp_dir:
453
- temp_path = Path(temp_dir)
454
- LOGGER.info("verify temp dir: %s", temp_path)
930
+ def _cleanup_temp() -> None:
931
+ if temp_dir is None and not cleanup_created_dir:
932
+ return
933
+ if temp_dir is None:
934
+ shutil.rmtree(temp_path)
935
+ else:
936
+ temp_dir.cleanup()
937
+
938
+ try:
939
+ payload: dict[str, Any] | None = None
940
+ testbench_input_path: Path | None = None
941
+ if testbench_inputs:
942
+ input_order = [value.name for value in graph.inputs]
943
+ testbench_input_path = temp_path / "testbench_inputs.bin"
944
+ with testbench_input_path.open("wb") as handle:
945
+ for name in input_order:
946
+ array = testbench_inputs.get(name)
947
+ if array is None:
948
+ return (
949
+ None,
950
+ f"Missing testbench input data for {name}.",
951
+ operators,
952
+ opset_version,
953
+ generated_checksum,
954
+ )
955
+ dtype = input_dtypes[name].np_dtype
956
+ blob = np.ascontiguousarray(
957
+ array.astype(dtype, copy=False)
958
+ ).tobytes(order="C")
959
+ handle.write(blob)
455
960
  c_path = temp_path / "model.c"
456
961
  weights_path = temp_path / f"{model_name}.bin"
457
962
  exe_path = temp_path / "model"
@@ -459,126 +964,302 @@ def _verify_model(
459
964
  if weight_data is not None:
460
965
  weights_path.write_bytes(weight_data)
461
966
  try:
462
- compile_started = time.perf_counter()
463
967
  compile_cmd = [
464
968
  *compiler_cmd,
465
969
  "-std=c99",
466
- "-O2",
467
- str(c_path),
970
+ "-O1",
971
+ "-fsanitize=address,undefined",
972
+ "-Wall",
973
+ "-Werror",
974
+ str(c_path.name),
468
975
  "-o",
469
- str(exe_path),
976
+ str(exe_path.name),
470
977
  "-lm",
471
978
  ]
472
- LOGGER.info("verify compile command: %s", shlex.join(compile_cmd))
979
+ active_reporter.info("")
980
+ compile_started = active_reporter.start_step("Compiling C code")
473
981
  subprocess.run(
474
982
  compile_cmd,
475
983
  check=True,
476
984
  capture_output=True,
477
985
  text=True,
986
+ cwd=temp_path,
987
+ )
988
+ active_reporter.step_ok(compile_started)
989
+ active_reporter.info(
990
+ f" Compile command: {shlex.join(compile_cmd)}"
478
991
  )
479
- log_step("compile", compile_started)
992
+ active_reporter.info("")
993
+ if args.test_data_dir is not None:
994
+ active_reporter.info(
995
+ f"Verifying using test data set: {args.test_data_dir.name}"
996
+ )
997
+ else:
998
+ active_reporter.info(
999
+ "Verifying using generated random inputs"
1000
+ )
480
1001
  except subprocess.CalledProcessError as exc:
481
1002
  message = "Failed to build testbench."
482
1003
  if include_build_details:
483
1004
  details = exc.stderr.strip()
484
1005
  if details:
485
1006
  message = f"{message} {details}"
486
- return None, message, operators
1007
+ active_reporter.step_fail(message)
1008
+ return None, message, operators, opset_version, generated_checksum
487
1009
  try:
488
- run_started = time.perf_counter()
1010
+ run_started = active_reporter.start_step(
1011
+ " Running generated binary"
1012
+ )
1013
+ run_cmd = [str(exe_path)]
1014
+ if testbench_input_path is not None:
1015
+ run_cmd.append(str(testbench_input_path))
489
1016
  result = subprocess.run(
490
- [str(exe_path)],
1017
+ run_cmd,
491
1018
  check=True,
492
1019
  capture_output=True,
493
1020
  text=True,
494
1021
  cwd=temp_path,
495
1022
  )
496
- log_step("run", run_started)
1023
+ active_reporter.step_ok(run_started)
1024
+ result_json_path = temp_path / "testbench.json"
1025
+ result_json_path.write_text(result.stdout, encoding="utf-8")
1026
+ try:
1027
+ payload = json.loads(result_json_path.read_text(encoding="utf-8"))
1028
+ except json.JSONDecodeError as exc:
1029
+ return (
1030
+ None,
1031
+ f"Failed to parse testbench JSON: {exc}",
1032
+ operators,
1033
+ opset_version,
1034
+ generated_checksum,
1035
+ )
497
1036
  except subprocess.CalledProcessError as exc:
1037
+ active_reporter.step_fail(describe_exit_code(exc.returncode))
498
1038
  return None, (
499
1039
  "Testbench execution failed: " + describe_exit_code(exc.returncode)
500
- ), operators
1040
+ ), operators, opset_version, generated_checksum
1041
+ if payload is None:
1042
+ return (
1043
+ None,
1044
+ "Failed to parse testbench JSON: missing output.",
1045
+ operators,
1046
+ opset_version,
1047
+ generated_checksum,
1048
+ )
501
1049
 
502
- try:
503
- payload = json.loads(result.stdout)
504
- except json.JSONDecodeError as exc:
505
- return None, f"Failed to parse testbench JSON: {exc}", operators
506
-
507
- if testbench_inputs:
508
- inputs = {
509
- name: values.astype(input_dtypes[name].np_dtype, copy=False)
510
- for name, values in testbench_inputs.items()
511
- }
512
- else:
513
- inputs = {
514
- name: decode_testbench_array(
515
- value["data"], input_dtypes[name].np_dtype
1050
+ if testbench_inputs:
1051
+ inputs = {
1052
+ name: values.astype(input_dtypes[name].np_dtype, copy=False)
1053
+ for name, values in testbench_inputs.items()
1054
+ }
1055
+ else:
1056
+ inputs = {
1057
+ name: decode_testbench_array(
1058
+ value["data"], input_dtypes[name].np_dtype
1059
+ )
1060
+ for name, value in payload["inputs"].items()
1061
+ }
1062
+ runtime_outputs: dict[str, np.ndarray] | None = None
1063
+ if testbench_outputs is not None:
1064
+ runtime_outputs = {
1065
+ name: output.astype(output_dtypes[name].np_dtype, copy=False)
1066
+ for name, output in testbench_outputs.items()
1067
+ }
1068
+ else:
1069
+ runtime_name = args.runtime
1070
+ custom_domains = sorted(
1071
+ {
1072
+ opset.domain
1073
+ for opset in model.opset_import
1074
+ if opset.domain not in {"", "ai.onnx"}
1075
+ }
1076
+ )
1077
+ if runtime_name == "onnx-reference" and custom_domains:
1078
+ active_reporter.note(
1079
+ "Runtime: switching to onnxruntime for custom domains "
1080
+ f"{', '.join(custom_domains)}"
1081
+ )
1082
+ runtime_name = "onnxruntime"
1083
+ runtime_started = active_reporter.start_step(
1084
+ f" Running {runtime_name} [--runtime={args.runtime}]"
516
1085
  )
517
- for name, value in payload["inputs"].items()
1086
+ try:
1087
+ if runtime_name == "onnxruntime":
1088
+ import onnxruntime as ort
1089
+
1090
+ sess_options = make_deterministic_session_options(ort)
1091
+ sess = ort.InferenceSession(
1092
+ model.SerializeToString(),
1093
+ sess_options=sess_options,
1094
+ providers=["CPUExecutionProvider"],
1095
+ )
1096
+ runtime_outputs_list = sess.run(None, inputs)
1097
+ else:
1098
+ from onnx.reference import ReferenceEvaluator
1099
+
1100
+ with deterministic_reference_runtime():
1101
+ evaluator = ReferenceEvaluator(model)
1102
+ runtime_outputs_list = evaluator.run(None, inputs)
1103
+ except Exception as exc:
1104
+ active_reporter.step_fail(str(exc))
1105
+ message = str(exc)
1106
+ if runtime_name == "onnxruntime" and "NOT_IMPLEMENTED" in message:
1107
+ active_reporter.note(
1108
+ f"Skipping verification for {model_path}: "
1109
+ "ONNX Runtime does not support the model "
1110
+ f"({message})"
1111
+ )
1112
+ return "", None, operators, opset_version, generated_checksum
1113
+ return (
1114
+ None,
1115
+ f"{runtime_name} failed to run {model_path}: {message}",
1116
+ operators,
1117
+ opset_version,
1118
+ generated_checksum,
1119
+ )
1120
+ active_reporter.step_ok(runtime_started)
1121
+ runtime_outputs = {
1122
+ value.name: output
1123
+ for value, output in zip(graph.outputs, runtime_outputs_list)
1124
+ }
1125
+ nondeterministic_ops = sorted(
1126
+ set(operators).intersection(_NONDETERMINISTIC_OPERATORS)
1127
+ )
1128
+ if nondeterministic_ops:
1129
+ active_reporter.note(
1130
+ "Skipping output comparison for non-deterministic operator(s): "
1131
+ f"{', '.join(nondeterministic_ops)}"
1132
+ )
1133
+ return (
1134
+ "OK (non-deterministic output)",
1135
+ None,
1136
+ operators,
1137
+ opset_version,
1138
+ generated_checksum,
1139
+ )
1140
+ payload_outputs = payload.get("outputs", {})
1141
+ max_ulp = 0
1142
+ worst_diff: _WorstDiff | None = None
1143
+ max_abs_diff: float | int = 0
1144
+ worst_abs_diff: _WorstAbsDiff | None = None
1145
+ output_nodes = {
1146
+ output_name: node
1147
+ for node in graph.nodes
1148
+ for output_name in node.outputs
518
1149
  }
519
- runtime_name = args.runtime
520
- runtime_started = time.perf_counter()
521
- try:
522
- if runtime_name == "onnxruntime":
523
- import onnxruntime as ort
524
-
525
- sess_options = make_deterministic_session_options(ort)
526
- sess = ort.InferenceSession(
527
- model.SerializeToString(),
528
- sess_options=sess_options,
529
- providers=["CPUExecutionProvider"],
1150
+ active_reporter.start_step(
1151
+ f" Comparing outputs [--max-ulp={args.max_ulp}]"
1152
+ )
1153
+ try:
1154
+ for value in graph.outputs:
1155
+ runtime_out = runtime_outputs[value.name]
1156
+ output_payload = payload_outputs.get(value.name)
1157
+ if output_payload is None:
1158
+ raise AssertionError(
1159
+ f"Missing output {value.name} in testbench data"
1160
+ )
1161
+ info = output_dtypes[value.name]
1162
+ output_data = decode_testbench_array(
1163
+ output_payload["data"], info.np_dtype
1164
+ ).astype(info.np_dtype, copy=False)
1165
+ runtime_out = runtime_out.astype(info.np_dtype, copy=False)
1166
+ output_data = output_data.reshape(runtime_out.shape)
1167
+ if np.issubdtype(info.np_dtype, np.floating):
1168
+ output_max, output_worst = worst_ulp_diff(
1169
+ output_data,
1170
+ runtime_out,
1171
+ atol_eps=args.atol_eps,
1172
+ )
1173
+ if output_max > max_ulp:
1174
+ max_ulp = output_max
1175
+ if output_worst is not None:
1176
+ node = output_nodes.get(value.name)
1177
+ worst_diff = _WorstDiff(
1178
+ output_name=value.name,
1179
+ node_name=node.name if node else None,
1180
+ index=output_worst[0],
1181
+ got=float(output_worst[1]),
1182
+ reference=float(output_worst[2]),
1183
+ ulp=output_max,
1184
+ )
1185
+ else:
1186
+ output_max, output_worst = _worst_abs_diff(
1187
+ output_data, runtime_out
1188
+ )
1189
+ if output_max > max_abs_diff:
1190
+ max_abs_diff = output_max
1191
+ if output_worst is not None:
1192
+ node = output_nodes.get(value.name)
1193
+ worst_abs_diff = _WorstAbsDiff(
1194
+ output_name=value.name,
1195
+ node_name=node.name if node else None,
1196
+ index=output_worst[0],
1197
+ got=output_worst[1],
1198
+ reference=output_worst[2],
1199
+ abs_diff=output_max,
1200
+ )
1201
+ except AssertionError as exc:
1202
+ active_reporter.step_fail(str(exc))
1203
+ return None, str(exc), operators, opset_version, generated_checksum
1204
+ if max_abs_diff > 0:
1205
+ active_reporter.step_fail(f"max abs diff {max_abs_diff}")
1206
+ if worst_abs_diff is not None:
1207
+ node_label = worst_abs_diff.node_name or "(unknown)"
1208
+ index_display = ", ".join(str(dim) for dim in worst_abs_diff.index)
1209
+ active_reporter.info(
1210
+ " Worst diff: output="
1211
+ f"{worst_abs_diff.output_name} node={node_label} "
1212
+ f"index=[{index_display}] "
1213
+ f"got={worst_abs_diff.got} "
1214
+ f"ref={worst_abs_diff.reference} "
1215
+ f"abs_diff={worst_abs_diff.abs_diff}"
1216
+ )
1217
+ return (
1218
+ None,
1219
+ f"Arrays are not equal (max abs diff {max_abs_diff})",
1220
+ operators,
1221
+ opset_version,
1222
+ generated_checksum,
530
1223
  )
531
- runtime_outputs = sess.run(None, inputs)
532
- else:
533
- from onnx.reference import ReferenceEvaluator
534
-
535
- evaluator = ReferenceEvaluator(model)
536
- runtime_outputs = evaluator.run(None, inputs)
537
- except Exception as exc:
538
- log_step(runtime_name, runtime_started)
539
- message = str(exc)
540
- if runtime_name == "onnxruntime" and "NOT_IMPLEMENTED" in message:
541
- LOGGER.warning(
542
- "Skipping verification for %s: ONNX Runtime does not support the model (%s)",
543
- model_path,
544
- message,
1224
+ if max_ulp > args.max_ulp:
1225
+ active_reporter.step_fail(f"max ULP {max_ulp}")
1226
+ if worst_diff is not None:
1227
+ node_label = worst_diff.node_name or "(unknown)"
1228
+ index_display = ", ".join(str(dim) for dim in worst_diff.index)
1229
+ active_reporter.info(
1230
+ " Worst diff: output="
1231
+ f"{worst_diff.output_name} node={node_label} "
1232
+ f"index=[{index_display}] "
1233
+ f"got={worst_diff.got:.8g} "
1234
+ f"ref={worst_diff.reference:.8g} "
1235
+ f"ulp={worst_diff.ulp}"
1236
+ )
1237
+ return (
1238
+ None,
1239
+ f"Out of tolerance (max ULP {max_ulp})",
1240
+ operators,
1241
+ opset_version,
1242
+ generated_checksum,
545
1243
  )
546
- return "", None, operators
1244
+ active_reporter.step_ok_simple()
1245
+ active_reporter.info(f" Maximum ULP: {max_ulp}")
547
1246
  return (
1247
+ format_success_message(max_ulp),
548
1248
  None,
549
- f"{runtime_name} failed to run {model_path}: {message}",
550
1249
  operators,
1250
+ opset_version,
1251
+ generated_checksum,
551
1252
  )
552
- log_step(runtime_name, runtime_started)
553
- payload_outputs = payload.get("outputs", {})
554
- max_ulp = 0
555
- try:
556
- for value, runtime_out in zip(graph.outputs, runtime_outputs):
557
- output_payload = payload_outputs.get(value.name)
558
- if output_payload is None:
559
- raise AssertionError(f"Missing output {value.name} in testbench data")
560
- info = output_dtypes[value.name]
561
- output_data = decode_testbench_array(
562
- output_payload["data"], info.np_dtype
563
- ).astype(info.np_dtype, copy=False)
564
- runtime_out = runtime_out.astype(info.np_dtype, copy=False)
565
- output_data = output_data.reshape(runtime_out.shape)
566
- if np.issubdtype(info.np_dtype, np.floating):
567
- max_ulp = max(max_ulp, max_ulp_diff(output_data, runtime_out))
568
- else:
569
- np.testing.assert_array_equal(output_data, runtime_out)
570
- except AssertionError as exc:
571
- return None, str(exc), operators
572
- if max_ulp > args.max_ulp:
573
- return None, f"Out of tolerance (max ULP {max_ulp})", operators
574
- return format_success_message(max_ulp), None, operators
1253
+ finally:
1254
+ active_reporter.info("")
1255
+ _cleanup_temp()
575
1256
 
576
1257
 
577
1258
  def _load_test_data_inputs(
578
1259
  model: onnx.ModelProto, data_dir: Path | None
579
- ) -> dict[str, "np.ndarray"] | None:
1260
+ ) -> tuple[dict[str, "np.ndarray"] | None, dict[str, bool] | None]:
580
1261
  if data_dir is None:
581
- return None
1262
+ return None, None
582
1263
  if not data_dir.exists():
583
1264
  raise CodegenError(f"Test data directory not found: {data_dir}")
584
1265
  input_files = sorted(
@@ -587,26 +1268,115 @@ def _load_test_data_inputs(
587
1268
  )
588
1269
  if not input_files:
589
1270
  raise CodegenError(f"No input_*.pb files found in {data_dir}")
590
- if len(input_files) != len(model.graph.input):
1271
+ initializer_names = {init.name for init in model.graph.initializer}
1272
+ initializer_names.update(
1273
+ sparse_init.name for sparse_init in model.graph.sparse_initializer
1274
+ )
1275
+ model_inputs = [
1276
+ value_info
1277
+ for value_info in model.graph.input
1278
+ if value_info.name not in initializer_names
1279
+ ]
1280
+ if len(input_files) != len(model_inputs):
591
1281
  raise CodegenError(
592
1282
  "Test data input count does not match model inputs: "
593
- f"{len(input_files)} vs {len(model.graph.input)}."
1283
+ f"{len(input_files)} vs {len(model_inputs)}."
594
1284
  )
595
- for value_info in model.graph.input:
1285
+ for value_info in model_inputs:
596
1286
  value_kind = value_info.type.WhichOneof("value")
597
- if value_kind != "tensor_type":
1287
+ if value_kind not in {"tensor_type", "optional_type"}:
598
1288
  LOGGER.warning(
599
1289
  "Skipping test data load for non-tensor input %s (type %s).",
600
1290
  value_info.name,
601
1291
  value_kind or "unknown",
602
1292
  )
603
- return None
1293
+ return None, None
604
1294
  inputs: dict[str, np.ndarray] = {}
1295
+ optional_flags: dict[str, bool] = {}
605
1296
  for index, path in enumerate(input_files):
1297
+ value_info = model_inputs[index]
1298
+ value_kind = value_info.type.WhichOneof("value")
1299
+ if value_kind == "tensor_type":
1300
+ tensor = onnx.TensorProto()
1301
+ tensor.ParseFromString(path.read_bytes())
1302
+ inputs[value_info.name] = numpy_helper.to_array(tensor)
1303
+ continue
1304
+ optional = onnx.OptionalProto()
1305
+ optional.ParseFromString(path.read_bytes())
1306
+ elem_type = value_info.type.optional_type.elem_type
1307
+ if elem_type.WhichOneof("value") != "tensor_type":
1308
+ LOGGER.warning(
1309
+ "Skipping test data load for non-tensor optional input %s.",
1310
+ value_info.name,
1311
+ )
1312
+ return None, None
1313
+ tensor_type = elem_type.tensor_type
1314
+ if optional.HasField("tensor_value"):
1315
+ inputs[value_info.name] = numpy_helper.to_array(
1316
+ optional.tensor_value
1317
+ )
1318
+ optional_flags[value_info.name] = True
1319
+ continue
1320
+ if not tensor_type.HasField("elem_type"):
1321
+ raise CodegenError(
1322
+ f"Optional input {value_info.name} is missing elem_type."
1323
+ )
1324
+ dtype_info = onnx._mapping.TENSOR_TYPE_MAP.get(tensor_type.elem_type)
1325
+ if dtype_info is None:
1326
+ raise CodegenError(
1327
+ f"Optional input {value_info.name} has unsupported elem_type."
1328
+ )
1329
+ shape: list[int] = []
1330
+ for dim in tensor_type.shape.dim:
1331
+ if dim.HasField("dim_value"):
1332
+ shape.append(dim.dim_value)
1333
+ elif dim.HasField("dim_param"):
1334
+ shape.append(1)
1335
+ else:
1336
+ raise CodegenError(
1337
+ f"Optional input {value_info.name} has unknown shape."
1338
+ )
1339
+ inputs[value_info.name] = np.zeros(
1340
+ tuple(shape), dtype=dtype_info.np_dtype
1341
+ )
1342
+ optional_flags[value_info.name] = False
1343
+ return inputs, optional_flags
1344
+
1345
+
1346
+ def _load_test_data_outputs(
1347
+ model: onnx.ModelProto, data_dir: Path | None
1348
+ ) -> dict[str, "np.ndarray"] | None:
1349
+ if data_dir is None:
1350
+ return None
1351
+ if not data_dir.exists():
1352
+ raise CodegenError(f"Test data directory not found: {data_dir}")
1353
+ output_files = sorted(
1354
+ data_dir.glob("output_*.pb"),
1355
+ key=lambda path: int(path.stem.split("_")[-1]),
1356
+ )
1357
+ if not output_files:
1358
+ return None
1359
+ model_outputs = list(model.graph.output)
1360
+ if len(output_files) != len(model_outputs):
1361
+ raise CodegenError(
1362
+ "Test data output count does not match model outputs: "
1363
+ f"{len(output_files)} vs {len(model_outputs)}."
1364
+ )
1365
+ for value_info in model_outputs:
1366
+ value_kind = value_info.type.WhichOneof("value")
1367
+ if value_kind != "tensor_type":
1368
+ LOGGER.warning(
1369
+ "Skipping test data load for non-tensor output %s (type %s).",
1370
+ value_info.name,
1371
+ value_kind or "unknown",
1372
+ )
1373
+ return None
1374
+ outputs: dict[str, np.ndarray] = {}
1375
+ for index, path in enumerate(output_files):
606
1376
  tensor = onnx.TensorProto()
607
1377
  tensor.ParseFromString(path.read_bytes())
608
- inputs[model.graph.input[index].name] = numpy_helper.to_array(tensor)
609
- return inputs
1378
+ outputs[model_outputs[index].name] = numpy_helper.to_array(tensor)
1379
+ return outputs
610
1380
 
611
1381
 
612
1382
  def _format_command_line(argv: Sequence[str] | None) -> str:
@@ -615,15 +1385,97 @@ def _format_command_line(argv: Sequence[str] | None) -> str:
615
1385
  args = [str(arg) for arg in argv[1:]]
616
1386
  if not args:
617
1387
  return ""
618
- return shlex.join(args)
1388
+ filtered: list[str] = []
1389
+ skip_next = False
1390
+ for arg in args:
1391
+ if skip_next:
1392
+ skip_next = False
1393
+ continue
1394
+ if arg == "--expected-checksum":
1395
+ skip_next = True
1396
+ continue
1397
+ if arg.startswith("--expected-checksum="):
1398
+ continue
1399
+ filtered.append(arg)
1400
+ if not filtered:
1401
+ return ""
1402
+ return shlex.join(filtered)
1403
+
1404
+
1405
+ def _load_model_and_checksum(
1406
+ model_path: Path,
1407
+ ) -> tuple[onnx.ModelProto, str]:
1408
+ model_bytes = model_path.read_bytes()
1409
+ digest = hashlib.sha256()
1410
+ digest.update(model_bytes)
1411
+ model = onnx.load_model_from_string(model_bytes)
1412
+ return model, digest.hexdigest()
619
1413
 
620
1414
 
621
- def _model_checksum(model_path: Path) -> str:
1415
+ def _generated_checksum(generated: str) -> str:
622
1416
  digest = hashlib.sha256()
623
- digest.update(model_path.read_bytes())
1417
+ digest.update(generated.encode("utf-8"))
624
1418
  return digest.hexdigest()
625
1419
 
626
1420
 
1421
+ def _report_model_details(
1422
+ reporter: _VerifyReporter,
1423
+ *,
1424
+ model_path: Path,
1425
+ model_checksum: str,
1426
+ operators: Sequence[str],
1427
+ opset_version: int | None,
1428
+ node_count: int,
1429
+ initializer_count: int,
1430
+ input_count: int,
1431
+ output_count: int,
1432
+ ) -> None:
1433
+ operators_display = ", ".join(operators) if operators else "(none)"
1434
+ reporter.info(
1435
+ f" Model operators ({len(operators)}): {operators_display}"
1436
+ )
1437
+ reporter.info(
1438
+ f" Model file size: {_format_artifact_size(model_path.stat().st_size)}"
1439
+ )
1440
+ reporter.info(f" Model checksum (sha256): {model_checksum}")
1441
+ if opset_version is not None:
1442
+ reporter.info(f" Opset version: {opset_version}")
1443
+ reporter.info(
1444
+ " Counts: "
1445
+ f"nodes={node_count}, "
1446
+ f"initializers={initializer_count}, "
1447
+ f"inputs={input_count}, "
1448
+ f"outputs={output_count}"
1449
+ )
1450
+
1451
+
1452
+ def _report_codegen_timings(
1453
+ reporter: _VerifyReporter, *, timings: Mapping[str, float]
1454
+ ) -> None:
1455
+ if not timings:
1456
+ return
1457
+ order = [
1458
+ ("import_onnx", "import"),
1459
+ ("concretize_shapes", "concretize"),
1460
+ ("resolve_testbench_inputs", "testbench"),
1461
+ ("collect_variable_dims", "var_dims"),
1462
+ ("lower_model", "lower"),
1463
+ ("emit_model", "emit"),
1464
+ ("emit_model_with_data_file", "emit_data"),
1465
+ ("collect_weight_data", "weights"),
1466
+ ]
1467
+ seen = set()
1468
+ parts: list[str] = []
1469
+ for key, label in order:
1470
+ if key not in timings:
1471
+ continue
1472
+ parts.append(f"{label}={timings[key]:.3f}s")
1473
+ seen.add(key)
1474
+ for key in sorted(k for k in timings if k not in seen):
1475
+ parts.append(f"{key}={timings[key]:.3f}s")
1476
+ reporter.info(f" Codegen timing: {', '.join(parts)}")
1477
+
1478
+
627
1479
  def _collect_model_operators(model: onnx.ModelProto) -> list[str]:
628
1480
  operators: list[str] = []
629
1481
  seen: set[str] = set()
@@ -634,3 +1486,14 @@ def _collect_model_operators(model: onnx.ModelProto) -> list[str]:
634
1486
  seen.add(op_name)
635
1487
  operators.append(op_name)
636
1488
  return operators
1489
+
1490
+
1491
+ def _model_opset_version(model: onnx.ModelProto, *, domain: str = "") -> int | None:
1492
+ if not model.opset_import:
1493
+ return None
1494
+ domains = (domain,) if domain else ("", "ai.onnx")
1495
+ for target_domain in domains:
1496
+ for opset in model.opset_import:
1497
+ if opset.domain == target_domain:
1498
+ return opset.version
1499
+ return None