emx-onnx-cgen 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (42)
  1. emx_onnx_cgen/_build_info.py +1 -1
  2. emx_onnx_cgen/_version.py +34 -0
  3. emx_onnx_cgen/cli.py +340 -59
  4. emx_onnx_cgen/codegen/c_emitter.py +2369 -111
  5. emx_onnx_cgen/compiler.py +188 -5
  6. emx_onnx_cgen/ir/model.py +1 -0
  7. emx_onnx_cgen/lowering/common.py +379 -2
  8. emx_onnx_cgen/lowering/conv_transpose.py +301 -0
  9. emx_onnx_cgen/lowering/einsum.py +153 -0
  10. emx_onnx_cgen/lowering/gather_elements.py +1 -3
  11. emx_onnx_cgen/lowering/gather_nd.py +79 -0
  12. emx_onnx_cgen/lowering/global_max_pool.py +59 -0
  13. emx_onnx_cgen/lowering/hardmax.py +53 -0
  14. emx_onnx_cgen/lowering/identity.py +6 -5
  15. emx_onnx_cgen/lowering/logsoftmax.py +5 -1
  16. emx_onnx_cgen/lowering/lp_pool.py +141 -0
  17. emx_onnx_cgen/lowering/matmul.py +6 -7
  18. emx_onnx_cgen/lowering/negative_log_likelihood_loss.py +12 -12
  19. emx_onnx_cgen/lowering/nonzero.py +42 -0
  20. emx_onnx_cgen/lowering/one_hot.py +120 -0
  21. emx_onnx_cgen/lowering/quantize_linear.py +126 -0
  22. emx_onnx_cgen/lowering/reduce.py +5 -6
  23. emx_onnx_cgen/lowering/reshape.py +223 -51
  24. emx_onnx_cgen/lowering/scatter_nd.py +82 -0
  25. emx_onnx_cgen/lowering/softmax.py +5 -1
  26. emx_onnx_cgen/lowering/squeeze.py +5 -5
  27. emx_onnx_cgen/lowering/topk.py +116 -0
  28. emx_onnx_cgen/lowering/trilu.py +89 -0
  29. emx_onnx_cgen/lowering/unsqueeze.py +5 -5
  30. emx_onnx_cgen/onnx_import.py +4 -0
  31. emx_onnx_cgen/onnxruntime_utils.py +11 -0
  32. emx_onnx_cgen/ops.py +4 -0
  33. emx_onnx_cgen/runtime/evaluator.py +460 -42
  34. emx_onnx_cgen/testbench.py +23 -0
  35. emx_onnx_cgen/verification.py +61 -0
  36. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.0.dist-info}/METADATA +31 -5
  37. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.0.dist-info}/RECORD +42 -25
  38. shared/scalar_functions.py +49 -17
  39. shared/ulp.py +48 -0
  40. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.0.dist-info}/WHEEL +0 -0
  41. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.0.dist-info}/entry_points.txt +0 -0
  42. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from enum import Enum
 import itertools
+import math
+from math import prod
 from pathlib import Path
 import re
+import struct
 from typing import Mapping, Sequence
 
 from jinja2 import Environment, FileSystemLoader, Template, select_autoescape
+import numpy as np
 
 from ..errors import CodegenError
 from ..ops import (
@@ -24,6 +29,38 @@ from shared.scalar_types import ScalarFunctionError, ScalarType
 
 
 def _format_c_indentation(source: str, *, indent: str = " ") -> str:
+    def strip_string_literals(line: str) -> str:
+        sanitized: list[str] = []
+        in_string = False
+        in_char = False
+        escape = False
+        for char in line:
+            if escape:
+                escape = False
+                if not (in_string or in_char):
+                    sanitized.append(char)
+                continue
+            if in_string:
+                if char == "\\":
+                    escape = True
+                elif char == '"':
+                    in_string = False
+                continue
+            if in_char:
+                if char == "\\":
+                    escape = True
+                elif char == "'":
+                    in_char = False
+                continue
+            if char == '"':
+                in_string = True
+                continue
+            if char == "'":
+                in_char = True
+                continue
+            sanitized.append(char)
+        return "".join(sanitized)
+
     formatted_lines: list[str] = []
     indent_level = 0
     for line in source.splitlines():
@@ -34,8 +71,9 @@ def _format_c_indentation(source: str, *, indent: str = " ") -> str:
         if stripped.startswith("}"):
             indent_level = max(indent_level - 1, 0)
         formatted_lines.append(f"{indent * indent_level}{stripped}")
-        open_count = stripped.count("{")
-        close_count = stripped.count("}")
+        sanitized = strip_string_literals(stripped)
+        open_count = sanitized.count("{")
+        close_count = sanitized.count("}")
         if stripped.startswith("}"):
             close_count = max(close_count - 1, 0)
         indent_level += open_count - close_count
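
Note: this hunk fixes a real formatting bug: braces inside C string or character literals used to be counted as block delimiters, skewing the emitted indentation. A minimal standalone sketch of the failure and the fix (simplified to double-quoted strings only; the shipped helper also handles character literals):

# A brace inside a C string literal used to shift the indent level.
line = 'printf("unbalanced { brace");'
naive_open = line.count("{")  # 1: counts the brace inside the string

def strip_literals(text: str) -> str:
    out: list[str] = []
    in_str = escape = False
    for ch in text:
        if escape:
            escape = False
            continue
        if in_str:
            if ch == "\\":
                escape = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
            continue
        out.append(ch)
    return "".join(out)

assert naive_open == 1 and strip_literals(line).count("{") == 0
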
@@ -119,6 +157,8 @@ class BinaryOp:
     output: str
     function: ScalarFunction
     operator_kind: OperatorKind
+    input0_shape: tuple[int, ...]
+    input1_shape: tuple[int, ...]
     shape: tuple[int, ...]
     dtype: ScalarType
     input_dtype: ScalarType
@@ -211,6 +251,26 @@ class MatMulOp:
     dtype: ScalarType
 
 
+class EinsumKind(str, Enum):
+    REDUCE_ALL = "reduce_all"
+    SUM_J = "sum_j"
+    TRANSPOSE = "transpose"
+    DOT = "dot"
+    BATCH_MATMUL = "batch_matmul"
+    BATCH_DIAGONAL = "batch_diagonal"
+
+
+@dataclass(frozen=True)
+class EinsumOp:
+    inputs: tuple[str, ...]
+    output: str
+    kind: EinsumKind
+    input_shapes: tuple[tuple[int, ...], ...]
+    output_shape: tuple[int, ...]
+    dtype: ScalarType
+    input_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class GemmOp:
     input_a: str
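
Note: the EinsumKind members suggest the fixed set of equation shapes the new lowering recognizes (the matching logic lives in the new emx_onnx_cgen/lowering/einsum.py, not shown in this hunk). Plausible NumPy equivalents; the pairing of names to equations below is an assumption, not taken from the diff:

import numpy as np

x = np.arange(6.0).reshape(2, 3)
np.einsum("ij->", x)             # REDUCE_ALL: collapse to a scalar
np.einsum("ij->i", x)            # SUM_J: sum out the trailing axis
np.einsum("ij->ji", x)           # TRANSPOSE
np.einsum("i,i->", x[0], x[0])   # DOT
a, c = np.ones((2, 3, 5)), np.ones((2, 5, 4))
np.einsum("bij,bjk->bik", a, c)  # BATCH_MATMUL
d = np.ones((2, 3, 3))
np.einsum("bii->bi", d)          # BATCH_DIAGONAL
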
@@ -305,6 +365,27 @@ class ConvOp:
         return self.out_spatial[1]
 
 
+@dataclass(frozen=True)
+class ConvTransposeOp:
+    input0: str
+    weights: str
+    bias: str | None
+    output: str
+    batch: int
+    in_channels: int
+    out_channels: int
+    spatial_rank: int
+    in_spatial: tuple[int, ...]
+    out_spatial: tuple[int, ...]
+    kernel_shape: tuple[int, ...]
+    strides: tuple[int, ...]
+    pads: tuple[int, ...]
+    dilations: tuple[int, ...]
+    output_padding: tuple[int, ...]
+    group: int
+    dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class AveragePoolOp:
     input0: str
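
Note: out_spatial is not free; it must satisfy the ONNX ConvTranspose output-size rule. A sketch of that rule with names mirroring the ConvTransposeOp fields (this is the ONNX operator spec, not code from the package):

def conv_transpose_out_size(
    in_size: int, kernel: int, stride: int,
    pad_begin: int, pad_end: int, dilation: int, out_pad: int,
) -> int:
    return (
        stride * (in_size - 1)
        + out_pad
        + ((kernel - 1) * dilation + 1)
        - pad_begin
        - pad_end
    )

# e.g. a 4-wide axis, 3-wide kernel, stride 2, pad 1 -> 7-wide output
assert conv_transpose_out_size(4, 3, 2, 1, 1, 1, 0) == 7
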
@@ -327,6 +408,41 @@ class AveragePoolOp:
     dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class LpPoolOp:
+    input0: str
+    output: str
+    batch: int
+    channels: int
+    in_h: int
+    in_w: int
+    out_h: int
+    out_w: int
+    kernel_h: int
+    kernel_w: int
+    stride_h: int
+    stride_w: int
+    pad_top: int
+    pad_left: int
+    pad_bottom: int
+    pad_right: int
+    p: int
+    dtype: ScalarType
+
+
+@dataclass(frozen=True)
+class QuantizeLinearOp:
+    input0: str
+    scale: str
+    zero_point: str | None
+    output: str
+    input_shape: tuple[int, ...]
+    axis: int | None
+    dtype: ScalarType
+    input_dtype: ScalarType
+    scale_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class SoftmaxOp:
     input0: str
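
Note: reference semantics for the two new op records, per the ONNX specs, which the new C templates must reproduce (NumPy sketch):

import numpy as np

# LpPool: p-norm over each pooling window.
window = np.array([1.0, -2.0, 2.0])
p = 2
lp = np.sum(np.abs(window) ** p) ** (1.0 / p)  # 3.0

# QuantizeLinear: y = saturate(round(x / scale) + zero_point), with
# round-half-to-even; np.rint uses that rounding mode.
x = np.array([-1.0, 0.4, 2.0], dtype=np.float32)
scale, zero_point = np.float32(0.5), 10
q = np.clip(np.rint(x / scale) + zero_point, 0, 255).astype(np.uint8)
# q == [ 8, 11, 14]
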
@@ -351,6 +467,18 @@ class LogSoftmaxOp:
     dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class HardmaxOp:
+    input0: str
+    output: str
+    outer: int
+    axis_size: int
+    inner: int
+    axis: int
+    shape: tuple[int, ...]
+    dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class NegativeLogLikelihoodLossOp:
     input0: str
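
Note: Hardmax writes 1 at the first maximum along the chosen axis and 0 elsewhere; outer, axis_size and inner factor the tensor so the generated C can walk outer*inner independent rows of length axis_size. NumPy sketch of the semantics:

import numpy as np

x = np.array([[1.0, 3.0, 3.0], [2.0, 0.0, 1.0]])
out = np.zeros_like(x)
out[np.arange(x.shape[0]), x.argmax(axis=1)] = 1.0  # argmax takes the first max
# out == [[0., 1., 0.],
#         [1., 0., 0.]]
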
@@ -595,6 +723,34 @@ class GatherOp:
     indices_dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class GatherNDOp:
+    data: str
+    indices: str
+    output: str
+    batch_dims: int
+    data_shape: tuple[int, ...]
+    indices_shape: tuple[int, ...]
+    output_shape: tuple[int, ...]
+    dtype: ScalarType
+    indices_dtype: ScalarType
+
+
+@dataclass(frozen=True)
+class ScatterNDOp:
+    data: str
+    indices: str
+    updates: str
+    output: str
+    data_shape: tuple[int, ...]
+    indices_shape: tuple[int, ...]
+    updates_shape: tuple[int, ...]
+    output_shape: tuple[int, ...]
+    reduction: str
+    dtype: ScalarType
+    indices_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class TransposeOp:
     input0: str
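
Note: the semantics these two records encode, as a NumPy sketch for the batch_dims=0 case (reduction values other than "none" combine instead of overwrite, not exercised here):

import numpy as np

# GatherND: each row of `indices` is a coordinate tuple into `data`.
data = np.arange(12).reshape(3, 4)
indices = np.array([[0, 1], [2, 3]])
gathered = data[tuple(indices.T)]       # [1, 11]

# ScatterND: a copy of `data` with `updates` written at each coordinate.
out = data.copy()
out[tuple(indices.T)] = np.array([-1, -2])
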
@@ -635,6 +791,21 @@ class EyeLikeOp:
     input_dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class TriluOp:
+    input0: str
+    output: str
+    input_shape: tuple[int, ...]
+    output_shape: tuple[int, ...]
+    upper: bool
+    k_value: int
+    k_input: str | None
+    k_input_shape: tuple[int, ...] | None
+    k_input_dtype: ScalarType | None
+    dtype: ScalarType
+    input_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class TileOp:
     input0: str
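
Note: Trilu keeps the upper or lower triangle of the trailing 2-D slices, offset by a diagonal k. k_value carries a compile-time constant; k_input/k_input_shape/k_input_dtype are set instead when k arrives as a runtime tensor. NumPy equivalents:

import numpy as np

x = np.arange(9).reshape(3, 3)
np.triu(x, k=1)   # upper=True: keep elements strictly above the diagonal
np.tril(x, k=-1)  # upper=False: keep elements strictly below the diagonal
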
@@ -800,6 +971,22 @@ class ArgReduceOp:
     output_dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class TopKOp:
+    input0: str
+    output_values: str
+    output_indices: str
+    input_shape: tuple[int, ...]
+    output_shape: tuple[int, ...]
+    axis: int
+    k: int
+    largest: bool
+    sorted: bool
+    input_dtype: ScalarType
+    output_values_dtype: ScalarType
+    output_indices_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class ConstantOfShapeOp:
     input0: str
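
Note: TopK reference behaviour the generated C must match: the k largest (or smallest, when largest is false) values along axis, plus their indices; sorted controls whether the k results come back ordered. NumPy sketch:

import numpy as np

x = np.array([3.0, 1.0, 4.0, 1.0, 5.0])
k = 2
idx = np.argsort(-x, kind="stable")[:k]  # largest=True -> [4, 2]
values = x[idx]                          # [5., 4.]
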
@@ -833,6 +1020,16 @@ class SizeOp:
     input_dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class NonZeroOp:
+    input0: str
+    output: str
+    input_shape: tuple[int, ...]
+    output_shape: tuple[int, ...]
+    dtype: ScalarType
+    input_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class ExpandOp:
     input0: str
@@ -871,6 +1068,22 @@ class RangeOp:
     input_dtype: ScalarType
 
 
+@dataclass(frozen=True)
+class OneHotOp:
+    indices: str
+    depth: str
+    values: str
+    output: str
+    axis: int
+    indices_shape: tuple[int, ...]
+    values_shape: tuple[int, ...]
+    output_shape: tuple[int, ...]
+    depth_dim: int
+    dtype: ScalarType
+    indices_dtype: ScalarType
+    depth_dtype: ScalarType
+
+
 @dataclass(frozen=True)
 class SplitOp:
     input0: str
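
Note: OneHot per the ONNX spec: values holds [off_value, on_value], depth_dim is the static size of the axis inserted at `axis`, and negative indices wrap modulo depth. NumPy sketch:

import numpy as np

indices = np.array([0, 2, -1])
depth, off, on = 3, 0.0, 1.0
out = np.full((indices.size, depth), off)
out[np.arange(indices.size), indices % depth] = on
# out == [[1., 0., 0.],
#         [0., 0., 1.],
#         [0., 0., 1.]]
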
@@ -937,11 +1150,15 @@ class LoweredModel:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -953,16 +1170,20 @@ class LoweredModel:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -972,12 +1193,15 @@ class LoweredModel:
         | GridSampleOp
         | ReduceOp
        | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
         ...,
     ]
@@ -986,7 +1210,15 @@ class LoweredModel:
 
 
 class CEmitter:
-    def __init__(self, template_dir: Path, *, restrict_arrays: bool = True) -> None:
+    def __init__(
+        self,
+        template_dir: Path,
+        *,
+        restrict_arrays: bool = True,
+        truncate_weights_after: int | None = None,
+        large_temp_threshold_bytes: int = 1024,
+        large_weight_threshold: int = 1024,
+    ) -> None:
         self._env = Environment(
             loader=FileSystemLoader(str(template_dir)),
             autoescape=select_autoescape(enabled_extensions=()),
@@ -994,6 +1226,15 @@ class CEmitter:
             lstrip_blocks=True,
         )
         self._restrict_arrays = restrict_arrays
+        if truncate_weights_after is not None and truncate_weights_after < 1:
+            raise CodegenError("truncate_weights_after must be >= 1")
+        self._truncate_weights_after = truncate_weights_after
+        if large_temp_threshold_bytes < 0:
+            raise CodegenError("large_temp_threshold_bytes must be >= 0")
+        self._large_temp_threshold_bytes = large_temp_threshold_bytes
+        if large_weight_threshold < 0:
+            raise CodegenError("large_weight_threshold must be >= 0")
+        self._large_weight_threshold = large_weight_threshold
 
     @staticmethod
     def _sanitize_identifier(name: str) -> str:
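
Note: a hypothetical construction showing the new knobs and the bounds enforced above (parameter semantics inferred from this validation and the weight-partitioning paths later in the file):

emitter = CEmitter(
    template_dir=Path("templates"),
    restrict_arrays=True,
    truncate_weights_after=8,         # None or >= 1
    large_temp_threshold_bytes=1024,  # >= 0
    large_weight_threshold=1024,      # >= 0
)
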
@@ -1006,10 +1247,8 @@ class CEmitter:
 
     def _op_function_name(self, model: LoweredModel, index: int) -> str:
         node_info = model.node_infos[index]
-        parts = [f"node{index}", node_info.op_type]
-        if node_info.name:
-            parts.append(node_info.name)
-        base_name = "_".join(parts)
+        suffix = node_info.name or node_info.op_type
+        base_name = f"node{index}_{suffix}".lower()
         return self._sanitize_identifier(base_name)
 
     @staticmethod
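
Note: effect of the renaming, for a hypothetical node named "Relu_3" with op_type "Relu" at index 0:

suffix = "Relu_3" or "Relu"            # node name wins over op_type
assert f"node{0}_{suffix}".lower() == "node0_relu_3"
# 0.2.0 would have produced node0_Relu_Relu_3 instead.
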
@@ -1094,7 +1333,9 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
@@ -1110,16 +1351,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -1129,12 +1374,15 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
     ) -> tuple[str, ...]:
         if isinstance(op, BinaryOp):
@@ -1155,8 +1403,16 @@
             return tuple(names)
         if isinstance(op, CastOp):
             return (op.input0, op.output)
+        if isinstance(op, QuantizeLinearOp):
+            names = [op.input0, op.scale]
+            if op.zero_point is not None:
+                names.append(op.zero_point)
+            names.append(op.output)
+            return tuple(names)
         if isinstance(op, MatMulOp):
             return (op.input0, op.input1, op.output)
+        if isinstance(op, EinsumOp):
+            return (*op.inputs, op.output)
         if isinstance(op, GemmOp):
             names = [op.input_a, op.input_b]
             if op.input_c is not None:
@@ -1187,8 +1443,16 @@
                 names.append(op.bias)
             names.append(op.output)
             return tuple(names)
+        if isinstance(op, ConvTransposeOp):
+            names = [op.input0, op.weights]
+            if op.bias is not None:
+                names.append(op.bias)
+            names.append(op.output)
+            return tuple(names)
         if isinstance(op, AveragePoolOp):
             return (op.input0, op.output)
+        if isinstance(op, LpPoolOp):
+            return (op.input0, op.output)
         if isinstance(op, BatchNormOp):
             return (op.input0, op.scale, op.bias, op.mean, op.variance, op.output)
         if isinstance(op, LpNormalizationOp):
@@ -1230,7 +1494,7 @@
             if op.output_y_c is not None:
                 names.append(op.output_y_c)
             return tuple(names)
-        if isinstance(op, (SoftmaxOp, LogSoftmaxOp)):
+        if isinstance(op, (SoftmaxOp, LogSoftmaxOp, HardmaxOp)):
             return (op.input0, op.output)
         if isinstance(op, NegativeLogLikelihoodLossOp):
             names = [op.input0, op.target]
@@ -1255,6 +1519,10 @@
             return (op.data, op.indices, op.output)
         if isinstance(op, GatherOp):
             return (op.data, op.indices, op.output)
+        if isinstance(op, GatherNDOp):
+            return (op.data, op.indices, op.output)
+        if isinstance(op, ScatterNDOp):
+            return (op.data, op.indices, op.updates, op.output)
         if isinstance(op, ConcatOp):
             return (*op.inputs, op.output)
         if isinstance(op, ConstantOfShapeOp):
@@ -1263,6 +1531,8 @@
             return (op.input0, op.output)
         if isinstance(op, SizeOp):
             return (op.input0, op.output)
+        if isinstance(op, NonZeroOp):
+            return (op.input0, op.output)
         if isinstance(op, ExpandOp):
             return (op.input0, op.output)
         if isinstance(op, CumSumOp):
@@ -1273,6 +1543,8 @@
             return tuple(names)
         if isinstance(op, RangeOp):
             return (op.start, op.limit, op.delta, op.output)
+        if isinstance(op, OneHotOp):
+            return (op.indices, op.depth, op.values, op.output)
         if isinstance(op, SplitOp):
             return (op.input0, *op.outputs)
         if isinstance(op, ReshapeOp):
@@ -1281,6 +1553,12 @@
             return (op.input0, op.output)
         if isinstance(op, EyeLikeOp):
             return (op.input0, op.output)
+        if isinstance(op, TriluOp):
+            names = [op.input0]
+            if op.k_input is not None:
+                names.append(op.k_input)
+            names.append(op.output)
+            return tuple(names)
         if isinstance(op, TileOp):
             return (op.input0, op.output)
         if isinstance(op, PadOp):
@@ -1320,6 +1598,8 @@
             return tuple(names)
         if isinstance(op, GridSampleOp):
             return (op.input0, op.grid, op.output)
+        if isinstance(op, TopKOp):
+            return (op.input0, op.output_values, op.output_indices)
         if isinstance(op, ReduceOp):
             names = [op.input0]
             if op.axes_input is not None:
@@ -1331,12 +1611,14 @@
     def _build_name_map(self, model: LoweredModel) -> dict[str, str]:
         used: set[str] = set()
         name_map: dict[str, str] = {}
+        constant_names = {const.name for const in model.constants}
         names = [model.name]
         names.extend(model.input_names)
         names.extend(model.output_names)
-        names.extend(const.name for const in model.constants)
         for op in model.ops:
-            names.extend(self._op_names(op))
+            names.extend(
+                name for name in self._op_names(op) if name not in constant_names
+            )
         for name in names:
             if name in name_map:
                 continue
@@ -1344,6 +1626,14 @@
             unique = self._ensure_unique_identifier(sanitized, used)
             name_map[name] = unique
             used.add(unique)
+        for index, const in enumerate(model.constants, start=1):
+            if const.name in name_map:
+                continue
+            base_name = self._sanitize_identifier(const.name.lower())
+            weight_name = f"weight{index}_{base_name}"
+            unique = self._ensure_unique_identifier(weight_name, used)
+            name_map[const.name] = unique
+            used.add(unique)
         return name_map
 
     @staticmethod
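
Note: constants now receive deterministic C identifiers of the form weight<index>_<sanitized name>, with index starting at 1. For a hypothetical initializer named "conv1.weight":

index, base_name = 1, "conv1_weight"   # after _sanitize_identifier
assert f"weight{index}_{base_name}" == "weight1_conv1_weight"
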
@@ -1362,11 +1652,15 @@
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -1378,16 +1672,20 @@
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -1397,12 +1695,15 @@
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
         name_map: dict[str, str],
     ) -> (
@@ -1412,11 +1713,15 @@
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -1428,16 +1733,20 @@
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -1447,12 +1756,15 @@
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
     ):
         if isinstance(op, BinaryOp):
@@ -1462,6 +1774,8 @@
                 output=name_map.get(op.output, op.output),
                 function=op.function,
                 operator_kind=op.operator_kind,
+                input0_shape=op.input0_shape,
+                input1_shape=op.input1_shape,
                 shape=op.shape,
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
@@ -1518,6 +1832,18 @@
                 input_dtype=op.input_dtype,
                 dtype=op.dtype,
             )
+        if isinstance(op, QuantizeLinearOp):
+            return QuantizeLinearOp(
+                input0=name_map.get(op.input0, op.input0),
+                scale=name_map.get(op.scale, op.scale),
+                zero_point=self._map_optional_name(name_map, op.zero_point),
+                output=name_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                axis=op.axis,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+                scale_dtype=op.scale_dtype,
+            )
         if isinstance(op, MatMulOp):
             return MatMulOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -1536,6 +1862,16 @@
                 right_vector=op.right_vector,
                 dtype=op.dtype,
             )
+        if isinstance(op, EinsumOp):
+            return EinsumOp(
+                inputs=tuple(name_map.get(name, name) for name in op.inputs),
+                output=name_map.get(op.output, op.output),
+                kind=op.kind,
+                input_shapes=op.input_shapes,
+                output_shape=op.output_shape,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+            )
         if isinstance(op, GemmOp):
             return GemmOp(
                 input_a=name_map.get(op.input_a, op.input_a),
@@ -1629,6 +1965,26 @@
                 group=op.group,
                 dtype=op.dtype,
             )
+        if isinstance(op, ConvTransposeOp):
+            return ConvTransposeOp(
+                input0=name_map.get(op.input0, op.input0),
+                weights=name_map.get(op.weights, op.weights),
+                bias=self._map_optional_name(name_map, op.bias),
+                output=name_map.get(op.output, op.output),
+                batch=op.batch,
+                in_channels=op.in_channels,
+                out_channels=op.out_channels,
+                spatial_rank=op.spatial_rank,
+                in_spatial=op.in_spatial,
+                out_spatial=op.out_spatial,
+                kernel_shape=op.kernel_shape,
+                strides=op.strides,
+                pads=op.pads,
+                dilations=op.dilations,
+                output_padding=op.output_padding,
+                group=op.group,
+                dtype=op.dtype,
+            )
         if isinstance(op, AveragePoolOp):
             return AveragePoolOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -1650,6 +2006,27 @@
                 count_include_pad=op.count_include_pad,
                 dtype=op.dtype,
             )
+        if isinstance(op, LpPoolOp):
+            return LpPoolOp(
+                input0=name_map.get(op.input0, op.input0),
+                output=name_map.get(op.output, op.output),
+                batch=op.batch,
+                channels=op.channels,
+                in_h=op.in_h,
+                in_w=op.in_w,
+                out_h=op.out_h,
+                out_w=op.out_w,
+                kernel_h=op.kernel_h,
+                kernel_w=op.kernel_w,
+                stride_h=op.stride_h,
+                stride_w=op.stride_w,
+                pad_top=op.pad_top,
+                pad_left=op.pad_left,
+                pad_bottom=op.pad_bottom,
+                pad_right=op.pad_right,
+                p=op.p,
+                dtype=op.dtype,
+            )
         if isinstance(op, BatchNormOp):
             return BatchNormOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -1813,6 +2190,17 @@
                 shape=op.shape,
                 dtype=op.dtype,
             )
+        if isinstance(op, HardmaxOp):
+            return HardmaxOp(
+                input0=name_map.get(op.input0, op.input0),
+                output=name_map.get(op.output, op.output),
+                outer=op.outer,
+                axis_size=op.axis_size,
+                inner=op.inner,
+                axis=op.axis,
+                shape=op.shape,
+                dtype=op.dtype,
+            )
         if isinstance(op, NegativeLogLikelihoodLossOp):
             return NegativeLogLikelihoodLossOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -1909,6 +2297,32 @@
                 dtype=op.dtype,
                 indices_dtype=op.indices_dtype,
             )
+        if isinstance(op, GatherNDOp):
+            return GatherNDOp(
+                data=name_map.get(op.data, op.data),
+                indices=name_map.get(op.indices, op.indices),
+                output=name_map.get(op.output, op.output),
+                batch_dims=op.batch_dims,
+                data_shape=op.data_shape,
+                indices_shape=op.indices_shape,
+                output_shape=op.output_shape,
+                dtype=op.dtype,
+                indices_dtype=op.indices_dtype,
+            )
+        if isinstance(op, ScatterNDOp):
+            return ScatterNDOp(
+                data=name_map.get(op.data, op.data),
+                indices=name_map.get(op.indices, op.indices),
+                updates=name_map.get(op.updates, op.updates),
+                output=name_map.get(op.output, op.output),
+                data_shape=op.data_shape,
+                indices_shape=op.indices_shape,
+                updates_shape=op.updates_shape,
+                output_shape=op.output_shape,
+                reduction=op.reduction,
+                dtype=op.dtype,
+                indices_dtype=op.indices_dtype,
+            )
         if isinstance(op, TransposeOp):
             return TransposeOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -1945,6 +2359,20 @@
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, TriluOp):
+            return TriluOp(
+                input0=name_map.get(op.input0, op.input0),
+                output=name_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                output_shape=op.output_shape,
+                upper=op.upper,
+                k_value=op.k_value,
+                k_input=self._map_optional_name(name_map, op.k_input),
+                k_input_shape=op.k_input_shape,
+                k_input_dtype=op.k_input_dtype,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+            )
         if isinstance(op, TileOp):
             return TileOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2101,6 +2529,21 @@
                 input_dtype=op.input_dtype,
                 output_dtype=op.output_dtype,
             )
+        if isinstance(op, TopKOp):
+            return TopKOp(
+                input0=name_map.get(op.input0, op.input0),
+                output_values=name_map.get(op.output_values, op.output_values),
+                output_indices=name_map.get(op.output_indices, op.output_indices),
+                input_shape=op.input_shape,
+                output_shape=op.output_shape,
+                axis=op.axis,
+                k=op.k,
+                largest=op.largest,
+                sorted=op.sorted,
+                input_dtype=op.input_dtype,
+                output_values_dtype=op.output_values_dtype,
+                output_indices_dtype=op.output_indices_dtype,
+            )
         if isinstance(op, ConstantOfShapeOp):
             return ConstantOfShapeOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2131,6 +2574,15 @@
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, NonZeroOp):
+            return NonZeroOp(
+                input0=name_map.get(op.input0, op.input0),
+                output=name_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                output_shape=op.output_shape,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+            )
         if isinstance(op, ExpandOp):
             return ExpandOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2166,6 +2618,21 @@
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, OneHotOp):
+            return OneHotOp(
+                indices=name_map.get(op.indices, op.indices),
+                depth=name_map.get(op.depth, op.depth),
+                values=name_map.get(op.values, op.values),
+                output=name_map.get(op.output, op.output),
+                axis=op.axis,
+                indices_shape=op.indices_shape,
+                values_shape=op.values_shape,
+                output_shape=op.output_shape,
+                depth_dim=op.depth_dim,
+                dtype=op.dtype,
+                indices_dtype=op.indices_dtype,
+                depth_dtype=op.depth_dtype,
+            )
         if isinstance(op, SplitOp):
             return SplitOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2246,11 +2713,19 @@
             "unary": self._env.get_template("unary_op.c.j2"),
             "clip": self._env.get_template("clip_op.c.j2"),
             "cast": self._env.get_template("cast_op.c.j2"),
+            "quantize_linear": self._env.get_template(
+                "quantize_linear_op.c.j2"
+            ),
             "matmul": self._env.get_template("matmul_op.c.j2"),
+            "einsum": self._env.get_template("einsum_op.c.j2"),
             "gemm": self._env.get_template("gemm_op.c.j2"),
             "attention": self._env.get_template("attention_op.c.j2"),
             "conv": self._env.get_template("conv_op.c.j2"),
+            "conv_transpose": self._env.get_template(
+                "conv_transpose_op.c.j2"
+            ),
             "avg_pool": self._env.get_template("average_pool_op.c.j2"),
+            "lp_pool": self._env.get_template("lp_pool_op.c.j2"),
             "batch_norm": self._env.get_template("batch_norm_op.c.j2"),
             "lp_norm": self._env.get_template("lp_normalization_op.c.j2"),
             "instance_norm": self._env.get_template(
@@ -2270,6 +2745,7 @@
             "lstm": self._env.get_template("lstm_op.c.j2"),
             "softmax": self._env.get_template("softmax_op.c.j2"),
             "logsoftmax": self._env.get_template("logsoftmax_op.c.j2"),
+            "hardmax": self._env.get_template("hardmax_op.c.j2"),
             "nllloss": self._env.get_template(
                 "negative_log_likelihood_loss_op.c.j2"
             ),
@@ -2280,10 +2756,13 @@
             "concat": self._env.get_template("concat_op.c.j2"),
             "gather_elements": self._env.get_template("gather_elements_op.c.j2"),
             "gather": self._env.get_template("gather_op.c.j2"),
+            "gather_nd": self._env.get_template("gather_nd_op.c.j2"),
+            "scatter_nd": self._env.get_template("scatter_nd_op.c.j2"),
             "transpose": self._env.get_template("transpose_op.c.j2"),
             "reshape": self._env.get_template("reshape_op.c.j2"),
             "identity": self._env.get_template("identity_op.c.j2"),
             "eye_like": self._env.get_template("eye_like_op.c.j2"),
+            "trilu": self._env.get_template("trilu_op.c.j2"),
             "tile": self._env.get_template("tile_op.c.j2"),
             "pad": self._env.get_template("pad_op.c.j2"),
             "depth_to_space": self._env.get_template("depth_to_space_op.c.j2"),
@@ -2299,14 +2778,17 @@
                 "reduce_op_dynamic.c.j2"
             ),
             "arg_reduce": self._env.get_template("arg_reduce_op.c.j2"),
+            "topk": self._env.get_template("topk_op.c.j2"),
             "constant_of_shape": self._env.get_template(
                 "constant_of_shape_op.c.j2"
             ),
             "shape": self._env.get_template("shape_op.c.j2"),
             "size": self._env.get_template("size_op.c.j2"),
+            "nonzero": self._env.get_template("nonzero_op.c.j2"),
             "expand": self._env.get_template("expand_op.c.j2"),
             "cumsum": self._env.get_template("cumsum_op.c.j2"),
             "range": self._env.get_template("range_op.c.j2"),
+            "one_hot": self._env.get_template("one_hot_op.c.j2"),
             "split": self._env.get_template("split_op.c.j2"),
         }
         if emit_testbench:
@@ -2328,6 +2810,9 @@
         testbench_inputs = self._sanitize_testbench_inputs(
             testbench_inputs, name_map
         )
+        inline_constants, large_constants = self._partition_constants(
+            model.constants
+        )
         (
             dim_order,
             input_dim_names,
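
Note: _partition_constants itself is not shown in this diff; a plausible sketch of what it does, inferred from the large_weight_threshold validation and the inline/large emission paths below (an assumption, not the package's code):

from math import prod

def partition_constants(constants, threshold=1024):
    # prod(()) == 1, so scalars count as one element
    inline = tuple(c for c in constants if prod(c.shape) <= threshold)
    large = tuple(c for c in constants if prod(c.shape) > threshold)
    return inline, large
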
@@ -2353,11 +2838,15 @@
         unary_template = templates["unary"]
         clip_template = templates["clip"]
         cast_template = templates["cast"]
+        quantize_linear_template = templates["quantize_linear"]
         matmul_template = templates["matmul"]
+        einsum_template = templates["einsum"]
         gemm_template = templates["gemm"]
         attention_template = templates["attention"]
         conv_template = templates["conv"]
+        conv_transpose_template = templates["conv_transpose"]
         avg_pool_template = templates["avg_pool"]
+        lp_pool_template = templates["lp_pool"]
         batch_norm_template = templates["batch_norm"]
         lp_norm_template = templates["lp_norm"]
         instance_norm_template = templates["instance_norm"]
@@ -2369,16 +2858,20 @@
         lstm_template = templates["lstm"]
         softmax_template = templates["softmax"]
         logsoftmax_template = templates["logsoftmax"]
+        hardmax_template = templates["hardmax"]
         nllloss_template = templates["nllloss"]
         softmax_cross_entropy_loss_template = templates["softmax_cross_entropy_loss"]
         maxpool_template = templates["maxpool"]
         concat_template = templates["concat"]
         gather_elements_template = templates["gather_elements"]
         gather_template = templates["gather"]
+        gather_nd_template = templates["gather_nd"]
+        scatter_nd_template = templates["scatter_nd"]
         transpose_template = templates["transpose"]
         reshape_template = templates["reshape"]
         identity_template = templates["identity"]
         eye_like_template = templates["eye_like"]
+        trilu_template = templates["trilu"]
         tile_template = templates["tile"]
         pad_template = templates["pad"]
         depth_to_space_template = templates["depth_to_space"]
@@ -2390,12 +2883,15 @@
         reduce_template = templates["reduce"]
         reduce_dynamic_template = templates["reduce_dynamic"]
         arg_reduce_template = templates["arg_reduce"]
+        topk_template = templates["topk"]
         constant_of_shape_template = templates["constant_of_shape"]
         shape_template = templates["shape"]
         size_template = templates["size"]
+        nonzero_template = templates["nonzero"]
         expand_template = templates["expand"]
         cumsum_template = templates["cumsum"]
         range_template = templates["range"]
+        one_hot_template = templates["one_hot"]
         split_template = templates["split"]
         testbench_template = templates.get("testbench")
         reserved_names = {
@@ -2427,11 +2923,15 @@
             unary_template=unary_template,
             clip_template=clip_template,
             cast_template=cast_template,
+            quantize_linear_template=quantize_linear_template,
             matmul_template=matmul_template,
+            einsum_template=einsum_template,
             gemm_template=gemm_template,
             attention_template=attention_template,
             conv_template=conv_template,
+            conv_transpose_template=conv_transpose_template,
             avg_pool_template=avg_pool_template,
+            lp_pool_template=lp_pool_template,
             batch_norm_template=batch_norm_template,
             lp_norm_template=lp_norm_template,
             instance_norm_template=instance_norm_template,
@@ -2443,16 +2943,20 @@
             lstm_template=lstm_template,
             softmax_template=softmax_template,
             logsoftmax_template=logsoftmax_template,
+            hardmax_template=hardmax_template,
             nllloss_template=nllloss_template,
             softmax_cross_entropy_loss_template=softmax_cross_entropy_loss_template,
             maxpool_template=maxpool_template,
             concat_template=concat_template,
             gather_elements_template=gather_elements_template,
             gather_template=gather_template,
+            gather_nd_template=gather_nd_template,
+            scatter_nd_template=scatter_nd_template,
             transpose_template=transpose_template,
             reshape_template=reshape_template,
             identity_template=identity_template,
             eye_like_template=eye_like_template,
+            trilu_template=trilu_template,
             tile_template=tile_template,
             pad_template=pad_template,
             depth_to_space_template=depth_to_space_template,
@@ -2464,12 +2968,15 @@
             reduce_template=reduce_template,
             reduce_dynamic_template=reduce_dynamic_template,
             arg_reduce_template=arg_reduce_template,
+            topk_template=topk_template,
             constant_of_shape_template=constant_of_shape_template,
             shape_template=shape_template,
             size_template=size_template,
+            nonzero_template=nonzero_template,
             expand_template=expand_template,
             cumsum_template=cumsum_template,
             range_template=range_template,
+            one_hot_template=one_hot_template,
             split_template=split_template,
             scalar_registry=scalar_registry,
             dim_args=dim_args,
@@ -2495,25 +3002,45 @@
         scalar_preamble = [
             line for line in scalar_include_lines if not line.startswith("#include ")
         ]
+        testbench_math_include = set()
+        if emit_testbench and self._testbench_requires_math(
+            model, testbench_inputs
+        ):
+            testbench_math_include.add("#include <math.h>")
         includes = self._collect_includes(
             model,
             resolved_ops,
             emit_testbench=emit_testbench,
-            extra_includes=scalar_includes,
+            extra_includes=scalar_includes | testbench_math_include,
+            needs_weight_loader=bool(large_constants),
         )
-        sections = [self._emit_header_comment(model.header), "", *includes]
+        sections = [
+            self._emit_header_comment(model.header),
+            "",
+            *includes,
+            "",
+            self._emit_index_type_define(),
+        ]
         if scalar_preamble:
             sections.extend(("", *scalar_preamble))
         sections.append("")
-        constants_section = self._emit_constant_definitions(model.constants)
+        constants_section = self._emit_constant_definitions(inline_constants)
         if constants_section:
             sections.extend((constants_section.rstrip(), ""))
+        large_constants_section = self._emit_constant_storage_definitions(
+            large_constants
+        )
+        if large_constants_section:
+            sections.extend((large_constants_section.rstrip(), ""))
         if scalar_functions:
             sections.extend(("\n".join(scalar_functions), ""))
+        weight_loader = self._emit_weight_loader(model, large_constants)
         sections.extend(
             (
                 operator_fns.rstrip(),
                 "",
+                weight_loader.rstrip(),
+                "",
                 wrapper_fn,
             )
         )
@@ -2527,6 +3054,7 @@
                     testbench_inputs=testbench_inputs,
                     dim_order=dim_order,
                     dim_values=dim_values,
+                    weight_data_filename=self._weight_data_filename(model),
                 ),
             )
         )
@@ -2549,6 +3077,9 @@
         testbench_inputs = self._sanitize_testbench_inputs(
             testbench_inputs, name_map
         )
+        inline_constants, large_constants = self._partition_constants(
+            model.constants
+        )
         (
             dim_order,
             input_dim_names,
@@ -2574,11 +3105,15 @@
         unary_template = templates["unary"]
         clip_template = templates["clip"]
         cast_template = templates["cast"]
+        quantize_linear_template = templates["quantize_linear"]
         matmul_template = templates["matmul"]
+        einsum_template = templates["einsum"]
         gemm_template = templates["gemm"]
         attention_template = templates["attention"]
         conv_template = templates["conv"]
+        conv_transpose_template = templates["conv_transpose"]
         avg_pool_template = templates["avg_pool"]
+        lp_pool_template = templates["lp_pool"]
         batch_norm_template = templates["batch_norm"]
         lp_norm_template = templates["lp_norm"]
         instance_norm_template = templates["instance_norm"]
@@ -2590,16 +3125,20 @@
         lstm_template = templates["lstm"]
         softmax_template = templates["softmax"]
         logsoftmax_template = templates["logsoftmax"]
+        hardmax_template = templates["hardmax"]
         nllloss_template = templates["nllloss"]
         softmax_cross_entropy_loss_template = templates["softmax_cross_entropy_loss"]
         maxpool_template = templates["maxpool"]
         concat_template = templates["concat"]
         gather_elements_template = templates["gather_elements"]
         gather_template = templates["gather"]
+        gather_nd_template = templates["gather_nd"]
+        scatter_nd_template = templates["scatter_nd"]
         transpose_template = templates["transpose"]
         reshape_template = templates["reshape"]
         identity_template = templates["identity"]
         eye_like_template = templates["eye_like"]
+        trilu_template = templates["trilu"]
         tile_template = templates["tile"]
         pad_template = templates["pad"]
         depth_to_space_template = templates["depth_to_space"]
@@ -2611,12 +3150,15 @@
         reduce_template = templates["reduce"]
         reduce_dynamic_template = templates["reduce_dynamic"]
         arg_reduce_template = templates["arg_reduce"]
+        topk_template = templates["topk"]
         constant_of_shape_template = templates["constant_of_shape"]
         shape_template = templates["shape"]
         size_template = templates["size"]
+        nonzero_template = templates["nonzero"]
         expand_template = templates["expand"]
         cumsum_template = templates["cumsum"]
         range_template = templates["range"]
+        one_hot_template = templates["one_hot"]
         split_template = templates["split"]
         testbench_template = templates.get("testbench")
         reserved_names = {
@@ -2648,11 +3190,15 @@
             unary_template=unary_template,
             clip_template=clip_template,
             cast_template=cast_template,
+            quantize_linear_template=quantize_linear_template,
             matmul_template=matmul_template,
+            einsum_template=einsum_template,
             gemm_template=gemm_template,
             attention_template=attention_template,
             conv_template=conv_template,
+            conv_transpose_template=conv_transpose_template,
             avg_pool_template=avg_pool_template,
+            lp_pool_template=lp_pool_template,
             batch_norm_template=batch_norm_template,
             lp_norm_template=lp_norm_template,
             instance_norm_template=instance_norm_template,
@@ -2664,16 +3210,20 @@
             lstm_template=lstm_template,
             softmax_template=softmax_template,
             logsoftmax_template=logsoftmax_template,
+            hardmax_template=hardmax_template,
             nllloss_template=nllloss_template,
             softmax_cross_entropy_loss_template=softmax_cross_entropy_loss_template,
             maxpool_template=maxpool_template,
             concat_template=concat_template,
             gather_elements_template=gather_elements_template,
             gather_template=gather_template,
+            gather_nd_template=gather_nd_template,
+            scatter_nd_template=scatter_nd_template,
             transpose_template=transpose_template,
             reshape_template=reshape_template,
             identity_template=identity_template,
             eye_like_template=eye_like_template,
+            trilu_template=trilu_template,
             tile_template=tile_template,
             pad_template=pad_template,
             depth_to_space_template=depth_to_space_template,
@@ -2685,12 +3235,15 @@
             reduce_template=reduce_template,
             reduce_dynamic_template=reduce_dynamic_template,
             arg_reduce_template=arg_reduce_template,
+            topk_template=topk_template,
             constant_of_shape_template=constant_of_shape_template,
             shape_template=shape_template,
             size_template=size_template,
+            nonzero_template=nonzero_template,
             expand_template=expand_template,
             cumsum_template=cumsum_template,
             range_template=range_template,
+            one_hot_template=one_hot_template,
             split_template=split_template,
             scalar_registry=scalar_registry,
             dim_args=dim_args,
@@ -2716,25 +3269,45 @@
         scalar_preamble = [
             line for line in scalar_include_lines if not line.startswith("#include ")
         ]
+        testbench_math_include = set()
+        if emit_testbench and self._testbench_requires_math(
+            model, testbench_inputs
+        ):
+            testbench_math_include.add("#include <math.h>")
         includes = self._collect_includes(
             model,
             resolved_ops,
             emit_testbench=emit_testbench,
-            extra_includes=scalar_includes,
+            extra_includes=scalar_includes | testbench_math_include,
+            needs_weight_loader=bool(large_constants),
         )
-        sections = [self._emit_header_comment(model.header), "", *includes]
+        sections = [
+            self._emit_header_comment(model.header),
+            "",
+            *includes,
+            "",
+            self._emit_index_type_define(),
+        ]
         if scalar_preamble:
             sections.extend(("", *scalar_preamble))
         sections.append("")
-        constants_section = self._emit_constant_declarations(model.constants)
+        constants_section = self._emit_constant_declarations(inline_constants)
         if constants_section:
             sections.extend((constants_section.rstrip(), ""))
+        large_constants_section = self._emit_constant_storage_definitions(
+            large_constants
+        )
+        if large_constants_section:
+            sections.extend((large_constants_section.rstrip(), ""))
         if scalar_functions:
             sections.extend(("\n".join(scalar_functions), ""))
+        weight_loader = self._emit_weight_loader(model, large_constants)
         sections.extend(
             (
                 operator_fns.rstrip(),
                 "",
+                weight_loader.rstrip(),
+                "",
                 wrapper_fn,
             )
         )
@@ -2748,6 +3321,7 @@
                     testbench_inputs=testbench_inputs,
                     dim_order=dim_order,
                     dim_values=dim_values,
+                    weight_data_filename=self._weight_data_filename(model),
                 ),
             )
         )
@@ -2755,14 +3329,14 @@
         main_rendered = "\n".join(sections)
         if not main_rendered.endswith("\n"):
             main_rendered += "\n"
-        data_includes = self._collect_constant_includes(model.constants)
+        data_includes = self._collect_constant_includes(inline_constants)
         data_sections = [self._emit_header_comment(model.header), ""]
         if data_includes:
             data_sections.extend((*data_includes, ""))
         else:
             data_sections.append("")
         data_constants = self._emit_constant_definitions(
-            model.constants, storage_prefix="const"
+            inline_constants, storage_prefix="const"
         )
         if data_constants:
             data_sections.append(data_constants.rstrip())
@@ -2856,6 +3430,23 @@
         comment_lines.append(" */")
         return "\n".join(comment_lines)
 
+    @staticmethod
+    def _emit_constant_comment(constant: ConstTensor, index: int) -> str:
+        shape = constant.shape
+        lines = [
+            f"Weight {index}:",
+            f"Name: {constant.name}",
+            f"Shape: {shape if shape else '[]'}",
+            f"Elements: {CEmitter._element_count(shape)}",
+            f"Dtype: {constant.dtype.onnx_name}",
+        ]
+        comment_lines = ["/*"]
+        comment_lines.extend(
+            f" * {line}" if line else " *" for line in lines
+        )
+        comment_lines.append(" */")
+        return "\n".join(comment_lines)
+
     @staticmethod
     def _collect_constant_includes(constants: tuple[ConstTensor, ...]) -> list[str]:
         if not constants:
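
Note: each emitted weight now carries a banner comment built by _emit_constant_comment. A sketch of the rendered text for a hypothetical 3x3 tensor (the Name and Dtype values below are illustrative, not from the diff):

expected = "\n".join((
    "/*",
    " * Weight 1:",
    " * Name: conv1.weight",
    " * Shape: (3, 3)",
    " * Elements: 9",
    " * Dtype: float",
    " */",
))
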
@@ -2920,6 +3511,7 @@
             ScalarFunction.FMOD,
             ScalarFunction.REMAINDER,
             ScalarFunction.LEAKY_RELU,
+            ScalarFunction.MISH,
             ScalarFunction.MUL,
             ScalarFunction.NEG,
             ScalarFunction.LOGICAL_NOT,
@@ -3005,11 +3597,15 @@
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3021,16 +3617,20 @@
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | DepthToSpaceOp
         | SpaceToDepthOp
@@ -3039,21 +3639,27 @@
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
         ],
         *,
         emit_testbench: bool,
         extra_includes: set[str] | None = None,
+        needs_weight_loader: bool = False,
     ) -> list[str]:
-        includes: set[str] = {"#include <stddef.h>"}
+        includes: set[str] = {"#include <stdint.h>"}
         if emit_testbench:
-            includes.update({"#include <stdio.h>", "#include <stdint.h>"})
+            includes.add("#include <stdio.h>")
+        if needs_weight_loader:
+            includes.add("#include <stdio.h>")
         if extra_includes:
             includes.update(extra_includes)
         if any(
@@ -3074,7 +3680,9 @@
             *constant_of_shape_inputs,
         }
         model_dtypes.update(
-            op.dtype for op in resolved_ops if not isinstance(op, ArgReduceOp)
+            op.dtype
+            for op in resolved_ops
+            if not isinstance(op, (ArgReduceOp, TopKOp))
         )
         arg_reduce_dtypes = {
             dtype
@@ -3083,6 +3691,17 @@
             for dtype in (op.input_dtype, op.output_dtype)
         }
         model_dtypes.update(arg_reduce_dtypes)
+        topk_dtypes = {
+            dtype
+            for op in resolved_ops
+            if isinstance(op, TopKOp)
+            for dtype in (
+                op.input_dtype,
+                op.output_values_dtype,
+                op.output_indices_dtype,
+            )
+        }
+        model_dtypes.update(topk_dtypes)
         slice_input_dtypes = {
             dtype
             for op in resolved_ops
@@ -3095,12 +3714,18 @@
             )
             if dtype is not None
         }
+        trilu_k_dtypes = {
+            op.k_input_dtype
+            for op in resolved_ops
+            if isinstance(op, TriluOp) and op.k_input_dtype is not None
+        }
         maxpool_indices_dtypes = {
             op.indices_dtype
             for op in resolved_ops
             if isinstance(op, MaxPoolOp) and op.indices_dtype is not None
         }
         model_dtypes.update(maxpool_indices_dtypes)
+        model_dtypes.update(trilu_k_dtypes)
         nll_target_dtypes = {
             op.target_dtype
             for op in resolved_ops
@@ -3124,12 +3749,20 @@
             for op in resolved_ops
         ):
             includes.add("#include <stdbool.h>")
+        if any(
+            isinstance(op, SoftmaxCrossEntropyLossOp)
+            and op.ignore_index is not None
+            for op in resolved_ops
+        ):
+            includes.add("#include <stdbool.h>")
         if any(
             isinstance(op, UnaryOp)
             and unary_op_symbol(op.function, dtype=op.dtype) in {"llabs", "abs"}
             for op in resolved_ops
         ):
             includes.add("#include <stdlib.h>")
+        if any(isinstance(op, PadOp) for op in resolved_ops):
+            includes.add("#include <stddef.h>")
         if CEmitter._needs_math(resolved_ops):
             includes.add("#include <math.h>")
         if CEmitter._needs_limits(resolved_ops):
@@ -3140,9 +3773,9 @@
         ):
             includes.add("#include <string.h>")
         ordered_includes = (
-            "#include <stddef.h>",
-            "#include <stdio.h>",
             "#include <stdint.h>",
+            "#include <stdio.h>",
+            "#include <stddef.h>",
             "#include <stdbool.h>",
             "#include <stdlib.h>",
             "#include <math.h>",
@@ -3152,6 +3785,16 @@
         )
         return [include for include in ordered_includes if include in includes]
 
+    @staticmethod
+    def _emit_index_type_define() -> str:
+        return "\n".join(
+            (
+                "#ifndef idx_t",
+                "#define idx_t int32_t",
+                "#endif",
+            )
+        )
+
     @staticmethod
     def _needs_stdint(
         model_dtypes: set[ScalarType],
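
Note: the generated C now opens with an overridable index-type define, rendered verbatim by the helper above:

assert CEmitter._emit_index_type_define() == "\n".join((
    "#ifndef idx_t",
    "#define idx_t int32_t",
    "#endif",
))
# The #ifndef guard lets a build widen loop indices without regenerating
# the source, e.g.: cc -Didx_t=int64_t model.c
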
@@ -3186,11 +3829,15 @@
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3202,16 +3849,20 @@
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | DepthToSpaceOp
         | SpaceToDepthOp
@@ -3220,12 +3871,15 @@
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
         ],
     ) -> bool:
@@ -3322,6 +3976,11 @@
             for op in resolved_ops
         ):
             return True
+        if any(
+            isinstance(op, (LpPoolOp, QuantizeLinearOp))
+            for op in resolved_ops
+        ):
+            return True
         return False
 
     @staticmethod
@@ -3331,11 +3990,15 @@
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3347,16 +4010,19 @@
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | DepthToSpaceOp
         | SpaceToDepthOp
@@ -3365,12 +4031,15 @@
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
         ],
     ) -> bool:
@@ -3400,6 +4069,11 @@
             for op in resolved_ops
         ):
             return True
+        if any(
+            isinstance(op, QuantizeLinearOp) and op.dtype.is_integer
+            for op in resolved_ops
+        ):
+            return True
         return False
 
     def _emit_model_wrapper(
@@ -3411,11 +4085,15 @@
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3427,16 +4105,19 @@
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
4119
  | EyeLikeOp
4120
+ | TriluOp
3440
4121
  | TileOp
3441
4122
  | DepthToSpaceOp
3442
4123
  | SpaceToDepthOp
@@ -3445,12 +4126,15 @@ class CEmitter:
3445
4126
  | GridSampleOp
3446
4127
  | ReduceOp
3447
4128
  | ArgReduceOp
4129
+ | TopKOp
3448
4130
  | ConstantOfShapeOp
3449
4131
  | ShapeOp
3450
4132
  | SizeOp
4133
+ | NonZeroOp
3451
4134
  | ExpandOp
3452
4135
  | CumSumOp
3453
4136
  | RangeOp
4137
+ | OneHotOp
3454
4138
  | SplitOp
3455
4139
  ],
3456
4140
  temp_buffers: tuple[TempBuffer, ...],
@@ -3480,8 +4164,14 @@ class CEmitter:
3480
4164
  lines = [f"void {model.name}({signature}) {{"]
3481
4165
  for temp in temp_buffers:
3482
4166
  c_type = temp.dtype.c_type
4167
+ storage = (
4168
+ "static "
4169
+ if self._temp_buffer_size_bytes(temp)
4170
+ > self._large_temp_threshold_bytes
4171
+ else ""
4172
+ )
3483
4173
  lines.append(
3484
- f" {c_type} {temp.name}{self._array_suffix(temp.shape)};"
4174
+ f" {storage}{c_type} {temp.name}{self._array_suffix(temp.shape)};"
3485
4175
  )
3486
4176
  for index, op in enumerate(resolved_ops):
3487
4177
  op_name = self._op_function_name(model, index)
@@ -3490,6 +4180,13 @@ class CEmitter:
3490
4180
  lines.append("}")
3491
4181
  return "\n".join(lines)
3492
4182
 
4183
+ @staticmethod
4184
+ def _temp_buffer_size_bytes(temp: TempBuffer) -> int:
4185
+ element_count = 1
4186
+ for dim in temp.shape:
4187
+ element_count *= dim
4188
+ return element_count * temp.dtype.np_dtype.itemsize
4189
+
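Temporaries larger than `_large_temp_threshold_bytes` are now declared `static`, so big intermediates live in the data segment rather than on the stack; the trade-off is that the model function stops being re-entrant for those buffers. A sketch of the decision (the threshold value below is hypothetical, the attribute itself is not shown in this hunk):

    from dataclasses import dataclass
    from math import prod

    LARGE_TEMP_THRESHOLD_BYTES = 16 * 1024  # illustrative value only

    @dataclass
    class Temp:
        name: str
        shape: tuple[int, ...]
        itemsize: int  # bytes per element, e.g. 4 for float

    def declare(temp: Temp, c_type: str) -> str:
        # Element count times element size, as in _temp_buffer_size_bytes.
        size = prod(temp.shape) * temp.itemsize
        storage = "static " if size > LARGE_TEMP_THRESHOLD_BYTES else ""
        suffix = "".join(f"[{dim}]" for dim in temp.shape)
        return f" {storage}{c_type} {temp.name}{suffix};"

    # A 128x128 float buffer (64 KiB) moves off the stack:
    print(declare(Temp("tmp0_x", (128, 128), 4), "float"))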
     @staticmethod
     def _build_op_call(
         op: BinaryOp
@@ -3497,11 +4194,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3513,16 +4214,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -3532,12 +4237,15 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
         dim_order: Sequence[str],
     ) -> str:
@@ -3556,6 +4264,9 @@ class CEmitter:
         if isinstance(op, MatMulOp):
             args.extend([op.input0, op.input1, op.output])
             return ", ".join(args)
+        if isinstance(op, EinsumOp):
+            args.extend([*op.inputs, op.output])
+            return ", ".join(args)
         if isinstance(op, GemmOp):
             if op.input_c is None:
                 args.extend([op.input_a, op.input_b, op.output])
@@ -3574,6 +4285,13 @@ class CEmitter:
             call_parts.append(op.output)
             args.extend(call_parts)
             return ", ".join(args)
+        if isinstance(op, QuantizeLinearOp):
+            call_parts = [op.input0, op.scale]
+            if op.zero_point is not None:
+                call_parts.append(op.zero_point)
+            call_parts.append(op.output)
+            args.extend(call_parts)
+            return ", ".join(args)
         if isinstance(op, AttentionOp):
             call_parts = [op.input_q, op.input_k, op.input_v]
             if op.input_attn_mask is not None:
@@ -3599,9 +4317,18 @@ class CEmitter:
                 return ", ".join(args)
             args.extend([op.input0, op.weights, op.bias, op.output])
             return ", ".join(args)
+        if isinstance(op, ConvTransposeOp):
+            if op.bias is None:
+                args.extend([op.input0, op.weights, op.output])
+                return ", ".join(args)
+            args.extend([op.input0, op.weights, op.bias, op.output])
+            return ", ".join(args)
         if isinstance(op, AveragePoolOp):
             args.extend([op.input0, op.output])
             return ", ".join(args)
+        if isinstance(op, LpPoolOp):
+            args.extend([op.input0, op.output])
+            return ", ".join(args)
         if isinstance(op, BatchNormOp):
             args.extend(
                 [op.input0, op.scale, op.bias, op.mean, op.variance, op.output]
@@ -3653,7 +4380,7 @@ class CEmitter:
             call_parts.append(op.output_y_c)
             args.extend(call_parts)
             return ", ".join(args)
-        if isinstance(op, (SoftmaxOp, LogSoftmaxOp)):
+        if isinstance(op, (SoftmaxOp, LogSoftmaxOp, HardmaxOp)):
             args.extend([op.input0, op.output])
             return ", ".join(args)
         if isinstance(op, NegativeLogLikelihoodLossOp):
@@ -3684,6 +4411,12 @@ class CEmitter:
         if isinstance(op, GatherOp):
             args.extend([op.data, op.indices, op.output])
             return ", ".join(args)
+        if isinstance(op, GatherNDOp):
+            args.extend([op.data, op.indices, op.output])
+            return ", ".join(args)
+        if isinstance(op, ScatterNDOp):
+            args.extend([op.data, op.indices, op.updates, op.output])
+            return ", ".join(args)
         if isinstance(op, ConcatOp):
             args.extend([*op.inputs, op.output])
             return ", ".join(args)
@@ -3696,9 +4429,18 @@ class CEmitter:
         if isinstance(op, SizeOp):
             args.extend([op.input0, op.output])
             return ", ".join(args)
+        if isinstance(op, NonZeroOp):
+            args.extend([op.input0, op.output])
+            return ", ".join(args)
         if isinstance(op, ExpandOp):
             args.extend([op.input0, op.output])
             return ", ".join(args)
+        if isinstance(op, TriluOp):
+            call_parts = [op.input0, op.output]
+            if op.k_input is not None:
+                call_parts.append(op.k_input)
+            args.extend(call_parts)
+            return ", ".join(args)
         if isinstance(op, CumSumOp):
             args.append(op.input0)
             if op.axis_input is not None:
@@ -3708,6 +4450,9 @@ class CEmitter:
             return ", ".join(args)
         if isinstance(op, RangeOp):
             args.extend([op.start, op.limit, op.delta, op.output])
+        if isinstance(op, OneHotOp):
+            args.extend([op.indices, op.depth, op.values, op.output])
+            return ", ".join(args)
         if isinstance(op, SplitOp):
             args.extend([op.input0, *op.outputs])
             return ", ".join(args)
@@ -3749,6 +4494,12 @@ class CEmitter:
             call_parts.append(op.output)
             args.extend(call_parts)
             return ", ".join(args)
+        if isinstance(op, TriluOp):
+            call_parts = [op.input0, op.output]
+            if op.k_input is not None:
+                call_parts.append(op.k_input)
+            args.extend(call_parts)
+            return ", ".join(args)
         if isinstance(op, GridSampleOp):
             args.extend([op.input0, op.grid, op.output])
             return ", ".join(args)
@@ -3761,6 +4512,9 @@ class CEmitter:
         if isinstance(op, ArgReduceOp):
             args.extend([op.input0, op.output])
             return ", ".join(args)
+        if isinstance(op, TopKOp):
+            args.extend([op.input0, op.output_values, op.output_indices])
+            return ", ".join(args)
         args.extend([op.input0, op.output])
         return ", ".join(args)

@@ -3792,11 +4546,11 @@ class CEmitter:
             return {}
         if len(intermediates) == 1:
             name, shape, dtype = intermediates[0]
-            temp_name = allocate_temp_name("tmp")
+            temp_name = allocate_temp_name(f"tmp0_{name}")
             return {name: TempBuffer(name=temp_name, shape=shape, dtype=dtype)}
         return {
             name: TempBuffer(
-                name=allocate_temp_name(f"tmp{index}"),
+                name=allocate_temp_name(f"tmp{index}_{name}"),
                 shape=shape,
                 dtype=dtype,
             )
@@ -3811,11 +4565,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3827,16 +4585,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | DepthToSpaceOp
         | SpaceToDepthOp
@@ -3845,12 +4607,15 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
         temp_map: dict[str, str],
     ) -> (
@@ -3860,11 +4625,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -3876,16 +4645,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | DepthToSpaceOp
         | SpaceToDepthOp
@@ -3894,12 +4667,15 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
     ):
         if isinstance(op, BinaryOp):
@@ -3909,6 +4685,8 @@ class CEmitter:
                 output=temp_map.get(op.output, op.output),
                 function=op.function,
                 operator_kind=op.operator_kind,
+                input0_shape=op.input0_shape,
+                input1_shape=op.input1_shape,
                 shape=op.shape,
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
@@ -3979,6 +4757,16 @@ class CEmitter:
                 right_vector=op.right_vector,
                 dtype=op.dtype,
             )
+        if isinstance(op, EinsumOp):
+            return EinsumOp(
+                inputs=tuple(temp_map.get(name, name) for name in op.inputs),
+                output=temp_map.get(op.output, op.output),
+                kind=op.kind,
+                input_shapes=op.input_shapes,
+                output_shape=op.output_shape,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+            )
         if isinstance(op, CastOp):
             return CastOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -3987,6 +4775,22 @@ class CEmitter:
                 input_dtype=op.input_dtype,
                 dtype=op.dtype,
             )
+        if isinstance(op, QuantizeLinearOp):
+            return QuantizeLinearOp(
+                input0=temp_map.get(op.input0, op.input0),
+                scale=temp_map.get(op.scale, op.scale),
+                zero_point=(
+                    temp_map.get(op.zero_point, op.zero_point)
+                    if op.zero_point is not None
+                    else None
+                ),
+                output=temp_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                axis=op.axis,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+                scale_dtype=op.scale_dtype,
+            )
         if isinstance(op, GemmOp):
             return GemmOp(
                 input_a=temp_map.get(op.input_a, op.input_a),
@@ -4160,6 +4964,26 @@ class CEmitter:
                 group=op.group,
                 dtype=op.dtype,
             )
+        if isinstance(op, ConvTransposeOp):
+            return ConvTransposeOp(
+                input0=temp_map.get(op.input0, op.input0),
+                weights=temp_map.get(op.weights, op.weights),
+                bias=temp_map.get(op.bias, op.bias) if op.bias else None,
+                output=temp_map.get(op.output, op.output),
+                batch=op.batch,
+                in_channels=op.in_channels,
+                out_channels=op.out_channels,
+                spatial_rank=op.spatial_rank,
+                in_spatial=op.in_spatial,
+                out_spatial=op.out_spatial,
+                kernel_shape=op.kernel_shape,
+                strides=op.strides,
+                pads=op.pads,
+                dilations=op.dilations,
+                output_padding=op.output_padding,
+                group=op.group,
+                dtype=op.dtype,
+            )
         if isinstance(op, AveragePoolOp):
             return AveragePoolOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -4181,6 +5005,27 @@ class CEmitter:
                 count_include_pad=op.count_include_pad,
                 dtype=op.dtype,
             )
+        if isinstance(op, LpPoolOp):
+            return LpPoolOp(
+                input0=temp_map.get(op.input0, op.input0),
+                output=temp_map.get(op.output, op.output),
+                batch=op.batch,
+                channels=op.channels,
+                in_h=op.in_h,
+                in_w=op.in_w,
+                out_h=op.out_h,
+                out_w=op.out_w,
+                kernel_h=op.kernel_h,
+                kernel_w=op.kernel_w,
+                stride_h=op.stride_h,
+                stride_w=op.stride_w,
+                pad_top=op.pad_top,
+                pad_left=op.pad_left,
+                pad_bottom=op.pad_bottom,
+                pad_right=op.pad_right,
+                p=op.p,
+                dtype=op.dtype,
+            )
         if isinstance(op, BatchNormOp):
             return BatchNormOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -4318,6 +5163,17 @@ class CEmitter:
                 shape=op.shape,
                 dtype=op.dtype,
             )
+        if isinstance(op, HardmaxOp):
+            return HardmaxOp(
+                input0=temp_map.get(op.input0, op.input0),
+                output=temp_map.get(op.output, op.output),
+                outer=op.outer,
+                axis_size=op.axis_size,
+                inner=op.inner,
+                axis=op.axis,
+                shape=op.shape,
+                dtype=op.dtype,
+            )
         if isinstance(op, NegativeLogLikelihoodLossOp):
             return NegativeLogLikelihoodLossOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -4419,6 +5275,32 @@ class CEmitter:
                 dtype=op.dtype,
                 indices_dtype=op.indices_dtype,
             )
+        if isinstance(op, GatherNDOp):
+            return GatherNDOp(
+                data=temp_map.get(op.data, op.data),
+                indices=temp_map.get(op.indices, op.indices),
+                output=temp_map.get(op.output, op.output),
+                batch_dims=op.batch_dims,
+                data_shape=op.data_shape,
+                indices_shape=op.indices_shape,
+                output_shape=op.output_shape,
+                dtype=op.dtype,
+                indices_dtype=op.indices_dtype,
+            )
+        if isinstance(op, ScatterNDOp):
+            return ScatterNDOp(
+                data=temp_map.get(op.data, op.data),
+                indices=temp_map.get(op.indices, op.indices),
+                updates=temp_map.get(op.updates, op.updates),
+                output=temp_map.get(op.output, op.output),
+                data_shape=op.data_shape,
+                indices_shape=op.indices_shape,
+                updates_shape=op.updates_shape,
+                output_shape=op.output_shape,
+                reduction=op.reduction,
+                dtype=op.dtype,
+                indices_dtype=op.indices_dtype,
+            )
         if isinstance(op, ConcatOp):
             return ConcatOp(
                 inputs=tuple(temp_map.get(name, name) for name in op.inputs),
@@ -4458,6 +5340,15 @@ class CEmitter:
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, NonZeroOp):
+            return NonZeroOp(
+                input0=temp_map.get(op.input0, op.input0),
+                output=temp_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                output_shape=op.output_shape,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+            )
         if isinstance(op, ExpandOp):
             return ExpandOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -4493,6 +5384,21 @@ class CEmitter:
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, OneHotOp):
+            return OneHotOp(
+                indices=temp_map.get(op.indices, op.indices),
+                depth=temp_map.get(op.depth, op.depth),
+                values=temp_map.get(op.values, op.values),
+                output=temp_map.get(op.output, op.output),
+                axis=op.axis,
+                indices_shape=op.indices_shape,
+                values_shape=op.values_shape,
+                output_shape=op.output_shape,
+                depth_dim=op.depth_dim,
+                dtype=op.dtype,
+                indices_dtype=op.indices_dtype,
+                depth_dtype=op.depth_dtype,
+            )
         if isinstance(op, SplitOp):
             return SplitOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -4542,6 +5448,24 @@ class CEmitter:
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, TriluOp):
+            return TriluOp(
+                input0=temp_map.get(op.input0, op.input0),
+                output=temp_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                output_shape=op.output_shape,
+                upper=op.upper,
+                k_value=op.k_value,
+                k_input=(
+                    temp_map.get(op.k_input, op.k_input)
+                    if op.k_input is not None
+                    else None
+                ),
+                k_input_shape=op.k_input_shape,
+                k_input_dtype=op.k_input_dtype,
+                dtype=op.dtype,
+                input_dtype=op.input_dtype,
+            )
         if isinstance(op, TileOp):
             return TileOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -4726,6 +5650,21 @@ class CEmitter:
                 input_dtype=op.input_dtype,
                 output_dtype=op.output_dtype,
             )
+        if isinstance(op, TopKOp):
+            return TopKOp(
+                input0=temp_map.get(op.input0, op.input0),
+                output_values=temp_map.get(op.output_values, op.output_values),
+                output_indices=temp_map.get(op.output_indices, op.output_indices),
+                input_shape=op.input_shape,
+                output_shape=op.output_shape,
+                axis=op.axis,
+                k=op.k,
+                largest=op.largest,
+                sorted=op.sorted,
+                input_dtype=op.input_dtype,
+                output_values_dtype=op.output_values_dtype,
+                output_indices_dtype=op.output_indices_dtype,
+            )
         return UnaryOp(
             input0=temp_map.get(op.input0, op.input0),
             output=temp_map.get(op.output, op.output),
@@ -4743,11 +5682,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -4759,16 +5702,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | DepthToSpaceOp
         | SpaceToDepthOp
@@ -4777,12 +5724,15 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
         index: int,
         *,
@@ -4798,11 +5748,15 @@ class CEmitter:
         unary_template,
         clip_template,
         cast_template,
+        quantize_linear_template,
         matmul_template,
+        einsum_template,
         gemm_template,
         attention_template,
         conv_template,
+        conv_transpose_template,
         avg_pool_template,
+        lp_pool_template,
         batch_norm_template,
         lp_norm_template,
         instance_norm_template,
@@ -4814,16 +5768,20 @@ class CEmitter:
         lstm_template,
         softmax_template,
         logsoftmax_template,
+        hardmax_template,
         nllloss_template,
         softmax_cross_entropy_loss_template,
         maxpool_template,
         concat_template,
         gather_elements_template,
         gather_template,
+        gather_nd_template,
+        scatter_nd_template,
         transpose_template,
         reshape_template,
         identity_template,
         eye_like_template,
+        trilu_template,
         tile_template,
         pad_template,
         depth_to_space_template,
@@ -4835,12 +5793,15 @@ class CEmitter:
         reduce_template,
         reduce_dynamic_template,
         arg_reduce_template,
+        topk_template,
         constant_of_shape_template,
         shape_template,
         size_template,
+        nonzero_template,
         expand_template,
         cumsum_template,
         range_template,
+        one_hot_template,
         split_template,
         scalar_registry: ScalarFunctionRegistry | None = None,
         dim_args: str = "",
@@ -4885,21 +5846,27 @@ class CEmitter:
             output_dim_names = _dim_names_for(op.output)
             shape = CEmitter._shape_dim_exprs(op.shape, output_dim_names)
             loop_vars = CEmitter._loop_vars(op.shape)
-            array_suffix = self._param_array_suffix(op.shape, output_dim_names)
+            output_suffix = self._param_array_suffix(op.shape, output_dim_names)
+            input0_suffix = self._param_array_suffix(
+                op.input0_shape, _dim_names_for(op.input0)
+            )
+            input1_suffix = self._param_array_suffix(
+                op.input1_shape, _dim_names_for(op.input1)
+            )
             input_c_type = op.input_dtype.c_type
             output_c_type = op.dtype.c_type
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"], input_c_type, array_suffix, True),
-                    (params["input1"], input_c_type, array_suffix, True),
-                    (params["output"], output_c_type, array_suffix, False),
+                    (params["input0"], input_c_type, input0_suffix, True),
+                    (params["input1"], input_c_type, input1_suffix, True),
+                    (params["output"], output_c_type, output_suffix, False),
                 ]
             )
             common = {
                 "model_name": model.name,
                 "op_name": op_name,
                 "element_count": CEmitter._element_count_expr(shape),
-                "array_suffix": array_suffix,
+                "array_suffix": output_suffix,
                 "shape": shape,
                 "loop_vars": loop_vars,
                 "input_c_type": input_c_type,
@@ -4908,11 +5875,17 @@ class CEmitter:
                 "dim_args": dim_args,
                 "params": param_decls,
             }
-            left_expr = f"{params['input0']}" + "".join(
-                f"[{var}]" for var in loop_vars
+            left_expr = CEmitter._broadcast_index_expr(
+                params["input0"],
+                op.input0_shape,
+                op.shape,
+                loop_vars,
             )
-            right_expr = f"{params['input1']}" + "".join(
-                f"[{var}]" for var in loop_vars
+            right_expr = CEmitter._broadcast_index_expr(
+                params["input1"],
+                op.input1_shape,
+                op.shape,
+                loop_vars,
             )
             operator_expr = None
             operator = op_spec.operator
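BinaryOp now records both input shapes and indexes each operand through `CEmitter._broadcast_index_expr`, so a size-1 dimension reads `[0]` while matching dimensions reuse the output loop variable, giving NumPy-style broadcasting instead of assuming both inputs share the output shape. A sketch of the indexing rule under the usual right-aligned convention (the real helper may differ in detail):

    def broadcast_index_expr(name, input_shape, output_shape, loop_vars):
        # Right-align the input's dims against the output's; a size-1 dim
        # is pinned to [0], and leading dims the input lacks are skipped.
        offset = len(output_shape) - len(input_shape)
        parts = [
            "[0]" if dim == 1 else f"[{loop_vars[axis + offset]}]"
            for axis, dim in enumerate(input_shape)
        ]
        return name + "".join(parts)

    # A (2, 1, 4) operand broadcast against a (2, 3, 4) output:
    assert broadcast_index_expr("a", (2, 1, 4), (2, 3, 4), ("i0", "i1", "i2")) == "a[i0][0][i2]"
    assert broadcast_index_expr("b", (4,), (2, 3, 4), ("i0", "i1", "i2")) == "b[i2]"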
@@ -5177,39 +6150,170 @@ class CEmitter:
                 k=op.k,
             ).rstrip()
             return with_node_comment(rendered)
-        if isinstance(op, GemmOp):
+        if isinstance(op, EinsumOp):
             params = self._shared_param_map(
                 [
-                    ("input_a", op.input_a),
-                    ("input_b", op.input_b),
-                    ("input_c", op.input_c),
+                    *(
+                        (f"input{idx}", name)
+                        for idx, name in enumerate(op.inputs)
+                    ),
                     ("output", op.output),
                 ]
             )
-            input_a_shape = (op.k, op.m) if op.trans_a else (op.m, op.k)
-            input_b_shape = (op.n, op.k) if op.trans_b else (op.k, op.n)
-            input_a_suffix = self._param_array_suffix(input_a_shape)
-            input_b_suffix = self._param_array_suffix(input_b_shape)
-            output_suffix = self._param_array_suffix((op.m, op.n))
-            c_suffix = (
-                self._param_array_suffix(op.c_shape)
-                if op.c_shape is not None
-                else ""
+            output_dim_names = _dim_names_for(op.output)
+            output_shape = CEmitter._shape_dim_exprs(
+                op.output_shape, output_dim_names
             )
+            output_loop_vars = CEmitter._loop_vars(op.output_shape)
+            if output_loop_vars:
+                output_expr = f"{params['output']}" + "".join(
+                    f"[{var}]" for var in output_loop_vars
+                )
+            else:
+                output_expr = f"{params['output']}[0]"
+            input_shapes = op.input_shapes
+            input_dim_names = [
+                _dim_names_for(name) for name in op.inputs
+            ]
+            input_suffixes = [
+                self._param_array_suffix(shape, dim_names)
+                for shape, dim_names in zip(input_shapes, input_dim_names)
+            ]
             param_decls = self._build_param_decls(
                 [
-                    (params["input_a"], c_type, input_a_suffix, True),
-                    (params["input_b"], c_type, input_b_suffix, True),
+                    *(
+                        (
+                            params[f"input{idx}"],
+                            op.input_dtype.c_type,
+                            input_suffixes[idx],
+                            True,
+                        )
+                        for idx in range(len(op.inputs))
+                    ),
                     (
-                        params["input_c"],
-                        c_type,
-                        c_suffix,
-                        True,
-                    )
-                    if params["input_c"]
-                    else (None, "", "", True),
-                    (params["output"], c_type, output_suffix, False),
-                ]
+                        params["output"],
+                        op.dtype.c_type,
+                        self._param_array_suffix(op.output_shape, output_dim_names),
+                        False,
+                    ),
+                ]
+            )
+            input_loop_vars: tuple[str, ...] = ()
+            input_loop_bounds: tuple[str | int, ...] = ()
+            reduce_loop_var = "k"
+            reduce_loop_bound: str | int | None = None
+            input_expr = None
+            input0_expr = None
+            input1_expr = None
+            if op.kind == EinsumKind.REDUCE_ALL:
+                input_loop_vars = CEmitter._loop_vars(input_shapes[0])
+                input_loop_bounds = tuple(
+                    CEmitter._shape_dim_exprs(
+                        input_shapes[0], input_dim_names[0]
+                    )
+                )
+                if input_loop_vars:
+                    input_expr = f"{params['input0']}" + "".join(
+                        f"[{var}]" for var in input_loop_vars
+                    )
+                else:
+                    input_expr = f"{params['input0']}[0]"
+            elif op.kind == EinsumKind.SUM_J:
+                input_shape_exprs = CEmitter._shape_dim_exprs(
+                    input_shapes[0], input_dim_names[0]
+                )
+                reduce_loop_bound = input_shape_exprs[1]
+                input_expr = (
+                    f"{params['input0']}"
+                    f"[{output_loop_vars[0]}][{reduce_loop_var}]"
+                )
+            elif op.kind == EinsumKind.TRANSPOSE:
+                input_expr = (
+                    f"{params['input0']}"
+                    f"[{output_loop_vars[1]}][{output_loop_vars[0]}]"
+                )
+            elif op.kind == EinsumKind.DOT:
+                input_shape_exprs = CEmitter._shape_dim_exprs(
+                    input_shapes[0], input_dim_names[0]
+                )
+                reduce_loop_bound = input_shape_exprs[0]
+                input0_expr = f"{params['input0']}[{reduce_loop_var}]"
+                input1_expr = f"{params['input1']}[{reduce_loop_var}]"
+            elif op.kind == EinsumKind.BATCH_MATMUL:
+                input_shape_exprs = CEmitter._shape_dim_exprs(
+                    input_shapes[0], input_dim_names[0]
+                )
+                reduce_loop_bound = input_shape_exprs[2]
+                input0_expr = (
+                    f"{params['input0']}"
+                    f"[{output_loop_vars[0]}]"
+                    f"[{output_loop_vars[1]}][{reduce_loop_var}]"
+                )
+                input1_expr = (
+                    f"{params['input1']}"
+                    f"[{output_loop_vars[0]}]"
+                    f"[{reduce_loop_var}][{output_loop_vars[2]}]"
+                )
+            elif op.kind == EinsumKind.BATCH_DIAGONAL:
+                diag_var = output_loop_vars[-1]
+                prefix_vars = output_loop_vars[:-1]
+                input_expr = f"{params['input0']}" + "".join(
+                    f"[{var}]" for var in prefix_vars
+                )
+                input_expr += f"[{diag_var}][{diag_var}]"
+            rendered = einsum_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                params=param_decls,
+                dim_args=dim_args,
+                kind=op.kind.value,
+                output_loop_vars=output_loop_vars,
+                output_loop_bounds=output_shape,
+                output_expr=output_expr,
+                acc_type=op.dtype.c_type,
+                zero_literal=zero_literal,
+                input_loop_vars=input_loop_vars,
+                input_loop_bounds=input_loop_bounds,
+                reduce_loop_var=reduce_loop_var,
+                reduce_loop_bound=reduce_loop_bound,
+                input_expr=input_expr,
+                input0_expr=input0_expr,
+                input1_expr=input1_expr,
+            ).rstrip()
+            return with_node_comment(rendered)
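The Einsum lowering is pattern-based: only the equation shapes enumerated by `EinsumKind` are handled, and each kind fixes one loop nest in the template. A NumPy reference for what those nests compute (the kind names below mirror the branches above; the enum's string values are an assumption):

    import numpy as np

    def einsum_kind_reference(kind: str, *inputs: np.ndarray) -> np.ndarray:
        if kind == "reduce_all":      # e.g. "ij->": sum everything
            return np.asarray(inputs[0].sum())
        if kind == "sum_j":           # "ij->i": reduce the second axis
            return inputs[0].sum(axis=1)
        if kind == "transpose":       # "ij->ji"
            return inputs[0].T.copy()
        if kind == "dot":             # "i,i->"
            return np.asarray((inputs[0] * inputs[1]).sum())
        if kind == "batch_matmul":    # "bij,bjk->bik"
            return inputs[0] @ inputs[1]
        if kind == "batch_diagonal":  # "...ii->...i"
            return np.diagonal(inputs[0], axis1=-2, axis2=-1).copy()
        raise ValueError(f"unsupported kind: {kind}")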
+        if isinstance(op, GemmOp):
+            params = self._shared_param_map(
+                [
+                    ("input_a", op.input_a),
+                    ("input_b", op.input_b),
+                    ("input_c", op.input_c),
+                    ("output", op.output),
+                ]
+            )
+            input_a_shape = (op.k, op.m) if op.trans_a else (op.m, op.k)
+            input_b_shape = (op.n, op.k) if op.trans_b else (op.k, op.n)
+            input_a_suffix = self._param_array_suffix(input_a_shape)
+            input_b_suffix = self._param_array_suffix(input_b_shape)
+            output_suffix = self._param_array_suffix((op.m, op.n))
+            c_suffix = (
+                self._param_array_suffix(op.c_shape)
+                if op.c_shape is not None
+                else ""
+            )
+            param_decls = self._build_param_decls(
+                [
+                    (params["input_a"], c_type, input_a_suffix, True),
+                    (params["input_b"], c_type, input_b_suffix, True),
+                    (
+                        params["input_c"],
+                        c_type,
+                        c_suffix,
+                        True,
+                    )
+                    if params["input_c"]
+                    else (None, "", "", True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
             )
             alpha_literal = CEmitter._format_literal(op.dtype, op.alpha)
             beta_literal = CEmitter._format_literal(op.dtype, op.beta)
@@ -5556,6 +6660,81 @@ class CEmitter:
                 in_indices=in_indices,
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, ConvTransposeOp):
+            params = self._shared_param_map(
+                [
+                    ("input0", op.input0),
+                    ("weights", op.weights),
+                    ("bias", op.bias),
+                    ("output", op.output),
+                ]
+            )
+            input_shape = (op.batch, op.in_channels, *op.in_spatial)
+            weight_shape = (
+                op.in_channels,
+                op.out_channels // op.group,
+                *op.kernel_shape,
+            )
+            output_shape = (op.batch, op.out_channels, *op.out_spatial)
+            in_indices = tuple(f"id{dim}" for dim in range(op.spatial_rank))
+            kernel_indices = tuple(
+                f"kd{dim}" for dim in range(op.spatial_rank)
+            )
+            out_indices = tuple(f"od{dim}" for dim in range(op.spatial_rank))
+            pad_begin = op.pads[: op.spatial_rank]
+            group_in_channels = op.in_channels // op.group
+            group_out_channels = op.out_channels // op.group
+            input_suffix = self._param_array_suffix(input_shape)
+            weight_suffix = self._param_array_suffix(weight_shape)
+            bias_suffix = self._param_array_suffix((op.out_channels,))
+            output_suffix = self._param_array_suffix(output_shape)
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], c_type, input_suffix, True),
+                    (params["weights"], c_type, weight_suffix, True),
+                    (
+                        params["bias"],
+                        c_type,
+                        bias_suffix,
+                        True,
+                    )
+                    if params["bias"]
+                    else (None, "", "", True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = conv_transpose_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                weights=params["weights"],
+                bias=params["bias"],
+                output=params["output"],
+                params=param_decls,
+                c_type=c_type,
+                zero_literal=zero_literal,
+                input_suffix=input_suffix,
+                weight_suffix=weight_suffix,
+                bias_suffix=bias_suffix,
+                output_suffix=output_suffix,
+                batch=op.batch,
+                in_channels=op.in_channels,
+                out_channels=op.out_channels,
+                spatial_rank=op.spatial_rank,
+                in_spatial=op.in_spatial,
+                out_spatial=op.out_spatial,
+                kernel_shape=op.kernel_shape,
+                strides=op.strides,
+                pads_begin=pad_begin,
+                dilations=op.dilations,
+                group=op.group,
+                group_in_channels=group_in_channels,
+                group_out_channels=group_out_channels,
+                in_indices=in_indices,
+                kernel_indices=kernel_indices,
+                out_indices=out_indices,
+            ).rstrip()
+            return with_node_comment(rendered)
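`out_spatial` arrives precomputed on the op, but the relation the lowering must satisfy per spatial axis is the standard ONNX ConvTranspose one, reproduced here for reference (standard formula, not taken from this diff):

    def conv_transpose_out_dim(in_dim, stride, kernel, dilation=1,
                               pad_begin=0, pad_end=0, output_padding=0):
        # Standard ONNX ConvTranspose output-shape relation per axis.
        return ((in_dim - 1) * stride - pad_begin - pad_end
                + dilation * (kernel - 1) + output_padding + 1)

    # A 4-wide input, stride 2, 3-tap kernel, no padding: 9 outputs.
    assert conv_transpose_out_dim(4, 2, 3) == 9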
         if isinstance(op, AveragePoolOp):
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
@@ -5597,6 +6776,49 @@ class CEmitter:
                 count_include_pad=int(op.count_include_pad),
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, LpPoolOp):
+            params = self._shared_param_map(
+                [("input0", op.input0), ("output", op.output)]
+            )
+            input_shape = (op.batch, op.channels, op.in_h, op.in_w)
+            output_shape = (op.batch, op.channels, op.out_h, op.out_w)
+            input_suffix = self._param_array_suffix(input_shape)
+            output_suffix = self._param_array_suffix(output_shape)
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], c_type, input_suffix, True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = lp_pool_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                output=params["output"],
+                params=param_decls,
+                c_type=c_type,
+                input_suffix=input_suffix,
+                output_suffix=output_suffix,
+                batch=op.batch,
+                channels=op.channels,
+                in_h=op.in_h,
+                in_w=op.in_w,
+                out_h=op.out_h,
+                out_w=op.out_w,
+                kernel_h=op.kernel_h,
+                kernel_w=op.kernel_w,
+                stride_h=op.stride_h,
+                stride_w=op.stride_w,
+                pad_top=op.pad_top,
+                pad_left=op.pad_left,
+                pad_bottom=op.pad_bottom,
+                pad_right=op.pad_right,
+                p=op.p,
+                zero_literal=zero_literal,
+                abs_fn=CEmitter._math_fn(op.dtype, "fabsf", "fabs"),
+                pow_fn=CEmitter._math_fn(op.dtype, "powf", "pow"),
+            ).rstrip()
+            return with_node_comment(rendered)
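LpPool reduces each window with the p-norm, which is why the template pulls `abs_fn` and `pow_fn` from <math.h>: y = (sum |x|^p)^(1/p) over the window. A small reference for one output element, with padded taps contributing nothing (a sketch, not the emitted loop):

    def lp_pool_element(x, oh, ow, kernel_h, kernel_w,
                        stride_h, stride_w, pad_top, pad_left, p):
        # x is a single (in_h, in_w) channel as a list of lists.
        in_h, in_w = len(x), len(x[0])
        acc = 0.0
        for kh in range(kernel_h):
            for kw in range(kernel_w):
                ih = oh * stride_h + kh - pad_top
                iw = ow * stride_w + kw - pad_left
                if 0 <= ih < in_h and 0 <= iw < in_w:
                    acc += abs(x[ih][iw]) ** p
        return acc ** (1.0 / p)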
         if isinstance(op, BatchNormOp):
             params = self._shared_param_map(
                 [
@@ -5769,6 +6991,19 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, LayerNormalizationOp):
+            acc_dtype = (
+                ScalarType.F32
+                if op.dtype in {ScalarType.F16, ScalarType.F32}
+                else op.dtype
+            )
+            acc_type = acc_dtype.c_type
+            acc_zero_literal = CEmitter._format_literal(acc_dtype, 0)
+            acc_one_literal = CEmitter._format_literal(acc_dtype, 1)
+            acc_epsilon_literal = CEmitter._format_floating(
+                op.epsilon, acc_dtype
+            )
+            acc_sqrt_fn = CEmitter._math_fn(acc_dtype, "sqrtf", "sqrt")
+            use_kahan = op.dtype in {ScalarType.F16, ScalarType.F32}
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -5878,8 +7113,12 @@ class CEmitter:
                 bias_index_vars=bias_index_vars,
                 mean_index_vars=mean_index_vars,
                 inner=op.inner,
-                epsilon_literal=CEmitter._format_floating(op.epsilon, op.dtype),
-                sqrt_fn=CEmitter._math_fn(op.dtype, "sqrtf", "sqrt"),
+                acc_type=acc_type,
+                acc_zero_literal=acc_zero_literal,
+                acc_one_literal=acc_one_literal,
+                acc_epsilon_literal=acc_epsilon_literal,
+                acc_sqrt_fn=acc_sqrt_fn,
+                use_kahan=use_kahan,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, MeanVarianceNormalizationOp):
@@ -6244,7 +7483,41 @@ class CEmitter:
                 log_fn=CEmitter._math_fn(op.dtype, "logf", "log"),
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, HardmaxOp):
+            params = self._shared_param_map(
+                [("input0", op.input0), ("output", op.output)]
+            )
+            array_suffix = self._param_array_suffix(op.shape)
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], c_type, array_suffix, True),
+                    (params["output"], c_type, array_suffix, False),
+                ]
+            )
+            rendered = hardmax_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                output=params["output"],
+                params=param_decls,
+                c_type=c_type,
+                array_suffix=array_suffix,
+                outer=op.outer,
+                axis_size=op.axis_size,
+                inner=op.inner,
+                zero_literal=zero_literal,
+                one_literal=CEmitter._format_literal(op.dtype, 1),
+            ).rstrip()
+            return with_node_comment(rendered)
         if isinstance(op, NegativeLogLikelihoodLossOp):
+            acc_dtype = (
+                ScalarType.F64
+                if op.dtype in {ScalarType.F16, ScalarType.F32}
+                else op.dtype
+            )
+            acc_type = acc_dtype.c_type
+            acc_zero_literal = CEmitter._format_literal(acc_dtype, 0)
+            acc_one_literal = CEmitter._format_literal(acc_dtype, 1)
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -6292,9 +7565,22 @@ class CEmitter:
                 ignore_index=op.ignore_index,
                 zero_literal=zero_literal,
                 one_literal=CEmitter._format_literal(op.dtype, 1),
+                acc_type=acc_type,
+                acc_zero_literal=acc_zero_literal,
+                acc_one_literal=acc_one_literal,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, SoftmaxCrossEntropyLossOp):
+            acc_dtype = (
+                ScalarType.F64
+                if op.dtype in {ScalarType.F16, ScalarType.F32}
+                else op.dtype
+            )
+            acc_type = acc_dtype.c_type
+            acc_zero_literal = CEmitter._format_literal(acc_dtype, 0)
+            acc_one_literal = CEmitter._format_literal(acc_dtype, 1)
+            acc_exp_fn = CEmitter._math_fn(acc_dtype, "expf", "exp")
+            acc_log_fn = CEmitter._math_fn(acc_dtype, "logf", "log")
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -6361,8 +7647,11 @@ class CEmitter:
                 ignore_index=ignore_index,
                 zero_literal=zero_literal,
                 one_literal=CEmitter._format_literal(op.dtype, 1),
-                exp_fn=CEmitter._math_fn(op.dtype, "expf", "exp"),
-                log_fn=CEmitter._math_fn(op.dtype, "logf", "log"),
+                acc_type=acc_type,
+                acc_zero_literal=acc_zero_literal,
+                acc_one_literal=acc_one_literal,
+                acc_exp_fn=acc_exp_fn,
+                acc_log_fn=acc_log_fn,
             ).rstrip()
             return with_node_comment(rendered)
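Both loss emitters now accumulate in a wider type (float64 when the tensor dtype is f16/f32) and only cast back at the end, so the reduction over many samples does not drift as terms are folded in. The failure mode being guarded against, in miniature:

    import numpy as np

    losses = np.full(100_000, 1.0e-3, dtype=np.float32)  # true sum: 100.0

    acc32 = np.float32(0.0)
    for v in losses:                 # narrow f32 accumulator
        acc32 += v
    acc64 = np.sum(losses, dtype=np.float64)  # widened accumulator

    # The f32 running sum drifts from the f64 one; the widened
    # accumulator stays at the mathematically expected value.
    print(float(acc32), float(acc64))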
         if isinstance(op, MaxPoolOp):
@@ -6569,6 +7858,180 @@ class CEmitter:
                 axis_dim=op.data_shape[op.axis],
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, GatherNDOp):
+            params = self._shared_param_map(
+                [
+                    ("data", op.data),
+                    ("indices", op.indices),
+                    ("output", op.output),
+                ]
+            )
+            indices_dim_names = _dim_names_for(op.indices)
+            data_dim_names = _dim_names_for(op.data)
+            data_shape = CEmitter._shape_dim_exprs(op.data_shape, data_dim_names)
+            indices_shape = CEmitter._shape_dim_exprs(
+                op.indices_shape, indices_dim_names
+            )
+            indices_prefix_shape = indices_shape[:-1]
+            indices_prefix_loop_vars = (
+                CEmitter._loop_vars(op.indices_shape[:-1])
+                if op.indices_shape[:-1]
+                else ()
+            )
+            index_depth = op.indices_shape[-1]
+            tail_shape = data_shape[op.batch_dims + index_depth :]
+            tail_loop_vars = (
+                tuple(f"t{index}" for index in range(len(tail_shape)))
+                if tail_shape
+                else ()
+            )
+            output_loop_vars = (*indices_prefix_loop_vars, *tail_loop_vars)
+            if output_loop_vars:
+                output_index_expr = params["output"] + "".join(
+                    f"[{var}]" for var in output_loop_vars
+                )
+            else:
+                output_index_expr = f"{params['output']}[0]"
+            data_index_vars = (
+                *indices_prefix_loop_vars[: op.batch_dims],
+                *tuple(f"index{idx}" for idx in range(index_depth)),
+                *tail_loop_vars,
+            )
+            data_index_expr = params["data"] + "".join(
+                f"[{var}]" for var in data_index_vars
+            )
+            data_suffix = self._param_array_suffix(op.data_shape)
+            indices_suffix = self._param_array_suffix(op.indices_shape)
+            output_suffix = self._param_array_suffix(op.output_shape)
+            param_decls = self._build_param_decls(
+                [
+                    (params["data"], c_type, data_suffix, True),
+                    (
+                        params["indices"],
+                        op.indices_dtype.c_type,
+                        indices_suffix,
+                        True,
+                    ),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = gather_nd_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                data=params["data"],
+                indices=params["indices"],
+                output=params["output"],
+                params=param_decls,
+                c_type=c_type,
+                data_suffix=data_suffix,
+                indices_suffix=indices_suffix,
+                output_suffix=output_suffix,
+                indices_prefix_shape=indices_prefix_shape,
+                indices_prefix_loop_vars=indices_prefix_loop_vars,
+                index_depth=index_depth,
+                tail_shape=tail_shape,
+                tail_loop_vars=tail_loop_vars,
+                output_index_expr=output_index_expr,
+                data_index_expr=data_index_expr,
+                batch_dims=op.batch_dims,
+                data_shape=data_shape,
+            ).rstrip()
+            return with_node_comment(rendered)
+        if isinstance(op, ScatterNDOp):
+            params = self._shared_param_map(
+                [
+                    ("data", op.data),
+                    ("indices", op.indices),
+                    ("updates", op.updates),
+                    ("output", op.output),
+                ]
+            )
+            output_dim_names = _dim_names_for(op.output)
+            indices_dim_names = _dim_names_for(op.indices)
+            updates_dim_names = _dim_names_for(op.updates)
+            data_dim_names = _dim_names_for(op.data)
+            output_shape = CEmitter._shape_dim_exprs(
+                op.output_shape, output_dim_names
+            )
+            data_shape = CEmitter._shape_dim_exprs(op.data_shape, data_dim_names)
+            indices_shape = CEmitter._shape_dim_exprs(
+                op.indices_shape, indices_dim_names
+            )
+            output_loop_vars = CEmitter._loop_vars(op.output_shape)
+            indices_prefix_shape = indices_shape[:-1]
+            indices_prefix_loop_vars = (
+                CEmitter._loop_vars(op.indices_shape[:-1])
+                if op.indices_shape[:-1]
+                else ()
+            )
+            index_depth = op.indices_shape[-1]
+            tail_shape = output_shape[index_depth:]
+            tail_loop_vars = (
+                tuple(
+                    f"t{index}"
+                    for index in range(len(op.output_shape[index_depth:]))
+                )
+                if op.output_shape[index_depth:]
+                else ()
+            )
+            index_vars = tuple(f"index{idx}" for idx in range(index_depth))
+            output_index_expr = f"{params['output']}" + "".join(
+                f"[{var}]" for var in (*index_vars, *tail_loop_vars)
+            )
+            updates_index_vars = (*indices_prefix_loop_vars, *tail_loop_vars)
+            if not op.updates_shape:
+                updates_index_expr = f"{params['updates']}[0]"
+            else:
+                updates_index_expr = f"{params['updates']}" + "".join(
+                    f"[{var}]" for var in updates_index_vars
+                )
+            data_suffix = self._param_array_suffix(
+                op.data_shape, data_dim_names
+            )
+            indices_suffix = self._param_array_suffix(
+                op.indices_shape, indices_dim_names
+            )
+            updates_suffix = self._param_array_suffix(
+                op.updates_shape, updates_dim_names
+            )
+            output_suffix = self._param_array_suffix(
+                op.output_shape, output_dim_names
+            )
+            param_decls = self._build_param_decls(
+                [
+                    (params["data"], c_type, data_suffix, True),
+                    (
+                        params["indices"],
+                        op.indices_dtype.c_type,
+                        indices_suffix,
+                        True,
+                    ),
+                    (params["updates"], c_type, updates_suffix, True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = scatter_nd_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                data=params["data"],
+                indices=params["indices"],
+                updates=params["updates"],
+                output=params["output"],
+                params=param_decls,
+                c_type=c_type,
+                output_shape=output_shape,
+                output_loop_vars=output_loop_vars,
+                indices_prefix_shape=indices_prefix_shape,
+                indices_prefix_loop_vars=indices_prefix_loop_vars,
+                index_depth=index_depth,
+                data_shape=data_shape,
+                tail_shape=tail_shape,
+                tail_loop_vars=tail_loop_vars,
+                output_index_expr=output_index_expr,
+                updates_index_expr=updates_index_expr,
+                reduction=op.reduction,
+            ).rstrip()
+            return with_node_comment(rendered)
         if isinstance(op, TransposeOp):
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
@@ -6608,6 +8071,7 @@ class CEmitter:
                 [("input0", op.input0), ("output", op.output)]
             )
             input_suffix = self._param_array_suffix(op.input_shape)
+            output_shape = CEmitter._codegen_shape(op.output_shape)
             output_suffix = self._param_array_suffix(op.output_shape)
             param_decls = self._build_param_decls(
                 [
@@ -6615,6 +8079,7 @@ class CEmitter:
                     (params["output"], c_type, output_suffix, False),
                 ]
             )
+            loop_vars = CEmitter._loop_vars(op.output_shape)
             rendered = reshape_template.render(
                 model_name=model.name,
                 op_name=op_name,
@@ -6625,6 +8090,8 @@ class CEmitter:
                 input_suffix=input_suffix,
                 output_suffix=output_suffix,
                 element_count=CEmitter._element_count(op.output_shape),
+                output_shape=output_shape,
+                loop_vars=loop_vars,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, IdentityOp):
@@ -6691,6 +8158,61 @@ class CEmitter:
                 one_literal=f"(({c_type})1)",
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, TriluOp):
+            param_specs = [("input0", op.input0), ("output", op.output)]
+            if op.k_input is not None:
+                param_specs.append(("k_input", op.k_input))
+            params = self._shared_param_map(param_specs)
+            output_dim_names = _dim_names_for(op.output)
+            shape = CEmitter._shape_dim_exprs(op.output_shape, output_dim_names)
+            output_suffix = self._param_array_suffix(op.output_shape, output_dim_names)
+            input_suffix = self._param_array_suffix(
+                op.input_shape, _dim_names_for(op.input0)
+            )
+            k_suffix = ""
+            if op.k_input is not None and op.k_input_shape is not None:
+                k_suffix = self._param_array_suffix(
+                    op.k_input_shape, _dim_names_for(op.k_input)
+                )
+            batch_dims = op.output_shape[:-2]
+            batch_size = CEmitter._element_count(batch_dims or (1,))
+            param_decls = [
+                (params["input0"], c_type, input_suffix, True),
+                (params["output"], c_type, output_suffix, False),
+            ]
+            if op.k_input is not None and op.k_input_dtype is not None:
+                param_decls.append(
+                    (
+                        params["k_input"],
+                        op.k_input_dtype.c_type,
+                        k_suffix,
+                        True,
+                    )
+                )
+            rendered = trilu_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                output=params["output"],
+                k_input=params.get("k_input"),
+                params=self._build_param_decls(param_decls),
+                c_type=c_type,
+                k_c_type=(
+                    op.k_input_dtype.c_type
+                    if op.k_input_dtype is not None
+                    else ScalarType.I64.c_type
+                ),
+                input_suffix=input_suffix,
+                output_suffix=output_suffix,
+                shape=shape,
+                batch_size=batch_size,
+                rows=op.output_shape[-2],
+                cols=op.output_shape[-1],
+                k_value=op.k_value,
+                upper=op.upper,
+                zero_literal=zero_literal,
+            ).rstrip()
+            return with_node_comment(rendered)
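The Trilu template's per-element rule compares the column-minus-row offset against k: keep when on the selected side of the k-th diagonal, write the zero literal otherwise; the `k_input` variant only changes where k comes from at runtime. Reference semantics:

    import numpy as np

    def trilu_reference(x: np.ndarray, k: int, upper: bool) -> np.ndarray:
        rows, cols = x.shape[-2], x.shape[-1]
        r = np.arange(rows)[:, None]
        c = np.arange(cols)[None, :]
        keep = (c - r >= k) if upper else (c - r <= k)
        return np.where(keep, x, np.zeros_like(x))

    x = np.arange(9.0).reshape(3, 3)
    assert np.array_equal(trilu_reference(x, 0, True), np.triu(x))
    assert np.array_equal(trilu_reference(x, -1, False), np.tril(x, -1))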
         if isinstance(op, TileOp):
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
@@ -7224,17 +8746,19 @@ class CEmitter:
             update_expr = None
             init_literal = None
             final_expr = "acc"
+            use_kahan = False
+            kahan_value_expr = None
             fabs_fn = CEmitter._math_fn(op.dtype, "fabsf", "fabs")
             exp_fn = CEmitter._math_fn(op.dtype, "expf", "exp")
             log_fn = CEmitter._math_fn(op.dtype, "logf", "log")
             sqrt_fn = CEmitter._math_fn(op.dtype, "sqrtf", "sqrt")
-            count_literal = CEmitter._format_literal(
-                op.dtype, op.reduce_count
-            )
             if op.reduce_kind == "sum":
                 init_literal = zero_literal
                 update_expr = f"acc += {value_expr};"
             elif op.reduce_kind == "mean":
+                count_literal = CEmitter._format_literal(
+                    op.dtype, op.reduce_count
+                )
                 init_literal = zero_literal
                 update_expr = f"acc += {value_expr};"
                 final_expr = f"acc / {count_literal}"
@@ -7269,6 +8793,24 @@ class CEmitter:
                 raise CodegenError(
                     f"Unsupported reduce kind {op.reduce_kind}"
                 )
+            if op.dtype in {ScalarType.F16, ScalarType.F32} and op.reduce_kind in {
+                "sum",
+                "mean",
+                "logsum",
+                "logsumexp",
+                "l1",
+                "l2",
+                "sumsquare",
+            }:
+                use_kahan = True
+                if op.reduce_kind == "logsumexp":
+                    kahan_value_expr = f"{exp_fn}({value_expr})"
+                elif op.reduce_kind == "l1":
+                    kahan_value_expr = f"{fabs_fn}({value_expr})"
+                elif op.reduce_kind in {"l2", "sumsquare"}:
+                    kahan_value_expr = f"{value_expr} * {value_expr}"
+                else:
+                    kahan_value_expr = value_expr
             input_suffix = self._param_array_suffix(op.input_shape)
             output_suffix = self._param_array_suffix(op.output_shape)
             param_decls = self._build_param_decls(
@@ -7292,8 +8834,11 @@ class CEmitter:
                 reduce_dims=reduce_dims,
                 output_index_expr=output_index_expr,
                 init_literal=init_literal,
+                zero_literal=zero_literal,
                 update_expr=update_expr,
                 final_expr=final_expr,
+                use_kahan=use_kahan,
+                kahan_value_expr=kahan_value_expr,
             ).rstrip()
             return with_node_comment(rendered)
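For f16/f32 inputs the sum-shaped reductions (sum, mean, logsum, logsumexp, l1, l2, sumsquare) now go through Kahan compensated summation, with `kahan_value_expr` supplying the per-element term (|x|, x*x, exp(x), or x itself). The recurrence the template unrolls, for reference:

    def kahan_sum(values):
        # c carries the low-order bits lost when folding each term in.
        acc, c = 0.0, 0.0
        for value in values:
            y = value - c
            t = acc + y
            c = (t - acc) - y
            acc = t
        return acc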
         if isinstance(op, ArgReduceOp):
@@ -7367,6 +8912,83 @@ class CEmitter:
                 dim_args=dim_args,
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, TopKOp):
+            params = self._shared_param_map(
+                [
+                    ("input0", op.input0),
+                    ("output_values", op.output_values),
+                    ("output_indices", op.output_indices),
+                ]
+            )
+            output_shape = CEmitter._codegen_shape(op.output_shape)
+            outer_shape = tuple(
+                dim for axis, dim in enumerate(output_shape) if axis != op.axis
+            )
+            outer_loop_vars = CEmitter._loop_vars(outer_shape)
+            reduce_var = "r0"
+            k_var = "k0"
+            input_indices: list[str] = []
+            output_indices: list[str] = []
+            outer_index = 0
+            for axis in range(len(op.input_shape)):
+                if axis == op.axis:
+                    input_indices.append(reduce_var)
+                    output_indices.append(k_var)
+                else:
+                    input_indices.append(outer_loop_vars[outer_index])
+                    output_indices.append(outer_loop_vars[outer_index])
+                    outer_index += 1
+            input_index_expr = "".join(f"[{var}]" for var in input_indices)
+            output_index_expr = "".join(f"[{var}]" for var in output_indices)
+            compare_expr = (
+                "(a > b) || ((a == b) && (ai < bi))"
+                if op.largest
+                else "(a < b) || ((a == b) && (ai < bi))"
+            )
+            input_suffix = self._param_array_suffix(op.input_shape)
+            output_suffix = self._param_array_suffix(op.output_shape)
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], op.input_dtype.c_type, input_suffix, True),
+                    (
+                        params["output_values"],
+                        op.output_values_dtype.c_type,
+                        output_suffix,
+                        False,
+                    ),
+                    (
+                        params["output_indices"],
+                        op.output_indices_dtype.c_type,
+                        output_suffix,
+                        False,
+                    ),
+                ]
+            )
+            rendered = topk_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                output_values=params["output_values"],
+                output_indices=params["output_indices"],
+                params=param_decls,
+                input_c_type=op.input_dtype.c_type,
+                output_values_c_type=op.output_values_dtype.c_type,
+                output_indices_c_type=op.output_indices_dtype.c_type,
+                input_suffix=input_suffix,
+                output_suffix=output_suffix,
+                output_shape=output_shape,
+                outer_shape=outer_shape,
+                outer_loop_vars=outer_loop_vars,
+                reduce_var=reduce_var,
+                k_var=k_var,
+                axis_dim=op.input_shape[op.axis],
+                k=op.k,
+                input_index_expr=input_index_expr,
+                output_index_expr=output_index_expr,
+                compare_expr=compare_expr,
+                dim_args=dim_args,
+            ).rstrip()
+            return with_node_comment(rendered)
7370
8992
  if isinstance(op, ReduceOp):
7371
8993
  name_params = self._shared_param_map(
7372
8994
  [
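Note: `compare_expr` above implements ONNX TopK ordering, with ties broken toward the smaller input index (`ai < bi`). A selection loop consistent with that comparator, for a single slice along the reduction axis, might look like the following; this is a hand-written analogue for illustration, not the actual `topk_template` output:

    #include <stddef.h>
    #include <stdint.h>

    /* Keep the k best candidates in sorted order (largest=1), inserting each
     * new element by shifting worse entries down. Scanning indices in
     * ascending order makes the (a == b && ai < bi) tie-break hold for free. */
    static void topk_slice(const float *in, size_t n, size_t k,
                           float *values, int64_t *indices) {
        size_t filled = 0;
        for (size_t r = 0; r < n; ++r) {          /* reduce_var loop */
            float a = in[r];
            size_t pos = filled;
            while (pos > 0 && a > values[pos - 1]) {
                --pos;                            /* find insertion point */
            }
            if (pos >= k) {
                continue;                         /* not in the top k */
            }
            size_t last = (filled < k) ? filled : k - 1;
            for (size_t j = last; j > pos; --j) { /* shift worse entries */
                values[j] = values[j - 1];
                indices[j] = indices[j - 1];
            }
            values[pos] = a;
            indices[pos] = (int64_t)r;
            if (filled < k) {
                ++filled;
            }
        }
    }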
@@ -7545,35 +9167,76 @@ class CEmitter:
                 c_type=c_type,
                 input_suffix=input_suffix,
                 output_suffix=output_suffix,
-                values=[
-                    CEmitter._format_literal(op.dtype, value)
-                    for value in op.values
-                ],
+                values=[
+                    CEmitter._format_literal(op.dtype, value)
+                    for value in op.values
+                ],
+            ).rstrip()
+            return with_node_comment(rendered)
+        if isinstance(op, SizeOp):
+            params = self._shared_param_map(
+                [("input0", op.input0), ("output", op.output)]
+            )
+            input_suffix = self._param_array_suffix(op.input_shape)
+            output_suffix = self._param_array_suffix(op.output_shape)
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], op.input_dtype.c_type, input_suffix, True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = size_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                output=params["output"],
+                params=param_decls,
+                input_c_type=op.input_dtype.c_type,
+                c_type=c_type,
+                input_suffix=input_suffix,
+                output_suffix=output_suffix,
+                value=CEmitter._format_literal(op.dtype, op.value),
             ).rstrip()
             return with_node_comment(rendered)
-        if isinstance(op, SizeOp):
+        if isinstance(op, NonZeroOp):
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            input_suffix = self._param_array_suffix(op.input_shape)
-            output_suffix = self._param_array_suffix(op.output_shape)
+            input_dim_names = _dim_names_for(op.input0)
+            output_dim_names = _dim_names_for(op.output)
+            input_shape = CEmitter._shape_dim_exprs(
+                op.input_shape, input_dim_names
+            )
+            loop_vars = CEmitter._loop_vars(op.input_shape)
+            input_suffix = self._param_array_suffix(
+                op.input_shape, input_dim_names
+            )
+            output_suffix = self._param_array_suffix(
+                op.output_shape, output_dim_names
+            )
             param_decls = self._build_param_decls(
                 [
                     (params["input0"], op.input_dtype.c_type, input_suffix, True),
                     (params["output"], c_type, output_suffix, False),
                 ]
             )
-            rendered = size_template.render(
+            input_expr = f"{params['input0']}" + "".join(
+                f"[{var}]" for var in loop_vars
+            )
+            rendered = nonzero_template.render(
                 model_name=model.name,
                 op_name=op_name,
                 input0=params["input0"],
                 output=params["output"],
                 params=param_decls,
                 input_c_type=op.input_dtype.c_type,
-                c_type=c_type,
+                output_c_type=c_type,
                 input_suffix=input_suffix,
                 output_suffix=output_suffix,
-                value=CEmitter._format_literal(op.dtype, op.value),
+                input_shape=input_shape,
+                loop_vars=loop_vars,
+                input_expr=input_expr,
+                zero_literal=op.input_dtype.zero_literal,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ExpandOp):
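Note: NonZero produces an int64 tensor of shape (rank, nnz) holding the coordinates of every nonzero element, and the template compares each element against `zero_literal`. The sketch below is a hand-written analogue for a 2x3 input, assuming the lowering has already fixed nnz in `op.output_shape` (here the worst case of six) — dimensions and names are illustrative:

    #include <stdint.h>

    /* Row-major scan; out[axis][j] is the coordinate of the j-th nonzero
     * element along that axis, matching the ONNX NonZero layout. */
    static void nonzero_2x3(const float in[2][3], int64_t out[2][6]) {
        int64_t count = 0;
        for (int64_t i0 = 0; i0 < 2; ++i0) {
            for (int64_t i1 = 0; i1 < 3; ++i1) {
                if (in[i0][i1] != 0.0f) {  /* input_expr != zero_literal */
                    out[0][count] = i0;
                    out[1][count] = i1;
                    ++count;
                }
            }
        }
    }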
@@ -7692,6 +9355,74 @@ class CEmitter:
                 length=op.length,
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, OneHotOp):
+            params = self._shared_param_map(
+                [
+                    ("indices", op.indices),
+                    ("depth", op.depth),
+                    ("values", op.values),
+                    ("output", op.output),
+                ]
+            )
+            output_dim_names = _dim_names_for(op.output)
+            indices_dim_names = _dim_names_for(op.indices)
+            values_dim_names = _dim_names_for(op.values)
+            output_shape = CEmitter._codegen_shape(op.output_shape)
+            loop_vars = CEmitter._loop_vars(output_shape)
+            indices_indices = tuple(
+                var for idx, var in enumerate(loop_vars) if idx != op.axis
+            )
+            if not indices_indices:
+                indices_indices = ("0",)
+            output_suffix = self._param_array_suffix(
+                op.output_shape, output_dim_names
+            )
+            indices_suffix = self._param_array_suffix(
+                op.indices_shape, indices_dim_names
+            )
+            values_suffix = self._param_array_suffix(
+                op.values_shape, values_dim_names
+            )
+            depth_suffix = self._param_array_suffix(())
+            param_decls = self._build_param_decls(
+                [
+                    (
+                        params["indices"],
+                        op.indices_dtype.c_type,
+                        indices_suffix,
+                        True,
+                    ),
+                    (
+                        params["depth"],
+                        op.depth_dtype.c_type,
+                        depth_suffix,
+                        True,
+                    ),
+                    (params["values"], c_type, values_suffix, True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = one_hot_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                indices=params["indices"],
+                depth=params["depth"],
+                values=params["values"],
+                output=params["output"],
+                params=param_decls,
+                indices_suffix=indices_suffix,
+                depth_suffix=depth_suffix,
+                values_suffix=values_suffix,
+                output_suffix=output_suffix,
+                output_shape=output_shape,
+                loop_vars=loop_vars,
+                indices_indices=indices_indices,
+                axis_index=loop_vars[op.axis],
+                depth_dim=op.depth_dim,
+                indices_c_type=op.indices_dtype.c_type,
+                c_type=c_type,
+            ).rstrip()
+            return with_node_comment(rendered)
         if isinstance(op, SplitOp):
             output_params = [
                 (f"output_{index}", name)
@@ -7772,6 +9503,86 @@ class CEmitter:
                 dim_args=dim_args,
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, QuantizeLinearOp):
+            params = self._shared_param_map(
+                [
+                    ("input0", op.input0),
+                    ("scale", op.scale),
+                    ("zero_point", op.zero_point),
+                    ("output", op.output),
+                ]
+            )
+            output_dim_names = _dim_names_for(op.output)
+            shape = CEmitter._shape_dim_exprs(op.input_shape, output_dim_names)
+            loop_vars = CEmitter._loop_vars(op.input_shape)
+            input_suffix = self._param_array_suffix(
+                op.input_shape, _dim_names_for(op.input0)
+            )
+            scale_shape = (
+                ()
+                if op.axis is None
+                else (op.input_shape[op.axis],)
+            )
+            scale_suffix = self._param_array_suffix(
+                scale_shape, _dim_names_for(op.scale)
+            )
+            zero_point_suffix = self._param_array_suffix(
+                scale_shape, _dim_names_for(op.zero_point or "")
+            )
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], op.input_dtype.c_type, input_suffix, True),
+                    (params["scale"], op.scale_dtype.c_type, scale_suffix, True),
+                    (
+                        params["zero_point"],
+                        op.dtype.c_type,
+                        zero_point_suffix,
+                        True,
+                    )
+                    if params["zero_point"]
+                    else (None, "", "", True),
+                    (params["output"], op.dtype.c_type, input_suffix, False),
+                ]
+            )
+            compute_type = "double" if op.input_dtype == ScalarType.F64 else "float"
+            round_fn = CEmitter._math_fn(
+                op.input_dtype, "nearbyintf", "nearbyint"
+            )
+            scale_index = "0" if op.axis is None else loop_vars[op.axis]
+            input_expr = f"{params['input0']}" + "".join(
+                f"[{var}]" for var in loop_vars
+            )
+            output_expr = f"{params['output']}" + "".join(
+                f"[{var}]" for var in loop_vars
+            )
+            scale_expr = f"{params['scale']}[{scale_index}]"
+            if params["zero_point"]:
+                zero_expr = f"{params['zero_point']}[{scale_index}]"
+            else:
+                zero_expr = "0"
+            rendered = quantize_linear_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                scale=params["scale"],
+                zero_point=params["zero_point"],
+                output=params["output"],
+                params=param_decls,
+                compute_type=compute_type,
+                input_c_type=op.input_dtype.c_type,
+                output_c_type=op.dtype.c_type,
+                shape=shape,
+                loop_vars=loop_vars,
+                input_expr=input_expr,
+                scale_expr=scale_expr,
+                zero_expr=zero_expr,
+                output_expr=output_expr,
+                round_fn=round_fn,
+                min_literal=op.dtype.min_literal,
+                max_literal=op.dtype.max_literal,
+                dim_args=dim_args,
+            ).rstrip()
+            return with_node_comment(rendered)
         if isinstance(op, ClipOp):
             params = self._shared_param_map(
                 [
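Note: the block above wires up the standard QuantizeLinear formula, y = saturate(round(x / scale) + zero_point), computing in float or double (`compute_type`), rounding via nearbyintf/nearbyint (`round_fn`, round-to-nearest-even under the default mode), and clamping to the target type's `min_literal`/`max_literal`; per-axis quantization just swaps the `[0]` subscript on scale and zero point for `loop_vars[op.axis]`. A per-tensor int8 sketch, illustrative rather than the literal template output:

    #include <math.h>
    #include <stdint.h>

    static void quantize_linear_i8(const float *in, float scale,
                                   int8_t zero_point, int8_t *out,
                                   int64_t count) {
        for (int64_t i = 0; i < count; ++i) {
            /* round-to-nearest-even, then shift by the zero point */
            float v = nearbyintf(in[i] / scale) + (float)zero_point;
            if (v < -128.0f) {  /* min_literal */
                v = -128.0f;
            }
            if (v > 127.0f) {   /* max_literal */
                v = 127.0f;
            }
            out[i] = (int8_t)v;
        }
    }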
@@ -7934,11 +9745,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -7950,16 +9765,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -7968,16 +9787,20 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
     ) -> str:
         if isinstance(op, SplitOp):
             return op.outputs[0]
+        if isinstance(op, TopKOp):
+            return op.output_values
         return op.output
 
     @staticmethod
@@ -7988,11 +9811,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -8004,16 +9831,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -8022,18 +9853,28 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
     ) -> tuple[tuple[str, tuple[int, ...]], ...]:
         if isinstance(op, BinaryOp):
-            return ((op.input0, op.shape), (op.input1, op.shape))
+            return (
+                (op.input0, op.input0_shape),
+                (op.input1, op.input1_shape),
+            )
         if isinstance(op, MultiInputBinaryOp):
             return tuple((name, op.shape) for name in op.inputs)
+        if isinstance(op, EinsumOp):
+            return tuple(
+                (name, shape)
+                for name, shape in zip(op.inputs, op.input_shapes)
+            )
         if isinstance(op, UnaryOp):
             return ((op.input0, op.shape),)
         if isinstance(op, LpNormalizationOp):
@@ -8068,10 +9909,27 @@ class CEmitter:
             return tuple(inputs)
         if isinstance(op, CastOp):
             return ((op.input0, op.shape),)
+        if isinstance(op, NonZeroOp):
+            return ((op.input0, op.input_shape),)
+        if isinstance(op, QuantizeLinearOp):
+            scale_shape = (
+                ()
+                if op.axis is None
+                else (op.input_shape[op.axis],)
+            )
+            inputs = [(op.input0, op.input_shape), (op.scale, scale_shape)]
+            if op.zero_point is not None:
+                inputs.append((op.zero_point, scale_shape))
+            return tuple(inputs)
         if isinstance(op, IdentityOp):
             return ((op.input0, op.shape),)
         if isinstance(op, EyeLikeOp):
             return ((op.input0, op.output_shape),)
+        if isinstance(op, TriluOp):
+            inputs = [(op.input0, op.input_shape)]
+            if op.k_input is not None and op.k_input_shape is not None:
+                inputs.append((op.k_input, op.k_input_shape))
+            return tuple(inputs)
         if isinstance(op, GridSampleOp):
             return ((op.input0, op.input_shape), (op.grid, op.grid_shape))
         if isinstance(op, PadOp):
@@ -8083,8 +9941,22 @@ class CEmitter:
             if op.value_input is not None and op.value_shape is not None:
                 inputs.append((op.value_input, op.value_shape))
             return tuple(inputs)
+        if isinstance(op, ScatterNDOp):
+            return ((op.data, op.data_shape),)
         if isinstance(op, CumSumOp):
             return ((op.input0, op.input_shape),)
+        if isinstance(op, RangeOp):
+            return ((op.start, ()), (op.limit, ()), (op.delta, ()))
+        if isinstance(op, OneHotOp):
+            return (
+                (op.indices, op.indices_shape),
+                (op.depth, ()),
+                (op.values, op.values_shape),
+            )
+        if isinstance(op, SplitOp):
+            return ((op.input0, op.input_shape),)
+        if isinstance(op, TopKOp):
+            return ((op.input0, op.input_shape),)
         return ()
 
     def _propagate_tensor_dim_names(
@@ -8096,11 +9968,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -8112,16 +9988,19 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -8130,11 +10009,14 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | RangeOp
+        | OneHotOp
         | SplitOp
         ],
         tensor_dim_names: dict[str, dict[int, str]],
@@ -8157,11 +10039,15 @@ class CEmitter:
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -8173,16 +10059,20 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
+        | ScatterNDOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
         | EyeLikeOp
+        | TriluOp
         | TileOp
         | PadOp
         | DepthToSpaceOp
@@ -8191,11 +10081,14 @@ class CEmitter:
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | RangeOp
+        | OneHotOp
         | SplitOp,
     ) -> tuple[tuple[str, tuple[int, ...], str], ...]:
         if isinstance(op, AttentionOp):
@@ -8292,6 +10185,19 @@ class CEmitter:
             )
         if isinstance(op, ArgReduceOp):
             return ((op.output, CEmitter._op_output_shape(op), op.output_dtype),)
+        if isinstance(op, TopKOp):
+            return (
+                (
+                    op.output_values,
+                    CEmitter._op_output_shape(op),
+                    op.output_values_dtype,
+                ),
+                (
+                    op.output_indices,
+                    CEmitter._op_output_shape(op),
+                    op.output_indices_dtype,
+                ),
+            )
         return ((op.output, CEmitter._op_output_shape(op), op.dtype),)
 
     @staticmethod
@@ -8303,6 +10209,7 @@ class CEmitter:
         | ClipOp
         | CastOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
@@ -8318,25 +10225,34 @@ class CEmitter:
         | LstmOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
         | TransposeOp
         | ReshapeOp
+        | IdentityOp
+        | EyeLikeOp
+        | TriluOp
+        | TileOp
         | SliceOp
         | ResizeOp
         | GridSampleOp
         | ReduceOp
         | ArgReduceOp
+        | TopKOp
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
         | PadOp,
     ) -> tuple[int, ...]:
@@ -8350,16 +10266,24 @@ class CEmitter:
             return op.shape
         if isinstance(op, ClipOp):
             return op.output_shape
+        if isinstance(op, QuantizeLinearOp):
+            return op.input_shape
         if isinstance(op, CastOp):
             return op.shape
         if isinstance(op, MatMulOp):
-            return (op.m, op.n)
+            return op.output_shape
+        if isinstance(op, EinsumOp):
+            return op.output_shape
         if isinstance(op, GemmOp):
             return (op.m, op.n)
         if isinstance(op, ConvOp):
             return (op.batch, op.out_channels, *op.out_spatial)
+        if isinstance(op, ConvTransposeOp):
+            return (op.batch, op.out_channels, *op.out_spatial)
         if isinstance(op, AveragePoolOp):
             return (op.batch, op.channels, op.out_h, op.out_w)
+        if isinstance(op, LpPoolOp):
+            return (op.batch, op.channels, op.out_h, op.out_w)
         if isinstance(op, BatchNormOp):
             return op.shape
         if isinstance(
@@ -8380,6 +10304,8 @@ class CEmitter:
             return op.shape
         if isinstance(op, LogSoftmaxOp):
             return op.shape
+        if isinstance(op, HardmaxOp):
+            return op.shape
         if isinstance(op, NegativeLogLikelihoodLossOp):
             return op.output_shape
         if isinstance(op, SoftmaxCrossEntropyLossOp):
@@ -8392,6 +10318,10 @@ class CEmitter:
             return op.output_shape
         if isinstance(op, GatherOp):
             return op.output_shape
+        if isinstance(op, GatherNDOp):
+            return op.output_shape
+        if isinstance(op, ScatterNDOp):
+            return op.output_shape
         if isinstance(op, TransposeOp):
             return op.output_shape
         if isinstance(op, ReshapeOp):
@@ -8400,6 +10330,8 @@ class CEmitter:
             return op.shape
         if isinstance(op, EyeLikeOp):
             return op.output_shape
+        if isinstance(op, TriluOp):
+            return op.output_shape
         if isinstance(op, TileOp):
             return op.output_shape
         if isinstance(op, PadOp):
@@ -8418,18 +10350,24 @@ class CEmitter:
             return op.output_shape
         if isinstance(op, ArgReduceOp):
             return op.output_shape
+        if isinstance(op, TopKOp):
+            return op.output_shape
         if isinstance(op, ConstantOfShapeOp):
             return op.shape
         if isinstance(op, ShapeOp):
             return op.output_shape
         if isinstance(op, SizeOp):
             return op.output_shape
+        if isinstance(op, NonZeroOp):
+            return op.output_shape
         if isinstance(op, ExpandOp):
             return op.output_shape
         if isinstance(op, CumSumOp):
             return op.input_shape
         if isinstance(op, RangeOp):
             return op.output_shape
+        if isinstance(op, OneHotOp):
+            return op.output_shape
         if op.output_rank == 3:
             return (op.batch, op.q_seq, op.q_heads * op.v_head_size)
         return (op.batch, op.q_heads, op.q_seq, op.v_head_size)
@@ -8441,11 +10379,16 @@ class CEmitter:
         | WhereOp
         | UnaryOp
         | ClipOp
+        | CastOp
+        | QuantizeLinearOp
         | MatMulOp
+        | EinsumOp
         | GemmOp
         | AttentionOp
         | ConvOp
+        | ConvTransposeOp
         | AveragePoolOp
+        | LpPoolOp
         | BatchNormOp
         | LpNormalizationOp
         | InstanceNormalizationOp
@@ -8455,14 +10398,20 @@ class CEmitter:
         | RMSNormalizationOp
         | SoftmaxOp
         | LogSoftmaxOp
+        | HardmaxOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
         | ConcatOp
         | GatherElementsOp
         | GatherOp
+        | GatherNDOp
         | TransposeOp
         | ReshapeOp
+        | IdentityOp
+        | EyeLikeOp
+        | TriluOp
+        | TileOp
         | ResizeOp
         | GridSampleOp
         | ReduceOp
@@ -8470,21 +10419,25 @@ class CEmitter:
         | ConstantOfShapeOp
         | ShapeOp
         | SizeOp
+        | NonZeroOp
         | ExpandOp
         | CumSumOp
         | RangeOp
+        | OneHotOp
         | SplitOp
         | PadOp,
     ) -> ScalarType:
         if isinstance(op, ArgReduceOp):
             return op.output_dtype
+        if isinstance(op, TopKOp):
+            return op.output_values_dtype
         return op.dtype
 
     @staticmethod
     def _codegen_shape(shape: tuple[int, ...]) -> tuple[int, ...]:
         if not shape:
             return (1,)
-        return shape
+        return tuple(max(1, dim) if isinstance(dim, int) else dim for dim in shape)
 
     @staticmethod
     def _array_suffix(shape: tuple[int, ...]) -> str:
@@ -8623,6 +10576,8 @@ class CEmitter:
         dim_names: Mapping[int, str] | None,
     ) -> tuple[str | int, ...]:
         dim_names = dim_names or {}
+        if not shape:
+            shape = (1,)
         return tuple(
             dim_names.get(index, dim) for index, dim in enumerate(shape)
         )
@@ -8677,7 +10632,8 @@ class CEmitter:
 
     @staticmethod
     def _element_count(shape: tuple[int, ...]) -> int:
-        shape = CEmitter._codegen_shape(shape)
+        if not shape:
+            return 1
         count = 1
         for dim in shape:
             if dim < 0:
@@ -8745,6 +10701,7 @@ class CEmitter:
         testbench_inputs: Mapping[str, tuple[float | int | bool, ...]] | None = None,
         dim_order: Sequence[str],
         dim_values: Mapping[str, int],
+        weight_data_filename: str,
     ) -> str:
         input_counts = tuple(
             self._element_count(shape) for shape in model.input_shapes
@@ -8755,7 +10712,8 @@ class CEmitter:
             model.input_names, model.input_shapes, input_counts, model.input_dtypes
         ):
             codegen_shape = self._codegen_shape(shape)
-            loop_vars = self._loop_vars(codegen_shape)
+            loop_shape = (1,) if not shape else shape
+            loop_vars = self._loop_vars(loop_shape)
             if dtype in {ScalarType.F16, ScalarType.F32}:
                 random_expr = "rng_next_float()"
             elif dtype == ScalarType.F64:
@@ -8769,20 +10727,26 @@ class CEmitter:
             constant_lines = None
             if constant_values is not None:
                 constant_name = f"{name}_testbench_data"
-                constant_lines = [
-                    self._format_value(value, dtype)
-                    for value in constant_values
-                ]
+                if constant_values:
+                    constant_lines = [
+                        self._format_value(value, dtype)
+                        for value in constant_values
+                    ]
+                else:
+                    constant_lines = [self._format_value(0, dtype)]
             inputs.append(
                 {
                     "name": name,
-                    "shape": codegen_shape,
+                    "shape": loop_shape,
                     "shape_literal": ",".join(str(dim) for dim in shape),
                     "count": count,
                     "array_suffix": self._array_suffix(codegen_shape),
+                    "array_index_expr": "".join(
+                        f"[{var}]" for var in loop_vars
+                    ),
                     "loop_vars": loop_vars,
-                    "rank": len(codegen_shape),
-                    "index_expr": self._index_expr(codegen_shape, loop_vars),
+                    "rank": len(loop_shape),
+                    "index_expr": self._index_expr(loop_shape, loop_vars),
                     "dtype": dtype,
                     "c_type": dtype.c_type,
                     "random_expr": random_expr,
@@ -8797,17 +10761,21 @@ class CEmitter:
             model.output_names, model.output_shapes, model.output_dtypes
         ):
             codegen_shape = self._codegen_shape(shape)
-            output_loop_vars = self._loop_vars(codegen_shape)
+            loop_shape = (1,) if not shape else shape
+            output_loop_vars = self._loop_vars(loop_shape)
             outputs.append(
                 {
                     "name": name,
-                    "shape": codegen_shape,
+                    "shape": loop_shape,
                     "shape_literal": ",".join(str(dim) for dim in shape),
-                    "count": self._element_count(codegen_shape),
+                    "count": self._element_count(shape),
                     "array_suffix": self._array_suffix(codegen_shape),
+                    "array_index_expr": "".join(
+                        f"[{var}]" for var in output_loop_vars
+                    ),
                     "loop_vars": output_loop_vars,
-                    "rank": len(codegen_shape),
-                    "index_expr": self._index_expr(codegen_shape, output_loop_vars),
+                    "rank": len(loop_shape),
+                    "index_expr": self._index_expr(loop_shape, output_loop_vars),
                     "dtype": dtype,
                     "c_type": dtype.c_type,
                     "print_format": self._print_format(dtype),
@@ -8822,9 +10790,87 @@ class CEmitter:
             ],
             inputs=inputs,
             outputs=outputs,
+            weight_data_filename=weight_data_filename,
         ).rstrip()
         return _format_c_indentation(rendered)
 
+    @staticmethod
+    def _testbench_requires_math(
+        model: LoweredModel,
+        testbench_inputs: Mapping[str, tuple[float | int | bool, ...]] | None,
+    ) -> bool:
+        if not testbench_inputs:
+            return False
+        dtype_map = dict(zip(model.input_names, model.input_dtypes))
+        float_dtypes = {ScalarType.F16, ScalarType.F32, ScalarType.F64}
+        for name, values in testbench_inputs.items():
+            if dtype_map.get(name) not in float_dtypes:
+                continue
+            for value in values:
+                if not math.isfinite(float(value)):
+                    return True
+        return False
+
+    def _partition_constants(
+        self, constants: tuple[ConstTensor, ...]
+    ) -> tuple[tuple[ConstTensor, ...], tuple[ConstTensor, ...]]:
+        if self._large_weight_threshold <= 0:
+            return (), constants
+        inline: list[ConstTensor] = []
+        large: list[ConstTensor] = []
+        for const in constants:
+            if self._element_count(const.shape) > self._large_weight_threshold:
+                large.append(const)
+            else:
+                inline.append(const)
+        return tuple(inline), tuple(large)
+
+    @staticmethod
+    def _weight_data_filename(model: LoweredModel) -> str:
+        return f"{model.name}.bin"
+
+    def _emit_weight_loader(
+        self, model: LoweredModel, large_constants: tuple[ConstTensor, ...]
+    ) -> str:
+        lines = [f"_Bool {model.name}_load(const char *path) {{"]
+        if not large_constants:
+            lines.append("    (void)path;")
+            lines.append("    return 1;")
+            lines.append("}")
+            return _format_c_indentation("\n".join(lines))
+        lines.append("    FILE *file = fopen(path, \"rb\");")
+        lines.append("    if (!file) {")
+        lines.append("        return 0;")
+        lines.append("    }")
+        lines.append(
+            f"    _Bool ok = {model.name}_load_file(file);"
+        )
+        lines.append("    fclose(file);")
+        lines.append("    return ok;")
+        lines.append("}")
+        lines.append("")
+        lines.append(f"static _Bool {model.name}_load_file(FILE *file) {{")
+        for const in large_constants:
+            shape = self._codegen_shape(const.shape)
+            loop_vars = self._loop_vars(shape)
+            for depth, var in enumerate(loop_vars):
+                lines.append(
+                    f"    for (idx_t {var} = 0; {var} < {shape[depth]}; ++{var}) {{"
+                )
+            index_expr = "".join(f"[{var}]" for var in loop_vars)
+            zero_index = "[0]" * len(shape)
+            lines.append(
+                f"    if (fread(&{const.name}{index_expr}, "
+                f"sizeof({const.name}{zero_index}), 1, file) != 1) {{"
+            )
+            lines.append("        return 0;")
+            lines.append("    }")
+            for _ in loop_vars[::-1]:
+                lines.append("    }")
+        lines.append("    return 1;")
+        lines.append("}")
+        return _format_c_indentation("\n".join(lines))
+
     def _emit_constant_definitions(
         self,
         constants: tuple[ConstTensor, ...],
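Note: `_emit_weight_loader` above writes a `<model>_load(path)` entry point that freads each large constant element by element, in the same order `collect_weight_data` (later in this diff) serialized them. What the generated C plausibly looks like for one split-out constant — `model`, `w0`, and `idx_t` are illustrative stand-ins:

    #include <stdio.h>

    typedef long idx_t;          /* stands in for the generated index typedef */
    static float w0[2][3];       /* large constant left uninitialised */

    static _Bool model_load_file(FILE *file) {
        for (idx_t i0 = 0; i0 < 2; ++i0) {
            for (idx_t i1 = 0; i1 < 3; ++i1) {
                if (fread(&w0[i0][i1], sizeof(w0[0][0]), 1, file) != 1) {
                    return 0;
                }
            }
        }
        return 1;
    }

    _Bool model_load(const char *path) {
        FILE *file = fopen(path, "rb");
        if (!file) {
            return 0;
        }
        _Bool ok = model_load_file(file);
        fclose(file);
        return ok;
    }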
@@ -8834,26 +10880,31 @@ class CEmitter:
         if not constants:
             return ""
         lines: list[str] = []
-        for const in constants:
+        for index, const in enumerate(constants, start=1):
+            lines.append(self._emit_constant_comment(const, index))
             c_type = const.dtype.c_type
-            array_suffix = self._array_suffix(const.shape)
+            shape = self._codegen_shape(const.shape)
+            array_suffix = self._array_suffix(shape)
             values = [
-                self._format_value(value, const.dtype) for value in const.data
+                self._format_weight_value(value, const.dtype)
+                for value in const.data
             ]
             lines.append(
                 f"{storage_prefix} {c_type} {const.name}{array_suffix} = {{"
             )
             if values:
-                chunk_size = 8
-                chunks = [
-                    values[index : index + chunk_size]
-                    for index in range(0, len(values), chunk_size)
-                ]
-                for chunk_index, chunk in enumerate(chunks):
-                    line = "    " + ", ".join(chunk)
-                    if chunk_index != len(chunks) - 1:
-                        line += ","
-                    lines.append(line)
+                if (
+                    self._truncate_weights_after is not None
+                    and len(values) > self._truncate_weights_after
+                ):
+                    truncated_lines, _, _, _ = (
+                        self._emit_initializer_lines_truncated(
+                            values, shape, self._truncate_weights_after
+                        )
+                    )
+                    lines.extend(truncated_lines)
+                else:
+                    lines.extend(self._emit_initializer_lines(values, shape))
             lines.append("};")
             lines.append("")
         if lines and not lines[-1]:
@@ -8866,12 +10917,44 @@ class CEmitter:
         if not constants:
             return ""
         lines = []
-        for const in constants:
+        for index, const in enumerate(constants, start=1):
+            lines.append(self._emit_constant_comment(const, index))
             c_type = const.dtype.c_type
             array_suffix = self._array_suffix(const.shape)
             lines.append(f"extern const {c_type} {const.name}{array_suffix};")
         return "\n".join(lines)
 
+    def _emit_constant_storage_definitions(
+        self,
+        constants: tuple[ConstTensor, ...],
+        *,
+        storage_prefix: str = "static",
+    ) -> str:
+        if not constants:
+            return ""
+        lines: list[str] = []
+        for index, const in enumerate(constants, start=1):
+            lines.append(self._emit_constant_comment(const, index))
+            c_type = const.dtype.c_type
+            array_suffix = self._array_suffix(const.shape)
+            lines.append(f"{storage_prefix} {c_type} {const.name}{array_suffix};")
+            lines.append("")
+        if lines and not lines[-1]:
+            lines.pop()
+        return "\n".join(lines)
+
+    def collect_weight_data(
+        self, constants: tuple[ConstTensor, ...]
+    ) -> bytes | None:
+        _, large_constants = self._partition_constants(constants)
+        if not large_constants:
+            return None
+        chunks: list[bytes] = []
+        for const in large_constants:
+            array = np.asarray(const.data, dtype=const.dtype.np_dtype)
+            chunks.append(array.tobytes(order="C"))
+        return b"".join(chunks)
+
     @staticmethod
     def _index_expr(shape: tuple[int, ...], loop_vars: tuple[str, ...]) -> str:
         shape = CEmitter._codegen_shape(shape)
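Note: `collect_weight_data` concatenates each large constant's raw bytes (native endianness, C row-major order) in emission order, which the CLI presumably writes to the `_weight_data_filename` side-car; the generated loader consumes them with the same nesting, so writer and reader agree as long as both run on same-endian machines. Caller-side usage, with `model_load` and `model.bin` as illustrative names:

    #include <stdio.h>
    #include <stdlib.h>

    extern _Bool model_load(const char *path);  /* generated <model>_load */

    int main(void) {
        if (!model_load("model.bin")) {  /* _weight_data_filename(model) */
            fprintf(stderr, "failed to read weight data\n");
            return EXIT_FAILURE;
        }
        /* ... invoke the generated inference entry point ... */
        return EXIT_SUCCESS;
    }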
@@ -8886,6 +10969,10 @@ class CEmitter:
 
     @staticmethod
     def _format_float(value: float) -> str:
+        if math.isnan(value):
+            return "NAN"
+        if math.isinf(value):
+            return "-INFINITY" if value < 0 else "INFINITY"
         formatted = f"{value:.9g}"
         if "e" not in formatted and "E" not in formatted and "." not in formatted:
             formatted = f"{formatted}.0"
@@ -8897,11 +10984,57 @@ class CEmitter:
 
     @staticmethod
     def _format_double(value: float) -> str:
+        if math.isnan(value):
+            return "NAN"
+        if math.isinf(value):
+            return "-INFINITY" if value < 0 else "INFINITY"
         formatted = f"{value:.17g}"
         if "e" not in formatted and "E" not in formatted and "." not in formatted:
             formatted = f"{formatted}.0"
         return formatted
 
+    @staticmethod
+    def _format_float32_hex(value: float) -> str:
+        bits = struct.unpack("<I", struct.pack("<f", float(value)))[0]
+        sign = "-" if (bits >> 31) else ""
+        exponent = (bits >> 23) & 0xFF
+        mantissa = bits & 0x7FFFFF
+        if exponent == 0 and mantissa == 0:
+            return f"{sign}0x0.0p+0"
+        if exponent == 0xFF:
+            if mantissa == 0:
+                return f"{sign}INFINITY"
+            return "NAN"
+        if exponent == 0:
+            shift = mantissa.bit_length() - 1
+            exponent_val = shift - 149
+            fraction = (mantissa - (1 << shift)) << (24 - shift)
+        else:
+            exponent_val = exponent - 127
+            fraction = mantissa << 1
+        return f"{sign}0x1.{fraction:06x}p{exponent_val:+d}"
+
+    @staticmethod
+    def _format_float64_hex(value: float) -> str:
+        bits = struct.unpack("<Q", struct.pack("<d", float(value)))[0]
+        sign = "-" if (bits >> 63) else ""
+        exponent = (bits >> 52) & 0x7FF
+        mantissa = bits & 0xFFFFFFFFFFFFF
+        if exponent == 0 and mantissa == 0:
+            return f"{sign}0x0.0p+0"
+        if exponent == 0x7FF:
+            if mantissa == 0:
+                return f"{sign}INFINITY"
+            return "NAN"
+        if exponent == 0:
+            shift = mantissa.bit_length() - 1
+            exponent_val = shift - 1074
+            fraction = (mantissa - (1 << shift)) << (52 - shift)
+        else:
+            exponent_val = exponent - 1023
+            fraction = mantissa
+        return f"{sign}0x1.{fraction:013x}p{exponent_val:+d}"
+
     @staticmethod
     def _format_floating(value: float, dtype: ScalarType) -> str:
         if dtype == ScalarType.F64:
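Note: the hex formatters above emit C99 hexadecimal float literals (`0x1.<mantissa>p<exp>`), which a conforming compiler reconstructs bit-exactly, unlike shortest-decimal output. Two checkable examples:

    #include <assert.h>

    int main(void) {
        /* 0.1f has bit pattern 0x3DCCCCCD -> mantissa 0x99999a, exponent -4 */
        float a = 0x1.99999ap-4f;
        assert(a == 0.1f);
        /* the double closest to pi */
        double b = 0x1.921fb54442d18p+1;
        assert(b > 3.141592 && b < 3.141593);
        return 0;
    }

Non-finite values fall back to the NAN/INFINITY macros, which is why `_format_weight_value` below special-cases those strings before appending the `f` suffix.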
@@ -8992,14 +11125,139 @@ class CEmitter:
             return self._format_int(int(value), 8, "INT8_MIN")
         raise CodegenError(f"Unsupported dtype {dtype.onnx_name}")
 
+    def _format_weight_value(
+        self, value: float | int | bool, dtype: ScalarType
+    ) -> str:
+        if dtype == ScalarType.F16:
+            formatted = self._format_float32_hex(float(value))
+            if formatted == "NAN" or formatted.endswith("INFINITY"):
+                return f"(_Float16){formatted}"
+            return f"(_Float16){formatted}f"
+        if dtype == ScalarType.F32:
+            formatted = self._format_float32_hex(float(value))
+            if formatted == "NAN" or formatted.endswith("INFINITY"):
+                return formatted
+            return f"{formatted}f"
+        if dtype == ScalarType.F64:
+            return self._format_float64_hex(float(value))
+        if dtype == ScalarType.BOOL:
+            return "true" if bool(value) else "false"
+        if dtype == ScalarType.U64:
+            return self._format_uint(int(value), 64, "UINT64_MAX")
+        if dtype == ScalarType.U32:
+            return self._format_uint(int(value), 32, "UINT32_MAX")
+        if dtype == ScalarType.U16:
+            return self._format_uint(int(value), 16, "UINT16_MAX")
+        if dtype == ScalarType.U8:
+            return self._format_uint(int(value), 8, "UINT8_MAX")
+        if dtype == ScalarType.I64:
+            return self._format_int64(int(value))
+        if dtype == ScalarType.I32:
+            return self._format_int(int(value), 32, "INT32_MIN")
+        if dtype == ScalarType.I16:
+            return self._format_int(int(value), 16, "INT16_MIN")
+        if dtype == ScalarType.I8:
+            return self._format_int(int(value), 8, "INT8_MIN")
+        raise CodegenError(f"Unsupported dtype {dtype.onnx_name}")
+
+    @staticmethod
+    def _emit_initializer_lines(
+        values: Sequence[str],
+        shape: tuple[int, ...],
+        indent: str = "    ",
+        per_line: int = 8,
+    ) -> list[str]:
+        if len(shape) == 1:
+            lines: list[str] = []
+            for index in range(0, len(values), per_line):
+                chunk = ", ".join(values[index : index + per_line])
+                lines.append(f"{indent}{chunk},")
+            if lines:
+                lines[-1] = lines[-1].rstrip(",")
+            return lines
+        sub_shape = shape[1:]
+        sub_size = prod(sub_shape)
+        lines = []
+        for index in range(shape[0]):
+            start = index * sub_size
+            end = start + sub_size
+            lines.append(f"{indent}{{")
+            lines.extend(
+                CEmitter._emit_initializer_lines(
+                    values[start:end],
+                    sub_shape,
+                    indent + "    ",
+                    per_line,
+                )
+            )
+            lines.append(f"{indent}}},")
+        if lines:
+            lines[-1] = lines[-1].rstrip(",")
+        return lines
+
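Note: `_emit_initializer_lines` nests braces to match the array shape instead of emitting one flat value list. For a (2, 3) constant the emitted initializer looks like this (decimal literals shown for readability; real output uses the hex literals from `_format_weight_value`):

    static const float weight0[2][3] = {
        {
            1.0f, 2.0f, 3.0f
        },
        {
            4.0f, 5.0f, 6.0f
        }
    };

The truncated variant that follows stops after `truncate_after` values and appends a literal `...` marker, so truncated listings are meant for human inspection rather than compilation.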
+    @staticmethod
+    def _emit_initializer_lines_truncated(
+        values: Sequence[str],
+        shape: tuple[int, ...],
+        truncate_after: int,
+        indent: str = "    ",
+        per_line: int = 8,
+        start_index: int = 0,
+        emitted: int = 0,
+    ) -> tuple[list[str], int, int, bool]:
+        if len(shape) == 1:
+            items: list[str] = []
+            truncated = False
+            index = start_index
+            for _ in range(shape[0]):
+                if emitted >= truncate_after:
+                    items.append("...")
+                    truncated = True
+                    break
+                items.append(values[index])
+                index += 1
+                emitted += 1
+            lines: list[str] = []
+            for item_index in range(0, len(items), per_line):
+                chunk = ", ".join(items[item_index : item_index + per_line])
+                lines.append(f"{indent}{chunk},")
+            if lines:
+                lines[-1] = lines[-1].rstrip(",")
+            return lines, index, emitted, truncated
+        sub_shape = shape[1:]
+        lines: list[str] = []
+        index = start_index
+        truncated = False
+        for _ in range(shape[0]):
+            lines.append(f"{indent}{{")
+            sub_lines, index, emitted, sub_truncated = (
+                CEmitter._emit_initializer_lines_truncated(
+                    values,
+                    sub_shape,
+                    truncate_after,
+                    indent + "    ",
+                    per_line,
+                    index,
+                    emitted,
+                )
+            )
+            lines.extend(sub_lines)
+            lines.append(f"{indent}}},")
+            if sub_truncated:
+                truncated = True
+                break
+        if lines:
+            lines[-1] = lines[-1].rstrip(",")
+        return lines, index, emitted, truncated
+
     @staticmethod
     def _print_format(dtype: ScalarType) -> str:
         if dtype == ScalarType.F16:
-            return "%.8g"
+            return "\\\"%a\\\""
         if dtype == ScalarType.F32:
-            return "%.8g"
+            return "\\\"%a\\\""
         if dtype == ScalarType.F64:
-            return "%.17g"
+            return "\\\"%a\\\""
         if dtype == ScalarType.BOOL:
             return "%d"
         if dtype == ScalarType.U64: