emx-onnx-cgen 0.3.0-py3-none-any.whl → 0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emx_onnx_cgen/_build_info.py +1 -1
- emx_onnx_cgen/_version.py +2 -2
- emx_onnx_cgen/cli.py +50 -23
- emx_onnx_cgen/codegen/__init__.py +2 -0
- emx_onnx_cgen/codegen/c_emitter.py +1844 -1568
- emx_onnx_cgen/codegen/emitter.py +5 -0
- emx_onnx_cgen/compiler.py +30 -387
- emx_onnx_cgen/ir/context.py +87 -0
- emx_onnx_cgen/ir/op_base.py +193 -0
- emx_onnx_cgen/ir/op_context.py +65 -0
- emx_onnx_cgen/ir/ops/__init__.py +130 -0
- emx_onnx_cgen/ir/ops/elementwise.py +146 -0
- emx_onnx_cgen/ir/ops/misc.py +421 -0
- emx_onnx_cgen/ir/ops/nn.py +580 -0
- emx_onnx_cgen/ir/ops/reduce.py +95 -0
- emx_onnx_cgen/lowering/__init__.py +79 -1
- emx_onnx_cgen/lowering/adagrad.py +114 -0
- emx_onnx_cgen/lowering/arg_reduce.py +1 -1
- emx_onnx_cgen/lowering/attention.py +1 -1
- emx_onnx_cgen/lowering/average_pool.py +1 -1
- emx_onnx_cgen/lowering/batch_normalization.py +1 -1
- emx_onnx_cgen/lowering/cast.py +1 -1
- emx_onnx_cgen/lowering/common.py +36 -18
- emx_onnx_cgen/lowering/concat.py +1 -1
- emx_onnx_cgen/lowering/constant_of_shape.py +1 -1
- emx_onnx_cgen/lowering/conv.py +1 -1
- emx_onnx_cgen/lowering/conv_transpose.py +1 -1
- emx_onnx_cgen/lowering/cumsum.py +1 -1
- emx_onnx_cgen/lowering/depth_space.py +1 -1
- emx_onnx_cgen/lowering/dropout.py +1 -1
- emx_onnx_cgen/lowering/einsum.py +1 -1
- emx_onnx_cgen/lowering/elementwise.py +152 -4
- emx_onnx_cgen/lowering/expand.py +1 -1
- emx_onnx_cgen/lowering/eye_like.py +1 -1
- emx_onnx_cgen/lowering/flatten.py +1 -1
- emx_onnx_cgen/lowering/gather.py +1 -1
- emx_onnx_cgen/lowering/gather_elements.py +1 -1
- emx_onnx_cgen/lowering/gather_nd.py +1 -1
- emx_onnx_cgen/lowering/gemm.py +1 -1
- emx_onnx_cgen/lowering/global_max_pool.py +1 -1
- emx_onnx_cgen/lowering/grid_sample.py +1 -1
- emx_onnx_cgen/lowering/group_normalization.py +1 -1
- emx_onnx_cgen/lowering/hardmax.py +1 -1
- emx_onnx_cgen/lowering/identity.py +1 -1
- emx_onnx_cgen/lowering/instance_normalization.py +1 -1
- emx_onnx_cgen/lowering/layer_normalization.py +1 -1
- emx_onnx_cgen/lowering/logsoftmax.py +1 -1
- emx_onnx_cgen/lowering/lp_normalization.py +1 -1
- emx_onnx_cgen/lowering/lp_pool.py +1 -1
- emx_onnx_cgen/lowering/lrn.py +1 -1
- emx_onnx_cgen/lowering/lstm.py +1 -1
- emx_onnx_cgen/lowering/matmul.py +1 -1
- emx_onnx_cgen/lowering/maxpool.py +1 -1
- emx_onnx_cgen/lowering/mean_variance_normalization.py +1 -1
- emx_onnx_cgen/lowering/negative_log_likelihood_loss.py +1 -1
- emx_onnx_cgen/lowering/non_max_suppression.py +157 -0
- emx_onnx_cgen/lowering/nonzero.py +1 -1
- emx_onnx_cgen/lowering/one_hot.py +1 -1
- emx_onnx_cgen/lowering/pad.py +1 -1
- emx_onnx_cgen/lowering/qlinear_matmul.py +212 -0
- emx_onnx_cgen/lowering/quantize_linear.py +1 -1
- emx_onnx_cgen/lowering/range.py +1 -1
- emx_onnx_cgen/lowering/reduce.py +1 -1
- emx_onnx_cgen/lowering/registry.py +24 -5
- emx_onnx_cgen/lowering/reshape.py +1 -1
- emx_onnx_cgen/lowering/resize.py +1 -1
- emx_onnx_cgen/lowering/rms_normalization.py +1 -1
- emx_onnx_cgen/lowering/rotary_embedding.py +165 -0
- emx_onnx_cgen/lowering/scatter_nd.py +1 -1
- emx_onnx_cgen/lowering/shape.py +6 -25
- emx_onnx_cgen/lowering/size.py +1 -1
- emx_onnx_cgen/lowering/slice.py +1 -1
- emx_onnx_cgen/lowering/softmax.py +1 -1
- emx_onnx_cgen/lowering/softmax_cross_entropy_loss.py +1 -1
- emx_onnx_cgen/lowering/split.py +1 -1
- emx_onnx_cgen/lowering/squeeze.py +1 -1
- emx_onnx_cgen/lowering/tensor_scatter.py +110 -0
- emx_onnx_cgen/lowering/tile.py +1 -1
- emx_onnx_cgen/lowering/topk.py +25 -7
- emx_onnx_cgen/lowering/transpose.py +1 -1
- emx_onnx_cgen/lowering/trilu.py +1 -1
- emx_onnx_cgen/lowering/unsqueeze.py +1 -1
- emx_onnx_cgen/lowering/variadic.py +1 -1
- emx_onnx_cgen/lowering/where.py +1 -1
- emx_onnx_cgen/runtime/evaluator.py +325 -1
- emx_onnx_cgen/verification.py +9 -39
- {emx_onnx_cgen-0.3.0.dist-info → emx_onnx_cgen-0.3.2.dist-info}/METADATA +8 -7
- emx_onnx_cgen-0.3.2.dist-info/RECORD +107 -0
- {emx_onnx_cgen-0.3.0.dist-info → emx_onnx_cgen-0.3.2.dist-info}/WHEEL +1 -1
- shared/scalar_functions.py +11 -0
- shared/ulp.py +17 -0
- emx_onnx_cgen-0.3.0.dist-info/RECORD +0 -93
- {emx_onnx_cgen-0.3.0.dist-info → emx_onnx_cgen-0.3.2.dist-info}/entry_points.txt +0 -0
- {emx_onnx_cgen-0.3.0.dist-info → emx_onnx_cgen-0.3.2.dist-info}/top_level.txt +0 -0
emx_onnx_cgen/codegen/c_emitter.py

@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from enum import Enum
 import itertools
 import math
 from math import prod
@@ -20,6 +19,85 @@ from ..ops import (
     binary_op_symbol,
     unary_op_symbol,
 )
+from ..ir.op_base import (
+    BroadcastingOpBase,
+    ConvLikeOpBase,
+    ElementwiseOpBase,
+    GemmLikeOpBase,
+    MatMulLikeOpBase,
+    ReduceOpBase,
+    RenderableOpBase,
+    OpBase,
+    EmitContext,
+)
+from ..ir.op_context import OpContext
+from ..ir.ops import (
+    AdagradOp,
+    ArgReduceOp,
+    AttentionOp,
+    AveragePoolOp,
+    BatchNormOp,
+    BinaryOp,
+    CastOp,
+    ClipOp,
+    ConcatOp,
+    ConstantOfShapeOp,
+    ConvOp,
+    ConvTransposeOp,
+    CumSumOp,
+    DepthToSpaceOp,
+    EinsumKind,
+    EinsumOp,
+    ExpandOp,
+    EyeLikeOp,
+    GatherElementsOp,
+    GatherNDOp,
+    GatherOp,
+    GemmOp,
+    GridSampleOp,
+    GroupNormalizationOp,
+    HardmaxOp,
+    IdentityOp,
+    InstanceNormalizationOp,
+    LayerNormalizationOp,
+    LogSoftmaxOp,
+    LpNormalizationOp,
+    LpPoolOp,
+    LrnOp,
+    LstmOp,
+    MatMulOp,
+    MaxPoolOp,
+    MeanVarianceNormalizationOp,
+    MultiInputBinaryOp,
+    NegativeLogLikelihoodLossOp,
+    NonMaxSuppressionOp,
+    NonZeroOp,
+    OneHotOp,
+    PadOp,
+    QuantizeLinearOp,
+    QLinearMatMulOp,
+    RangeOp,
+    ReduceOp,
+    ReshapeOp,
+    ResizeOp,
+    RMSNormalizationOp,
+    RotaryEmbeddingOp,
+    ScatterNDOp,
+    ShapeOp,
+    SizeOp,
+    SliceOp,
+    SoftmaxCrossEntropyLossOp,
+    SoftmaxOp,
+    SpaceToDepthOp,
+    SplitOp,
+    TensorScatterOp,
+    TileOp,
+    TopKOp,
+    TransposeOp,
+    TriluOp,
+    UnaryOp,
+    WhereOp,
+)
 from shared.scalar_functions import (
     ScalarFunction,
     ScalarFunctionKey,
@@ -150,44 +228,6 @@ _C_KEYWORDS = {
     "while",
 }
 
-@dataclass(frozen=True)
-class BinaryOp:
-    input0: str
-    input1: str
-    output: str
-    function: ScalarFunction
-    operator_kind: OperatorKind
-    input0_shape: tuple[int, ...]
-    input1_shape: tuple[int, ...]
-    shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class MultiInputBinaryOp:
-    inputs: tuple[str, ...]
-    output: str
-    function: ScalarFunction
-    operator_kind: OperatorKind
-    shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class WhereOp:
-    condition: str
-    input_x: str
-    input_y: str
-    output: str
-    condition_shape: tuple[int, ...]
-    x_shape: tuple[int, ...]
-    y_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-
-
 @dataclass(frozen=True)
 class NodeInfo:
     op_type: str
@@ -197,905 +237,6 @@ class NodeInfo:
     attrs: dict[str, object]
 
 
-@dataclass(frozen=True)
-class UnaryOp:
-    input0: str
-    output: str
-    function: ScalarFunction
-    shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-    params: tuple[float, ...] = ()
-
-
-@dataclass(frozen=True)
-class ClipOp:
-    input0: str
-    input_min: str | None
-    input_max: str | None
-    output: str
-    input_shape: tuple[int, ...]
-    min_shape: tuple[int, ...] | None
-    max_shape: tuple[int, ...] | None
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-
-
-
-
-@dataclass(frozen=True)
-class CastOp:
-    input0: str
-    output: str
-    shape: tuple[int, ...]
-    input_dtype: ScalarType
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class MatMulOp:
-    input0: str
-    input1: str
-    output: str
-    input0_shape: tuple[int, ...]
-    input1_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    batch_shape: tuple[int, ...]
-    input0_batch_shape: tuple[int, ...]
-    input1_batch_shape: tuple[int, ...]
-    m: int
-    n: int
-    k: int
-    left_vector: bool
-    right_vector: bool
-    dtype: ScalarType
-
-
-class EinsumKind(str, Enum):
-    REDUCE_ALL = "reduce_all"
-    SUM_J = "sum_j"
-    TRANSPOSE = "transpose"
-    DOT = "dot"
-    BATCH_MATMUL = "batch_matmul"
-    BATCH_DIAGONAL = "batch_diagonal"
-
-
-@dataclass(frozen=True)
-class EinsumOp:
-    inputs: tuple[str, ...]
-    output: str
-    kind: EinsumKind
-    input_shapes: tuple[tuple[int, ...], ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class GemmOp:
-    input_a: str
-    input_b: str
-    input_c: str | None
-    output: str
-    m: int
-    n: int
-    k: int
-    trans_a: bool
-    trans_b: bool
-    alpha: float | int
-    beta: float | int
-    c_shape: tuple[int, ...] | None
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class AttentionOp:
-    input_q: str
-    input_k: str
-    input_v: str
-    input_attn_mask: str | None
-    input_past_key: str | None
-    input_past_value: str | None
-    input_nonpad_kv_seqlen: str | None
-    output: str
-    output_present_key: str | None
-    output_present_value: str | None
-    output_qk_matmul: str | None
-    batch: int
-    q_heads: int
-    kv_heads: int
-    q_seq: int
-    kv_seq: int
-    total_seq: int
-    past_seq: int
-    qk_head_size: int
-    v_head_size: int
-    q_hidden_size: int | None
-    k_hidden_size: int | None
-    v_hidden_size: int | None
-    scale: float
-    is_causal: bool
-    softcap: float
-    qk_matmul_output_mode: int
-    q_rank: int
-    k_rank: int
-    v_rank: int
-    output_rank: int
-    mask_shape: tuple[int, ...] | None
-    mask_is_bool: bool
-    mask_rank: int | None
-    mask_broadcast_batch: bool
-    mask_broadcast_heads: bool
-    mask_broadcast_q_seq: bool
-    mask_q_seq: int | None
-    mask_kv_seq: int | None
-    head_group_size: int
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ConvOp:
-    input0: str
-    weights: str
-    bias: str | None
-    output: str
-    batch: int
-    in_channels: int
-    out_channels: int
-    spatial_rank: int
-    in_spatial: tuple[int, ...]
-    out_spatial: tuple[int, ...]
-    kernel_shape: tuple[int, ...]
-    strides: tuple[int, ...]
-    pads: tuple[int, ...]
-    dilations: tuple[int, ...]
-    group: int
-    dtype: ScalarType
-
-    @property
-    def out_h(self) -> int:
-        if self.spatial_rank < 1:
-            raise ValueError("Conv output height is undefined for spatial_rank < 1")
-        return self.out_spatial[0]
-
-    @property
-    def out_w(self) -> int:
-        if self.spatial_rank < 2:
-            raise ValueError("Conv output width is undefined for spatial_rank < 2")
-        return self.out_spatial[1]
-
-
-@dataclass(frozen=True)
-class ConvTransposeOp:
-    input0: str
-    weights: str
-    bias: str | None
-    output: str
-    batch: int
-    in_channels: int
-    out_channels: int
-    spatial_rank: int
-    in_spatial: tuple[int, ...]
-    out_spatial: tuple[int, ...]
-    kernel_shape: tuple[int, ...]
-    strides: tuple[int, ...]
-    pads: tuple[int, ...]
-    dilations: tuple[int, ...]
-    output_padding: tuple[int, ...]
-    group: int
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class AveragePoolOp:
-    input0: str
-    output: str
-    batch: int
-    channels: int
-    in_h: int
-    in_w: int
-    out_h: int
-    out_w: int
-    kernel_h: int
-    kernel_w: int
-    stride_h: int
-    stride_w: int
-    pad_top: int
-    pad_left: int
-    pad_bottom: int
-    pad_right: int
-    count_include_pad: bool
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class LpPoolOp:
-    input0: str
-    output: str
-    batch: int
-    channels: int
-    in_h: int
-    in_w: int
-    out_h: int
-    out_w: int
-    kernel_h: int
-    kernel_w: int
-    stride_h: int
-    stride_w: int
-    pad_top: int
-    pad_left: int
-    pad_bottom: int
-    pad_right: int
-    p: int
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class QuantizeLinearOp:
-    input0: str
-    scale: str
-    zero_point: str | None
-    output: str
-    input_shape: tuple[int, ...]
-    axis: int | None
-    dtype: ScalarType
-    input_dtype: ScalarType
-    scale_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class SoftmaxOp:
-    input0: str
-    output: str
-    outer: int
-    axis_size: int
-    inner: int
-    axis: int
-    shape: tuple[int, ...]
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class LogSoftmaxOp:
-    input0: str
-    output: str
-    outer: int
-    axis_size: int
-    inner: int
-    axis: int
-    shape: tuple[int, ...]
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class HardmaxOp:
-    input0: str
-    output: str
-    outer: int
-    axis_size: int
-    inner: int
-    axis: int
-    shape: tuple[int, ...]
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class NegativeLogLikelihoodLossOp:
-    input0: str
-    target: str
-    weight: str | None
-    output: str
-    input_shape: tuple[int, ...]
-    target_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    n: int
-    c: int
-    d: int
-    reduction: str
-    ignore_index: int
-    input_dtype: ScalarType
-    weight_dtype: ScalarType | None
-    weight_shape: tuple[int, ...] | None
-    dtype: ScalarType
-    target_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class SoftmaxCrossEntropyLossOp:
-    input0: str
-    target: str
-    weight: str | None
-    output: str
-    log_prob: str | None
-    input_shape: tuple[int, ...]
-    target_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    log_prob_shape: tuple[int, ...] | None
-    n: int
-    c: int
-    d: int
-    reduction: str
-    ignore_index: int | None
-    input_dtype: ScalarType
-    weight_dtype: ScalarType | None
-    weight_shape: tuple[int, ...] | None
-    dtype: ScalarType
-    target_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class BatchNormOp:
-    input0: str
-    scale: str
-    bias: str
-    mean: str
-    variance: str
-    output: str
-    shape: tuple[int, ...]
-    channels: int
-    epsilon: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class LpNormalizationOp:
-    input0: str
-    output: str
-    shape: tuple[int, ...]
-    axis: int
-    p: int
-    outer: int
-    axis_size: int
-    inner: int
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class InstanceNormalizationOp:
-    input0: str
-    scale: str
-    bias: str
-    output: str
-    shape: tuple[int, ...]
-    channels: int
-    spatial_size: int
-    epsilon: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class GroupNormalizationOp:
-    input0: str
-    scale: str
-    bias: str
-    output: str
-    shape: tuple[int, ...]
-    channels: int
-    num_groups: int
-    group_size: int
-    spatial_size: int
-    epsilon: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class LayerNormalizationOp:
-    input0: str
-    scale: str
-    bias: str | None
-    output: str
-    mean_output: str | None
-    invstd_output: str | None
-    shape: tuple[int, ...]
-    normalized_shape: tuple[int, ...]
-    scale_shape: tuple[int, ...]
-    bias_shape: tuple[int, ...] | None
-    outer: int
-    inner: int
-    axis: int
-    epsilon: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class MeanVarianceNormalizationOp:
-    input0: str
-    output: str
-    shape: tuple[int, ...]
-    axes: tuple[int, ...]
-    non_axes: tuple[int, ...]
-    reduce_count: int
-    epsilon: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class RMSNormalizationOp:
-    input0: str
-    scale: str
-    output: str
-    shape: tuple[int, ...]
-    normalized_shape: tuple[int, ...]
-    scale_shape: tuple[int, ...]
-    outer: int
-    inner: int
-    axis: int
-    epsilon: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class LrnOp:
-    input0: str
-    output: str
-    shape: tuple[int, ...]
-    channels: int
-    size: int
-    half: int
-    alpha: float
-    beta: float
-    bias: float
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class LstmOp:
-    input_x: str
-    input_w: str
-    input_r: str
-    input_b: str | None
-    input_sequence_lens: str | None
-    input_initial_h: str | None
-    input_initial_c: str | None
-    input_p: str | None
-    output_y: str | None
-    output_y_h: str | None
-    output_y_c: str | None
-    seq_length: int
-    batch_size: int
-    input_size: int
-    hidden_size: int
-    num_directions: int
-    direction: str
-    layout: int
-    input_forget: int
-    clip: float | None
-    activation_kinds: tuple[int, ...]
-    activation_alphas: tuple[float, ...]
-    activation_betas: tuple[float, ...]
-    dtype: ScalarType
-    sequence_lens_dtype: ScalarType | None
-
-
-@dataclass(frozen=True)
-class MaxPoolOp:
-    input0: str
-    output: str
-    indices: str | None
-    batch: int
-    channels: int
-    spatial_rank: int
-    in_spatial: tuple[int, ...]
-    out_spatial: tuple[int, ...]
-    kernel_shape: tuple[int, ...]
-    strides: tuple[int, ...]
-    pads: tuple[int, ...]
-    dilations: tuple[int, ...]
-    ceil_mode: bool
-    storage_order: int
-    dtype: ScalarType
-    indices_dtype: ScalarType | None
-
-
-@dataclass(frozen=True)
-class ConcatOp:
-    inputs: tuple[str, ...]
-    output: str
-    axis: int
-    input_shapes: tuple[tuple[int, ...], ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class GatherElementsOp:
-    data: str
-    indices: str
-    output: str
-    axis: int
-    data_shape: tuple[int, ...]
-    indices_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    indices_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class GatherOp:
-    data: str
-    indices: str
-    output: str
-    axis: int
-    data_shape: tuple[int, ...]
-    indices_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    indices_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class GatherNDOp:
-    data: str
-    indices: str
-    output: str
-    batch_dims: int
-    data_shape: tuple[int, ...]
-    indices_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    indices_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ScatterNDOp:
-    data: str
-    indices: str
-    updates: str
-    output: str
-    data_shape: tuple[int, ...]
-    indices_shape: tuple[int, ...]
-    updates_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    reduction: str
-    dtype: ScalarType
-    indices_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class TransposeOp:
-    input0: str
-    output: str
-    perm: tuple[int, ...]
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ReshapeOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class IdentityOp:
-    input0: str
-    output: str
-    shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class EyeLikeOp:
-    input0: str
-    output: str
-    output_shape: tuple[int, ...]
-    k: int
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class TriluOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    upper: bool
-    k_value: int
-    k_input: str | None
-    k_input_shape: tuple[int, ...] | None
-    k_input_dtype: ScalarType | None
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class TileOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    repeats: tuple[int, ...]
-    input_strides: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class PadOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    pads_begin: tuple[int, ...] | None
-    pads_end: tuple[int, ...] | None
-    pads_input: str | None
-    pads_shape: tuple[int, ...] | None
-    pads_dtype: ScalarType | None
-    pads_axis_map: tuple[int | None, ...] | None
-    pads_values: tuple[int, ...] | None
-    axes_input: str | None
-    axes_shape: tuple[int, ...] | None
-    axes_dtype: ScalarType | None
-    mode: str
-    value: float | int | bool
-    value_input: str | None
-    value_shape: tuple[int, ...] | None
-    dtype: ScalarType
-    input_dtype: ScalarType
-    input_strides: tuple[int, ...]
-
-
-@dataclass(frozen=True)
-class DepthToSpaceOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    blocksize: int
-    mode: str
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class SpaceToDepthOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    blocksize: int
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class SliceOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    starts: tuple[int, ...] | None
-    steps: tuple[int, ...] | None
-    axes: tuple[int, ...] | None
-    starts_input: str | None
-    ends_input: str | None
-    axes_input: str | None
-    steps_input: str | None
-    starts_shape: tuple[int, ...] | None
-    ends_shape: tuple[int, ...] | None
-    axes_shape: tuple[int, ...] | None
-    steps_shape: tuple[int, ...] | None
-    starts_dtype: ScalarType | None
-    ends_dtype: ScalarType | None
-    axes_dtype: ScalarType | None
-    steps_dtype: ScalarType | None
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ResizeOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    scales: tuple[float, ...]
-    scales_input: str | None
-    sizes_input: str | None
-    roi_input: str | None
-    axes: tuple[int, ...]
-    scales_shape: tuple[int, ...] | None
-    sizes_shape: tuple[int, ...] | None
-    roi_shape: tuple[int, ...] | None
-    scales_dtype: ScalarType | None
-    sizes_dtype: ScalarType | None
-    roi_dtype: ScalarType | None
-    scales_axes: tuple[int, ...] | None
-    sizes_axes: tuple[int, ...] | None
-    roi_axes: tuple[int, ...] | None
-    mode: str
-    coordinate_transformation_mode: str
-    nearest_mode: str
-    cubic_coeff_a: float
-    exclude_outside: bool
-    extrapolation_value: float
-    antialias: bool
-    keep_aspect_ratio_policy: str
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class GridSampleOp:
-    input0: str
-    grid: str
-    output: str
-    input_shape: tuple[int, ...]
-    grid_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    spatial_rank: int
-    input_spatial: tuple[int, ...]
-    output_spatial: tuple[int, ...]
-    mode: str
-    padding_mode: str
-    align_corners: bool
-    dtype: ScalarType
-    grid_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ReduceOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    axes: tuple[int, ...]
-    axes_input: str | None
-    axes_input_shape: tuple[int, ...] | None
-    axes_input_dtype: ScalarType | None
-    keepdims: bool
-    noop_with_empty_axes: bool
-    reduce_kind: str
-    reduce_count: int | None
-    dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ArgReduceOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    axis: int
-    keepdims: bool
-    select_last_index: bool
-    reduce_kind: str
-    input_dtype: ScalarType
-    output_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class TopKOp:
-    input0: str
-    output_values: str
-    output_indices: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    axis: int
-    k: int
-    largest: bool
-    sorted: bool
-    input_dtype: ScalarType
-    output_values_dtype: ScalarType
-    output_indices_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ConstantOfShapeOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    shape: tuple[int, ...]
-    value: float | int | bool
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ShapeOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    values: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class SizeOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    value: int
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class NonZeroOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class ExpandOp:
-    input0: str
-    output: str
-    input_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    input_shape_padded: tuple[int, ...]
-    input_strides: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class CumSumOp:
-    input0: str
-    axis_input: str | None
-    axis_input_dtype: ScalarType | None
-    axis: int | None
-    output: str
-    input_shape: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-    exclusive: bool
-    reverse: bool
-
-
-@dataclass(frozen=True)
-class RangeOp:
-    start: str
-    limit: str
-    delta: str
-    output: str
-    output_shape: tuple[int, ...]
-    length: int
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class OneHotOp:
-    indices: str
-    depth: str
-    values: str
-    output: str
-    axis: int
-    indices_shape: tuple[int, ...]
-    values_shape: tuple[int, ...]
-    output_shape: tuple[int, ...]
-    depth_dim: int
-    dtype: ScalarType
-    indices_dtype: ScalarType
-    depth_dtype: ScalarType
-
-
-@dataclass(frozen=True)
-class SplitOp:
-    input0: str
-    outputs: tuple[str, ...]
-    input_shape: tuple[int, ...]
-    output_shapes: tuple[tuple[int, ...], ...]
-    axis: int
-    split_sizes: tuple[int, ...]
-    dtype: ScalarType
-    input_dtype: ScalarType
-
-
 @dataclass(frozen=True)
 class ConstTensor:
     name: str
@@ -1135,78 +276,29 @@ class ModelHeader:
 
 @dataclass(frozen=True)
 class LoweredModel:
-    name: str
-    input_names: tuple[str, ...]
-    input_shapes: tuple[tuple[int, ...], ...]
-    input_dtypes: tuple[ScalarType, ...]
-    output_names: tuple[str, ...]
-    output_shapes: tuple[tuple[int, ...], ...]
-    output_dtypes: tuple[ScalarType, ...]
-    constants: tuple[ConstTensor, ...]
-    ops: tuple[
-        BinaryOp
-        | MultiInputBinaryOp
-        | WhereOp
-        | UnaryOp
-        | ClipOp
-        | CastOp
-        | QuantizeLinearOp
-        | MatMulOp
-        | EinsumOp
-        | GemmOp
-        | AttentionOp
-        | ConvOp
-        | ConvTransposeOp
-        | AveragePoolOp
-        | LpPoolOp
-        | BatchNormOp
-        | LpNormalizationOp
-        | InstanceNormalizationOp
-        | GroupNormalizationOp
-        | LayerNormalizationOp
-        | MeanVarianceNormalizationOp
-        | RMSNormalizationOp
-        | LrnOp
-        | LstmOp
-        | SoftmaxOp
-        | LogSoftmaxOp
-        | HardmaxOp
-        | NegativeLogLikelihoodLossOp
-        | SoftmaxCrossEntropyLossOp
-        | MaxPoolOp
-        | ConcatOp
-        | GatherElementsOp
-        | GatherOp
-        | GatherNDOp
-        | ScatterNDOp
-        | TransposeOp
-        | ReshapeOp
-        | IdentityOp
-        | EyeLikeOp
-        | TriluOp
-        | TileOp
-        | PadOp
-        | DepthToSpaceOp
-        | SpaceToDepthOp
-        | SliceOp
-        | ResizeOp
-        | GridSampleOp
-        | ReduceOp
-        | ArgReduceOp
-        | TopKOp
-        | ConstantOfShapeOp
-        | ShapeOp
-        | SizeOp
-        | NonZeroOp
-        | ExpandOp
-        | CumSumOp
-        | RangeOp
-        | OneHotOp
-        | SplitOp,
-        ...,
-    ]
+    name: str
+    input_names: tuple[str, ...]
+    input_shapes: tuple[tuple[int, ...], ...]
+    input_dtypes: tuple[ScalarType, ...]
+    output_names: tuple[str, ...]
+    output_shapes: tuple[tuple[int, ...], ...]
+    output_dtypes: tuple[ScalarType, ...]
+    constants: tuple[ConstTensor, ...]
+    ops: tuple[OpBase, ...]
     node_infos: tuple[NodeInfo, ...]
     header: ModelHeader
+    op_context: OpContext
+
+
+@dataclass
+class _EmitState:
+    model: LoweredModel
+    templates: dict[str, Template]
+    scalar_registry: ScalarFunctionRegistry
+    dim_args: str
+    tensor_dim_names: Mapping[str, Mapping[int, str]]
+    op_context: OpContext
+    value_name_map: Mapping[str, str]
 
 
 class CEmitter:
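
The hunk above is the core of the 0.3.0 → 0.3.2 refactor: LoweredModel.ops collapses from a roughly sixty-member union type into tuple[OpBase, ...], and per-value metadata moves into a shared OpContext carried on the model. Below is a minimal sketch of a context exposing the interface this diff relies on (shape, dtype, require_derived, copy_derived); the storage layout and keying are assumptions for illustration, not the actual emx_onnx_cgen/ir/op_context.py implementation:

    # Sketch only; the real OpContext lives in emx_onnx_cgen/ir/op_context.py.
    from dataclasses import dataclass, field

    @dataclass
    class OpContextSketch:
        # value name -> static shape / scalar dtype, recorded during lowering
        shapes: dict[str, tuple[int, ...]] = field(default_factory=dict)
        dtypes: dict[str, object] = field(default_factory=dict)
        # op-specific derived facts, keyed per op instance
        derived: dict[tuple[int, str], object] = field(default_factory=dict)

        def shape(self, name: str) -> tuple[int, ...]:
            return self.shapes[name]

        def dtype(self, name: str) -> object:
            return self.dtypes[name]

        def require_derived(self, op: object, key: str) -> object:
            return self.derived[(id(op), key)]

        def copy_derived(self, source_op: object, target_op: object) -> None:
            # Sanitization rebuilds each frozen op as a new object, so derived
            # facts must be re-keyed onto the replacement instance.
            for (op_id, key), value in list(self.derived.items()):
                if op_id == id(source_op):
                    self.derived[(id(target_op), key)] = value
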
@@ -1235,6 +327,7 @@ class CEmitter:
         if large_weight_threshold < 0:
             raise CodegenError("large_weight_threshold must be >= 0")
         self._large_weight_threshold = large_weight_threshold
+        self._emit_state: _EmitState | None = None
 
     @staticmethod
     def _sanitize_identifier(name: str) -> str:
@@ -1297,6 +390,26 @@ class CEmitter:
             mapped[key] = unique
         return mapped
 
+    def _ctx_name(self, name: str) -> str:
+        if self._emit_state is None:
+            raise CodegenError("Emitter state not initialized")
+        return self._emit_state.value_name_map.get(name, name)
+
+    def _ctx_shape(self, name: str) -> tuple[int, ...]:
+        if self._emit_state is None:
+            raise CodegenError("Emitter state not initialized")
+        return self._emit_state.op_context.shape(self._ctx_name(name))
+
+    def _ctx_dtype(self, name: str) -> ScalarType:
+        if self._emit_state is None:
+            raise CodegenError("Emitter state not initialized")
+        return self._emit_state.op_context.dtype(self._ctx_name(name))
+
+    def _derived(self, op: OpBase, key: str) -> object:
+        if self._emit_state is None:
+            raise CodegenError("Emitter state not initialized")
+        return self._emit_state.op_context.require_derived(op, key)
+
     @staticmethod
     def _build_param_decls(
         specs: Sequence[tuple[str | None, str, str, bool]]
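
These _ctx_* helpers are only usable mid-emit: each raises until _emit_state is populated, then resolves an emitter-local (sanitized or temporary) name back to the original graph name before querying the OpContext. Hypothetical usage from inside an op renderer (the tensor name and derived key are invented for illustration):

    # Assumes emission is underway, i.e. self._emit_state has been set.
    shape = self._ctx_shape("input_0")     # static shape, e.g. (1, 3, 224, 224)
    dtype = self._ctx_dtype("input_0")     # the tensor's ScalarType
    plan = self._derived(op, "tile_plan")  # "tile_plan" is a hypothetical key
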
@@ -1334,10 +447,12 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
         | AttentionOp
+        | RotaryEmbeddingOp
         | ConvOp
         | AveragePoolOp
         | BatchNormOp
@@ -1349,6 +464,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -1360,6 +476,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -1379,6 +496,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -1409,6 +527,18 @@ class CEmitter:
                 names.append(op.zero_point)
             names.append(op.output)
             return tuple(names)
+        if isinstance(op, QLinearMatMulOp):
+            return (
+                op.input0,
+                op.input0_scale,
+                op.input0_zero_point,
+                op.input1,
+                op.input1_scale,
+                op.input1_zero_point,
+                op.output_scale,
+                op.output_zero_point,
+                op.output,
+            )
         if isinstance(op, MatMulOp):
             return (op.input0, op.input1, op.output)
         if isinstance(op, EinsumOp):
@@ -1437,6 +567,12 @@ class CEmitter:
             if op.output_qk_matmul is not None:
                 names.append(op.output_qk_matmul)
             return tuple(names)
+        if isinstance(op, RotaryEmbeddingOp):
+            names = [op.input0, op.cos_cache, op.sin_cache]
+            if op.position_ids is not None:
+                names.append(op.position_ids)
+            names.append(op.output)
+            return tuple(names)
         if isinstance(op, ConvOp):
             names = [op.input0, op.weights]
             if op.bias is not None:
@@ -1494,6 +630,16 @@ class CEmitter:
             if op.output_y_c is not None:
                 names.append(op.output_y_c)
             return tuple(names)
+        if isinstance(op, AdagradOp):
+            return (
+                op.rate,
+                op.timestep,
+                *op.inputs,
+                *op.gradients,
+                *op.accumulators,
+                *op.outputs,
+                *op.accumulator_outputs,
+            )
         if isinstance(op, (SoftmaxOp, LogSoftmaxOp, HardmaxOp)):
             return (op.input0, op.output)
         if isinstance(op, NegativeLogLikelihoodLossOp):
@@ -1523,6 +669,12 @@ class CEmitter:
             return (op.data, op.indices, op.output)
         if isinstance(op, ScatterNDOp):
             return (op.data, op.indices, op.updates, op.output)
+        if isinstance(op, TensorScatterOp):
+            names = [op.past_cache, op.update]
+            if op.write_indices is not None:
+                names.append(op.write_indices)
+            names.append(op.output)
+            return tuple(names)
         if isinstance(op, ConcatOp):
             return (*op.inputs, op.output)
         if isinstance(op, ConstantOfShapeOp):
@@ -1533,6 +685,16 @@ class CEmitter:
             return (op.input0, op.output)
         if isinstance(op, NonZeroOp):
             return (op.input0, op.output)
+        if isinstance(op, NonMaxSuppressionOp):
+            names = [op.boxes, op.scores]
+            if op.max_output_boxes_per_class is not None:
+                names.append(op.max_output_boxes_per_class)
+            if op.iou_threshold is not None:
+                names.append(op.iou_threshold)
+            if op.score_threshold is not None:
+                names.append(op.score_threshold)
+            names.append(op.output)
+            return tuple(names)
         if isinstance(op, ExpandOp):
             return (op.input0, op.output)
         if isinstance(op, CumSumOp):
@@ -1653,10 +815,12 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
         | AttentionOp
+        | RotaryEmbeddingOp
         | ConvOp
         | ConvTransposeOp
         | AveragePoolOp
@@ -1670,6 +834,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -1681,6 +846,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -1700,6 +866,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -1714,10 +881,12 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
         | AttentionOp
+        | RotaryEmbeddingOp
         | ConvOp
         | ConvTransposeOp
         | AveragePoolOp
@@ -1731,6 +900,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -1742,6 +912,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -1761,6 +932,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -1844,6 +1016,47 @@ class CEmitter:
                 input_dtype=op.input_dtype,
                 scale_dtype=op.scale_dtype,
             )
+        if isinstance(op, QLinearMatMulOp):
+            return QLinearMatMulOp(
+                input0=name_map.get(op.input0, op.input0),
+                input0_scale=name_map.get(op.input0_scale, op.input0_scale),
+                input0_zero_point=name_map.get(
+                    op.input0_zero_point, op.input0_zero_point
+                ),
+                input1=name_map.get(op.input1, op.input1),
+                input1_scale=name_map.get(op.input1_scale, op.input1_scale),
+                input1_zero_point=name_map.get(
+                    op.input1_zero_point, op.input1_zero_point
+                ),
+                output_scale=name_map.get(op.output_scale, op.output_scale),
+                output_zero_point=name_map.get(
+                    op.output_zero_point, op.output_zero_point
+                ),
+                output=name_map.get(op.output, op.output),
+                input0_shape=op.input0_shape,
+                input1_shape=op.input1_shape,
+                output_shape=op.output_shape,
+                batch_shape=op.batch_shape,
+                input0_batch_shape=op.input0_batch_shape,
+                input1_batch_shape=op.input1_batch_shape,
+                m=op.m,
+                n=op.n,
+                k=op.k,
+                left_vector=op.left_vector,
+                right_vector=op.right_vector,
+                input0_dtype=op.input0_dtype,
+                input1_dtype=op.input1_dtype,
+                dtype=op.dtype,
+                input0_scale_dtype=op.input0_scale_dtype,
+                input1_scale_dtype=op.input1_scale_dtype,
+                output_scale_dtype=op.output_scale_dtype,
+                input0_scale_shape=op.input0_scale_shape,
+                input1_scale_shape=op.input1_scale_shape,
+                output_scale_shape=op.output_scale_shape,
+                input0_zero_shape=op.input0_zero_shape,
+                input1_zero_shape=op.input1_zero_shape,
+                output_zero_shape=op.output_zero_shape,
+            )
         if isinstance(op, MatMulOp):
             return MatMulOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -1946,6 +1159,30 @@ class CEmitter:
                 head_group_size=op.head_group_size,
                 dtype=op.dtype,
             )
+        if isinstance(op, RotaryEmbeddingOp):
+            return RotaryEmbeddingOp(
+                input0=name_map.get(op.input0, op.input0),
+                cos_cache=name_map.get(op.cos_cache, op.cos_cache),
+                sin_cache=name_map.get(op.sin_cache, op.sin_cache),
+                position_ids=self._map_optional_name(
+                    name_map, op.position_ids
+                ),
+                output=name_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                cos_shape=op.cos_shape,
+                sin_shape=op.sin_shape,
+                position_ids_shape=op.position_ids_shape,
+                dtype=op.dtype,
+                position_ids_dtype=op.position_ids_dtype,
+                rotary_dim=op.rotary_dim,
+                rotary_dim_half=op.rotary_dim_half,
+                head_size=op.head_size,
+                num_heads=op.num_heads,
+                seq_len=op.seq_len,
+                batch=op.batch,
+                input_rank=op.input_rank,
+                interleaved=op.interleaved,
+            )
         if isinstance(op, ConvOp):
             return ConvOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2168,6 +1405,33 @@ class CEmitter:
                 dtype=op.dtype,
                 sequence_lens_dtype=op.sequence_lens_dtype,
             )
+        if isinstance(op, AdagradOp):
+            return AdagradOp(
+                rate=name_map.get(op.rate, op.rate),
+                timestep=name_map.get(op.timestep, op.timestep),
+                inputs=tuple(name_map.get(name, name) for name in op.inputs),
+                gradients=tuple(
+                    name_map.get(name, name) for name in op.gradients
+                ),
+                accumulators=tuple(
+                    name_map.get(name, name) for name in op.accumulators
+                ),
+                outputs=tuple(name_map.get(name, name) for name in op.outputs),
+                accumulator_outputs=tuple(
+                    name_map.get(name, name)
+                    for name in op.accumulator_outputs
+                ),
+                rate_shape=op.rate_shape,
+                timestep_shape=op.timestep_shape,
+                tensor_shapes=op.tensor_shapes,
+                output_shapes=op.output_shapes,
+                dtype=op.dtype,
+                rate_dtype=op.rate_dtype,
+                timestep_dtype=op.timestep_dtype,
+                norm_coefficient=op.norm_coefficient,
+                epsilon=op.epsilon,
+                decay_factor=op.decay_factor,
+            )
         if isinstance(op, SoftmaxOp):
             return SoftmaxOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2323,6 +1587,25 @@ class CEmitter:
                 dtype=op.dtype,
                 indices_dtype=op.indices_dtype,
             )
+        if isinstance(op, TensorScatterOp):
+            return TensorScatterOp(
+                past_cache=name_map.get(op.past_cache, op.past_cache),
+                update=name_map.get(op.update, op.update),
+                write_indices=(
+                    name_map.get(op.write_indices, op.write_indices)
+                    if op.write_indices is not None
+                    else None
+                ),
+                output=name_map.get(op.output, op.output),
+                past_cache_shape=op.past_cache_shape,
+                update_shape=op.update_shape,
+                output_shape=op.output_shape,
+                write_indices_shape=op.write_indices_shape,
+                axis=op.axis,
+                mode=op.mode,
+                dtype=op.dtype,
+                write_indices_dtype=op.write_indices_dtype,
+            )
         if isinstance(op, TransposeOp):
             return TransposeOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2583,6 +1866,33 @@ class CEmitter:
                 dtype=op.dtype,
                 input_dtype=op.input_dtype,
             )
+        if isinstance(op, NonMaxSuppressionOp):
+            return NonMaxSuppressionOp(
+                boxes=name_map.get(op.boxes, op.boxes),
+                scores=name_map.get(op.scores, op.scores),
+                max_output_boxes_per_class=self._map_optional_name(
+                    name_map, op.max_output_boxes_per_class
+                ),
+                iou_threshold=self._map_optional_name(
+                    name_map, op.iou_threshold
+                ),
+                score_threshold=self._map_optional_name(
+                    name_map, op.score_threshold
+                ),
+                output=name_map.get(op.output, op.output),
+                boxes_shape=op.boxes_shape,
+                scores_shape=op.scores_shape,
+                output_shape=op.output_shape,
+                center_point_box=op.center_point_box,
+                boxes_dtype=op.boxes_dtype,
+                output_dtype=op.output_dtype,
+                max_output_dtype=op.max_output_dtype,
+                max_output_shape=op.max_output_shape,
+                iou_threshold_dtype=op.iou_threshold_dtype,
+                iou_threshold_shape=op.iou_threshold_shape,
+                score_threshold_dtype=op.score_threshold_dtype,
+                score_threshold_shape=op.score_threshold_shape,
+            )
         if isinstance(op, ExpandOp):
             return ExpandOp(
                 input0=name_map.get(op.input0, op.input0),
@@ -2684,12 +1994,34 @@ class CEmitter:
             ops=ops,
             node_infos=model.node_infos,
             header=model.header,
+            op_context=model.op_context,
         )
         return sanitized, name_map
 
     def _sanitize_model_names(self, model: LoweredModel) -> LoweredModel:
         return self._sanitize_model_names_with_map(model)[0]
 
+    @staticmethod
+    def _copy_derived(
+        op_context: OpContext,
+        source_ops: Sequence[OpBase],
+        target_ops: Sequence[OpBase],
+    ) -> None:
+        for source_op, target_op in zip(source_ops, target_ops):
+            op_context.copy_derived(source_op, target_op)
+
+    @staticmethod
+    def _build_value_name_map(
+        name_map: Mapping[str, str],
+        temp_name_map: Mapping[str, str],
+    ) -> dict[str, str]:
+        reverse_name_map = {sanitized: original for original, sanitized in name_map.items()}
+        value_name_map = dict(reverse_name_map)
+        for sanitized_name, temp_name in temp_name_map.items():
+            original_name = reverse_name_map.get(sanitized_name, sanitized_name)
+            value_name_map[temp_name] = original_name
+        return value_name_map
+
     @staticmethod
     def _sanitize_testbench_inputs(
         testbench_inputs: Mapping[str, tuple[float | int | bool, ...]] | None,
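
_build_value_name_map composes the emitter's two renaming passes so that any emitter-side name resolves back to the original graph name. A worked example (all names invented):

    name_map      = {"layer1/add:0": "layer1_add_0"}  # original -> sanitized
    temp_name_map = {"layer1_add_0": "tmp_7"}         # sanitized -> temp buffer
    # reverse_name_map == {"layer1_add_0": "layer1/add:0"}
    # result == {"layer1_add_0": "layer1/add:0", "tmp_7": "layer1/add:0"}

This is the mapping _ctx_name consults, which is why _ctx_shape and _ctx_dtype accept either a sanitized or a temporary name.
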
@@ -2716,10 +2048,16 @@ class CEmitter:
             "quantize_linear": self._env.get_template(
                 "quantize_linear_op.c.j2"
             ),
+            "qlinear_matmul": self._env.get_template(
+                "qlinear_matmul_op.c.j2"
+            ),
             "matmul": self._env.get_template("matmul_op.c.j2"),
             "einsum": self._env.get_template("einsum_op.c.j2"),
             "gemm": self._env.get_template("gemm_op.c.j2"),
             "attention": self._env.get_template("attention_op.c.j2"),
+            "rotary_embedding": self._env.get_template(
+                "rotary_embedding_op.c.j2"
+            ),
             "conv": self._env.get_template("conv_op.c.j2"),
             "conv_transpose": self._env.get_template(
                 "conv_transpose_op.c.j2"
@@ -2743,6 +2081,7 @@ class CEmitter:
             "rms_norm": self._env.get_template("rms_normalization_op.c.j2"),
             "lrn": self._env.get_template("lrn_op.c.j2"),
             "lstm": self._env.get_template("lstm_op.c.j2"),
+            "adagrad": self._env.get_template("adagrad_op.c.j2"),
             "softmax": self._env.get_template("softmax_op.c.j2"),
             "logsoftmax": self._env.get_template("logsoftmax_op.c.j2"),
             "hardmax": self._env.get_template("hardmax_op.c.j2"),
@@ -2758,6 +2097,9 @@ class CEmitter:
|
|
|
2758
2097
|
"gather": self._env.get_template("gather_op.c.j2"),
|
|
2759
2098
|
"gather_nd": self._env.get_template("gather_nd_op.c.j2"),
|
|
2760
2099
|
"scatter_nd": self._env.get_template("scatter_nd_op.c.j2"),
|
|
2100
|
+
"tensor_scatter": self._env.get_template(
|
|
2101
|
+
"tensor_scatter_op.c.j2"
|
|
2102
|
+
),
|
|
2761
2103
|
"transpose": self._env.get_template("transpose_op.c.j2"),
|
|
2762
2104
|
"reshape": self._env.get_template("reshape_op.c.j2"),
|
|
2763
2105
|
"identity": self._env.get_template("identity_op.c.j2"),
|
|
@@ -2785,6 +2127,9 @@ class CEmitter:
|
|
|
2785
2127
|
"shape": self._env.get_template("shape_op.c.j2"),
|
|
2786
2128
|
"size": self._env.get_template("size_op.c.j2"),
|
|
2787
2129
|
"nonzero": self._env.get_template("nonzero_op.c.j2"),
|
|
2130
|
+
"nonmax_suppression": self._env.get_template(
|
|
2131
|
+
"nonmax_suppression_op.c.j2"
|
|
2132
|
+
),
|
|
2788
2133
|
"expand": self._env.get_template("expand_op.c.j2"),
|
|
2789
2134
|
"cumsum": self._env.get_template("cumsum_op.c.j2"),
|
|
2790
2135
|
"range": self._env.get_template("range_op.c.j2"),
|
|
@@ -2806,7 +2151,9 @@ class CEmitter:
|
|
|
2806
2151
|
variable_dim_inputs: Mapping[int, Mapping[int, str]] | None = None,
|
|
2807
2152
|
variable_dim_outputs: Mapping[int, Mapping[int, str]] | None = None,
|
|
2808
2153
|
) -> str:
|
|
2154
|
+
original_model = model
|
|
2809
2155
|
model, name_map = self._sanitize_model_names_with_map(model)
|
|
2156
|
+
self._copy_derived(model.op_context, original_model.ops, model.ops)
|
|
2810
2157
|
testbench_inputs = self._sanitize_testbench_inputs(
|
|
2811
2158
|
testbench_inputs, name_map
|
|
2812
2159
|
)
|
|
@@ -2832,68 +2179,17 @@ class CEmitter:
         self._env.globals["dim_args"] = dim_args
         templates = self._load_templates(emit_testbench)
         scalar_registry = ScalarFunctionRegistry()
-        binary_template = templates["binary"]
-        multi_input_template = templates["multi_input"]
-        where_template = templates["where"]
-        unary_template = templates["unary"]
-        clip_template = templates["clip"]
-        cast_template = templates["cast"]
-        quantize_linear_template = templates["quantize_linear"]
-        matmul_template = templates["matmul"]
-        einsum_template = templates["einsum"]
-        gemm_template = templates["gemm"]
-        attention_template = templates["attention"]
-        conv_template = templates["conv"]
-        conv_transpose_template = templates["conv_transpose"]
-        avg_pool_template = templates["avg_pool"]
-        lp_pool_template = templates["lp_pool"]
-        batch_norm_template = templates["batch_norm"]
-        lp_norm_template = templates["lp_norm"]
-        instance_norm_template = templates["instance_norm"]
-        group_norm_template = templates["group_norm"]
-        layer_norm_template = templates["layer_norm"]
-        mean_variance_norm_template = templates["mean_variance_norm"]
-        rms_norm_template = templates["rms_norm"]
-        lrn_template = templates["lrn"]
-        lstm_template = templates["lstm"]
-        softmax_template = templates["softmax"]
-        logsoftmax_template = templates["logsoftmax"]
-        hardmax_template = templates["hardmax"]
-        nllloss_template = templates["nllloss"]
-        softmax_cross_entropy_loss_template = templates["softmax_cross_entropy_loss"]
-        maxpool_template = templates["maxpool"]
-        concat_template = templates["concat"]
-        gather_elements_template = templates["gather_elements"]
-        gather_template = templates["gather"]
-        gather_nd_template = templates["gather_nd"]
-        scatter_nd_template = templates["scatter_nd"]
-        transpose_template = templates["transpose"]
-        reshape_template = templates["reshape"]
-        identity_template = templates["identity"]
-        eye_like_template = templates["eye_like"]
-        trilu_template = templates["trilu"]
-        tile_template = templates["tile"]
-        pad_template = templates["pad"]
-        depth_to_space_template = templates["depth_to_space"]
-        space_to_depth_template = templates["space_to_depth"]
-        slice_template = templates["slice"]
-        slice_dynamic_template = templates["slice_dynamic"]
-        resize_template = templates["resize"]
-        grid_sample_template = templates["grid_sample"]
-        reduce_template = templates["reduce"]
-        reduce_dynamic_template = templates["reduce_dynamic"]
-        arg_reduce_template = templates["arg_reduce"]
-        topk_template = templates["topk"]
-        constant_of_shape_template = templates["constant_of_shape"]
-        shape_template = templates["shape"]
-        size_template = templates["size"]
-        nonzero_template = templates["nonzero"]
-        expand_template = templates["expand"]
-        cumsum_template = templates["cumsum"]
-        range_template = templates["range"]
-        one_hot_template = templates["one_hot"]
-        split_template = templates["split"]
         testbench_template = templates.get("testbench")
+        initial_name_map = self._build_value_name_map(name_map, {})
+        self._emit_state = _EmitState(
+            model=model,
+            templates=templates,
+            scalar_registry=scalar_registry,
+            dim_args=dim_args,
+            tensor_dim_names=tensor_dim_names,
+            op_context=model.op_context,
+            value_name_map=initial_name_map,
+        )
         reserved_names = {
             model.name,
             *model.input_names,
@@ -2905,83 +2201,12 @@ class CEmitter:
             original: buffer.name for original, buffer in temp_buffers.items()
         }
         resolved_ops = [self._resolve_op(op, temp_name_map) for op in model.ops]
+        self._copy_derived(model.op_context, model.ops, resolved_ops)
+        value_name_map = self._build_value_name_map(name_map, temp_name_map)
+        self._emit_state.value_name_map = value_name_map
         self._propagate_tensor_dim_names(resolved_ops, tensor_dim_names)
         operator_fns = "\n\n".join(
-            self._render_op(
-                model,
-                op,
-                index,
-                array_suffix="",
-                loop_vars=(),
-                c_type=self._op_output_dtype(op).c_type,
-                zero_literal=self._op_output_dtype(op).zero_literal,
-                min_literal=self._op_output_dtype(op).min_literal,
-                max_literal=self._op_output_dtype(op).max_literal,
-                binary_template=binary_template,
-                multi_input_template=multi_input_template,
-                where_template=where_template,
-                unary_template=unary_template,
-                clip_template=clip_template,
-                cast_template=cast_template,
-                quantize_linear_template=quantize_linear_template,
-                matmul_template=matmul_template,
-                einsum_template=einsum_template,
-                gemm_template=gemm_template,
-                attention_template=attention_template,
-                conv_template=conv_template,
-                conv_transpose_template=conv_transpose_template,
-                avg_pool_template=avg_pool_template,
-                lp_pool_template=lp_pool_template,
-                batch_norm_template=batch_norm_template,
-                lp_norm_template=lp_norm_template,
-                instance_norm_template=instance_norm_template,
-                group_norm_template=group_norm_template,
-                layer_norm_template=layer_norm_template,
-                mean_variance_norm_template=mean_variance_norm_template,
-                rms_norm_template=rms_norm_template,
-                lrn_template=lrn_template,
-                lstm_template=lstm_template,
-                softmax_template=softmax_template,
-                logsoftmax_template=logsoftmax_template,
-                hardmax_template=hardmax_template,
-                nllloss_template=nllloss_template,
-                softmax_cross_entropy_loss_template=softmax_cross_entropy_loss_template,
-                maxpool_template=maxpool_template,
-                concat_template=concat_template,
-                gather_elements_template=gather_elements_template,
-                gather_template=gather_template,
-                gather_nd_template=gather_nd_template,
-                scatter_nd_template=scatter_nd_template,
-                transpose_template=transpose_template,
-                reshape_template=reshape_template,
-                identity_template=identity_template,
-                eye_like_template=eye_like_template,
-                trilu_template=trilu_template,
-                tile_template=tile_template,
-                pad_template=pad_template,
-                depth_to_space_template=depth_to_space_template,
-                space_to_depth_template=space_to_depth_template,
-                slice_template=slice_template,
-                slice_dynamic_template=slice_dynamic_template,
-                resize_template=resize_template,
-                grid_sample_template=grid_sample_template,
-                reduce_template=reduce_template,
-                reduce_dynamic_template=reduce_dynamic_template,
-                arg_reduce_template=arg_reduce_template,
-                topk_template=topk_template,
-                constant_of_shape_template=constant_of_shape_template,
-                shape_template=shape_template,
-                size_template=size_template,
-                nonzero_template=nonzero_template,
-                expand_template=expand_template,
-                cumsum_template=cumsum_template,
-                range_template=range_template,
-                one_hot_template=one_hot_template,
-                split_template=split_template,
-                scalar_registry=scalar_registry,
-                dim_args=dim_args,
-                tensor_dim_names=tensor_dim_names,
-            )
+            op.emit(self, EmitContext(op_index=index))
             for index, op in enumerate(resolved_ops)
         )
         wrapper_fn = self._emit_model_wrapper(
@@ -3073,7 +2298,9 @@ class CEmitter:
         variable_dim_inputs: Mapping[int, Mapping[int, str]] | None = None,
         variable_dim_outputs: Mapping[int, Mapping[int, str]] | None = None,
     ) -> tuple[str, str]:
+        original_model = model
         model, name_map = self._sanitize_model_names_with_map(model)
+        self._copy_derived(model.op_context, original_model.ops, model.ops)
         testbench_inputs = self._sanitize_testbench_inputs(
             testbench_inputs, name_map
         )
@@ -3099,68 +2326,17 @@ class CEmitter:
         self._env.globals["dim_args"] = dim_args
         templates = self._load_templates(emit_testbench)
         scalar_registry = ScalarFunctionRegistry()
-        binary_template = templates["binary"]
-        multi_input_template = templates["multi_input"]
-        where_template = templates["where"]
-        unary_template = templates["unary"]
-        clip_template = templates["clip"]
-        cast_template = templates["cast"]
-        quantize_linear_template = templates["quantize_linear"]
-        matmul_template = templates["matmul"]
-        einsum_template = templates["einsum"]
-        gemm_template = templates["gemm"]
-        attention_template = templates["attention"]
-        conv_template = templates["conv"]
-        conv_transpose_template = templates["conv_transpose"]
-        avg_pool_template = templates["avg_pool"]
-        lp_pool_template = templates["lp_pool"]
-        batch_norm_template = templates["batch_norm"]
-        lp_norm_template = templates["lp_norm"]
-        instance_norm_template = templates["instance_norm"]
-        group_norm_template = templates["group_norm"]
-        layer_norm_template = templates["layer_norm"]
-        mean_variance_norm_template = templates["mean_variance_norm"]
-        rms_norm_template = templates["rms_norm"]
-        lrn_template = templates["lrn"]
-        lstm_template = templates["lstm"]
-        softmax_template = templates["softmax"]
-        logsoftmax_template = templates["logsoftmax"]
-        hardmax_template = templates["hardmax"]
-        nllloss_template = templates["nllloss"]
-        softmax_cross_entropy_loss_template = templates["softmax_cross_entropy_loss"]
-        maxpool_template = templates["maxpool"]
-        concat_template = templates["concat"]
-        gather_elements_template = templates["gather_elements"]
-        gather_template = templates["gather"]
-        gather_nd_template = templates["gather_nd"]
-        scatter_nd_template = templates["scatter_nd"]
-        transpose_template = templates["transpose"]
-        reshape_template = templates["reshape"]
-        identity_template = templates["identity"]
-        eye_like_template = templates["eye_like"]
-        trilu_template = templates["trilu"]
-        tile_template = templates["tile"]
-        pad_template = templates["pad"]
-        depth_to_space_template = templates["depth_to_space"]
-        space_to_depth_template = templates["space_to_depth"]
-        slice_template = templates["slice"]
-        slice_dynamic_template = templates["slice_dynamic"]
-        resize_template = templates["resize"]
-        grid_sample_template = templates["grid_sample"]
-        reduce_template = templates["reduce"]
-        reduce_dynamic_template = templates["reduce_dynamic"]
-        arg_reduce_template = templates["arg_reduce"]
-        topk_template = templates["topk"]
-        constant_of_shape_template = templates["constant_of_shape"]
-        shape_template = templates["shape"]
-        size_template = templates["size"]
-        nonzero_template = templates["nonzero"]
-        expand_template = templates["expand"]
-        cumsum_template = templates["cumsum"]
-        range_template = templates["range"]
-        one_hot_template = templates["one_hot"]
-        split_template = templates["split"]
         testbench_template = templates.get("testbench")
+        initial_name_map = self._build_value_name_map(name_map, {})
+        self._emit_state = _EmitState(
+            model=model,
+            templates=templates,
+            scalar_registry=scalar_registry,
+            dim_args=dim_args,
+            tensor_dim_names=tensor_dim_names,
+            op_context=model.op_context,
+            value_name_map=initial_name_map,
+        )
         reserved_names = {
             model.name,
             *model.input_names,
@@ -3172,83 +2348,12 @@ class CEmitter:
             original: buffer.name for original, buffer in temp_buffers.items()
         }
         resolved_ops = [self._resolve_op(op, temp_name_map) for op in model.ops]
+        self._copy_derived(model.op_context, model.ops, resolved_ops)
+        value_name_map = self._build_value_name_map(name_map, temp_name_map)
+        self._emit_state.value_name_map = value_name_map
         self._propagate_tensor_dim_names(resolved_ops, tensor_dim_names)
         operator_fns = "\n\n".join(
-            self._render_op(
-                model,
-                op,
-                index,
-                array_suffix="",
-                loop_vars=(),
-                c_type=self._op_output_dtype(op).c_type,
-                zero_literal=self._op_output_dtype(op).zero_literal,
-                min_literal=self._op_output_dtype(op).min_literal,
-                max_literal=self._op_output_dtype(op).max_literal,
-                binary_template=binary_template,
-                multi_input_template=multi_input_template,
-                where_template=where_template,
-                unary_template=unary_template,
-                clip_template=clip_template,
-                cast_template=cast_template,
-                quantize_linear_template=quantize_linear_template,
-                matmul_template=matmul_template,
-                einsum_template=einsum_template,
-                gemm_template=gemm_template,
-                attention_template=attention_template,
-                conv_template=conv_template,
-                conv_transpose_template=conv_transpose_template,
-                avg_pool_template=avg_pool_template,
-                lp_pool_template=lp_pool_template,
-                batch_norm_template=batch_norm_template,
-                lp_norm_template=lp_norm_template,
-                instance_norm_template=instance_norm_template,
-                group_norm_template=group_norm_template,
-                layer_norm_template=layer_norm_template,
-                mean_variance_norm_template=mean_variance_norm_template,
-                rms_norm_template=rms_norm_template,
-                lrn_template=lrn_template,
-                lstm_template=lstm_template,
-                softmax_template=softmax_template,
-                logsoftmax_template=logsoftmax_template,
-                hardmax_template=hardmax_template,
-                nllloss_template=nllloss_template,
-                softmax_cross_entropy_loss_template=softmax_cross_entropy_loss_template,
-                maxpool_template=maxpool_template,
-                concat_template=concat_template,
-                gather_elements_template=gather_elements_template,
-                gather_template=gather_template,
-                gather_nd_template=gather_nd_template,
-                scatter_nd_template=scatter_nd_template,
-                transpose_template=transpose_template,
-                reshape_template=reshape_template,
-                identity_template=identity_template,
-                eye_like_template=eye_like_template,
-                trilu_template=trilu_template,
-                tile_template=tile_template,
-                pad_template=pad_template,
-                depth_to_space_template=depth_to_space_template,
-                space_to_depth_template=space_to_depth_template,
-                slice_template=slice_template,
-                slice_dynamic_template=slice_dynamic_template,
-                resize_template=resize_template,
-                grid_sample_template=grid_sample_template,
-                reduce_template=reduce_template,
-                reduce_dynamic_template=reduce_dynamic_template,
-                arg_reduce_template=arg_reduce_template,
-                topk_template=topk_template,
-                constant_of_shape_template=constant_of_shape_template,
-                shape_template=shape_template,
-                size_template=size_template,
-                nonzero_template=nonzero_template,
-                expand_template=expand_template,
-                cumsum_template=cumsum_template,
-                range_template=range_template,
-                one_hot_template=one_hot_template,
-                split_template=split_template,
-                scalar_registry=scalar_registry,
-                dim_args=dim_args,
-                tensor_dim_names=tensor_dim_names,
-            )
+            op.emit(self, EmitContext(op_index=index))
             for index, op in enumerate(resolved_ops)
         )
         wrapper_fn = self._emit_model_wrapper(
@@ -3536,6 +2641,8 @@ class CEmitter:
         ScalarFunction.SCALED_TANH,
         ScalarFunction.THRESHOLDED_RELU,
         ScalarFunction.LOGICAL_XOR,
+        ScalarFunction.ISNEGINF,
+        ScalarFunction.ISPOSINF,
     }
     if function in {ScalarFunction.MAXIMUM, ScalarFunction.MINIMUM}:
         if dtype in {ScalarType.F32, ScalarType.F64}:
@@ -3598,6 +2705,7 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
@@ -3615,6 +2723,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -3626,6 +2735,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
        | IdentityOp
@@ -3644,6 +2754,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -3830,6 +2941,7 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
@@ -3847,6 +2959,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -3858,6 +2971,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -3876,6 +2990,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -3948,6 +3063,7 @@ class CEmitter:
             RMSNormalizationOp,
             LrnOp,
             LstmOp,
+            AdagradOp,
             SoftmaxOp,
             LogSoftmaxOp,
             SoftmaxCrossEntropyLossOp,
@@ -3977,7 +3093,7 @@ class CEmitter:
         ):
             return True
         if any(
-            isinstance(op, (LpPoolOp, QuantizeLinearOp))
+            isinstance(op, (LpPoolOp, QuantizeLinearOp, QLinearMatMulOp))
            for op in resolved_ops
         ):
             return True
@@ -3991,6 +3107,7 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
@@ -4036,6 +3153,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -4070,10 +3188,13 @@ class CEmitter:
         ):
             return True
         if any(
-            isinstance(op, QuantizeLinearOp)
+            isinstance(op, (QuantizeLinearOp, QLinearMatMulOp))
+            and op.dtype.is_integer
            for op in resolved_ops
         ):
             return True
+        if any(isinstance(op, NonMaxSuppressionOp) for op in resolved_ops):
+            return True
         return False

     def _emit_model_wrapper(
@@ -4086,6 +3207,7 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
@@ -4131,6 +3253,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -4195,10 +3318,12 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
         | AttentionOp
+        | RotaryEmbeddingOp
         | ConvOp
         | ConvTransposeOp
         | AveragePoolOp
@@ -4212,6 +3337,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -4223,6 +3349,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -4242,6 +3369,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -4261,6 +3389,21 @@ class CEmitter:
         if isinstance(op, WhereOp):
             args.extend([op.condition, op.input_x, op.input_y, op.output])
             return ", ".join(args)
+        if isinstance(op, QLinearMatMulOp):
+            args.extend(
+                [
+                    op.input0,
+                    op.input0_scale,
+                    op.input0_zero_point,
+                    op.input1,
+                    op.input1_scale,
+                    op.input1_zero_point,
+                    op.output_scale,
+                    op.output_zero_point,
+                    op.output,
+                ]
+            )
+            return ", ".join(args)
         if isinstance(op, MatMulOp):
             args.extend([op.input0, op.input1, op.output])
             return ", ".join(args)
@@ -4380,6 +3523,19 @@ class CEmitter:
                 call_parts.append(op.output_y_c)
             args.extend(call_parts)
             return ", ".join(args)
+        if isinstance(op, AdagradOp):
+            args.extend(
+                [
+                    op.rate,
+                    op.timestep,
+                    *op.inputs,
+                    *op.gradients,
+                    *op.accumulators,
+                    *op.outputs,
+                    *op.accumulator_outputs,
+                ]
+            )
+            return ", ".join(args)
         if isinstance(op, (SoftmaxOp, LogSoftmaxOp, HardmaxOp)):
             args.extend([op.input0, op.output])
             return ", ".join(args)
@@ -4417,6 +3573,12 @@ class CEmitter:
         if isinstance(op, ScatterNDOp):
             args.extend([op.data, op.indices, op.updates, op.output])
             return ", ".join(args)
+        if isinstance(op, TensorScatterOp):
+            args.extend([op.past_cache, op.update])
+            if op.write_indices is not None:
+                args.append(op.write_indices)
+            args.append(op.output)
+            return ", ".join(args)
         if isinstance(op, ConcatOp):
             args.extend([*op.inputs, op.output])
             return ", ".join(args)
@@ -4432,6 +3594,17 @@ class CEmitter:
         if isinstance(op, NonZeroOp):
             args.extend([op.input0, op.output])
             return ", ".join(args)
+        if isinstance(op, NonMaxSuppressionOp):
+            call_parts = [op.boxes, op.scores]
+            if op.max_output_boxes_per_class is not None:
+                call_parts.append(op.max_output_boxes_per_class)
+            if op.iou_threshold is not None:
+                call_parts.append(op.iou_threshold)
+            if op.score_threshold is not None:
+                call_parts.append(op.score_threshold)
+            call_parts.append(op.output)
+            args.extend(call_parts)
+            return ", ".join(args)
         if isinstance(op, ExpandOp):
             args.extend([op.input0, op.output])
             return ", ".join(args)
@@ -4566,10 +3739,12 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
         | AttentionOp
+        | RotaryEmbeddingOp
         | ConvOp
         | ConvTransposeOp
         | AveragePoolOp
@@ -4583,6 +3758,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -4594,6 +3770,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -4612,6 +3789,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -4626,6 +3804,7 @@ class CEmitter:
         | ClipOp
         | CastOp
         | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
@@ -4643,6 +3822,7 @@ class CEmitter:
         | RMSNormalizationOp
         | LrnOp
         | LstmOp
+        | AdagradOp
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
@@ -4654,6 +3834,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -4672,6 +3853,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -4791,6 +3973,47 @@ class CEmitter:
             input_dtype=op.input_dtype,
             scale_dtype=op.scale_dtype,
         )
+        if isinstance(op, QLinearMatMulOp):
+            return QLinearMatMulOp(
+                input0=temp_map.get(op.input0, op.input0),
+                input0_scale=temp_map.get(op.input0_scale, op.input0_scale),
+                input0_zero_point=temp_map.get(
+                    op.input0_zero_point, op.input0_zero_point
+                ),
+                input1=temp_map.get(op.input1, op.input1),
+                input1_scale=temp_map.get(op.input1_scale, op.input1_scale),
+                input1_zero_point=temp_map.get(
+                    op.input1_zero_point, op.input1_zero_point
+                ),
+                output_scale=temp_map.get(op.output_scale, op.output_scale),
+                output_zero_point=temp_map.get(
+                    op.output_zero_point, op.output_zero_point
+                ),
+                output=temp_map.get(op.output, op.output),
+                input0_shape=op.input0_shape,
+                input1_shape=op.input1_shape,
+                output_shape=op.output_shape,
+                batch_shape=op.batch_shape,
+                input0_batch_shape=op.input0_batch_shape,
+                input1_batch_shape=op.input1_batch_shape,
+                m=op.m,
+                n=op.n,
+                k=op.k,
+                left_vector=op.left_vector,
+                right_vector=op.right_vector,
+                input0_dtype=op.input0_dtype,
+                input1_dtype=op.input1_dtype,
+                dtype=op.dtype,
+                input0_scale_dtype=op.input0_scale_dtype,
+                input1_scale_dtype=op.input1_scale_dtype,
+                output_scale_dtype=op.output_scale_dtype,
+                input0_scale_shape=op.input0_scale_shape,
+                input1_scale_shape=op.input1_scale_shape,
+                output_scale_shape=op.output_scale_shape,
+                input0_zero_shape=op.input0_zero_shape,
+                input1_zero_shape=op.input1_zero_shape,
+                output_zero_shape=op.output_zero_shape,
+            )
         if isinstance(op, GemmOp):
             return GemmOp(
                 input_a=temp_map.get(op.input_a, op.input_a),
@@ -4885,6 +4108,32 @@ class CEmitter:
             head_group_size=op.head_group_size,
             dtype=op.dtype,
         )
+        if isinstance(op, RotaryEmbeddingOp):
+            return RotaryEmbeddingOp(
+                input0=temp_map.get(op.input0, op.input0),
+                cos_cache=temp_map.get(op.cos_cache, op.cos_cache),
+                sin_cache=temp_map.get(op.sin_cache, op.sin_cache),
+                position_ids=(
+                    temp_map.get(op.position_ids, op.position_ids)
+                    if op.position_ids is not None
+                    else None
+                ),
+                output=temp_map.get(op.output, op.output),
+                input_shape=op.input_shape,
+                cos_shape=op.cos_shape,
+                sin_shape=op.sin_shape,
+                position_ids_shape=op.position_ids_shape,
+                dtype=op.dtype,
+                position_ids_dtype=op.position_ids_dtype,
+                rotary_dim=op.rotary_dim,
+                rotary_dim_half=op.rotary_dim_half,
+                head_size=op.head_size,
+                num_heads=op.num_heads,
+                seq_len=op.seq_len,
+                batch=op.batch,
+                input_rank=op.input_rank,
+                interleaved=op.interleaved,
+            )
         if isinstance(op, LstmOp):
             return LstmOp(
                 input_x=temp_map.get(op.input_x, op.input_x),
@@ -4945,6 +4194,33 @@ class CEmitter:
             dtype=op.dtype,
             sequence_lens_dtype=op.sequence_lens_dtype,
         )
+        if isinstance(op, AdagradOp):
+            return AdagradOp(
+                rate=temp_map.get(op.rate, op.rate),
+                timestep=temp_map.get(op.timestep, op.timestep),
+                inputs=tuple(temp_map.get(name, name) for name in op.inputs),
+                gradients=tuple(
+                    temp_map.get(name, name) for name in op.gradients
+                ),
+                accumulators=tuple(
+                    temp_map.get(name, name) for name in op.accumulators
+                ),
+                outputs=tuple(temp_map.get(name, name) for name in op.outputs),
+                accumulator_outputs=tuple(
+                    temp_map.get(name, name)
+                    for name in op.accumulator_outputs
+                ),
+                rate_shape=op.rate_shape,
+                timestep_shape=op.timestep_shape,
+                tensor_shapes=op.tensor_shapes,
+                output_shapes=op.output_shapes,
+                dtype=op.dtype,
+                rate_dtype=op.rate_dtype,
+                timestep_dtype=op.timestep_dtype,
+                norm_coefficient=op.norm_coefficient,
+                epsilon=op.epsilon,
+                decay_factor=op.decay_factor,
+            )
         if isinstance(op, ConvOp):
             return ConvOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -5301,6 +4577,25 @@ class CEmitter:
             dtype=op.dtype,
             indices_dtype=op.indices_dtype,
         )
+        if isinstance(op, TensorScatterOp):
+            return TensorScatterOp(
+                past_cache=temp_map.get(op.past_cache, op.past_cache),
+                update=temp_map.get(op.update, op.update),
+                write_indices=(
+                    temp_map.get(op.write_indices, op.write_indices)
+                    if op.write_indices is not None
+                    else None
+                ),
+                output=temp_map.get(op.output, op.output),
+                past_cache_shape=op.past_cache_shape,
+                update_shape=op.update_shape,
+                output_shape=op.output_shape,
+                write_indices_shape=op.write_indices_shape,
+                axis=op.axis,
+                mode=op.mode,
+                dtype=op.dtype,
+                write_indices_dtype=op.write_indices_dtype,
+            )
         if isinstance(op, ConcatOp):
             return ConcatOp(
                 inputs=tuple(temp_map.get(name, name) for name in op.inputs),
@@ -5349,6 +4644,33 @@ class CEmitter:
             dtype=op.dtype,
             input_dtype=op.input_dtype,
         )
+        if isinstance(op, NonMaxSuppressionOp):
+            return NonMaxSuppressionOp(
+                boxes=temp_map.get(op.boxes, op.boxes),
+                scores=temp_map.get(op.scores, op.scores),
+                max_output_boxes_per_class=CEmitter._map_optional_name(
+                    temp_map, op.max_output_boxes_per_class
+                ),
+                iou_threshold=CEmitter._map_optional_name(
+                    temp_map, op.iou_threshold
+                ),
+                score_threshold=CEmitter._map_optional_name(
+                    temp_map, op.score_threshold
+                ),
+                output=temp_map.get(op.output, op.output),
+                boxes_shape=op.boxes_shape,
+                scores_shape=op.scores_shape,
+                output_shape=op.output_shape,
+                center_point_box=op.center_point_box,
+                boxes_dtype=op.boxes_dtype,
+                output_dtype=op.output_dtype,
+                max_output_dtype=op.max_output_dtype,
+                max_output_shape=op.max_output_shape,
+                iou_threshold_dtype=op.iou_threshold_dtype,
+                iou_threshold_shape=op.iou_threshold_shape,
+                score_threshold_dtype=op.score_threshold_dtype,
+                score_threshold_shape=op.score_threshold_shape,
+            )
         if isinstance(op, ExpandOp):
             return ExpandOp(
                 input0=temp_map.get(op.input0, op.input0),
@@ -5673,67 +4995,98 @@ class CEmitter:
             dtype=op.dtype,
         )

+    def render_op(self, op: OpBase, ctx: EmitContext) -> str:
+        if self._emit_state is None:
+            raise CodegenError("Emitter state not initialized")
+        state = self._emit_state
+        dtype = self._op_output_dtype(op)
+        templates = state.templates
+        return self._render_op(
+            state.model,
+            op,
+            ctx.op_index,
+            array_suffix="",
+            loop_vars=(),
+            c_type=dtype.c_type,
+            zero_literal=dtype.zero_literal,
+            min_literal=dtype.min_literal,
+            max_literal=dtype.max_literal,
+            binary_template=templates["binary"],
+            multi_input_template=templates["multi_input"],
+            where_template=templates["where"],
+            unary_template=templates["unary"],
+            clip_template=templates["clip"],
+            cast_template=templates["cast"],
+            quantize_linear_template=templates["quantize_linear"],
+            qlinear_matmul_template=templates["qlinear_matmul"],
+            matmul_template=templates["matmul"],
+            einsum_template=templates["einsum"],
+            gemm_template=templates["gemm"],
+            attention_template=templates["attention"],
+            rotary_embedding_template=templates["rotary_embedding"],
+            conv_template=templates["conv"],
+            conv_transpose_template=templates["conv_transpose"],
+            avg_pool_template=templates["avg_pool"],
+            lp_pool_template=templates["lp_pool"],
+            batch_norm_template=templates["batch_norm"],
+            lp_norm_template=templates["lp_norm"],
+            instance_norm_template=templates["instance_norm"],
+            group_norm_template=templates["group_norm"],
+            layer_norm_template=templates["layer_norm"],
+            mean_variance_norm_template=templates["mean_variance_norm"],
+            rms_norm_template=templates["rms_norm"],
+            lrn_template=templates["lrn"],
+            lstm_template=templates["lstm"],
+            adagrad_template=templates["adagrad"],
+            softmax_template=templates["softmax"],
+            logsoftmax_template=templates["logsoftmax"],
+            hardmax_template=templates["hardmax"],
+            nllloss_template=templates["nllloss"],
+            softmax_cross_entropy_loss_template=templates[
+                "softmax_cross_entropy_loss"
+            ],
+            maxpool_template=templates["maxpool"],
+            concat_template=templates["concat"],
+            gather_elements_template=templates["gather_elements"],
+            gather_template=templates["gather"],
+            gather_nd_template=templates["gather_nd"],
+            scatter_nd_template=templates["scatter_nd"],
+            transpose_template=templates["transpose"],
+            reshape_template=templates["reshape"],
+            identity_template=templates["identity"],
+            eye_like_template=templates["eye_like"],
+            trilu_template=templates["trilu"],
+            tile_template=templates["tile"],
+            pad_template=templates["pad"],
+            depth_to_space_template=templates["depth_to_space"],
+            space_to_depth_template=templates["space_to_depth"],
+            slice_template=templates["slice"],
+            slice_dynamic_template=templates["slice_dynamic"],
+            resize_template=templates["resize"],
+            grid_sample_template=templates["grid_sample"],
+            reduce_template=templates["reduce"],
+            reduce_dynamic_template=templates["reduce_dynamic"],
+            arg_reduce_template=templates["arg_reduce"],
+            topk_template=templates["topk"],
+            constant_of_shape_template=templates["constant_of_shape"],
+            shape_template=templates["shape"],
+            size_template=templates["size"],
+            nonzero_template=templates["nonzero"],
+            nonmax_suppression_template=templates["nonmax_suppression"],
+            expand_template=templates["expand"],
+            cumsum_template=templates["cumsum"],
+            range_template=templates["range"],
+            one_hot_template=templates["one_hot"],
+            split_template=templates["split"],
+            scalar_registry=state.scalar_registry,
+            dim_args=state.dim_args,
+            tensor_dim_names=state.tensor_dim_names,
+        )
+
     def _render_op(
         self,
         model: LoweredModel,
-        op:
-        | MultiInputBinaryOp
-        | WhereOp
-        | UnaryOp
-        | ClipOp
-        | CastOp
-        | QuantizeLinearOp
-        | MatMulOp
-        | EinsumOp
-        | GemmOp
-        | AttentionOp
-        | ConvOp
-        | ConvTransposeOp
-        | AveragePoolOp
-        | LpPoolOp
-        | BatchNormOp
-        | LpNormalizationOp
-        | InstanceNormalizationOp
-        | GroupNormalizationOp
-        | LayerNormalizationOp
-        | MeanVarianceNormalizationOp
-        | RMSNormalizationOp
-        | LrnOp
-        | LstmOp
-        | SoftmaxOp
-        | LogSoftmaxOp
-        | HardmaxOp
-        | NegativeLogLikelihoodLossOp
-        | SoftmaxCrossEntropyLossOp
-        | MaxPoolOp
-        | ConcatOp
-        | GatherElementsOp
-        | GatherOp
-        | GatherNDOp
-        | ScatterNDOp
-        | TransposeOp
-        | ReshapeOp
-        | IdentityOp
-        | EyeLikeOp
-        | TriluOp
-        | TileOp
-        | DepthToSpaceOp
-        | SpaceToDepthOp
-        | SliceOp
-        | ResizeOp
-        | GridSampleOp
-        | ReduceOp
-        | ArgReduceOp
-        | TopKOp
-        | ConstantOfShapeOp
-        | ShapeOp
-        | SizeOp
-        | NonZeroOp
-        | ExpandOp
-        | CumSumOp
-        | RangeOp
-        | OneHotOp
-        | SplitOp,
+        op: OpBase,
         index: int,
         *,
         array_suffix: str,
@@ -5749,10 +5102,12 @@ class CEmitter:
         clip_template,
         cast_template,
         quantize_linear_template,
+        qlinear_matmul_template,
         matmul_template,
         einsum_template,
         gemm_template,
         attention_template,
+        rotary_embedding_template,
         conv_template,
         conv_transpose_template,
         avg_pool_template,
@@ -5766,6 +5121,7 @@ class CEmitter:
         rms_norm_template,
         lrn_template,
         lstm_template,
+        adagrad_template,
        softmax_template,
         logsoftmax_template,
         hardmax_template,
@@ -5798,6 +5154,7 @@ class CEmitter:
         shape_template,
         size_template,
         nonzero_template,
+        nonmax_suppression_template,
         expand_template,
         cumsum_template,
         range_template,
@@ -5819,6 +5176,11 @@ class CEmitter:
             return f"{node_comment}\n{_format_c_indentation(rendered)}"

         if isinstance(op, BinaryOp):
+            input0_shape = self._ctx_shape(op.input0)
+            input1_shape = self._ctx_shape(op.input1)
+            output_shape = self._ctx_shape(op.output)
+            input_dtype = self._ctx_dtype(op.input0)
+            output_dtype = self._ctx_dtype(op.output)
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -5832,11 +5194,11 @@ class CEmitter:
                 and op.function not in COMPARE_FUNCTIONS
             ):
                 scalar_operator = self._scalar_function_name(
-                    op.function,
+                    op.function, input_dtype, scalar_registry
                 )
                 op_spec = binary_op_symbol(
                     op.function,
-                    dtype=
+                    dtype=input_dtype,
                     validate_attrs=False,
                 )
                 if op_spec is None:
@@ -5844,17 +5206,19 @@ class CEmitter:
                 f"Unsupported binary operator for rendering: {op.function.value}"
             )
             output_dim_names = _dim_names_for(op.output)
-            shape = CEmitter._shape_dim_exprs(
-            loop_vars = CEmitter._loop_vars(
-            output_suffix = self._param_array_suffix(
+            shape = CEmitter._shape_dim_exprs(output_shape, output_dim_names)
+            loop_vars = CEmitter._loop_vars(output_shape)
+            output_suffix = self._param_array_suffix(
+                output_shape, output_dim_names
+            )
             input0_suffix = self._param_array_suffix(
-
+                input0_shape, _dim_names_for(op.input0)
             )
             input1_suffix = self._param_array_suffix(
-
+                input1_shape, _dim_names_for(op.input1)
             )
-            input_c_type =
-            output_c_type =
+            input_c_type = input_dtype.c_type
+            output_c_type = output_dtype.c_type
             param_decls = self._build_param_decls(
                 [
                     (params["input0"], input_c_type, input0_suffix, True),
@@ -5877,14 +5241,14 @@ class CEmitter:
             }
             left_expr = CEmitter._broadcast_index_expr(
                 params["input0"],
-
-
+                input0_shape,
+                output_shape,
                 loop_vars,
             )
             right_expr = CEmitter._broadcast_index_expr(
                 params["input1"],
-
-
+                input1_shape,
+                output_shape,
                 loop_vars,
             )
             operator_expr = None
@@ -5910,6 +5274,9 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, MultiInputBinaryOp):
+            output_shape = self._ctx_shape(op.output)
+            input_dtype = self._ctx_dtype(op.inputs[0])
+            output_dtype = self._ctx_dtype(op.output)
             params = self._shared_param_map(
                 [
                     *((f"input{idx}", name) for idx, name in enumerate(op.inputs)),
@@ -5923,11 +5290,11 @@ class CEmitter:
                 and op.function != ScalarFunction.MEAN
             ):
                 scalar_operator = self._scalar_function_name(
-                    op.function,
+                    op.function, input_dtype, scalar_registry
                 )
                 op_spec = binary_op_symbol(
                     op.function,
-                    dtype=
+                    dtype=input_dtype,
                     validate_attrs=False,
                 )
                 if op_spec is None:
@@ -5936,11 +5303,13 @@ class CEmitter:
                 f"{op.function.value}"
             )
             output_dim_names = _dim_names_for(op.output)
-            shape = CEmitter._shape_dim_exprs(
-            loop_vars = CEmitter._loop_vars(
-            array_suffix = self._param_array_suffix(
-
-
+            shape = CEmitter._shape_dim_exprs(output_shape, output_dim_names)
+            loop_vars = CEmitter._loop_vars(output_shape)
+            array_suffix = self._param_array_suffix(
+                output_shape, output_dim_names
+            )
+            input_c_type = input_dtype.c_type
+            output_c_type = output_dtype.c_type
             input_names = [
                 params[f"input{idx}"] for idx in range(len(op.inputs))
             ]
@@ -5999,6 +5368,11 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, WhereOp):
+            output_shape_raw = self._ctx_shape(op.output)
+            condition_shape = self._ctx_shape(op.condition)
+            x_shape = self._ctx_shape(op.input_x)
+            y_shape = self._ctx_shape(op.input_y)
+            output_dtype = self._ctx_dtype(op.output)
             params = self._shared_param_map(
                 [
                     ("condition", op.condition),
@@ -6009,32 +5383,32 @@ class CEmitter:
             )
             output_dim_names = _dim_names_for(op.output)
             output_shape = CEmitter._shape_dim_exprs(
-
+                output_shape_raw, output_dim_names
             )
-            loop_vars = CEmitter._loop_vars(
+            loop_vars = CEmitter._loop_vars(output_shape_raw)
             output_array_suffix = self._param_array_suffix(
-
+                output_shape_raw, output_dim_names
             )
             condition_array_suffix = self._param_array_suffix(
-
+                condition_shape, _dim_names_for(op.condition)
             )
             x_array_suffix = self._param_array_suffix(
-
+                x_shape, _dim_names_for(op.input_x)
             )
             y_array_suffix = self._param_array_suffix(
-
+                y_shape, _dim_names_for(op.input_y)
             )
             condition_expr = CEmitter._broadcast_index_expr(
                 params["condition"],
-
-
+                condition_shape,
+                output_shape_raw,
                 loop_vars,
             )
             x_expr = CEmitter._broadcast_index_expr(
-                params["input_x"],
+                params["input_x"], x_shape, output_shape_raw, loop_vars
             )
             y_expr = CEmitter._broadcast_index_expr(
-                params["input_y"],
+                params["input_y"], y_shape, output_shape_raw, loop_vars
             )
             output_expr = f"{params['output']}" + "".join(
                 f"[{var}]" for var in loop_vars
@@ -6047,11 +5421,11 @@ class CEmitter:
                     condition_array_suffix,
                     True,
                 ),
-                (params["input_x"],
-                (params["input_y"],
+                (params["input_x"], output_dtype.c_type, x_array_suffix, True),
+                (params["input_y"], output_dtype.c_type, y_array_suffix, True),
                 (
                     params["output"],
-
+                    output_dtype.c_type,
                     output_array_suffix,
                     False,
                 ),
@@ -6074,8 +5448,8 @@ class CEmitter:
                 x_expr=x_expr,
                 y_expr=y_expr,
                 output_expr=output_expr,
-                input_c_type=
-                output_c_type=
+                input_c_type=output_dtype.c_type,
+                output_c_type=output_dtype.c_type,
                 condition_c_type=ScalarType.BOOL.c_type,
                 dim_args=dim_args,
                 params=param_decls,
@@ -6363,6 +5737,17 @@ class CEmitter:
|
|
|
6363
5737
|
).rstrip()
|
|
6364
5738
|
return with_node_comment(rendered)
|
|
6365
5739
|
if isinstance(op, AttentionOp):
|
|
5740
|
+
if scalar_registry is None:
|
|
5741
|
+
raise CodegenError(
|
|
5742
|
+
"Scalar function registry is required for Attention codegen."
|
|
5743
|
+
)
|
|
5744
|
+
max_fn = self._scalar_function_name(
|
|
5745
|
+
ScalarFunction.MAXIMUM, op.dtype, scalar_registry
|
|
5746
|
+
)
|
|
5747
|
+
if max_fn is None:
|
|
5748
|
+
raise CodegenError(
|
|
5749
|
+
"Failed to resolve scalar maximum function for Attention."
|
|
5750
|
+
)
|
|
6366
5751
|
params = self._shared_param_map(
|
|
6367
5752
|
[
|
|
6368
5753
|
("input_q", op.input_q),
@@ -6543,6 +5928,7 @@ class CEmitter:
                 scale_literal=CEmitter._format_floating(op.scale, op.dtype),
                 softcap_literal=CEmitter._format_floating(op.softcap, op.dtype),
                 one_literal=CEmitter._format_literal(op.dtype, 1),
+                max_fn=max_fn,
                 exp_fn=CEmitter._math_fn(op.dtype, "expf", "exp"),
                 tanh_fn=CEmitter._math_fn(op.dtype, "tanhf", "tanh"),
                 is_causal=int(op.is_causal),
@@ -6580,9 +5966,74 @@ class CEmitter:
                 input_past_value_suffix=input_past_value_suffix,
                 input_nonpad_suffix=input_nonpad_suffix,
                 output_suffix=output_suffix,
-                output_present_key_suffix=output_present_key_suffix,
-                output_present_value_suffix=output_present_value_suffix,
-                output_qk_matmul_suffix=output_qk_matmul_suffix,
+                output_present_key_suffix=output_present_key_suffix,
+                output_present_value_suffix=output_present_value_suffix,
+                output_qk_matmul_suffix=output_qk_matmul_suffix,
+            ).rstrip()
+            return with_node_comment(rendered)
+        if isinstance(op, RotaryEmbeddingOp):
+            params = self._shared_param_map(
+                [
+                    ("input0", op.input0),
+                    ("cos_cache", op.cos_cache),
+                    ("sin_cache", op.sin_cache),
+                    ("position_ids", op.position_ids),
+                    ("output", op.output),
+                ]
+            )
+            input_suffix = self._param_array_suffix(
+                op.input_shape, _dim_names_for(op.input0)
+            )
+            cos_suffix = self._param_array_suffix(op.cos_shape)
+            sin_suffix = self._param_array_suffix(op.sin_shape)
+            position_suffix = (
+                self._param_array_suffix(op.position_ids_shape)
+                if op.position_ids_shape is not None
+                else ""
+            )
+            output_suffix = self._param_array_suffix(
+                op.input_shape, _dim_names_for(op.output)
+            )
+            param_decls = self._build_param_decls(
+                [
+                    (params["input0"], c_type, input_suffix, True),
+                    (params["cos_cache"], c_type, cos_suffix, True),
+                    (params["sin_cache"], c_type, sin_suffix, True),
+                    (
+                        params["position_ids"],
+                        op.position_ids_dtype.c_type,
+                        position_suffix,
+                        True,
+                    )
+                    if params["position_ids"]
+                    else (None, "", "", True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            rendered = rotary_embedding_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                cos_cache=params["cos_cache"],
+                sin_cache=params["sin_cache"],
+                position_ids=params["position_ids"],
+                output=params["output"],
+                params=param_decls,
+                c_type=c_type,
+                input_suffix=input_suffix,
+                cos_suffix=cos_suffix,
+                sin_suffix=sin_suffix,
+                position_suffix=position_suffix,
+                output_suffix=output_suffix,
+                batch=op.batch,
+                seq_len=op.seq_len,
+                num_heads=op.num_heads,
+                head_size=op.head_size,
+                rotary_dim=op.rotary_dim,
+                rotary_dim_half=op.rotary_dim_half,
+                input_rank=op.input_rank,
+                interleaved=int(op.interleaved),
+                has_position_ids=int(op.position_ids is not None),
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ConvOp):
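Note: the new RotaryEmbedding lowering above forwards batch/seq_len/num_heads/head_size, a rotary_dim split, and an interleaved flag to a C template. As a rough reference for what those parameters mean (an illustrative NumPy sketch of the non-interleaved rotation, not code shipped in this package):

import numpy as np

def rotary_reference(x, cos_cache, sin_cache, position_ids, rotary_dim):
    # x: (batch, num_heads, seq_len, head_size); caches: (max_pos, rotary_dim // 2)
    half = rotary_dim // 2
    cos = cos_cache[position_ids][:, None, :, :]  # broadcast over heads
    sin = sin_cache[position_ids][:, None, :, :]
    x1, x2 = x[..., :half], x[..., half:rotary_dim]
    out = x.copy()
    out[..., :half] = x1 * cos - x2 * sin            # rotate first half
    out[..., half:rotary_dim] = x2 * cos + x1 * sin  # rotate second half
    return out  # lanes past rotary_dim pass through unchanged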
@@ -7432,15 +6883,142 @@ class CEmitter:
                 activation_functions=activation_functions,
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, AdagradOp):
+            params = self._shared_param_map(
+                [
+                    ("rate", op.rate),
+                    ("timestep", op.timestep),
+                    *(
+                        (f"input{idx}", name)
+                        for idx, name in enumerate(op.inputs)
+                    ),
+                    *(
+                        (f"grad{idx}", name)
+                        for idx, name in enumerate(op.gradients)
+                    ),
+                    *(
+                        (f"acc{idx}", name)
+                        for idx, name in enumerate(op.accumulators)
+                    ),
+                    *(
+                        (f"output{idx}", name)
+                        for idx, name in enumerate(op.outputs)
+                    ),
+                    *(
+                        (f"acc_output{idx}", name)
+                        for idx, name in enumerate(op.accumulator_outputs)
+                    ),
+                ]
+            )
+            rate_suffix = self._param_array_suffix(
+                op.rate_shape, _dim_names_for(op.rate)
+            )
+            timestep_suffix = self._param_array_suffix(
+                op.timestep_shape, _dim_names_for(op.timestep)
+            )
+            param_specs = [
+                (params["rate"], op.rate_dtype.c_type, rate_suffix, True),
+                (
+                    params["timestep"],
+                    op.timestep_dtype.c_type,
+                    timestep_suffix,
+                    True,
+                ),
+            ]
+            tensor_specs = []
+            for idx, shape in enumerate(op.output_shapes):
+                input_suffix = self._param_array_suffix(
+                    op.tensor_shapes[idx], _dim_names_for(op.inputs[idx])
+                )
+                grad_suffix = self._param_array_suffix(
+                    op.tensor_shapes[idx], _dim_names_for(op.gradients[idx])
+                )
+                acc_suffix = self._param_array_suffix(
+                    op.tensor_shapes[idx], _dim_names_for(op.accumulators[idx])
+                )
+                output_suffix = self._param_array_suffix(
+                    op.output_shapes[idx], _dim_names_for(op.outputs[idx])
+                )
+                acc_output_suffix = self._param_array_suffix(
+                    op.output_shapes[idx],
+                    _dim_names_for(op.accumulator_outputs[idx]),
+                )
+                param_specs.extend(
+                    [
+                        (params[f"input{idx}"], c_type, input_suffix, True),
+                        (params[f"grad{idx}"], c_type, grad_suffix, True),
+                        (params[f"acc{idx}"], c_type, acc_suffix, True),
+                        (params[f"output{idx}"], c_type, output_suffix, False),
+                        (
+                            params[f"acc_output{idx}"],
+                            c_type,
+                            acc_output_suffix,
+                            False,
+                        ),
+                    ]
+                )
+                output_dim_names = _dim_names_for(op.outputs[idx])
+                shape_exprs = CEmitter._shape_dim_exprs(
+                    shape, output_dim_names
+                )
+                loop_vars = CEmitter._loop_vars(shape)
+                index_suffix = "".join(f"[{var}]" for var in loop_vars)
+                tensor_specs.append(
+                    {
+                        "shape": shape_exprs,
+                        "loop_vars": loop_vars,
+                        "input_expr": f"{params[f'input{idx}']}{index_suffix}",
+                        "grad_expr": f"{params[f'grad{idx}']}{index_suffix}",
+                        "acc_expr": f"{params[f'acc{idx}']}{index_suffix}",
+                        "output_expr": f"{params[f'output{idx}']}{index_suffix}",
+                        "acc_output_expr": f"{params[f'acc_output{idx}']}{index_suffix}",
+                    }
+                )
+            param_decls = self._build_param_decls(param_specs)
+            rendered = adagrad_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                rate=params["rate"],
+                timestep=params["timestep"],
+                params=param_decls,
+                c_type=c_type,
+                one_literal=CEmitter._format_literal(op.dtype, 1),
+                decay_factor_literal=CEmitter._format_floating(
+                    op.decay_factor, op.dtype
+                ),
+                norm_coefficient_literal=CEmitter._format_floating(
+                    op.norm_coefficient, op.dtype
+                ),
+                epsilon_literal=CEmitter._format_floating(op.epsilon, op.dtype),
+                sqrt_fn=CEmitter._math_fn(op.dtype, "sqrtf", "sqrt"),
+                tensors=tensor_specs,
+            ).rstrip()
+            return with_node_comment(rendered)
         if isinstance(op, SoftmaxOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for Softmax rendering."
+                )
+            output_shape = self._ctx_shape(op.output)
+            output_dtype = self._ctx_dtype(op.output)
+            outer = self._derived(op, "outer")
+            axis_size = self._derived(op, "axis_size")
+            inner = self._derived(op, "inner")
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, output_dtype, scalar_registry
+            )
+            if max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar maximum function for Softmax."
+                )
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            array_suffix = self._param_array_suffix(
+            array_suffix = self._param_array_suffix(output_shape)
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"], c_type, array_suffix, True),
-                    (params["output"], c_type, array_suffix, False),
+                    (params["input0"], output_dtype.c_type, array_suffix, True),
+                    (params["output"], output_dtype.c_type, array_suffix, False),
                 ]
             )
             rendered = softmax_template.render(
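The new AdagradOp branch renders one update loop per parameter tensor from the rate/timestep inputs and the decay/norm-coefficient/epsilon literals. For orientation only, a sketch of the ONNX Adagrad update the emitted C is expected to implement (not the generated code itself):

import numpy as np

def adagrad_step(x, g, acc, rate, timestep,
                 decay_factor=0.0, norm_coefficient=0.0, epsilon=1e-6):
    # Decayed learning rate for this step (timestep counts from 0 in ONNX Adagrad).
    r = rate / (1.0 + timestep * decay_factor)
    # Optional L2 regularization folded into the gradient.
    g_reg = norm_coefficient * x + g
    # Accumulate squared gradients, then scale the step by 1/sqrt(acc).
    acc_new = acc + g_reg * g_reg
    x_new = x - r * g_reg / (np.sqrt(acc_new) + epsilon)
    return x_new, acc_new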
@@ -7449,23 +7027,40 @@ class CEmitter:
                 input0=params["input0"],
                 output=params["output"],
                 params=param_decls,
-                c_type=c_type,
+                c_type=output_dtype.c_type,
                 array_suffix=array_suffix,
-                outer=
-                axis_size=
-                inner=
-
+                outer=outer,
+                axis_size=axis_size,
+                inner=inner,
+                max_fn=max_fn,
+                exp_fn=CEmitter._math_fn(output_dtype, "expf", "exp"),
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, LogSoftmaxOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for LogSoftmax rendering."
+                )
+            output_shape = self._ctx_shape(op.output)
+            output_dtype = self._ctx_dtype(op.output)
+            outer = self._derived(op, "outer")
+            axis_size = self._derived(op, "axis_size")
+            inner = self._derived(op, "inner")
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, output_dtype, scalar_registry
+            )
+            if max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar maximum function for LogSoftmax."
+                )
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            array_suffix = self._param_array_suffix(
+            array_suffix = self._param_array_suffix(output_shape)
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"], c_type, array_suffix, True),
-                    (params["output"], c_type, array_suffix, False),
+                    (params["input0"], output_dtype.c_type, array_suffix, True),
+                    (params["output"], output_dtype.c_type, array_suffix, False),
                 ]
             )
             rendered = logsoftmax_template.render(
@@ -7474,24 +7069,41 @@ class CEmitter:
                 input0=params["input0"],
                 output=params["output"],
                 params=param_decls,
-                c_type=c_type,
+                c_type=output_dtype.c_type,
                 array_suffix=array_suffix,
-                outer=
-                axis_size=
-                inner=
-
-
+                outer=outer,
+                axis_size=axis_size,
+                inner=inner,
+                max_fn=max_fn,
+                exp_fn=CEmitter._math_fn(output_dtype, "expf", "exp"),
+                log_fn=CEmitter._math_fn(output_dtype, "logf", "log"),
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, HardmaxOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for Hardmax rendering."
+                )
+            output_shape = self._ctx_shape(op.output)
+            output_dtype = self._ctx_dtype(op.output)
+            outer = self._derived(op, "outer")
+            axis_size = self._derived(op, "axis_size")
+            inner = self._derived(op, "inner")
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, output_dtype, scalar_registry
+            )
+            if max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar maximum function for Hardmax."
+                )
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            array_suffix = self._param_array_suffix(
+            array_suffix = self._param_array_suffix(output_shape)
            param_decls = self._build_param_decls(
                 [
-                    (params["input0"], c_type, array_suffix, True),
-                    (params["output"], c_type, array_suffix, False),
+                    (params["input0"], output_dtype.c_type, array_suffix, True),
+                    (params["output"], output_dtype.c_type, array_suffix, False),
                 ]
             )
             rendered = hardmax_template.render(
@@ -7500,13 +7112,14 @@ class CEmitter:
                 input0=params["input0"],
                 output=params["output"],
                 params=param_decls,
-                c_type=c_type,
+                c_type=output_dtype.c_type,
                 array_suffix=array_suffix,
-                outer=
-                axis_size=
-                inner=
+                outer=outer,
+                axis_size=axis_size,
+                inner=inner,
                 zero_literal=zero_literal,
-                one_literal=CEmitter._format_literal(
+                one_literal=CEmitter._format_literal(output_dtype, 1),
+                max_fn=max_fn,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, NegativeLogLikelihoodLossOp):
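Softmax, LogSoftmax, and Hardmax all pass the same outer/axis_size/inner decomposition to their templates: the tensor is viewed as (outer, axis_size, inner) around the reduction axis. A hedged sketch of that loop layout in Python (illustrative only; the emitted code is C):

import math

def softmax_over_axis(x, outer, axis_size, inner):
    # x is flat with length outer * axis_size * inner; element (o, a, i)
    # lives at (o * axis_size + a) * inner + i.
    y = [0.0] * len(x)
    for o in range(outer):
        for i in range(inner):
            idx = lambda a: (o * axis_size + a) * inner + i
            m = max(x[idx(a)] for a in range(axis_size))             # max_fn pass
            e = [math.exp(x[idx(a)] - m) for a in range(axis_size)]  # exp_fn pass
            s = sum(e)
            for a in range(axis_size):
                y[idx(a)] = e[a] / s
    return y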
@@ -7576,6 +7189,17 @@ class CEmitter:
                 if op.dtype in {ScalarType.F16, ScalarType.F32}
                 else op.dtype
             )
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for SoftmaxCrossEntropyLoss."
+                )
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, acc_dtype, scalar_registry
+            )
+            if max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar maximum function for SoftmaxCrossEntropyLoss."
+                )
             acc_type = acc_dtype.c_type
             acc_zero_literal = CEmitter._format_literal(acc_dtype, 0)
             acc_one_literal = CEmitter._format_literal(acc_dtype, 1)
@@ -7652,9 +7276,21 @@ class CEmitter:
                 acc_one_literal=acc_one_literal,
                 acc_exp_fn=acc_exp_fn,
                 acc_log_fn=acc_log_fn,
+                max_fn=max_fn,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, MaxPoolOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for MaxPool rendering."
+                )
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, op.dtype, scalar_registry
+            )
+            if max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar maximum function for MaxPool."
+                )
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -7699,6 +7335,7 @@ class CEmitter:
                 output_suffix=output_suffix,
                 indices_suffix=indices_suffix,
                 indices_c_type=indices_c_type,
+                max_fn=max_fn,
                 batch=op.batch,
                 channels=op.channels,
                 spatial_rank=op.spatial_rank,
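A pattern repeated across these hunks: every max-using op now resolves a shared scalar helper from scalar_registry instead of inlining a comparison, and fails fast when the registry is absent. A minimal sketch of what such a registry lookup could look like — the class and method names here are hypothetical, since the package's actual registry API is not shown in this diff:

from enum import Enum, auto

class ScalarFunction(Enum):
    MINIMUM = auto()
    MAXIMUM = auto()

class ScalarRegistry:
    """Maps (function, c_type) to one emitted C helper, deduplicating definitions."""
    def __init__(self):
        self._names = {}

    def resolve(self, fn, c_type):
        key = (fn, c_type)
        if key not in self._names:
            # e.g. emit once: static inline float maximum_float(float a, float b) {...}
            self._names[key] = f"{fn.name.lower()}_{c_type.replace(' ', '_')}"
        return self._names[key]

reg = ScalarRegistry()
print(reg.resolve(ScalarFunction.MAXIMUM, "float"))  # maximum_float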
@@ -8032,21 +7669,133 @@ class CEmitter:
                 reduction=op.reduction,
             ).rstrip()
             return with_node_comment(rendered)
+        if isinstance(op, TensorScatterOp):
+            param_pairs = [
+                ("past_cache", op.past_cache),
+                ("update", op.update),
+                ("output", op.output),
+            ]
+            if op.write_indices is not None:
+                param_pairs.insert(2, ("write_indices", op.write_indices))
+            params = self._shared_param_map(param_pairs)
+            output_dim_names = _dim_names_for(op.output)
+            update_dim_names = _dim_names_for(op.update)
+            past_dim_names = _dim_names_for(op.past_cache)
+            write_indices_dim_names = (
+                _dim_names_for(op.write_indices) if op.write_indices else None
+            )
+            output_shape = CEmitter._shape_dim_exprs(
+                op.output_shape, output_dim_names
+            )
+            update_shape = CEmitter._shape_dim_exprs(
+                op.update_shape, update_dim_names
+            )
+            prefix_shape = output_shape[: op.axis]
+            prefix_loop_vars = (
+                CEmitter._loop_vars(op.output_shape[: op.axis])
+                if op.output_shape[: op.axis]
+                else ()
+            )
+            tail_shape = output_shape[op.axis + 1 :]
+            tail_loop_vars = (
+                tuple(
+                    f"t{index}"
+                    for index in range(len(op.output_shape[op.axis + 1 :]))
+                )
+                if op.output_shape[op.axis + 1 :]
+                else ()
+            )
+            output_loop_vars = CEmitter._loop_vars(op.output_shape)
+            sequence_loop_var = "seq"
+            cache_index_var = "cache_index"
+            write_index_var = "write_index"
+            index_vars = (*prefix_loop_vars, cache_index_var, *tail_loop_vars)
+            output_index_expr = f"{params['output']}" + "".join(
+                f"[{var}]" for var in index_vars
+            )
+            update_index_vars = (
+                *prefix_loop_vars,
+                sequence_loop_var,
+                *tail_loop_vars,
+            )
+            update_index_expr = f"{params['update']}" + "".join(
+                f"[{var}]" for var in update_index_vars
+            )
+            past_suffix = self._param_array_suffix(
+                op.past_cache_shape, past_dim_names
+            )
+            update_suffix = self._param_array_suffix(
+                op.update_shape, update_dim_names
+            )
+            output_suffix = self._param_array_suffix(
+                op.output_shape, output_dim_names
+            )
+            param_decls = [
+                (params["past_cache"], c_type, past_suffix, True),
+                (params["update"], c_type, update_suffix, True),
+            ]
+            if op.write_indices is not None and op.write_indices_dtype is not None:
+                write_indices_suffix = self._param_array_suffix(
+                    op.write_indices_shape or (), write_indices_dim_names
+                )
+                param_decls.append(
+                    (
+                        params["write_indices"],
+                        op.write_indices_dtype.c_type,
+                        write_indices_suffix,
+                        True,
+                    )
+                )
+            param_decls.append((params["output"], c_type, output_suffix, False))
+            param_decls_rendered = self._build_param_decls(param_decls)
+            rendered = tensor_scatter_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                past_cache=params["past_cache"],
+                update=params["update"],
+                write_indices=(
+                    params.get("write_indices") if op.write_indices else None
+                ),
+                output=params["output"],
+                params=param_decls_rendered,
+                c_type=c_type,
+                output_shape=output_shape,
+                output_loop_vars=output_loop_vars,
+                prefix_shape=prefix_shape,
+                prefix_loop_vars=prefix_loop_vars,
+                sequence_dim=update_shape[op.axis],
+                sequence_loop_var=sequence_loop_var,
+                tail_shape=tail_shape,
+                tail_loop_vars=tail_loop_vars,
+                output_index_expr=output_index_expr,
+                update_index_expr=update_index_expr,
+                max_sequence_length=output_shape[op.axis],
+                write_indices_present=op.write_indices is not None,
+                batch_index_var=prefix_loop_vars[0]
+                if prefix_loop_vars
+                else "0",
+                write_index_var=write_index_var,
+                cache_index_var=cache_index_var,
+                circular=op.mode == "circular",
+            ).rstrip()
+            return with_node_comment(rendered)
         if isinstance(op, TransposeOp):
+            input_shape = self._ctx_shape(op.input0)
+            output_shape_raw = self._ctx_shape(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            output_shape = CEmitter._codegen_shape(
+            output_shape = CEmitter._codegen_shape(output_shape_raw)
             loop_vars = CEmitter._loop_vars(output_shape)
             output_suffix = self._param_array_suffix(output_shape)
-            input_suffix = self._param_array_suffix(
+            input_suffix = self._param_array_suffix(input_shape)
             param_decls = self._build_param_decls(
                 [
                     (params["input0"], c_type, input_suffix, True),
                     (params["output"], c_type, output_suffix, False),
                 ]
             )
-            if not
+            if not input_shape:
                 input_indices = [loop_vars[0]]
             else:
                 input_indices = [None] * len(op.perm)
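The new TensorScatterOp lowering copies a past cache into the output and then writes each update sequence at a per-batch write index, with an optional circular mode that wraps around the cache length. A hedged Python sketch of that semantics for a (batch, sequence, feature) cache with axis=1 (illustrative, not the generated C):

def tensor_scatter(past_cache, update, write_indices=None, circular=False):
    # past_cache: [B][S][F], update: [B][U][F]; scatter along the sequence axis.
    B, S = len(past_cache), len(past_cache[0])
    U = len(update[0])
    out = [[row[:] for row in batch] for batch in past_cache]  # start from the cache
    for b in range(B):
        start = write_indices[b] if write_indices is not None else 0
        for seq in range(U):
            cache_index = start + seq
            if circular:
                cache_index %= S  # wrap around in circular mode
            out[b][cache_index] = update[b][seq][:]
    return out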
@@ -8067,19 +7816,21 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ReshapeOp):
+            input_shape = self._ctx_shape(op.input0)
+            output_shape_raw = self._ctx_shape(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            input_suffix = self._param_array_suffix(
-            output_shape = CEmitter._codegen_shape(
-            output_suffix = self._param_array_suffix(
+            input_suffix = self._param_array_suffix(input_shape)
+            output_shape = CEmitter._codegen_shape(output_shape_raw)
+            output_suffix = self._param_array_suffix(output_shape_raw)
             param_decls = self._build_param_decls(
                 [
                     (params["input0"], c_type, input_suffix, True),
                     (params["output"], c_type, output_suffix, False),
                 ]
             )
-            loop_vars = CEmitter._loop_vars(
+            loop_vars = CEmitter._loop_vars(output_shape_raw)
             rendered = reshape_template.render(
                 model_name=model.name,
                 op_name=op_name,
@@ -8089,20 +7840,27 @@ class CEmitter:
                 c_type=c_type,
                 input_suffix=input_suffix,
                 output_suffix=output_suffix,
-                element_count=CEmitter._element_count(
+                element_count=CEmitter._element_count(output_shape_raw),
                 output_shape=output_shape,
                 loop_vars=loop_vars,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, IdentityOp):
+            output_shape_raw = self._ctx_shape(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
             output_dim_names = _dim_names_for(op.output)
-            shape = CEmitter._shape_dim_exprs(
-
-
-
+            shape = CEmitter._shape_dim_exprs(
+                output_shape_raw, output_dim_names
+            )
+            loop_vars = CEmitter._loop_vars(output_shape_raw)
+            output_suffix = self._param_array_suffix(
+                output_shape_raw, output_dim_names
+            )
+            input_suffix = self._param_array_suffix(
+                output_shape_raw, _dim_names_for(op.input0)
+            )
             param_decls = self._build_param_decls(
                 [
                     (params["input0"], c_type, input_suffix, True),
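Reshape and Identity lower to flat copies: the template only needs an element count (or a loop nest over the output shape), because a row-major reshape does not move data. Roughly, as a sketch:

def reshape_copy(inp, element_count):
    # Row-major reshape preserves flat element order, so the generated C
    # can be a single loop of element_count assignments.
    out = [0] * element_count
    for i in range(element_count):
        out[i] = inp[i]
    return out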
@@ -8704,39 +8462,41 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ReduceOp) and op.axes_input is None:
+            input_shape = self._ctx_shape(op.input0)
+            output_shape_raw = self._ctx_shape(op.output)
+            axes = self._derived(op, "axes")
+            output_dtype = self._ctx_dtype(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            output_shape = CEmitter._codegen_shape(
+            output_shape = CEmitter._codegen_shape(output_shape_raw)
             output_loop_vars = CEmitter._loop_vars(output_shape)
-            if not
+            if not input_shape:
                 reduce_loop_vars = ("r0",)
                 reduce_dims = (1,)
             else:
-                reduce_loop_vars = tuple(
-
-
-                reduce_dims = tuple(op.input_shape[axis] for axis in op.axes)
-            if not op.input_shape:
+                reduce_loop_vars = tuple(f"r{idx}" for idx in range(len(axes)))
+                reduce_dims = tuple(input_shape[axis] for axis in axes)
+            if not input_shape:
                 input_indices = [reduce_loop_vars[0]]
             elif op.keepdims:
                 input_indices = [
-                    reduce_loop_vars[
-                    if axis in
+                    reduce_loop_vars[axes.index(axis)]
+                    if axis in axes
                     else output_loop_vars[axis]
-                    for axis in range(len(
+                    for axis in range(len(input_shape))
                 ]
             else:
                 kept_axes = [
                     axis
-                    for axis in range(len(
-                    if axis not in
+                    for axis in range(len(input_shape))
+                    if axis not in axes
                 ]
                 input_indices = [
-                    reduce_loop_vars[
-                    if axis in
+                    reduce_loop_vars[axes.index(axis)]
+                    if axis in axes
                     else output_loop_vars[kept_axes.index(axis)]
-                    for axis in range(len(
+                    for axis in range(len(input_shape))
                 ]
             input_index_expr = "".join(f"[{var}]" for var in input_indices)
             output_index_expr = "".join(
@@ -8748,16 +8508,16 @@ class CEmitter:
             final_expr = "acc"
             use_kahan = False
             kahan_value_expr = None
-            fabs_fn = CEmitter._math_fn(
-            exp_fn = CEmitter._math_fn(
-            log_fn = CEmitter._math_fn(
-            sqrt_fn = CEmitter._math_fn(
+            fabs_fn = CEmitter._math_fn(output_dtype, "fabsf", "fabs")
+            exp_fn = CEmitter._math_fn(output_dtype, "expf", "exp")
+            log_fn = CEmitter._math_fn(output_dtype, "logf", "log")
+            sqrt_fn = CEmitter._math_fn(output_dtype, "sqrtf", "sqrt")
             if op.reduce_kind == "sum":
                 init_literal = zero_literal
                 update_expr = f"acc += {value_expr};"
             elif op.reduce_kind == "mean":
                 count_literal = CEmitter._format_literal(
-
+                    output_dtype, op.reduce_count
                 )
                 init_literal = zero_literal
                 update_expr = f"acc += {value_expr};"
@@ -8769,7 +8529,7 @@ class CEmitter:
                 init_literal = max_literal
                 update_expr = f"if ({value_expr} < acc) acc = {value_expr};"
             elif op.reduce_kind == "prod":
-                init_literal = CEmitter._format_literal(
+                init_literal = CEmitter._format_literal(output_dtype, 1)
                 update_expr = f"acc *= {value_expr};"
             elif op.reduce_kind == "l1":
                 init_literal = zero_literal
@@ -8793,7 +8553,7 @@ class CEmitter:
                 raise CodegenError(
                     f"Unsupported reduce kind {op.reduce_kind}"
                 )
-            if
+            if output_dtype in {ScalarType.F16, ScalarType.F32} and op.reduce_kind in {
                 "sum",
                 "mean",
                 "logsum",
@@ -8811,8 +8571,8 @@ class CEmitter:
                 kahan_value_expr = f"{value_expr} * {value_expr}"
             else:
                 kahan_value_expr = value_expr
-            input_suffix = self._param_array_suffix(
-            output_suffix = self._param_array_suffix(
+            input_suffix = self._param_array_suffix(input_shape)
+            output_suffix = self._param_array_suffix(output_shape_raw)
             param_decls = self._build_param_decls(
                 [
                     (params["input0"], c_type, input_suffix, True),
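The reduce path enables Kahan compensated summation for f16/f32 sum-like kinds (the use_kahan / kahan_value_expr machinery above). A short sketch of the compensation the generated loop applies (illustrative Python; the emitted code is C):

def kahan_sum(values):
    acc = 0.0
    comp = 0.0  # running compensation for lost low-order bits
    for v in values:
        y = v - comp          # remove the previously lost error
        t = acc + y           # low-order digits of y may be lost here
        comp = (t - acc) - y  # recover what was lost
        acc = t
    return acc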
@@ -8842,33 +8602,40 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ArgReduceOp):
+            input_shape = self._ctx_shape(op.input0)
+            output_shape_raw = self._ctx_shape(op.output)
+            axis = self._derived(op, "axis")
+            input_dtype = self._ctx_dtype(op.input0)
+            output_dtype = self._ctx_dtype(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
-            output_shape = CEmitter._codegen_shape(
+            output_shape = CEmitter._codegen_shape(output_shape_raw)
             output_loop_vars = CEmitter._loop_vars(output_shape)
             reduce_var = "r0"
-            reduce_dim =
+            reduce_dim = input_shape[axis]
             if op.keepdims:
                 input_indices = [
-                    reduce_var
-
+                    reduce_var
+                    if axis_index == axis
+                    else output_loop_vars[axis_index]
+                    for axis_index in range(len(input_shape))
                 ]
             else:
                 kept_axes = [
-
-                    for
-                    if
+                    axis_index
+                    for axis_index in range(len(input_shape))
+                    if axis_index != axis
                 ]
                 input_indices = [
                     reduce_var
-                    if
-                    else output_loop_vars[kept_axes.index(
-                    for
+                    if axis_index == axis
+                    else output_loop_vars[kept_axes.index(axis_index)]
+                    for axis_index in range(len(input_shape))
                 ]
             init_indices = [
-                "0" if
-                for
+                "0" if axis_index == axis else input_indices[axis_index]
+                for axis_index in range(len(input_shape))
             ]
             input_index_expr = "".join(f"[{var}]" for var in input_indices)
             init_index_expr = "".join(f"[{var}]" for var in init_indices)
@@ -8883,12 +8650,12 @@ class CEmitter:
                 raise CodegenError(
                     f"Unsupported arg reduce kind {op.reduce_kind}"
                 )
-            input_suffix = self._param_array_suffix(
-            output_suffix = self._param_array_suffix(
+            input_suffix = self._param_array_suffix(input_shape)
+            output_suffix = self._param_array_suffix(output_shape_raw)
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"],
-                    (params["output"],
+                    (params["input0"], input_dtype.c_type, input_suffix, True),
+                    (params["output"], output_dtype.c_type, output_suffix, False),
                 ]
             )
             rendered = arg_reduce_template.render(
@@ -8897,8 +8664,8 @@ class CEmitter:
                 input0=params["input0"],
                 output=params["output"],
                 params=param_decls,
-                input_c_type=
-                output_c_type=
+                input_c_type=input_dtype.c_type,
+                output_c_type=output_dtype.c_type,
                 input_suffix=input_suffix,
                 output_suffix=output_suffix,
                 output_shape=output_shape,
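The keepdims branches above map output loop variables back to input indices: the reduced axis is driven by r0, and every other axis either reuses its own output variable (keepdims=1) or its compacted kept_axes position (keepdims=0). The same idea for a 2-D argmax along axis 1, as a sketch:

def argmax_axis1(x):
    # x: list of rows; reduce along axis 1. With keepdims the output keeps a
    # size-1 axis, so the row loop variable indexes both input and output.
    out = []
    for row in x:                      # output loop var drives the kept axis
        best_r = 0
        for r0 in range(1, len(row)):  # r0 drives the reduced axis
            if row[r0] > row[best_r]:
                best_r = r0
        out.append(best_r)
    return out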
@@ -8913,6 +8680,11 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, TopKOp):
+            input_shape = self._ctx_shape(op.input0)
+            output_shape_raw = self._ctx_shape(op.output_values)
+            input_dtype = self._ctx_dtype(op.input0)
+            output_values_dtype = self._ctx_dtype(op.output_values)
+            output_indices_dtype = self._ctx_dtype(op.output_indices)
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -8920,7 +8692,7 @@ class CEmitter:
                     ("output_indices", op.output_indices),
                 ]
             )
-            output_shape = CEmitter._codegen_shape(
+            output_shape = CEmitter._codegen_shape(output_shape_raw)
             outer_shape = tuple(
                 dim for axis, dim in enumerate(output_shape) if axis != op.axis
             )
@@ -8930,7 +8702,7 @@ class CEmitter:
             input_indices: list[str] = []
             output_indices: list[str] = []
             outer_index = 0
-            for axis in range(len(
+            for axis in range(len(input_shape)):
                 if axis == op.axis:
                     input_indices.append(reduce_var)
                     output_indices.append(k_var)
@@ -8945,20 +8717,20 @@ class CEmitter:
                 if op.largest
                 else "(a < b) || ((a == b) && (ai < bi))"
             )
-            input_suffix = self._param_array_suffix(
-            output_suffix = self._param_array_suffix(
+            input_suffix = self._param_array_suffix(input_shape)
+            output_suffix = self._param_array_suffix(output_shape_raw)
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"],
+                    (params["input0"], input_dtype.c_type, input_suffix, True),
                     (
                         params["output_values"],
-
+                        output_values_dtype.c_type,
                         output_suffix,
                         False,
                     ),
                     (
                         params["output_indices"],
-
+                        output_indices_dtype.c_type,
                         output_suffix,
                         False,
                     ),
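The TopK comparator string above orders primarily by value and breaks ties toward the lower index, matching ONNX TopK. The same predicate expressed in Python (a sketch):

def topk(values, k, largest=True):
    # Sort by (value, index): value descending for `largest`,
    # ties resolved toward the smaller original index.
    order = sorted(
        range(len(values)),
        key=lambda i: ((-values[i]) if largest else values[i], i),
    )
    picked = order[:k]
    return [values[i] for i in picked], picked

print(topk([1.0, 3.0, 3.0, 2.0], 2))  # ([3.0, 3.0], [1, 2])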
@@ -9216,27 +8988,150 @@ class CEmitter:
             )
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"], op.input_dtype.c_type, input_suffix, True),
-                    (params["output"], c_type, output_suffix, False),
+                    (params["input0"], op.input_dtype.c_type, input_suffix, True),
+                    (params["output"], c_type, output_suffix, False),
+                ]
+            )
+            input_expr = f"{params['input0']}" + "".join(
+                f"[{var}]" for var in loop_vars
+            )
+            rendered = nonzero_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                output=params["output"],
+                params=param_decls,
+                input_c_type=op.input_dtype.c_type,
+                output_c_type=c_type,
+                input_suffix=input_suffix,
+                output_suffix=output_suffix,
+                input_shape=input_shape,
+                loop_vars=loop_vars,
+                input_expr=input_expr,
+                zero_literal=op.input_dtype.zero_literal,
+            ).rstrip()
+            return with_node_comment(rendered)
+        if isinstance(op, NonMaxSuppressionOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for NonMaxSuppression."
+                )
+            min_fn = self._scalar_function_name(
+                ScalarFunction.MINIMUM, op.boxes_dtype, scalar_registry
+            )
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, op.boxes_dtype, scalar_registry
+            )
+            if min_fn is None or max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar min/max functions for NonMaxSuppression."
+                )
+            params = self._shared_param_map(
+                [
+                    ("boxes", op.boxes),
+                    ("scores", op.scores),
+                    ("max_output_boxes_per_class", op.max_output_boxes_per_class),
+                    ("iou_threshold", op.iou_threshold),
+                    ("score_threshold", op.score_threshold),
+                    ("output", op.output),
+                ]
+            )
+            boxes_suffix = self._param_array_suffix(
+                op.boxes_shape, _dim_names_for(op.boxes)
+            )
+            scores_suffix = self._param_array_suffix(
+                op.scores_shape, _dim_names_for(op.scores)
+            )
+            output_suffix = self._param_array_suffix(
+                op.output_shape, _dim_names_for(op.output)
+            )
+            max_output_suffix = (
+                self._param_array_suffix(
+                    op.max_output_shape,
+                    _dim_names_for(op.max_output_boxes_per_class or ""),
+                )
+                if op.max_output_shape is not None
+                else ""
+            )
+            iou_threshold_suffix = (
+                self._param_array_suffix(
+                    op.iou_threshold_shape,
+                    _dim_names_for(op.iou_threshold or ""),
+                )
+                if op.iou_threshold_shape is not None
+                else ""
+            )
+            score_threshold_suffix = (
+                self._param_array_suffix(
+                    op.score_threshold_shape,
+                    _dim_names_for(op.score_threshold or ""),
+                )
+                if op.score_threshold_shape is not None
+                else ""
+            )
+            param_decls = self._build_param_decls(
+                [
+                    (params["boxes"], op.boxes_dtype.c_type, boxes_suffix, True),
+                    (params["scores"], op.boxes_dtype.c_type, scores_suffix, True),
+                    (
+                        params["max_output_boxes_per_class"],
+                        op.max_output_dtype.c_type if op.max_output_dtype else "",
+                        max_output_suffix,
+                        True,
+                    )
+                    if params["max_output_boxes_per_class"]
+                    else (None, "", "", True),
+                    (
+                        params["iou_threshold"],
+                        (
+                            op.iou_threshold_dtype.c_type
+                            if op.iou_threshold_dtype
+                            else ""
+                        ),
+                        iou_threshold_suffix,
+                        True,
+                    )
+                    if params["iou_threshold"]
+                    else (None, "", "", True),
+                    (
+                        params["score_threshold"],
+                        (
+                            op.score_threshold_dtype.c_type
+                            if op.score_threshold_dtype
+                            else ""
+                        ),
+                        score_threshold_suffix,
+                        True,
+                    )
+                    if params["score_threshold"]
+                    else (None, "", "", True),
+                    (params["output"], op.output_dtype.c_type, output_suffix, False),
                 ]
             )
-
-                f"[{var}]" for var in loop_vars
-            )
-            rendered = nonzero_template.render(
+            rendered = nonmax_suppression_template.render(
                 model_name=model.name,
                 op_name=op_name,
-
+                boxes=params["boxes"],
+                scores=params["scores"],
+                max_output_boxes_per_class=params["max_output_boxes_per_class"],
+                iou_threshold=params["iou_threshold"],
+                score_threshold=params["score_threshold"],
                 output=params["output"],
                 params=param_decls,
-                input_c_type=op.
-                output_c_type=c_type,
-
-
-
-
-
-
+                input_c_type=op.boxes_dtype.c_type,
+                output_c_type=op.output_dtype.c_type,
+                compute_type=op.boxes_dtype.c_type,
+                output_capacity=op.output_shape[0],
+                num_batches=op.boxes_shape[0],
+                num_boxes=op.boxes_shape[1],
+                num_classes=op.scores_shape[1],
+                center_point_box=op.center_point_box,
+                min_fn=min_fn,
+                max_fn=max_fn,
+                iou_threshold_default=op.boxes_dtype.zero_literal,
+                score_threshold_default=op.boxes_dtype.zero_literal,
+                score_threshold_enabled=op.score_threshold is not None,
+                dim_args=dim_args,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ExpandOp):
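The NonMaxSuppression template receives min_fn/max_fn because IoU needs clamped intersection extents, plus a center_point_box flag selecting the box encoding. A hedged sketch of the per-pair test such a loop performs, for corner-format boxes (illustrative Python, not the emitted C):

def iou(a, b):
    # a, b: (y1, x1, y2, x2) corner boxes, as in center_point_box=0.
    inter_h = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    inter_w = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = inter_h * inter_w
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0.0 else 0.0

def nms(boxes, scores, iou_threshold, score_threshold=None, max_out=None):
    order = sorted(range(len(boxes)), key=lambda i: -scores[i])
    kept = []
    for i in order:
        if score_threshold is not None and scores[i] <= score_threshold:
            continue  # score gate, active only when the input is present
        if all(iou(boxes[i], boxes[j]) <= iou_threshold for j in kept):
            kept.append(i)
        if max_out is not None and len(kept) >= max_out:
            break
    return kept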
@@ -9476,17 +9371,24 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, CastOp):
+            input_dtype = self._ctx_dtype(op.input0)
+            output_dtype = self._ctx_dtype(op.output)
+            output_shape_raw = self._ctx_shape(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
             output_dim_names = _dim_names_for(op.output)
-            shape = CEmitter._shape_dim_exprs(
-
-
+            shape = CEmitter._shape_dim_exprs(
+                output_shape_raw, output_dim_names
+            )
+            loop_vars = CEmitter._loop_vars(output_shape_raw)
+            array_suffix = self._param_array_suffix(
+                output_shape_raw, output_dim_names
+            )
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"],
-                    (params["output"],
+                    (params["input0"], input_dtype.c_type, array_suffix, True),
+                    (params["output"], output_dtype.c_type, array_suffix, False),
                 ]
             )
             rendered = cast_template.render(
@@ -9495,8 +9397,8 @@ class CEmitter:
                 input0=params["input0"],
                 output=params["output"],
                 params=param_decls,
-                input_c_type=
-                output_c_type=
+                input_c_type=input_dtype.c_type,
+                output_c_type=output_dtype.c_type,
                 array_suffix=array_suffix,
                 shape=shape,
                 loop_vars=loop_vars,
@@ -9504,6 +9406,10 @@ class CEmitter:
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, QuantizeLinearOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for QuantizeLinear."
+                )
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
@@ -9545,6 +9451,21 @@ class CEmitter:
                 ]
             )
             compute_type = "double" if op.input_dtype == ScalarType.F64 else "float"
+            compute_dtype = (
+                ScalarType.F64
+                if compute_type == "double"
+                else ScalarType.F32
+            )
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, compute_dtype, scalar_registry
+            )
+            min_fn = self._scalar_function_name(
+                ScalarFunction.MINIMUM, compute_dtype, scalar_registry
+            )
+            if max_fn is None or min_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar min/max functions for QuantizeLinear."
+                )
             round_fn = CEmitter._math_fn(
                 op.input_dtype, "nearbyintf", "nearbyint"
             )
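QuantizeLinear now clamps through the shared min/max helpers and rounds with nearbyint (round-half-to-even in the default FP environment). The per-element arithmetic, as an illustrative sketch:

def quantize_linear(x, scale, zero_point, qmin=-128, qmax=127):
    # y = saturate(round(x / scale) + zero_point); Python's round() is
    # round-half-to-even, matching C nearbyint in the default rounding mode.
    q = round(x / scale) + zero_point
    return max(qmin, min(qmax, q))  # the generated code uses min_fn/max_fn here

print(quantize_linear(0.7, 0.01, 0))  # 70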
@@ -9580,10 +9501,221 @@ class CEmitter:
                 round_fn=round_fn,
                 min_literal=op.dtype.min_literal,
                 max_literal=op.dtype.max_literal,
+                min_fn=min_fn,
+                max_fn=max_fn,
+                dim_args=dim_args,
+            ).rstrip()
+            return with_node_comment(rendered)
+        if isinstance(op, QLinearMatMulOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for QLinearMatMul."
+                )
+            params = self._shared_param_map(
+                [
+                    ("input0", op.input0),
+                    ("input0_scale", op.input0_scale),
+                    ("input0_zero_point", op.input0_zero_point),
+                    ("input1", op.input1),
+                    ("input1_scale", op.input1_scale),
+                    ("input1_zero_point", op.input1_zero_point),
+                    ("output_scale", op.output_scale),
+                    ("output_zero_point", op.output_zero_point),
+                    ("output", op.output),
+                ]
+            )
+            output_shape = CEmitter._codegen_shape(op.output_shape)
+            output_loop_vars = CEmitter._loop_vars(output_shape)
+            output_index_expr = f"{params['output']}" + "".join(
+                f"[{var}]" for var in output_loop_vars
+            )
+            batch_rank = len(op.batch_shape)
+            batch_vars = output_loop_vars[:batch_rank]
+            if op.left_vector and op.right_vector:
+                row_var = None
+                col_var = None
+            elif op.left_vector:
+                row_var = None
+                col_var = output_loop_vars[-1]
+            elif op.right_vector:
+                row_var = output_loop_vars[-1]
+                col_var = None
+            else:
+                row_var = output_loop_vars[-2]
+                col_var = output_loop_vars[-1]
+            input0_index_expr, input1_index_expr = CEmitter._matmul_index_exprs(
+                op,
+                batch_vars,
+                row_var,
+                col_var,
+                batch_rank,
+                input0=params["input0"],
+                input1=params["input1"],
+            )
+            input0_suffix = self._param_array_suffix(op.input0_shape)
+            input1_suffix = self._param_array_suffix(op.input1_shape)
+            input0_scale_suffix = self._param_array_suffix(
+                op.input0_scale_shape
+            )
+            input1_scale_suffix = self._param_array_suffix(
+                op.input1_scale_shape
+            )
+            output_scale_suffix = self._param_array_suffix(
+                op.output_scale_shape
+            )
+            input0_zero_suffix = self._param_array_suffix(op.input0_zero_shape)
+            input1_zero_suffix = self._param_array_suffix(op.input1_zero_shape)
+            output_zero_suffix = self._param_array_suffix(op.output_zero_shape)
+            output_suffix = self._param_array_suffix(op.output_shape)
+            param_decls = self._build_param_decls(
+                [
+                    (
+                        params["input0"],
+                        op.input0_dtype.c_type,
+                        input0_suffix,
+                        True,
+                    ),
+                    (
+                        params["input0_scale"],
+                        op.input0_scale_dtype.c_type,
+                        input0_scale_suffix,
+                        True,
+                    ),
+                    (
+                        params["input0_zero_point"],
+                        op.input0_dtype.c_type,
+                        input0_zero_suffix,
+                        True,
+                    ),
+                    (
+                        params["input1"],
+                        op.input1_dtype.c_type,
+                        input1_suffix,
+                        True,
+                    ),
+                    (
+                        params["input1_scale"],
+                        op.input1_scale_dtype.c_type,
+                        input1_scale_suffix,
+                        True,
+                    ),
+                    (
+                        params["input1_zero_point"],
+                        op.input1_dtype.c_type,
+                        input1_zero_suffix,
+                        True,
+                    ),
+                    (
+                        params["output_scale"],
+                        op.output_scale_dtype.c_type,
+                        output_scale_suffix,
+                        True,
+                    ),
+                    (
+                        params["output_zero_point"],
+                        op.dtype.c_type,
+                        output_zero_suffix,
+                        True,
+                    ),
+                    (
+                        params["output"],
+                        op.dtype.c_type,
+                        output_suffix,
+                        False,
+                    ),
+                ]
+            )
+            compute_dtype = (
+                ScalarType.F64
+                if ScalarType.F64
+                in {
+                    op.input0_scale_dtype,
+                    op.input1_scale_dtype,
+                    op.output_scale_dtype,
+                }
+                else ScalarType.F32
+            )
+            compute_type = (
+                "double" if compute_dtype == ScalarType.F64 else "float"
+            )
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, compute_dtype, scalar_registry
+            )
+            min_fn = self._scalar_function_name(
+                ScalarFunction.MINIMUM, compute_dtype, scalar_registry
+            )
+            if max_fn is None or min_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar min/max functions for QLinearMatMul."
+                )
+            round_fn = CEmitter._math_fn(
+                compute_dtype, "nearbyintf", "nearbyint"
+            )
+            scale_index = "0"
+            rendered = qlinear_matmul_template.render(
+                model_name=model.name,
+                op_name=op_name,
+                input0=params["input0"],
+                input1=params["input1"],
+                input0_scale=params["input0_scale"],
+                input0_zero_point=params["input0_zero_point"],
+                input1_scale=params["input1_scale"],
+                input1_zero_point=params["input1_zero_point"],
+                output_scale=params["output_scale"],
+                output_zero_point=params["output_zero_point"],
+                output=params["output"],
+                params=param_decls,
+                compute_type=compute_type,
+                output_c_type=op.dtype.c_type,
+                input0_index_expr=input0_index_expr,
+                input1_index_expr=input1_index_expr,
+                input0_scale_expr=f"{params['input0_scale']}[{scale_index}]",
+                input1_scale_expr=f"{params['input1_scale']}[{scale_index}]",
+                output_scale_expr=f"{params['output_scale']}[{scale_index}]",
+                input0_zero_expr=f"{params['input0_zero_point']}[{scale_index}]",
+                input1_zero_expr=f"{params['input1_zero_point']}[{scale_index}]",
+                output_zero_expr=f"{params['output_zero_point']}[{scale_index}]",
+                output_loop_vars=output_loop_vars,
+                output_loop_bounds=output_shape,
+                output_index_expr=output_index_expr,
+                k=op.k,
+                round_fn=round_fn,
+                min_literal=op.dtype.min_literal,
+                max_literal=op.dtype.max_literal,
+                min_fn=min_fn,
+                max_fn=max_fn,
                 dim_args=dim_args,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, ClipOp):
+            if scalar_registry is None:
+                raise CodegenError(
+                    "Scalar function registry is required for Clip rendering."
+                )
+            input_shape = self._ctx_shape(op.input0)
+            output_shape_raw = self._ctx_shape(op.output)
+            input_dtype = self._ctx_dtype(op.input0)
+            output_dtype = self._ctx_dtype(op.output)
+            min_shape = (
+                self._ctx_shape(op.input_min)
+                if op.input_min is not None
+                else None
+            )
+            max_shape = (
+                self._ctx_shape(op.input_max)
+                if op.input_max is not None
+                else None
+            )
+            min_fn = self._scalar_function_name(
+                ScalarFunction.MINIMUM, input_dtype, scalar_registry
+            )
+            max_fn = self._scalar_function_name(
+                ScalarFunction.MAXIMUM, input_dtype, scalar_registry
+            )
+            if min_fn is None or max_fn is None:
+                raise CodegenError(
+                    "Failed to resolve scalar min/max functions for Clip."
+                )
             params = self._shared_param_map(
                 [
                     ("input0", op.input0),
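The new QLinearMatMul lowering dequantizes both operands via their scale/zero_point, accumulates in float or double, and requantizes through round plus clamp; the fixed scale_index "0" indicates per-tensor (scalar) quantization parameters are assumed. A sketch of the per-output-element computation (illustrative Python):

def qlinear_dot(a_row, b_col, a_scale, a_zp, b_scale, b_zp,
                out_scale, out_zp, qmin=-128, qmax=127):
    # Dequantize, accumulate in float, then requantize to the output type.
    acc = 0.0
    for a_q, b_q in zip(a_row, b_col):
        acc += (a_q - a_zp) * a_scale * (b_q - b_zp) * b_scale
    q = round(acc / out_scale) + out_zp  # nearbyint-style rounding
    return max(qmin, min(qmax, q))       # min_fn/max_fn clamp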
@@ -9594,61 +9726,61 @@ class CEmitter:
             )
             output_dim_names = _dim_names_for(op.output)
             output_shape = CEmitter._shape_dim_exprs(
-
+                output_shape_raw, output_dim_names
             )
-            loop_vars = CEmitter._loop_vars(
+            loop_vars = CEmitter._loop_vars(output_shape_raw)
             input_expr = CEmitter._broadcast_index_expr(
                 params["input0"],
-
-
+                input_shape,
+                output_shape_raw,
                 loop_vars,
             )
             min_expr = (
                 CEmitter._broadcast_index_expr(
                     params["input_min"],
-
-
+                    min_shape,
+                    output_shape_raw,
                     loop_vars,
                 )
                 if op.input_min is not None
-                else
+                else output_dtype.min_literal
             )
             max_expr = (
                 CEmitter._broadcast_index_expr(
                     params["input_max"],
-
-
+                    max_shape,
+                    output_shape_raw,
                     loop_vars,
                 )
                 if op.input_max is not None
-                else
+                else output_dtype.max_literal
             )
             input_suffix = self._param_array_suffix(
-
+                input_shape, _dim_names_for(op.input0)
             )
             min_suffix = (
                 self._param_array_suffix(
-
+                    min_shape, _dim_names_for(op.input_min)
                 )
-                if
+                if min_shape is not None
                 else ""
             )
             max_suffix = (
                 self._param_array_suffix(
-
+                    max_shape, _dim_names_for(op.input_max)
                 )
-                if
+                if max_shape is not None
                 else ""
             )
             output_suffix = self._param_array_suffix(
-
+                output_shape_raw, output_dim_names
             )
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"],
+                    (params["input0"], input_dtype.c_type, input_suffix, True),
                     (
                         params["input_min"],
-
+                        input_dtype.c_type,
                         min_suffix,
                         True,
                     )
@@ -9656,13 +9788,13 @@ class CEmitter:
                     else (None, "", "", True),
                     (
                         params["input_max"],
-
+                        input_dtype.c_type,
                         max_suffix,
                         True,
                     )
                     if params["input_max"]
                     else (None, "", "", True),
-                    (params["output"],
+                    (params["output"], output_dtype.c_type, output_suffix, False),
                 ]
             )
             rendered = clip_template.render(
@@ -9673,8 +9805,8 @@ class CEmitter:
                 input_max=params["input_max"],
                 output=params["output"],
                 params=param_decls,
-                input_c_type=
-                output_c_type=
+                input_c_type=input_dtype.c_type,
+                output_c_type=output_dtype.c_type,
                 input_suffix=input_suffix,
                 min_suffix=min_suffix,
                 max_suffix=max_suffix,
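Clip falls back to the output dtype's min/max literals when the optional bound inputs are absent, and applies the bounds with the shared min/max helpers. Per element that is simply (a sketch):

def clip(x, lo=None, hi=None, dtype_min=float("-inf"), dtype_max=float("inf")):
    lo = dtype_min if lo is None else lo  # missing min input -> dtype min literal
    hi = dtype_max if hi is None else hi  # missing max input -> dtype max literal
    return min(max(x, lo), hi)            # max_fn, then min_fn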
@@ -9684,30 +9816,51 @@ class CEmitter:
                 input_expr=input_expr,
                 min_expr=min_expr,
                 max_expr=max_expr,
+                min_fn=min_fn,
+                max_fn=max_fn,
                 dim_args=dim_args,
             ).rstrip()
             return with_node_comment(rendered)
         if isinstance(op, UnaryOp):
+            input_dtype = self._ctx_dtype(op.input0)
+            output_dtype = self._ctx_dtype(op.output)
+            output_shape_raw = self._ctx_shape(op.output)
             params = self._shared_param_map(
                 [("input0", op.input0), ("output", op.output)]
             )
             scalar_operator = None
             if scalar_registry is not None:
                 scalar_operator = self._scalar_function_name(
-                    op.function,
+                    op.function, input_dtype, scalar_registry, params=op.params
                 )
             output_dim_names = _dim_names_for(op.output)
-            shape = CEmitter._shape_dim_exprs(
-
-
+            shape = CEmitter._shape_dim_exprs(
+                output_shape_raw, output_dim_names
+            )
+            loop_vars = CEmitter._loop_vars(output_shape_raw)
+            array_suffix = self._param_array_suffix(
+                output_shape_raw, output_dim_names
+            )
             param_decls = self._build_param_decls(
                 [
-                    (params["input0"],
-                    (params["output"],
+                    (params["input0"], input_dtype.c_type, array_suffix, True),
+                    (params["output"], output_dtype.c_type, array_suffix, False),
                 ]
             )
-            operator_symbol = unary_op_symbol(op.function, dtype=
-            if op.function
+            operator_symbol = unary_op_symbol(op.function, dtype=output_dtype)
+            if op.function == ScalarFunction.ISINF and len(op.params) == 2:
+                detect_negative, detect_positive = op.params
+                detect_negative = int(detect_negative)
+                detect_positive = int(detect_positive)
+                if detect_negative and detect_positive:
+                    operator_symbol = "isinf"
+                elif detect_negative:
+                    operator_symbol = "isneginf"
+                elif detect_positive:
+                    operator_symbol = "isposinf"
+                else:
+                    operator_symbol = "zero"
+            elif op.function in {ScalarFunction.ISINF, ScalarFunction.ISNAN}:
                 operator_symbol = (
                     "isinf" if op.function == ScalarFunction.ISINF else "isnan"
                 )
@@ -9722,8 +9875,8 @@ class CEmitter:
                 "array_suffix": array_suffix,
                 "shape": shape,
                 "loop_vars": loop_vars,
-                "input_c_type":
-                "output_c_type":
+                "input_c_type": input_dtype.c_type,
+                "output_c_type": output_dtype.c_type,
                 "zero_literal": zero_literal,
                 "dim_args": dim_args,
                 "params": param_decls,
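IsInf's detect_negative/detect_positive attributes select one of four operator symbols above; with both flags cleared nothing can match, hence the "zero" operator. In Python terms (a sketch):

import math

def isinf_with_flags(x, detect_negative=1, detect_positive=1):
    if detect_negative and detect_positive:
        return math.isinf(x)
    if detect_negative:
        return math.isinf(x) and x < 0  # "isneginf"
    if detect_positive:
        return math.isinf(x) and x > 0  # "isposinf"
    return False                         # "zero": always false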
@@ -9774,6 +9927,7 @@ class CEmitter:
|
|
|
9774
9927
|
| GatherOp
|
|
9775
9928
|
| GatherNDOp
|
|
9776
9929
|
| ScatterNDOp
|
|
9930
|
+
| TensorScatterOp
|
|
9777
9931
|
| TransposeOp
|
|
9778
9932
|
| ReshapeOp
|
|
9779
9933
|
| IdentityOp
|
|
@@ -9803,8 +9957,8 @@ class CEmitter:
|
|
|
9803
9957
|
return op.output_values
|
|
9804
9958
|
return op.output
|
|
9805
9959
|
|
|
9806
|
-
@staticmethod
|
|
9807
9960
|
def _op_inputs(
|
|
9961
|
+
self,
|
|
9808
9962
|
op: BinaryOp
|
|
9809
9963
|
| MultiInputBinaryOp
|
|
9810
9964
|
| WhereOp
|
|
@@ -9840,6 +9994,7 @@ class CEmitter:
|
|
|
9840
9994
|
| GatherOp
|
|
9841
9995
|
| GatherNDOp
|
|
9842
9996
|
| ScatterNDOp
|
|
9997
|
+
| TensorScatterOp
|
|
9843
9998
|
| TransposeOp
|
|
9844
9999
|
| ReshapeOp
|
|
9845
10000
|
| IdentityOp
|
|
@@ -9865,18 +10020,24 @@ class CEmitter:
     ) -> tuple[tuple[str, tuple[int, ...]], ...]:
         if isinstance(op, BinaryOp):
             return (
-                (op.input0, op.
-                (op.input1, op.
+                (op.input0, self._ctx_shape(op.input0)),
+                (op.input1, self._ctx_shape(op.input1)),
             )
         if isinstance(op, MultiInputBinaryOp):
-            return tuple((name,
+            return tuple((name, self._ctx_shape(name)) for name in op.inputs)
+        if isinstance(op, WhereOp):
+            return (
+                (op.condition, self._ctx_shape(op.condition)),
+                (op.input_x, self._ctx_shape(op.input_x)),
+                (op.input_y, self._ctx_shape(op.input_y)),
+            )
         if isinstance(op, EinsumOp):
             return tuple(
                 (name, shape)
                 for name, shape in zip(op.inputs, op.input_shapes)
             )
         if isinstance(op, UnaryOp):
-            return ((op.input0, op.
+            return ((op.input0, self._ctx_shape(op.input0)),)
         if isinstance(op, LpNormalizationOp):
             return ((op.input0, op.shape),)
         if isinstance(op, InstanceNormalizationOp):
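Note: `_op_inputs` (and the sibling `_op_outputs`, `_op_output_shape`, and `_op_output_dtype` methods further down) lose their `@staticmethod` decorator in this release because shapes and dtypes now come from a shared per-graph context via `self._ctx_shape`/`self._ctx_dtype` rather than from fields duplicated on every op. A rough sketch of that lookup pattern, with hypothetical stand-in types (the real definitions live in the new `emx_onnx_cgen/ir/context.py`, whose exact API is not shown in this diff):

from dataclasses import dataclass, field

@dataclass
class TensorInfo:                      # stand-in, not the package's class
    shape: tuple[int, ...]
    dtype: str

@dataclass
class GraphContext:                    # stand-in, not the package's class
    tensors: dict[str, TensorInfo] = field(default_factory=dict)

class Emitter:
    def __init__(self, ctx: GraphContext) -> None:
        self._ctx = ctx

    def _ctx_shape(self, name: str) -> tuple[int, ...]:
        # One authoritative lookup keyed by tensor name.
        return self._ctx.tensors[name].shape

ctx = GraphContext({"x": TensorInfo((2, 3), "float")})
assert Emitter(ctx)._ctx_shape("x") == (2, 3)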
@@ -9901,32 +10062,57 @@ class CEmitter:
         if isinstance(op, RMSNormalizationOp):
             return ((op.input0, op.shape), (op.scale, op.scale_shape))
         if isinstance(op, ClipOp):
-            inputs = [(op.input0, op.
-            if op.input_min is not None
-                inputs.append((op.input_min, op.
-            if op.input_max is not None
-                inputs.append((op.input_max, op.
+            inputs = [(op.input0, self._ctx_shape(op.input0))]
+            if op.input_min is not None:
+                inputs.append((op.input_min, self._ctx_shape(op.input_min)))
+            if op.input_max is not None:
+                inputs.append((op.input_max, self._ctx_shape(op.input_max)))
             return tuple(inputs)
         if isinstance(op, CastOp):
-            return ((op.input0, op.
+            return ((op.input0, self._ctx_shape(op.input0)),)
         if isinstance(op, NonZeroOp):
             return ((op.input0, op.input_shape),)
+        if isinstance(op, NonMaxSuppressionOp):
+            inputs = [
+                (op.boxes, op.boxes_shape),
+                (op.scores, op.scores_shape),
+            ]
+            if (
+                op.max_output_boxes_per_class is not None
+                and op.max_output_shape is not None
+            ):
+                inputs.append(
+                    (op.max_output_boxes_per_class, op.max_output_shape)
+                )
+            if (
+                op.iou_threshold is not None
+                and op.iou_threshold_shape is not None
+            ):
+                inputs.append((op.iou_threshold, op.iou_threshold_shape))
+            if (
+                op.score_threshold is not None
+                and op.score_threshold_shape is not None
+            ):
+                inputs.append(
+                    (op.score_threshold, op.score_threshold_shape)
+                )
+            return tuple(inputs)
         if isinstance(op, QuantizeLinearOp):
             scale_shape = (
                 ()
                 if op.axis is None
-                else (op.
+                else (self._ctx_shape(op.input0)[op.axis],)
             )
-            inputs = [(op.input0, op.
+            inputs = [(op.input0, self._ctx_shape(op.input0)), (op.scale, scale_shape)]
             if op.zero_point is not None:
                 inputs.append((op.zero_point, scale_shape))
             return tuple(inputs)
         if isinstance(op, IdentityOp):
-            return ((op.input0, op.
+            return ((op.input0, self._ctx_shape(op.input0)),)
         if isinstance(op, EyeLikeOp):
             return ((op.input0, op.output_shape),)
         if isinstance(op, TriluOp):
-            inputs = [(op.input0, op.
+            inputs = [(op.input0, self._ctx_shape(op.input0))]
             if op.k_input is not None and op.k_input_shape is not None:
                 inputs.append((op.k_input, op.k_input_shape))
             return tuple(inputs)
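Note: ONNX NonMaxSuppression declares its last three inputs (max_output_boxes_per_class, iou_threshold, score_threshold) as optional, so the new branch above appends a (name, shape) pair only when both the tensor name and its resolved shape are present. The same guarded-append idiom, reduced to a runnable sketch (the `NMSOp` class is an illustrative stand-in, not the lowering's actual record):

from dataclasses import dataclass
from typing import Optional

@dataclass
class NMSOp:
    boxes: str
    boxes_shape: tuple[int, ...]
    scores: str
    scores_shape: tuple[int, ...]
    iou_threshold: Optional[str] = None
    iou_threshold_shape: Optional[tuple[int, ...]] = None

def nms_inputs(op: NMSOp) -> tuple[tuple[str, tuple[int, ...]], ...]:
    inputs = [(op.boxes, op.boxes_shape), (op.scores, op.scores_shape)]
    # Optional input is emitted only when both name and shape resolved.
    if op.iou_threshold is not None and op.iou_threshold_shape is not None:
        inputs.append((op.iou_threshold, op.iou_threshold_shape))
    return tuple(inputs)

assert len(nms_inputs(NMSOp("b", (1, 6, 4), "s", (1, 1, 6)))) == 2
assert len(nms_inputs(NMSOp("b", (1, 6, 4), "s", (1, 1, 6), "iou", ()))) == 3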
@@ -9943,6 +10129,14 @@ class CEmitter:
             return tuple(inputs)
         if isinstance(op, ScatterNDOp):
             return ((op.data, op.data_shape),)
+        if isinstance(op, TensorScatterOp):
+            inputs = [
+                (op.past_cache, op.past_cache_shape),
+                (op.update, op.update_shape),
+            ]
+            if op.write_indices is not None and op.write_indices_shape is not None:
+                inputs.append((op.write_indices, op.write_indices_shape))
+            return tuple(inputs)
         if isinstance(op, CumSumOp):
             return ((op.input0, op.input_shape),)
         if isinstance(op, RangeOp):
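Note: the new TensorScatterOp describes a cache update: an `update` block is written into `past_cache`, optionally at per-batch positions given by `write_indices`. A rough, list-based sketch of those semantics under the assumption that writes happen along the sequence axis (the generated C emits loops over the same three tensors):

def tensor_scatter(past_cache, update, write_indices):
    # past_cache: [batch][seq][feat]; update: [batch][upd_seq][feat]
    out = [[row[:] for row in batch] for batch in past_cache]  # copy cache
    for b, start in enumerate(write_indices):
        for s, row in enumerate(update[b]):
            out[b][start + s] = row[:]  # overwrite cached steps
    return out

cache = [[[0.0, 0.0] for _ in range(4)]]   # batch=1, seq=4, feat=2
upd = [[[1.0, 1.0]]]                       # one new step per batch
assert tensor_scatter(cache, upd, [2])[0][2] == [1.0, 1.0]
assert tensor_scatter(cache, upd, [2])[0][3] == [0.0, 0.0]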
@@ -9956,7 +10150,9 @@ class CEmitter:
         if isinstance(op, SplitOp):
             return ((op.input0, op.input_shape),)
         if isinstance(op, TopKOp):
-            return ((op.input0, op.
+            return ((op.input0, self._ctx_shape(op.input0)),)
+        if isinstance(op, (TransposeOp, ReshapeOp, ReduceOp, ArgReduceOp)):
+            return ((op.input0, self._ctx_shape(op.input0)),)
         return ()

     def _propagate_tensor_dim_names(
@@ -10014,6 +10210,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | RangeOp
         | OneHotOp
@@ -10031,8 +10228,8 @@ class CEmitter:
                 tensor_dim_names[output_name] = dict(dim_names)
                 break

-    @staticmethod
     def _op_outputs(
+        self,
         op: BinaryOp
         | MultiInputBinaryOp
         | WhereOp
@@ -10068,6 +10265,7 @@ class CEmitter:
         | GatherOp
         | GatherNDOp
         | ScatterNDOp
+        | TensorScatterOp
         | TransposeOp
         | ReshapeOp
         | IdentityOp
@@ -10086,14 +10284,40 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | RangeOp
         | OneHotOp
         | SplitOp,
-    ) -> tuple[tuple[str, tuple[int, ...],
+    ) -> tuple[tuple[str, tuple[int, ...], ScalarType], ...]:
+        if isinstance(
+            op,
+            (
+                BinaryOp,
+                MultiInputBinaryOp,
+                WhereOp,
+                UnaryOp,
+                ClipOp,
+                CastOp,
+                TransposeOp,
+                ReshapeOp,
+                IdentityOp,
+                SoftmaxOp,
+                LogSoftmaxOp,
+                HardmaxOp,
+                ReduceOp,
+            ),
+        ):
+            return (
+                (
+                    op.output,
+                    self._op_output_shape(op),
+                    self._ctx_dtype(op.output),
+                ),
+            )
         if isinstance(op, AttentionOp):
-            outputs: list[tuple[str, tuple[int, ...],
-                (op.output,
+            outputs: list[tuple[str, tuple[int, ...], ScalarType]] = [
+                (op.output, self._op_output_shape(op), op.dtype)
             ]
             if op.output_present_key is not None:
                 outputs.append(
@@ -10121,7 +10345,7 @@ class CEmitter:
                 )
             return tuple(outputs)
         if isinstance(op, LstmOp):
-            outputs: list[tuple[str, tuple[int, ...],
+            outputs: list[tuple[str, tuple[int, ...], ScalarType]] = []
             if op.output_y is not None:
                 if op.layout == 0:
                     y_shape = (
@@ -10155,13 +10379,25 @@ class CEmitter:
                     )
                 )
             return tuple(outputs)
+        if isinstance(op, AdagradOp):
+            outputs = [
+                (name, shape, op.dtype)
+                for name, shape in zip(op.outputs, op.output_shapes)
+            ]
+            outputs.extend(
+                (name, shape, op.dtype)
+                for name, shape in zip(
+                    op.accumulator_outputs, op.output_shapes
+                )
+            )
+            return tuple(outputs)
         if isinstance(op, SoftmaxCrossEntropyLossOp):
             outputs = [(op.output, op.output_shape, op.dtype)]
             if op.log_prob is not None and op.log_prob_shape is not None:
                 outputs.append((op.log_prob, op.log_prob_shape, op.dtype))
             return tuple(outputs)
         if isinstance(op, LayerNormalizationOp):
-            outputs: list[tuple[str, tuple[int, ...],
+            outputs: list[tuple[str, tuple[int, ...], ScalarType]] = [
                 (op.output, op.shape, op.dtype)
             ]
             if op.mean_output is not None:
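Note: the AdagradOp branch zips `op.output_shapes` against two name lists because ONNX's (training-preview) Adagrad returns the updated weights followed by the updated squared-gradient accumulators, and each accumulator has the same shape as its weight. A compact sketch of the pairing (function and argument names are illustrative):

def adagrad_outputs(outputs, accumulator_outputs, output_shapes, dtype):
    # Updated weights first, then accumulators, reusing the same shapes.
    result = [
        (name, shape, dtype)
        for name, shape in zip(outputs, output_shapes)
    ]
    result.extend(
        (name, shape, dtype)
        for name, shape in zip(accumulator_outputs, output_shapes)
    )
    return tuple(result)

assert adagrad_outputs(["w1"], ["h1"], [(3,)], "float") == (
    ("w1", (3,), "float"),
    ("h1", (3,), "float"),
)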
@@ -10172,10 +10408,10 @@ class CEmitter:
                 outputs.append((op.invstd_output, invstd_shape, op.dtype))
             return tuple(outputs)
         if isinstance(op, MaxPoolOp):
-            outputs = [(op.output,
+            outputs = [(op.output, self._op_output_shape(op), op.dtype)]
             if op.indices is not None and op.indices_dtype is not None:
                 outputs.append(
-                    (op.indices,
+                    (op.indices, self._op_output_shape(op), op.indices_dtype)
                 )
             return tuple(outputs)
         if isinstance(op, SplitOp):
@@ -10184,30 +10420,40 @@ class CEmitter:
                 for name, shape in zip(op.outputs, op.output_shapes)
             )
         if isinstance(op, ArgReduceOp):
-            return (
+            return (
+                (
+                    op.output,
+                    self._op_output_shape(op),
+                    self._ctx_dtype(op.output),
+                ),
+            )
         if isinstance(op, TopKOp):
             return (
                 (
                     op.output_values,
-
-                    op.
+                    self._op_output_shape(op),
+                    self._ctx_dtype(op.output_values),
                 ),
                 (
                     op.output_indices,
-
-                    op.
+                    self._op_output_shape(op),
+                    self._ctx_dtype(op.output_indices),
                 ),
             )
-
+        if isinstance(op, NonMaxSuppressionOp):
+            return ((op.output, op.output_shape, op.output_dtype),)
+        return ((op.output, self._op_output_shape(op), op.dtype),)

-    @staticmethod
     def _op_output_shape(
+        self,
         op: BinaryOp
         | MultiInputBinaryOp
         | WhereOp
         | UnaryOp
         | ClipOp
         | CastOp
+        | QuantizeLinearOp
+        | QLinearMatMulOp
         | MatMulOp
         | EinsumOp
         | GemmOp
@@ -10249,6 +10495,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -10257,19 +10504,21 @@ class CEmitter:
         | PadOp,
     ) -> tuple[int, ...]:
         if isinstance(op, BinaryOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, MultiInputBinaryOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, WhereOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, UnaryOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, ClipOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, QuantizeLinearOp):
             return op.input_shape
         if isinstance(op, CastOp):
-            return op.
+            return self._ctx_shape(op.output)
+        if isinstance(op, QLinearMatMulOp):
+            return op.output_shape
         if isinstance(op, MatMulOp):
             return op.output_shape
         if isinstance(op, EinsumOp):
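Note: `_op_output_shape` gains branches for the new quantized ops; per the ONNX spec, QLinearMatMul's output shape follows the same rule as plain MatMul (leading batch dims broadcast, inner dims contract). A small sketch of that rule for operands of equal rank (handling of unequal-rank leading dims is elided; the helper name is illustrative):

def matmul_output_shape(a: tuple[int, ...], b: tuple[int, ...]) -> tuple[int, ...]:
    # NumPy/ONNX MatMul rule for >=2-D operands: broadcast the leading
    # (batch) dims, contract a's last dim against b's second-to-last.
    assert a[-1] == b[-2], "inner dimensions must match"
    batch = tuple(max(x, y) for x, y in zip(a[:-2], b[:-2]))
    return batch + (a[-2], b[-1])

assert matmul_output_shape((2, 3, 4), (2, 4, 5)) == (2, 3, 5)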
@@ -10301,11 +10550,11 @@ class CEmitter:
         if isinstance(op, LrnOp):
             return op.shape
         if isinstance(op, SoftmaxOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, LogSoftmaxOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, HardmaxOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, NegativeLogLikelihoodLossOp):
             return op.output_shape
         if isinstance(op, SoftmaxCrossEntropyLossOp):
@@ -10322,12 +10571,14 @@ class CEmitter:
             return op.output_shape
         if isinstance(op, ScatterNDOp):
             return op.output_shape
-        if isinstance(op,
+        if isinstance(op, TensorScatterOp):
             return op.output_shape
+        if isinstance(op, TransposeOp):
+            return self._ctx_shape(op.output)
         if isinstance(op, ReshapeOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, IdentityOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, EyeLikeOp):
             return op.output_shape
         if isinstance(op, TriluOp):
@@ -10347,11 +10598,11 @@ class CEmitter:
         if isinstance(op, GridSampleOp):
             return op.output_shape
         if isinstance(op, ReduceOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, ArgReduceOp):
-            return op.
+            return self._ctx_shape(op.output)
         if isinstance(op, TopKOp):
-            return op.
+            return self._ctx_shape(op.output_values)
         if isinstance(op, ConstantOfShapeOp):
             return op.shape
         if isinstance(op, ShapeOp):
@@ -10360,6 +10611,8 @@ class CEmitter:
             return op.output_shape
         if isinstance(op, NonZeroOp):
             return op.output_shape
+        if isinstance(op, NonMaxSuppressionOp):
+            return op.output_shape
         if isinstance(op, ExpandOp):
             return op.output_shape
         if isinstance(op, CumSumOp):
@@ -10372,8 +10625,8 @@ class CEmitter:
             return (op.batch, op.q_seq, op.q_heads * op.v_head_size)
         return (op.batch, op.q_heads, op.q_seq, op.v_head_size)

-    @staticmethod
     def _op_output_dtype(
+        self,
         op: BinaryOp
         | MultiInputBinaryOp
         | WhereOp
@@ -10399,6 +10652,7 @@ class CEmitter:
         | SoftmaxOp
         | LogSoftmaxOp
         | HardmaxOp
+        | AdagradOp
         | NegativeLogLikelihoodLossOp
         | SoftmaxCrossEntropyLossOp
         | MaxPoolOp
@@ -10420,6 +10674,7 @@ class CEmitter:
         | ShapeOp
         | SizeOp
         | NonZeroOp
+        | NonMaxSuppressionOp
         | ExpandOp
         | CumSumOp
         | RangeOp
@@ -10428,9 +10683,30 @@ class CEmitter:
         | PadOp,
     ) -> ScalarType:
         if isinstance(op, ArgReduceOp):
-            return op.
+            return self._ctx_dtype(op.output)
         if isinstance(op, TopKOp):
-            return op.
+            return self._ctx_dtype(op.output_values)
+        if isinstance(op, NonMaxSuppressionOp):
+            return op.output_dtype
+        if isinstance(
+            op,
+            (
+                BinaryOp,
+                MultiInputBinaryOp,
+                WhereOp,
+                UnaryOp,
+                ClipOp,
+                CastOp,
+                SoftmaxOp,
+                LogSoftmaxOp,
+                HardmaxOp,
+                TransposeOp,
+                ReshapeOp,
+                IdentityOp,
+                ReduceOp,
+            ),
+        ):
+            return self._ctx_dtype(op.output)
         return op.dtype

     @staticmethod
@@ -10815,7 +11091,7 @@ class CEmitter:
         self, constants: tuple[ConstTensor, ...]
     ) -> tuple[tuple[ConstTensor, ...], tuple[ConstTensor, ...]]:
         if self._large_weight_threshold <= 0:
-            return ()
+            return constants, ()
         inline: list[ConstTensor] = []
         large: list[ConstTensor] = []
         for const in constants:
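Note: the one-line change in this last hunk fixes a genuine bug. `_split_constants` promises a pair (inline constants, large constants), and its callers unpack two values; when the large-weight threshold is disabled (<= 0), the old code returned a bare empty tuple, so that unpacking raised ValueError. A distilled reproduction (illustrative stand-in, not the package's code):

def split_constants_old(constants, threshold):
    if threshold <= 0:
        return ()              # bug: not a 2-tuple
    return constants, ()

def split_constants_fixed(constants, threshold):
    if threshold <= 0:
        return constants, ()   # everything stays inline
    return constants, ()       # (real size-based splitting elided)

try:
    inline, large = split_constants_old(("c0",), 0)
except ValueError:
    pass                       # "not enough values to unpack"

inline, large = split_constants_fixed(("c0",), 0)
assert inline == ("c0",) and large == ()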
|