emx-onnx-cgen 0.2.0-py3-none-any.whl → 0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (99)
  1. emx_onnx_cgen/_build_info.py +1 -1
  2. emx_onnx_cgen/_version.py +34 -0
  3. emx_onnx_cgen/cli.py +372 -64
  4. emx_onnx_cgen/codegen/__init__.py +2 -0
  5. emx_onnx_cgen/codegen/c_emitter.py +3932 -1398
  6. emx_onnx_cgen/codegen/emitter.py +5 -0
  7. emx_onnx_cgen/compiler.py +169 -343
  8. emx_onnx_cgen/ir/context.py +87 -0
  9. emx_onnx_cgen/ir/model.py +1 -0
  10. emx_onnx_cgen/ir/op_base.py +193 -0
  11. emx_onnx_cgen/ir/op_context.py +65 -0
  12. emx_onnx_cgen/ir/ops/__init__.py +130 -0
  13. emx_onnx_cgen/ir/ops/elementwise.py +146 -0
  14. emx_onnx_cgen/ir/ops/misc.py +421 -0
  15. emx_onnx_cgen/ir/ops/nn.py +580 -0
  16. emx_onnx_cgen/ir/ops/reduce.py +95 -0
  17. emx_onnx_cgen/lowering/__init__.py +79 -1
  18. emx_onnx_cgen/lowering/adagrad.py +114 -0
  19. emx_onnx_cgen/lowering/arg_reduce.py +1 -1
  20. emx_onnx_cgen/lowering/attention.py +1 -1
  21. emx_onnx_cgen/lowering/average_pool.py +1 -1
  22. emx_onnx_cgen/lowering/batch_normalization.py +1 -1
  23. emx_onnx_cgen/lowering/cast.py +1 -1
  24. emx_onnx_cgen/lowering/common.py +406 -11
  25. emx_onnx_cgen/lowering/concat.py +1 -1
  26. emx_onnx_cgen/lowering/constant_of_shape.py +1 -1
  27. emx_onnx_cgen/lowering/conv.py +1 -1
  28. emx_onnx_cgen/lowering/conv_transpose.py +301 -0
  29. emx_onnx_cgen/lowering/cumsum.py +1 -1
  30. emx_onnx_cgen/lowering/depth_space.py +1 -1
  31. emx_onnx_cgen/lowering/dropout.py +1 -1
  32. emx_onnx_cgen/lowering/einsum.py +153 -0
  33. emx_onnx_cgen/lowering/elementwise.py +152 -4
  34. emx_onnx_cgen/lowering/expand.py +1 -1
  35. emx_onnx_cgen/lowering/eye_like.py +1 -1
  36. emx_onnx_cgen/lowering/flatten.py +1 -1
  37. emx_onnx_cgen/lowering/gather.py +1 -1
  38. emx_onnx_cgen/lowering/gather_elements.py +2 -4
  39. emx_onnx_cgen/lowering/gather_nd.py +79 -0
  40. emx_onnx_cgen/lowering/gemm.py +1 -1
  41. emx_onnx_cgen/lowering/global_max_pool.py +59 -0
  42. emx_onnx_cgen/lowering/grid_sample.py +1 -1
  43. emx_onnx_cgen/lowering/group_normalization.py +1 -1
  44. emx_onnx_cgen/lowering/hardmax.py +53 -0
  45. emx_onnx_cgen/lowering/identity.py +7 -6
  46. emx_onnx_cgen/lowering/instance_normalization.py +1 -1
  47. emx_onnx_cgen/lowering/layer_normalization.py +1 -1
  48. emx_onnx_cgen/lowering/logsoftmax.py +6 -2
  49. emx_onnx_cgen/lowering/lp_normalization.py +1 -1
  50. emx_onnx_cgen/lowering/lp_pool.py +141 -0
  51. emx_onnx_cgen/lowering/lrn.py +1 -1
  52. emx_onnx_cgen/lowering/lstm.py +1 -1
  53. emx_onnx_cgen/lowering/matmul.py +7 -8
  54. emx_onnx_cgen/lowering/maxpool.py +1 -1
  55. emx_onnx_cgen/lowering/mean_variance_normalization.py +1 -1
  56. emx_onnx_cgen/lowering/negative_log_likelihood_loss.py +13 -13
  57. emx_onnx_cgen/lowering/non_max_suppression.py +157 -0
  58. emx_onnx_cgen/lowering/nonzero.py +42 -0
  59. emx_onnx_cgen/lowering/one_hot.py +120 -0
  60. emx_onnx_cgen/lowering/pad.py +1 -1
  61. emx_onnx_cgen/lowering/qlinear_matmul.py +212 -0
  62. emx_onnx_cgen/lowering/quantize_linear.py +126 -0
  63. emx_onnx_cgen/lowering/range.py +1 -1
  64. emx_onnx_cgen/lowering/reduce.py +6 -7
  65. emx_onnx_cgen/lowering/registry.py +24 -5
  66. emx_onnx_cgen/lowering/reshape.py +224 -52
  67. emx_onnx_cgen/lowering/resize.py +1 -1
  68. emx_onnx_cgen/lowering/rms_normalization.py +1 -1
  69. emx_onnx_cgen/lowering/rotary_embedding.py +165 -0
  70. emx_onnx_cgen/lowering/scatter_nd.py +82 -0
  71. emx_onnx_cgen/lowering/shape.py +6 -25
  72. emx_onnx_cgen/lowering/size.py +1 -1
  73. emx_onnx_cgen/lowering/slice.py +1 -1
  74. emx_onnx_cgen/lowering/softmax.py +6 -2
  75. emx_onnx_cgen/lowering/softmax_cross_entropy_loss.py +1 -1
  76. emx_onnx_cgen/lowering/split.py +1 -1
  77. emx_onnx_cgen/lowering/squeeze.py +6 -6
  78. emx_onnx_cgen/lowering/tensor_scatter.py +110 -0
  79. emx_onnx_cgen/lowering/tile.py +1 -1
  80. emx_onnx_cgen/lowering/topk.py +134 -0
  81. emx_onnx_cgen/lowering/transpose.py +1 -1
  82. emx_onnx_cgen/lowering/trilu.py +89 -0
  83. emx_onnx_cgen/lowering/unsqueeze.py +6 -6
  84. emx_onnx_cgen/lowering/variadic.py +1 -1
  85. emx_onnx_cgen/lowering/where.py +1 -1
  86. emx_onnx_cgen/onnx_import.py +4 -0
  87. emx_onnx_cgen/onnxruntime_utils.py +11 -0
  88. emx_onnx_cgen/ops.py +4 -0
  89. emx_onnx_cgen/runtime/evaluator.py +785 -43
  90. emx_onnx_cgen/testbench.py +23 -0
  91. emx_onnx_cgen/verification.py +31 -0
  92. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.1.dist-info}/METADATA +33 -6
  93. emx_onnx_cgen-0.3.1.dist-info/RECORD +107 -0
  94. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.1.dist-info}/WHEEL +1 -1
  95. shared/scalar_functions.py +60 -17
  96. shared/ulp.py +65 -0
  97. emx_onnx_cgen-0.2.0.dist-info/RECORD +0 -76
  98. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.1.dist-info}/entry_points.txt +0 -0
  99. {emx_onnx_cgen-0.2.0.dist-info → emx_onnx_cgen-0.3.1.dist-info}/top_level.txt +0 -0
@@ -10,37 +10,48 @@ from ..errors import ShapeInferenceError, UnsupportedOpError
  from ..ir.model import Graph, Node
  from ..lowering.attention import resolve_attention_spec
  from ..lowering.average_pool import lower_average_pool, lower_global_average_pool
+ from ..lowering.adagrad import lower_adagrad
  from ..lowering.batch_normalization import lower_batch_normalization
  from ..lowering.concat import lower_concat
  from ..lowering.constant_of_shape import lower_constant_of_shape
  from ..lowering.conv import resolve_conv_spec
+ from ..lowering.conv_transpose import resolve_conv_transpose_spec
  from ..lowering.dropout import lower_dropout
  from ..lowering.cumsum import lower_cumsum
+ from ..lowering.einsum import lower_einsum
  from ..lowering.flatten import lower_flatten
  from ..lowering.gemm import resolve_gemm_spec
  from ..lowering.logsoftmax import lower_logsoftmax
+ from ..lowering.hardmax import lower_hardmax
  from ..lowering.lp_normalization import lower_lp_normalization
+ from ..lowering.lp_pool import lower_lp_pool
  from ..lowering.grid_sample import lower_grid_sample
  from ..lowering.instance_normalization import lower_instance_normalization
  from ..lowering.group_normalization import lower_group_normalization
  from ..lowering.layer_normalization import lower_layer_normalization
+ from ..lowering.non_max_suppression import lower_non_max_suppression
  from ..lowering.mean_variance_normalization import (
      lower_mean_variance_normalization,
  )
+ from ..lowering.global_max_pool import lower_global_max_pool
  from ..lowering.negative_log_likelihood_loss import (
      lower_negative_log_likelihood_loss,
  )
+ from ..lowering.nonzero import lower_nonzero
  from ..lowering.pad import lower_pad
  from ..lowering.expand import lower_expand
  from ..lowering.range import lower_range
+ from ..lowering.one_hot import lower_onehot
  from ..lowering.split import lower_split
  from ..lowering.softmax_cross_entropy_loss import (
      lower_softmax_cross_entropy_loss,
  )
  from ..lowering.arg_reduce import lower_arg_reduce
+ from ..lowering.topk import lower_topk
  from ..lowering.lstm import ACTIVATION_KIND_BY_NAME, resolve_lstm_spec
  from ..lowering.lrn import resolve_lrn_spec
  from ..lowering.matmul import lower_matmul
+ from ..lowering.qlinear_matmul import lower_qlinear_matmul
  from ..lowering.maxpool import resolve_maxpool_spec
  from ..lowering.reduce import (
      REDUCE_KIND_BY_OP,
@@ -49,15 +60,19 @@ from ..lowering.reduce import (
      resolve_reduce_axes,
  )
  from ..lowering.reshape import lower_reshape
+ from ..lowering.scatter_nd import lower_scatternd
+ from ..lowering.tensor_scatter import lower_tensor_scatter
  from ..lowering.slice import _normalize_slices
  from ..lowering.shape import lower_shape
  from ..lowering.size import lower_size
  from ..lowering.softmax import lower_softmax
  from ..lowering.rms_normalization import lower_rms_normalization
+ from ..lowering.rotary_embedding import lower_rotary_embedding
  from ..lowering.squeeze import lower_squeeze
  from ..lowering.transpose import lower_transpose
  from ..lowering.unsqueeze import lower_unsqueeze
  from ..lowering.where import lower_where
+ from ..lowering.quantize_linear import resolve_quantize_spec
  from ..lowering.variadic import BINARY_ONLY_OPS, VARIADIC_OP_FUNCTIONS
  from ..lowering.registry import resolve_dispatch
  from ..lowering.common import node_dtype, optional_name, value_dtype, value_shape
@@ -133,6 +148,52 @@ def _eval_matmul(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[node.outputs[0]] = _apply_matmul(left, right)


+ @register_evaluator("Einsum")
+ def _eval_einsum(evaluator: Evaluator, node: Node) -> None:
+     lower_einsum(evaluator.graph, node)
+     equation_value = node.attrs.get("equation")
+     if equation_value is None:
+         raise UnsupportedOpError("Einsum equation attribute is required")
+     equation = (
+         equation_value.decode()
+         if isinstance(equation_value, (bytes, bytearray))
+         else str(equation_value)
+     )
+     inputs = [evaluator.values[name] for name in node.inputs]
+     evaluator.values[node.outputs[0]] = np.einsum(equation, *inputs)
+
+
+ @register_evaluator("Adagrad")
+ def _eval_adagrad(evaluator: Evaluator, node: Node) -> None:
+     op = lower_adagrad(evaluator.graph, node)
+     rate = evaluator.values[op.rate]
+     timestep = evaluator.values[op.timestep]
+     rate_value = (
+         np.array(rate, dtype=op.dtype.np_dtype).reshape(-1)[0].item()
+     )
+     timestep_value = (
+         np.array(timestep, dtype=np.int64).reshape(-1)[0].item()
+     )
+     r = op.dtype.np_dtype.type(
+         rate_value / (1.0 + float(timestep_value) * op.decay_factor)
+     )
+     for x_name, g_name, h_name, out_name, h_out_name in zip(
+         op.inputs,
+         op.gradients,
+         op.accumulators,
+         op.outputs,
+         op.accumulator_outputs,
+     ):
+         x = evaluator.values[x_name]
+         g = evaluator.values[g_name]
+         h = evaluator.values[h_name]
+         g_regularized = op.norm_coefficient * x + g
+         h_new = h + g_regularized * g_regularized
+         h_adaptive = np.sqrt(h_new) + op.epsilon
+         evaluator.values[out_name] = x - r * g_regularized / h_adaptive
+         evaluator.values[h_out_name] = h_new
+
+
  @register_evaluator("Clip")
  def _eval_clip(evaluator: Evaluator, node: Node) -> None:
      if not node.inputs or len(node.outputs) != 1:
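
Note on the hunks above (which, judging by the register_evaluator decorators and the +785 -43 count in the file list, appear to come from emx_onnx_cgen/runtime/evaluator.py): the Adagrad evaluator follows the standard ONNX Adagrad update. A standalone numpy restatement of the same rule, with made-up values, for checking the algebra:

    import numpy as np

    x = np.array([1.0, 2.0], dtype=np.float32)   # parameter tensor
    g = np.array([0.1, -0.2], dtype=np.float32)  # gradient
    h = np.array([0.5, 0.5], dtype=np.float32)   # squared-gradient accumulator
    rate, timestep = 0.01, 3
    norm_coefficient, epsilon, decay_factor = 0.0, 1e-6, 0.0

    r = rate / (1.0 + timestep * decay_factor)   # decayed learning rate
    g_reg = norm_coefficient * x + g             # L2-regularized gradient
    h_new = h + g_reg * g_reg                    # updated accumulator
    x_new = x - r * g_reg / (np.sqrt(h_new) + epsilon)
    print(x_new, h_new)
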
@@ -163,6 +224,79 @@ def _eval_clip(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[node.outputs[0]] = np.clip(x, min_val, max_val)


+ def _max_min(lhs: float, rhs: float) -> tuple[float, float]:
+     if lhs >= rhs:
+         return rhs, lhs
+     return lhs, rhs
+
+
+ def _suppress_by_iou(
+     boxes: np.ndarray,
+     box_index1: int,
+     box_index2: int,
+     *,
+     center_point_box: int,
+     iou_threshold: float,
+ ) -> bool:
+     box1 = boxes[box_index1]
+     box2 = boxes[box_index2]
+     if center_point_box == 0:
+         x1_min, x1_max = _max_min(float(box1[1]), float(box1[3]))
+         x2_min, x2_max = _max_min(float(box2[1]), float(box2[3]))
+         intersection_x_min = max(x1_min, x2_min)
+         intersection_x_max = min(x1_max, x2_max)
+         if intersection_x_max <= intersection_x_min:
+             return False
+
+         y1_min, y1_max = _max_min(float(box1[0]), float(box1[2]))
+         y2_min, y2_max = _max_min(float(box2[0]), float(box2[2]))
+         intersection_y_min = max(y1_min, y2_min)
+         intersection_y_max = min(y1_max, y2_max)
+         if intersection_y_max <= intersection_y_min:
+             return False
+     else:
+         box1_width_half = float(box1[2]) / 2.0
+         box1_height_half = float(box1[3]) / 2.0
+         box2_width_half = float(box2[2]) / 2.0
+         box2_height_half = float(box2[3]) / 2.0
+
+         x1_min = float(box1[0]) - box1_width_half
+         x1_max = float(box1[0]) + box1_width_half
+         x2_min = float(box2[0]) - box2_width_half
+         x2_max = float(box2[0]) + box2_width_half
+
+         y1_min = float(box1[1]) - box1_height_half
+         y1_max = float(box1[1]) + box1_height_half
+         y2_min = float(box2[1]) - box2_height_half
+         y2_max = float(box2[1]) + box2_height_half
+
+         intersection_x_min = max(x1_min, x2_min)
+         intersection_x_max = min(x1_max, x2_max)
+         if intersection_x_max <= intersection_x_min:
+             return False
+
+         intersection_y_min = max(y1_min, y2_min)
+         intersection_y_max = min(y1_max, y2_max)
+         if intersection_y_max <= intersection_y_min:
+             return False
+
+     intersection_area = (intersection_x_max - intersection_x_min) * (
+         intersection_y_max - intersection_y_min
+     )
+     if intersection_area <= 0:
+         return False
+
+     area1 = (x1_max - x1_min) * (y1_max - y1_min)
+     area2 = (x2_max - x2_min) * (y2_max - y2_min)
+     union_area = area1 + area2 - intersection_area
+
+     if area1 <= 0 or area2 <= 0 or union_area <= 0:
+         return False
+
+     intersection_over_union = intersection_area / union_area
+     return intersection_over_union > iou_threshold
+
+
  def _exclusive_cumsum(data: np.ndarray, axis: int) -> np.ndarray:
      result = np.zeros_like(data)
      if data.shape[axis] == 0:
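
_suppress_by_iou computes the intersection rectangle, bails out early on an empty overlap, and compares intersection-over-union against the threshold. A worked corner-format example (made-up boxes, [y1, x1, y2, x2] per the center_point_box == 0 branch):

    import numpy as np

    a = np.array([0.0, 0.0, 2.0, 2.0])   # 2x2 box, area 4
    b = np.array([1.0, 1.0, 3.0, 3.0])   # overlaps a in a 1x1 region
    inter = 1.0 * 1.0
    union = 4.0 + 4.0 - inter
    print(inter / union)  # 1/7 ≈ 0.143: b suppressed only if that exceeds iou_threshold
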
@@ -197,6 +331,100 @@ def _eval_cumsum(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[op.output] = result


+ @register_evaluator("NonMaxSuppression")
+ def _eval_nonmax_suppression(evaluator: Evaluator, node: Node) -> None:
+     op = lower_non_max_suppression(evaluator.graph, node)
+     boxes = evaluator.values[op.boxes]
+     scores = evaluator.values[op.scores]
+
+     max_output_boxes_per_class = 0
+     if op.max_output_boxes_per_class is not None:
+         max_output_values = evaluator.values[
+             op.max_output_boxes_per_class
+         ].astype(np.int64, copy=False)
+         max_output_values = max_output_values.reshape(-1)
+         if max_output_values.size != 1:
+             raise UnsupportedOpError(
+                 "NonMaxSuppression max_output_boxes_per_class must be scalar"
+             )
+         max_output_boxes_per_class = max(int(max_output_values[0]), 0)
+
+     iou_threshold = 0.0
+     if op.iou_threshold is not None:
+         iou_values = evaluator.values[op.iou_threshold].reshape(-1)
+         if iou_values.size != 1:
+             raise UnsupportedOpError(
+                 "NonMaxSuppression iou_threshold must be scalar"
+             )
+         iou_threshold = float(iou_values[0])
+
+     score_threshold = 0.0
+     score_threshold_enabled = op.score_threshold is not None
+     if op.score_threshold is not None:
+         score_values = evaluator.values[op.score_threshold].reshape(-1)
+         if score_values.size != 1:
+             raise UnsupportedOpError(
+                 "NonMaxSuppression score_threshold must be scalar"
+             )
+         score_threshold = float(score_values[0])
+
+     if max_output_boxes_per_class == 0:
+         evaluator.values[op.output] = np.empty((0, 3), dtype=np.int64)
+         return
+
+     num_batches = boxes.shape[0]
+     num_boxes = boxes.shape[1]
+     num_classes = scores.shape[1]
+
+     selected_indices: list[tuple[int, int, int]] = []
+     for batch_index in range(num_batches):
+         batch_boxes = boxes[batch_index]
+         for class_index in range(num_classes):
+             class_scores = scores[batch_index, class_index]
+             candidates: list[tuple[float, int]] = []
+             if score_threshold_enabled:
+                 for box_index in range(num_boxes):
+                     score = float(class_scores[box_index])
+                     if score > score_threshold:
+                         candidates.append((score, box_index))
+             else:
+                 for box_index in range(num_boxes):
+                     candidates.append(
+                         (float(class_scores[box_index]), box_index)
+                     )
+             candidates.sort(key=lambda item: (item[0], -item[1]))
+             selected_boxes: list[int] = []
+             while (
+                 candidates
+                 and len(selected_boxes) < max_output_boxes_per_class
+             ):
+                 _, box_index = candidates.pop()
+                 if any(
+                     _suppress_by_iou(
+                         batch_boxes,
+                         box_index,
+                         selected_index,
+                         center_point_box=op.center_point_box,
+                         iou_threshold=iou_threshold,
+                     )
+                     for selected_index in selected_boxes
+                 ):
+                     continue
+                 selected_boxes.append(box_index)
+                 selected_indices.append(
+                     (batch_index, class_index, box_index)
+                 )
+
+     result = np.empty((len(selected_indices), 3), dtype=np.int64)
+     for idx, (batch_index, class_index, box_index) in enumerate(
+         selected_indices
+     ):
+         result[idx, 0] = batch_index
+         result[idx, 1] = class_index
+         result[idx, 2] = box_index
+     evaluator.values[op.output] = result
+
+
  @register_evaluator("Pad")
  def _eval_pad(evaluator: Evaluator, node: Node) -> None:
      op = lower_pad(evaluator.graph, node)
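
The candidate ordering in the NonMaxSuppression evaluator is worth spelling out: the list is sorted ascending by (score, -box_index), so candidates.pop() yields the highest score and breaks score ties in favor of the lowest box index. Illustrative values:

    candidates = [(0.9, 0), (0.9, 2), (0.5, 1)]
    candidates.sort(key=lambda item: (item[0], -item[1]))
    print(candidates.pop())  # (0.9, 0): index 0 wins the tie
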
@@ -242,10 +470,11 @@ def _eval_pad(evaluator: Evaluator, node: Node) -> None:
      pads_begin = np.zeros(rank, dtype=np.int64)
      pads_end = np.zeros(rank, dtype=np.int64)
      for axis, pad_index in enumerate(op.pads_axis_map):
-         if pad_index is None:
-             continue
-         pads_begin[axis] = int(pads_values[pad_index])
-         pads_end[axis] = int(pads_values[pad_index + axis_count])
+         if pad_index is not None:
+             pads_begin[axis] = int(pads_values[pad_index])
+             pads_end[axis] = int(
+                 pads_values[pad_index + axis_count]
+             )
      pad_width = tuple(
          (int(pads_begin[index]), int(pads_end[index]))
          for index in range(rank)
@@ -270,6 +499,82 @@ def _eval_pad(evaluator: Evaluator, node: Node) -> None:
      )


+ @register_evaluator("ScatterND")
+ def _eval_scatternd(evaluator: Evaluator, node: Node) -> None:
+     op = lower_scatternd(evaluator.graph, node)
+     data = evaluator.values[op.data]
+     indices = evaluator.values[op.indices]
+     updates = evaluator.values[op.updates]
+     output = np.array(data, copy=True)
+     index_depth = op.indices_shape[-1]
+     update_indices_shape = op.indices_shape[:-1]
+     update_count = int(np.prod(update_indices_shape)) if update_indices_shape else 1
+     flat_indices = indices.astype(np.int64, copy=False).reshape(
+         update_count, index_depth
+     )
+     tail_shape = op.data_shape[index_depth:]
+     updates_reshaped = updates.reshape((update_count,) + tail_shape)
+     for index, index_values in enumerate(flat_indices):
+         output_index: list[int | slice] = []
+         for axis, value in enumerate(index_values):
+             axis_size = op.data_shape[axis]
+             idx = int(value)
+             if idx < 0:
+                 idx += axis_size
+             if idx < 0 or idx >= axis_size:
+                 raise UnsupportedOpError(
+                     "ScatterND indices must be within data bounds"
+                 )
+             output_index.append(idx)
+         output_index.extend([slice(None)] * len(tail_shape))
+         target = tuple(output_index)
+         update_value = updates_reshaped[index]
+         if op.reduction == "none":
+             output[target] = update_value
+         elif op.reduction == "add":
+             output[target] = output[target] + update_value
+         elif op.reduction == "mul":
+             output[target] = output[target] * update_value
+         elif op.reduction == "min":
+             output[target] = np.minimum(output[target], update_value)
+         elif op.reduction == "max":
+             output[target] = np.maximum(output[target], update_value)
+         else:
+             raise UnsupportedOpError(
+                 f"Unsupported ScatterND reduction {op.reduction}"
+             )
+     evaluator.values[op.output] = output
+
+
+ @register_evaluator("TensorScatter")
+ def _eval_tensor_scatter(evaluator: Evaluator, node: Node) -> None:
+     op = lower_tensor_scatter(evaluator.graph, node)
+     past_cache = evaluator.values[op.past_cache]
+     update = evaluator.values[op.update]
+     if op.write_indices is None:
+         write_indices = np.zeros((past_cache.shape[0],), dtype=np.int64)
+     else:
+         write_indices = evaluator.values[op.write_indices].astype(
+             np.int64, copy=False
+         )
+     axis = op.axis
+     max_sequence_length = past_cache.shape[axis]
+     sequence_length = update.shape[axis]
+     output = np.array(past_cache, copy=True)
+     for prefix_idx in np.ndindex(past_cache.shape[:axis]):
+         batch_idx = prefix_idx[0]
+         base_index = int(write_indices[batch_idx])
+         for sequence_idx in range(sequence_length):
+             target_index = base_index + sequence_idx
+             if op.mode == "circular":
+                 # Wrap only the sequence position; the batch/prefix
+                 # indices must stay fixed.
+                 target_index %= max_sequence_length
+             output[(*prefix_idx, target_index)] = update[
+                 (*prefix_idx, sequence_idx)
+             ]
+     evaluator.values[op.output] = output
+
+
  @register_evaluator("Celu")
  def _eval_celu(evaluator: Evaluator, node: Node) -> None:
      if len(node.inputs) != 1 or len(node.outputs) != 1:
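
ScatterND semantics in miniature (illustrative tensors, reduction="add"): each row of indices addresses one location in data, and updates that hit the same location accumulate.

    import numpy as np

    data = np.zeros(4, dtype=np.float32)
    indices = np.array([[1], [1], [3]], dtype=np.int64)
    updates = np.array([1.0, 2.0, 5.0], dtype=np.float32)
    out = data.copy()
    for idx, upd in zip(indices, updates):
        out[tuple(idx)] += upd
    print(out)  # [0. 3. 0. 5.]
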
@@ -676,8 +981,22 @@ def _eval_isinf(evaluator: Evaluator, node: Node) -> None:
      output_dtype = value_dtype(evaluator.graph, node.outputs[0], node)
      if output_dtype != ScalarType.BOOL:
          raise UnsupportedOpError("IsInf output must be bool")
+     detect_negative = int(node.attrs.get("detect_negative", 1))
+     detect_positive = int(node.attrs.get("detect_positive", 1))
+     if detect_negative not in {0, 1} or detect_positive not in {0, 1}:
+         raise UnsupportedOpError(
+             "IsInf detect_negative and detect_positive must be 0 or 1"
+         )
      x = evaluator.values[node.inputs[0]]
-     evaluator.values[node.outputs[0]] = np.isinf(x)
+     if detect_negative and detect_positive:
+         result = np.isinf(x)
+     elif detect_negative:
+         result = np.isneginf(x)
+     elif detect_positive:
+         result = np.isposinf(x)
+     else:
+         result = np.zeros(x.shape, dtype=bool)
+     evaluator.values[node.outputs[0]] = result


  @register_evaluator("IsNaN")
@@ -786,6 +1105,40 @@ def _eval_eye_like(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[node.outputs[0]] = output


+ @register_evaluator("Trilu")
+ def _eval_trilu(evaluator: Evaluator, node: Node) -> None:
+     if len(node.inputs) not in {1, 2} or len(node.outputs) != 1:
+         raise UnsupportedOpError("Trilu must have 1 or 2 inputs and 1 output")
+     value = evaluator.values[node.inputs[0]]
+     if value.ndim < 2:
+         raise UnsupportedOpError("Trilu expects input rank >= 2")
+     output_dtype = value_dtype(evaluator.graph, node.outputs[0], node)
+     input_dtype = value_dtype(evaluator.graph, node.inputs[0], node)
+     if output_dtype != input_dtype:
+         raise UnsupportedOpError(
+             "Trilu expects matching input/output dtypes, "
+             f"got {input_dtype.onnx_name} and {output_dtype.onnx_name}"
+         )
+     k = 0
+     if len(node.inputs) == 2 and node.inputs[1]:
+         k_value = np.array(evaluator.values[node.inputs[1]], dtype=np.int64)
+         if k_value.size != 1:
+             raise UnsupportedOpError("Trilu k input must be scalar")
+         k = int(k_value.reshape(-1)[0])
+     upper_attr = node.attrs.get("upper", 1)
+     upper = bool(int(upper_attr))
+     rows, cols = value.shape[-2], value.shape[-1]
+     batch_shape = value.shape[:-2]
+     batch_size = int(np.prod(batch_shape)) if batch_shape else 1
+     view = value.reshape(batch_size, rows, cols)
+     if upper:
+         mask = np.triu(np.ones((rows, cols), dtype=bool), k=k)
+     else:
+         mask = np.tril(np.ones((rows, cols), dtype=bool), k=k)
+     output = np.where(mask, view, np.zeros_like(view))
+     evaluator.values[node.outputs[0]] = output.reshape(value.shape)
+
+
  @register_evaluator("Tile")
  def _eval_tile(evaluator: Evaluator, node: Node) -> None:
      if len(node.inputs) != 2 or len(node.outputs) != 1:
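
The Trilu mask construction, shown on a concrete 3x3 upper-triangular case with k=1 (illustrative): True keeps the element, False zeroes it.

    import numpy as np

    mask = np.triu(np.ones((3, 3), dtype=bool), k=1)
    x = np.arange(1, 10).reshape(3, 3)
    print(np.where(mask, x, 0))
    # [[0 2 3]
    #  [0 0 6]
    #  [0 0 0]]
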
@@ -922,6 +1275,73 @@ def _eval_gather(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[node.outputs[0]] = np.take(data, indices, axis=axis)


+ @register_evaluator("GatherND")
+ def _eval_gather_nd(evaluator: Evaluator, node: Node) -> None:
+     if len(node.inputs) != 2 or len(node.outputs) != 1:
+         raise UnsupportedOpError("GatherND must have 2 inputs and 1 output")
+     data = evaluator.values[node.inputs[0]]
+     indices = evaluator.values[node.inputs[1]]
+     if indices.dtype.type not in {np.int32, np.int64}:
+         raise UnsupportedOpError(
+             f"GatherND indices must be int32 or int64, got {indices.dtype}"
+         )
+     if indices.ndim < 1:
+         raise UnsupportedOpError("GatherND indices must have rank >= 1")
+     batch_dims = int(node.attrs.get("batch_dims", 0))
+     if batch_dims < 0:
+         raise UnsupportedOpError(
+             f"GatherND batch_dims must be >= 0, got {batch_dims}"
+         )
+     if batch_dims > indices.ndim - 1:
+         raise UnsupportedOpError(
+             "GatherND batch_dims must be <= indices rank - 1, "
+             f"got {batch_dims} vs {indices.ndim - 1}"
+         )
+     if batch_dims > data.ndim:
+         raise UnsupportedOpError(
+             "GatherND batch_dims must be <= data rank, "
+             f"got {batch_dims} vs {data.ndim}"
+         )
+     if tuple(data.shape[:batch_dims]) != tuple(indices.shape[:batch_dims]):
+         raise UnsupportedOpError(
+             "GatherND batch_dims must match on data/indices, "
+             f"got {data.shape} vs {indices.shape}"
+         )
+     index_depth = indices.shape[-1]
+     if index_depth <= 0:
+         raise UnsupportedOpError(
+             "GatherND indices final dimension must be >= 1"
+         )
+     if index_depth > data.ndim - batch_dims:
+         raise UnsupportedOpError(
+             "GatherND indices final dimension must be <= data rank - "
+             f"batch_dims, got {index_depth} vs {data.ndim - batch_dims}"
+         )
+     tail_shape = data.shape[batch_dims + index_depth :]
+     output_shape = indices.shape[:-1] + tail_shape
+     output = np.empty(output_shape, dtype=data.dtype)
+     indices_prefix_shape = indices.shape[:-1]
+     prefix_iter = (
+         np.ndindex(*indices_prefix_shape) if indices_prefix_shape else [()]
+     )
+     for prefix in prefix_iter:
+         raw_index = indices[prefix]
+         if index_depth == 1:
+             index_values = [int(np.asarray(raw_index).item())]
+         else:
+             index_values = [int(value) for value in raw_index]
+         for dim_index, value in enumerate(index_values):
+             if value < 0:
+                 index_values[dim_index] = value + data.shape[
+                     batch_dims + dim_index
+                 ]
+         data_index = list(prefix[:batch_dims]) + index_values
+         data_index.extend([slice(None)] * len(tail_shape))
+         output_index = prefix + (slice(None),) * len(tail_shape)
+         output[output_index] = data[tuple(data_index)]
+     evaluator.values[node.outputs[0]] = output
+
+
  @register_evaluator("Slice")
  def _eval_slice(evaluator: Evaluator, node: Node) -> None:
      input_value = evaluator.values[node.inputs[0]]
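
GatherND treats the last indices axis as a coordinate tuple into data; any trailing data axes are copied whole. A minimal batch_dims=0 example (made-up data):

    import numpy as np

    data = np.arange(8).reshape(2, 2, 2)
    indices = np.array([[0, 1], [1, 0]])   # two rank-2 coordinate prefixes
    out = np.stack([data[tuple(i)] for i in indices])
    print(out)  # [[2 3] [4 5]], shape (2, 2)
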
@@ -1010,6 +1430,49 @@ def _eval_attention(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[qk_matmul_output_name] = qk_output


+ @register_evaluator("RotaryEmbedding")
+ def _eval_rotary_embedding(evaluator: Evaluator, node: Node) -> None:
+     op = lower_rotary_embedding(evaluator.graph, node)
+     x = evaluator.values[op.input0]
+     cos_cache = evaluator.values[op.cos_cache]
+     sin_cache = evaluator.values[op.sin_cache]
+     position_ids = (
+         evaluator.values[op.position_ids] if op.position_ids else None
+     )
+     original_shape = x.shape
+     if op.input_rank == 4:
+         x = np.transpose(x, (0, 2, 1, 3))
+     else:
+         x = x.reshape(op.batch, op.seq_len, op.num_heads, op.head_size)
+     x_rotate = x[..., : op.rotary_dim]
+     x_not_rotate = x[..., op.rotary_dim :]
+     if position_ids is not None:
+         cos_cache = cos_cache[position_ids]
+         sin_cache = sin_cache[position_ids]
+     cos_cache = np.expand_dims(cos_cache, axis=2)
+     sin_cache = np.expand_dims(sin_cache, axis=2)
+     if op.interleaved:
+         x1 = x_rotate[..., 0::2]
+         x2 = x_rotate[..., 1::2]
+     else:
+         x1, x2 = np.split(x_rotate, 2, axis=-1)
+     real = (cos_cache * x1) - (sin_cache * x2)
+     imag = (sin_cache * x1) + (cos_cache * x2)
+     if op.interleaved:
+         real = np.expand_dims(real, axis=-1)
+         imag = np.expand_dims(imag, axis=-1)
+         x_rotate_concat = np.concatenate((real, imag), axis=-1)
+         x_rotate = np.reshape(x_rotate_concat, x_rotate.shape)
+     else:
+         x_rotate = np.concatenate((real, imag), axis=-1)
+     output = np.concatenate((x_rotate, x_not_rotate), axis=-1)
+     if op.input_rank == 4:
+         output = np.transpose(output, (0, 2, 1, 3))
+     else:
+         output = output.reshape(original_shape)
+     evaluator.values[node.outputs[0]] = output
+
+
  def _apply_lstm_activation(
      kind: int, value: np.ndarray, alpha: float, beta: float
  ) -> np.ndarray:
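
The RotaryEmbedding math above is complex multiplication by e^(i*theta): with the non-interleaved split x = [x1, x2], the rotated pair is (cos*x1 - sin*x2, sin*x1 + cos*x2). An illustrative scalar check:

    import numpy as np

    theta = 0.5
    x1, x2 = 1.0, 0.0
    real = np.cos(theta) * x1 - np.sin(theta) * x2
    imag = np.sin(theta) * x1 + np.cos(theta) * x2
    print(real, imag)  # (cos 0.5, sin 0.5): a unit vector rotated by 0.5 rad
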
@@ -1101,6 +1564,28 @@ def _eval_conv(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[node.outputs[0]] = _apply_conv(spec, data, weights, bias)


+ @register_evaluator("ConvTranspose")
+ def _eval_conv_transpose(evaluator: Evaluator, node: Node) -> None:
+     op_dtype = value_dtype(evaluator.graph, node.inputs[0], node)
+     output_dtype = value_dtype(evaluator.graph, node.outputs[0], node)
+     if op_dtype != output_dtype:
+         raise UnsupportedOpError(
+             f"{node.op_type} expects matching input/output dtypes, "
+             f"got {op_dtype.onnx_name} and {output_dtype.onnx_name}"
+         )
+     if not op_dtype.is_float:
+         raise UnsupportedOpError(
+             "ConvTranspose supports float16, float, and double inputs only"
+         )
+     spec = resolve_conv_transpose_spec(evaluator.graph, node)
+     data = evaluator.values[node.inputs[0]]
+     weights = evaluator.values[node.inputs[1]]
+     bias = evaluator.values[node.inputs[2]] if len(node.inputs) > 2 else None
+     evaluator.values[node.outputs[0]] = _apply_conv_transpose(
+         spec, data, weights, bias
+     )
+
+
  @register_evaluator("BatchNormalization")
  def _eval_batch_norm(evaluator: Evaluator, node: Node) -> None:
      op = lower_batch_normalization(evaluator.graph, node)
@@ -1133,6 +1618,94 @@ def _eval_lp_normalization(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[op.output] = data / denom


+ @register_evaluator("LpPool")
+ def _eval_lp_pool(evaluator: Evaluator, node: Node) -> None:
+     op = lower_lp_pool(evaluator.graph, node)
+     data = evaluator.values[op.input0]
+     output = np.zeros(
+         (op.batch, op.channels, op.out_h, op.out_w), dtype=data.dtype
+     )
+     for n in range(op.batch):
+         for c in range(op.channels):
+             for out_h in range(op.out_h):
+                 for out_w in range(op.out_w):
+                     h_start = out_h * op.stride_h - op.pad_top
+                     w_start = out_w * op.stride_w - op.pad_left
+                     acc = 0.0
+                     for kh in range(op.kernel_h):
+                         for kw in range(op.kernel_w):
+                             in_h = h_start + kh
+                             in_w = w_start + kw
+                             if (
+                                 0 <= in_h < op.in_h
+                                 and 0 <= in_w < op.in_w
+                             ):
+                                 value = data[(n, c, in_h, in_w)]
+                                 acc += abs(value) ** op.p
+                     output[(n, c, out_h, out_w)] = acc ** (1.0 / op.p)
+     evaluator.values[op.output] = output
+
+
+ @register_evaluator("QuantizeLinear")
+ def _eval_quantize_linear(evaluator: Evaluator, node: Node) -> None:
+     spec = resolve_quantize_spec(evaluator.graph, node)
+     data = evaluator.values[node.inputs[0]]
+     scale = evaluator.values[node.inputs[1]]
+     zero_point_name = optional_name(node.inputs, 2)
+     if zero_point_name is None:
+         zero_point = 0
+     else:
+         zero_point = evaluator.values[zero_point_name]
+     if spec.axis is None:
+         scaled = data / scale + zero_point
+     else:
+         shape = [1] * data.ndim
+         shape[spec.axis] = scale.shape[0]
+         scaled = data / scale.reshape(shape) + np.asarray(zero_point).reshape(
+             shape
+         )
+     rounded = np.rint(scaled)
+     info = np.iinfo(spec.output_dtype.np_dtype)
+     clipped = np.clip(rounded, info.min, info.max)
+     evaluator.values[node.outputs[0]] = clipped.astype(
+         spec.output_dtype.np_dtype, copy=False
+     )
+
+
+ @register_evaluator("QLinearMatMul")
+ def _eval_qlinear_matmul(evaluator: Evaluator, node: Node) -> None:
+     op = lower_qlinear_matmul(evaluator.graph, node)
+     input0 = evaluator.values[op.input0]
+     input1 = evaluator.values[op.input1]
+     input0_scale = evaluator.values[op.input0_scale]
+     input1_scale = evaluator.values[op.input1_scale]
+     output_scale = evaluator.values[op.output_scale]
+     input0_zero_point = evaluator.values[op.input0_zero_point]
+     input1_zero_point = evaluator.values[op.input1_zero_point]
+     output_zero_point = evaluator.values[op.output_zero_point]
+
+     def _scalar_value(array: np.ndarray) -> float:
+         return float(np.asarray(array).reshape(-1)[0])
+
+     def _scalar_int(array: np.ndarray) -> int:
+         return int(np.asarray(array).reshape(-1)[0])
+
+     input0_zero = _scalar_int(input0_zero_point)
+     input1_zero = _scalar_int(input1_zero_point)
+     output_zero = _scalar_int(output_zero_point)
+     scale = _scalar_value(input0_scale) * _scalar_value(
+         input1_scale
+     ) / _scalar_value(output_scale)
+     acc = _apply_matmul(
+         input0.astype(np.int32) - input0_zero,
+         input1.astype(np.int32) - input1_zero,
+     )
+     scaled = acc.astype(np.float64) * scale + output_zero
+     rounded = np.rint(scaled)
+     info = np.iinfo(op.dtype.np_dtype)
+     clipped = np.clip(rounded, info.min, info.max)
+     evaluator.values[op.output] = clipped.astype(op.dtype.np_dtype)
+
  @register_evaluator("InstanceNormalization")
  def _eval_instance_normalization(evaluator: Evaluator, node: Node) -> None:
      op = lower_instance_normalization(evaluator.graph, node)
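
The QLinearMatMul requantization pipeline, restated on tiny made-up inputs: integer accumulation on zero-point-adjusted values, then rescale by (s0 * s1 / s_out) and round half-to-even (np.rint), saturating to the output type.

    import numpy as np

    a = np.array([[2, 4]], dtype=np.uint8)
    b = np.array([[1], [3]], dtype=np.uint8)
    s0, s1, s_out = 0.1, 0.2, 0.05
    za, zb, z_out = 0, 0, 0
    acc = (a.astype(np.int32) - za) @ (b.astype(np.int32) - zb)   # [[14]]
    out = np.rint(acc * (s0 * s1 / s_out) + z_out).clip(0, 255).astype(np.uint8)
    print(out)  # [[6]] since 14 * 0.4 = 5.6 rounds to 6
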
@@ -1284,6 +1857,18 @@ def _eval_maxpool(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[indices_output] = indices


+ @register_evaluator("GlobalMaxPool")
+ def _eval_global_max_pool(evaluator: Evaluator, node: Node) -> None:
+     op = lower_global_max_pool(evaluator.graph, node)
+     value = evaluator.values[node.inputs[0]]
+     if not op.axes:
+         evaluator.values[node.outputs[0]] = value.copy()
+         return
+     evaluator.values[node.outputs[0]] = np.max(
+         value, axis=op.axes, keepdims=op.keepdims
+     )
+
+
  @register_evaluator("Softmax")
  def _eval_softmax(evaluator: Evaluator, node: Node) -> None:
      op = lower_softmax(evaluator.graph, node)
@@ -1298,6 +1883,19 @@ def _eval_logsoftmax(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[node.outputs[0]] = _apply_logsoftmax(value, op.axis)


+ @register_evaluator("Hardmax")
+ def _eval_hardmax(evaluator: Evaluator, node: Node) -> None:
+     op = lower_hardmax(evaluator.graph, node)
+     value = evaluator.values[node.inputs[0]]
+     max_values = np.max(value, axis=op.axis, keepdims=True)
+     is_max = value == max_values
+     max_index = np.argmax(is_max, axis=op.axis)
+     output = np.zeros_like(value)
+     ones = np.array(1.0, dtype=value.dtype)
+     np.put_along_axis(output, np.expand_dims(max_index, axis=op.axis), ones, axis=op.axis)
+     evaluator.values[node.outputs[0]] = output
+
+
  @register_evaluator("NegativeLogLikelihoodLoss")
  def _eval_negative_log_likelihood_loss(
      evaluator: Evaluator, node: Node
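
Hardmax as implemented above is a one-hot at the first occurrence of the maximum along the axis (np.argmax breaks ties toward the lower index). Illustrative row:

    import numpy as np

    x = np.array([[1.0, 3.0, 3.0]])
    idx = np.argmax(x, axis=-1)                      # [1], first max wins
    out = np.zeros_like(x)
    np.put_along_axis(out, idx[:, None], 1.0, axis=-1)
    print(out)  # [[0. 1. 0.]]
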
@@ -1409,6 +2007,16 @@ def _eval_size(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[op.output] = np.array(op.value, dtype=np.int64)


+ @register_evaluator("NonZero")
+ def _eval_nonzero(evaluator: Evaluator, node: Node) -> None:
+     op = lower_nonzero(evaluator.graph, node)
+     values = evaluator.values[op.input0]
+     indices = np.nonzero(values)
+     evaluator.values[op.output] = np.stack(indices, axis=0).astype(
+         np.int64, copy=False
+     )
+
+
  @register_evaluator("Expand")
  def _eval_expand(evaluator: Evaluator, node: Node) -> None:
      op = lower_expand(evaluator.graph, node)
@@ -1428,6 +2036,45 @@ def _eval_range(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[op.output] = output


+ @register_evaluator("OneHot")
+ def _eval_onehot(evaluator: Evaluator, node: Node) -> None:
+     op = lower_onehot(evaluator.graph, node)
+     indices = evaluator.values[op.indices].astype(np.int64, copy=False)
+     depth_values = evaluator.values[op.depth].reshape(-1)
+     if depth_values.size != 1:
+         raise UnsupportedOpError("OneHot depth input must be a scalar")
+     depth_value = int(depth_values[0])
+     if depth_value < 0:
+         raise UnsupportedOpError("OneHot depth must be non-negative")
+     values = evaluator.values[op.values].reshape(-1)
+     if values.size != 2:
+         raise UnsupportedOpError("OneHot values input must have 2 elements")
+     off_value, on_value = values[0], values[1]
+     if depth_value == 0:
+         evaluator.values[op.output] = np.full(
+             op.output_shape, off_value, dtype=values.dtype
+         )
+         return
+     axis = op.axis
+     rank = indices.ndim
+     if axis < 0:
+         axis += rank + 1
+     depth_range = np.arange(depth_value, dtype=np.int64)
+     new_shape = (1,) * axis + (depth_value,) + (1,) * (rank - axis)
+     targets = depth_range.reshape(new_shape)
+     adjusted = np.mod(indices, depth_value) if depth_value > 0 else indices
+     values_reshaped = np.reshape(
+         adjusted, indices.shape[:axis] + (1,) + indices.shape[axis:]
+     )
+     valid_mask = (indices >= -depth_value) & (indices < depth_value)
+     valid_mask = np.reshape(
+         valid_mask, indices.shape[:axis] + (1,) + indices.shape[axis:]
+     )
+     one_hot = (targets == values_reshaped) & valid_mask
+     output = np.where(one_hot, on_value, off_value).astype(values.dtype)
+     evaluator.values[op.output] = output
+
+
  @register_evaluator("Split")
  def _eval_split(evaluator: Evaluator, node: Node) -> None:
      op = lower_split(evaluator.graph, node)
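
Negative OneHot indices wrap modulo depth, which is what the np.mod above implements (illustrative values):

    import numpy as np

    indices = np.array([-1, 1])
    depth = 3
    print(np.mod(indices, depth))             # [2 1]
    print(np.eye(depth)[np.mod(indices, depth)])
    # [[0. 0. 1.]
    #  [0. 1. 0.]]
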
@@ -1550,6 +2197,39 @@ def _eval_arg_reduce(evaluator: Evaluator, node: Node) -> None:
      evaluator.values[op.output] = indices.astype(op.output_dtype.np_dtype)


+ @register_evaluator("TopK")
+ def _eval_topk(evaluator: Evaluator, node: Node) -> None:
+     op = lower_topk(evaluator.graph, node)
+     value = evaluator.values[op.input0]
+     moved = np.moveaxis(value, op.axis, -1)
+     axis_dim = moved.shape[-1]
+     flat = moved.reshape(-1, axis_dim)
+     values_out = np.empty((flat.shape[0], op.k), dtype=value.dtype)
+     indices_out = np.empty((flat.shape[0], op.k), dtype=np.int64)
+     for row_index in range(flat.shape[0]):
+         row = flat[row_index]
+         order = sorted(
+             range(axis_dim),
+             key=lambda idx: (
+                 -row[idx].item() if op.largest else row[idx].item(),
+                 idx,
+             ),
+         )
+         topk = order[: op.k]
+         indices_out[row_index] = topk
+         values_out[row_index] = row[topk]
+     values_out = values_out.reshape(moved.shape[:-1] + (op.k,))
+     indices_out = indices_out.reshape(moved.shape[:-1] + (op.k,))
+     values_out = np.moveaxis(values_out, -1, op.axis)
+     indices_out = np.moveaxis(indices_out, -1, op.axis)
+     evaluator.values[op.output_values] = values_out.astype(
+         op.output_values_dtype.np_dtype
+     )
+     evaluator.values[op.output_indices] = indices_out.astype(
+         op.output_indices_dtype.np_dtype
+     )
+
+
  def _eval_binary_unary(evaluator: Evaluator, node: Node) -> None:
      if node.op_type == "BitShift":
          if len(node.inputs) != 2 or len(node.outputs) != 1:
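
The TopK sort key yields descending values with ascending-index tie-breaks when largest=1 (illustrative row):

    row = [5.0, 7.0, 7.0, 1.0]
    order = sorted(range(len(row)), key=lambda i: (-row[i], i))
    print(order[:2])  # [1, 2]: equal values keep their original order
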
@@ -1671,9 +2351,8 @@ def _matmul_batch_broadcastable(
      left_padded = (1,) * (max_rank - len(left)) + left
      right_padded = (1,) * (max_rank - len(right)) + right
      for left_dim, right_dim in zip(left_padded, right_padded):
-         if left_dim == right_dim or left_dim == 1 or right_dim == 1:
-             continue
-         return False
+         if not (left_dim == right_dim or left_dim == 1 or right_dim == 1):
+             return False
      return True

@@ -1916,7 +2595,9 @@ def _apply_attention(
      return output, key_total, value_total, qk_output


- def _apply_conv(spec, data: np.ndarray, weights: np.ndarray, bias: np.ndarray | None) -> np.ndarray:
+ def _apply_conv(
+     spec, data: np.ndarray, weights: np.ndarray, bias: np.ndarray | None
+ ) -> np.ndarray:
      output = np.zeros(
          (spec.batch, spec.out_channels, *spec.out_spatial),
          dtype=data.dtype,
@@ -1958,15 +2639,67 @@ def _apply_conv(spec, data: np.ndarray, weights: np.ndarray, bias: np.ndarray |
                                  valid = False
                                  break
                              in_index.append(in_dim)
-                         if not valid:
-                             continue
-                         acc += data[(n, ic_global, *in_index)] * weights[
-                             (oc_global, ic, *kernel_index)
-                         ]
+                         if valid:
+                             acc += data[(n, ic_global, *in_index)] * weights[
+                                 (oc_global, ic, *kernel_index)
+                             ]
                  output[(n, oc_global, *out_index)] = acc
      return output


+ def _apply_conv_transpose(
+     spec, data: np.ndarray, weights: np.ndarray, bias: np.ndarray | None
+ ) -> np.ndarray:
+     output = np.zeros(
+         (spec.batch, spec.out_channels, *spec.out_spatial), dtype=data.dtype
+     )
+     if bias is not None:
+         output += bias.reshape((1, spec.out_channels) + (1,) * spec.spatial_rank)
+     pad_begin = spec.pads[: spec.spatial_rank]
+     group_in_channels = spec.in_channels // spec.group
+     group_out_channels = spec.out_channels // spec.group
+     for n in range(spec.batch):
+         for g in range(spec.group):
+             oc_base = g * group_out_channels
+             ic_base = g * group_in_channels
+             for ic in range(group_in_channels):
+                 ic_global = ic_base + ic
+                 for in_index in np.ndindex(*spec.in_spatial):
+                     value = data[(n, ic_global, *in_index)]
+                     for oc in range(group_out_channels):
+                         oc_global = oc_base + oc
+                         for kernel_index in np.ndindex(*spec.kernel_shape):
+                             out_index = []
+                             valid = True
+                             for (
+                                 in_dim,
+                                 kernel_dim,
+                                 stride,
+                                 dilation,
+                                 pad,
+                                 out_size,
+                             ) in zip(
+                                 in_index,
+                                 kernel_index,
+                                 spec.strides,
+                                 spec.dilations,
+                                 pad_begin,
+                                 spec.out_spatial,
+                             ):
+                                 out_dim = (
+                                     in_dim * stride + kernel_dim * dilation - pad
+                                 )
+                                 if out_dim < 0 or out_dim >= out_size:
+                                     valid = False
+                                     break
+                                 out_index.append(out_dim)
+                             if valid:
+                                 output[(n, oc_global, *out_index)] += (
+                                     value * weights[(ic_global, oc, *kernel_index)]
+                                 )
+     return output
+
+
  def _apply_lrn(spec, data: np.ndarray) -> np.ndarray:
      output = np.empty_like(data)
      spatial_shape = spec.shape[2:]
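
The scatter loop in _apply_conv_transpose maps every input position to out = in*stride + k*dilation - pad_begin. A 1-D sanity check (made-up values) against the usual ConvTranspose output size (in-1)*stride + (k-1)*dilation + 1 - pads:

    import numpy as np

    x = np.array([1.0, 2.0])          # in = 2
    w = np.array([1.0, 1.0, 1.0])     # k = 3
    stride, dilation, pad = 2, 1, 0
    out = np.zeros((2 - 1) * stride + (3 - 1) * dilation + 1)  # size 5
    for i, xv in enumerate(x):
        for k, wv in enumerate(w):
            out[i * stride + k * dilation - pad] += xv * wv
    print(out)  # [1. 1. 3. 2. 2.]
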
@@ -2002,15 +2735,15 @@ def _apply_average_pool(op, data: np.ndarray) -> np.ndarray:
                      if ih < 0 or ih >= op.in_h:
                          if op.count_include_pad:
                              count += op.kernel_w
-                         continue
-                     for kw in range(op.kernel_w):
-                         iw = ow * op.stride_w + kw - op.pad_left
-                         if iw < 0 or iw >= op.in_w:
-                             if op.count_include_pad:
+                     else:
+                         for kw in range(op.kernel_w):
+                             iw = ow * op.stride_w + kw - op.pad_left
+                             if iw < 0 or iw >= op.in_w:
+                                 if op.count_include_pad:
+                                     count += 1
+                             else:
+                                 acc += data[n, c, ih, iw]
                                  count += 1
-                             continue
-                         acc += data[n, c, ih, iw]
-                         count += 1
                  output[n, c, oh, ow] = 0.0 if count == 0 else acc / float(count)
      return output

@@ -2059,25 +2792,30 @@ def _apply_maxpool(
                                  valid = False
                                  break
                              in_index.append(idx)
-                         if not valid:
-                             continue
-                         value = data[(n, c, *in_index)]
-                         if value > max_value or not has_value:
-                             max_value = value
-                             has_value = True
-                             if return_indices:
-                                 linear_index = n * spec.channels + c
-                                 if spec.storage_order == 0:
-                                     for idx, size in zip(in_index, spec.in_spatial):
-                                         linear_index = linear_index * size + idx
-                                 else:
-                                     spatial_index = 0
-                                     spatial_stride = 1
-                                     for idx, size in zip(in_index, spec.in_spatial):
-                                         spatial_index += idx * spatial_stride
-                                         spatial_stride *= size
-                                     linear_index = linear_index * spatial_stride + spatial_index
-                                 max_index = linear_index
+                         if valid:
+                             value = data[(n, c, *in_index)]
+                             if value > max_value or not has_value:
+                                 max_value = value
+                                 has_value = True
+                                 if return_indices:
+                                     linear_index = n * spec.channels + c
+                                     if spec.storage_order == 0:
+                                         for idx, size in zip(
+                                             in_index, spec.in_spatial
+                                         ):
+                                             linear_index = linear_index * size + idx
+                                     else:
+                                         spatial_index = 0
+                                         spatial_stride = 1
+                                         for idx, size in zip(
+                                             in_index, spec.in_spatial
+                                         ):
+                                             spatial_index += idx * spatial_stride
+                                             spatial_stride *= size
+                                         linear_index = (
+                                             linear_index * spatial_stride + spatial_index
+                                         )
+                                     max_index = linear_index
                      output[(n, c, *out_index)] = max_value
                      if return_indices and indices is not None:
                          indices[(n, c, *out_index)] = max_index
@@ -2162,8 +2900,12 @@ def _apply_lstm(
          beta_g = spec.activation_betas[act_offset + 1]
          beta_h = spec.activation_betas[act_offset + 2]
          for step in range(seq_length):
-             t_index = step if dir_kind == "forward" else seq_length - 1 - step
-             x_t = x[t_index]
+             if dir_kind == "forward":
+                 x_t = x[step]
+             else:
+                 t_indices = sequence_lens - 1 - step
+                 t_indices = np.clip(t_indices, 0, seq_length - 1)
+                 x_t = x[t_indices, np.arange(batch_size)]
              gates = x_t @ w_dir.T + h_prev @ r_dir.T + bias
              if spec.clip is not None and spec.clip > 0:
                  gates = np.clip(gates, -spec.clip, spec.clip)
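
The reverse-direction LSTM fix above gathers a per-batch time step from sequence_lens instead of using one shared index, so padded batch entries start from their own last valid step. Illustrative shapes (all values made up):

    import numpy as np

    seq_length, batch_size, input_size = 4, 2, 3
    x = np.arange(seq_length * batch_size * input_size).reshape(
        seq_length, batch_size, input_size
    )
    sequence_lens = np.array([4, 2])    # batch 1 is padded after step 2
    step = 0
    t_indices = np.clip(sequence_lens - 1 - step, 0, seq_length - 1)  # [3 1]
    x_t = x[t_indices, np.arange(batch_size)]  # batch 0 reads t=3, batch 1 reads t=1
    print(x_t.shape)  # (2, 3)
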