PyPI - onnx - Versions diffs - 1.15.0__cp311-cp311-win_amd64.whl → 1.16.1__cp311-cp311-win_amd64.whl - Mend

onnx 1.15.0__cp311-cp311-win_amd64.whl → 1.16.1__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of onnx might be problematic. Click here for more details.

Files changed (584)

onnx/defs/math/old.cc CHANGED

@@ -2631,10 +2631,10 @@ ONNX_OPERATOR_SET_SCHEMA(
 static const char* TopK_ver1_doc = R"DOC(
 Retrieve the top-K elements along a specified axis. Given an input tensor of
-shape [a_1, a_2, ..., a_n, r] and integer argument k, return two outputs:
-  -Value tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n]
+shape [a_0, a_1, ..., a_{n-1}] and integer argument k, return two outputs:
+  -Value tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}]
     which contains the values of the top k elements along the specified axis
-  -Index tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which
+  -Index tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which
    contains the indices of the top k elements (original indices from the input
    tensor).
 Given two equivalent values, this operator uses the indices along the axis  as
@@ -2646,17 +2646,17 @@ ONNX_OPERATOR_SET_SCHEMA(
     1,
     OpSchema()
         .SetDoc(TopK_ver1_doc)
-        .Input(0, "X", "Tensor of shape [a_1, a_2, ..., a_n, r]", "T")
+        .Input(0, "X", "Tensor of shape [a_0, a_1, ..., a_{n-1}]", "T")
         .Output(
             0,
             "Values",
-            "Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] "
+            "Tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] "
             "containing top K values from the input tensor",
             "T")
         .Output(
             1,
             "Indices",
-            "Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] "
+            "Tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] "
             "containing the corresponding input tensor indices for the top K "
             "values.",
             "I")
@@ -2697,10 +2697,10 @@ ONNX_OPERATOR_SET_SCHEMA(
 static const char* TopK_ver10_doc = R"DOC(
 Retrieve the top-K elements along a specified axis. Given an input tensor of
-shape [a_1, a_2, ..., a_n, r] and integer argument k, return two outputs:
-  -Value tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n]
+shape [a_0, a_1, ..., a_{n-1}] and integer argument k, return two outputs:
+  -Value tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}]
     which contains the values of the top k elements along the specified axis
-  -Index tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which
+  -Index tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which
    contains the indices of the top k elements (original indices from the input
    tensor).
@@ -2713,7 +2713,7 @@ ONNX_OPERATOR_SET_SCHEMA(
     10,
     OpSchema()
         .SetDoc(TopK_ver10_doc)
-        .Input(0, "X", "Tensor of shape [a_1, a_2, ..., a_n, r]", "T")
+        .Input(0, "X", "Tensor of shape [a_0, a_1, ..., a_{n-1}]", "T")
         .Input(
             1,
             "K",
@@ -2722,13 +2722,13 @@ ONNX_OPERATOR_SET_SCHEMA(
         .Output(
             0,
             "Values",
-            "Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] "
+            "Tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] "
             "containing top K values from the input tensor",
             "T")
         .Output(
             1,
             "Indices",
-            "Tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] "
+            "Tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] "
             "containing the corresponding input tensor indices for the top K "
             "values.",
             "I")
@@ -3107,4 +3107,88 @@ ONNX_OPERATOR_SET_SCHEMA(
           updateOutputShape(ctx, 0, result_shape_proto);
         }));
+ONNX_OPERATOR_SET_SCHEMA(
+    QLinearMatMul,
+    10,
+    OpSchema()
+        .SetDoc(defs::math::utils::QLinearMatMulDoc())
+        .Input(0, "a", "N-dimensional quantized matrix a", "T1", OpSchema::Single, true, 1, OpSchema::NonDifferentiable)
+        .Input(
+            1,
+            "a_scale",
+            "scale of quantized input a",
+            "tensor(float)",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .Input(
+            2,
+            "a_zero_point",
+            "zero point of quantized input a",
+            "T1",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .Input(3, "b", "N-dimensional quantized matrix b", "T2", OpSchema::Single, true, 1, OpSchema::NonDifferentiable)
+        .Input(
+            4,
+            "b_scale",
+            "scale of quantized input b",
+            "tensor(float)",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .Input(
+            5,
+            "b_zero_point",
+            "zero point of quantized input b",
+            "T2",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .Input(
+            6,
+            "y_scale",
+            "scale of quantized output y",
+            "tensor(float)",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .Input(
+            7,
+            "y_zero_point",
+            "zero point of quantized output y",
+            "T3",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .Output(
+            0,
+            "y",
+            "Quantized matrix multiply results from a * b",
+            "T3",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::NonDifferentiable)
+        .TypeConstraint(
+            "T1",
+            {"tensor(int8)", "tensor(uint8)"},
+            "Constrain input a and its zero point data type to 8-bit integer tensor.")
+        .TypeConstraint(
+            "T2",
+            {"tensor(int8)", "tensor(uint8)"},
+            "Constrain input b and its zero point data type to 8-bit integer tensor.")
+        .TypeConstraint(
+            "T3",
+            {"tensor(int8)", "tensor(uint8)"},
+            "Constrain output y and its zero point data type to 8-bit integer tensor.")
+        .TypeAndShapeInferenceFunction(defs::math::utils::QLinearMatMulShapeInference));
 } // namespace ONNX_NAMESPACE

onnx/defs/math/utils.cc ADDED

@@ -0,0 +1,127 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "onnx/defs/math/utils.h"
+#include <string>
+namespace ONNX_NAMESPACE {
+namespace defs {
+namespace math {
+namespace utils {
+void MatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx) {
+  if (!hasInputShape(ctx, input1Idx) || !hasInputShape(ctx, input2Idx)) {
+    return;
+  }
+  const auto shape0 = ctx.getInputType(input1Idx)->tensor_type().shape();
+  const auto shape1 = ctx.getInputType(input2Idx)->tensor_type().shape();
+  if (shape0.dim_size() == 0 || shape1.dim_size() == 0) {
+    fail_shape_inference("Input tensors of wrong rank (0).");
+  }
+  ONNX_NAMESPACE::TensorShapeProto shapeL, shapeR;
+  // First promote each shape to at least rank-2. This logic is
+  // specific to matmul, not generic broadcasting.
+  {
+    if (shape0.dim_size() == 1) {
+      shapeL.add_dim()->set_dim_value(1);
+      *shapeL.add_dim() = shape0.dim(0);
+    } else {
+      *shapeL.mutable_dim() = shape0.dim();
+    }
+    if (shape1.dim_size() == 1) {
+      *shapeR.add_dim() = shape1.dim(0);
+      shapeR.add_dim()->set_dim_value(1);
+    } else {
+      *shapeR.mutable_dim() = shape1.dim();
+    }
+  }
+  // Check for compatible matrix multiply dimensions
+  {
+    auto dimL = shapeL.dim(shapeL.dim_size() - 1);
+    auto dimR = shapeR.dim(shapeR.dim_size() - 2);
+    if (dimL.has_dim_value() && dimR.has_dim_value() && dimL.dim_value() != dimR.dim_value()) {
+      fail_shape_inference("Incompatible dimensions for matrix multiplication");
+    }
+  }
+  ONNX_NAMESPACE::TensorShapeProto resultShape;
+  // Now call out to generic multidimensional broadcasting for
+  // the broadcastable prefixes.
+  {
+    ONNX_NAMESPACE::TensorShapeProto prefixShapeL, prefixShapeR;
+    for (int i = 0; i < shapeL.dim_size() - 2; ++i) {
+      *prefixShapeL.add_dim() = shapeL.dim(i);
+    }
+    for (int i = 0; i < shapeR.dim_size() - 2; ++i) {
+      *prefixShapeR.add_dim() = shapeR.dim(i);
+    }
+    bidirectionalBroadcastShapeInference(prefixShapeL, prefixShapeR, resultShape);
+  }
+  // Back to matmul-specific. Add the trailing dimensions back in.
+  {
+    if (shape0.dim_size() != 1) {
+      *resultShape.add_dim() = shapeL.dim(shapeL.dim_size() - 2);
+    }
+    if (shape1.dim_size() != 1) {
+      *resultShape.add_dim() = shapeR.dim(shapeR.dim_size() - 1);
+    }
+  }
+  *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape() = resultShape;
+}
+void QLinearMatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) {
+  auto a_type = ctx.getInputType(0);
+  auto b_type = ctx.getInputType(3);
+  if (nullptr == a_type || nullptr == b_type || a_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
+      b_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+    fail_type_inference("inputs are expected to have tensor type.");
+  }
+  auto a_zero_point_type = ctx.getInputType(2);
+  if (nullptr == a_zero_point_type ||
+      a_zero_point_type->tensor_type().elem_type() != a_type->tensor_type().elem_type()) {
+    fail_type_inference("input and zero_point pair is expected to have be same type.");
+  }
+  auto b_zero_point_type = ctx.getInputType(5);
+  if (nullptr == b_zero_point_type ||
+      b_zero_point_type->tensor_type().elem_type() != b_type->tensor_type().elem_type()) {
+    fail_type_inference("input and zero_point pair is expected to have same type.");
+  }
+  propagateElemTypeFromInputToOutput(ctx, 7, 0);
+  MatMulShapeInference(ctx, 0, 3);
+}
+const char* QLinearMatMulDoc() {
+  static const char* QLinearMatMul_doc = R"DOC(
+Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+It consumes two quantized input tensors, their scales and zero points, scale and zero point of output,
+and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point).
+For (x / y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details.
+Scale and zero point must have same shape. They must be either scalar (per tensor) or N-D tensor
+(per row for 'a' and per column for 'b'). Scalar refers to per tensor quantization whereas N-D refers to per row
+or per column quantization. If the input is 2D of shape [M, K] then zero point and scale tensor may be
+an M element vector [v_1, v_2, ..., v_M] for per row quantization and K element vector of shape [v_1, v_2, ..., v_K]
+for per column quantization. If the input is N-D tensor with shape [D1, D2, M, K] then zero point and scale tensor may
+have shape [D1, D2, M, 1] for per row quantization and shape [D1, D2, 1, K] for per column quantization.
+Production must never overflow, and accumulation may overflow if and only if in 32 bits.
+)DOC";
+  return QLinearMatMul_doc;
+}
+} // namespace utils
+} // namespace math
+} // namespace defs
+} // namespace ONNX_NAMESPACE

onnx/defs/math/utils.h CHANGED

@@ -12,6 +12,7 @@ namespace ONNX_NAMESPACE {
 namespace defs {
 namespace math {
 namespace utils {
 template <typename T>
 T GetScalarValueFromTensor(const ONNX_NAMESPACE::TensorProto* t) {
   if (t == nullptr) {
@@ -32,6 +33,13 @@ T GetScalarValueFromTensor(const ONNX_NAMESPACE::TensorProto* t) {
       fail_shape_inference("Unsupported input data type of ", data_type);
   }
 }
+void MatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx);
+void QLinearMatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx);
+const char* QLinearMatMulDoc();
 } // namespace utils
 } // namespace math
 } // namespace defs

onnx/defs/nn/defs.cc CHANGED

@@ -164,25 +164,23 @@ void convPoolShapeInference(
       continue;
     }
     // how big is the input, including padding
-    int64_t effective_input_size = input_shape.dim(2 + i).dim_value();
-    effective_input_size += pads[i];
-    effective_input_size += pads[i + kernel_shape_size];
+    int64_t input_size = input_shape.dim(2 + i).dim_value();
+    int64_t effective_input_size = input_size + pads[i] + pads[i + kernel_shape_size];
     // default is floor mode .i.e. ceil_mode is set to 0
     auto ceil_mode = getAttribute(ctx, "ceil_mode", 0);
-    // how many times we can move the kernel from it's initial position, based
-    // on the stride
-    int64_t strided_kernel_positions;
-    if (ceil_mode == 1)
-      strided_kernel_positions =
-          (int64_t)(std::ceil((effective_input_size - effective_kernel_shape[i]) / float(strides[i])));
-    else
-      strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) / strides[i];
+    int64_t output_size =
+        (effective_input_size - effective_kernel_shape[i] + (ceil_mode ? strides[i] - 1 : 0)) / strides[i] + 1;
+    if (ceil_mode == 1 && (output_size - 1) * strides[i] >= (input_size + pads[i])) {
+      // we need to match pytorch's behavior of "Sliding windows that would start in the right padded region are
+      // ignored." (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool1d.html#maxpool1d). this code follows the
+      // same logic as PyTorch's C++ implementation:
+      // https://github.com/pytorch/pytorch/blob/f1cdb39da3850c47d51ec6a5b1ae864c32b3accf/aten/src/ATen/native/Pool.h#L54C21-L54C21
+      --output_size;
+    }
-    // add in the initial position
-    newdim->set_dim_value(1 + strided_kernel_positions);
+    newdim->set_dim_value(output_size);
   }
   if (ctx.getNumOutputs() > 1) {
@@ -223,7 +221,7 @@ std::function<void(OpSchema&)> PoolOpSchemaGenerator(
  ```
  output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
  ```
- if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`.
+ if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored.
  `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled:
  ```
@@ -456,7 +454,7 @@ void maxUnpoolShapeInference(InferenceContext& ctx) {
   }
 }
-static const char* MaxUnpool_ver9_doc = R"DOC(
+static const char* MaxUnpool_ver11_doc = R"DOC(
 MaxUnpool essentially computes the partial inverse of the MaxPool op.
  The input information to this op is typically the output information from a MaxPool op. The first
  input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
@@ -481,7 +479,7 @@ ONNX_OPERATOR_SET_SCHEMA(
     MaxUnpool,
     11,
     OpSchema()
-        .SetDoc(MaxUnpool_ver9_doc)
+        .SetDoc(MaxUnpool_ver11_doc)
         .Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS)
         .Attr(
             "strides",
@@ -1657,8 +1655,8 @@ ONNX_OPERATOR_SET_SCHEMA(
             0.9f)
         .Attr(
             "training_mode",
-            "If set to true, it indicates BatchNormalization is being used for training, and outputs 1, "
-            "2, 3, and 4 would be populated.",
+            "If set to true, it indicates BatchNormalization is being used for training, and outputs 1 "
+            "and 2 are to be computed.",
             AttributeProto::INT,
             static_cast<int64_t>(0))
         .Input(
@@ -1997,7 +1995,7 @@ ONNX_OPERATOR_SET_SCHEMA(
         )ONNX",
             18));
-static const char* Flatten_ver13_doc = R"DOC(
+static const char* Flatten_ver11_doc = R"DOC(
 Flattens the input tensor into a 2D matrix. If input tensor has shape
 (d_0, d_1, ... d_n) then the output will have shape
 (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn).
@@ -2005,9 +2003,9 @@ Flattens the input tensor into a 2D matrix. If input tensor has shape
 ONNX_OPERATOR_SET_SCHEMA(
     Flatten,
-    13,
+    21,
     OpSchema()
-        .SetDoc(Flatten_ver13_doc)
+        .SetDoc(Flatten_ver11_doc)
         .Input(0, "input", "A tensor of rank >= axis.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
         .Output(
             0,
@@ -2021,7 +2019,10 @@ ONNX_OPERATOR_SET_SCHEMA(
             true,
             1,
             OpSchema::Differentiable)
-        .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output to all tensor types.")
+        .TypeConstraint(
+            "T",
+            OpSchema::all_tensor_types_ir10(),
+            "Constrain input and output to all tensor types up to IRv10.")
         .Attr(
             "axis",
             "Indicate up to which input dimensions "
@@ -2516,7 +2517,9 @@ static const char* LayerNormalization_ver17_doc = R"DOC(
       Let `d[i]` indicate the i-th dimension of `X`.
       If `X`'s shape is `[d[0], ..., d[axis-1], d[axis], ..., d[rank-1]]`,
       the shape of `Mean` and `InvStdDev` is `[d[0], ..., d[axis-1], 1, ..., 1]`.
-      `Y` and `X` have the same shape.
+      `Y` and `X` have the same shape. This operator supports unidirectional broadcasting
+      (tensors `Scale` and `B` should be unidirectional broadcastable to tensor `X`);
+      for more details please check [the doc](Broadcasting.md).
 )DOC";
 bool BuildContextDependentFunctionBodyLayerNormalization(
@@ -2705,7 +2708,7 @@ ONNX_OPERATOR_SET_SCHEMA(
           }
         }));
-static const char* GroupNormalization_ver18_doc = R"DOC(
+static const char* GroupNormalization_ver21_doc = R"DOC(
 A GroupNormalization function. Carries out group normalization as described in
 the paper https://arxiv.org/abs/1803.08494
@@ -2718,6 +2721,14 @@ where the mean and variance are computed per instance per group of channels, and
 groups `num_groups` should be divisible by the number of channels so that there are
 an equal number of channels per group.
+The overall computation has two stages: the first stage normalizes the elements to
+have zero mean and unit variance for each instance in each group, and the second
+stage scales and shifts the results of the first stage. The floating-point precision
+used in the first stage is determined by the `stash_type` attribute. For example,
+if `stash_type` is 1, the operator casts all input variables to 32-bit float,
+performs the computation, and finally casts the normalized results back to the
+original type of `X`. The second stage does not depend on `stash_type`.
 When the number of groups is the same as the number of channels, this operator is
 equivalent to InstanceNormalization. When there is only one group, this operator
 is equivalent to LayerNormalization.
@@ -2725,15 +2736,20 @@ is equivalent to LayerNormalization.
 ONNX_OPERATOR_SET_SCHEMA(
     GroupNormalization,
-    18,
+    21,
     OpSchema()
-        .SetDoc(GroupNormalization_ver18_doc)
+        .SetDoc(GroupNormalization_ver21_doc)
         .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
         .Attr(
             "num_groups",
             "The number of groups of channels. It should be a divisor of the number of channels `C`.",
             AttributeProto::INT,
             true)
+        .Attr(
+            "stash_type",
+            "The floating-point precision used in stage one of the computation.",
+            AttributeProto::INT,
+            static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT))
         .Input(
             0,
             "X",
@@ -2746,24 +2762,8 @@ ONNX_OPERATOR_SET_SCHEMA(
             true,
             1,
             OpSchema::Differentiable)
-        .Input(
-            1,
-            "scale",
-            "Scale tensor of shape `(num_groups)`.",
-            "T",
-            OpSchema::Single,
-            true,
-            1,
-            OpSchema::Differentiable)
-        .Input(
-            2,
-            "bias",
-            "Bias tensor of shape `(num_groups)`.",
-            "T",
-            OpSchema::Single,
-            true,
-            1,
-            OpSchema::Differentiable)
+        .Input(1, "scale", "Scale tensor of shape `(C)`.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
+        .Input(2, "bias", "Bias tensor of shape `(C)`.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
         .Output(
             0,
             "Y",
@@ -2773,17 +2773,14 @@ ONNX_OPERATOR_SET_SCHEMA(
             true,
             1,
             OpSchema::Differentiable)
-        .TypeConstraint(
-            "T",
-            {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
-            "Constrain input and output types to float tensors.")
+        .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.")
         .SetContextDependentFunctionBodyBuilder(
             [](const FunctionBodyBuildContext& ctx, const OpSchema& schema, FunctionProto& functionProto) {
               // GroupNormalization <epsilon, num_groups> (X, scale, bias) => (Y)
               auto* tp = ctx.getInputType(0);
               if ((tp == nullptr) || (!tp->has_tensor_type()))
                 return false;
-              int64_t T = tp->tensor_type().elem_type();
+              int64_t in_type = tp->tensor_type().elem_type();
               auto* epsilon_attr = ctx.getAttribute("epsilon");
               float epsilon = (epsilon_attr != nullptr) ? epsilon_attr->f() : 1e-5f;
@@ -2792,10 +2789,21 @@ ONNX_OPERATOR_SET_SCHEMA(
                 return false;
               int64_t num_groups = num_groups_attr->i();
+              auto stash_type_attr = ctx.getAttribute("stash_type");
+              int64_t stash_type = (stash_type_attr != nullptr)
+                  ? stash_type_attr->i()
+                  : static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+              if ((stash_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) &&
+                  (stash_type != ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) &&
+                  (stash_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) &&
+                  (stash_type != ONNX_NAMESPACE::TensorProto_DataType_DOUBLE))
+                return false; // Error
               FunctionBuilder builder(functionProto);
               builder.Const1D("FloatEpsilon", epsilon)
-                  .Add("Epsilon = Cast (FloatEpsilon)", "to", T)
-                  .Add("XShape = Shape (X)") // shape of input tensor: 1D tensor
+                  .Add("Epsilon = Cast (FloatEpsilon)", "to", stash_type)
+                  .Add("XU = Cast (X)", "to", stash_type)
+                  .Add("XShape = Shape (XU)") // shape of input tensor: 1D tensor
                   .Add("C = Shape <start = 1, end = 2> (X)")
                   .Const1D("NumGroups", num_groups)
                   .Add("GroupSize = Div (C, NumGroups)")
@@ -2804,11 +2812,11 @@ ONNX_OPERATOR_SET_SCHEMA(
                   // NewShape = [N, num_groups, group_size, H, W, (...)]
                   .Add("NewShape = Concat <axis = 0> (N, NumGroups, GroupSize, InstanceShape)")
-                  .Add("XReshaped = Reshape (X, NewShape)")
+                  .Add("XReshaped = Reshape (XU, NewShape)")
                   // Flatten into 3D tensor: [N, num_groups, group_size x H x W (x ...)]
                   .Add("Shape3D = Constant <value_ints = [0, 0, -1]> ()")
-                  .Add("X3D = Reshape(XReshaped, Shape3D)")
+                  .Add("X3D = Reshape (XReshaped, Shape3D)")
                   // Calculate statistics
                   .Const1D("Axes2", (int64_t)2)
@@ -2820,17 +2828,22 @@ ONNX_OPERATOR_SET_SCHEMA(
                   .Add("VarPlusEpsilon = Add (Var, Epsilon)")
                   .Add("StdDev = Sqrt (VarPlusEpsilon)")
                   .Add("Deviation = Sub (X3D, Mean)")
-                  .Add("Normalized = Div (Deviation, StdDev)")
+                  .Add("NormalizedU = Div (Deviation, StdDev)")
+                  // Reshape to [N, C, H x W (x ...)] and cast to original type
+                  .Add("NormalizedOriginalShape = Reshape (NormalizedU, XShape)")
+                  .Add("NormalizedNC = Reshape (NormalizedOriginalShape, Shape3D)")
+                  .Add("NormalizedT = Cast (NormalizedNC)", "to", in_type)
-                  // Reshape scale and bias for broadcasting
+                  // Reshape scale and bias to [1, C, 1] for broadcasting
                   .Add("ScaleShape = Constant <value_ints = [1, -1, 1]> ()")
-                  .Add("ScaleT = Cast (scale)", "to", T)
-                  .Add("BiasT = Cast (bias)", "to", T)
+                  .Add("ScaleT = Cast (scale)", "to", in_type)
+                  .Add("BiasT = Cast (bias)", "to", in_type)
                   .Add("ScaleReshaped = Reshape (ScaleT, ScaleShape)")
                   .Add("BiasReshaped = Reshape (BiasT, ScaleShape)")
                   // Calculate scaled and biased output
-                  .Add("Scaled = Mul (ScaleReshaped, Normalized)")
+                  .Add("Scaled = Mul (ScaleReshaped, NormalizedT)")
                   .Add("Biased = Add (Scaled, BiasReshaped)")
                   .Add("Y = Reshape (Biased, XShape)");