onnx 1.19.0__cp313-cp313t-win_amd64.whl → 1.19.1rc1__cp313-cp313t-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (202)
  1. onnx/__init__.py +98 -0
  2. onnx/backend/test/case/node/__init__.py +20 -3
  3. onnx/backend/test/case/node/attention.py +62 -0
  4. onnx/backend/test/case/node/rotaryembedding.py +6 -6
  5. onnx/backend/test/data/node/test_attention_3d/model.onnx +0 -0
  6. onnx/backend/test/data/node/test_attention_3d_attn_mask/model.onnx +0 -0
  7. onnx/backend/test/data/node/test_attention_3d_attn_mask_expanded/model.onnx +0 -0
  8. onnx/backend/test/data/node/test_attention_3d_causal/model.onnx +0 -0
  9. onnx/backend/test/data/node/test_attention_3d_causal_expanded/model.onnx +0 -0
  10. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes/model.onnx +0 -0
  11. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_attn_mask/model.onnx +0 -0
  12. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_attn_mask_expanded/model.onnx +0 -0
  13. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_causal/model.onnx +0 -0
  14. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_causal_expanded/model.onnx +0 -0
  15. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_expanded/model.onnx +0 -0
  16. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_scaled/model.onnx +0 -0
  17. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_scaled_expanded/model.onnx +0 -0
  18. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_softcap/model.onnx +0 -0
  19. onnx/backend/test/data/node/test_attention_3d_diff_heads_sizes_softcap_expanded/model.onnx +0 -0
  20. onnx/backend/test/data/node/test_attention_3d_diff_heads_with_past_and_present/model.onnx +0 -0
  21. onnx/backend/test/data/node/test_attention_3d_diff_heads_with_past_and_present_expanded/model.onnx +0 -0
  22. onnx/backend/test/data/node/test_attention_3d_expanded/model.onnx +0 -0
  23. onnx/backend/test/data/node/test_attention_3d_gqa/model.onnx +0 -0
  24. onnx/backend/test/data/node/test_attention_3d_gqa/test_data_set_0/output_0.pb +0 -0
  25. onnx/backend/test/data/node/test_attention_3d_gqa_attn_mask/model.onnx +0 -0
  26. onnx/backend/test/data/node/test_attention_3d_gqa_attn_mask/test_data_set_0/output_0.pb +0 -0
  27. onnx/backend/test/data/node/test_attention_3d_gqa_attn_mask_expanded/model.onnx +0 -0
  28. onnx/backend/test/data/node/test_attention_3d_gqa_attn_mask_expanded/test_data_set_0/output_0.pb +0 -0
  29. onnx/backend/test/data/node/test_attention_3d_gqa_causal/model.onnx +0 -0
  30. onnx/backend/test/data/node/test_attention_3d_gqa_causal/test_data_set_0/output_0.pb +0 -0
  31. onnx/backend/test/data/node/test_attention_3d_gqa_causal_expanded/model.onnx +0 -0
  32. onnx/backend/test/data/node/test_attention_3d_gqa_causal_expanded/test_data_set_0/output_0.pb +0 -0
  33. onnx/backend/test/data/node/test_attention_3d_gqa_expanded/model.onnx +0 -0
  34. onnx/backend/test/data/node/test_attention_3d_gqa_expanded/test_data_set_0/output_0.pb +0 -0
  35. onnx/backend/test/data/node/test_attention_3d_gqa_scaled/model.onnx +0 -0
  36. onnx/backend/test/data/node/test_attention_3d_gqa_scaled/test_data_set_0/output_0.pb +0 -0
  37. onnx/backend/test/data/node/test_attention_3d_gqa_scaled_expanded/model.onnx +0 -0
  38. onnx/backend/test/data/node/test_attention_3d_gqa_scaled_expanded/test_data_set_0/output_0.pb +0 -0
  39. onnx/backend/test/data/node/test_attention_3d_gqa_softcap/model.onnx +0 -0
  40. onnx/backend/test/data/node/test_attention_3d_gqa_softcap/test_data_set_0/output_0.pb +0 -0
  41. onnx/backend/test/data/node/test_attention_3d_gqa_softcap_expanded/model.onnx +0 -0
  42. onnx/backend/test/data/node/test_attention_3d_gqa_softcap_expanded/test_data_set_0/output_0.pb +0 -0
  43. onnx/backend/test/data/node/test_attention_3d_gqa_with_past_and_present/model.onnx +0 -0
  44. onnx/backend/test/data/node/test_attention_3d_gqa_with_past_and_present/test_data_set_0/output_0.pb +0 -0
  45. onnx/backend/test/data/node/test_attention_3d_gqa_with_past_and_present_expanded/model.onnx +0 -0
  46. onnx/backend/test/data/node/test_attention_3d_gqa_with_past_and_present_expanded/test_data_set_0/output_0.pb +0 -0
  47. onnx/backend/test/data/node/test_attention_3d_scaled/model.onnx +0 -0
  48. onnx/backend/test/data/node/test_attention_3d_scaled_expanded/model.onnx +0 -0
  49. onnx/backend/test/data/node/test_attention_3d_softcap/model.onnx +0 -0
  50. onnx/backend/test/data/node/test_attention_3d_softcap_expanded/model.onnx +0 -0
  51. onnx/backend/test/data/node/test_attention_3d_transpose_verification/model.onnx +0 -0
  52. onnx/backend/test/data/node/test_attention_3d_transpose_verification_expanded/model.onnx +0 -0
  53. onnx/backend/test/data/node/test_attention_3d_with_past_and_present/model.onnx +0 -0
  54. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_expanded/model.onnx +0 -0
  55. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul/model.onnx +0 -0
  56. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_bias/model.onnx +0 -0
  57. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_bias_expanded/model.onnx +0 -0
  58. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_expanded/model.onnx +0 -0
  59. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_softcap/model.onnx +0 -0
  60. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_softcap_expanded/model.onnx +0 -0
  61. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_softmax/model.onnx +0 -0
  62. onnx/backend/test/data/node/test_attention_3d_with_past_and_present_qk_matmul_softmax_expanded/model.onnx +0 -0
  63. onnx/backend/test/data/node/test_attention_4d/model.onnx +0 -0
  64. onnx/backend/test/data/node/test_attention_4d_attn_mask/model.onnx +0 -0
  65. onnx/backend/test/data/node/test_attention_4d_attn_mask_3d/model.onnx +0 -0
  66. onnx/backend/test/data/node/test_attention_4d_attn_mask_3d_causal/model.onnx +0 -0
  67. onnx/backend/test/data/node/test_attention_4d_attn_mask_3d_causal_expanded/model.onnx +0 -0
  68. onnx/backend/test/data/node/test_attention_4d_attn_mask_3d_expanded/model.onnx +0 -0
  69. onnx/backend/test/data/node/test_attention_4d_attn_mask_4d/model.onnx +0 -0
  70. onnx/backend/test/data/node/test_attention_4d_attn_mask_4d_causal/model.onnx +0 -0
  71. onnx/backend/test/data/node/test_attention_4d_attn_mask_4d_causal_expanded/model.onnx +0 -0
  72. onnx/backend/test/data/node/test_attention_4d_attn_mask_4d_expanded/model.onnx +0 -0
  73. onnx/backend/test/data/node/test_attention_4d_attn_mask_bool/model.onnx +0 -0
  74. onnx/backend/test/data/node/test_attention_4d_attn_mask_bool_4d/model.onnx +0 -0
  75. onnx/backend/test/data/node/test_attention_4d_attn_mask_bool_4d_expanded/model.onnx +0 -0
  76. onnx/backend/test/data/node/test_attention_4d_attn_mask_bool_expanded/model.onnx +0 -0
  77. onnx/backend/test/data/node/test_attention_4d_attn_mask_expanded/model.onnx +0 -0
  78. onnx/backend/test/data/node/test_attention_4d_causal/model.onnx +0 -0
  79. onnx/backend/test/data/node/test_attention_4d_causal_expanded/model.onnx +0 -0
  80. onnx/backend/test/data/node/test_attention_4d_diff_heads_mask4d_padded_kv_expanded/model.onnx +0 -0
  81. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes/model.onnx +0 -0
  82. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_attn_mask/model.onnx +0 -0
  83. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_attn_mask_expanded/model.onnx +0 -0
  84. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_causal/model.onnx +0 -0
  85. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_causal_expanded/model.onnx +0 -0
  86. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_expanded/model.onnx +0 -0
  87. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_scaled/model.onnx +0 -0
  88. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_scaled_expanded/model.onnx +0 -0
  89. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_softcap/model.onnx +0 -0
  90. onnx/backend/test/data/node/test_attention_4d_diff_heads_sizes_softcap_expanded/model.onnx +0 -0
  91. onnx/backend/test/data/node/test_attention_4d_diff_heads_with_past_and_present/model.onnx +0 -0
  92. onnx/backend/test/data/node/test_attention_4d_diff_heads_with_past_and_present_expanded/model.onnx +0 -0
  93. onnx/backend/test/data/node/test_attention_4d_diff_heads_with_past_and_present_mask3d/model.onnx +0 -0
  94. onnx/backend/test/data/node/test_attention_4d_diff_heads_with_past_and_present_mask3d_expanded/model.onnx +0 -0
  95. onnx/backend/test/data/node/test_attention_4d_diff_heads_with_past_and_present_mask4d/model.onnx +0 -0
  96. onnx/backend/test/data/node/test_attention_4d_diff_heads_with_past_and_present_mask4d_expanded/model.onnx +0 -0
  97. onnx/backend/test/data/node/test_attention_4d_expanded/model.onnx +0 -0
  98. onnx/backend/test/data/node/test_attention_4d_fp16/model.onnx +0 -0
  99. onnx/backend/test/data/node/test_attention_4d_fp16_expanded/model.onnx +0 -0
  100. onnx/backend/test/data/node/test_attention_4d_gqa/model.onnx +0 -0
  101. onnx/backend/test/data/node/test_attention_4d_gqa/test_data_set_0/output_0.pb +0 -0
  102. onnx/backend/test/data/node/test_attention_4d_gqa_attn_mask/model.onnx +0 -0
  103. onnx/backend/test/data/node/test_attention_4d_gqa_attn_mask/test_data_set_0/output_0.pb +0 -0
  104. onnx/backend/test/data/node/test_attention_4d_gqa_attn_mask_expanded/model.onnx +0 -0
  105. onnx/backend/test/data/node/test_attention_4d_gqa_attn_mask_expanded/test_data_set_0/output_0.pb +0 -0
  106. onnx/backend/test/data/node/test_attention_4d_gqa_causal/model.onnx +0 -0
  107. onnx/backend/test/data/node/test_attention_4d_gqa_causal/test_data_set_0/output_0.pb +0 -0
  108. onnx/backend/test/data/node/test_attention_4d_gqa_causal_expanded/model.onnx +0 -0
  109. onnx/backend/test/data/node/test_attention_4d_gqa_causal_expanded/test_data_set_0/output_0.pb +0 -0
  110. onnx/backend/test/data/node/test_attention_4d_gqa_expanded/model.onnx +0 -0
  111. onnx/backend/test/data/node/test_attention_4d_gqa_expanded/test_data_set_0/output_0.pb +0 -0
  112. onnx/backend/test/data/node/test_attention_4d_gqa_scaled/model.onnx +0 -0
  113. onnx/backend/test/data/node/test_attention_4d_gqa_scaled/test_data_set_0/output_0.pb +0 -0
  114. onnx/backend/test/data/node/test_attention_4d_gqa_scaled_expanded/model.onnx +0 -0
  115. onnx/backend/test/data/node/test_attention_4d_gqa_scaled_expanded/test_data_set_0/output_0.pb +0 -0
  116. onnx/backend/test/data/node/test_attention_4d_gqa_softcap/model.onnx +0 -0
  117. onnx/backend/test/data/node/test_attention_4d_gqa_softcap/test_data_set_0/output_0.pb +0 -0
  118. onnx/backend/test/data/node/test_attention_4d_gqa_softcap_expanded/model.onnx +0 -0
  119. onnx/backend/test/data/node/test_attention_4d_gqa_softcap_expanded/test_data_set_0/output_0.pb +0 -0
  120. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present/model.onnx +0 -0
  121. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present/test_data_set_0/output_0.pb +0 -0
  122. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present_expanded/model.onnx +0 -0
  123. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present_expanded/test_data_set_0/output_0.pb +0 -0
  124. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present_fp16/model.onnx +0 -0
  125. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present_fp16/test_data_set_0/output_0.pb +0 -0
  126. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present_fp16_expanded/model.onnx +0 -0
  127. onnx/backend/test/data/node/test_attention_4d_gqa_with_past_and_present_fp16_expanded/test_data_set_0/output_0.pb +0 -0
  128. onnx/backend/test/data/node/test_attention_4d_scaled/model.onnx +0 -0
  129. onnx/backend/test/data/node/test_attention_4d_scaled_expanded/model.onnx +0 -0
  130. onnx/backend/test/data/node/test_attention_4d_softcap/model.onnx +0 -0
  131. onnx/backend/test/data/node/test_attention_4d_softcap_expanded/model.onnx +0 -0
  132. onnx/backend/test/data/node/test_attention_4d_with_past_and_present/model.onnx +0 -0
  133. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_expanded/model.onnx +0 -0
  134. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul/model.onnx +0 -0
  135. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias/model.onnx +0 -0
  136. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask/model.onnx +0 -0
  137. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal/model.onnx +0 -0
  138. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal/test_data_set_0/output_0.pb +0 -0
  139. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal/test_data_set_0/output_3.pb +0 -0
  140. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded/model.onnx +0 -0
  141. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded/test_data_set_0/output_0.pb +0 -0
  142. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded/test_data_set_0/output_3.pb +0 -0
  143. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_expanded/model.onnx +0 -0
  144. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask/model.onnx +0 -0
  145. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal/model.onnx +0 -0
  146. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal/test_data_set_0/output_0.pb +0 -0
  147. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal/test_data_set_0/output_3.pb +0 -0
  148. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_expanded/model.onnx +0 -0
  149. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_expanded/test_data_set_0/output_0.pb +0 -0
  150. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_expanded/test_data_set_0/output_3.pb +0 -0
  151. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_expanded/model.onnx +0 -0
  152. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_bias_expanded/model.onnx +0 -0
  153. onnx/backend/test/data/node/test_attention_4d_with_past_and_present_qk_matmul_expanded/model.onnx +0 -0
  154. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul/model.onnx +0 -0
  155. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_bias/model.onnx +0 -0
  156. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_bias_expanded/model.onnx +0 -0
  157. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_expanded/model.onnx +0 -0
  158. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_softcap/model.onnx +0 -0
  159. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_softcap_expanded/model.onnx +0 -0
  160. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_softmax/model.onnx +0 -0
  161. onnx/backend/test/data/node/test_attention_4d_with_qk_matmul_softmax_expanded/model.onnx +0 -0
  162. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim/model.onnx +0 -0
  163. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim/test_data_set_0/input_1.pb +1 -1
  164. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim/test_data_set_0/input_2.pb +1 -1
  165. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim/test_data_set_0/output_0.pb +0 -0
  166. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim_expanded/model.onnx +0 -0
  167. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim_expanded/test_data_set_0/input_1.pb +1 -1
  168. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim_expanded/test_data_set_0/input_2.pb +1 -1
  169. onnx/backend/test/data/node/test_rotary_embedding_no_position_ids_rotary_dim_expanded/test_data_set_0/output_0.pb +0 -0
  170. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim/model.onnx +0 -0
  171. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim/test_data_set_0/input_1.pb +0 -0
  172. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim/test_data_set_0/input_2.pb +0 -0
  173. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim/test_data_set_0/output_0.pb +0 -0
  174. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim_expanded/model.onnx +0 -0
  175. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim_expanded/test_data_set_0/input_1.pb +0 -0
  176. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim_expanded/test_data_set_0/input_2.pb +0 -0
  177. onnx/backend/test/data/node/test_rotary_embedding_with_interleaved_rotary_dim_expanded/test_data_set_0/output_0.pb +0 -0
  178. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim/model.onnx +0 -0
  179. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim/test_data_set_0/input_1.pb +0 -0
  180. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim/test_data_set_0/input_2.pb +0 -0
  181. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim/test_data_set_0/output_0.pb +0 -0
  182. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim_expanded/model.onnx +0 -0
  183. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim_expanded/test_data_set_0/input_1.pb +0 -0
  184. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim_expanded/test_data_set_0/input_2.pb +0 -0
  185. onnx/backend/test/data/node/test_rotary_embedding_with_rotary_dim_expanded/test_data_set_0/output_0.pb +0 -0
  186. onnx/defs/nn/defs.cc +70 -228
  187. onnx/defs/nn/old.cc +31 -201
  188. onnx/defs/nn/utils.cc +222 -0
  189. onnx/defs/nn/utils.h +25 -0
  190. onnx/onnx_cpp2py_export.cp313t-win_amd64.pyd +0 -0
  191. onnx/reference/ops/op_attention.py +33 -14
  192. onnx/reference/ops/op_rotary_embedding.py +21 -19
  193. onnx/test/basic_test.py +84 -0
  194. onnx/test/reference_evaluator_test.py +23 -0
  195. onnx/test/test_backend_reference.py +2 -1
  196. onnx/version.py +2 -2
  197. {onnx-1.19.0.dist-info → onnx-1.19.1rc1.dist-info}/METADATA +2 -2
  198. {onnx-1.19.0.dist-info → onnx-1.19.1rc1.dist-info}/RECORD +202 -200
  199. {onnx-1.19.0.dist-info → onnx-1.19.1rc1.dist-info}/WHEEL +1 -1
  200. {onnx-1.19.0.dist-info → onnx-1.19.1rc1.dist-info}/entry_points.txt +0 -0
  201. {onnx-1.19.0.dist-info → onnx-1.19.1rc1.dist-info}/licenses/LICENSE +0 -0
  202. {onnx-1.19.0.dist-info → onnx-1.19.1rc1.dist-info}/top_level.txt +0 -0
onnx/defs/nn/old.cc CHANGED
@@ -5,6 +5,7 @@
 #include <cmath>
 
 #include "onnx/defs/function.h"
+#include "onnx/defs/nn/utils.h"
 #include "onnx/defs/schema.h"
 
 namespace ONNX_NAMESPACE {
@@ -4373,154 +4374,7 @@ ONNX_OPERATOR_SET_SCHEMA(
             "U",
             OpSchema::all_non_complex_numeric_types_plus_bool_ir4(),
             "Constrain output 'mask' types to boolean tensors and input types.")
-        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
-          propagateElemTypeFromInputToOutput(ctx, 0, 0);
-
-          int64_t kv_sequence_length = -1;
-          ONNX_NAMESPACE::TensorShapeProto output_shape;
-          ONNX_NAMESPACE::TensorShapeProto qk_matmul_shape;
-          if (hasInputShape(ctx, 0)) {
-            auto& query_shape = getInputShape(ctx, 0);
-            auto& query_dims = query_shape.dim();
-            if ((query_dims.size() != 3) && (query_dims.size() != 4)) {
-              fail_shape_inference("Inputs 0 (query) shall be 3 or 4 dimensions");
-            }
-
-            if (query_dims.size() == 3) {
-              auto* q_num_heads_attr = ctx.getAttribute("q_num_heads");
-              if (q_num_heads_attr == nullptr) {
-                fail_type_inference("3D inputs expected to have q_num_heads attribute.");
-              }
-              auto* kv_num_heads_attr = ctx.getAttribute("kv_num_heads");
-              if (kv_num_heads_attr == nullptr) {
-                fail_type_inference("3D inputs expected to have q_num_heads attribute.");
-              }
-            }
-
-            *output_shape.add_dim() = query_dims[0]; // batch_size
-            *output_shape.add_dim() = query_dims[1]; // num_heads for 4D, sequence_length for 3D
-
-            *qk_matmul_shape.add_dim() = query_dims[0]; // batch_size
-
-            if (hasInputShape(ctx, 1)) {
-              auto& key_shape = getInputShape(ctx, 1);
-              auto& key_dims = key_shape.dim();
-              if ((key_dims.size() != 3) && (key_dims.size() != 4)) {
-                fail_shape_inference("Inputs 1 (key) shall be 3 or 4 dimensions");
-              }
-            }
-
-            if (hasInputShape(ctx, 2)) {
-              auto& value_shape = getInputShape(ctx, 2);
-              auto& value_dims = value_shape.dim();
-              if ((value_dims.size() != 3) && (value_dims.size() != 4)) {
-                fail_shape_inference("Inputs 2 (value) shall be 3 or 4 dimensions");
-              }
-
-              // Update Output Shape for 4D inputs
-              // Input 0 (query) has shape (batch_size, q_num_heads, q_sequence_length, head_size)
-              // Input 1 (key) has shape (batch_size, kv_num_heads, kv_sequence_length, head_size)
-              // Input 2 (value) has shape (batch_size, kv_num_heads, kv_sequence_length, v_head_size)
-              // Output 0 has shape (batch_size, q_num_heads, q_sequence_length, v_head_size)
-              if (value_dims.size() == 4 && query_dims.size() == 4) {
-                kv_sequence_length = value_dims[2].dim_value();
-                *output_shape.add_dim() = query_dims[2]; // sequence_length
-                *output_shape.add_dim() = value_dims[3]; // head_size
-                updateOutputShape(ctx, 0, output_shape);
-                // Update qk_matmul_shape
-                *qk_matmul_shape.add_dim() = query_dims[1]; // q_num_heads
-                *qk_matmul_shape.add_dim() = query_dims[2]; // q_sequence_length
-                qk_matmul_shape.add_dim()->set_dim_value(kv_sequence_length);
-              }
-
-              // Update Output Shape for 3D inputs
-              // Input 0 (query) has shape (batch_size, q_sequence_length, q_hidden_size),
-              //   q_hidden_size = q_num_heads * head_size
-              // Input 1 (key) has shape (batch_size, kv_sequence_length, k_hidden_size),
-              //   k_hidden_size = kv_num_heads * head_size
-              // Input 2 (value) has shape (batch_size, kv_sequence_length, v_hidden_size),
-              //   v_hidden_size = kv_num_heads * v_head_size
-              // Output 0 has shape (batch_size, q_sequence_length, hidden_size),
-              //   hidden_size = q_num_heads * v_head_size
-              if (value_dims.size() == 3 && query_dims.size() == 3) {
-                kv_sequence_length = value_dims[1].dim_value();
-                auto* q_num_heads_attr = ctx.getAttribute("q_num_heads");
-                if (q_num_heads_attr == nullptr) {
-                  fail_type_inference("3D inputs expected to have q_num_heads attribute.");
-                }
-                auto* kv_num_heads_attr = ctx.getAttribute("kv_num_heads");
-                if (kv_num_heads_attr == nullptr) {
-                  fail_type_inference("3D inputs expected to have kv_num_heads attribute.");
-                }
-                int64_t q_num_heads = q_num_heads_attr->i();
-                int64_t kv_num_heads = kv_num_heads_attr->i();
-                // Calculate v_head_size
-                int64_t v_head_size = value_dims[2].dim_value() / kv_num_heads;
-                output_shape.add_dim()->set_dim_value(v_head_size * q_num_heads);
-                updateOutputShape(ctx, 0, output_shape);
-                // Update qk_matmul_shape
-                qk_matmul_shape.add_dim()->set_dim_value(q_num_heads);
-                *qk_matmul_shape.add_dim() = query_dims[1];
-                qk_matmul_shape.add_dim()->set_dim_value(kv_sequence_length);
-              }
-            }
-          }
-
-          if (ctx.hasOutput(3)) { // has qk_matmul_output
-            propagateElemTypeFromInputToOutput(ctx, 0, 3);
-            updateOutputShape(ctx, 3, qk_matmul_shape);
-          }
-
-          if (ctx.hasOutput(1) && ctx.hasOutput(2)) { // has present outputs
-            if (ctx.hasInput(4) && ctx.hasInput(5)) { // has past_key
-              // copy the type from query to present key and value
-              propagateElemTypeFromInputToOutput(ctx, 4, 1);
-              propagateElemTypeFromInputToOutput(ctx, 5, 2);
-
-              if (hasInputShape(ctx, 4) && hasInputShape(ctx, 5)) {
-                auto& past_key_shape = getInputShape(ctx, 4);
-                auto& past_key_dims = past_key_shape.dim();
-                auto& past_value_shape = getInputShape(ctx, 5);
-                auto& past_value_dims = past_value_shape.dim();
-
-                // past key has shape (batch_size, kv_num_heads, past_sequence_length, head_size)
-                if (past_key_dims.size() != 4) {
-                  fail_shape_inference("The past_key input shall be 4 dimensions");
-                }
-                // past value has shape (batch_size, kv_num_heads, past_sequence_length, v_head_size)
-                if (past_value_dims.size() != 4) {
-                  fail_shape_inference("The past_value input shall be 4 dimensions");
-                }
-
-                if (kv_sequence_length > 0 && past_key_dims[2].has_dim_value()) {
-                  int64_t total_sequence_length = kv_sequence_length + past_key_dims[2].dim_value();
-
-                  ONNX_NAMESPACE::TensorShapeProto present_key_shape;
-                  for (auto& dim : past_key_dims) {
-                    *present_key_shape.add_dim() = dim;
-                  }
-
-                  ONNX_NAMESPACE::TensorShapeProto present_value_shape;
-                  for (auto& dim : past_value_dims) {
-                    *present_value_shape.add_dim() = dim;
-                  }
-
-                  if (ctx.hasOutput(3)) { // has qk_matmul_output with bias
-                    qk_matmul_shape.mutable_dim(3)->set_dim_value(total_sequence_length);
-                    updateOutputShape(ctx, 3, qk_matmul_shape);
-                  }
-
-                  // shape of present key/value is (batch_size, kv_num_heads, total_sequence_length, head_size)
-                  present_key_shape.mutable_dim(2)->set_dim_value(total_sequence_length);
-                  present_value_shape.mutable_dim(2)->set_dim_value(total_sequence_length);
-
-                  updateOutputShape(ctx, 1, present_key_shape);
-                  updateOutputShape(ctx, 2, present_value_shape);
-                }
-              }
-            }
-          }
-        })
+        .TypeAndShapeInferenceFunction(defs::nn::utils::AttentionPropagateElemTypeFromInputToOutput)
         .SetContextDependentFunctionBodyBuilder([](const FunctionBodyBuildContext& ctx,
                                                    const OpSchema& schema,
                                                    FunctionProto& functionProto) {
@@ -4544,12 +4398,6 @@ ONNX_OPERATOR_SET_SCHEMA(
               (softmax_precision != ONNX_NAMESPACE::TensorProto_DataType_DOUBLE))
             return false; // Error
 
-          auto mkbooltensor = [](bool val) -> ONNX_NAMESPACE::TensorProto {
-            auto tp = ONNX_NAMESPACE::ToTensor(std::vector<bool>{val});
-            tp.add_dims(1);
-            return tp;
-          };
-
           // If shape is 3D, q_num_heads and kv_num_heads is provided,
           // for 4D cases, set num_heads to zero for reshape purposes
           auto* q_num_heads_attr = ctx.getAttribute("q_num_heads");
@@ -4561,15 +4409,16 @@ ONNX_OPERATOR_SET_SCHEMA(
           bool is_3d_input = (q_num_heads > 0 && kv_num_heads > 0);
 
           FunctionBuilder builder(functionProto);
+          builder
+              .Add("BatchSize = Shape <start = 0, end = 1> (Q)") // batch size
+              .Add("QSeqLen = Shape <start = -2, end = -1> (Q)") // q_sequence_length
+              .Add("KVSeqLen = Shape <start = -2, end = -1> (K)"); // kv_sequence_length
           if (is_3d_input) {
             // For 3D inputs: First reshape to [batch_size, seq_length, num_heads, head_size]
             // then transpose to [batch_size, num_heads, seq_length, head_size]
             builder
-                .Add("BatchSize = Shape <start = 0, end = 1> (Q)") // batch size
                 .Const1D("QNumHeadsAttr", q_num_heads) // q_num_heads from attrs
                 .Const1D("KVNumHeadsAttr", kv_num_heads) // kv_num_heads from attrs
-                .Add("QSeqLen = Shape <start = -2, end = -1> (Q)") // q_sequence_length
-                .Add("KVSeqLen = Shape <start = -2, end = -1> (K)") // kv_sequence_length
                 .Const1D("NegOne", static_cast<int64_t>(-1)); // head_size, inferred from other dimensions
 
             builder.Add("QIntermediateShape = Concat <axis = 0> (BatchSize, QSeqLen, QNumHeadsAttr, NegOne)")
@@ -4596,6 +4445,7 @@ ONNX_OPERATOR_SET_SCHEMA(
           builder
              .Add("QKHeadSize = Shape <start = 3, end = 4> (QReshaped)") // head_size for Q and K
              .Add("QKHeadSizeF = Cast (QKHeadSize)", "to", float_type)
+             .Add("VHeadSize = Shape <start = 3, end = 4> (VReshaped)") // head_size for V
              .Add("SqrtHeadSize = Sqrt(QKHeadSizeF)")
              .Const1D("One1D", static_cast<int64_t>(1))
              .Const1D("One1DF", static_cast<float>(1))
@@ -4610,8 +4460,10 @@ ONNX_OPERATOR_SET_SCHEMA(
 
           if (ctx.hasInput(4)) {
             builder.Add("PresentKey = Concat <axis = 2> (past_key, KReshaped)");
+            builder.Add("PastKVSeqLen = Shape <start = -2, end = -1> (past_key)");
           } else {
             builder.Add("PresentKey = Identity (KReshaped)");
+            builder.Const1D("PastKVSeqLen", static_cast<int64_t>(0));
           }
           if (ctx.hasOutput(1)) {
             builder.Add("present_key = Identity (PresentKey)");
@@ -4626,46 +4478,9 @@ ONNX_OPERATOR_SET_SCHEMA(
             builder.Add("present_value = Identity (PresentValue)");
           }
 
-          // If attn_mask is provided
-          float neg_inf = -std::numeric_limits<float>::infinity();
-          if (ctx.hasInput(3)) {
-            auto* up = ctx.getInputType(3);
-            if ((up == nullptr) || (!up->has_tensor_type()))
-              return false;
-            int64_t U = up->tensor_type().elem_type();
-            builder.Const1D("FloatInf", neg_inf);
-            builder.Const1D("ScalarZero", 0.f);
-            builder.Add(
-                U == ONNX_NAMESPACE::TensorProto_DataType_BOOL ? "AttnBias = Where(attn_mask, ScalarZero, FloatInf)"
-                                                               : "AttnBias = Identity(attn_mask)");
-          } else {
-            // If is_causal set to true, the attention masking is a lower triangular matrix when the mask
-            // is a square matrix. The attention masking has the form of the upper left causal bias due to
-            // the alignment when the mask is a non-square matrix.
-            // An error is thrown if both attn_mask and is_causal are set.
-            auto* is_causal_attr = ctx.getAttribute("is_causal");
-            int64_t is_causal = (is_causal_attr != nullptr) ? is_causal_attr->i() : 0;
-            if (!is_3d_input) {
-              builder.Add("QSeqLen = Shape <start = -2, end = -1> (Q)"); // q_sequence_length
-            }
-
-            if (is_causal == 1) {
-              builder.Const1D("FloatInf", neg_inf);
-              builder.Const1D("ScalarZero", 0.f);
-              builder.Add("NewKVSeqLen = Shape <start = -2, end = -1> (PresentKey)")
-                  .Add("AttnBiasShape = Concat <axis = -1> (QSeqLen, NewKVSeqLen)")
-                  .Add("AttnBiasZeros_ = ConstantOfShape(AttnBiasShape)")
-                  .Add("AttnBiasZeros = CastLike(AttnBiasZeros_, Q)")
-                  .Add("BoolMask = ConstantOfShape(AttnBiasShape)", "value", mkbooltensor(1))
-                  .Add("BoolMaskTri = Trilu <upper = 0> (BoolMask, Zero1D)")
-                  .Add("AttnBias = Where(BoolMaskTri, ScalarZero, FloatInf)");
-            } else {
-              builder.Add("NewKVSeqLen = Shape <start = -2, end = -1> (PresentKey)")
-                  .Add("AttnBiasShape = Concat <axis = -1> (QSeqLen, NewKVSeqLen)")
-                  .Add("AttnBias = ConstantOfShape(AttnBiasShape)");
-            }
-          }
-          builder.Add("AttnBiasT = Cast (AttnBias)", "to", T1);
+          if (!defs::nn::utils::AttentionAppendFunctionCausalMask(ctx, builder, false))
+            return false;
+          builder.Add("AttnBiasT = Cast (AttnBiasCausalOrNot)", "to", T1);
 
           // Group Query Attention is applied if the following are satisfied
           // 1) q_num_heads != kv_num_heads
@@ -4678,10 +4493,25 @@ ONNX_OPERATOR_SET_SCHEMA(
               .Add("RemainderNumHeads = Mod(QNumHeads, KVNumHeads)")
               .Add("GQACond2 = Equal(RemainderNumHeads, Zero1D)")
               .Add("GQACond = And(GQACond1, GQACond2)")
-              .Add("InterleaveDim = Where(GQACond, IDivNumHeads, One1D)")
-              .Add("InterleaveShape = Concat <axis = 0> (One1D, InterleaveDim, One1D, One1D)")
-              .Add("KAttentionInput = Tile(PresentKey, InterleaveShape)")
-              .Add("VAttentionInput = Tile(PresentValue, InterleaveShape)");
+              .Add("InterleaveDim = Where(GQACond, IDivNumHeads, One1D)");
+
+          // repeat kv (repeat_interleave)
+          builder.Const1D("Two1D", static_cast<int64_t>(2))
+              .Add("KUnsqueezed = Unsqueeze(PresentKey, Two1D)") // [B, Hk, 1, T, Dk]
+              .Add("VUnsqueezed = Unsqueeze(PresentValue, Two1D)"); // [B, Hk, 1, T, Dv]
+
+          // Build expand shape: [B, Hk, repeats, T, Dk]
+          builder
+              .Add("KExpandShape = Concat <axis = 0> (BatchSize, KVNumHeads, InterleaveDim, NewKVSeqLen, QKHeadSize)")
+              .Add("KExpanded = Expand(KUnsqueezed, KExpandShape)");
+          builder.Add("VExpandShape = Concat <axis = 0> (BatchSize, KVNumHeads, InterleaveDim, NewKVSeqLen, VHeadSize)")
+              .Add("VExpanded = Expand(VUnsqueezed, VExpandShape)");
+
+          // Reshape to [B, Hq, T, Dk] where Hq = Hk * repeats
+          builder.Add("KAttentionShape = Concat <axis = 0> (BatchSize, QNumHeads, NewKVSeqLen, QKHeadSize)")
+              .Add("VAttentionShape = Concat <axis = 0> (BatchSize, QNumHeads, NewKVSeqLen, VHeadSize)")
+              .Add("KAttentionInput = Reshape(KExpanded, KAttentionShape)")
+              .Add("VAttentionInput = Reshape(VExpanded, VAttentionShape)");
 
           // The following pattern is applied
           //    Q    K    V
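
The GQA expansion rewrite above is the functional core of this hunk: Tile repeats the whole block of KV heads (head order [h0, h1, h0, h1, ...]), while grouped-query attention needs each KV head repeated in place ([h0, h0, h1, h1, ...]), i.e. repeat_interleave semantics. A minimal numpy sketch of the difference (illustrative only, not part of the diff; all names and sizes here are made up):

import numpy as np

B, Hk, T, D = 1, 2, 3, 4  # batch, kv heads, kv seq length, head size
repeats = 2               # q_num_heads // kv_num_heads
K = np.arange(B * Hk * T * D, dtype=np.float32).reshape(B, Hk, T, D)

# Old expansion (Tile): repeats the whole head block -> [h0, h1, h0, h1]
k_tiled = np.tile(K, (1, repeats, 1, 1))

# New expansion (Unsqueeze -> Expand -> Reshape): repeats each head in
# place -> [h0, h0, h1, h1], matching repeat_interleave
k_expanded = np.broadcast_to(
    K[:, :, None, :, :], (B, Hk, repeats, T, D)
).reshape(B, Hk * repeats, T, D)

assert np.array_equal(k_expanded, np.repeat(K, repeats, axis=1))
assert not np.array_equal(k_tiled, k_expanded)  # different head ordering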
onnx/defs/nn/utils.cc ADDED
@@ -0,0 +1,222 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "onnx/defs/nn/utils.h"
+
+#include <algorithm>
+
+namespace ONNX_NAMESPACE {
+namespace defs {
+namespace nn {
+namespace utils {
+
+void AttentionPropagateElemTypeFromInputToOutput(InferenceContext& ctx) {
+  propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+  int64_t kv_sequence_length = -1;
+  ONNX_NAMESPACE::TensorShapeProto output_shape;
+  ONNX_NAMESPACE::TensorShapeProto qk_matmul_shape;
+  if (hasInputShape(ctx, 0)) {
+    auto& query_shape = getInputShape(ctx, 0);
+    auto& query_dims = query_shape.dim();
+    if ((query_dims.size() != 3) && (query_dims.size() != 4)) {
+      fail_shape_inference("Inputs 0 (query) shall be 3 or 4 dimensions");
+    }
+
+    if (query_dims.size() == 3) {
+      auto* q_num_heads_attr = ctx.getAttribute("q_num_heads");
+      if (q_num_heads_attr == nullptr) {
+        fail_type_inference("3D inputs expected to have q_num_heads attribute.");
+      }
+      auto* kv_num_heads_attr = ctx.getAttribute("kv_num_heads");
+      if (kv_num_heads_attr == nullptr) {
+        fail_type_inference("3D inputs expected to have q_num_heads attribute.");
+      }
+    }
+
+    *output_shape.add_dim() = query_dims[0]; // batch_size
+    *output_shape.add_dim() = query_dims[1]; // num_heads for 4D, sequence_length for 3D
+
+    *qk_matmul_shape.add_dim() = query_dims[0]; // batch_size
+
+    if (hasInputShape(ctx, 1)) {
+      auto& key_shape = getInputShape(ctx, 1);
+      auto& key_dims = key_shape.dim();
+      if ((key_dims.size() != 3) && (key_dims.size() != 4)) {
+        fail_shape_inference("Inputs 1 (key) shall be 3 or 4 dimensions");
+      }
+    }
+
+    if (hasInputShape(ctx, 2)) {
+      auto& value_shape = getInputShape(ctx, 2);
+      auto& value_dims = value_shape.dim();
+      if ((value_dims.size() != 3) && (value_dims.size() != 4)) {
+        fail_shape_inference("Inputs 2 (value) shall be 3 or 4 dimensions");
+      }
+
+      // Update Output Shape for 4D inputs
+      // Input 0 (query) has shape (batch_size, q_num_heads, q_sequence_length, head_size)
+      // Input 1 (key) has shape (batch_size, kv_num_heads, kv_sequence_length, head_size)
+      // Input 2 (value) has shape (batch_size, kv_num_heads, kv_sequence_length, v_head_size)
+      // Output 0 has shape (batch_size, q_num_heads, q_sequence_length, v_head_size)
+      if (value_dims.size() == 4 && query_dims.size() == 4) {
+        kv_sequence_length = value_dims[2].dim_value();
+        *output_shape.add_dim() = query_dims[2]; // sequence_length
+        *output_shape.add_dim() = value_dims[3]; // head_size
+        updateOutputShape(ctx, 0, output_shape);
+        // Update qk_matmul_shape
+        *qk_matmul_shape.add_dim() = query_dims[1]; // q_num_heads
+        *qk_matmul_shape.add_dim() = query_dims[2]; // q_sequence_length
+        qk_matmul_shape.add_dim()->set_dim_value(kv_sequence_length);
+      }
+
+      // Update Output Shape for 3D inputs
+      // Input 0 (query) has shape (batch_size, q_sequence_length, q_hidden_size),
+      //   q_hidden_size = q_num_heads * head_size
+      // Input 1 (key) has shape (batch_size, kv_sequence_length, k_hidden_size),
+      //   k_hidden_size = kv_num_heads * head_size
+      // Input 2 (value) has shape (batch_size, kv_sequence_length, v_hidden_size),
+      //   v_hidden_size = kv_num_heads * v_head_size
+      // Output 0 has shape (batch_size, q_sequence_length, hidden_size),
+      //   hidden_size = q_num_heads * v_head_size
+      if (value_dims.size() == 3 && query_dims.size() == 3) {
+        kv_sequence_length = value_dims[1].dim_value();
+        auto* q_num_heads_attr = ctx.getAttribute("q_num_heads");
+        if (q_num_heads_attr == nullptr) {
+          fail_type_inference("3D inputs expected to have q_num_heads attribute.");
+        }
+        auto* kv_num_heads_attr = ctx.getAttribute("kv_num_heads");
+        if (kv_num_heads_attr == nullptr) {
+          fail_type_inference("3D inputs expected to have kv_num_heads attribute.");
+        }
+        int64_t q_num_heads = q_num_heads_attr->i();
+        int64_t kv_num_heads = kv_num_heads_attr->i();
+        // Calculate v_head_size
+        int64_t v_head_size = value_dims[2].dim_value() / kv_num_heads;
+        output_shape.add_dim()->set_dim_value(v_head_size * q_num_heads);
+        updateOutputShape(ctx, 0, output_shape);
+        // Update qk_matmul_shape
+        qk_matmul_shape.add_dim()->set_dim_value(q_num_heads);
+        *qk_matmul_shape.add_dim() = query_dims[1];
+        qk_matmul_shape.add_dim()->set_dim_value(kv_sequence_length);
+      }
+    }
+  }
+
+  if (ctx.hasOutput(3)) { // has qk_matmul_output
+    propagateElemTypeFromInputToOutput(ctx, 0, 3);
+    updateOutputShape(ctx, 3, qk_matmul_shape);
+  }
+
+  if (ctx.hasOutput(1) && ctx.hasOutput(2)) { // has present outputs
+    if (ctx.hasInput(4) && ctx.hasInput(5)) { // has past_key
+      // copy the type from query to present key and value
+      propagateElemTypeFromInputToOutput(ctx, 4, 1);
+      propagateElemTypeFromInputToOutput(ctx, 5, 2);
+
+      if (hasInputShape(ctx, 4) && hasInputShape(ctx, 5)) {
+        auto& past_key_shape = getInputShape(ctx, 4);
+        auto& past_key_dims = past_key_shape.dim();
+        auto& past_value_shape = getInputShape(ctx, 5);
+        auto& past_value_dims = past_value_shape.dim();
+
+        // past key has shape (batch_size, kv_num_heads, past_sequence_length, head_size)
+        if (past_key_dims.size() != 4) {
+          fail_shape_inference("The past_key input shall be 4 dimensions");
+        }
+        // past value has shape (batch_size, kv_num_heads, past_sequence_length, v_head_size)
+        if (past_value_dims.size() != 4) {
+          fail_shape_inference("The past_value input shall be 4 dimensions");
+        }
+
+        if (kv_sequence_length > 0 && past_key_dims[2].has_dim_value()) {
+          int64_t total_sequence_length = kv_sequence_length + past_key_dims[2].dim_value();
+
+          ONNX_NAMESPACE::TensorShapeProto present_key_shape;
+          for (auto& dim : past_key_dims) {
+            *present_key_shape.add_dim() = dim;
+          }
+
+          ONNX_NAMESPACE::TensorShapeProto present_value_shape;
+          for (auto& dim : past_value_dims) {
+            *present_value_shape.add_dim() = dim;
+          }
+
+          if (ctx.hasOutput(3)) { // has qk_matmul_output with bias
+            qk_matmul_shape.mutable_dim(3)->set_dim_value(total_sequence_length);
+            updateOutputShape(ctx, 3, qk_matmul_shape);
+          }
+
+          // shape of present key/value is (batch_size, kv_num_heads, total_sequence_length, head_size)
+          present_key_shape.mutable_dim(2)->set_dim_value(total_sequence_length);
+          present_value_shape.mutable_dim(2)->set_dim_value(total_sequence_length);
+
+          updateOutputShape(ctx, 1, present_key_shape);
+          updateOutputShape(ctx, 2, present_value_shape);
+        }
+      }
+    }
+  }
+}
+
+bool AttentionAppendFunctionCausalMask(const FunctionBodyBuildContext& ctx, FunctionBuilder& builder, bool padding) {
+  builder.Add("NewKVSeqLen = Shape <start = -2, end = -1> (PresentKey)")
+      .Add("AttnBiasShape = Concat <axis = 0> (QSeqLen, NewKVSeqLen)");
+  float neg_inf = -std::numeric_limits<float>::infinity();
+  builder.Const1D("FloatNegInf", neg_inf);
+  builder.Const1D("ScalarZero", 0.f);
+
+  // If attn_mask is provided
+  if (ctx.hasInput(3)) {
+    auto* up = ctx.getInputType(3);
+    if ((up == nullptr) || (!up->has_tensor_type()))
+      return false;
+    int64_t U = up->tensor_type().elem_type();
+    builder.Add(
+        U == ONNX_NAMESPACE::TensorProto_DataType_BOOL ? "AttnBiasShort = Where(attn_mask, ScalarZero, FloatNegInf)"
+                                                       : "AttnBiasShort = Identity(attn_mask)");
+    // If attn_mask has a shorter kv sequence length, we pad it to NewKVSeqLen with FloatNegInf
+    if (padding) {
+      builder.Add("MaskKVSeqLen = Shape <start = -1> (attn_mask)")
+          .Add("PaddingKVSeqLen = Sub(NewKVSeqLen, MaskKVSeqLen)")
+          .Add("Pads = Concat <axis = 0> (Zero1D, PaddingKVSeqLen)")
+          .Add("FloatNegInfCast = CastLike(FloatNegInf, AttnBiasShort)")
+          .Add("AttnBias = Pad(AttnBiasShort, Pads, FloatNegInfCast, NegOne1D)");
+    } else {
+      builder.Add("AttnBias = Identity(AttnBiasShort)");
+    }
+  } else {
+    builder.Add("AttnBias = ConstantOfShape(AttnBiasShape)");
+  }
+
+  // If is_causal set to true, the attention masking is a lower triangular matrix when the mask
+  // is a square matrix. The attention masking has the form of the upper left causal bias due to
+  // the alignment when the mask is a non-square matrix.
+  // An error is thrown if both attn_mask and is_causal are set.
+  auto* is_causal_attr = ctx.getAttribute("is_causal");
+  int64_t is_causal = (is_causal_attr != nullptr) ? is_causal_attr->i() : 0;
+  if (is_causal == 1) {
+    builder.Const1D("Zero", static_cast<int64_t>(0))
+        .Const1D("One", static_cast<int64_t>(1))
+        .Add("ZeroNoDim = Squeeze(Zero, Zero)")
+        .Add("OneNoDim = Squeeze(One, Zero)")
+        .Add("SequenceLength = Gather(AttnBiasShape, ZeroNoDim)")
+        .Add("TotalSequenceLength = Gather(AttnBiasShape, OneNoDim)")
+        .Add("RangeRow = Range(ZeroNoDim, SequenceLength, OneNoDim)")
+        .Add("RangeRow2D = Unsqueeze(RangeRow, One)")
+        .Add("RangeCol = Range(ZeroNoDim, TotalSequenceLength, OneNoDim)")
+        .Add("RangeCol2D = Unsqueeze(RangeCol, Zero)")
+        .Add("RangeRow2DPast = Add(RangeRow2D, PastKVSeqLen)")
+        .Add("BoolMaskTri = Less(RangeRow2DPast, RangeCol2D)")
+        .Add("MaskTri = Where(BoolMaskTri, FloatNegInf, ScalarZero)")
+        .Add("AttnBiasCausalOrNot = Add(AttnBias, MaskTri)");
+  } else {
+    builder.Add("AttnBiasCausalOrNot = Identity(AttnBias)");
+  }
+  return true;
+}
+
+} // namespace utils
+} // namespace nn
+} // namespace defs
+} // namespace ONNX_NAMESPACE
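
The Range/Less construction in AttentionAppendFunctionCausalMask replaces the old Trilu-based mask and is what lets the causal frontier account for cached positions: each query row i is offset by PastKVSeqLen before being compared against key column j. A numpy sketch of the bias it produces (illustrative only; the sizes are made up):

import numpy as np

q_seq_len, past_len, total_len = 3, 2, 5          # illustrative sizes
rows = np.arange(q_seq_len)[:, None] + past_len   # RangeRow2DPast
cols = np.arange(total_len)[None, :]              # RangeCol2D
bool_mask_tri = rows < cols                       # BoolMaskTri = Less(...)
mask_tri = np.where(bool_mask_tri, -np.inf, 0.0)  # MaskTri = Where(...)
print(mask_tri)
# [[  0.   0.   0. -inf -inf]
#  [  0.   0.   0.   0. -inf]
#  [  0.   0.   0.   0.   0.]]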
onnx/defs/nn/utils.h ADDED
@@ -0,0 +1,25 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "onnx/common/assertions.h"
+#include "onnx/defs/function.h"
+#include "onnx/defs/schema.h"
+
+namespace ONNX_NAMESPACE {
+namespace defs {
+namespace nn {
+namespace utils {
+
+/** Implements shape and type propagation for Attention (23-). */
+void AttentionPropagateElemTypeFromInputToOutput(InferenceContext& ctx);
+
+/** Implements CausalMask for Attention. */
+bool AttentionAppendFunctionCausalMask(const FunctionBodyBuildContext& ctx, FunctionBuilder& builder, bool padding);
+
+} // namespace utils
+} // namespace nn
+} // namespace defs
+} // namespace ONNX_NAMESPACE
onnx/reference/ops/op_attention.py CHANGED
@@ -24,6 +24,25 @@ def _softcap(X, softcap):
     return X
 
 
+def _apply_causal(mask, past_sequence_length):
+    """Applies a causal mask on the input `mask`:
+    ``mask[i, j] = -inf if past_sequence_length + i > j else 0``.
+    Because a softmax is applied on the mask, -inf becomes 0 and 0 becomes 1.
+    The modification is done inplace.
+    """
+    q_sequence_length, total_sequence_length = mask.shape[-2:]
+    triu = np.triu(
+        np.ones(
+            (q_sequence_length, total_sequence_length - past_sequence_length),
+            dtype=mask.dtype,
+        ),
+        k=1,
+    )
+    triu[triu == 1] = -np.inf
+    mask[..., :, past_sequence_length:] += triu
+    return mask
+
+
 def _compute_attention(
     Q: np.ndarray,
     K: np.ndarray,
@@ -114,18 +133,18 @@ def _compute_attention(
     # bias due to the alignment when the mask is a non-square matrix.
     if is_causal:
         if attn_mask is None:
-            temp_mask = np.ones((q_sequence_length, kv_sequence_length), dtype=bool)
-            temp_mask = np.tril(temp_mask, k=0)
-            temp_mask = np.logical_not(temp_mask)
-            attn_bias_ma = np.ma.array(attn_bias, mask=temp_mask)
-            attn_bias = attn_bias_ma.filled(fill_value=float("-inf"))
+            temp_mask = np.zeros((q_sequence_length, kv_sequence_length), dtype=Q.dtype)
+            attn_bias = _apply_causal(
+                temp_mask,
+                past_sequence_length=past_key.shape[2] if past_key is not None else 0,
+            )
         else:
             if attn_mask.dtype == np.bool_:
                 attn_mask = (1 - attn_mask).astype(Q.dtype) * (-np.inf)
-            temp_mask = np.ones((q_sequence_length, kv_sequence_length), dtype=Q.dtype)
-            temp_mask = 1 - np.tril(temp_mask, k=0)
-            temp_mask[temp_mask == 1] = -np.inf
-            attn_bias = attn_mask + temp_mask
+            attn_bias = _apply_causal(
+                attn_mask.copy(),
+                past_sequence_length=past_key.shape[2] if past_key is not None else 0,
+            )
     elif attn_mask is not None:
         if attn_mask.dtype == np.bool_:
             attn_mask = (1 - attn_mask).astype(Q.dtype)
@@ -158,10 +177,10 @@
         and (q_num_heads % k_num_heads == 0)
         and (k_num_heads == v_num_heads)
     ):
-        seq_reps = int(q_num_heads / k_num_heads)
-        reps = [1, seq_reps, 1, 1]
-        K = np.tile(K, reps)
-        V = np.tile(V, reps)
+        seq_reps = q_num_heads // k_num_heads
+        # Interleave-repeat each KV head: [h0, h0, h1, h1, ...]
+        K = np.repeat(K, repeats=seq_reps, axis=1)
+        V = np.repeat(V, repeats=seq_reps, axis=1)
 
     # The following pattern is applied
     #    Q    K    V
@@ -185,7 +204,7 @@
     qk_matmul_output = np.matmul(Q * scale, k_transpose * scale)
     qk_with_bias = qk_matmul_output + attn_bias
     if qk_matmul_output_mode == 1:
-        qk_matmul_output = qk_matmul_output + attn_bias
+        qk_matmul_output = qk_with_bias.copy()
 
     # Apply softcap
     if softcap is not None:
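
A short usage sketch of the new _apply_causal helper (assuming the definition from the hunk above is in scope; the sizes are made up): with three cached KV positions, query row i only masks key positions beyond past_sequence_length + i, so the cache stays fully visible.

import numpy as np

# assumes _apply_causal from the hunk above is in scope
mask = np.zeros((2, 5), dtype=np.float32)    # (q_sequence_length, total_sequence_length)
_apply_causal(mask, past_sequence_length=3)  # three cached KV positions
print(mask)
# [[  0.   0.   0.   0. -inf]
#  [  0.   0.   0.   0.   0.]]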