PyPI - tf-nightly-cpu - Versions diffs - 2.20.0.dev20250220__cp310-cp310-win_amd64.whl → 2.20.0.dev20250222__cp310-cp310-win_amd64.whl - Mend

tf-nightly-cpu 2.20.0.dev20250220__cp310-cp310-win_amd64.whl → 2.20.0.dev20250222__cp310-cp310-win_amd64.whl

Files changed (128) hide show

tensorflow/include/xla/backends/cpu/runtime/work_queue.h CHANGED Viewed

@@ -29,7 +29,6 @@ limitations under the License.
 #include "absl/base/attributes.h"
 #include "absl/base/optimization.h"
 #include "absl/container/fixed_array.h"
-#include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
@@ -44,15 +43,6 @@ namespace xla::cpu {
 // A work queue that partitions `num_tasks` tasks into `num_partitions`
 // partitions processed by parallel workers.
 class WorkQueue {
-  // Align all atomic counters to a cache line boundary to avoid false
-  // sharing between multiple worker threads.
-  static constexpr size_t kAtomicAlignment =
-#if defined(__cpp_lib_hardware_interference_size)
-      std::hardware_destructive_interference_size;
-#else
-      64;
-#endif
  public:
   WorkQueue(size_t num_tasks, size_t num_partitions);
@@ -60,13 +50,23 @@ class WorkQueue {
   // if the partition is complete.
   std::optional<size_t> Pop(size_t partition_index);
-  size_t num_partitions() const { return partitions_.size(); }
+  // Return the partition [begin, end) task range.
+  std::pair<size_t, size_t> partition_range(size_t partition_index) const;
-  bool empty() const { return empty_.load(std::memory_order_relaxed); }
+  size_t num_partitions() const { return partitions_.size(); }
  private:
   friend class Worker;
+  // Align all atomic counters to a cache line boundary to avoid false
+  // sharing between multiple worker threads.
+  static constexpr size_t kAtomicAlignment =
+#if defined(__cpp_lib_hardware_interference_size)
+      std::hardware_destructive_interference_size;
+#else
+      64;
+#endif
   struct Partition {
     void Initialize(size_t begin, size_t end);
@@ -76,8 +76,21 @@ class WorkQueue {
     size_t end;
   };
+  // An empty work queue flag to stop worker threads from looping through all
+  // partitions looking for work.
+  bool IsEmpty() const { return empty_.load(std::memory_order_relaxed); }
+  void SetEmpty() { empty_.store(true, std::memory_order_relaxed); }
+  // Notify that one of the workers switched to the work stealing mode.
+  void NotifyWorkStealingWorker();
+  // Decrements the number of work stealing workers by at most `max_workers` and
+  // returns the number of decremented work stealing workers.
+  size_t DecrementWorkStealingWorkers(size_t max_workers);
   absl::FixedArray<Partition, 32> partitions_;
   alignas(kAtomicAlignment) std::atomic<bool> empty_;
+  alignas(kAtomicAlignment) std::atomic<size_t> num_work_stealing_workers_;
 };
 // Worker processes tasks from the work queue starting from the assigned
@@ -130,10 +143,14 @@ inline void WorkQueue::Partition::Initialize(size_t begin, size_t end) {
 }
 inline WorkQueue::WorkQueue(size_t num_tasks, size_t num_partitions)
-    : partitions_(num_partitions), empty_(num_tasks == 0) {
-  size_t partition_size = tsl::MathUtil::CeilOfRatio(num_tasks, num_partitions);
-  for (size_t i = 0, begin = 0, end = partition_size; i < num_partitions;
-       ++i, begin = end, end = std::min(num_tasks, end + partition_size)) {
+    : partitions_(num_partitions),
+      empty_(num_tasks == 0),
+      num_work_stealing_workers_(0) {
+  size_t partition_size =
+      tsl::MathUtil::FloorOfRatio(num_tasks, num_partitions);
+  size_t rem_tasks = num_tasks % num_partitions;
+  for (size_t i = 0, begin = 0, end = 0; i < num_partitions; ++i, begin = end) {
+    end = begin + partition_size + ((i < rem_tasks) ? 1 : 0);
     partitions_[i].Initialize(begin, end);
   }
 }
@@ -154,6 +171,29 @@ inline std::optional<size_t> WorkQueue::Pop(size_t partition_index) {
                                                     : std::make_optional(index);
 }
+inline std::pair<size_t, size_t> WorkQueue::partition_range(
+    size_t partition_index) const {
+  DCHECK(partition_index < partitions_.size()) << "Invalid partition index";
+  return {partitions_[partition_index].begin, partitions_[partition_index].end};
+}
+inline void WorkQueue::NotifyWorkStealingWorker() {
+  num_work_stealing_workers_.fetch_add(1, std::memory_order_relaxed);
+}
+inline size_t WorkQueue::DecrementWorkStealingWorkers(size_t max_workers) {
+  size_t n = num_work_stealing_workers_.load(std::memory_order_relaxed);
+  size_t decrement = std::min(n, max_workers);
+  while (decrement && !num_work_stealing_workers_.compare_exchange_weak(
+                          n, n - decrement, std::memory_order_relaxed,
+                          std::memory_order_relaxed)) {
+    decrement = std::min(n, max_workers);
+  }
+  return decrement;
+}
 inline Worker::Worker(size_t worker_index, WorkQueue* queue)
     : worker_index_(worker_index),
       partition_index_(worker_index),
@@ -163,7 +203,13 @@ inline std::optional<size_t> Worker::Pop() {
   std::optional<size_t> task = queue_->Pop(partition_index_);
   if (ABSL_PREDICT_TRUE(task)) return task;
-  while (!task.has_value() && !queue_->empty()) {
+  // If we didn't find a task in the initially assigned partition, notify the
+  // work queue that we are switching to work stealing mode.
+  if (ABSL_PREDICT_FALSE(partition_index_ == worker_index_)) {
+    queue_->NotifyWorkStealingWorker();
+  }
+  while (!task.has_value() && !queue_->IsEmpty()) {
     // Wrap around to the first partition.
     if (ABSL_PREDICT_FALSE(++partition_index_ >= queue_->num_partitions())) {
       partition_index_ = 0;
@@ -171,7 +217,7 @@ inline std::optional<size_t> Worker::Pop() {
     // We checked all partitions and got back to the partition we started from.
     if (ABSL_PREDICT_FALSE(partition_index_ == worker_index_)) {
-      queue_->empty_.store(true, std::memory_order_relaxed);
+      queue_->SetEmpty();
       break;
     }
@@ -205,6 +251,7 @@ Worker::ParallelizeContext<ParallelTask>::ParallelizeContext(
       parallel_task(std::forward<ParallelTask>(parallel_task)) {}
 template <typename ParallelTask>
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
 void Worker::ParallelizeWithContext(ParallelizeContext<ParallelTask>* ctx,
                                     uint16_t start_index, uint16_t end_index) {
   DCHECK_LT(start_index, end_index) << "Invalid worker index range";
@@ -223,11 +270,26 @@ void Worker::ParallelizeWithContext(ParallelizeContext<ParallelTask>* ctx,
   while (end_index - start_index > 1) {
     // If work queue is empty, we don't need to keep enqueuing more workers and
     // can simply count down for the remaining workers.
-    if (ABSL_PREDICT_FALSE(ctx->work_queue.empty())) {
+    if (ABSL_PREDICT_FALSE(ctx->work_queue.IsEmpty())) {
       count_down(end_index - start_index, absl::OkStatus());
       return;
     }
+    // If we have workers in the work stealing mode, we can skip enqueuing
+    // more tasks as existing workers will process remaining partitions. By
+    // doing this optimization we avoid unnecessary thread pool overheads.
+    size_t skip_workers =
+        ctx->work_queue.DecrementWorkStealingWorkers(end_index - start_index);
+    if (ABSL_PREDICT_FALSE(skip_workers > 0)) {
+      DCHECK_LE(skip_workers, end_index - start_index);
+      count_down(skip_workers, absl::OkStatus());
+      end_index -= skip_workers;
+      if (start_index == end_index) return;
+      if (end_index - start_index == 1) break;
+    }
+    DCHECK_GE(end_index - start_index, 1);
     uint16_t mid_index = (start_index + end_index) / 2;
     ctx->device->enqueueNoNotification([ctx, mid_index, end_index] {
       ParallelizeWithContext(ctx, mid_index, end_index);

tensorflow/include/xla/codegen/kernel_spec.h CHANGED Viewed

@@ -17,12 +17,14 @@ limitations under the License.
 #define XLA_CODEGEN_KERNEL_SPEC_H_
 #include <cstddef>
+#include <cstdint>
 #include <optional>
 #include <string>
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
-#include "xla/runtime/buffer_use.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/launch_dim.h"
 namespace xla {
@@ -33,15 +35,17 @@ namespace xla {
 // will load kernel PTX on device and instantiate a KernelThunk.
 class KernelSpec {
  public:
-  using BufferUses = absl::InlinedVector<BufferUse, 8>;
+  using Buffers = absl::InlinedVector<BufferAllocation::Slice, 8>;
   KernelSpec(absl::string_view name, se::ThreadDim thread_dim,
-             BufferUses buffer_uses,
+             Buffers argument_buffers, Buffers result_buffers,
+             absl::flat_hash_set<int64_t> invariant_arguments,
              std::optional<size_t> scratch_bytes = std::nullopt);
   KernelSpec(absl::string_view name, se::ClusterDim cluster_dim,
              se::BlockDim block_dim, se::ThreadDim thread_dim,
-             BufferUses buffer_uses,
+             Buffers argument_buffers, Buffers result_buffers,
+             absl::flat_hash_set<int64_t> invariant_arguments,
              std::optional<size_t> scratch_bytes = std::nullopt);
   // Get the backend specific name of the kernel.
@@ -67,15 +71,28 @@ class KernelSpec {
   // managed buffer that is likely to be in L1/L2 cache).
   std::optional<size_t> scratch_bytes() const { return scratch_bytes_; }
-  // Buffers (buffer allocation slices) used by the kernel.
-  const BufferUses& buffer_uses() const { return buffer_uses_; }
+  // Argument buffers read by the kernel.
+  const Buffers& argument_buffers() const { return argument_buffers_; }
+  // Result buffers written to by the kernel.
+  const Buffers& result_buffers() const { return result_buffers_; }
+  // Returns a set of invariant arguments (corresponding to the indices in the
+  // argument buffers list).
+  const absl::flat_hash_set<int64_t>& invariant_arguments() const {
+    return invariant_arguments_;
+  }
  private:
   std::string name_;
   se::ClusterDim cluster_dim_;
   se::BlockDim block_dim_;
   se::ThreadDim thread_dim_;
-  BufferUses buffer_uses_;
+  Buffers argument_buffers_;
+  Buffers result_buffers_;
+  absl::flat_hash_set<int64_t> invariant_arguments_;
   std::optional<size_t> scratch_bytes_;
 };

tensorflow/include/xla/hlo/ir/hlo_casting_utils.h CHANGED Viewed

@@ -44,28 +44,6 @@ T* Cast(HloInstruction* instr) {
   return tsl::down_cast<T*>(instr);
 }
-// Downcasts a const HloInstruction pointer or returns nullptr if argument is
-// nullptr. Dies if TargetClass::ClassOf() does not match.
-template <typename T>
-const T* CastOrNull(const HloInstruction* i) {
-  if (i == nullptr) {
-    return nullptr;
-  }
-  CHECK(T::ClassOf(i));
-  return tsl::down_cast<const T*>(i);
-}
-// Downcasts a const HloInstruction pointer or returns nullptr if argument is
-// nullptr. Dies if TargetClass::ClassOf() does not match.
-template <typename T>
-T* CastOrNull(HloInstruction* i) {
-  if (i == nullptr) {
-    return nullptr;
-  }
-  CHECK(T::ClassOf(i));
-  return tsl::down_cast<T*>(i);
-}
 // Downcasts a const HloInstruction pointer or returns nullptr if
 // TargetClass::ClassOf() does not match. Dies if argument is nullptr. Similar
 // to LLVM's dyn_cast.
@@ -84,28 +62,6 @@ T* DynCast(HloInstruction* i) {
   return !T::ClassOf(i) ? nullptr : tsl::down_cast<T*>(i);
 }
-// Downcasts a const HloInstruction pointer. Return nullptr if argument is
-// nullptr orTargetClass::ClassOf() does not match. Similar to LLVM's
-// dyn_cast_or_null.
-template <typename T>
-const T* DynCastOrNull(const HloInstruction* instruction) {
-  if (instruction == nullptr || !T::ClassOf(instruction)) {
-    return nullptr;
-  }
-  return tsl::down_cast<const T*>(instruction);
-}
-// Downcasts a non-const HloInstruction pointer. Return nullptr if argument is
-// nullptr orTargetClass::ClassOf() does not match. Similar to LLVM's
-// dyn_cast_or_null.
-template <typename T>
-T* DynCastOrNull(HloInstruction* instruction) {
-  if (instruction == nullptr || !T::ClassOf(instruction)) {
-    return nullptr;
-  }
-  return tsl::down_cast<T*>(instruction);
-}
 }  // namespace xla
 #endif  // XLA_HLO_IR_HLO_CASTING_UTILS_H_

tensorflow/include/xla/hlo/ir/hlo_instruction.h CHANGED Viewed

@@ -1914,6 +1914,18 @@ class HloInstruction {
                           result_accuracy().mode() != ResultAccuracy::DEFAULT);
   }
+  bool equal_result_accuracy(const HloInstruction* other) const {
+    return result_accuracy().has_tolerance() ==
+               other->result_accuracy().has_tolerance() &&
+           result_accuracy().tolerance().atol() ==
+               other->result_accuracy().tolerance().atol() &&
+           result_accuracy().tolerance().rtol() ==
+               other->result_accuracy().tolerance().rtol() &&
+           result_accuracy().tolerance().ulps() ==
+               other->result_accuracy().tolerance().ulps() &&
+           result_accuracy().mode() == other->result_accuracy().mode();
+  }
   void add_single_statistic(Statistic statistic) {
     *mutable_rare()->statistics_viz.add_statistics() = std::move(statistic);
   }

tensorflow/include/xla/mlir_hlo/_virtual_includes/stablehlo_extension_pass_inc_gen/stablehlo_ext/transforms/passes.h.inc CHANGED Viewed

@@ -3,6 +3,7 @@
 #ifdef GEN_PASS_DECL
 // Generate declarations for all passes.
 #define GEN_PASS_DECL_CHLORECOMPOSEOPSPASS
+#define GEN_PASS_DECL_STABLEHLOADDQDQAFTERCONVPASS
 #define GEN_PASS_DECL_STABLEHLOCANONICALIZEDYNAMISMPASS
 #define GEN_PASS_DECL_STABLEHLOFLATTENENTRYFUNCTIONTUPLESPASS
 #define GEN_PASS_DECL_STABLEHLOFLATTENTUPLEPASS
@@ -87,6 +88,82 @@ std::unique_ptr<::mlir::Pass> createChloRecomposeOpsPass() {
 #undef GEN_PASS_DEF_CHLORECOMPOSEOPSPASS
 #endif // GEN_PASS_DEF_CHLORECOMPOSEOPSPASS
+//===----------------------------------------------------------------------===//
+// StablehloAddQDQAfterConvPass
+//===----------------------------------------------------------------------===//
+#ifdef GEN_PASS_DECL_STABLEHLOADDQDQAFTERCONVPASS
+std::unique_ptr<::mlir::Pass> createStablehloAddQDQAfterConvPass();
+#undef GEN_PASS_DECL_STABLEHLOADDQDQAFTERCONVPASS
+#endif // GEN_PASS_DECL_STABLEHLOADDQDQAFTERCONVPASS
+#ifdef GEN_PASS_DEF_STABLEHLOADDQDQAFTERCONVPASS
+namespace impl {
+  std::unique_ptr<::mlir::Pass> createStablehloAddQDQAfterConvPass();
+} // namespace impl
+namespace impl {
+template <typename DerivedT>
+class StablehloAddQDQAfterConvPassBase : public ::mlir::OperationPass<ModuleOp> {
+public:
+  using Base = StablehloAddQDQAfterConvPassBase;
+  StablehloAddQDQAfterConvPassBase() : ::mlir::OperationPass<ModuleOp>(::mlir::TypeID::get<DerivedT>()) {}
+  StablehloAddQDQAfterConvPassBase(const StablehloAddQDQAfterConvPassBase &other) : ::mlir::OperationPass<ModuleOp>(other) {}
+  StablehloAddQDQAfterConvPassBase& operator=(const StablehloAddQDQAfterConvPassBase &) = delete;
+  StablehloAddQDQAfterConvPassBase(StablehloAddQDQAfterConvPassBase &&) = delete;
+  StablehloAddQDQAfterConvPassBase& operator=(StablehloAddQDQAfterConvPassBase &&) = delete;
+  ~StablehloAddQDQAfterConvPassBase() = default;
+  /// Returns the command-line argument attached to this pass.
+  static constexpr ::llvm::StringLiteral getArgumentName() {
+    return ::llvm::StringLiteral("stablehlo-ext-add-qdq-after-conv");
+  }
+  ::llvm::StringRef getArgument() const override { return "stablehlo-ext-add-qdq-after-conv"; }
+  ::llvm::StringRef getDescription() const override { return "Add quant and dequant ops after convolution op."; }
+  /// Returns the derived pass name.
+  static constexpr ::llvm::StringLiteral getPassName() {
+    return ::llvm::StringLiteral("StablehloAddQDQAfterConvPass");
+  }
+  ::llvm::StringRef getName() const override { return "StablehloAddQDQAfterConvPass"; }
+  /// Support isa/dyn_cast functionality for the derived pass class.
+  static bool classof(const ::mlir::Pass *pass) {
+    return pass->getTypeID() == ::mlir::TypeID::get<DerivedT>();
+  }
+  /// A clone method to create a copy of this pass.
+  std::unique_ptr<::mlir::Pass> clonePass() const override {
+    return std::make_unique<DerivedT>(*static_cast<const DerivedT *>(this));
+  }
+  /// Return the dialect that must be loaded in the context before this pass.
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+    registry.insert<mlir::quant::QuantDialect>();
+    registry.insert<stablehlo::StablehloDialect>();
+  }
+  /// Explicitly declare the TypeID for this class. We declare an explicit private
+  /// instantiation because Pass classes should only be visible by the current
+  /// library.
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StablehloAddQDQAfterConvPassBase<DerivedT>)
+protected:
+private:
+  friend std::unique_ptr<::mlir::Pass> createStablehloAddQDQAfterConvPass() {
+    return std::make_unique<DerivedT>();
+  }
+};
+} // namespace impl
+std::unique_ptr<::mlir::Pass> createStablehloAddQDQAfterConvPass() {
+  return impl::createStablehloAddQDQAfterConvPass();
+}
+#undef GEN_PASS_DEF_STABLEHLOADDQDQAFTERCONVPASS
+#endif // GEN_PASS_DEF_STABLEHLOADDQDQAFTERCONVPASS
 //===----------------------------------------------------------------------===//
 // StablehloCanonicalizeDynamismPass
 //===----------------------------------------------------------------------===//
@@ -360,9 +437,9 @@ public:
   /// Returns the command-line argument attached to this pass.
   static constexpr ::llvm::StringLiteral getArgumentName() {
-    return ::llvm::StringLiteral("legalize-quant-composite");
+    return ::llvm::StringLiteral("stablehlo-ext-legalize-quant-composite");
   }
-  ::llvm::StringRef getArgument() const override { return "legalize-quant-composite"; }
+  ::llvm::StringRef getArgument() const override { return "stablehlo-ext-legalize-quant-composite"; }
   ::llvm::StringRef getDescription() const override { return "Lowers the quantization related composites op to native quantized ops."; }
@@ -576,6 +653,23 @@ inline void registerChloRecomposeOpsPassPass() {
   });
 }
+//===----------------------------------------------------------------------===//
+// StablehloAddQDQAfterConvPass Registration
+//===----------------------------------------------------------------------===//
+inline void registerStablehloAddQDQAfterConvPass() {
+  ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
+    return createStablehloAddQDQAfterConvPass();
+  });
+}
+// Old registration code, kept for temporary backwards compatibility.
+inline void registerStablehloAddQDQAfterConvPassPass() {
+  ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
+    return createStablehloAddQDQAfterConvPass();
+  });
+}
 //===----------------------------------------------------------------------===//
 // StablehloCanonicalizeDynamismPass Registration
 //===----------------------------------------------------------------------===//
@@ -684,6 +778,7 @@ inline void registerStablehloRefineShapesPassPass() {
 inline void registerPasses() {
   registerChloRecomposeOpsPass();
+  registerStablehloAddQDQAfterConvPass();
   registerStablehloCanonicalizeDynamismPass();
   registerStablehloFlattenEntryFunctionTuplesPass();
   registerStablehloFlattenTuplePass();
@@ -745,6 +840,56 @@ public:
 protected:
 };
+template <typename DerivedT>
+class StablehloAddQDQAfterConvPassBase : public ::mlir::OperationPass<ModuleOp> {
+public:
+  using Base = StablehloAddQDQAfterConvPassBase;
+  StablehloAddQDQAfterConvPassBase() : ::mlir::OperationPass<ModuleOp>(::mlir::TypeID::get<DerivedT>()) {}
+  StablehloAddQDQAfterConvPassBase(const StablehloAddQDQAfterConvPassBase &other) : ::mlir::OperationPass<ModuleOp>(other) {}
+  StablehloAddQDQAfterConvPassBase& operator=(const StablehloAddQDQAfterConvPassBase &) = delete;
+  StablehloAddQDQAfterConvPassBase(StablehloAddQDQAfterConvPassBase &&) = delete;
+  StablehloAddQDQAfterConvPassBase& operator=(StablehloAddQDQAfterConvPassBase &&) = delete;
+  ~StablehloAddQDQAfterConvPassBase() = default;
+  /// Returns the command-line argument attached to this pass.
+  static constexpr ::llvm::StringLiteral getArgumentName() {
+    return ::llvm::StringLiteral("stablehlo-ext-add-qdq-after-conv");
+  }
+  ::llvm::StringRef getArgument() const override { return "stablehlo-ext-add-qdq-after-conv"; }
+  ::llvm::StringRef getDescription() const override { return "Add quant and dequant ops after convolution op."; }
+  /// Returns the derived pass name.
+  static constexpr ::llvm::StringLiteral getPassName() {
+    return ::llvm::StringLiteral("StablehloAddQDQAfterConvPass");
+  }
+  ::llvm::StringRef getName() const override { return "StablehloAddQDQAfterConvPass"; }
+  /// Support isa/dyn_cast functionality for the derived pass class.
+  static bool classof(const ::mlir::Pass *pass) {
+    return pass->getTypeID() == ::mlir::TypeID::get<DerivedT>();
+  }
+  /// A clone method to create a copy of this pass.
+  std::unique_ptr<::mlir::Pass> clonePass() const override {
+    return std::make_unique<DerivedT>(*static_cast<const DerivedT *>(this));
+  }
+  /// Register the dialects that must be loaded in the context before this pass.
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+    registry.insert<mlir::quant::QuantDialect>();
+    registry.insert<stablehlo::StablehloDialect>();
+  }
+  /// Explicitly declare the TypeID for this class. We declare an explicit private
+  /// instantiation because Pass classes should only be visible by the current
+  /// library.
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StablehloAddQDQAfterConvPassBase<DerivedT>)
+protected:
+};
 template <typename DerivedT>
 class StablehloCanonicalizeDynamismPassBase : public ::mlir::OperationPass<func::FuncOp> {
 public:
@@ -907,9 +1052,9 @@ public:
   /// Returns the command-line argument attached to this pass.
   static constexpr ::llvm::StringLiteral getArgumentName() {
-    return ::llvm::StringLiteral("legalize-quant-composite");
+    return ::llvm::StringLiteral("stablehlo-ext-legalize-quant-composite");
   }
-  ::llvm::StringRef getArgument() const override { return "legalize-quant-composite"; }
+  ::llvm::StringRef getArgument() const override { return "stablehlo-ext-legalize-quant-composite"; }
   ::llvm::StringRef getDescription() const override { return "Lowers the quantization related composites op to native quantized ops."; }