nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242)
  1. nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/kernel_ir.h
@@ -0,0 +1,1457 @@
1
+ // clang-format off
2
+ /*
3
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
4
+ * All rights reserved.
5
+ * SPDX-License-Identifier: BSD-3-Clause
6
+ */
7
+ // clang-format on
8
+ #pragma once
9
+
10
+ #include <exceptions.h>
11
+ #include <ir/all_nodes.h>
12
+ #include <ir/base_nodes.h>
13
+ #include <mma_type.h>
14
+ #include <parallel_type_bitmap.h>
15
+ #include <tma.h>
16
+ #include <type.h>
17
+ #include <utils.h>
18
+ #include <visibility.h>
19
+
20
+ #include <cstdint>
21
+ #include <string>
22
+ #include <unordered_map>
23
+ #include <unordered_set>
24
+ #include <vector>
25
+
26
+ namespace nvfuser {
27
+
28
+ class IrBuilderPasskey;
29
+
30
+ namespace kir {
31
+ class Kernel;
32
+
33
+ // Values
34
+ class Predicate;
35
+ class TensorIndex;
36
+
37
+ // Expressions
38
+ class Allocate;
39
+ class Asm;
40
+ class BlockSync;
41
+ class GridSync;
42
+ class FenceAsyncProxy;
43
+ class WgMmaFence;
44
+ class SetMaxNReg;
45
+ class Return;
46
+ class MBarrierInit;
47
+ class MBarrierInvalidate;
48
+ class MBarrierArrive;
49
+ class MBarrierArriveExpectTx;
50
+ class MBarrierWait;
51
+ class MBarrierWaitParity;
52
+ class BlockSerializeWait;
53
+ class BlockSerializeRelease;
54
+ class AsyncWait;
55
+ class AsyncCommit;
56
+ class InitMagicZero;
57
+ class UpdateMagicZero;
58
+ class IfThenElse;
59
+ class GridReduction;
60
+ class GroupedGridReduction;
61
+ class GridBroadcast;
62
+ class GridWelford;
63
+ class GroupedGridWelford;
64
+ class AllocateFusedReduction;
65
+
66
+ // Expr container
67
+
68
+ class Predicate final : public Val {
69
+ public:
70
+ explicit Predicate(
71
+ IrBuilderPasskey passkey,
72
+ PredicateType ptype,
73
+ const Expr* expr = nullptr,
74
+ Val* thread_pred = nullptr);
75
+
76
+ explicit Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop);
77
+
78
+ explicit Predicate(IrBuilderPasskey passkey, Val* value);
79
+
80
+ std::string toString(int indent_size = 0) const override;
81
+
82
+ std::string toInlineString(int indent_size = 0) const override;
83
+
84
+ PredicateType predicate_type() const {
85
+ return ptype_;
86
+ }
87
+
88
+ const Expr* expr() const {
89
+ NVF_ERROR(
90
+ ptype_ != PredicateType::Unswitch &&
91
+ ptype_ != PredicateType::Vectorize && ptype_ != PredicateType::Manual &&
92
+ ptype_ != PredicateType::ElectSync);
93
+ return expr_;
94
+ }
95
+
96
+ Val* thread_pred() const {
97
+ NVF_ERROR(
98
+ ptype_ == PredicateType::Inline ||
99
+ ptype_ == PredicateType::Misaligned ||
100
+ ptype_ == PredicateType::ReductionWrite ||
101
+ ptype_ == PredicateType::ElectSync);
102
+ return thread_pred_;
103
+ }
104
+
105
+ ForLoop* unrolled_loop() const {
106
+ NVF_ERROR(ptype_ == PredicateType::Unswitch);
107
+ return unrolled_loop_;
108
+ }
109
+
110
+ bool hasValue() const {
111
+ return value_ != nullptr;
112
+ }
113
+
114
+ Val* value() const {
115
+ NVF_ERROR(
116
+ value_ != nullptr,
117
+ "The conditional expression for this Predicate is invalid.");
118
+ return value_;
119
+ }
120
+
121
+ void setValue(Val* value) {
122
+ NVF_ERROR(value != nullptr, "The Bool expression is invalid.");
123
+ value_ = value;
124
+ }
125
+
126
+ bool isConst() const final {
127
+ return hasValue() && value_->isConst();
128
+ }
129
+
130
+ bool isTrivial() const {
131
+ return isConst() && value_->value().is<bool>() &&
132
+ value_->value().as<bool>();
133
+ }
134
+
135
+ private:
136
+ PredicateType ptype_ = PredicateType::Manual;
137
+
138
+ // For PredicateCompute::getInlinePredicate,
139
+ // ShiftPredicateInserter::getShiftPredicate and getPaddingPredicate
140
+ const Expr* expr_ = nullptr;
141
+
142
+ // For PredicateCompute::getInlinePredicate
143
+ Val* thread_pred_ = nullptr;
144
+
145
+ // For ParallelType::Unswitch - UnswitchPredicate::get
146
+ ForLoop* unrolled_loop_ = nullptr;
147
+
148
+ // The Bool conditional value
149
+ // The value is nullptr until lower_predicate pass
150
+ Val* value_ = nullptr;
151
+ };
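
The accessors above (hasValue(), isConst(), isTrivial()) distinguish predicates whose condition has been filled in by the lower_predicate pass from trivially-true ones. A minimal illustrative sketch, using a hypothetical helper name and assuming <kernel_ir.h> is on the include path; this is not part of the header itself:

#include <kernel_ir.h>

// Hypothetical helper: an expression only needs an if-guard when its predicate
// holds a concrete condition that is not the constant `true`.
bool isGuardNeeded(const nvfuser::kir::Predicate* pred) {
  return pred != nullptr && pred->hasValue() && !pred->isTrivial();
}
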
152
+
153
+ class TensorIndex final : public Val {
154
+ public:
155
+ TensorIndex(
156
+ IrBuilderPasskey,
157
+ const TensorView* view,
158
+ Val* index,
159
+ DataType dtype = DataType::Null);
160
+
161
+ Val* index() const {
162
+ return index_;
163
+ }
164
+
165
+ TensorView* view() const {
166
+ NVF_ERROR(view_ != nullptr);
167
+ return const_cast<TensorView*>(view_); // NOLINT
168
+ }
169
+
170
+ std::string toString(int indent_size = 0) const override;
171
+
172
+ std::string toInlineString(int indent_size = 0) const override;
173
+
174
+ private:
175
+ const TensorView* view_ = nullptr;
176
+ Val* index_ = nullptr;
177
+ };
178
+
179
+ // In theory, we should just put this struct into class Asm, but unfortunately,
180
+ // due to a compiler bug, we cannot do that:
181
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165
182
+ struct AsmOptions {
183
+ bool volatile_ = false;
184
+ bool memory = false;
185
+ std::unordered_set<int64_t> readable_outputs = {};
186
+ };
187
+
188
+ class Asm final : public Expr {
189
+ public:
190
+ using Options = AsmOptions;
191
+
192
+ using Expr::Expr;
193
+
194
+ explicit Asm(
195
+ IrBuilderPasskey passkey,
196
+ const std::string& code,
197
+ const std::vector<Val*>& outputs,
198
+ const std::vector<Val*>& inputs,
199
+ const Options& options = Options());
200
+
201
+ NVFUSER_DECLARE_CLONE_AND_CREATE
202
+
203
+ const char* getOpString() const override {
204
+ return "Asm";
205
+ }
206
+
207
+ std::string toString(int indent_size = 0) const override;
208
+ std::string toInlineString(int indent_size = 0) const override;
209
+
210
+ const std::string& code() const {
211
+ return attribute<std::string>(0);
212
+ }
213
+
214
+ const Options& options() const {
215
+ return attribute<Options>(1);
216
+ }
217
+
218
+ Options& options() {
219
+ return attribute<Options>(1);
220
+ }
221
+
222
+ bool volatile_() const {
223
+ return options().volatile_;
224
+ }
225
+
226
+ bool& volatile_() {
227
+ return options().volatile_;
228
+ }
229
+
230
+ bool memory() const {
231
+ return options().memory;
232
+ }
233
+
234
+ bool& memory() {
235
+ return options().memory;
236
+ }
237
+
238
+ bool hasBooleanInput() const {
239
+ for (auto input : inputs()) {
240
+ if (input->dtype() == DataType::Bool) {
241
+ return true;
242
+ }
243
+ }
244
+ return false;
245
+ }
246
+
247
+ std::vector<std::pair<std::string, Val*>> constraintsAndOutputs() const;
248
+ std::vector<std::pair<std::string, Val*>> constraintsAndInputs() const;
249
+
250
+ std::string parameters() const;
251
+ };
252
+
253
+ //! Allocate is a lower-level node that describes a buffer of memory that
254
+ //! is required as an intermediate within a kernel. Its extent is an expression
255
+ //! for the size of the buffer, generated from the TensorView that
256
+ //! describes the output of an operation.
257
+ class Allocate final : public Expr {
258
+ public:
259
+ using Expr::Expr;
260
+
261
+ //! Allocation of a multi-dimensional buffer
262
+ //!
263
+ //! param shape Size of each dimension
264
+ //! param zero_init Should this memory be zero-initialized?
265
+ //! param resets_to_zero Will this memory be set to zero upon completion of
266
+ //! this kernel?
267
+ //! param alias Is this an alias of previously-allocated memory
268
+ explicit Allocate(
269
+ IrBuilderPasskey passkey,
270
+ Val* buffer,
271
+ MemoryType memory_type,
272
+ std::vector<Val*> shape = {},
273
+ bool zero_init = false,
274
+ bool resets_to_zero = false,
275
+ Allocate* alias = nullptr);
276
+
277
+ //! Allocation of a non-dimensional buffer
278
+ //!
279
+ //! param size Size of allocation
280
+ explicit Allocate(
281
+ IrBuilderPasskey passkey,
282
+ Val* buffer,
283
+ MemoryType memory_type,
284
+ Val* size,
285
+ bool zero_init = false,
286
+ bool resets_to_zero = false);
287
+
288
+ const char* getOpString() const override {
289
+ return "Allocate";
290
+ }
291
+
292
+ NVFUSER_DECLARE_CLONE_AND_CREATE
293
+
294
+ std::string toString(int indent_size = 0) const override;
295
+ std::string toInlineString(int indent_size = 0) const override;
296
+
297
+ Val* buffer() const {
298
+ return attributeVal(0);
299
+ }
300
+
301
+ MemoryType memoryType() const {
302
+ return attribute<MemoryType>(1);
303
+ }
304
+
305
+ //! Total size
306
+ Val* size() const {
307
+ return input(0);
308
+ }
309
+
310
+ //! Size of each dimension
311
+ std::vector<Val*> shape() const {
312
+ std::vector<Val*> result;
313
+ result.reserve(attributes().size() - 6);
314
+ for (auto i = attributes().begin() + 6; i != attributes().end(); ++i) {
315
+ result.emplace_back((*i)->as<Val>());
316
+ }
317
+ return result;
318
+ }
319
+
320
+ //! Does this allocation require its memory to be initialized to zero before
321
+ //! this kernel is launched? If this is true, then an additional memset
322
+ //! kernel might be launched before the current Fusion kernel is launched in
323
+ //! order to guarantee that this buffer is filled with zeroes (see
324
+ //! resetsToZero() below).
325
+ bool zeroInit() const {
326
+ return attribute<bool>(2);
327
+ }
328
+
329
+ //! Is this buffer guaranteed to be reset to all zero values at the end of
330
+ //! this kernel? This is used to avoid an additional memset kernel launch for
331
+ //! buffers that require zeroed memory (see zeroInit() above).
332
+ //!
333
+ //! A common use case for zeroInit() allocations is semaphore buffers that
334
+ //! hold counters starting at zero. Typically, each participating thread would
335
+ //! increment the counter and the last thread would leave the counter in a
336
+ //! non-zeroed state. The next time that kernel is run, it can no longer
337
+ //! re-use the non-zero semaphore buffer, so KernelExecutor will launch
338
+ //! at::zeros to allocate a new buffer, resulting in a memset kernel launch.
339
+ //!
340
+ //! Instead, if the last thread resets the counter to zero, then the buffer
341
+ //! can be re-used, and at::zeros need only be run at the first kernel
342
+ //! launch. If resetsToZero() is true, then KernelExecutor will use
343
+ //! contigZeroedTensor() and releaseZeroedMemory() from global_allocator.h to
344
+ //! reuse zeroed memory avoiding the additional kernel launch.
345
+ //!
346
+ //! Whenever possible, we should try to guarantee that resetsToZero() is true
347
+ //! if zeroInit() is true by modifying our code to clean up global counters,
348
+ //! because the latency penalty of an additional kernel launch should be
349
+ //! greater than that required to reset this memory at the end of the fusion.
350
+ //! The exception is when a kernel is launched only a single time, in which
351
+ //! case resetting the memory is unnecessary, but we expect that kernels will
352
+ //! instead be launched many times.
353
+ bool resetsToZero() const {
354
+ return attribute<bool>(3);
355
+ }
356
+
357
+ // This alias tracks the next Allocate node in a linked chain of aliases
358
+ // If the alias is nullptr, then the Allocate node uses memory in the kernel
359
+ const Allocate* alias() const {
360
+ return dynamic_cast<const Allocate*>(attribute(4));
361
+ }
362
+
363
+ // Set the address of a shared memory allocation within the dynamic shared
364
+ // memory array. The addr argument should be a scalar expression describing an
365
+ // aligned address in bytes.
366
+ void setAddress(Val* addr) {
367
+ NVF_CHECK(
368
+ memoryType() == MemoryType::Shared,
369
+ "Allocation address may only be set for shared memory allocations. Memory type is ",
370
+ memoryType());
371
+ NVF_CHECK(
372
+ address() == nullptr,
373
+ "Attempted to set address twice for allocation ",
374
+ toString());
375
+ attributes_[5] = addr;
376
+ }
377
+
378
+ // This is an integer scalar describing the byte address within the dynamic
379
+ // shared memory array for a shared memory allocation. For memory types other
380
+ // than Shared, or before allocation, this function might return nullptr.
381
+ Val* address() const {
382
+ return attributeVal(5);
383
+ }
384
+ };
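
The zeroInit() and resetsToZero() comments above describe when an extra zero-filling launch can be skipped. A minimal sketch of that decision, using a hypothetical helper name (not part of this header):

#include <kernel_ir.h>

// Hypothetical helper: an upfront zero fill (e.g. at::zeros or a reusable zeroed
// pool) is needed only when a buffer must start zeroed but is not guaranteed to be
// zero again when the kernel finishes.
bool needsUpfrontZeroFill(const nvfuser::kir::Allocate* alloc, bool is_first_launch) {
  if (!alloc->zeroInit()) {
    return false; // buffer never needs to start zeroed
  }
  if (alloc->resetsToZero()) {
    return is_first_launch; // zeroed memory can be reused after the first launch
  }
  return true; // must be re-zeroed before every launch
}
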
385
+
386
+ // Sync represents a __syncthreads barrier for block-level coordination.
387
+ //
388
+ // TODO(kir): change name to SyncThreads as we could have other barriers.
389
+ //
390
+ class BlockSync final : public Expr {
391
+ public:
392
+ using Expr::Expr;
393
+
394
+ explicit BlockSync(IrBuilderPasskey passkey, bool war_sync = false);
395
+
396
+ const char* getOpString() const override {
397
+ return "BlockSync";
398
+ }
399
+
400
+ NVFUSER_DECLARE_CLONE_AND_CREATE
401
+
402
+ std::string toString(int indent_size = 0) const override;
403
+ std::string toInlineString(int indent_size = 0) const override;
404
+
405
+ // TODO: war_sync_ is only used for testing/validation purposes.
406
+ bool isWarHazardSync() const {
407
+ return attribute<bool>(0);
408
+ }
409
+ };
410
+
411
+ // Synchronize all blocks in the device; implies a cooperative group launch is
412
+ // required.
413
+ class GridSync final : public Expr {
414
+ public:
415
+ using Expr::Expr;
416
+
417
+ explicit GridSync(
418
+ IrBuilderPasskey passkey,
419
+ ParallelTypeBitmap sync_dims,
420
+ Val* sync_buffer);
421
+
422
+ NVFUSER_DECLARE_CLONE_AND_CREATE
423
+
424
+ const char* getOpString() const override {
425
+ return "GridSync";
426
+ }
427
+
428
+ std::string toString(int indent_size = 0) const override;
429
+ std::string toInlineString(int indent_size = 0) const override;
430
+
431
+ ParallelTypeBitmap syncDims() const {
432
+ return attribute<ParallelTypeBitmap>(0);
433
+ }
434
+
435
+ Val* syncBuffer() const {
436
+ return attributeVal(1);
437
+ }
438
+ };
439
+
440
+ // PTX: fence.proxy.async
441
+ class FenceAsyncProxy final : public Expr {
442
+ public:
443
+ using Expr::Expr;
444
+
445
+ explicit FenceAsyncProxy(IrBuilderPasskey passkey);
446
+
447
+ NVFUSER_DECLARE_CLONE_AND_CREATE
448
+
449
+ const char* getOpString() const override {
450
+ return "FenceAsyncProxy";
451
+ }
452
+
453
+ std::string toString(int indent_size = 0) const override;
454
+ std::string toInlineString(int indent_size = 0) const override;
455
+ };
456
+
457
+ // PTX: wgmma.fence.sync.aligned
458
+ class WgMmaFence final : public Expr {
459
+ public:
460
+ using Expr::Expr;
461
+
462
+ explicit WgMmaFence(IrBuilderPasskey passkey);
463
+
464
+ NVFUSER_DECLARE_CLONE_AND_CREATE
465
+
466
+ const char* getOpString() const override {
467
+ return "WgMmaFence";
468
+ }
469
+
470
+ std::string toString(int indent_size = 0) const override;
471
+ std::string toInlineString(int indent_size = 0) const override;
472
+ };
473
+
474
+ // PTX: setmaxnreg.inc.sync.aligned.u32 and setmaxnreg.dec.sync.aligned.u32
475
+ class SetMaxNReg final : public Expr {
476
+ public:
477
+ using Expr::Expr;
478
+
479
+ explicit SetMaxNReg(
480
+ IrBuilderPasskey passkey,
481
+ Val* number_of_registers,
482
+ bool increase_registers);
483
+
484
+ NVFUSER_DECLARE_CLONE_AND_CREATE
485
+
486
+ const char* getOpString() const override {
487
+ return (increaseRegisters()) ? "IncSetMaxNReg" : "DecSetMaxNReg";
488
+ }
489
+
490
+ std::string toString(int indent_size = 0) const override;
491
+ std::string toInlineString(int indent_size = 0) const override;
492
+
493
+ bool increaseRegisters() const {
494
+ return attribute<bool>(0);
495
+ }
496
+
497
+ Val* numberOfRegisters() const {
498
+ return input(0);
499
+ }
500
+ };
501
+
502
+ class Return final : public Expr {
503
+ public:
504
+ using Expr::Expr;
505
+
506
+ explicit Return(IrBuilderPasskey passkey);
507
+
508
+ NVFUSER_DECLARE_CLONE_AND_CREATE
509
+
510
+ const char* getOpString() const override {
511
+ return "Return";
512
+ }
513
+
514
+ std::string toString(int indent_size = 0) const override;
515
+ std::string toInlineString(int indent_size = 0) const override;
516
+ };
517
+
518
+ class MBarrierInit final : public Expr {
519
+ public:
520
+ using Expr::Expr;
521
+ explicit MBarrierInit(
522
+ IrBuilderPasskey passkey,
523
+ Val* mbarrier,
524
+ Val* thread_count);
525
+
526
+ NVFUSER_DECLARE_CLONE_AND_CREATE
527
+
528
+ const char* getOpString() const override {
529
+ return "MBarrierInit";
530
+ }
531
+
532
+ std::string toString(int indent_size = 0) const override;
533
+ std::string toInlineString(int indent_size = 0) const override;
534
+
535
+ Val* mbarrier() const {
536
+ return input(0);
537
+ }
538
+
539
+ Val* threadCount() const {
540
+ return input(1);
541
+ }
542
+ };
543
+
544
+ class MBarrierInvalidate final : public Expr {
545
+ public:
546
+ using Expr::Expr;
547
+ explicit MBarrierInvalidate(IrBuilderPasskey passkey, Val* mbarrier);
548
+
549
+ NVFUSER_DECLARE_CLONE_AND_CREATE
550
+
551
+ const char* getOpString() const override {
552
+ return "MBarrierInvalidate";
553
+ }
554
+
555
+ std::string toString(int indent_size = 0) const override;
556
+ std::string toInlineString(int indent_size = 0) const override;
557
+
558
+ Val* mbarrier() const {
559
+ return input(0);
560
+ }
561
+ };
562
+
563
+ class MBarrierArrive final : public Expr {
564
+ public:
565
+ using Expr::Expr;
566
+ explicit MBarrierArrive(IrBuilderPasskey passkey, Val* state, Val* mbarrier);
567
+
568
+ NVFUSER_DECLARE_CLONE_AND_CREATE
569
+
570
+ const char* getOpString() const override {
571
+ return "MBarrierArrive";
572
+ }
573
+
574
+ std::string toString(int indent_size = 0) const override;
575
+ std::string toInlineString(int indent_size = 0) const override;
576
+
577
+ Val* state() const {
578
+ if (!outputs().empty()) {
579
+ return output(0);
580
+ }
581
+ return nullptr;
582
+ }
583
+
584
+ Val* mbarrier() const {
585
+ return input(0);
586
+ }
587
+ };
588
+
589
+ // IR node for: mbarrier.arrive.expect_tx
590
+ // This is usually used to specify the number of bytes that will be
591
+ // transferred for cp.async and cp.async.bulk, so that a future mbarrier.wait
592
+ // can wait for the completion of the transfer.
593
+ class MBarrierArriveExpectTx final : public Expr {
594
+ public:
595
+ using Expr::Expr;
596
+ explicit MBarrierArriveExpectTx(
597
+ IrBuilderPasskey passkey,
598
+ Val* state,
599
+ Val* mbarrier,
600
+ Val* tx_count);
601
+
602
+ NVFUSER_DECLARE_CLONE_AND_CREATE
603
+
604
+ const char* getOpString() const override {
605
+ return "MBarrierArriveExpectTx";
606
+ }
607
+
608
+ std::string toString(int indent_size = 0) const override;
609
+ std::string toInlineString(int indent_size = 0) const override;
610
+
611
+ Val* state() const {
612
+ if (!outputs().empty()) {
613
+ return output(0);
614
+ }
615
+ return nullptr;
616
+ }
617
+
618
+ Val* mbarrier() const {
619
+ return input(0);
620
+ }
621
+
622
+ Val* txCount() const {
623
+ return input(1);
624
+ }
625
+ };
626
+
627
+ class MBarrierWait final : public Expr {
628
+ public:
629
+ using Expr::Expr;
630
+ explicit MBarrierWait(IrBuilderPasskey passkey, Val* mbarrier, Val* state);
631
+
632
+ NVFUSER_DECLARE_CLONE_AND_CREATE
633
+
634
+ const char* getOpString() const override {
635
+ return "MBarrierWait";
636
+ }
637
+
638
+ std::string toString(int indent_size = 0) const override;
639
+ std::string toInlineString(int indent_size = 0) const override;
640
+
641
+ Val* mbarrier() const {
642
+ return input(0);
643
+ }
644
+
645
+ Val* state() const {
646
+ return input(1);
647
+ }
648
+ };
649
+
650
+ class MBarrierWaitParity final : public Expr {
651
+ public:
652
+ using Expr::Expr;
653
+ explicit MBarrierWaitParity(
654
+ IrBuilderPasskey passkey,
655
+ Val* mbarrier,
656
+ Val* parity);
657
+
658
+ NVFUSER_DECLARE_CLONE_AND_CREATE
659
+
660
+ const char* getOpString() const override {
661
+ return "MBarrierWaitParity";
662
+ }
663
+
664
+ std::string toString(int indent_size = 0) const override;
665
+ std::string toInlineString(int indent_size = 0) const override;
666
+
667
+ Val* mbarrier() const {
668
+ return input(0);
669
+ }
670
+
671
+ Val* parity() const {
672
+ return input(1);
673
+ }
674
+ };
675
+
676
+ // For all but the first block in each reduction segment, the first thread waits
677
+ // for the sync flag to indicate it is its turn to proceed (the sync flag is
678
+ // incremented by BlockSerializeRelease), then performs a block sync. This has
679
+ // the effect of serializing the blocks within each reduction segment. This is a
680
+ // block syncing operation.
681
+ class BlockSerializeWait final : public Expr {
682
+ public:
683
+ using Expr::Expr;
684
+
685
+ explicit BlockSerializeWait(
686
+ IrBuilderPasskey passkey,
687
+ ParallelTypeBitmap sync_dims,
688
+ Val* sync_buffer);
689
+
690
+ NVFUSER_DECLARE_CLONE_AND_CREATE
691
+
692
+ const char* getOpString() const override {
693
+ return "BlockSerializeWait";
694
+ }
695
+
696
+ std::string toString(int indent_size = 0) const override;
697
+ std::string toInlineString(int indent_size = 0) const override;
698
+
699
+ ParallelTypeBitmap syncDims() const {
700
+ return attribute<ParallelTypeBitmap>(0);
701
+ }
702
+
703
+ Val* syncBuffer() const {
704
+ return attributeVal(1);
705
+ }
706
+ };
707
+
708
+ // This first performs a block sync. For all but the last block in the reduction
709
+ // segment, the first thread then writes the next segment ID to the sync flag.
710
+ // When used with BlockSerializeWait, this has the effect of serializing blocks
711
+ // in order within each reduction segment.
712
+ class BlockSerializeRelease final : public Expr {
713
+ public:
714
+ using Expr::Expr;
715
+
716
+ explicit BlockSerializeRelease(
717
+ IrBuilderPasskey passkey,
718
+ ParallelTypeBitmap sync_dims,
719
+ Val* sync_buffer);
720
+
721
+ NVFUSER_DECLARE_CLONE_AND_CREATE
722
+
723
+ const char* getOpString() const override {
724
+ return "BlockSerializeRelease";
725
+ }
726
+
727
+ std::string toString(int indent_size = 0) const override;
728
+ std::string toInlineString(int indent_size = 0) const override;
729
+
730
+ ParallelTypeBitmap syncDims() const {
731
+ return attribute<ParallelTypeBitmap>(0);
732
+ }
733
+
734
+ Val* syncBuffer() const {
735
+ return attributeVal(1);
736
+ }
737
+ };
738
+
739
+ // AsyncWait represents wait intrinsics for cp.async, cp.async.bulk and
740
+ // wgmma.mma_async
741
+ class AsyncWait final : public Expr {
742
+ public:
743
+ using Expr::Expr;
744
+
745
+ explicit AsyncWait(
746
+ IrBuilderPasskey passkey,
747
+ AsyncOpType async_op_type,
748
+ int64_t keep_stages = 0);
749
+
750
+ NVFUSER_DECLARE_CLONE_AND_CREATE
751
+
752
+ const char* getOpString() const override {
753
+ return "AsyncWait";
754
+ }
755
+
756
+ std::string toString(int indent_size = 0) const override;
757
+ std::string toInlineString(int indent_size = 0) const override;
758
+
759
+ const char* ptx() const;
760
+ bool memory() const;
761
+
762
+ AsyncOpType asyncOpType() const {
763
+ return attribute<AsyncOpType>(0);
764
+ }
765
+
766
+ //! Returns the remaining number of stages that are not synchronized
767
+ //! after this op.
768
+ int64_t keepStages() const {
769
+ return attribute<int64_t>(1);
770
+ }
771
+ };
772
+
773
+ // AsyncCommit represents commit intrinsics for cp.async
774
+ // A commit intrinsic communicates the delimiter of transaction groups
776
+ // to the async load hardware. For example usage, see [Circular buffer].
776
+ class AsyncCommit final : public Expr {
777
+ public:
778
+ using Expr::Expr;
779
+
780
+ explicit AsyncCommit(IrBuilderPasskey passkey, AsyncOpType async_op_type);
781
+
782
+ NVFUSER_DECLARE_CLONE_AND_CREATE
783
+
784
+ const char* getOpString() const override {
785
+ return "AsyncCommit";
786
+ }
787
+
788
+ std::string toString(int indent_size = 0) const override;
789
+ std::string toInlineString(int indent_size = 0) const override;
790
+
791
+ const char* ptx() const;
792
+
793
+ //! Returns whether the corresponding PTX needs a `:memory` at the end; this value
794
+ //! will be used to set AsmOptions::memory when lowering to inline PTX.
795
+ bool memory() const;
796
+
797
+ AsyncOpType asyncOpType() const {
798
+ return attribute<AsyncOpType>(0);
799
+ }
800
+ };
801
+
802
+ // Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero
803
+ // in helpers.cu
804
+ class InitMagicZero final : public Expr {
805
+ public:
806
+ using Expr::Expr;
807
+
808
+ explicit InitMagicZero(IrBuilderPasskey passkey);
809
+
810
+ NVFUSER_DECLARE_CLONE_AND_CREATE
811
+
812
+ const char* getOpString() const override {
813
+ return "InitMagicZero";
814
+ }
815
+
816
+ std::string toString(int indent_size = 0) const override;
817
+ std::string toInlineString(int indent_size = 0) const override;
818
+ };
819
+
820
+ // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero
821
+ // in helpers.cu
822
+ class UpdateMagicZero final : public Expr {
823
+ public:
824
+ using Expr::Expr;
825
+
826
+ explicit UpdateMagicZero(IrBuilderPasskey passkey);
827
+
828
+ NVFUSER_DECLARE_CLONE_AND_CREATE
829
+
830
+ const char* getOpString() const override {
831
+ return "UpdateMagicZero";
832
+ }
833
+
834
+ std::string toString(int indent_size = 0) const override;
835
+ std::string toInlineString(int indent_size = 0) const override;
836
+ };
837
+
838
+ //! IfThenElse provides scoping for a boolean operator. Exprs placed in its
839
+ //! body are considered inside the scope of the if statement. In the future the
840
+ //! implementation should look quite different so that we can do proper
841
+ //! dependency analysis like in Fusion.
842
+ //!
843
+ //! TODO(kir): this is not a real expression
844
+ //!
845
+ class IfThenElse final : public Expr {
846
+ public:
847
+ using Expr::Expr;
848
+
849
+ explicit IfThenElse(IrBuilderPasskey passkey, Predicate* cond);
850
+
851
+ NVFUSER_DECLARE_CLONE_AND_CREATE
852
+
853
+ const char* getOpString() const override {
854
+ return "IfThenElse";
855
+ }
856
+
857
+ std::string toString(int indent_size = 0) const override;
858
+ std::string toInlineString(int indent_size = 0) const override;
859
+
860
+ Scope& thenBody() {
861
+ return attribute<Scope>(0);
862
+ }
863
+ const Scope& thenBody() const {
864
+ return attribute<Scope>(0);
865
+ }
866
+
867
+ Scope& elseBody() {
868
+ return attribute<Scope>(1);
869
+ }
870
+
871
+ const Scope& elseBody() const {
872
+ return attribute<Scope>(1);
873
+ }
874
+
875
+ bool hasElse() const {
876
+ return !elseBody().empty();
877
+ }
878
+
879
+ bool empty() const {
880
+ return thenBody().empty() && elseBody().empty();
881
+ }
882
+ };
883
+
884
+ //! Grid reduction operation
885
+ //!
886
+ //! This node is used only after lowering a fusion to explicitly mark a grid
887
+ //! reduction and the buffer allocation needed to do it.
888
+ //!
889
+ //! This node provides KernelExecutor the information it needs to allocate the
890
+ //! reduction and sync buffers.
891
+ class GridReduction final : public ReductionOp {
892
+ static constexpr int num_reduction_op_attr = 4;
893
+
894
+ public:
895
+ using ReductionOp::ReductionOp;
896
+
897
+ GridReduction(
898
+ IrBuilderPasskey passkey,
899
+ BinaryOpType reduction_op_type,
900
+ Val* init,
901
+ Val* out,
902
+ Val* in,
903
+ Allocate* reduction_buffer,
904
+ Allocate* sync_buffer,
905
+ Val* entrance_index,
906
+ Val* entrances,
907
+ bool is_allreduce = false,
908
+ TensorIndex* serial_reduction_tensor = nullptr);
909
+
910
+ NVFUSER_DECLARE_CLONE_AND_CREATE
911
+
912
+ const char* getOpString() const override {
913
+ return "GridReduction";
914
+ }
915
+
916
+ std::string toString(int indent_size = 0) const override;
917
+ std::string toInlineString(int indent_size = 0) const override;
918
+
919
+ Allocate* reduction_buffer() const {
920
+ return attribute(num_reduction_op_attr)->as<Allocate>();
921
+ }
922
+
923
+ Allocate* sync_buffer() const {
924
+ return attribute(num_reduction_op_attr + 1)->as<Allocate>();
925
+ }
926
+
927
+ // Which instance of entering this grid reduction is this iteration?
928
+ Val* entrance_index() const {
929
+ return attributeVal(num_reduction_op_attr + 2);
930
+ }
931
+
932
+ // How many times will this grid reduction be entered
933
+ Val* entrances() const {
934
+ return attributeVal(num_reduction_op_attr + 3);
935
+ }
936
+
937
+ // gridReduce has template flags for thread predicates. In order to
938
+ // use them, the thread predicate is held here separately from
939
+ // Expr::predicate_.
940
+ const ParallelTypeBitmap& threadPredicate() const {
941
+ return attribute<ParallelTypeBitmap>(num_reduction_op_attr + 4);
942
+ }
943
+
944
+ ParallelTypeBitmap& threadPredicate() {
945
+ return attribute<ParallelTypeBitmap>(num_reduction_op_attr + 4);
946
+ }
947
+
948
+ TensorIndex* serialReductionTensor() const {
949
+ return dynamic_cast<TensorIndex*>(attributeVal(num_reduction_op_attr + 5));
950
+ }
951
+
952
+ bool isSerial() const {
953
+ return serialReductionTensor() != nullptr;
954
+ }
955
+
956
+ GridReduction* withThreadPredicate(
957
+ const ParallelTypeBitmap& thread_predicate) {
958
+ auto result = shallowCopy()->as<GridReduction>();
959
+ result->threadPredicate() = thread_predicate;
960
+ return result;
961
+ }
962
+ };
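
A brief usage sketch of the thread-predicate API declared above; the grid_reduction and predicated_dims names are hypothetical and stand for values a lowering pass would already hold:

#include <kernel_ir.h>
#include <parallel_type_bitmap.h>

void attachThreadPredicate(
    nvfuser::kir::GridReduction* grid_reduction,           // hypothetical input
    const nvfuser::ParallelTypeBitmap& predicated_dims) {  // hypothetical input
  // withThreadPredicate() returns a shallow copy with the predicate attached,
  // leaving the original node untouched.
  nvfuser::kir::GridReduction* predicated =
      grid_reduction->withThreadPredicate(predicated_dims);
  // A serial grid reduction is one that carries a serial reduction TensorIndex.
  bool is_serial = predicated->isSerial();
  (void)is_serial;
}
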
963
+
964
+ class GroupedGridReduction final : public GroupedReductionOp {
965
+ public:
966
+ using GroupedReductionOp::GroupedReductionOp;
967
+
968
+ GroupedGridReduction(
969
+ IrBuilderPasskey passkey,
970
+ std::vector<BinaryOpType> reduction_op_type,
971
+ std::vector<Val*> init,
972
+ std::vector<Val*> out,
973
+ std::vector<Val*> in,
974
+ std::vector<Allocate*> reduction_buffers,
975
+ Allocate* sync_buffer,
976
+ Val* entrance_index,
977
+ Val* entrances,
978
+ Val* buffer_stride,
979
+ bool is_allreduce = false);
980
+
981
+ NVFUSER_DECLARE_CLONE_AND_CREATE
982
+
983
+ // number of attributes in the parent class
984
+ size_t numGroupedReductionOpAttr() const {
985
+ return 2 + outputs().size();
986
+ }
987
+
988
+ const char* getOpString() const override {
989
+ return "GroupedGridReduction";
990
+ }
991
+
992
+ std::string toString(int indent_size = 0) const override;
993
+ std::string toInlineString(int indent_size = 0) const override;
994
+
995
+ std::vector<Allocate*> reduction_buffers() const {
996
+ auto offset = numGroupedReductionOpAttr() + 5;
997
+ auto size = outputs().size();
998
+ std::vector<Allocate*> result;
999
+ result.reserve(size);
1000
+ for (auto i : c10::irange(offset, offset + size)) {
1001
+ result.emplace_back(attribute(i)->as<Allocate>());
1002
+ }
1003
+ return result;
1004
+ }
1005
+
1006
+ Allocate* reduction_buffer(size_t i) const {
1007
+ return reduction_buffers().at(i);
1008
+ }
1009
+
1010
+ Allocate* sync_buffer() const {
1011
+ return attribute(numGroupedReductionOpAttr())->as<Allocate>();
1012
+ }
1013
+
1014
+ // Which instance of entering this grid reduction is this iteration?
1015
+ Val* entrance_index() const {
1016
+ return attributeVal(numGroupedReductionOpAttr() + 1);
1017
+ }
1018
+
1019
+ // How many times will this grid reduction be entered
1020
+ Val* entrances() const {
1021
+ return attributeVal(numGroupedReductionOpAttr() + 2);
1022
+ }
1023
+
1024
+ // Stride of reduction buffers
1025
+ Val* buffer_stride() const {
1026
+ return attributeVal(numGroupedReductionOpAttr() + 3);
1027
+ }
1028
+
1029
+ // gridReduce has template flags for thread predicates. In order to
1030
+ // use them, the thread predicate is held here separately from
1031
+ // Expr::predicate_.
1032
+ const ParallelTypeBitmap& threadPredicate() const {
1033
+ return attribute<ParallelTypeBitmap>(numGroupedReductionOpAttr() + 4);
1034
+ }
1035
+
1036
+ ParallelTypeBitmap& threadPredicate() {
1037
+ return attribute<ParallelTypeBitmap>(numGroupedReductionOpAttr() + 4);
1038
+ }
1039
+
1040
+ GroupedGridReduction* withThreadPredicate(
1041
+ const ParallelTypeBitmap& thread_predicate) {
1042
+ auto result = shallowCopy()->as<GroupedGridReduction>();
1043
+ result->threadPredicate() = thread_predicate;
1044
+ return result;
1045
+ }
1046
+ };
1047
+
1048
+ //! Grid broadcast operation
1049
+ //!
1050
+ //! This node is used only after lowering a fusion to explicitly mark a grid
1051
+ //! broadcast and the buffer allocation needed to do it.
1052
+ //!
1053
+ //! This node provides KernelExecutor the information it needs to allocate the
1054
+ //! broadcast and sync buffers.
1055
+ class GridBroadcast final : public Expr {
1056
+ public:
1057
+ using Expr::Expr;
1058
+
1059
+ GridBroadcast(
1060
+ IrBuilderPasskey passkey,
1061
+ BroadcastOp* broadcast_op,
1062
+ Allocate* broadcast_buffer,
1063
+ Allocate* sync_buffer);
1064
+
1065
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1066
+
1067
+ const char* getOpString() const override {
1068
+ return "GridBroadcast";
1069
+ }
1070
+
1071
+ std::string toString(int indent_size = 0) const override;
1072
+ std::string toInlineString(int indent_size = 0) const override;
1073
+
1074
+ BroadcastOp* broadcast_op() const {
1075
+ return attribute(0)->as<BroadcastOp>();
1076
+ }
1077
+
1078
+ Allocate* broadcast_buffer() const {
1079
+ return attribute(1)->as<Allocate>();
1080
+ }
1081
+
1082
+ Allocate* sync_buffer() const {
1083
+ return attribute(2)->as<Allocate>();
1084
+ }
1085
+ };
1086
+
1087
+ //! Grid welford operation
1088
+ //!
1089
+ //! This node is used only after lowering a fusion to explicitly mark a grid
1090
+ //! reduction and the buffer allocation needed to do it.
1091
+ //!
1092
+ //! This node provides KernelExecutor the information it needs to allocate the
1093
+ //! reduction and sync buffers.
1094
+ //!
1095
+ //! TODO: Make this a subclass of WelfordOp
1096
+ class GridWelford final : public Expr {
1097
+ public:
1098
+ using Expr::Expr;
1099
+
1100
+ GridWelford(
1101
+ IrBuilderPasskey passkey,
1102
+ WelfordOp* welford_op,
1103
+ Allocate* var_buffer,
1104
+ Allocate* avg_buffer,
1105
+ Allocate* n_buffer,
1106
+ Allocate* sync_buffer,
1107
+ Val* entrance_index,
1108
+ Val* entrances);
1109
+
1110
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1111
+
1112
+ const char* getOpString() const override {
1113
+ return "GridWelford";
1114
+ }
1115
+
1116
+ std::string toString(int indent_size = 0) const override;
1117
+ std::string toInlineString(int indent_size = 0) const override;
1118
+
1119
+ WelfordOp* welford_op() const {
1120
+ return attribute(0)->as<WelfordOp>();
1121
+ }
1122
+
1123
+ Allocate* var_buffer() const {
1124
+ return attribute(1)->as<Allocate>();
1125
+ }
1126
+
1127
+ Allocate* avg_buffer() const {
1128
+ return attribute(2)->as<Allocate>();
1129
+ }
1130
+
1131
+ Allocate* N_buffer() const {
1132
+ return attribute(3)->as<Allocate>();
1133
+ }
1134
+
1135
+ Allocate* sync_buffer() const {
1136
+ return attribute(4)->as<Allocate>();
1137
+ }
1138
+
1139
+ // Which instance of entering this grid reduction is this iteration?
1140
+ Val* entrance_index() const {
1141
+ return attributeVal(5);
1142
+ }
1143
+
1144
+ // How many times will this grid reduction be entered
1145
+ Val* entrances() const {
1146
+ return attributeVal(6);
1147
+ }
1148
+
1149
+ // gridReduce has template flags for thread predicates. In order to
1150
+ // use them, the thread predicate is held here separately from
1151
+ // Expr::predicate_.
1152
+ const ParallelTypeBitmap& threadPredicate() const {
1153
+ return attribute<ParallelTypeBitmap>(7);
1154
+ }
1155
+ ParallelTypeBitmap& threadPredicate() {
1156
+ return attribute<ParallelTypeBitmap>(7);
1157
+ }
1158
+
1159
+ GridWelford* withThreadPredicate(const ParallelTypeBitmap& thread_predicate) {
1160
+ auto result = shallowCopy()->as<GridWelford>();
1161
+ result->threadPredicate() = thread_predicate;
1162
+ return result;
1163
+ }
1164
+ };
1165
+
1166
+ class GroupedGridWelford final : public GroupedWelfordOp {
1167
+ public:
1168
+ using GroupedWelfordOp::GroupedWelfordOp;
1169
+
1170
+ // input, output and init vals are vectors of triplets
1171
+ GroupedGridWelford(
1172
+ IrBuilderPasskey passkey,
1173
+ std::vector<WelfordTriplet> output_vals,
1174
+ std::vector<WelfordTriplet> input_vals,
1175
+ std::vector<WelfordTriplet> init_vals,
1176
+ std::array<std::vector<Allocate*>, 3> reduction_buffers,
1177
+ Allocate* sync_buffer,
1178
+ Val* entrance_index,
1179
+ Val* entrances,
1180
+ Val* buffer_stride,
1181
+ bool is_allreduce = false,
1182
+ bool use_outer_opt = false);
1183
+
1184
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1185
+
1186
+ size_t numGroupedWelfordOpAttr() const {
1187
+ return 1 + outputs().size();
1188
+ }
1189
+
1190
+ const char* getOpString() const override {
1191
+ return "GroupedGridWelford";
1192
+ }
1193
+
1194
+ std::string toString(int indent_size = 0) const override;
1195
+ std::string toInlineString(int indent_size = 0) const override;
1196
+
1197
+ std::array<std::vector<Allocate*>, 3> reduction_buffers() const {
1198
+ auto offset = numGroupedWelfordOpAttr() + 5;
1199
+ auto size = outputs().size() / 3;
1200
+ std::array<std::vector<Allocate*>, 3> result;
1201
+ result[0].reserve(size);
1202
+ result[1].reserve(size);
1203
+ result[2].reserve(size);
1204
+ for (auto i : c10::irange(size)) {
1205
+ result[0].emplace_back(attribute(offset + i * 3)->as<Allocate>());
1206
+ result[1].emplace_back(attribute(offset + i * 3 + 1)->as<Allocate>());
1207
+ result[2].emplace_back(attribute(offset + i * 3 + 2)->as<Allocate>());
1208
+ }
1209
+ return result;
1210
+ }
1211
+
1212
+ Allocate* sync_buffer() const {
1213
+ return attribute(numGroupedWelfordOpAttr())->as<Allocate>();
1214
+ }
1215
+
1216
+ // Which instance of entering this grid reduction is this iteration?
1217
+ Val* entrance_index() const {
1218
+ return attributeVal(numGroupedWelfordOpAttr() + 1);
1219
+ }
1220
+
1221
+ // How many times will this grid reduction be entered
1222
+ Val* entrances() const {
1223
+ return attributeVal(numGroupedWelfordOpAttr() + 2);
1224
+ }
1225
+
1226
+ // Stride of reduction buffers
1227
+ Val* buffer_stride() const {
1228
+ return attributeVal(numGroupedWelfordOpAttr() + 3);
1229
+ }
1230
+
1231
+ // gridReduce has template flags for thread predicates. In order to
1232
+ // use them, the thread predicate is held here separately from
1233
+ // Expr::predicate_.
1234
+ const ParallelTypeBitmap& threadPredicate() const {
1235
+ return attribute<ParallelTypeBitmap>(numGroupedWelfordOpAttr() + 4);
1236
+ }
1237
+ ParallelTypeBitmap& threadPredicate() {
1238
+ return attribute<ParallelTypeBitmap>(numGroupedWelfordOpAttr() + 4);
1239
+ }
1240
+
1241
+ GroupedGridWelford* withThreadPredicate(
1242
+ const ParallelTypeBitmap& thread_predicate) {
1243
+ auto result = shallowCopy()->as<GroupedGridWelford>();
1244
+ result->threadPredicate() = thread_predicate;
1245
+ return result;
1246
+ }
1247
+
1248
+ // True if the outer-optimized kernel should be used
1249
+ bool useOuterOpt() const {
1250
+ auto offset = numGroupedWelfordOpAttr() + 5 + outputs().size();
1251
+ return attribute<bool>(offset);
1252
+ }
1253
+
1254
+ //! Return the required smem buffer size
1255
+ int64_t getSmemBufferSize(int64_t bdimx, int64_t bdimy, int64_t bdimz) const;
1256
+ };
1257
+
1258
+ //! Represents a WelfordOp with the division by count hoisted out
1259
+ //! of an innermost loop
1260
+ class VectorizedWelfordOp final : public WelfordOp {
1261
+ public:
1262
+ using WelfordOp::WelfordOp;
1263
+
1264
+ VectorizedWelfordOp(
1265
+ IrBuilderPasskey,
1266
+ const WelfordTriplet& output,
1267
+ const WelfordTriplet& input,
1268
+ const WelfordTriplet& init,
1269
+ Val* count,
1270
+ Val* reciprocal_of_count,
1271
+ Val* hoisted_predicate);
1272
+
1273
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1274
+
1275
+ const char* getOpString() const override {
1276
+ return "VectorizedWelfordOp";
1277
+ }
1278
+
1279
+ //! New count that should be set to outN
1280
+ Val* count() const {
1281
+ return attributeVal(WelfordOp::kNumAttrs);
1282
+ }
1283
+
1284
+ //! Reciprocal of count
1285
+ Val* reciprocalOfCount() const {
1286
+ return attributeVal(WelfordOp::kNumAttrs + 1);
1287
+ }
1288
+
1289
+ //! Predicate of this expression hoisted out of an innermost loop
1290
+ Val* hoistedPredicate() const {
1291
+ return attributeVal(WelfordOp::kNumAttrs + 2);
1292
+ }
1293
+ };
1294
+
1295
+ // Allocate an instance of the fused reduction class.
1296
+ class AllocateFusedReduction final : public Expr {
1297
+ explicit AllocateFusedReduction(IrBuilderPasskey passkey, Expr* grid_expr);
1298
+
1299
+ public:
1300
+ using Expr::Expr;
1301
+
1302
+ explicit AllocateFusedReduction(
1303
+ IrBuilderPasskey passkey,
1304
+ GridReduction* grid_reduction)
1305
+ : AllocateFusedReduction(passkey, dynamic_cast<Expr*>(grid_reduction)) {}
1306
+
1307
+ explicit AllocateFusedReduction(
1308
+ IrBuilderPasskey passkey,
1309
+ GridWelford* grid_welford)
1310
+ : AllocateFusedReduction(passkey, dynamic_cast<Expr*>(grid_welford)) {}
1311
+
1312
+ explicit AllocateFusedReduction(
1313
+ IrBuilderPasskey passkey,
1314
+ GroupedGridReduction* grouped_grid_reduction)
1315
+ : AllocateFusedReduction(
1316
+ passkey,
1317
+ dynamic_cast<Expr*>(grouped_grid_reduction)) {}
1318
+
1319
+ explicit AllocateFusedReduction(
1320
+ IrBuilderPasskey passkey,
1321
+ GroupedGridWelford* grouped_grid_welford)
1322
+ : AllocateFusedReduction(
1323
+ passkey,
1324
+ dynamic_cast<Expr*>(grouped_grid_welford)) {}
1325
+
1326
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1327
+
1328
+ const char* getOpString() const override {
1329
+ return "AllocateFusedReduction";
1330
+ }
1331
+
1332
+ std::string toString(int indent_size = 0) const override;
1333
+ std::string toInlineString(int indent_size = 0) const override;
1334
+
1335
+ //! GridReduction, GridWelford, GroupedGridReduction or GroupedGridWelford
1336
+ Expr* gridExpr() const {
1337
+ return attribute(0)->asExpr();
1338
+ }
1339
+
1340
+ TensorIndex* out() const;
1341
+
1342
+ const ParallelTypeBitmap& threadPredicate() const;
1343
+ };
1344
+
1345
+ class GetRNGSeedAndOffsetFromHost : public Expr {
1346
+ public:
1347
+ using Expr::Expr;
1348
+
1349
+ GetRNGSeedAndOffsetFromHost(
1350
+ IrBuilderPasskey,
1351
+ Val* seed_ptr,
1352
+ Val* seed_val,
1353
+ Val* first_offset_ptr,
1354
+ Val* first_offset_val,
1355
+ int64_t offsets = -1);
1356
+
1357
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1358
+
1359
+ const char* getOpString() const override {
1360
+ return "GetRNGSeedAndOffsetFromHost";
1361
+ }
1362
+
1363
+ std::string toString(int indent_size = 0) const override;
1364
+ std::string toInlineString(int indent_size = 0) const override;
1365
+
1366
+ const int64_t& offsets() const {
1367
+ return attribute<int64_t>(0);
1368
+ }
1369
+
1370
+ int64_t& offsets() {
1371
+ return attribute<int64_t>(0);
1372
+ }
1373
+
1374
+ std::vector<PolymorphicValue> evaluate(
1375
+ const ExpressionEvaluator& ee,
1376
+ const std::vector<PolymorphicValue>& inputs) const override;
1377
+ };
1378
+
1379
+ // Expr for driver API cuTensorMapEncodeTiled
1380
+ class EncodeTensorMapTiled : public Expr {
1381
+ public:
1382
+ using Expr::Expr;
1383
+
1384
+ EncodeTensorMapTiled(
1385
+ IrBuilderPasskey,
1386
+ Val* output,
1387
+ DataType data_type,
1388
+ Val* global_address,
1389
+ Val* global_dim,
1390
+ Val* global_strides,
1391
+ Val* box_dim,
1392
+ Val* element_strides,
1393
+ tma::TensorMapInterleave interleave,
1394
+ MmaInputSmemSwizzle swizzle,
1395
+ tma::TensorMapL2Promotion l2_promotion,
1396
+ tma::TensorMapFloatOOBFill oob_fill);
1397
+
1398
+ NVFUSER_DECLARE_CLONE_AND_CREATE
1399
+
1400
+ const char* getOpString() const override {
1401
+ return "EncodeTensorMapTiled";
1402
+ }
1403
+
1404
+ std::string toString(int indent_size = 0) const override;
1405
+ std::string toInlineString(int indent_size = 0) const override;
1406
+
1407
+ Val* globalAddress() const {
1408
+ return input(0);
1409
+ }
1410
+
1411
+ Val* globalDim() const {
1412
+ return input(1);
1413
+ }
1414
+
1415
+ Val* globalStrides() const {
1416
+ return input(2);
1417
+ }
1418
+
1419
+ Val* boxDim() const {
1420
+ return input(3);
1421
+ }
1422
+
1423
+ Val* elementStrides() const {
1424
+ return input(4);
1425
+ }
1426
+
1427
+ const DataType& dataType() const {
1428
+ return attribute<DataType>(0);
1429
+ }
1430
+
1431
+ const int64_t& tensorRank() const {
1432
+ return attribute<int64_t>(1);
1433
+ }
1434
+
1435
+ const tma::TensorMapInterleave& interleave() const {
1436
+ return attribute<tma::TensorMapInterleave>(2);
1437
+ }
1438
+
1439
+ const MmaInputSmemSwizzle& swizzle() const {
1440
+ return attribute<MmaInputSmemSwizzle>(3);
1441
+ }
1442
+
1443
+ const tma::TensorMapL2Promotion& l2Promotion() const {
1444
+ return attribute<tma::TensorMapL2Promotion>(4);
1445
+ }
1446
+
1447
+ const tma::TensorMapFloatOOBFill& oobFill() const {
1448
+ return attribute<tma::TensorMapFloatOOBFill>(5);
1449
+ }
1450
+
1451
+ std::vector<PolymorphicValue> evaluate(
1452
+ const ExpressionEvaluator& ee,
1453
+ const std::vector<PolymorphicValue>& inputs) const override;
1454
+ };
1455
+
1456
+ } // namespace kir
1457
+ } // namespace nvfuser