PyPI - nvfuser-cu121-torch25 - Versions diffs - 0.2.25.dev20250201__cp310-cp310-manylinux_2_28_x86_64.whl - Mend

nvfuser-cu121-torch25 0.2.25.dev20250201__cp310-cp310-manylinux_2_28_x86_64.whl

Files changed (242) hide show

nvfuser/_C.cpython-310-x86_64-linux-gnu.so +0 -0
nvfuser/__init__.py +618 -0
nvfuser/__init__.pyi +4 -0
nvfuser/contrib/__init__.py +9 -0
nvfuser/contrib/nn/__init__.py +13 -0
nvfuser/contrib/nn/normalization.py +725 -0
nvfuser/include/nvfuser/alias_analysis.h +116 -0
nvfuser/include/nvfuser/bfs.h +929 -0
nvfuser/include/nvfuser/codegen.h +26 -0
nvfuser/include/nvfuser/compute_at.h +28 -0
nvfuser/include/nvfuser/compute_at_map.h +394 -0
nvfuser/include/nvfuser/contiguity.h +351 -0
nvfuser/include/nvfuser/cuda_utils.h +50 -0
nvfuser/include/nvfuser/debug.h +50 -0
nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
nvfuser/include/nvfuser/device_lower/utils.h +382 -0
nvfuser/include/nvfuser/device_lower/validation.h +74 -0
nvfuser/include/nvfuser/disjoint_set.h +556 -0
nvfuser/include/nvfuser/dispatch.h +334 -0
nvfuser/include/nvfuser/driver_api.h +49 -0
nvfuser/include/nvfuser/dynamic_transform.h +316 -0
nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
nvfuser/include/nvfuser/evaluator_common.h +295 -0
nvfuser/include/nvfuser/exceptions.h +283 -0
nvfuser/include/nvfuser/expr_evaluator.h +125 -0
nvfuser/include/nvfuser/expr_simplifier.h +218 -0
nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
nvfuser/include/nvfuser/fusion.h +511 -0
nvfuser/include/nvfuser/fusion_guard.h +37 -0
nvfuser/include/nvfuser/fusion_profiler.h +311 -0
nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
nvfuser/include/nvfuser/global_allocator.h +27 -0
nvfuser/include/nvfuser/grouped_reduction.h +47 -0
nvfuser/include/nvfuser/host_ir/container.h +60 -0
nvfuser/include/nvfuser/host_ir/executor.h +152 -0
nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
nvfuser/include/nvfuser/host_ir/lower.h +35 -0
nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
nvfuser/include/nvfuser/id_model/id_model.h +359 -0
nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
nvfuser/include/nvfuser/id_model/indexing.h +208 -0
nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
nvfuser/include/nvfuser/id_model/schedule.h +54 -0
nvfuser/include/nvfuser/id_model/to_string.h +87 -0
nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
nvfuser/include/nvfuser/id_model/utils.h +176 -0
nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
nvfuser/include/nvfuser/index_compute.h +651 -0
nvfuser/include/nvfuser/instrumentation.h +107 -0
nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
nvfuser/include/nvfuser/ir/builder.h +215 -0
nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
nvfuser/include/nvfuser/ir/cloner.h +185 -0
nvfuser/include/nvfuser/ir/container.h +226 -0
nvfuser/include/nvfuser/ir/graphviz.h +119 -0
nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
nvfuser/include/nvfuser/ir/iostream.h +98 -0
nvfuser/include/nvfuser/ir/printer.h +57 -0
nvfuser/include/nvfuser/ir/utils.h +801 -0
nvfuser/include/nvfuser/iter_visitor.h +661 -0
nvfuser/include/nvfuser/kernel.h +299 -0
nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
nvfuser/include/nvfuser/kernel_ir.h +1457 -0
nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
nvfuser/include/nvfuser/linked_hash_map.h +97 -0
nvfuser/include/nvfuser/logical_domain_map.h +577 -0
nvfuser/include/nvfuser/macros.h +23 -0
nvfuser/include/nvfuser/mma_type.h +257 -0
nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
nvfuser/include/nvfuser/multidevice/communication.h +232 -0
nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
nvfuser/include/nvfuser/multidevice/executor.h +107 -0
nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
nvfuser/include/nvfuser/multidevice/utils.h +187 -0
nvfuser/include/nvfuser/non_divisible_split.h +86 -0
nvfuser/include/nvfuser/opaque_type.h +129 -0
nvfuser/include/nvfuser/ops/alias.h +192 -0
nvfuser/include/nvfuser/ops/all_ops.h +13 -0
nvfuser/include/nvfuser/ops/arith.h +712 -0
nvfuser/include/nvfuser/ops/composite.h +130 -0
nvfuser/include/nvfuser/ops/indexing.h +55 -0
nvfuser/include/nvfuser/ops/normalization.h +263 -0
nvfuser/include/nvfuser/ops/utils.h +127 -0
nvfuser/include/nvfuser/options.h +313 -0
nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
nvfuser/include/nvfuser/polymorphic_value.h +432 -0
nvfuser/include/nvfuser/predicate_compute.h +213 -0
nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
nvfuser/include/nvfuser/scheduler/registry.h +97 -0
nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
nvfuser/include/nvfuser/scheduler/resize.h +41 -0
nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
nvfuser/include/nvfuser/scheduler/utils.h +771 -0
nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
nvfuser/include/nvfuser/serde/factory.h +55 -0
nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
nvfuser/include/nvfuser/serde/utils.h +34 -0
nvfuser/include/nvfuser/struct.inl +127 -0
nvfuser/include/nvfuser/swizzle.h +54 -0
nvfuser/include/nvfuser/sys_utils.h +40 -0
nvfuser/include/nvfuser/tensor_metadata.h +118 -0
nvfuser/include/nvfuser/tma.h +124 -0
nvfuser/include/nvfuser/transform_iter.h +522 -0
nvfuser/include/nvfuser/transform_replay.h +297 -0
nvfuser/include/nvfuser/transform_rfactor.h +33 -0
nvfuser/include/nvfuser/transform_view.h +136 -0
nvfuser/include/nvfuser/type.h +1125 -0
nvfuser/include/nvfuser/type_promotion.h +61 -0
nvfuser/include/nvfuser/utils.h +619 -0
nvfuser/include/nvfuser/val_graph.h +446 -0
nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
nvfuser/include/nvfuser/validator_utils.h +92 -0
nvfuser/include/nvfuser/vectorization_info.h +31 -0
nvfuser/include/nvfuser/visibility.h +21 -0
nvfuser/lib/libnvfuser_codegen.so +0 -0
nvfuser/nvfuser_version.py +69 -0
nvfuser/pytorch_utils.py +184 -0
nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
nvfuser/utils.py +18 -0
nvfuser/version.py +1 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +20 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0

nvfuser/include/nvfuser/type_promotion.h ADDED Viewed

@@ -0,0 +1,61 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <exceptions.h>
+#include <ir/interface_nodes.h>
+#include <type.h>
+namespace nvfuser {
+// Options that control how operand dtypes are promoted. Both flags default
+// to false.
+struct TypePromotionConfig {
+  // NOTE(review): by name, requests that integer inputs be promoted to a
+  // floating-point dtype; the exact rule lives in the implementation of
+  // computeTypes.
+  bool promote_integer_inputs_to_float = false;
+  // Checks the promoted type is either single or double.
+  bool require_full_precision_promoted = false;
+};
+// Predefined promotion configurations. They are declared `static const` in
+// this header, so every translation unit that includes it gets its own copy.
+namespace TypePromotion {
+// Default-initialized: both flags are false.
+static const TypePromotionConfig comparison_op_config;
+// Default-initialized: both flags are false.
+static const TypePromotionConfig default_op_config;
+// Integer inputs are promoted to float; full precision is not required.
+static const TypePromotionConfig float_op_config{
+    /* promote_integer_inputs_to_float */ true,
+    /* require_full_precision_promoted */ false};
+// Integer inputs are not promoted; the promoted type must be full precision.
+static const TypePromotionConfig float_only_op_config{
+    /* promote_integer_inputs_to_float */ false,
+    /* require_full_precision_promoted */ true};
+} // namespace TypePromotion
+// Implements the behavior of the following flags:
+//   - promote_inputs_to_common_dtype
+//   - promote_integer_inputs_to_float
+// Returns the DataType computed for `operands` under `config`.
+// `cast_half_to_float` defaults to true. NOTE(review): by name it controls
+// whether half-precision types are widened to Float; confirm in the
+// implementation.
+DataType computeTypes(
+    const TypePromotionConfig& config,
+    const std::vector<Val*>& operands,
+    const bool cast_half_to_float = true);
+// Computes the common dtype for the given operands
+// Casts operands to common dtype if necessary
+// Automatically cast FP16/BF16 dtype to Float
+// Returns one Val* per operand.
+std::vector<Val*> promoteValues(
+    const TypePromotionConfig& config,
+    const std::vector<Val*>& operands);
+// Overload that takes the already-chosen common dtype instead of a
+// TypePromotionConfig.
+std::vector<Val*> promoteValues(
+    const std::vector<Val*>& operands,
+    DataType common_type);
+// Casts value to common dtype if necessary
+// Avoid cast if value's dtype matches its dtype class
+// NOTE(review): "dtype class" is not defined in this header; presumably it
+// means the broad category (integer / floating point / ...) -- confirm.
+Val* optionalCast(DataType dtype, Val* v);
+// Casts value to common dtype if necessary
+// NOTE(review): by name this is the stricter variant of optionalCast, without
+// the "same dtype class" exemption -- confirm in the implementation.
+Val* optionalCastStrict(DataType dtype, Val* v);
+} // namespace nvfuser

nvfuser/include/nvfuser/utils.h ADDED Viewed

@@ -0,0 +1,619 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <ATen/ATen.h>
+#include <exceptions.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/torch.h>
+#include <visibility.h>
+#include <debug.h>
+#include <mma_type.h>
+#include <tma.h>
+#include <type.h>
+#include <c10/core/thread_pool.h>
+#include <algorithm>
+#include <deque>
+#include <initializer_list>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <typeinfo>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+// Version predicates against the PyTorch this file is compiled with
+// (TORCH_VERSION_MAJOR / _MINOR / _PATCH). Versions are compared
+// lexicographically: major first, then minor, then patch.
+//
+// The minor/patch comparisons are only considered when every more significant
+// component is equal. The whole expansion and every argument are
+// parenthesized, so the macros can be negated or combined with other
+// operators (`!NVF_TORCH_VERSION_GREATER(...)`, `... && x`) safely.
+//
+// True iff the torch version is strictly greater than major.minor.patch.
+#define NVF_TORCH_VERSION_GREATER(major, minor, patch)                   \
+  ((TORCH_VERSION_MAJOR > (major)) ||                                    \
+   (TORCH_VERSION_MAJOR == (major) &&                                    \
+    ((TORCH_VERSION_MINOR > (minor)) ||                                  \
+     (TORCH_VERSION_MINOR == (minor) && TORCH_VERSION_PATCH > (patch)))))
+// True iff the torch version is greater than or equal to major.minor.patch.
+#define NVF_TORCH_VERSION_NO_LESS(major, minor, patch)                   \
+  ((TORCH_VERSION_MAJOR > (major)) ||                                    \
+   (TORCH_VERSION_MAJOR == (major) &&                                    \
+    ((TORCH_VERSION_MINOR > (minor)) ||                                  \
+     (TORCH_VERSION_MINOR == (minor) && TORCH_VERSION_PATCH >= (patch)))))
+//! IR header hierarchy
+//! 1. ** utils.h ** - PolymorphicBase and NonCopyable
+//! 2. ir/base_nodes.h - Statement, Expr, and Val
+//! 3. ir/internal_base_nodes.h - IterDomain and TensorDomain
+//! 4. ir/interface_nodes.h - TensorView and Scalar
+//! 5. ir/internal_nodes.h - Any internal-only IR nodes
+namespace nvfuser {
+// Returns a thread count. NOTE(review): presumably the size of the pool
+// returned by getThreadPool() -- confirm in the implementation.
+int getNumThreads();
+// Returns a pointer to a c10::ThreadPool. Ownership and lifetime are not
+// visible from this header -- confirm before storing or deleting the pointer.
+c10::ThreadPool* getThreadPool();
+// Debug-string helpers for an IValue and for a Tensor.
+std::string debug_str(const c10::IValue& val);
+std::string debug_str(const at::Tensor& tensor);
+// Tensor predicates. NOTE(review): by name these test for a scalar tensor on
+// the CPU / meta device; the exact criteria live in the implementation.
+bool is_cpu_scalar(const at::Tensor& tensor);
+bool is_meta_scalar(const at::Tensor& tensor);
+//! Find common device among tensor inputs. If no tensor inputs are found and
+//! the selected_device argument is omitted, a default value of 0 is returned.
+//! If no tensor inputs are found and selected_device is provided,
+//! selected_device will be returned. If tensor inputs are found their devices
+//! must match one another, and if selected_device is given they must match it
+//! as well, otherwise -1 is returned.
+//!
+//! \param inputs values to inspect; only the tensor inputs participate
+//! \param selected_device optional device the tensors are required to be on
+//! \return the common device, or -1 on a mismatch (callers must check for -1)
+int8_t getCommonDeviceCUDA(
+    const at::ArrayRef<c10::IValue>& inputs,
+    std::optional<int8_t> selected_device = std::nullopt);
+// Conversions between "registers per thread" and "threads per SM".
+// NOTE(review): by name the two functions are inverse mappings; rounding
+// behavior and device limits used are defined in the implementation.
+int64_t getRegPerThreadGivenThreadsPerSM(int64_t threads_per_sm);
+int64_t getThreadsPerSMGivenRegPerThread(int64_t reg_per_thread);
+// Check if fallback path should be used which will dispatch to eager mode if
+// any errors are encountered. Helpful for debugging.
+bool useFallback();
+//! Integer division rounding up, computed as
+//! (dividend + divisor - 1) / divisor with truncating division. The result is
+//! the mathematical ceiling for a non-negative dividend and a positive
+//! divisor.
+constexpr int64_t ceilDiv(int64_t dividend, int64_t divisor) {
+  const int64_t biased_dividend = dividend + divisor - 1;
+  return biased_dividend / divisor;
+}
+//! Rounds dividend up to the nearest multiple of divisor (same input
+//! assumptions as ceilDiv).
+constexpr int64_t roundUpToMultiple(int64_t dividend, int64_t divisor) {
+  const int64_t num_chunks = ceilDiv(dividend, divisor);
+  return num_chunks * divisor;
+}
+//! Simple mixin for suppressing copy & move operations, ex:
+//!
+//!  class Foo : public NonCopyable {
+//!   ...
+//!  };
+//!
+//! Only the copy operations are deleted explicitly. Because they are
+//! user-declared, the compiler does not generate move operations either, so a
+//! move attempt resolves to the deleted copy operations and fails to compile.
+class NonCopyable {
+ public:
+  NonCopyable() = default;
+  // No copy/move semantics
+  NonCopyable(const NonCopyable&) = delete;
+  NonCopyable& operator=(const NonCopyable&) = delete;
+};
+//! A generic root for a hierarchy of polymorphic classes:
+//! - It ensures virtual destructors
+//! - Provides the base->as<Derived>() and node->isA<T>() notation
+class PolymorphicBase {
+ public:
+  virtual ~PolymorphicBase() = default;
+  // Replacement for static_cast<T*>(ptr): ptr->as<T>()
+  // (checked in DEBUG builds)
+  //
+  // When NDEBUG is defined and NVFUSER_EXPLICIT_ERROR_CHECK is not, this is an
+  // unchecked static_cast. Otherwise it is a dynamic_cast followed by
+  // NVF_ERROR if the object is not a T.
+  template <class T>
+  T* as() {
+#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
+    auto downcast_ptr = static_cast<T*>(this);
+#else
+    auto downcast_ptr = dynamic_cast<T*>(this);
+    NVF_ERROR(downcast_ptr != nullptr);
+#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
+    return downcast_ptr;
+  }
+  // Const overload of as<T>(); same checking rules.
+  template <class T>
+  const T* as() const {
+#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
+    auto downcast_ptr = static_cast<const T*>(this);
+#else
+    auto downcast_ptr = dynamic_cast<const T*>(this);
+    NVF_ERROR(downcast_ptr != nullptr);
+#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK)
+    return downcast_ptr;
+  }
+  //! Check if the runtime type is T (or derived from T)
+  //!
+  //! \note Don't use this for conditional casts. Instead, use:
+  //!
+  //!  if (auto t = dynamic_cast<T*>(p)) { ... }
+  //!
+  //! instead of:
+  //!
+  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
+  //!
+  template <class T>
+  bool isA() const {
+    return dynamic_cast<const T*>(this) != nullptr;
+  }
+  //! Check if the runtime type is strictly T. Returns false for classes
+  //! derived from T
+  template <class T>
+  bool isStrictlyA() const {
+    return typeid(*this) == typeid(T);
+  }
+ private:
+  // Recursive helpers behind the public isOneOf / isStrictlyOneOf. The leading
+  // `int` template parameter is a dummy that distinguishes these overloads
+  // from the public `template <class... T>` versions; the `<int>`-only
+  // overload is the base case for an empty type list.
+  template <int> // unused template argument
+  bool isOneOf() const {
+    return false;
+  }
+  template <int, class T1, class... T>
+  bool isOneOf() const {
+    return isA<T1>() || isOneOf<0, T...>();
+  }
+  template <int> // unused template argument
+  bool isStrictlyOneOf() const {
+    return false;
+  }
+  template <int, class T1, class... T>
+  bool isStrictlyOneOf() const {
+    return isStrictlyA<T1>() || isStrictlyOneOf<0, T...>();
+  }
+ public:
+  //! Check if the runtime type is one of the given types (or derived from
+  //! one of the given types)
+  template <class... T>
+  bool isOneOf() const {
+    return isOneOf<0, T...>();
+  }
+  //! Check if the runtime type is strictly one of the given types. Derived
+  //! types not in the given list do not count.
+  template <class... T>
+  bool isStrictlyOneOf() const {
+    return isStrictlyOneOf<0, T...>();
+  }
+};
+//! Packs two values of the same enum type into one unsigned int: t1 occupies
+//! the bits above bit 16 and t2 is added into the low part. Only participates
+//! in overload resolution for enum types.
+template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
+constexpr unsigned int switch_pair(T t1, T t2) {
+  constexpr unsigned int kHighHalfShift = 16;
+  const auto high_half = static_cast<unsigned int>(t1) << kHighHalfShift;
+  const auto low_half = static_cast<unsigned int>(t2);
+  return high_half + low_half;
+}
+// Returns sizes for the given tensor type as a vector of int64_t.
+// NOTE(review): how unknown/symbolic dimensions are represented is decided in
+// the implementation -- confirm before relying on the values.
+std::vector<int64_t> getTensorSizes(at::TensorTypePtr const& tensor_type);
+//! Return a sorted list of keys of an unordered map so that it can be
+//! iterated deterministically
+//!
+//! \param map map whose keys are collected (keys are copied)
+//! \param cmp strict-weak-ordering comparator handed to std::sort
+//! \return the keys of `map`, ordered by `cmp`
+template <typename KeyType, typename ValueType, typename Cmp>
+std::vector<KeyType> getSortedKeys(
+    const std::unordered_map<KeyType, ValueType>& map,
+    Cmp cmp) {
+  // Reserve and append instead of pre-sizing the vector: this avoids
+  // default-constructing every key first and works for key types that have no
+  // default constructor.
+  std::vector<KeyType> keys;
+  keys.reserve(map.size());
+  for (const auto& kv : map) {
+    keys.push_back(kv.first);
+  }
+  std::sort(keys.begin(), keys.end(), cmp);
+  return keys;
+}
+// Based on https://stackoverflow.com/a/9154394
+//
+// Detects whether T (or, for a pointer type, its pointee) has a toString()
+// member. hasToStringHelper<T>(0) prefers the `int` overload, which is only
+// viable when the toString() call expression is well-formed; otherwise
+// overload resolution falls back to the `long` overload. The helpers are
+// declared but never defined: they are only used inside decltype.
+template <typename T>
+static auto hasToStringHelper(int)
+    -> decltype(std::declval<typename std::remove_pointer<T>::type>().toString(), std::true_type{});
+template <typename>
+static auto hasToStringHelper(long) -> std::false_type;
+// hasToString<T> derives from std::true_type or std::false_type accordingly.
+template <class T>
+struct hasToString : decltype(hasToStringHelper<T>(0)) {};
+// Converts a value of type T to a string.
+// Primary template: if T (or its pointee, for pointer types) has toString(),
+// that is used; otherwise the placeholder "<attr>" is returned. Types that
+// should be printed with operator<< are handled by explicit specializations
+// of Printer, not by this primary template.
+// Note that a pointer value is dereferenced without a null check.
+template <typename T>
+struct Printer {
+  static std::string toString(const T& value) {
+    if constexpr (hasToString<T>()) {
+      if constexpr (std::is_pointer<T>::value) {
+        return value->toString();
+      } else {
+        return value.toString();
+      }
+    } else {
+      return "<attr>";
+    }
+  }
+};
+// Printer specializations for types that are printed with operator<<.
+// The `#if 0` branch is the intended C++20 form (a concept-constrained partial
+// specialization) and is compiled out. The active `#else` branch enumerates
+// the streamable types explicitly through SPECIALIZE_PRINTER, so a new
+// streamable type has to be added to the list below to be printed with <<.
+#if 0
+// Waiting for C++20....
+#include <concepts>
+template<typename T>
+concept Printable = requires(T a)
+{
+  { std::stringstream{} << a } -> std::convertible_to<std::stringstream>;
+};
+template <Printable T>
+struct Printer<T> {
+  static std::string toString(const T& value) {
+    std::stringstream ss;
+    ss << value;
+    return ss.str();
+  }
+};
+#else
+// Defines Printer<T> as: stream the value into a std::stringstream and return
+// the resulting string. Each use requires an operator<< for T to be visible.
+#define SPECIALIZE_PRINTER(T)                     \
+  template <>                                     \
+  struct Printer<T> {                             \
+    static std::string toString(const T& value) { \
+      std::stringstream ss;                       \
+      ss << value;                                \
+      return ss.str();                            \
+    }                                             \
+  }
+SPECIALIZE_PRINTER(bool);
+SPECIALIZE_PRINTER(int);
+SPECIALIZE_PRINTER(std::string);
+// Aliases are needed because the macro argument is pasted after `const` and
+// into `Printer<...>`; a pointer type must therefore be a single name.
+using ConstCharStar = const char*;
+SPECIALIZE_PRINTER(ConstCharStar);
+using VoidStar = void*;
+SPECIALIZE_PRINTER(VoidStar);
+SPECIALIZE_PRINTER(uint32_t);
+SPECIALIZE_PRINTER(int64_t);
+SPECIALIZE_PRINTER(uint64_t);
+SPECIALIZE_PRINTER(DataType);
+SPECIALIZE_PRINTER(MemoryType);
+SPECIALIZE_PRINTER(UnaryOpType);
+SPECIALIZE_PRINTER(BinaryOpType);
+SPECIALIZE_PRINTER(TernaryOpType);
+SPECIALIZE_PRINTER(LoadStoreOpType);
+SPECIALIZE_PRINTER(CircularBufferLoopStage);
+SPECIALIZE_PRINTER(tma::TensorMapInterleave);
+SPECIALIZE_PRINTER(tma::TensorMapL2Promotion);
+SPECIALIZE_PRINTER(tma::TensorMapFloatOOBFill);
+SPECIALIZE_PRINTER(MmaInputSmemSwizzle);
+SPECIALIZE_PRINTER(SwizzleType);
+SPECIALIZE_PRINTER(Swizzle2DType);
+SPECIALIZE_PRINTER(SwizzleMode);
+SPECIALIZE_PRINTER(std::vector<int>);
+SPECIALIZE_PRINTER(std::vector<uint32_t>);
+SPECIALIZE_PRINTER(std::vector<int64_t>);
+SPECIALIZE_PRINTER(std::vector<uint64_t>);
+SPECIALIZE_PRINTER(std::optional<bool>);
+#undef SPECIALIZE_PRINTER
+#endif // if 0
+// Stringification with delimiter
+//
+// Converts every element of [first, last) with Printer<value_type>::toString
+// and joins the results with `delim` (no leading or trailing delimiter).
+// Returns an empty string for an empty range.
+template <typename Iterator>
+std::string toDelimitedString(
+    Iterator first,
+    Iterator last,
+    std::string delim = ", ") {
+  // Use iterator_traits rather than Iterator::value_type so that raw pointers
+  // (e.g. the iterators of std::initializer_list) are accepted as well.
+  using ValueType = typename std::iterator_traits<Iterator>::value_type;
+  std::stringstream ss;
+  bool first_val = true;
+  for (auto it = first; it != last; ++it) {
+    if (!first_val) {
+      ss << delim;
+    }
+    ss << Printer<ValueType>::toString(*it);
+    first_val = false;
+  }
+  return ss.str();
+}
+// Overload for std::vector.
+template <typename Printable>
+std::string toDelimitedString(
+    const std::vector<Printable>& vec,
+    std::string delim = ", ") {
+  return toDelimitedString(vec.begin(), vec.end(), delim);
+}
+// Overload for braced lists, e.g. toDelimitedString({a, b, c}).
+template <typename Printable>
+std::string toDelimitedString(
+    std::initializer_list<Printable> list,
+    std::string delim = ", ") {
+  // list.begin()/end() are plain `const Printable*`; the iterator overload
+  // handles them through std::iterator_traits, so no temporary vector (and no
+  // copy of the elements) is needed.
+  return toDelimitedString(list.begin(), list.end(), delim);
+}
+// Overload for std::deque: joins the elements in deque order.
+template <typename Printable>
+std::string toDelimitedString(
+    const std::deque<Printable>& dq,
+    std::string delim = ", ") {
+  return toDelimitedString(dq.begin(), dq.end(), delim);
+}
+// Overload for std::unordered_set. Elements are emitted in the set's iteration
+// order, which is unspecified, so the output order is not stable across
+// different sets or standard-library implementations.
+template <typename Printable>
+std::string toDelimitedString(
+    const std::unordered_set<Printable>& set,
+    std::string delim = ", ") {
+  return toDelimitedString(set.begin(), set.end(), delim);
+}
+// Compile-time unrolled loop: calls
+//   fun(std::integral_constant<int64_t, i>())
+// for i = index, index + step, index + 2 * step, ... while i < stop. The loop
+// index is passed as an integral_constant so the callee can use it in constant
+// expressions (e.g. std::get<decltype(i)::value>).
+template <int64_t index, int64_t stop, int64_t step, typename func_t>
+void unrolled_for(func_t fun) {
+  if constexpr (index < stop) {
+    static_assert(step > 0, "unrolled_for requires a positive step");
+    fun(std::integral_constant<int64_t, index>());
+    // Pass `step` along explicitly: without it the call resolves to the
+    // two-argument overload and every later iteration advances by 1.
+    unrolled_for<index + step, stop, step>(fun);
+  }
+}
+// Same as above with step = 1.
+template <int64_t index, int64_t stop, typename func_t>
+void unrolled_for(func_t fun) {
+  unrolled_for<index, stop, 1>(fun);
+}
+// Same as above with index starting at 0 and step = 1.
+template <int64_t stop, typename func_t>
+void unrolled_for(func_t fun) {
+  unrolled_for<0, stop>(fun);
+}
+// Overload for std::tuple: converts each element with Printer and joins the
+// results with `delim`. The elements may have different types, so the tuple
+// is walked with the compile-time loop unrolled_for instead of a runtime loop.
+template <typename... Args>
+std::string toDelimitedString(
+    const std::tuple<Args...>& args,
+    std::string delim = ", ") {
+  std::stringstream ss;
+  bool first_val = true;
+  unrolled_for<sizeof...(Args)>([&](auto i) {
+    if (!first_val) {
+      ss << delim;
+    }
+    // `auto` copies the element and drops references/cv-qualifiers, so
+    // decltype(item) is the plain value type that Printer is keyed on.
+    auto item = std::get<decltype(i)::value>(args);
+    ss << Printer<decltype(item)>::toString(item);
+    first_val = false;
+  });
+  return ss.str();
+}
+// Joins item->toInlineString() for every item of `container`, separated by
+// `delim`. The items must be pointer-like (they are accessed with ->).
+// Returns an empty string for an empty container.
+template <typename ContainerOfStatement>
+std::string toDelimitedInlineString(
+    const ContainerOfStatement& container,
+    std::string delim = ", ") {
+  std::stringstream joined;
+  bool need_delim = false;
+  for (const auto& statement : container) {
+    if (need_delim) {
+      joined << delim;
+    }
+    need_delim = true;
+    joined << statement->toInlineString();
+  }
+  return joined.str();
+}
+// Scope guard for function tracing: prints "Entering <name>(<args>)" to
+// debug() on construction and "Leaving <name> ..." on destruction, including
+// the return value and source location if setReturn() was called.
+class DebugPrintScope {
+ public:
+  // Prints the entry line. `args` are taken by value and formatted with
+  // toDelimitedString, i.e. through Printer for each argument type.
+  template <typename... Args>
+  DebugPrintScope(std::string name, Args... args) : name_(std::move(name)) {
+    debug() << "Entering " << name_ << "("
+            << toDelimitedString(std::forward_as_tuple(args...)) << ")"
+            << std::endl;
+  }
+  // Prints the exit line. The optional parts are only printed when they were
+  // recorded: return value (non-empty), file (non-empty), line (>= 0).
+  ~DebugPrintScope() {
+    debug() << "Leaving " << name_;
+    if (!return_.empty()) {
+      debug() << " returning " << return_;
+    }
+    if (!file_.empty()) {
+      debug() << " at " << file_;
+    }
+    if (line_ >= 0) {
+      debug() << ":" << line_;
+    }
+    debug() << std::endl;
+  }
+  // Records the stringified return value and, optionally, the location of the
+  // return statement. A later call overwrites the earlier record.
+  template <typename T>
+  void setReturn(const T& ret, std::string file = "", int64_t line = -1) {
+    return_ = Printer<std::decay_t<T>>::toString(ret);
+    file_ = std::move(file);
+    line_ = line;
+  }
+ private:
+  // The name of the scope, as specified as the first argument of
+  // DEBUG_PRINT_SCOPE_NAME. If using DEBUG_PRINT_SCOPE, then this is __func__.
+  std::string name_;
+  // Return value and location of the return statement.
+  // Note that the recording of the return value is not automatic. The function
+  // needs to be manually instrumented to replace `return XXX;` with
+  // `RECORD_AND_RETURN(XXX)` to record the return value.
+  std::string return_;
+  std::string file_;
+  int64_t line_ = -1;
+};
+// Debug printing the entering and leaving of a function. The given arguments
+// will be printed when entering the function.
+//
+// Note: ##__VA_ARGS__ is not C++ standard, but it should work on gcc and clang.
+// Compared to __VA_ARGS__, ##__VA_ARGS__ automatically remove the preceding
+// comma when empty, allowing empty variadic parameters. If using other
+// compiler, please use DebugPrintScope directly without this macro.
+//
+// The macro declares a local variable named `_debug_print_scope` in the
+// enclosing scope, so it can be used at most once per scope. The scope object
+// is only created when the FunctionTrace debug dump is enabled and `name`
+// fully matches (std::regex_match) one of its argument patterns; otherwise
+// `_debug_print_scope` stays null. DEBUG_LOG and RECORD_AND_RETURN rely on
+// that variable being in scope.
+#define DEBUG_PRINT_SCOPE_NAME(name, ...)                                 \
+  std::unique_ptr<DebugPrintScope> _debug_print_scope;                    \
+  if (isDebugDumpEnabled(DebugDumpOption::FunctionTrace)) {               \
+    auto enabled = getDebugDumpArguments(DebugDumpOption::FunctionTrace); \
+    for (auto pattern : enabled) {                                        \
+      std::regex re(pattern);                                             \
+      if (std::regex_match(name, re)) {                                   \
+        _debug_print_scope =                                              \
+            std::make_unique<DebugPrintScope>(name, ##__VA_ARGS__);       \
+        break;                                                            \
+      }                                                                   \
+    }                                                                     \
+  }
+// Same as DEBUG_PRINT_SCOPE_NAME, using the enclosing function's __func__ as
+// the scope name.
+#define DEBUG_PRINT_SCOPE(...) DEBUG_PRINT_SCOPE_NAME(__func__, ##__VA_ARGS__)
+// Prints a "[file:line] message" line, but only while tracing is active for
+// the current scope. The message is built with `to_str`, which is not defined
+// in this header and must be visible at the point of use.
+// NOTE(review): this expands to a bare `if` statement, so using it as the body
+// of an un-braced if/else can capture a following `else`; wrap uses in braces.
+#define DEBUG_LOG(...)                                    \
+  if (_debug_print_scope) {                               \
+    debug() << "[" << __FILE__ << ":" << __LINE__ << "] " \
+            << to_str("", ##__VA_ARGS__) << std::endl;    \
+  }
+// Record the return value and return it.
+#define RECORD_AND_RETURN(ret)                              \
+  if (_debug_print_scope) {                                 \
+    _debug_print_scope->setReturn(ret, __FILE__, __LINE__); \
+  }                                                         \
+  return ret
+// Computes the index type required.
+// Made into a class w/ state to allow reuse with
+// different tensors and without needing to pass an allocated
+// vector of size+stride
+//
+// Usage: call addDim() once per (size, stride) pair; the object accumulates
+// the largest reachable index and reports whether it still fits the 32-bit
+// index type.
+class KernelIndexTypeCompute {
+  // Save 1 more bit besides the sign bit to be conservative
+  // (i.e. the threshold is half of the largest `int` value).
+  static constexpr int64_t most_positive_int32_index =
+      std::numeric_limits<int>::max() / 2;
+ public:
+  // Updates counters and returns current reqd mode
+  // Dimensions with size <= 1 do not contribute. A negative stride on a
+  // dimension with size > 1 is rejected via NVF_ERROR; a zero stride adds
+  // nothing.
+  inline PrimDataType addDim(int64_t size, int64_t stride) {
+    if (size > 1) {
+      NVF_ERROR(stride >= 0, "Negative stride is not supported: ", stride);
+      if (stride > 0) {
+        // Accumulate positive stride
+        tensor_most_positive_index_ += (size - 1) * stride;
+      }
+    }
+    return getType();
+  }
+  // Returns PrimDataType::Int once the accumulated index exceeds the
+  // threshold, PrimDataType::Int32 otherwise.
+  inline PrimDataType getType() const {
+    if (tensor_most_positive_index_ > most_positive_int32_index) {
+      return PrimDataType::Int;
+    } else {
+      return PrimDataType::Int32;
+    }
+  }
+ private:
+  // Sum of (size - 1) * stride over all dimensions added so far.
+  int64_t tensor_most_positive_index_ = 0;
+};
+// Type trait: is_std_vector<T>::value is true iff T is a std::vector
+// specialization (any element type and allocator). cv-qualified or reference
+// types are not stripped, so e.g. `const std::vector<int>&` yields false.
+template <typename>
+struct is_std_vector : std::false_type {};
+template <typename T, typename A>
+struct is_std_vector<std::vector<T, A>> : std::true_type {};
+// Convenience variable template for is_std_vector<T>::value.
+template <typename T>
+constexpr auto is_std_vector_v = is_std_vector<T>::value;
+//! Folds new_hash into hash in place. The result depends on the order in
+//! which hashes are combined and spreads bits over the entire range of a
+//! size_t. Inspired by boost::hash_combine.
+//! See https://stackoverflow.com/q/35985960
+inline void hashCombine(size_t& hash, size_t new_hash) {
+  const size_t mixed = new_hash + 0x9e3779b9 + (hash << 6) + (hash >> 2);
+  hash = hash ^ mixed;
+}
+//! A wrapper to std::getenv. env_name is prepended with NVFUSER_.
+//! NOTE(review): as with std::getenv, the result is presumably null when the
+//! variable is not set -- check before dereferencing.
+NVF_API char* getNvFuserEnv(const char* env_name);
+// Returns the mapped value or the default.
+//
+// WARNING: the result is a reference. When the key is missing it refers to
+// `default_value`; if that argument is a temporary -- including the defaulted
+// `V()` -- the reference dangles once the full expression containing the call
+// ends. Copy the result (`V v = getOrDefault(...)`) instead of binding it to a
+// reference unless `default_value` is a named object that outlives the use.
+template <typename K, typename V>
+const V& getOrDefault(
+    const std::unordered_map<K, V>& map,
+    const K& key,
+    const V& default_value = V()) {
+  const auto i = map.find(key);
+  return i == map.end() ? default_value : i->second;
+}
+// Returns an amount of shared memory in bytes. NOTE(review): which device is
+// queried and which limit is reported is decided in the implementation.
+size_t deviceAvailableSharedMemoryBytes();
+// Normalizes a possibly negative dimension index against `ndim`: a negative
+// `dim` counts from the end (dim + ndim). The result must lie in [0, ndim),
+// otherwise NVF_CHECK fails. Note that the error message prints the index
+// after wrapping, not the value originally passed in.
+inline int64_t wrapDim(int64_t dim, int64_t ndim) {
+  if (dim < 0) {
+    dim += ndim;
+  }
+  NVF_CHECK(
+      dim >= 0 && dim < ndim,
+      "Tried to access out of boundary index ",
+      dim,
+      ". total index: ",
+      ndim);
+  return dim;
+}
+// This is the same as the pow utility included in runtime/helpers.cu. It is
+// included here to facilitate matching host-side computation.
+//
+// Integer power a^b. For a negative exponent the result is the truncated
+// value of 1 / a^(-b): 1 for a == 1, +/-1 for a == -1 (depending on the
+// parity of the exponent) and 0 for everything else.
+template <typename T>
+T pow(T a, T b) {
+  if (b < 0) {
+    if (a == 1) {
+      return 1;
+    }
+    if (a == -1) {
+      // (-1)^b is -1 for an odd exponent and 1 for an even one.
+      const auto exponent_is_odd = (-b) % static_cast<T>(2);
+      return exponent_is_odd ? -1 : 1;
+    }
+    return 0;
+  }
+  // Exponentiation by squaring: consume the exponent bit by bit while
+  // squaring the base.
+  T result = 1;
+  for (; b; b /= 2, a *= a) {
+    if (b & 1) {
+      result *= a;
+    }
+  }
+  return result;
+}
+// Returns true if given number is power of 2
+// Only values greater than 1 qualify: 1 (2^0), 0 and negative numbers all
+// yield false.
+constexpr bool isPowOf2(int64_t x) {
+  if (x <= 1) {
+    return false;
+  }
+  // Clearing the lowest set bit leaves zero iff exactly one bit was set.
+  const int64_t lowest_bit_cleared = x & (x - 1);
+  return lowest_bit_cleared == 0;
+}
+// A DynamicType that holds either a raw T* or a std::unique_ptr<T>.
+// NOTE(review): by name, the raw-pointer alternative is the non-owning case
+// and the unique_ptr alternative the owning one -- confirm at the use sites.
+template <typename T>
+using MaybeUniqueOwningPtr = dynamic_type::
+    DynamicType<dynamic_type::NoContainers, T*, std::unique_ptr<T>>;
+// Checks that every element of `elements` compares equal to the first one and
+// fails via NVF_CHECK on the first mismatch, reporting the offending element,
+// the first element and the whole list. An empty list passes trivially (the
+// loop body never runs, so the first element is never dereferenced).
+template <typename T>
+void checkAllEqual(std::initializer_list<T> elements) {
+  for (const auto& element : elements) {
+    NVF_CHECK(
+        element == *elements.begin(),
+        "Expected all elements to be equal, but found ",
+        element,
+        " and ",
+        *elements.begin(),
+        " in [",
+        toDelimitedString(elements),
+        "]");
+  }
+}
+} // namespace nvfuser