nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl
- nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
- nvfuser/__init__.py +618 -0
- nvfuser/__init__.pyi +4 -0
- nvfuser/contrib/__init__.py +9 -0
- nvfuser/contrib/nn/__init__.py +13 -0
- nvfuser/contrib/nn/normalization.py +725 -0
- nvfuser/include/nvfuser/alias_analysis.h +116 -0
- nvfuser/include/nvfuser/bfs.h +929 -0
- nvfuser/include/nvfuser/codegen.h +26 -0
- nvfuser/include/nvfuser/compute_at.h +28 -0
- nvfuser/include/nvfuser/compute_at_map.h +394 -0
- nvfuser/include/nvfuser/contiguity.h +351 -0
- nvfuser/include/nvfuser/cuda_utils.h +50 -0
- nvfuser/include/nvfuser/debug.h +50 -0
- nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
- nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
- nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
- nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
- nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
- nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
- nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
- nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
- nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
- nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
- nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
- nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
- nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
- nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
- nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
- nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
- nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
- nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
- nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
- nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
- nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
- nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
- nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
- nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
- nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
- nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
- nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
- nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
- nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
- nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
- nvfuser/include/nvfuser/device_lower/utils.h +382 -0
- nvfuser/include/nvfuser/device_lower/validation.h +74 -0
- nvfuser/include/nvfuser/disjoint_set.h +556 -0
- nvfuser/include/nvfuser/dispatch.h +334 -0
- nvfuser/include/nvfuser/driver_api.h +49 -0
- nvfuser/include/nvfuser/dynamic_transform.h +316 -0
- nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
- nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
- nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
- nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
- nvfuser/include/nvfuser/evaluator_common.h +295 -0
- nvfuser/include/nvfuser/exceptions.h +283 -0
- nvfuser/include/nvfuser/expr_evaluator.h +125 -0
- nvfuser/include/nvfuser/expr_simplifier.h +218 -0
- nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
- nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
- nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
- nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
- nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
- nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
- nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
- nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
- nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
- nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
- nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
- nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
- nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
- nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
- nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
- nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
- nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
- nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
- nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
- nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
- nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
- nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
- nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
- nvfuser/include/nvfuser/fusion.h +511 -0
- nvfuser/include/nvfuser/fusion_guard.h +37 -0
- nvfuser/include/nvfuser/fusion_profiler.h +311 -0
- nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
- nvfuser/include/nvfuser/global_allocator.h +27 -0
- nvfuser/include/nvfuser/grouped_reduction.h +47 -0
- nvfuser/include/nvfuser/host_ir/container.h +60 -0
- nvfuser/include/nvfuser/host_ir/executor.h +152 -0
- nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
- nvfuser/include/nvfuser/host_ir/lower.h +35 -0
- nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
- nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
- nvfuser/include/nvfuser/id_model/id_model.h +359 -0
- nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
- nvfuser/include/nvfuser/id_model/indexing.h +208 -0
- nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
- nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
- nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
- nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
- nvfuser/include/nvfuser/id_model/schedule.h +54 -0
- nvfuser/include/nvfuser/id_model/to_string.h +87 -0
- nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
- nvfuser/include/nvfuser/id_model/utils.h +176 -0
- nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
- nvfuser/include/nvfuser/index_compute.h +651 -0
- nvfuser/include/nvfuser/instrumentation.h +107 -0
- nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
- nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
- nvfuser/include/nvfuser/ir/builder.h +215 -0
- nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
- nvfuser/include/nvfuser/ir/cloner.h +185 -0
- nvfuser/include/nvfuser/ir/container.h +226 -0
- nvfuser/include/nvfuser/ir/graphviz.h +119 -0
- nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
- nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
- nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
- nvfuser/include/nvfuser/ir/iostream.h +98 -0
- nvfuser/include/nvfuser/ir/printer.h +57 -0
- nvfuser/include/nvfuser/ir/utils.h +801 -0
- nvfuser/include/nvfuser/iter_visitor.h +661 -0
- nvfuser/include/nvfuser/kernel.h +299 -0
- nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
- nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
- nvfuser/include/nvfuser/kernel_ir.h +1457 -0
- nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
- nvfuser/include/nvfuser/linked_hash_map.h +97 -0
- nvfuser/include/nvfuser/logical_domain_map.h +577 -0
- nvfuser/include/nvfuser/macros.h +23 -0
- nvfuser/include/nvfuser/mma_type.h +257 -0
- nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
- nvfuser/include/nvfuser/multidevice/communication.h +232 -0
- nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
- nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
- nvfuser/include/nvfuser/multidevice/executor.h +107 -0
- nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
- nvfuser/include/nvfuser/multidevice/utils.h +187 -0
- nvfuser/include/nvfuser/non_divisible_split.h +86 -0
- nvfuser/include/nvfuser/opaque_type.h +129 -0
- nvfuser/include/nvfuser/ops/alias.h +192 -0
- nvfuser/include/nvfuser/ops/all_ops.h +13 -0
- nvfuser/include/nvfuser/ops/arith.h +712 -0
- nvfuser/include/nvfuser/ops/composite.h +130 -0
- nvfuser/include/nvfuser/ops/indexing.h +55 -0
- nvfuser/include/nvfuser/ops/normalization.h +263 -0
- nvfuser/include/nvfuser/ops/utils.h +127 -0
- nvfuser/include/nvfuser/options.h +313 -0
- nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
- nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
- nvfuser/include/nvfuser/polymorphic_value.h +432 -0
- nvfuser/include/nvfuser/predicate_compute.h +213 -0
- nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
- nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
- nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
- nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
- nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
- nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
- nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
- nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
- nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
- nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
- nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
- nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
- nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
- nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
- nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
- nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
- nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
- nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
- nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
- nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
- nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
- nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
- nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
- nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
- nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
- nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
- nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
- nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
- nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
- nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
- nvfuser/include/nvfuser/scheduler/registry.h +97 -0
- nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
- nvfuser/include/nvfuser/scheduler/resize.h +41 -0
- nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
- nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
- nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
- nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
- nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
- nvfuser/include/nvfuser/scheduler/utils.h +771 -0
- nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
- nvfuser/include/nvfuser/serde/factory.h +55 -0
- nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
- nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
- nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
- nvfuser/include/nvfuser/serde/utils.h +34 -0
- nvfuser/include/nvfuser/struct.inl +127 -0
- nvfuser/include/nvfuser/swizzle.h +54 -0
- nvfuser/include/nvfuser/sys_utils.h +40 -0
- nvfuser/include/nvfuser/tensor_metadata.h +118 -0
- nvfuser/include/nvfuser/tma.h +124 -0
- nvfuser/include/nvfuser/transform_iter.h +522 -0
- nvfuser/include/nvfuser/transform_replay.h +297 -0
- nvfuser/include/nvfuser/transform_rfactor.h +33 -0
- nvfuser/include/nvfuser/transform_view.h +136 -0
- nvfuser/include/nvfuser/type.h +1125 -0
- nvfuser/include/nvfuser/type_promotion.h +61 -0
- nvfuser/include/nvfuser/utils.h +619 -0
- nvfuser/include/nvfuser/val_graph.h +446 -0
- nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
- nvfuser/include/nvfuser/validator_utils.h +92 -0
- nvfuser/include/nvfuser/vectorization_info.h +31 -0
- nvfuser/include/nvfuser/visibility.h +21 -0
- nvfuser/lib/libnvfuser_codegen.so +0 -0
- nvfuser/nvfuser_version.py +69 -0
- nvfuser/pytorch_utils.py +184 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
- nvfuser/utils.py +18 -0
- nvfuser/version.py +1 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
- nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/scheduler/utils.h @@ -0,0 +1,771 @@
// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
#pragma once

#include <device_lower/pass/loop_rotation.h>
#include <disjoint_set.h>
#include <exceptions.h>
#include <fusion.h>
#include <ir/all_nodes.h>
#include <ir/cloner.h>
#include <scheduler/reduction_heuristic.h>
#include <scheduler/tools/maxinfo_propagator.h>
#include <visibility.h>

namespace nvfuser {

class ComputeAtMap;
class SchedulerRuntimeInfo;
class HeuristicDataCache;

namespace scheduler_utils {

// Assume only half of the register file is available to spend on buffers;
// this is because when we allocate a buffer in registers it has to be accessed
// with a compile-time constant index. Unfortunately nvcc seems to be using
// many registers for indexing. This is a bad estimation of extra register use,
// but it's hard to get a better one.
constexpr int64_t register_file_size_full = (int64_t)256 * 1024;
constexpr int64_t register_file_size = register_file_size_full / 2;
constexpr int64_t register_file_size_56k = (int64_t)56 * 4 * 1024;

// Empirically observed number. Not guaranteed to be a good estimate.
constexpr int64_t register_overhead = 40l;
constexpr int64_t max_registers_per_thread = 255l;
constexpr int64_t bytes_per_register = 4l;

constexpr int64_t x_grid_limit = ((int64_t)1 << (int64_t)31) - (int64_t)1;
constexpr int64_t y_grid_limit = 65535;
constexpr int64_t z_grid_limit = 65535;
constexpr int64_t z_block_limit = 64;

// Find the largest power of 2 that is a factor of n. If n == 0, return the
// largest power of 2 representable by int64_t.
constexpr int64_t maxVectorizationWidth(int64_t n) {
  if (n == 0) {
    // Max representable int has null sign bit then all ones. Shift right then
    // xor to preserve only the most significant bit.
    int64_t m = std::numeric_limits<int64_t>::max();
    return m ^ (m >> 1);
  }
  // For example
  //   n              = b101101000
  //   n - 1          = b101100111
  //   ~(n - 1)       = b010011000
  //   n & (~(n - 1)) = b000001000
  // The key is that subtracting one flips all trailing 0s as well as the least
  // significant 1, so all of the other bits will fail the &, leaving
  // only that 1.
  return n & (~(n - 1));
}

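// [Editor's note] A few compile-time checks, added as a hedged illustration
// of the bit trick above (not part of the original header). Since the
// function is a pure constexpr, static_assert can verify it:
//   24 = 0b11000 -> lowest set bit 8; 20 = 0b10100 -> 4; 7 is odd -> 1.
static_assert(maxVectorizationWidth(24) == 8);
static_assert(maxVectorizationWidth(20) == 4);
static_assert(maxVectorizationWidth(7) == 1);
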
// Largest power of 2 less than or equal to n.
constexpr int64_t lastPow2(int64_t n) {
  NVF_ERROR(n >= 0);
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8); // NOLINT(cppcoreguidelines-avoid-magic-numbers)
  n |= (n >> 16); // NOLINT(cppcoreguidelines-avoid-magic-numbers)
  n |= (n >> 32); // NOLINT(cppcoreguidelines-avoid-magic-numbers)
  return std::max((int64_t)1, n - (n >> 1));
}

// Round up to a multiple of 8 or to the next power of 2, whichever is smaller.
constexpr int64_t roundUpPow2Or8(const int64_t x) {
  auto round_up_pow2 = lastPow2(x);
  if (round_up_pow2 < x) {
    round_up_pow2 *= 2;
  }
  constexpr int64_t kEight = 8;
  auto round_up_8 = x % kEight == 0 ? x : x + (kEight - x % kEight);
  return std::min(round_up_8, round_up_pow2);
}

constexpr int64_t roundUpPow2(const int64_t x) {
  auto round_up_pow2 = scheduler_utils::lastPow2(x);
  if (round_up_pow2 < x) {
    round_up_pow2 *= 2;
  }
  return round_up_pow2;
}

constexpr int64_t roundUpToN(const int64_t x, const int64_t n) {
  return x % n == 0 ? x : x + (n - x % n);
}

// Div x by y, but clamp the result to a minimum of 1.
inline int64_t safeDiv(const int64_t x, const int64_t y) {
  return std::max(x / y, (int64_t)1);
}

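// [Editor's note] Worked examples for the rounding helpers above, added as a
// hedged illustration (not part of the original header):
//   roundUpPow2Or8(5)  == 8   (next pow2 = 8, next multiple of 8 = 8)
//   roundUpPow2Or8(17) == 24  (next pow2 = 32, next multiple of 8 = 24)
//   safeDiv(3, 8)      == 1   (plain integer division would give 0)
static_assert(roundUpToN(10, 4) == 12);
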
// Split the given dimensions in `to_split`. Also update the dimensions in
// `to_update` to the positions in the split tensor. Splitting one dimension
// multiple times is supported, and if this is the case, then the order of
// `to_split` matters. All given dimensions are numbered as they were before
// any split.
void splitDims(
    TensorView* tv,
    std::vector<std::pair<int64_t, int64_t>> to_split, // (dim, size)
    std::vector<int64_t>& to_update);

inline void splitDims(
    TensorView* tv,
    std::vector<std::pair<int64_t, int64_t>> to_split) { // (dim, size)
  std::vector<int64_t> unused;
  splitDims(tv, std::move(to_split), unused);
}

// Merge all the given dimensions in `to_merge` into a single dimension. Also
// update the dimensions in `to_update` to the positions in the merged tensor.
// Returns the merged dimension. All given dimensions are numbered as they were
// before any merge.
// NOTE: merging is done on the entries in the order of `to_merge`, assuming an
// order from inner to outer.
std::optional<int64_t> mergeDims(
    TensorView* tv,
    std::vector<int64_t> to_merge,
    std::vector<int64_t>& to_update);

inline std::optional<int64_t> mergeDims(
    TensorView* tv,
    std::vector<int64_t> to_merge) {
  std::vector<int64_t> unused;
  return mergeDims(tv, std::move(to_merge), unused);
}

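// [Editor's note] A hypothetical usage sketch for splitDims, added for
// illustration; `exampleSplitTracking` is not part of the original header.
// Splitting dim 1 of a 3-D tensor shifts later positions right by one, and
// `to_update` is rewritten accordingly.
inline void exampleSplitTracking(TensorView* tv) {
  std::vector<int64_t> tracked = {2}; // track what was dim 2 of [I0, I1, I2]
  splitDims(tv, {{1, 4}}, tracked);   // domain becomes [I0, I1/4, 4, I2]
  // tracked[0] is now 3, the position of I2 after the split.
}
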
// Merges all reduction axes to the right side and returns the total number of
// reduction axes.
int64_t mergeReduction(TensorView* tv);

// Merges all non-reduction axes to the left side and returns the total number
// of iteration axes.
int64_t mergeNonReduction(TensorView* tv);

// Propagate the parallelization from the selected dimensions of the reference
// tensor to their corresponding dimensions in all selected tensors in the DAG.
// Position `pos` means selecting all the dimensions [0, 1, ..., pos - 1]. pos =
// -1 means selecting all dimensions. `selected_tvs` are selected tensors in the
// DAG. Empty `selected_tvs` means selecting all tensors in the fusion of
// `reference_tv`. `selected_parallel_types` are the selected parallel types.
// Empty `selected_parallel_types` means selecting all parallel types.
void parallelizeAllLike(
    TensorView* reference_tv,
    int64_t pos = -1,
    std::vector<TensorView*> selected_tvs = {},
    const std::unordered_set<ParallelType>& selected_parallel_types = {},
    bool propagate_padding = true);

inline void parallelizeAllLike(
    TensorView* reference_tv,
    std::vector<TensorView*> selected_tvs,
    const std::unordered_set<ParallelType>& selected_parallel_types = {},
    bool propagate_padding = true) {
  parallelizeAllLike(
      reference_tv,
      -1,
      std::move(selected_tvs),
      selected_parallel_types,
      propagate_padding);
}

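// [Editor's note] A hedged usage sketch (not part of the original header):
// bind parallel types on a reference tensor, then propagate them to the rest
// of the fusion. `exampleParallelize` is a hypothetical helper.
inline void exampleParallelize(TensorView* reference_tv) {
  reference_tv->axis(0)->parallelize(ParallelType::BIDx);
  reference_tv->axis(-1)->parallelize(ParallelType::TIDx);
  // Empty selected_tvs: every tensor in reference_tv's fusion receives the
  // matching BIDx/TIDx bindings on its mapped dimensions.
  parallelizeAllLike(reference_tv);
}
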
// Common hyperparameters used in the heuristic scheduler. These hyperparameters
// are passed to SchedulerEntry::computeHeuristics through the
// HeuristicDataCache. These hyperparameters alter the generation of the
// HeuristicParams for the scheduler.
struct SchedulerHyperParameters {
  SchedulerHyperParameters(
      int64_t vectorize_factor_,
      int64_t unroll_factor_,
      int64_t threads_per_block_min_,
      int64_t threads_per_block_max_)
      : vectorize_factor(vectorize_factor_),
        unroll_factor(unroll_factor_),
        threads_per_block_min(threads_per_block_min_),
        threads_per_block_max(threads_per_block_max_) {}

  //! Number of elements to load per vectorized load.
  int64_t vectorize_factor = 1;

  //! Number of iterations to unroll the for-loop.
  int64_t unroll_factor = 1;

  //! Minimum number of threads per block.
  int64_t threads_per_block_min = 1;

  //! Maximum number of threads per block.
  int64_t threads_per_block_max = 1;
};

struct PersistentBufferInfo {
  std::vector<TensorView*> persistent_buffers;
  std::unordered_set<IterDomain*> unmappable_dims;

  // Persistent buffers are needed until the path through the reduction -
  // broadcast chain is resolved by any other chain using the persistent buffer
  // that is not going through a reduction. This assumes all reduction paths
  // have the same reduction pattern. Order is the same as persistent_buffers.
  std::vector<std::vector<TensorView*>> persistent_buffer_resolution_points;

  // Not all persistent buffers can be projected to inputs. If a buffer can be
  // projected to the inputs, which may reduce the persistent buffer size (BN
  // backwards specifically), then keep track of it here. Persistent buffers
  // that have a persistent buffer/reduction before them should not be
  // projected through that.
  std::vector<TensorView*> projectable_persistent_buffers;

  // Track inputs of input-projectable buffers.
  std::vector<TensorView*> projectable_buffer_inputs;

  // Map unmappable dims to projectable_buffer_inputs.
  std::unordered_set<IterDomain*> unamppable_dims_projected_to_inputs;

  // Some parameters used in
  // normalization_scheduler_utils::isProjectBufferToInput
  bool has_view_ops = false;
  bool projection_with_exp_op = false;
  bool projection_with_rng_op = false;
};

// Buffers whose roots can't map to all producer roots based on compute at.
// These are the buffers we would make persistent in a persistent kernel or
// would have to recompute if we can't make a persistent kernel. This function
// will also return inputs as being marked persistent if they follow this
// pattern. It is important to note, however, that inputs don't strictly have
// to be persistent as they can simply be read multiple times from GMEM in the
// same kernel.
PersistentBufferInfo persistentBuffers(Fusion* fusion);

// A persistent tv can be projected to its producers when all the producers are
// persistent tvs and there is no reduction op.
bool canProjectToPersistentProducer(
    TensorView* buffer,
    const std::vector<TensorView*>& producers,
    const std::unordered_set<TensorView*>& persistent_buffer_set);

//! Evaluates if a persistent buffer can be projected to input tvs without
//! dependency on reduction tvs. Returns a std::pair with a boolean indicating
//! whether projection is feasible and a vector of projectable tvs.
//!
//! The function operates in two main steps:
//! (1) Checks if the persistent buffer has dependencies on any of the given
//! reduction tvs. If no dependencies are found, it returns true with an
//! empty vector of target broadcast tvs.
//! (2) If there are dependencies, it examines each reduction tv for an
//! associated broadcast tv that can be projected to. If all reduction tvs
//! have corresponding broadcast tvs, true is returned along with these tvs.
//! If any reduction tv lacks a corresponding broadcast tv, false is
//! returned with the current list of identified broadcast tvs.
std::pair<bool, std::vector<TensorView*>> canProjectToInputsWithoutReduction(
    const std::vector<TensorView*> reduction_tvs,
    TensorView* persistent_buffer);

struct ReductionTvProperties {
  // How many elements in the tensor view are there to reduce.
  int64_t total_reduction_numel = 1;

  // How many reductions do we need to perform, i.e. how many iter dimension
  // elements are there.
  int64_t total_iteration_numel = 1;

  // Is the inner most dimension a reduction; if there are no reductions, mark
  // true.
  bool fastest_dim_reduction = true;

  // How many elements are in the inner most dimension, merging surrounding
  // domains that match in type. This is used for 3D schedulers in
  // reduction/normalization.
  int64_t inner_most_dimension_numel = 1;

  // Same thing as above, but the number of dimensions instead of the numel.
  int64_t inner_most_dimension_ndims = 1;

  // Merging neighboring iteration domains, and reduction domains, what's the
  // resulting dimensionality of the problem.
  int64_t dimensionality = 1;
};

// Fill a ReductionTvProperties structure about tv.
ReductionTvProperties getReductionProperties(
    Fusion* fusion,
    SchedulerRuntimeInfo& runtime_info,
    TensorView* tv);

// Struct to store persistent buffer sizes. Also holds the persistent buffer
// size when the buffers are projected to the inputs.
struct PersistentBufferSizeReturn {
  int64_t persistent_buffer_size = 0;
  int64_t projected_persistent_buffer_size = 0;
};

// Compute the amount of register space that would be needed to perform this
// kernel persistently, based only on buffers that must be persistent, and on
// the maximum of all minimum size requirements. I.e., if a buffer must be
// persistent, only hold the persistent dimension.
PersistentBufferSizeReturn persistentBufferSize(
    Fusion* fusion,
    SchedulerRuntimeInfo& runtime_info,
    const PersistentBufferInfo& persistent_buffers,
    HeuristicDataCache* data_cache = nullptr);

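// [Editor's note] Hedged sketch of how the two queries above compose; the
// helper `examplePersistentBudgetCheck` is hypothetical and not part of this
// header. Gather the buffers that must live across the reduction, measure
// them, and compare against the register budget reserved for buffers.
inline bool examplePersistentBudgetCheck(
    Fusion* fusion,
    SchedulerRuntimeInfo& runtime_info) {
  PersistentBufferInfo info = persistentBuffers(fusion);
  PersistentBufferSizeReturn sizes =
      persistentBufferSize(fusion, runtime_info, info);
  // The kernel fits if the smaller of the raw and input-projected footprints
  // stays within the half register file reserved for buffers.
  int64_t needed = std::min(
      sizes.persistent_buffer_size, sizes.projected_persistent_buffer_size);
  return needed <= register_file_size;
}
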
// Merges the tensor view to the form:
// [IterationDomain, ReductionDomain]
// Returns whether <iteration dimensions, reduction dimensions> are present.
std::pair<bool, bool> canonicalDimReduction(
    Fusion* fusion,
    TensorView* tv,
    bool schedule_3D = false);

// Return a list of tensor views that are outputs of reduction operations,
// excluding resharding reduce expressions. If multiple outputs of an expression
// are found, only include one in the list.
std::vector<TensorView*> getReductionTvs(Fusion* fusion);

// Returns a list of TensorViews that are the consumer tv for a view operation.
std::vector<TensorView*> getViewTVs(Fusion* fusion);

// Returns a list of non-reduction TensorViews that have a root domain.
std::vector<TensorView*> getTVsWithNonReductionRFactor(Fusion* fusion);

// Reset inputs and outputs to global memory, everything else to local.
void clearMemorySpace(Fusion* fusion);

// Returns cached-after tensors of the fusion inputs if unrolled. Otherwise
// returns an empty vector.
std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll);

// Returns the pairs of <cache of each fusion output, corresponding output> for
// all outputs.
std::vector<std::pair<TensorView*, TensorView*>> cacheAndForkOutputs(
    Fusion* fusion,
    bool unroll);

// Ignores broadcast and reduction, returns the iter domain in the allocation
// domain that's "inner most".
IterDomain* innerMostAllocDim(TensorView* tv);

// Looks through the fusion and finds all dims that match the one provided in
// the given tensorview. The iter domain must be a root domain. If inner_only,
// will only map dimensions if they're in the inner most position. This is
// important when projecting a dimension between an rfactor position and its
// root position when mapping from consumer to producer. If inner_only=true,
// takes the rfactor/root dimension that maps, projects it to the root/rfactor
// domain, but only follows the inner most path when encountering split/merge.
// When propagating backward, for split it will only propagate backwards if the
// mapped dimension is the inner portion of the split. For merge, inner_only
// doesn't make a difference and will propagate through the inner portion of
// the merge. When propagating forward, the logic is symmetric with the
// backward case.
class FindAllMappedDims : public MaxInfoSpanningTree::Propagator {
  std::unordered_map<TensorView*, IterDomain*> mapped_root_ids_;
  std::unordered_map<TensorView*, IterDomain*> mapped_logical_ids_;
  TensorView* starting_tv_ = nullptr;
  IterDomain* starting_id_ = nullptr;
  bool inner_only_;
  bool vectorize_pass_;

 public:
  FindAllMappedDims(
      TensorView* from,
      IterDomain* starting_id,
      bool inner_only,
      bool vectorize_pass);
  void setUp() override;
  void propagateC2P(TensorView* from, TensorView* to) override;
  void propagateP2C(TensorView* from, TensorView* to) override;
  void propagateSibling(TensorView* from, TensorView* to) override;
  std::unordered_set<IterDomain*> get() const;
};

// Checks if the tensor view has an iteration domain in vector_dims in its
// inner most root position (excluding broadcast and reduction), and checks if
// it is a contiguous dimension.
bool hasInnerDim(
    TensorView* tv,
    std::unordered_set<IterDomain*> vector_dims,
    bool should_vectorize);

// Returns all inputs and outputs that share the inner most dimension of the
// provided reference. If the reference is an input it ignores reduction axes;
// it will ignore all broadcast axes. If inner_only, will require inner->inner
// mapping in view; otherwise, it allows any inner->any mapping. If
// vectorize_pass, will check contiguity for vectorization; otherwise it just
// checks it has that inner dim.
std::vector<TensorView*> getInputsOutputsWithInnerDim(
    TensorView* reference_tv,
    bool inner_only,
    bool vectorize_pass);

// Holder return struct for the below function.
struct DisjointLogicalSetInfo {
  // const* to the disjoint set in disjoint_rfactor_set (passed in to
  // getDisjointLogicalSetsOf) that each iterdomain in the rfactor of ref is
  // mapped to.
  //
  // WARNING: these pointers are relative to the disjoint_rfactor_set reference
  // passed into getDisjointLogicalSetsOf; it's the user's responsibility to
  // maintain the lifetime of that reference to match this vector.
  std::vector<const VectorOfUniqueEntries<IterDomain*>*> disjoint_sets_of_ref;

  // Unique ID associated to the disjoint view group the logical id belongs to
  // in disjoint_sets_of_ref. It's straightforward to map from
  // disjoint_sets_of_ref to the vector, but not the other way around.
  std::vector<int64_t> disjoint_set_ids;

  // TensorView reference the above vectors are relative to.
  TensorView* ref;
};

// Returns disjoint rfactor sets mapped onto the given reference. Returns a pair
// of vectors of size rfactorDomain of reference. The vector of
// VectorOfUniqueEntries returns a const* to the disjoint set in
// disjoint_rfactor_set the iterdomain is mapped to. The integer vector
// represents which disjoint rfactor group the logical id belongs to. It's
// straightforward to map from the former to the latter, but not the latter to
// the former.
//
// Since we return a const* to entries in disjoint_rfactor_set, it must be
// passed in as a reference. The algorithm is N^2 based on the number of dims
// in the reference, but generating the disjoint rfactor set is likely the
// limiter on perf of this function.
//
// logical_reorder_map is provided to assume TensorView `of` will be reordered
// per the map.
DisjointLogicalSetInfo getDisjointLogicalSetsOf(
    Fusion* fusion,
    TensorView* of,
    DisjointSets<IterDomain*>& disjoint_rfactor_set,
    const std::unordered_map<int64_t, int64_t>& logical_reorder_map = {});

// Structure to hold byte multiples for break points. I.e. if we have the
// tensors:
// T0[I0, I1] float
// T1[I0, I1] bool
// T2[I0] half
// T3[I1] double
// and a break point of 1, the multiples would be:
// lhs_multiple = 4 + 1 + 2 = 7
// rhs_multiple = 4 + 1 + 8 = 13
struct BroadcastMultiple {
  int64_t rhs_multiple = 0;
  int64_t lhs_multiple = 0;
};

struct BroadcastMultipleInformation {
  std::vector<int64_t> view_disjoint_set_ids;
  std::vector<BroadcastMultiple> broadcast_multiples;
};

// Returns a vector of size reference_tv->getLogicalDomain().size() which
// is a view disjoint set id of each of those iter domains. If entries share
// the same value, they undergo view transformations in the fusion together.
// Broadcast multiples are also of size
// reference_tv->getLogicalDomain().size(); each entry [i] is the number of
// inputs/outputs that have a non-broadcast dimension mapped to the
// corresponding dimension in reference_tv. Broadcast multiples includes
// reference_tv if reference_tv is an input or output. Broadcast multiples is
// multiplied by the data type size. In the case of view operations the
// broadcast multiple is the full multiple size if any domain in the group maps
// to a non-broadcast dimension in the given input/output. Otherwise, if all
// dimensions are broadcast, that input/output will not contribute to the
// multiple.
//
// logical_reorder_map is provided to assume reference_tv will be reordered per
// the map.
BroadcastMultipleInformation getBroadcastMultiples(
    TensorView* reference_tv,
    DataType index_type,
    const std::unordered_map<int64_t, int64_t>& logical_reorder_map = {});

//! Propagate current transformations on from_tv up to the given
//! position, to all tensorviews on the owning fusion that have
//! a connection with `from_tv` on the fusion graph.
void transformPropagateToAllFrom(TensorView* from_tv, int64_t pos);

//! A type of custom transform propagator that propagates iterdomain
//! transforms from a source tv to all tvs that are selected
//! using a "direction" and a "boundary".
//!
//! The propagation model always assumes a `from_tv`, a `direction` and a
//! `boundary`.
//!
//! This propagator will only transform producers and consumers
//! of `from_tv`, and all propagation modes **require** a boundary to be
//! specified to signify where the propagation should stop.
//!
//! There are currently three modes of propagation: forward, backward and
//! both-way; see the comments on the interface functions for details.
struct BoundedDirectionalTransformPropagator {
  //! Custom option container for configuring
  //! the transform propagation actions.
  //! All option values default to false unless
  //! the corresponding setter is called.
  struct Options {
    //! If true, the transform propagator will
    //! also propagate parallel types from
    //! `from_tv` to all selected tvs.
    bool propagate_parallel_type = false;

    //! If true, the specified boundary tvs
    //! will also be replayed as `from_tv`.
    //! If false, they will not be affected
    //! by the propagation pass.
    bool transform_boundary = false;

    //! Sets the position boundary in parallel
    //! type propagation; see the comment on
    //! scheduler_utils::parallelizeAllLike.
    //! Only used if propagate_parallel_type==true.
    int64_t parallel_propagation_pos = -1;

    //! Setter for enabling parallel type
    //! propagation; see the comment on the variable.
    //!
    //! \param up_to_pos sets the parallel type
    //!  propagation boundary; see the comment on
    //!  scheduler_utils::parallelizeAllLike.
    Options propagateParallelType(int64_t up_to_pos = -1) {
      propagate_parallel_type = true;
      parallel_propagation_pos = up_to_pos;
      return *this;
    }

    //! Setter for enabling propagation to
    //! boundary tvs; see the comment on the variable.
    Options propagateToBoundary() {
      transform_boundary = true;
      return *this;
    }
  };

  //! Replay transforms from tensorview `from`
  //! to the tensorviews that are consumers
  //! of boundary tensorviews in `to` and producers of `from`.
  static void backward(
      TensorView* from,
      int64_t pos,
      std::vector<TensorView*> to,
      std::optional<Options> options = std::nullopt);

  //! Replay transforms from tensorview `from`
  //! to the tensorviews that are producers
  //! of boundary tensorviews in `to` and consumers of `from`.
  static void forward(
      TensorView* from,
      int64_t pos,
      std::vector<TensorView*> to,
      std::optional<Options> options = std::nullopt);

  //! Replay transforms from tensorview `from`
  //! to all the tensorviews that are consumers
  //! of tensorviews in `backward_to` and producers
  //! of tensorviews in `forward_to` while being
  //! either a producer or a consumer of tensorview `from`.
  static void bothWays(
      TensorView* from,
      int64_t pos,
      std::vector<TensorView*> backward_to,
      std::vector<TensorView*> forward_to,
      std::optional<Options> options = std::nullopt);

 private:
  //! Utility function:
  //! Will realize the transform propagation to the
  //! tensorviews in `included_tvs`.
  //! Assumes that all tvs in included_tvs are either
  //! a producer or a consumer of from_tv.
  static void propagate(
      TensorView* from_tv,
      int64_t pos,
      std::unordered_set<TensorView*> included_tvs,
      Options options);
};

// Schedulers typically start by merging some axes together then splitting,
// and propagating those transformations through the DAG. What we want to
// understand is if these merges can be supported through view operations.
// For example, it could be problematic to support a reduction fusion:
//
// tv0[2, 3, 4]
// tv1 = sum(tv0, {1, 2})
// tv2 = view(tv0, {6, 4})
//
// The first step of the reduction scheduler would be tv1->merge(1, 2).
// If we tried to propagate this transformation through the view it would make
// the view invalid. If we tried to propagate the view through the reduction,
// it would attempt to merge a reduction and non-reduction dimension. So for
// these types of fusions we would like to understand that the view considers
// axes 1 and 2 of tv1 as "non-separable" axes.
//
// If IterDomains are disjoint in the returned set, then they are considered
// "separable".
// Warning: This pass generates the IdGraphs; it is not intended for use at
// runtime.
DisjointSets<IterDomain*> disjointLogicalSets(Fusion* fusion);

// Makes sure that there are no group ids left of pos that match right of pos.
// e.g.
// [1, 0, 0] pos 2 would return false
// [1, 0, 0] pos 1 would return true
bool breakIsDisjoint(std::vector<int64_t> group_ids, int64_t pos);

// Generates an old-to-new map to reorder tv's domain as the logical order.
// Priority is given to inner most dimensions, for example:
// logical [i0, i1, i2]
// domain [i0*i2, i1]
// will produce the map {{0, 1}, {1, 0}}
// This is somewhat similar to orderTiledConcreteIdAsRoot.
std::unordered_map<int64_t, int64_t> domainReorderAsLogicalMap(TensorView* tv);

// Generates an old-to-new map to reorder tv's domain as the logical order.
// This only handles the simple case where allocation is a permutation of the
// logical domain; otherwise, the function returns an empty container.
std::unordered_map<int64_t, int64_t> maybeLogicalReorderAsAllocationMap(
    TensorView* tv);

// Assumes views are consistent as detected by
// registry.cpp::requiresForwardViewReplay returning false.
void propagateReshapeTransforms(Fusion* fusion, const ComputeAtMap& ca_map);

//! Check if tv is an output of a fastest-dim reduction.
bool isFastestDimReduction(TensorView* tv);

// A wrapper for Fusion::rotateLoop that provides a more consistent interface.
inline void rotateLoop(
    TensorView* loop_tv,
    int64_t axis,
    std::unordered_set<Statement*> selection) {
  auto fusion = loop_tv->fusion();
  if (!fusion->hasManaged("loop_rotation")) {
    fusion->manage("loop_rotation", LoopRotationParam{});
  }
  fusion->getManaged<LoopRotationParam>("loop_rotation")
      .emplace_back(loop_tv, axis, std::move(selection));
}

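// [Editor's note] A hedged usage sketch; `exampleRotateLoop` and its tensors
// are hypothetical, not part of the original header. Repeated calls append
// entries to the managed "loop_rotation" parameter, which lowering consumes.
inline void exampleRotateLoop(TensorView* loop_tv, TensorView* load_tv) {
  // Select the producer tensor and its defining expression so its load can
  // be peeled into the loop prologue; axis 0 is the loop being rotated.
  rotateLoop(loop_tv, 0, {load_tv, load_tv->definition()});
}
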
//! Certain tensors may need to be placed on shared or global memory
//! due to data dependencies caused by resize operations. Create
//! caches of those tensors so that the original operations producing
//! them can keep using the same memory. This avoids, for example,
//! reductions to global memory.
//!
//! Example:
//!
//! tv1 = sum(tv0)
//! tv2 = some_resize_op(tv1);
//! tv3 = some_other_op(tv1);
//!
//! When tv1 is promoted to Global, we want to avoid reducing to a
//! global memory tensor. After the transformation by this function,
//! the fusion should look like:
//!
//! tv1 = sum(tv0);
//! tv4 = tv1
//! tv4->setMemoryType(Global)
//! tv2 = some_resize_op(tv4)
//! tv3 = some_other_op(tv1);
//!
//! Note that the sum reduction is done using a Local buffer, i.e.,
//! tv1, but the data dependency for the resize op is still satisfied
//! by having a copy of tv1, i.e., tv4. Note that the other op using
//! tv1 still uses tv1.
void prepareForMemoryTypePromotion(Fusion* fusion);

//! If a consumer tensor induces a data dependency between threads,
//! move its producer to a shared memory that is sufficient to satisfy
//! the dependency. For example, if the domain is parallelized
//! with blockIdx, the producer memory type will be changed to
//! Global. A proper RAW sync will be automatically inserted when the
//! fusion is lowered.
void promoteProducerMemoryTypes(
    Fusion* fusion,
    const std::vector<TensorView*>& input_caches);

//! Get all tensors that are connected to from_tvs without going through
//! any tvs in the cutoff_tv_set.
std::unordered_set<TensorView*> getAllTvsFrom(
    const std::vector<TensorView*>& from_tvs,
    const std::unordered_set<TensorView*>& cutoff_tv_set);

//! Get the persistent buffer size of a tensor.
int64_t getPersistentBufferSizeOfTensor(
    const TensorView* buffer,
    SchedulerRuntimeInfo& runtime_info,
    const PersistentBufferInfo& persistent_buffer_info);

//! The required shared memory size for a block includes two parts: (1) smem
//! for persistent buffers and (2) overhead. The overhead includes space
//! reserved by the CUDA driver and the reduction workspace, which depends on
//! the number of threads per block specified by the parameter
//! threads_per_block. By default, the function uses the maximum allowed number
//! of threads per block (threads_per_block = -1) to calculate the overhead.
//! The caller can specify a different value if they are sure about the max
//! value used at runtime.
int64_t getSharedMemoryOverheadPerBlock(
    Fusion* fusion,
    const std::vector<TensorView*>& reduction_tvs,
    int64_t threads_per_block = -1);

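// [Editor's note] Hedged sketch of a shared-memory budget check built on the
// two queries above; `exampleFitsInSharedMemory` and the capacity parameter
// are hypothetical, not part of the original header.
inline bool exampleFitsInSharedMemory(
    Fusion* fusion,
    SchedulerRuntimeInfo& runtime_info,
    const std::vector<TensorView*>& reduction_tvs,
    const PersistentBufferInfo& persistent_buffer_info,
    TensorView* buffer,
    int64_t smem_capacity_bytes) {
  int64_t overhead = getSharedMemoryOverheadPerBlock(fusion, reduction_tvs);
  int64_t buffer_bytes = getPersistentBufferSizeOfTensor(
      buffer, runtime_info, persistent_buffer_info);
  // The buffer fits if its footprint plus driver/workspace overhead stays
  // within the device's per-block shared memory capacity.
  return buffer_bytes + overhead <= smem_capacity_bytes;
}
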
// Returns true if any Expr in `fusion` is resharding.
bool isResharding(Fusion* fusion);

// Move non-concretized broadcast domains to innermost
// positions. Broadcast domains mapped with any domains of the given tvs
// are ignored.
//
// The goal here is to find domains that are not scheduled by
// propagation from reference tensors (i.e., ignored_tvs). All
// schedulers make sure to include only schedulable domains but they
// may also allow non-concretized broadcast domains that have
// no mapping with any of the reference tensors. Since they are
// non-concretized, they should be safe to ignore. Ideally, they
// should just be removed from the fusion. For now, they are moved to
// innermost positions to prevent them from interfering with
// inlining. If they happened to be at the
// outermost position, the tensor wouldn't be inlined at all. See
// issue #2686 and PR #2799.
void moveNonConcretizedBroadcastInnermost(
    Fusion* fusion,
    const std::unordered_set<TensorView*>& ignored_tvs = {});

// Returns a factor representing the computation cost of the given fusion.
// Estimated using the number of MUFU operations, each weighted with a
// predefined factor.
int64_t getComputationCostFactor(Fusion* fusion);

// Returns the required bytes in flight to saturate the memory bandwidth.
int64_t getRequiredBytesInFlight();

// Returns true if the device has a high bandwidth-to-compute ratio.
bool isHighBandwidthFlopsRatio();

// Returns true if the fusion has computation that requires Floating-Point
// Multi-Function (MUFU) units, e.g. exponential, logarithm, sine, cosine,
// square root, hyperbolic tangent. Currently, we have only tested tanh, exp,
// and reciprocal. Note that if compiled with fast math (not supported yet) or
// directly lowered with inlined PTX, the inner reduction heuristics, which use
// this function to set the optimal unroll factor, need to be revised.
bool hasExpensiveMUFUops(Fusion* fusion);

// Reorder DID-parallelized axes to outermost positions. Returns
// the position of the outermost non-DID axis.
int64_t reorderDevicesToOuter(TensorView* tv);

// Returns the number of non-reduction/non-broadcast/non-device dims in the
// logical domain.
inline int64_t nLogicalDims(const TensorView* tv) {
  auto logical_dom = tv->getLogicalDomain();
  int64_t tv_n_dims = 0;
  for (auto dim : logical_dom) {
    if (!dim->isReduction() && !dim->isBroadcast() && !dim->isDeviceDim()) {
      tv_n_dims++;
    }
  }
  return tv_n_dims;
}

// Reorder the loop domain of a given tensor to align with a given list of
// reference IDs. Non-matching loop IDs are placed at the outermost positions.
void reorderTensorLike(TensorView* tv, const std::vector<IterDomain*>& ref);

} // namespace scheduler_utils
} // namespace nvfuser