nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242)
  1. nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/iter_visitor.h
@@ -0,0 +1,661 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <exceptions.h>
+ #include <visibility.h>
+
+ #include <bfs.h>
+ #include <dispatch.h>
+ #include <ir/base_nodes.h>
+ #include <type.h>
+
+ #include <deque>
+ #include <unordered_set>
+ #include <vector>
+
+ namespace nvfuser {
+
+ class Fusion;
+
+ /*
+  * IterVisitor starts from leaf nodes, fusion outputs, or the provided values.
+  * It walks the DAG backwards from the starting nodes to the roots. Each node
+  * in the DAG will be called with handle(Statement*) in topological order,
+  * from inputs of the fusion to outputs of the fusion.
+  *
+  * TODO: We may want a BFS version of this code to extract ILP, not implemented
+  * yet.
+  *
+  * TODO: We may want to have ordering of outputs to inputs. I'm not sure why we
+  * would want this, but seems like it would be a reasonable request.
+  */
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ class NVF_API IterVisitor : public OptOutDispatch {
+  public:
+   ~IterVisitor() override = default;
+
+   IterVisitor() = default;
+
+   IterVisitor(const IterVisitor& other) = default;
+   IterVisitor& operator=(const IterVisitor& other) = default;
+
+   IterVisitor(IterVisitor&& other) = default;
+   IterVisitor& operator=(IterVisitor&& other) = default;
+
+  protected:
+   // Functions return nodes in reverse order to be added to the to_visit
+   // queue. These functions will start at outputs and propagate up through the
+   // DAG to inputs based on depth-first traversal. Next could be called on a
+   // node multiple times.
+   virtual std::vector<Statement*> next(Statement* stmt);
+
+   virtual std::vector<Statement*> next(Val* v);
+
+   virtual std::vector<Statement*> next(Expr* expr);
+
+   using OptOutDispatch::handle;
+
+   // This dispatch function is called on every Statement* in topological
+   // order, starting from outputs to inputs.
+   void dispatch(Statement* s) override;
+
+   // This dispatch function is called on every Expr* in topological order,
+   // starting from outputs to inputs.
+   void dispatch(Expr* e) override;
+
+   // This dispatch function is called on every Val* in topological order,
+   // starting from outputs to inputs.
+   void dispatch(Val* v) override;
+
+   // The entire stack during traversal. stmt_stack.back().back() is the node
+   // that is being called in handle(). stmt_stack.back() contains siblings
+   // (not guaranteed to be all siblings throughout traversal).
+   // stmt_stack.front() contains the outputs we started with (not guaranteed
+   // to be all outputs throughout traversal).
+   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+   std::vector<std::vector<Statement*>> stmt_stack;
+
+   void traverseHelper(Fusion* fusion, bool traverse_all_paths = false);
+
+  public:
+   //! Traverses nodes in Fusion from inputs in topological order to "to", i.e.
+   //! from inputs towards outputs.
+   //! \param traverse_all_paths = false only calls handle on each Statement*
+   //! once; traverse_all_paths = true traverses all paths between
+   //! expressions/values and calls handle on a Statement* for every path from
+   //! inputs to "to".
+   //! \param traverse_into_members When hitting nodes like TensorView,
+   //! TensorDomain, or IterDomain where there are members of the nodes that
+   //! are Vals, a value of "true" will also traverse into those member Vals; a
+   //! value of "false" will not traverse into the members.
+   //! \param traverse_attributes When true, traverse into expr
+   //! attributes. Note that attributes of template type Attribute are
+   //! not traversed as there's no dispatch support.
+   //! \param traverse_siblings When true, traverse all outputs of
+   //! active multi-output expressions, even if those Expr outputs are not used
+   //! in paths to Fusion outputs.
+   void traverseTo(
+       const std::vector<Val*>& to,
+       bool traverse_all_paths = false,
+       bool traverse_into_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   //! Traverses nodes in Fusion from inputs in topological order to "to", i.e.
+   //! from inputs towards outputs.
+   //! \param traverse_all_paths = false only calls handle on each Statement*
+   //! once; traverse_all_paths = true traverses all paths between
+   //! expressions/values and calls handle on a Statement* for every path from
+   //! inputs to "to".
+   //! \param traverse_into_members When hitting nodes like TensorView,
+   //! TensorDomain, or IterDomain where there are members of the nodes that
+   //! are Vals, a value of "true" will also traverse into those member Vals; a
+   //! value of "false" will not traverse into the members.
+   //! \param from Specified values to start traversing. If a "from" Val is not
+   //! on a path from inputs to "to" it will not be visited. If there's a path
+   //! from inputs to "to" that doesn't go through "from", that input and the
+   //! path from it will also be traversed.
+   //! \param traverse_attributes When true, traverse into expr
+   //! attributes. Note that attributes of template type Attribute are
+   //! not traversed as there's no dispatch support.
+   //! \param traverse_siblings When true, traverse all outputs of
+   //! active multi-output expressions, even if those Expr outputs are not used
+   //! in paths to Fusion outputs.
+   void traverseBetween(
+       const std::unordered_set<Val*>& from,
+       const std::vector<Val*>& to,
+       bool traverse_all_paths = false,
+       bool traverse_into_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Iterates from terminating outputs registered with the fusion. Terminating
+   // means the value is not used to generate any other value used in producing
+   // registered outputs.
+   void traverse(Fusion* fusion);
+
+   // Same as traverse but it traverses every edge, meaning it will traverse
+   // values more than once.
+   void traverseAllPaths(Fusion* fusion);
+
+   //! Get inputs to vals. Possible input vals can be optionally
+   //! given. If not, vals with no producers are returned.
+   //
+   // TODO: This doesn't seem to fit with IterVisitor. Should probably be moved
+   // out of the class.
+   static std::vector<Val*> getInputsTo(
+       const std::vector<Val*>& vals,
+       const std::vector<Val*>& inputs = {});
+ };
+
+ /*
+  * Backward visitor calls handle in reverse order from outputs to inputs.
+  * It would be really nice to unify this with IterVisitor, however,
+  * the challenge there is that we specify traversal from outputs towards
+  * inputs because it implicitly provides DCE. However, if users are not
+  * careful, they could miss necessary outputs to do a backward traversal.
+  *
+  * BackwardVisitor checks that all outputs of an Expr are visited before
+  * visiting the Expr. If we don't provide nodes to start from on all backward
+  * paths of those outputs, we will never visit the Expr.
+  *
+  * The first step of BackwardVisitor is to make sure we've specified enough
+  * outputs to guarantee that we will traverse all outputs of all exprs during
+  * the backward traversal. For cases where we don't require visiting all
+  * outputs of some exprs (an example being the `N` output of welford ops),
+  * `must_cover_all_expr_outputs` is added to disable the check, and in this
+  * case the visitor pass needs to be aware that:
+  * 1. Exprs in the `from` list with any output that has a use chain that
+  *    ends with a final consumer `will be` visited.
+  * 2. Vals in the `from` list that don't have a use chain that ends with
+  *    a final consumer `will not be` visited, even though their
+  *    definition expr might be visited. An example is if the `N` output
+  *    of a welford op is unused but other outputs are, the welford op
+  *    will be visited but the `N` output will not.
+  */
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+ class BackwardVisitor : public OptOutDispatch {
+  public:
+   // clang-tidy: cppcoreguidelines-virtual-class-destructor
+   ~BackwardVisitor() override = default;
+
+  protected:
+   BackwardVisitor(bool must_cover_all_expr_outputs = true)
+       : must_cover_all_expr_outputs_(must_cover_all_expr_outputs) {}
+
+   BackwardVisitor(const BackwardVisitor& other) = default;
+   BackwardVisitor& operator=(const BackwardVisitor& other) = default;
+
+   BackwardVisitor(BackwardVisitor&& other) = default;
+   BackwardVisitor& operator=(BackwardVisitor&& other) = default;
+
+   // Functions return nodes in reverse order to be added to the to_visit
+   // queue. These functions will start at outputs and propagate up through the
+   // DAG to inputs based on depth-first traversal. Next could be called on a
+   // node multiple times.
+   virtual std::vector<Statement*> next(Statement* stmt);
+
+   virtual std::vector<Statement*> next(Expr* expr);
+
+   virtual std::vector<Statement*> next(Val* val);
+
+   using OptOutDispatch::handle;
+
+   // This dispatch function is called on every Statement* in topological
+   // order, starting from outputs to inputs.
+   // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
+   virtual void dispatch(Statement* stmt) override;
+
+   // This dispatch function is called on every Expr* in topological order,
+   // starting from outputs to inputs.
+   // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
+   virtual void dispatch(Expr* expr) override;
+
+   // This dispatch function is called on every Val* in topological order,
+   // starting from outputs to inputs.
+   // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
+   virtual void dispatch(Val* val) override;
+
+   // All exprs that need to be visited in this traversal. Labeled in
+   // topological order (size_t).
+   std::unordered_map<Expr*, size_t> traversal_exprs_;
+
+   // The entire stack during traversal. stmt_stack.back().back() is the node
+   // that is being called in handle(). stmt_stack.back() contains siblings
+   // (not guaranteed to be all siblings throughout traversal).
+   // stmt_stack.front() contains the nodes we started with (not guaranteed to
+   // be all outputs throughout traversal).
+   std::deque<std::deque<Statement*>> stmt_stack_;
+
+   // Starts at nodes provided in from, traverses from these nodes to inputs.
+   // Calls handle on all Statement*s in topologically sorted order.
+   // traverseAllPaths = false only calls handle on each Statement* once;
+   // traverseAllPaths = true traverses all paths from nodes in from to inputs
+   // and calls handle on a Statement* for every path from "from" nodes to
+   // inputs.
+   void traverseTo(const std::vector<Val*>& from, bool traverseAllPaths = false);
+
+   bool must_cover_all_expr_outputs_ = true;
+ };
+
+ class DependencyCheck {
+  public:
+   // Returns whether "dependency" is a dependency of "of".
+   NVF_API static bool isDependencyOf(Val* dependency, Val* of);
+
+   // Finds a Val* path from "of" to "dependency". Returns that path.
+   // deque.back() is "of", deque[0] is "dependency" if a chain exists.
+   NVF_API static std::deque<Val*> getSingleDependencyChain(
+       Val* dependency,
+       Val* of);
+
+   // Finds all Val* paths from "of" to "dependency". Returns those paths.
+   // deque[i].back() is "of", and deque[i][0] is "dependency". Returns an
+   // empty deque if no dependency is found.
+   static std::deque<std::deque<Val*>> getAllDependencyChains(
+       Val* dependency,
+       Val* of);
+
+   // Finds all Val* paths from all leaf nodes to "dependency". Returns those
+   // paths. deque[i].back() are leaf nodes, and deque[i][0] is "dependency".
+   // Returns an empty deque if there are no uses of dependency found.
+   static std::deque<std::deque<Val*>> getAllUseChains(Val* dependency);
+
+   // Grab all values that exist between and including the provided
+   // vals. Returned values are topologically ordered and unique.
+   NVF_API static std::vector<Val*> getAllValsBetween(
+       const std::unordered_set<Val*>& dependencies,
+       const std::vector<Val*>& of);
+
+   // Returns all dependent exprs that exist between
+   // the provided vals.
+   static std::vector<Expr*> getAllExprsBetween(
+       const std::unordered_set<Val*>& dependencies,
+       const std::vector<Val*>& of);
+
+   // Return registered outputs of the fusion that are a dependency of any of
+   // the vals in "of".
+   static std::unordered_set<Val*> getAllOutputsOf(
+       const std::unordered_set<Val*>& of);
+
+   // Return all Vals that depend on the given Vals.
+   static std::unordered_set<Val*> getAllDependentVals(
+       const std::unordered_set<Val*>& of);
+ };
+
+ // Expr sort will take a fusion and return a topologically sorted list of
+ // expressions.
+ class StmtSort : public IterVisitor {
+  protected:
+   StmtSort() = default;
+
+   std::vector<Statement*> stmts;
+
+   using IterVisitor::handle;
+
+   void dispatch(Statement* stmt) override;
+
+  public:
+   // If traverse_members is true, it will also extract all member nodes in the
+   // sorted statement list in the fusion, i.e. all IterDomains, extents, and
+   // their associated expressions. Similarly, if traverse_attributes is true,
+   // it will grab all nodes associated as Expr attributes.
+   NVF_API static std::vector<Statement*> getStmts(
+       Fusion* fusion,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Returns ordered Statements required to produce 'to', including 'to'.
+   NVF_API static std::vector<Statement*> getStmtsTo(
+       const std::vector<Val*>& to,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Returns all ordered Statements of a given fusion. Unlike
+   // getStmts, for TensorDomain, all of its iter domains and exprs are
+   // grabbed and returned in a topological order.
+   NVF_API static std::vector<Statement*> getAllStmts(
+       Fusion* fusion,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Returns ordered Statements required to produce 'to', including
+   // 'to'. Unlike getStmtsTo, for TensorDomain, all of its iter domains and
+   // exprs are grabbed and returned in a topological order, regardless of
+   // `traverse_members`.
+   //
+   // The to vals are assumed to be either TensorView or scalar
+   // Val. This assumption could be removed if desired.
+   NVF_API static std::vector<Statement*> getAllStmtsTo(
+       const std::vector<Val*>& to,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Returns ordered Statements required to produce from, including from.
+   // Stops traversal once hitting any Statements in to. Includes Statements in
+   // to.
+   //
+   // Warning: this doesn't necessarily prevent statements before `to` from
+   // being returned. e.g.
+   //   i1 = i0
+   //   i2 = i1
+   //   i3 = i2
+   //   i4 = i3 + i1
+   //   getExprs(fusion, {i4}, {i3})
+   // will return the definition and values {i0, i1, i4}.
+   // i3 is dependent on i1, but since i4 also is, the traversal will go down
+   // the i4->i1->i0 path, even though the i4->i3-//>i2->i1 path is blocked.
+   //
+   // If traverse_members is true, it will also extract all member nodes in the
+   // sorted expr list in the fusion, i.e. all expressions on IterDomains,
+   // extents, etc.
+   NVF_API static std::vector<Statement*> getStmtsBetween(
+       const std::vector<Val*>& from,
+       const std::vector<Val*>& to,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Same as the getStmts version but filters to only return the Expr*s
+   static std::vector<Expr*> getExprs(
+       const Fusion* fusion,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Same as the getStmtsTo version but filters to only return the Expr*s
+   NVF_API static std::vector<Expr*> getExprsTo(
+       const std::vector<Val*>& to,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+
+   // Same as the getStmtsBetween version but filters to only return the Expr*s
+   NVF_API static std::vector<Expr*> getExprsBetween(
+       const std::vector<Val*>& from,
+       const std::vector<Val*>& to,
+       bool traverse_members = false,
+       bool traverse_attributes = false,
+       bool traverse_siblings = false);
+ };
+
+ class InputsOf : public IterVisitor {
+  private:
+   std::unordered_set<Val*> grabbed_inputs;
+   std::vector<Val*> ordered_inputs;
+
+   using IterVisitor::handle;
+
+   void dispatch(Val* v) final;
+
+  public:
+   NVF_API static std::vector<Val*> output(Val* output_);
+   static std::vector<Val*> outputs(const std::vector<Val*>& outputs_);
+ };
+
+ //! This is a generic traversal class that is used to modify a Fusion graph by
+ //! replacing Vals to simplify computation or remove dead code. This differs
+ //! from OptOutMutator, which is built for mutating TensorViews in-place in a
+ //! graph by altering the associated IterDomains, and which does not easily
+ //! handle modifying TensorView definitions and Fusion outputs during
+ //! traversal.
+ //!
+ //! Derived classes should override handle() for relevant Exprs and they
+ //! should make use of registerReplacement() to change the definitions of Vals
+ //! in the graph. Note that if replacements are made using
+ //! registerReplacement(old_val, new_val), then neither new_val nor any new
+ //! Statements produced in creating it will be traversed by this class. Also
+ //! note that any Vals or Exprs that are previously marked dead will not be
+ //! processed by handle().
+ class DeadCodeRemover : BackwardVisitor {
+  public:
+   DeadCodeRemover(Fusion* fusion) : BackwardVisitor(false), fusion_(fusion) {}
+
+   DeadCodeRemover(const DeadCodeRemover& other) = default;
+   DeadCodeRemover& operator=(const DeadCodeRemover& other) = default;
+
+   DeadCodeRemover(DeadCodeRemover&& other) = default;
+   DeadCodeRemover& operator=(DeadCodeRemover&& other) = default;
+
+   //! Instead of traverseTo, run() is the entry point for this class, and we
+   //! always traverse from outputs backward to their inputs.
+   //!
+   //! Returns a bool indicating whether the Fusion was modified or not.
+   bool run();
+
+   inline Fusion* fusion() const {
+     return fusion_;
+   }
+
+  protected:
+   using BackwardVisitor::handle;
+
+   void dispatch(Statement* stmt) override;
+   void dispatch(Expr* expr) override;
+
+   //! We implement this in order to remove dangling TensorViews whose uses are
+   //! all dead. Note that we do not remove other ValTypes like Scalars since
+   //! they might still be used as attributes or members of other objects,
+   //! which is not reflected by Val::uses().
+   void handle(TensorView* tv) override;
+
+   //! Registers a Val for replacement in outputs and in all its uses.
+   //!
+   //! Note that replacement does not occur immediately, but will be done after
+   //! the traversal is completed. This is so that any Val* and Expr* pointers
+   //! may be safely dereferenced during traversal.
+   //!
+   //! The argument old_val is always marked Dead by this method. If old_val is
+   //! a Fusion input, we do not replace it. If old_val's definition is
+   //! non-null and has other outputs which are not dead, we do not remove
+   //! old_val.
+   //!
+   //! Returns whether old_val was registered for removal from the Fusion.
+   bool registerReplacement(Val* old_val, Val* new_val);
+
+   //! Find whether a statement is not marked as live code.
+   inline bool isDead(Statement* stmt) const {
+     return live_statements_.find(stmt) == live_statements_.end();
+   }
+
+   //! Find whether a statement is marked as live code.
+   inline bool isLive(Statement* stmt) const {
+     return !isDead(stmt);
+   }
+
+   //! Check whether all outputs of an expression have been marked dead
+   inline bool allOutputsDead(Expr* expr) const {
+     return std::all_of(
+         expr->outputs().begin(), expr->outputs().end(), [&](Val* outp) {
+           return isDead(outp);
+         });
+   }
+
+   //! Check whether all uses have been marked dead
+   inline bool allUsesDead(Val* val) const {
+     auto fu_it = future_uses_.find(val);
+     if (fu_it != future_uses_.end() && !fu_it->second.empty()) {
+       // Regardless of whether current uses are marked dead, this appears in a
+       // replacement expression, so it has a future live use and we should
+       // keep it.
+       return false;
+     }
+
+     return std::all_of(val->uses().begin(), val->uses().end(), [&](Expr* use) {
+       return isDead(use);
+     });
+   }
+
+  private:
+   //! Removes an Expr* from the Fusion, if possible.
+   //!
+   //! The Expr will _only_ be marked dead and removed if all of its outputs
+   //! are already marked dead. In this case all the outputs will also be
+   //! removed from the Fusion.
+   //!
+   //! Returns whether the Expr was marked dead and removed from the Fusion.
+   bool maybeRemoveExpr(Expr* expr);
+
+   //! Mark a single Statement as being alive.
+   inline void markLive(Statement* stmt) {
+     live_statements_.insert(stmt);
+     if (auto e = dynamic_cast<Expr*>(stmt)) {
+       // Check if this expression is already in uses() for each of its inputs
+       // and if not, record it in future_uses_
+       for (Val* inp : e->inputs()) {
+         if (std::find(inp->uses().begin(), inp->uses().end(), e) ==
+             inp->uses().end()) {
+           auto fu_it = future_uses_.find(inp);
+           if (fu_it == future_uses_.end()) {
+             future_uses_.emplace(inp, std::unordered_set<Expr*>({e}));
+           } else {
+             fu_it->second.insert(e);
+           }
+         }
+       }
+     }
+   }
+
+   //! Ensure that a Statement and its upstream Statements are alive. If it is
+   //! an Expr, ensure all its inputs are alive. If it's a Val with a
+   //! definition, recurse to the definition. Newly-created Statements default
+   //! to being dead, so this method is called when adding a Statement to the
+   //! active path of the Fusion inside registerReplacement.
+   void markLiveRecursive(Statement* stmt);
+
+   //! Mark a single Statement as being dead. This does not remove stmt from
+   //! the Fusion. It is an error to call this on a Fusion output.
+   //!
+   //! Returns true if the statement was previously live, and false otherwise.
+   bool markDead(Statement* stmt);
+
+   //! Register a Val for later removal.
+   void registerRemoval(Val* val);
+
+   //! Register an Expr for later removal.
+   //!
+   //! Note that if any of its outputs are removed, expr will be removed even
+   //! if it is not marked for removal, and all its outputs will have their
+   //! definitions set to nullptr.
+   inline void registerRemoval(Expr* expr) {
+     exprs_to_remove_.push_back(expr);
+   }
+
+   //! All modifications to the Fusion are registered during traversal, then
+   //! committed later by this method. For safety, this should only be run
+   //! after traversing the graph.
+   //!
+   //! Returns a bool indicating whether any modifications were performed.
+   bool modifyFusion() const;
+
+  private:
+   //! The Fusion associated with live_statements_
+   Fusion* fusion_;
+
+   //! Statements are marked dead by removing them from this set
+   std::unordered_set<Statement*> live_statements_;
+
+   //! Vals to be replaced in outputs and with replaceValInExprInputs in all
+   //! uses.
+   std::vector<std::pair<Val*, Val*>> vals_to_replace_;
+
+   //! Statements that will be removed. We remove Vals before Exprs, so we
+   //! track them separately here.
+   std::vector<Val*> vals_to_remove_;
+   std::vector<Expr*> exprs_to_remove_;
+
+   //! This holds additional _future_ uses of each val. val->uses() only
+   //! returns currently live uses, so until we have finalized all
+   //! replacements, new uses will not appear there. The mapping below gets
+   //! populated whenever we mark an expression as live, if that expression is
+   //! not already in inp->uses() for any of its inputs.
+   std::unordered_map<Val*, std::unordered_set<Expr*>> future_uses_;
+ };
+
+ struct IRDefinitions {
+   decltype(auto) operator()(Val* val) const {
+     auto def = val->definition();
+     if (def == nullptr) {
+       return std::vector<Expr*>{};
+     }
+     return std::vector<Expr*>{val->definition()};
+   }
+ };
+
+ struct IRUses {
+   decltype(auto) operator()(Val* val) const {
+     return val->uses();
+   }
+ };
+
+ struct IRInputs {
+   decltype(auto) operator()(Expr* expr) const {
+     return expr->inputs();
+   }
+ };
+
+ struct IROutputs {
+   decltype(auto) operator()(Expr* expr) const {
+     return expr->outputs();
+   }
+ };
+
+ template <>
+ struct GetValType<Expr*> {
+   using type = Val*;
+ };
+
+ class IRBFS
+     : public BFS<Expr*, Val*, IRDefinitions, IRUses, IRInputs, IROutputs> {
+  public:
+   IRBFS(
+       std::vector<NodeType> from_groups,
+       std::vector<NodeType> to_groups,
+       bool require_all_to_visited,
+       Direction allowed_direction = Direction::Undefined)
+       : BFS(IRDefinitions{},
+             IRUses{},
+             IRInputs{},
+             IROutputs{},
+             std::move(from_groups),
+             std::move(to_groups),
+             require_all_to_visited,
+             allowed_direction) {}
+ };
+
+ inline std::vector<Val*> getInputsOfExpr(Expr* expr, Direction dir) {
+   return getInputsOfExpr<Expr*>(expr, dir, IRInputs(), IROutputs());
+ }
+
+ inline std::vector<Val*> getOutputsOfExpr(Expr* expr, Direction dir) {
+   return getOutputsOfExpr<Expr*>(expr, dir, IRInputs(), IROutputs());
+ }
+
+ class IRPermissiveBFS : public BFSWithPermissiveDependence<
+                             Expr*,
+                             Val*,
+                             IRDefinitions,
+                             IRUses,
+                             IRInputs,
+                             IROutputs> {
+  public:
+   IRPermissiveBFS(
+       std::vector<NodeType> from_groups,
+       std::vector<NodeType> to_groups,
+       bool require_all_to_visited,
+       Direction allowed_direction = Direction::Undefined)
+       : BFSWithPermissiveDependence(
+             IRDefinitions{},
+             IRUses{},
+             IRInputs{},
+             IROutputs{},
+             std::move(from_groups),
+             std::move(to_groups),
+             require_all_to_visited,
+             allowed_direction) {}
+ };
+
+ } // namespace nvfuser
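
The IterVisitor comment above describes an inputs-to-outputs traversal where dispatch()/handle() is invoked once per node. The following is a minimal illustrative sketch (not part of the wheel) of how a downstream pass might subclass it, mirroring the dispatch(Val*) override pattern that InputsOf uses in this header. The class name ValCollector, the helper get(), and the include paths are assumptions; construction of the Fusion is not shown.

// Illustrative only; assumes headers are on the include path as installed
// under nvfuser/include/nvfuser in this wheel.
#include <fusion.h>
#include <iter_visitor.h>

#include <vector>

namespace nvfuser {

// Collects every Val reachable from the fusion's terminating outputs, in
// topological (inputs-to-outputs) order.
class ValCollector : private IterVisitor {
 public:
  static std::vector<Val*> get(Fusion* fusion) {
    ValCollector collector;
    // traverse() starts from terminating outputs registered with the fusion.
    collector.traverse(fusion);
    return collector.vals_;
  }

 private:
  using IterVisitor::handle;

  // Called on every Val* in topological order.
  void dispatch(Val* v) override {
    vals_.push_back(v);
  }

  std::vector<Val*> vals_;
};

} // namespace nvfuser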
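DependencyCheck is a purely static query interface. A hedged sketch of the intended call pattern, using only the declarations shown above and following the documented deque ordering (the function name dependencyChain and the assumption that both Vals belong to the same Fusion are mine):

// Illustrative only; not part of the wheel contents.
#include <iter_visitor.h>

#include <deque>

namespace nvfuser {

// Returns the Val* chain from `producer` to `consumer`, or an empty deque if
// `producer` is not actually a dependency of `consumer`.
std::deque<Val*> dependencyChain(Val* producer, Val* consumer) {
  if (!DependencyCheck::isDependencyOf(producer, consumer)) {
    return {};
  }
  // Per the comment above, deque[0] is the dependency (`producer`) and
  // deque.back() is the value it feeds (`consumer`).
  return DependencyCheck::getSingleDependencyChain(producer, consumer);
}

} // namespace nvfuser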
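StmtSort's static helpers are the usual way to obtain a topologically ordered expression list for a whole fusion. A small sketch under the same assumptions (an already-built Fusion, installed include paths; countExprs is a hypothetical helper):

// Illustrative only; not part of the wheel contents.
#include <fusion.h>
#include <iter_visitor.h>

#include <cstddef>
#include <vector>

namespace nvfuser {

// Counts the expressions between the fusion inputs and outputs; when
// include_members is true, IterDomain/extent expressions are included via the
// traverse_members flag documented above.
size_t countExprs(Fusion* fusion, bool include_members) {
  std::vector<Expr*> exprs =
      StmtSort::getExprs(fusion, /*traverse_members=*/include_members);
  return exprs.size();
}

} // namespace nvfuser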
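IterVisitor::getInputsTo and the InputsOf helper both answer the common "what does this value ultimately depend on" question. A hedged one-liner sketch (producersOf is a hypothetical name):

// Illustrative only; not part of the wheel contents.
#include <iter_visitor.h>

#include <vector>

namespace nvfuser {

// Leaf producers (vals with no definition, typically fusion inputs and
// constants) that `out` depends on. InputsOf::output(out) provides a similar
// query via the visitor class above.
std::vector<Val*> producersOf(Val* out) {
  return IterVisitor::getInputsTo({out});
}

} // namespace nvfuser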
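DeadCodeRemover's documentation prescribes the subclassing pattern: override handle() for the Exprs of interest, call registerReplacement(), and let run() commit the changes after the backward traversal. The sketch below follows that pattern but is only an assumption-laden illustration: the class name NoopCastRemover and the simplification condition are mine, and the UnaryOp accessors in(), out(), getUnaryOpType(), the UnaryOpType::Cast enumerator, and Val::dtype() are assumed to be available from ir/all_nodes.h and type.h as shipped in this wheel.

// Illustrative only; not part of the wheel contents.
#include <fusion.h>
#include <ir/all_nodes.h>
#include <iter_visitor.h>

namespace nvfuser {

// Redirects consumers of a cast whose input and output dtypes already match,
// then relies on DeadCodeRemover to erase the now-dead cast expression.
class NoopCastRemover : public DeadCodeRemover {
 public:
  NoopCastRemover(Fusion* fusion) : DeadCodeRemover(fusion) {}

 protected:
  using DeadCodeRemover::handle;

  void handle(UnaryOp* uop) override {
    // Assumed accessors; the condition is a placeholder for a real pass.
    if (uop->getUnaryOpType() == UnaryOpType::Cast &&
        uop->in()->dtype() == uop->out()->dtype()) {
      // Replacement is deferred and committed when run() finishes traversal.
      registerReplacement(uop->out(), uop->in());
    }
  }
};

// Usage sketch: NoopCastRemover(fusion).run() returns true if the Fusion
// was modified.

} // namespace nvfuser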