PyPI - nvfuser-cu121-torch25 - Versions diffs - 0.2.25.dev20250201__cp312-cp312-manylinux_2_28_x86_64.whl - Mend

nvfuser-cu121-torch25 0.2.25.dev20250201__cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242) hide show

nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
nvfuser/__init__.py +618 -0
nvfuser/__init__.pyi +4 -0
nvfuser/contrib/__init__.py +9 -0
nvfuser/contrib/nn/__init__.py +13 -0
nvfuser/contrib/nn/normalization.py +725 -0
nvfuser/include/nvfuser/alias_analysis.h +116 -0
nvfuser/include/nvfuser/bfs.h +929 -0
nvfuser/include/nvfuser/codegen.h +26 -0
nvfuser/include/nvfuser/compute_at.h +28 -0
nvfuser/include/nvfuser/compute_at_map.h +394 -0
nvfuser/include/nvfuser/contiguity.h +351 -0
nvfuser/include/nvfuser/cuda_utils.h +50 -0
nvfuser/include/nvfuser/debug.h +50 -0
nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
nvfuser/include/nvfuser/device_lower/utils.h +382 -0
nvfuser/include/nvfuser/device_lower/validation.h +74 -0
nvfuser/include/nvfuser/disjoint_set.h +556 -0
nvfuser/include/nvfuser/dispatch.h +334 -0
nvfuser/include/nvfuser/driver_api.h +49 -0
nvfuser/include/nvfuser/dynamic_transform.h +316 -0
nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
nvfuser/include/nvfuser/evaluator_common.h +295 -0
nvfuser/include/nvfuser/exceptions.h +283 -0
nvfuser/include/nvfuser/expr_evaluator.h +125 -0
nvfuser/include/nvfuser/expr_simplifier.h +218 -0
nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
nvfuser/include/nvfuser/fusion.h +511 -0
nvfuser/include/nvfuser/fusion_guard.h +37 -0
nvfuser/include/nvfuser/fusion_profiler.h +311 -0
nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
nvfuser/include/nvfuser/global_allocator.h +27 -0
nvfuser/include/nvfuser/grouped_reduction.h +47 -0
nvfuser/include/nvfuser/host_ir/container.h +60 -0
nvfuser/include/nvfuser/host_ir/executor.h +152 -0
nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
nvfuser/include/nvfuser/host_ir/lower.h +35 -0
nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
nvfuser/include/nvfuser/id_model/id_model.h +359 -0
nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
nvfuser/include/nvfuser/id_model/indexing.h +208 -0
nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
nvfuser/include/nvfuser/id_model/schedule.h +54 -0
nvfuser/include/nvfuser/id_model/to_string.h +87 -0
nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
nvfuser/include/nvfuser/id_model/utils.h +176 -0
nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
nvfuser/include/nvfuser/index_compute.h +651 -0
nvfuser/include/nvfuser/instrumentation.h +107 -0
nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
nvfuser/include/nvfuser/ir/builder.h +215 -0
nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
nvfuser/include/nvfuser/ir/cloner.h +185 -0
nvfuser/include/nvfuser/ir/container.h +226 -0
nvfuser/include/nvfuser/ir/graphviz.h +119 -0
nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
nvfuser/include/nvfuser/ir/iostream.h +98 -0
nvfuser/include/nvfuser/ir/printer.h +57 -0
nvfuser/include/nvfuser/ir/utils.h +801 -0
nvfuser/include/nvfuser/iter_visitor.h +661 -0
nvfuser/include/nvfuser/kernel.h +299 -0
nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
nvfuser/include/nvfuser/kernel_ir.h +1457 -0
nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
nvfuser/include/nvfuser/linked_hash_map.h +97 -0
nvfuser/include/nvfuser/logical_domain_map.h +577 -0
nvfuser/include/nvfuser/macros.h +23 -0
nvfuser/include/nvfuser/mma_type.h +257 -0
nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
nvfuser/include/nvfuser/multidevice/communication.h +232 -0
nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
nvfuser/include/nvfuser/multidevice/executor.h +107 -0
nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
nvfuser/include/nvfuser/multidevice/utils.h +187 -0
nvfuser/include/nvfuser/non_divisible_split.h +86 -0
nvfuser/include/nvfuser/opaque_type.h +129 -0
nvfuser/include/nvfuser/ops/alias.h +192 -0
nvfuser/include/nvfuser/ops/all_ops.h +13 -0
nvfuser/include/nvfuser/ops/arith.h +712 -0
nvfuser/include/nvfuser/ops/composite.h +130 -0
nvfuser/include/nvfuser/ops/indexing.h +55 -0
nvfuser/include/nvfuser/ops/normalization.h +263 -0
nvfuser/include/nvfuser/ops/utils.h +127 -0
nvfuser/include/nvfuser/options.h +313 -0
nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
nvfuser/include/nvfuser/polymorphic_value.h +432 -0
nvfuser/include/nvfuser/predicate_compute.h +213 -0
nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
nvfuser/include/nvfuser/scheduler/registry.h +97 -0
nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
nvfuser/include/nvfuser/scheduler/resize.h +41 -0
nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
nvfuser/include/nvfuser/scheduler/utils.h +771 -0
nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
nvfuser/include/nvfuser/serde/factory.h +55 -0
nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
nvfuser/include/nvfuser/serde/utils.h +34 -0
nvfuser/include/nvfuser/struct.inl +127 -0
nvfuser/include/nvfuser/swizzle.h +54 -0
nvfuser/include/nvfuser/sys_utils.h +40 -0
nvfuser/include/nvfuser/tensor_metadata.h +118 -0
nvfuser/include/nvfuser/tma.h +124 -0
nvfuser/include/nvfuser/transform_iter.h +522 -0
nvfuser/include/nvfuser/transform_replay.h +297 -0
nvfuser/include/nvfuser/transform_rfactor.h +33 -0
nvfuser/include/nvfuser/transform_view.h +136 -0
nvfuser/include/nvfuser/type.h +1125 -0
nvfuser/include/nvfuser/type_promotion.h +61 -0
nvfuser/include/nvfuser/utils.h +619 -0
nvfuser/include/nvfuser/val_graph.h +446 -0
nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
nvfuser/include/nvfuser/validator_utils.h +92 -0
nvfuser/include/nvfuser/vectorization_info.h +31 -0
nvfuser/include/nvfuser/visibility.h +21 -0
nvfuser/lib/libnvfuser_codegen.so +0 -0
nvfuser/nvfuser_version.py +69 -0
nvfuser/pytorch_utils.py +184 -0
nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
nvfuser/utils.py +18 -0
nvfuser/version.py +1 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0

nvfuser/include/nvfuser/fusion_segmenter.h ADDED Viewed

@@ -0,0 +1,751 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <debug.h>
+#include <exceptions.h>
+#include <fusion.h>
+#include <ir/base_nodes.h>
+#include <options.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/registry.h>
+#include <scheduler/runtime_info.h>
+#include <utils.h>
+#include <visibility.h>
+#include <deque>
+#include <list>
+#include <unordered_set>
+#include <vector>
+namespace nvfuser {
+class SegmentedGroup;
+class SegmentCandidateFinder;
+// A directed edge of the segmented DAG. It wraps the Val that connects two
+// segmented groups (which are made up of Exprs) and stores raw pointers to
+// both endpoints. Multiple edges can exist between segmented groups.
+struct SegmentedEdge {
+  SegmentedEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val)
+      : from(from), to(to), val(val) {}
+  SegmentedGroup* from; // group the edge leaves
+  SegmentedGroup* to; // group the edge enters
+  Val* val; // value carried along this edge
+  void print() const; // declared here, defined outside this header
+};
+std::ostream& operator<<(std::ostream& os, const SegmentedEdge* edge);
+//! Groups together expressions which create a segmented group.
+//! Can be used to produce fusions (see getFusion()).
+class SegmentedGroup {
+ public:
+  //! Utility struct to represent a group connection:
+  //!  both the group to connect with and the edge
+  //!  to connect through
+  struct NeighborGroup {
+    NeighborGroup(SegmentedGroup* g, SegmentedEdge* e) : group(g), edge(e) {}
+    SegmentedGroup* group;
+    SegmentedEdge* edge;
+  };
+  //! Create an empty group that belongs to the given segmented fusion
+  SegmentedGroup(SegmentedFusion* segmented_fusion)
+      : segmented_fusion_(segmented_fusion) {}
+  //! Create a group that initially holds only the given expression
+  SegmentedGroup(Expr* expr, SegmentedFusion* segmented_fusion)
+      : segmented_fusion_(segmented_fusion) {
+    exprs_.push_back(expr);
+  }
+  //! Create a temporary group to signify a fusion input, which can be
+  //! an original fusion input or a forwarded input with unary-only
+  //! use chains
+  SegmentedGroup(SegmentedFusion* segmented_fusion, bool is_fusion_input)
+      : is_fusion_input_(is_fusion_input),
+        segmented_fusion_(segmented_fusion) {}
+  //! Serialize SegmentedGroup using flatbuffers
+  flatbuffers::Offset<serde::SegmentedGroup> serialize(
+      flatbuffers::FlatBufferBuilder& builder,
+      const std::unordered_map<Val*, int64_t>& vals_map,
+      const std::unordered_map<Expr*, int64_t>& exprs_map,
+      const std::unordered_map<SegmentedGroup*, int64_t>& groups_map,
+      const std::unordered_map<SegmentedEdge*, int64_t>& edges_map) const;
+  //! Deserialize SegmentedGroup using flatbuffers
+  void deserialize(
+      const serde::SegmentedGroup* buffer,
+      const std::deque<Val*>& vals,
+      const std::deque<Expr*>& exprs,
+      const std::vector<SegmentedGroup*>& groups,
+      const std::vector<SegmentedEdge*>& edges);
+  //! Checks if this group takes original fusion's input,
+  //! i.e. whether input_vals is non-empty. Read-only query.
+  bool isInputGroup() const {
+    return !input_vals.empty();
+  }
+  //! Checks if this group is used anywhere in the segmented fusion:
+  //! true when it has any producer edge, consumer edge or output value
+  bool isConnected() const {
+    return !producer_edges.empty() || !consumer_edges.empty() ||
+        !output_vals.empty();
+  }
+  //! Returns the id assigned by the segment pass (-1 while unassigned)
+  int groupId() const {
+    return group_id_;
+  }
+  //! Returns inputs that this group shares with the original fusion
+  const auto& inputs() const {
+    return input_vals;
+  }
+  //! Returns outputs that this group shares with the original fusion
+  const auto& outputs() const {
+    return output_vals;
+  }
+  //! Returns the schedule heuristic associated with this group
+  SchedulerType schedulerType() const {
+    return scheduler_type_;
+  }
+  //! Returns the exprs that make up this group
+  const std::vector<Expr*>& exprs() const {
+    return exprs_;
+  }
+  //! Returns the complete fusion inputs mapped to this segmented group's fusion
+  const auto& getCompleteFusionInputs() const {
+    return original_inputs_in_cloned_fusion_;
+  }
+  //! Returns cloned fusion for this segmented group.
+  //! The clone is requested from makeClonedFusion() only while
+  //! cloned_fusion_ is still null; the returned pointer is non-owning.
+  //! TODO Replace read-only uses of makeFusion with cached getFusion
+  Fusion* getFusion() {
+    // Build cloned fusion for this segmented group
+    if (cloned_fusion_ == nullptr) {
+      makeClonedFusion();
+    }
+    return cloned_fusion_.get();
+  }
+  //! Debug print function
+  void print() const;
+  //! Utility to re-collect the operators included in this
+  //!  segmented group after updating the group boundary.
+  void resetExprList();
+  //! Try to get a scheduler entry for this group with
+  //!  the given runtime info.
+  //! Returns a new scheduler with the same heuristics
+  //!  for this group if possible.
+  //!  Note that the schedule params can be different.
+  //! Returns a nullopt if this group cannot be scheduled
+  //!  with the same heuristics.
+  std::optional<std::unique_ptr<HeuristicParams>> getMaybeHeuristicParams(
+      SchedulerRuntimeInfo& runtime_info);
+  //! Query if this is a group for a fusion input
+  bool isFusionInputGroup() const;
+ public:
+  //! "Ancestor nodes", towards inputs of segmentedDAG
+  std::vector<SegmentedEdge*> producer_edges;
+  //! "Descendent nodes", towards outputs of segmentedDAG
+  std::vector<SegmentedEdge*> consumer_edges;
+  //! Composite Fusion inputs in this group
+  std::vector<Val*> input_vals;
+  //! Composite Fusion outputs in this group
+  std::vector<Val*> output_vals;
+  //! Returns the merged_ flag ("Has this node been merged?")
+  bool isMerged() const {
+    return merged_;
+  }
+ private:
+  friend class SegmentCandidateFinder;
+  friend class SegmentedFusion;
+  friend class FusionKernelRuntime;
+  friend class TranslateApplicableWelford;
+  //! unique identifier of group in the segmented fusion
+  int group_id_ = -1;
+  //! The scheduler to use for compiling this group
+  SchedulerType scheduler_type_ = SchedulerType::None;
+  //! Exprs that make up the group
+  std::vector<Expr*> exprs_;
+  //! Maximum path distance from an input segmented group required for
+  //! Theorem 4.2
+  int level_ = -1;
+  //! traversal marker, has this node already been processed
+  bool visited_ = false;
+  //! Did we select another group to merge with
+  SegmentedGroup* merge_with_ = nullptr;
+  //! if we selected another group to merge, which edge is to be contracted
+  SegmentedEdge* merge_through_ = nullptr;
+  //! Has this node been merged?
+  bool merged_ = false;
+  //! Is a group for a fusion input?
+  bool is_fusion_input_ = false;
+ private:
+  //! Utility to convert edge vector to value vector
+  std::vector<Val*> edgesToVals(const std::vector<SegmentedEdge*>& se_v);
+  //! Reset method to call at beginning of each
+  //!  merge node iteration
+  void clearTraversalInfo();
+  //! To be called at the very end of segment fusion
+  //!  no more segment merging should be done beyond
+  void finalize();
+  //! Make the cloned fusion for this segmented group
+  void makeClonedFusion();
+  //! Return all segmented groups connected with *this
+  std::vector<SegmentedGroup*> getNeighbors();
+  //! TODO: May want to sort this based on size of connections between this and
+  //! neighbors as well as if the connection is an output of the fusion (has to
+  //! be saved to gmem anyways)
+  std::vector<NeighborGroup> getNeighborGroups();
+  //! Look at all neighbors of this and return who this could merge with based
+  //! on level values of this, neighbors, and merged neighbors of neighbors
+  std::vector<NeighborGroup> getMergeCandidates();
+  //! Assign scheduler type to this group
+  void setSchedulerType(SchedulerType scheduler_type) {
+    scheduler_type_ = scheduler_type;
+  }
+  //! Assign Id for this group; checks that no id has been assigned yet
+  void setID(int id) {
+    NVF_ERROR(group_id_ == -1);
+    group_id_ = id;
+  }
+  //! SegmentedFusion this group belongs to
+  SegmentedFusion* segmented_fusion_;
+  //! The cloned segmented fusion
+  std::unique_ptr<Fusion> cloned_fusion_;
+  //! These are the complete fusion's inputs mapped to the cloned fusion
+  std::vector<Val*> original_inputs_in_cloned_fusion_;
+};
+std::ostream& operator<<(std::ostream& os, const SegmentedGroup* group);
+//! Exported interface for representing the segmented fusion graph.
+//!   This class owns the segmented groups and edges (through Impl).
+class SegmentedFusion {
+ public:
+  explicit SegmentedFusion(std::unique_ptr<Fusion> fusion);
+  //! Factory function for the un-segmented case, directly
+  //!  constructs a "SegmentedFusion", with the given Fusion
+  //!  as the only group.
+  static std::unique_ptr<SegmentedFusion> fromCompleteFusion(
+      std::unique_ptr<Fusion> fusion,
+      SchedulerType scheduler_type,
+      const KernelArgumentHolder& runtime_inputs);
+  //! Is the fusion segmented?
+  bool isSegmented() const {
+    return !groups_.empty(); // true as soon as the group list is non-empty
+  }
+  std::vector<SegmentedGroup*>& groups() { // mutable access to groups_
+    return groups_;
+  }
+  const std::vector<SegmentedGroup*>& groups() const {
+    return groups_;
+  }
+  std::vector<SegmentedEdge*>& edges() { // mutable access to edges_
+    return edges_;
+  }
+  const std::vector<SegmentedGroup*>& cgroups() const { // read-only groups_
+    return groups_;
+  }
+  const std::vector<SegmentedEdge*>& cedges() const { // read-only edges_
+    return edges_;
+  }
+  //! Returns the original un-segmented fusion (non-owning pointer)
+  Fusion* completeFusion() const {
+    return complete_fusion_.get();
+  }
+  const auto& inputs() const { // inputs of the complete fusion
+    return complete_fusion_->inputs();
+  }
+  const auto& outputs() const { // outputs of the complete fusion
+    return complete_fusion_->outputs();
+  }
+  //! Get the fusion for the segmented group and return the IrCloner used to
+  //! clone the complete fusion
+  std::pair<IrCloner, std::unique_ptr<Fusion>> makeFusion(SegmentedGroup* sg);
+  //! Make a heuristics entry for a group and parameters
+  std::unique_ptr<HeuristicParams> makeInitialHeuristicParams(
+      SegmentedGroup* sg,
+      SchedulerRuntimeInfo& runtime_info);
+  //! Debug drawing for graphviz
+  void draw();
+  //! Debug print for segmented fusions
+  void print() const;
+  //! API for adding groups
+  SegmentedGroup* newGroup();
+  //! API shortcut for adding a singleton group
+  SegmentedGroup* newGroup(Expr* expr);
+  //! API shortcut for adding a new group for a fusion input
+  SegmentedGroup* newFusionInputGroup();
+  //! API for adding edges
+  SegmentedEdge* newEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val);
+  HeuristicDataCache* getCachedHeuristicDataFor(SegmentedGroup* group);
+  //! Lower FP precision of inputs and outputs specified by the given
+  //! edges.
+  //!
+  //! This function is used in two scenarios. One is when testing a
+  //! merge of groups during the segmentation time. At that time,
+  //! those groups are not yet merged, but we want to consider them as
+  //! merged and see if there's a valid scheduler. So, we treat the
+  //! groups given by groups_to_merge as a single group and insert
+  //! cast ops into the group. No other group is modified unless it
+  //! has an edge to any of the merged groups.
+  //!
+  //! The second scenario is when inserting cast ops to a whole
+  //! segmented fusion. All groups are considered separate groups with
+  //! no (temporary) merging. Each edge is considered a potential
+  //! place to insert cast. In this case, groups_to_merge should be
+  //! empty.
+  std::vector<SegmentedEdge*> castInputOutputToLowerPrecision(
+      const std::vector<SegmentedEdge*>& edges,
+      const std::vector<SegmentedGroup*>& groups_to_merge = {});
+  //! Revert the changes made by castInputOutputToLowerPrecision to the given
+  //! edges
+  void revertInputOutputPrecisionChanges(
+      const std::vector<SegmentedEdge*>& edges);
+  //! Grab edges with val
+  std::vector<SegmentedEdge*> getEdgesByVal(Val* val) const;
+  //! Make sure it's a DAG and optionally disjoint
+  void validate(bool require_disjoint = true) const;
+  //! Same as validate but only enabled when NDEBUG is undefined
+  void validateIfDebug(bool require_disjoint = true) const;
+  //! Serialize SegmentedFusion using flatbuffers
+  flatbuffers::Offset<serde::SegmentedFusion> serialize(
+      flatbuffers::FlatBufferBuilder& builder) const;
+  //! Deserialize SegmentedFusion using flatbuffers
+  void deserialize(const serde::SegmentedFusion* buffer);
+ private:
+  void validateDAG() const; // NOTE(review): presumably the DAG half of validate() - confirm
+  void validateDisjoint() const; // NOTE(review): presumably the disjointness half of validate() - confirm
+  //! Serialize SegmentedEdge using flatbuffers
+  flatbuffers::Offset<serde::SegmentedEdge> serialize(
+      flatbuffers::FlatBufferBuilder& builder,
+      const nvfuser::SegmentedEdge* edge,
+      const std::unordered_map<Val*, int64_t>& vals_map,
+      const std::unordered_map<SegmentedGroup*, int64_t>& groups_map) const;
+  //! Deserialize SegmentedEdge using flatbuffers; the edge is returned by value
+  nvfuser::SegmentedEdge deserialize(
+      const serde::SegmentedEdge* buffer,
+      const std::deque<Val*>& vals);
+ private:
+  //! Unique name for segmented fusion
+  size_t segmented_fusion_name_;
+  //! States representing segmentation (raw pointers; storage lives in Impl)
+  std::vector<SegmentedEdge*> edges_;
+  std::vector<SegmentedGroup*> groups_;
+  //! Owning object to explicitly manage groups and edges
+  class Impl {
+   public:
+    explicit Impl(SegmentedFusion* sf) : owning_fusion_(sf) {}
+    SegmentedGroup* makeGroup();
+    SegmentedGroup* makeGroup(Expr*);
+    SegmentedGroup* makeFusionInputGroup();
+    SegmentedEdge* makeEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val);
+    void cleanUnused();
+    std::unordered_map<SegmentedGroup*, int64_t> groups_map() const;
+    std::unordered_map<SegmentedEdge*, int64_t> edges_map() const;
+   private:
+    using GroupPtr = std::unique_ptr<SegmentedGroup>;
+    using EdgePtr = std::unique_ptr<SegmentedEdge>;
+    std::vector<GroupPtr> groups_; // owning storage for groups
+    std::vector<EdgePtr> edges_; // owning storage for edges
+    SegmentedFusion* owning_fusion_; // back-pointer given to the constructor
+  };
+  Impl impl_;
+  //! A copy of the original full fusion
+  std::unique_ptr<Fusion> complete_fusion_;
+  //! A set of intermediate tensors that need to be cast to fp16
+  std::unordered_set<TensorView*> force_fp16_tv_set_;
+  DataType force_half_precision_type_;
+  //! Static traversal information to be used for fast heuristics lookup
+  std::unordered_map<SegmentedGroup*, std::unique_ptr<HeuristicDataCache>>
+      heuristic_data_cache_;
+  //! The number of values in fusion after constructing segmented fusion.
+  //! Used for checking state during deserialization.
+  size_t initial_vals_size_;
+  //! The number of expressions in fusion after constructing segmented fusion.
+  //! Used for checking state during deserialization.
+  size_t initial_exprs_size_;
+  // TODO: this class needs cleanup
+ protected:
+  friend class SegmentCandidateFinder;
+  //! Cleanup function to be called at the end of the fusion
+  //!  segment pass
+  void finalize();
+  //! Collect all the intermediate tensors between segmented
+  //!  groups that will cast to fp16
+  void annotateFP16IntermediateTensors();
+  //! Keep heuristic checking intermediate data
+  void setCachedHeuristicDataFor(
+      SegmentedGroup* group,
+      std::unique_ptr<HeuristicDataCache> data);
+  //! Utility to give unique name for each segmented fusion
+  static size_t segmentedFusionName() {
+    static size_t counter = 0; // plain (non-atomic) function-local counter
+    return counter++; // post-increment: the first name handed out is 0
+  }
+};
+std::ostream& operator<<(
+    std::ostream& os,
+    const SegmentedFusion* segmented_fusion);
+//! Base class for segmenter analyses.
+//!  Only this minimal definition is provided in the header, so that
+//!  a unique_ptr can use this base class;
+//!  the actual implementations of the analyses are in the .cpp files.
+//! TODO: In the next refactor PR, the segment candidate finder should be
+//!  put in the .cpp file completely, since the API doesn't require these
+//!  details.
+class SegmenterAnalysis : public PolymorphicBase {};
+class GroupDependencyAnalysis;
+// Manual node merging passes
+class CombineReductions;
+class MergeUpAndDownCast;
+//! Options to configure/debug the candidate finder; all run_* flags default to true
+struct SegmentCandidateFinderOptions {
+  bool run_translate_welford = true; // NOTE(review): presumably gates Welford translation - confirm in the .cpp
+  bool run_combine_reductions = true; // NOTE(review): presumably gates the CombineReductions pass - confirm
+  bool run_herrmann_merge = true; // NOTE(review): presumably gates the Herrmann et al. based merge - confirm
+  bool run_final_merge = true; // NOTE(review): presumably gates the final clean-up merge - confirm
+  bool only_segment_resharding_exprs = false; // off by default; exact effect is not visible in this header
+};
+//!  SegmentCandidateFinder
+//!    Responsible for going through DAG and proposing things we could try to
+//!    fuse together, calls "canGenerateCode" on these proposed segments to see
+//!    if they are valid and we can generate code for them.
+//!  FusionSegment
+//!    A group of exprs that are segmented together
+//!  FusionSegmentConnections
+//!    Holds vals and what they connect. In other words it's a val that is an
+//!    output of a FusionSegment "from" and an input of FusionSegment "to".
+//!    There's nothing preventing a val from being between segments twice.
+//!    TODO: make sure there's nothing wrong with segmentation on nodes that
+//!    have the same value input twice. i.e. (B = A*A)
+//! Selecting segments to propose is based on Theorem 4.2 in the paper below,
+//! which ensures that the segmented graph remains a DAG (assumes Fusion is
+//! already a DAG). The segmentation code relies on assumptions of DAG-ness
+//! during segmentation, meaning proposed merging of groups must maintain the
+//! DAG property of the graph.
+//!
+//! Julien Herrmann, Yusuf Özkaya, Bora Uçar, Kamer Kaya, Umit Catalyurek.
+//! Multilevel Algorithms for Acyclic Partitioning of Directed Acyclic Graphs.
+//! SIAM Journal on Scientific Computing, Society for Industrial and Applied
+//! Mathematics, 2019, 41 (4), pp.A2117-A2145. doi:10.1137/18M1176865.
+//! HAL: hal-02306566
+class SegmentCandidateFinder {
+ public:
+  // Perform segmentation on a copy of the given fusion (passed as const Fusion*)
+  static std::unique_ptr<SegmentedFusion> segment(
+      const Fusion* fusion,
+      const KernelArgumentHolder* inputs,
+      SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions());
+  // Perform segmentation on and take ownership of the given fusion
+  static std::unique_ptr<SegmentedFusion> segment(
+      std::unique_ptr<Fusion> fusion,
+      const KernelArgumentHolder* inputs,
+      SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions());
+  static std::unique_ptr<SegmentedFusion> segment(
+      std::unique_ptr<Fusion> fusion,
+      const KernelArgumentHolder* inputs,
+      SchedulerRuntimeInfo& runtime_info); // Overload taking ownership of `fusion` plus a caller-provided runtime_info (no options argument)
+  static bool hasSegmentHints(Fusion* fusion); // presumably reports whether `fusion` contains segmenter hints -- confirm against the definition
+  NVF_API static bool translateWelfordInFusion(
+      Fusion* fusion,
+      const KernelArgumentHolder& runtime_inputs); // NOTE(review): meaning of the bool result is not visible in this header -- see the definition
+ private:
+  // Construct a finder that performs segmentation on, and takes ownership of, the given fusion
+  NVF_API SegmentCandidateFinder(
+      std::unique_ptr<Fusion> fusion,
+      const KernelArgumentHolder* inputs,
+      SegmentCandidateFinderOptions options);
+  void resetTraversal();
+  void resetLevels();
+  SegmentedGroup* mergeNodes();
+  bool codeGenSupportedMerge(SegmentedGroup* group1, SegmentedGroup* group2);
+  void buildInitialSegments();
+  void findSegments();
+  //! Look through `candidates` for a group that can be merged with
+  //! `group` and, if one is found, set the pair up to be merged. When no
+  //! candidates are given (the default is an empty vector),
+  //! SegmentedGroup::getMergeCandidates is used to obtain the candidates.
+  void trySetUpMerge(
+      SegmentedGroup* group,
+      std::vector<SegmentedGroup::NeighborGroup> candidates = {});
+  std::unordered_set<SegmentedEdge*> disconnectGroup(SegmentedGroup* group);
+  std::vector<SegmentedGroup*>& groups() { // Groups of segmented_fusion_; NVF_ERROR first checks that it is non-null
+    NVF_ERROR(
+        segmented_fusion_ != nullptr, "Segment finder not owinging any fusion");
+    return segmented_fusion_->groups();
+  }
+  std::vector<SegmentedEdge*>& edges() { // Edges of segmented_fusion_; NVF_ERROR first checks that it is non-null
+    NVF_ERROR(
+        segmented_fusion_ != nullptr, "Segment finder not owinging any fusion");
+    return segmented_fusion_->edges();
+  }
+  Fusion* completeFusion() { // Forwards to segmented_fusion_->completeFusion(); NVF_ERROR first checks for non-null
+    NVF_ERROR(
+        segmented_fusion_ != nullptr, "Segment finder not owinging any fusion");
+    return segmented_fusion_->completeFusion();
+  }
+  SchedulerRuntimeInfo& runtimeInfo() { // Requires runtime_info_ to hold a value; it is a std::optional and may be empty
+    NVF_ERROR(runtime_info_.has_value(), "needs runtime info");
+    return runtime_info_.value();
+  }
+  ExpressionEvaluator& expressionEvaluator() { // Evaluator obtained from runtimeInfo(), so the same precondition applies
+    return runtimeInfo().expressionEvaluator();
+  }
+  //! Additional merging iteration that cleans up the rest of
+  //!  the merging opportunities.
+  //!  Herrmann et al. is a fast and safe algorithm for finding merge candidates
+  //!  but can become too conservative in our use cases because we place
+  //!  additional qualifiers on valid merges other than having to generate DAGs,
+  //!  i.e. canSchedule. So we need a brute-force final merging iteration as a
+  //!  clean-up pass. Cost isn't expected to be high since the graph at this
+  //!  stage is already quite merged. For an example, cf. test_gpu.cpp:
+  //!  FusionDAGMerging_CUDA
+  //!
+  //!  This merging algorithm is based on Theorem 4.1 of Herrmann et al.:
+  //!   to check if a producer-consumer pair can be merged into one group,
+  //!   it's enough to check if any other consumer of the producer also
+  //!   produces the consumer.
+  void finalMerge();
+  //! Duplicate and add all exprs producing the scalar values
+  //!  used in `group`
+  void resolveScalarsInGroup(SegmentedGroup* group);
+  //! Duplicate and add all exprs from fusion inputs to `forwarded_input` into
+  //! the group, to complete inputs. These expressions are simply unary ops of
+  //! inputs that we want to recompute for each segment, instead of computing
+  //! and producing a segmented val. For example if we have:
+  //!
+  //!   tv1 = tv0 * 2;
+  //!   tv3 = tv1 + tv2;
+  //!   tv4 = tv1 + tv5;
+  //!
+  //! If we segmented on tv1, we would be producing an output for tv1 for 2
+  //! groups that have tv3 or tv4; instead we could easily recompute tv1 from
+  //! tv0.
+  void resolveNonscalarForwardedInput(Val* forwarded_input);
+  void resolveForwardedInputs();
+  // Creates the input group that ends at `forwarded_input`, i.e., the region
+  // between fusion inputs and `forwarded_input`.
+  SegmentedGroup* createInputGroup(Val* forwarded_input);
+  //! Remove all scalar edges in group
+  //!  (TODO: need to structure this better so we don't have to do this)
+  void removeScalarEdges();
+  //! Utility function to merge a vector of groups in one step;
+  //!  the DAG condition must be checked before using this method
+  SegmentedGroup* mergeAllGivenGroups(
+      const std::vector<SegmentedGroup*>& groups);
+  //! Utility to remove a group and corresponding edges
+  //!  TODO: remove inline versions of this as much as possible
+  void eraseGroups(std::unordered_set<SegmentedGroup*>& groups_to_erase);
+  void finalize();
+  //! Return the resulting SchedulerType corresponding to the merged
+  //!  group built by merging the two groups connected by edge
+  SchedulerType deriveSchedulerType(SegmentedGroup* edge); // NOTE(review): parameter is a SegmentedGroup* although it is named `edge` -- confirm intent
+  GroupDependencyAnalysis* getGroupDependency(); // NOTE(review): group_dependency_ is declared as unique_ptr<SegmenterAnalysis>; relation between the two types is not visible here
+  //! Find all expressions that are simply unary ops from inputs.
+  //! Don't segment these as they're easy targets for
+  //! recomputation. Only go until the first
+  //! expression that has multiple uses.
+  //!
+  //! The ending tensors, or the forwarded tensors, are considered
+  //! fusion inputs for the sake of segmentation, and the expressions
+  //! between the real inputs and the forwarded tensors are excluded
+  //! from the segmentation steps until the finalization, at which
+  //! point they are simply prepended to each final segment using the
+  //! forwarded inputs.
+  void forwardInputs();
+  void cleanupForwardedInputs();
+  //! Query if a val is a fusion input or a forwarded input, i.e. whether it is listed in forwarded_fusion_inputs_
+  bool isFusionInput(Val* val) const {
+    return std::find(
+               forwarded_fusion_inputs_.begin(),
+               forwarded_fusion_inputs_.end(),
+               val) != forwarded_fusion_inputs_.end();
+  };
+ protected:
+  //! These are the merge node heuristic passes; they should
+  //!  eventually have a dedicated interface
+  //!  instead of us continuing to add friends
+  friend class CombineReductions;
+  friend class MergeUpAndDownCast;
+  //! Options to configure and debug the segmentation process
+  SegmentCandidateFinderOptions options_;
+  std::deque<SegmentedGroup*> to_visit_;
+  std::vector<SegmentedGroup*> next_to_visit_;
+  std::unordered_set<SegmentedGroup*> clean_up_groups_;
+  std::unordered_set<SegmentedEdge*> clean_up_edges_;
+  std::vector<SegmentedGroup*> to_merge_;
+  std::unique_ptr<SegmentedFusion> segmented_fusion_;
+  std::unique_ptr<SegmenterAnalysis> group_dependency_;
+  //! List of vals to treat as complete fusion inputs for segmentation
+  std::vector<Val*> forwarded_fusion_inputs_;
+  //! Keep track of complete fusion input use
+  std::unordered_map<Val*, SegmentedGroup*> input2group_;
+  // Expressions to exclude from segmentation because they're just derived from
+  // unary ops on inputs to the complete fusion
+  VectorOfUniqueEntries<Expr*> excluded_inp_unary_exprs_;
+  // This is allowed to be empty (std::nullopt) in the multidevice case where the
+  // segmenter is used for breaking the fusion into compute and communication segments
+  std::optional<SchedulerRuntimeInfo> runtime_info_;
+  //! Note:
+  //!  Segmenter should eventually rely only on runtime_info_ for
+  //!  safe caching. runtime_inputs_ is only used in translateWelford
+  //!  to initialize expression evaluators on copies of the original
+  //!  fusion, which doesn't use any un-cached info and is safe.
+  //!
+  //!  Directly using runtime_inputs_ in other cases is in general
+  //!   risky.
+  //!
+  //!  To get rid of runtime_inputs_ we need mechanisms
+  //!  to copy expression evaluator values from a fusion
+  //!  to a copy, or even better to a copy of a
+  //!  sub-graph of the original fusion.
+  //! TODO:
+  //!  implement the expression evaluator transfer and
+  //!  remove runtime_inputs_ in a follow-up.
+  const KernelArgumentHolder* runtime_inputs_; // Raw pointer with no in-class initializer -- presumably set by the constructor; confirm
+};
+// TODO: Make these member functions of their respective classes instead of free functions
+std::string toString(const SegmentedGroup* group);
+std::string toString(const SegmentedEdge* edge);
+std::string toString(const SegmentedFusion* segmented_fusion);
+std::string toString(const SegmentCandidateFinderOptions& segment_options);
+} // namespace nvfuser