nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl
- nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
- nvfuser/__init__.py +618 -0
- nvfuser/__init__.pyi +4 -0
- nvfuser/contrib/__init__.py +9 -0
- nvfuser/contrib/nn/__init__.py +13 -0
- nvfuser/contrib/nn/normalization.py +725 -0
- nvfuser/include/nvfuser/alias_analysis.h +116 -0
- nvfuser/include/nvfuser/bfs.h +929 -0
- nvfuser/include/nvfuser/codegen.h +26 -0
- nvfuser/include/nvfuser/compute_at.h +28 -0
- nvfuser/include/nvfuser/compute_at_map.h +394 -0
- nvfuser/include/nvfuser/contiguity.h +351 -0
- nvfuser/include/nvfuser/cuda_utils.h +50 -0
- nvfuser/include/nvfuser/debug.h +50 -0
- nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
- nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
- nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
- nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
- nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
- nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
- nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
- nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
- nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
- nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
- nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
- nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
- nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
- nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
- nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
- nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
- nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
- nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
- nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
- nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
- nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
- nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
- nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
- nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
- nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
- nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
- nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
- nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
- nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
- nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
- nvfuser/include/nvfuser/device_lower/utils.h +382 -0
- nvfuser/include/nvfuser/device_lower/validation.h +74 -0
- nvfuser/include/nvfuser/disjoint_set.h +556 -0
- nvfuser/include/nvfuser/dispatch.h +334 -0
- nvfuser/include/nvfuser/driver_api.h +49 -0
- nvfuser/include/nvfuser/dynamic_transform.h +316 -0
- nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
- nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
- nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
- nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
- nvfuser/include/nvfuser/evaluator_common.h +295 -0
- nvfuser/include/nvfuser/exceptions.h +283 -0
- nvfuser/include/nvfuser/expr_evaluator.h +125 -0
- nvfuser/include/nvfuser/expr_simplifier.h +218 -0
- nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
- nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
- nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
- nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
- nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
- nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
- nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
- nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
- nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
- nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
- nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
- nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
- nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
- nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
- nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
- nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
- nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
- nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
- nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
- nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
- nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
- nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
- nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
- nvfuser/include/nvfuser/fusion.h +511 -0
- nvfuser/include/nvfuser/fusion_guard.h +37 -0
- nvfuser/include/nvfuser/fusion_profiler.h +311 -0
- nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
- nvfuser/include/nvfuser/global_allocator.h +27 -0
- nvfuser/include/nvfuser/grouped_reduction.h +47 -0
- nvfuser/include/nvfuser/host_ir/container.h +60 -0
- nvfuser/include/nvfuser/host_ir/executor.h +152 -0
- nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
- nvfuser/include/nvfuser/host_ir/lower.h +35 -0
- nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
- nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
- nvfuser/include/nvfuser/id_model/id_model.h +359 -0
- nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
- nvfuser/include/nvfuser/id_model/indexing.h +208 -0
- nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
- nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
- nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
- nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
- nvfuser/include/nvfuser/id_model/schedule.h +54 -0
- nvfuser/include/nvfuser/id_model/to_string.h +87 -0
- nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
- nvfuser/include/nvfuser/id_model/utils.h +176 -0
- nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
- nvfuser/include/nvfuser/index_compute.h +651 -0
- nvfuser/include/nvfuser/instrumentation.h +107 -0
- nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
- nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
- nvfuser/include/nvfuser/ir/builder.h +215 -0
- nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
- nvfuser/include/nvfuser/ir/cloner.h +185 -0
- nvfuser/include/nvfuser/ir/container.h +226 -0
- nvfuser/include/nvfuser/ir/graphviz.h +119 -0
- nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
- nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
- nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
- nvfuser/include/nvfuser/ir/iostream.h +98 -0
- nvfuser/include/nvfuser/ir/printer.h +57 -0
- nvfuser/include/nvfuser/ir/utils.h +801 -0
- nvfuser/include/nvfuser/iter_visitor.h +661 -0
- nvfuser/include/nvfuser/kernel.h +299 -0
- nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
- nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
- nvfuser/include/nvfuser/kernel_ir.h +1457 -0
- nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
- nvfuser/include/nvfuser/linked_hash_map.h +97 -0
- nvfuser/include/nvfuser/logical_domain_map.h +577 -0
- nvfuser/include/nvfuser/macros.h +23 -0
- nvfuser/include/nvfuser/mma_type.h +257 -0
- nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
- nvfuser/include/nvfuser/multidevice/communication.h +232 -0
- nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
- nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
- nvfuser/include/nvfuser/multidevice/executor.h +107 -0
- nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
- nvfuser/include/nvfuser/multidevice/utils.h +187 -0
- nvfuser/include/nvfuser/non_divisible_split.h +86 -0
- nvfuser/include/nvfuser/opaque_type.h +129 -0
- nvfuser/include/nvfuser/ops/alias.h +192 -0
- nvfuser/include/nvfuser/ops/all_ops.h +13 -0
- nvfuser/include/nvfuser/ops/arith.h +712 -0
- nvfuser/include/nvfuser/ops/composite.h +130 -0
- nvfuser/include/nvfuser/ops/indexing.h +55 -0
- nvfuser/include/nvfuser/ops/normalization.h +263 -0
- nvfuser/include/nvfuser/ops/utils.h +127 -0
- nvfuser/include/nvfuser/options.h +313 -0
- nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
- nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
- nvfuser/include/nvfuser/polymorphic_value.h +432 -0
- nvfuser/include/nvfuser/predicate_compute.h +213 -0
- nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
- nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
- nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
- nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
- nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
- nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
- nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
- nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
- nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
- nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
- nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
- nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
- nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
- nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
- nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
- nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
- nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
- nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
- nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
- nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
- nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
- nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
- nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
- nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
- nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
- nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
- nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
- nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
- nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
- nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
- nvfuser/include/nvfuser/scheduler/registry.h +97 -0
- nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
- nvfuser/include/nvfuser/scheduler/resize.h +41 -0
- nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
- nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
- nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
- nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
- nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
- nvfuser/include/nvfuser/scheduler/utils.h +771 -0
- nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
- nvfuser/include/nvfuser/serde/factory.h +55 -0
- nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
- nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
- nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
- nvfuser/include/nvfuser/serde/utils.h +34 -0
- nvfuser/include/nvfuser/struct.inl +127 -0
- nvfuser/include/nvfuser/swizzle.h +54 -0
- nvfuser/include/nvfuser/sys_utils.h +40 -0
- nvfuser/include/nvfuser/tensor_metadata.h +118 -0
- nvfuser/include/nvfuser/tma.h +124 -0
- nvfuser/include/nvfuser/transform_iter.h +522 -0
- nvfuser/include/nvfuser/transform_replay.h +297 -0
- nvfuser/include/nvfuser/transform_rfactor.h +33 -0
- nvfuser/include/nvfuser/transform_view.h +136 -0
- nvfuser/include/nvfuser/type.h +1125 -0
- nvfuser/include/nvfuser/type_promotion.h +61 -0
- nvfuser/include/nvfuser/utils.h +619 -0
- nvfuser/include/nvfuser/val_graph.h +446 -0
- nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
- nvfuser/include/nvfuser/validator_utils.h +92 -0
- nvfuser/include/nvfuser/vectorization_info.h +31 -0
- nvfuser/include/nvfuser/visibility.h +21 -0
- nvfuser/lib/libnvfuser_codegen.so +0 -0
- nvfuser/nvfuser_version.py +69 -0
- nvfuser/pytorch_utils.py +184 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
- nvfuser/utils.py +18 -0
- nvfuser/version.py +1 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
- nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h
@@ -0,0 +1,158 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+#pragma once
+
+#include <exceptions.h>
+#include <visibility.h>
+
+#include <device_lower/utils.h>
+#include <ir/all_nodes.h>
+#include <parallel_type_bitmap.h>
+
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+namespace nvfuser {
+
+//! Maps TensorViews to a { ParallelTypeBitmap, SourceMap } pair
+//!
+//! Map from TensorView to a bit set representing <BIDx, BIDy, BIDz, TIDx,
+//! TIDy, TIDz>. If any dependency of a TV had a parallelized reduction, we
+//! track it here. This is used for predicate generation to prevent
+//! parallelization on that axis. This is important if we have a reduction on,
+//! for example, TIDx, as the reduced value is only valid on threadIdx.x == 0,
+//! so if we use that value later in the kernel we need that predicate.
+//! If we follow a reduction parallelized on TIDx with a broadcast on TIDx we
+//! no longer need the predicate and can reset the bit accordingly.
+//!
+//! In addition, if a parallel thread type is not used, it is
+//! redundant to use all threads/blocks. That isn't a problem
+//! generally, although it can be inefficient, but when an aliased smem
+//! buffer is used as an output, redundant writes can be invalid (see issue
+//! #1110). PredicateInfo::redundant_types tracks which parallel types
+//! are redundant for each tensor and is used to let only one
+//! thread/block of a redundant type execute the expression for a
+//! tensor.
+class ThreadPredicateMap {
+ public:
+  using SourceMap =
+      std::unordered_map<ParallelType, std::unordered_set<const TensorView*>>;
+
+  //! Thread predicate information for each tensor
+  struct PredicateInfo {
+    // Parallel types where only one thread/block is valid.
+    ParallelTypeBitmap limited_types;
+    // Parallel types where only one thread/block is enough.
+    ParallelTypeBitmap redundant_types;
+
+    // When a loop domain of a tensor stored in global memory
+    // is merged from a concretized broadcast logical domain, the broadcast
+    // logical domains should be skipped when writing to global memory.
+    // broadcast_ld_indices_map maps a parallel type to a list of indices
+    // of the broadcast logical domains. The write to global memory is needed
+    // only when the index equals 0.
+    std::unordered_map<ParallelType, std::vector<Val*>>
+        broadcast_ld_indices_map;
+
+    // Tracking use chain of redundant writes:
+    // [Redundant use chain]
+    // A parallel type is a `redundant_consumer_type` only
+    //  if all of its propagation use chains terminate with
+    //  a redundant write of this type.
+    // A propagation use chain is currently either a reg-to-reg
+    //  chain for a shared mem tv, or a reg/smem-to-reg/smem chain
+    //  for a global tv.
+    // This is complementary information to `redundant_types`.
+    //  If a tensor view is redundantly written and not redundantly
+    //  used by all consumers, see FusionRedundantPredSync3,
+    //  a RAW sync will need to be inserted before reading
+    //  this redundantly written tensor.
+    ParallelTypeBitmap redundant_use_types;
+    bool operator==(const PredicateInfo& other) const {
+      return limited_types == other.limited_types &&
+          redundant_types == other.redundant_types &&
+          redundant_use_types == other.redundant_use_types;
+    }
+  };
+
+  using MapType = std::unordered_map<const TensorView*, PredicateInfo>;
+
+  using const_iterator = MapType::const_iterator;
+
+  //! Build a map from each tensor to PredicateInfo.
+  void build(Fusion* fusion);
+
+  //! Get a PredicateInfo for a given tensor. If it's an output of
+  //! a parallel broadcast, unmask the limited_types_ bit of the
+  //! corresponding parallel type since it must join the broadcast
+  //! operation although the valid input is only available at one of
+  //! the threads/blocks.
+  NVF_API PredicateInfo getPredicateInfo(const TensorView* tv) const;
+
+  //! Returns a flag set that indicates which parallel types should be
+  //! predicated.
+  ParallelTypeBitmap getPredicatedParallelTypes(const TensorView* tv) const;
+
+  //! Returns a Bool predicate for a given TensorView.
+  Val* getPredicate(
+      const TensorView* tv,
+      ParallelTypeBitmap mask = ParallelTypeBitmap().setAll()) const;
+
+  //! Returns a ParallelTypeBitmap representing which domain needs
+  //! blockBroadcast.
+  //!
+  //! Even when a domain is broadcast and parallelized, it does not need
+  //! blockBroadcast unless it is predicated by limited_types_.
+  ParallelTypeBitmap getParallelBroadcastDomains(const TensorView* tv) const;
+
+  //! Mark tv as updated so that rebuilding the map recomputes
+  //! its predicates and those of its dependents.
+  void markAsUpdated(const TensorView* tv);
+
+  void print() const;
+
+  //! Generate a Bool value from PredicateInfo.
+  static Val* getPredicateFromPredicateInfo(
+      const ThreadPredicateMap::PredicateInfo& pred_info,
+      const ParallelTypeBitmap& mask);
+
+  //! Get the redundant use types of the given expr, see [Redundant use chain]
+  ParallelTypeBitmap getRedundantConsumerType(Expr* expr) const;
+
+ private:
+  // Update the thread_predicates bitset based on the provided Expr
+  void updateBitSet(const Expr*);
+  void avoidConcretizedBroadcastRedundantWrite(const TensorView* out_tv);
+  const_iterator find(const TensorView* tv) const;
+  const_iterator end() const;
+
+  const PredicateInfo& at(const TensorView* tv) const;
+  PredicateInfo& at(const TensorView* tv);
+
+  //! Update a mapping
+  bool update(
+      const TensorView* tv,
+      const ParallelTypeBitmap& limited_types,
+      const ParallelTypeBitmap& redundant_types);
+
+  //! Update a mapping
+  bool update(const TensorView* tv, const PredicateInfo& pred_and_src);
+
+  //! Backward-populate redundant use chain info once the redundant
+  //! parallel writes have been identified.
+  void populateRedundantUseMap(Fusion* fusion);
+
+ private:
+  MapType thread_predicates_;
+  //! Keep track of updated tensors that need predicates to be recomputed
+  std::unordered_set<const TensorView*> updated_tvs_;
+};
+
+} // namespace nvfuser
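
The map above is consumed during lowering when predicates are generated. Below is a minimal sketch of querying it; `inspectThreadPredicates` is a hypothetical helper, only members declared in this header plus `Fusion::outputs()` are used, and in nvfuser proper the map is owned by the lowering pass rather than built standalone like this.

// Hypothetical sketch, not part of the wheel: build the map for a fusion
// and materialize the guard predicate for each output tensor.
#include <device_lower/analysis/thread_predicate.h>
#include <fusion.h>

using namespace nvfuser;

void inspectThreadPredicates(Fusion* fusion) {
  ThreadPredicateMap pred_map;
  pred_map.build(fusion); // computes PredicateInfo for each TensorView

  for (Val* out : fusion->outputs()) {
    auto* tv = dynamic_cast<TensorView*>(out);
    if (tv == nullptr) {
      continue;
    }
    // Parallel types whose reduced/broadcast values are valid on only one
    // thread/block, so any use of tv must be guarded on them.
    ParallelTypeBitmap types = pred_map.getPredicatedParallelTypes(tv);
    // Boolean guard expression (e.g. threadIdx.x == 0 terms) for tv.
    Val* guard = pred_map.getPredicate(tv, types);
    (void)guard;
  }
  pred_map.print(); // debug dump of the whole map
}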
nvfuser/include/nvfuser/device_lower/analysis/tma.h
@@ -0,0 +1,93 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <ostream>
+#include <unordered_map>
+#include <variant>
+#include <vector>
+
+#include <ir/all_nodes.h>
+#include <val_graph.h>
+
+namespace nvfuser {
+
+// See doc/dev/tma.md for design
+
+// All ValGroups are in the traversal graph of tensor indexer
+
+struct TMADim {
+  ValGroup partitioned;
+  ValGroup box;
+  ValGroup tile;
+  ValGroup stride;
+  Val* gmem_stride_bytes;
+
+  Val* tensorSize() const {
+    return partitioned->front()->as<IterDomain>()->extent();
+  }
+  Val* boxSize() const {
+    return box ? box->front()->as<IterDomain>()->extent()
+               : gmem_stride_bytes->fusion()->oneVal();
+  }
+  Val* tileSize() const {
+    return tile ? tile->front()->as<IterDomain>()->extent()
+                : gmem_stride_bytes->fusion()->oneVal();
+  }
+  Val* elementStride() const {
+    return stride ? stride->front()->as<IterDomain>()->extent()
+                  : gmem_stride_bytes->fusion()->oneVal();
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const TMADim& d);
+
+class TMAInfo {
+  std::vector<TMADim> dims_;
+  MmaInputSmemSwizzle swizzle_;
+  TensorView* gmem_tv_;
+
+ public:
+  TMAInfo(
+      std::vector<TMADim> dims,
+      MmaInputSmemSwizzle swizzle,
+      TensorView* gmem_tv)
+      : dims_(std::move(dims)), swizzle_(swizzle), gmem_tv_(gmem_tv) {}
+
+  const std::vector<TMADim>& dims() const {
+    return dims_;
+  }
+
+  std::vector<ValGroup> getTMADomain() const {
+    std::vector<ValGroup> result;
+    std::transform(
+        dims_.begin(),
+        dims_.end(),
+        std::back_inserter(result),
+        [](const auto& d) { return d.partitioned; });
+    return result;
+  }
+
+  Val* tileSizeBytes() const {
+    int64_t itemsize = dataTypeSize(gmem_tv_->dtype());
+    Val* size = IrBuilder::create<Val>(itemsize, DataType::Index);
+    for (const auto& d : dims_) {
+      size = SimplifyingIrBuilder::mulExpr(size, d.tileSize());
+    }
+    return size;
+  }
+
+  Val* tensorMap() const;
+};
+
+std::unordered_map<TensorView*, const TMAInfo> getConsumerToTMAInfoMap(
+    Fusion* fusion);
+
+MmaInputSmemSwizzle getSwizzle(TensorView* tv);
+
+} // namespace nvfuser
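
A rough sketch of how the analysis result might be inspected, assuming a fusion that has already been scheduled for TMA loads/stores; `dumpTmaInfo` is a hypothetical name, and only functions declared in the header above are called.

// Hypothetical sketch: walk the consumer-to-TMAInfo map produced by the
// analysis and print each TMA dimension plus the tile size in bytes.
#include <device_lower/analysis/tma.h>
#include <fusion.h>

#include <iostream>

using namespace nvfuser;

void dumpTmaInfo(Fusion* fusion) {
  const auto tma_map = getConsumerToTMAInfoMap(fusion);
  for (const auto& [consumer, info] : tma_map) {
    // One TMADim per dimension of the TMA tensor map.
    for (const TMADim& d : info.dims()) {
      std::cout << d << "\n"; // operator<< is declared in the header
    }
    // dtype size multiplied by each dimension's tile extent.
    Val* tile_bytes = info.tileSizeBytes();
    (void)tile_bytes;
  }
}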
nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h
@@ -0,0 +1,75 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <exceptions.h>
+#include <ir/all_nodes.h>
+#include <logical_domain_map.h>
+#include <visibility.h>
+
+namespace nvfuser {
+
+//! Traverse and collect all concretized broadcast domains.
+//!
+//! The traversal first initializes the origin map with broadcast
+//! domains in input tensors. Then, a new entry is added to the origin
+//! map when a broadcast op is encountered during a forward traversal
+//! of the given fusion. For non-broadcast ops, mappings are just
+//! propagated forward using PairwiseLogicalDomainMap.
+//!
+//! When the mapped consumer domain is not broadcast, it means the
+//! producer broadcast domain is concretized, and its origin broadcast
+//! domains are marked as concretized.
+class NVF_API ConcretizedBroadcastDomains : private IterVisitor {
+ public:
+  ConcretizedBroadcastDomains() = delete;
+  ConcretizedBroadcastDomains(Fusion* fusion);
+
+  //! Is a domain concretized?
+  bool isConcretized(IterDomain* id) const;
+
+  //! Is a domain concretized to a unique concrete domain?
+  bool isUniquelyConcretized(IterDomain* id) const;
+
+  //! Is a domain concretized to multiple concrete domains?
+  bool maybeNonUniquelyConcretized(IterDomain* id) const;
+
+  //! Return all domains id is concretized to, if concretized
+  std::unordered_set<IterDomain*> allConcretizedDomains(IterDomain* id) const;
+
+ private:
+  using IterVisitor::handle;
+
+  void handle(TensorView* tv) final;
+
+  void handle(BroadcastOp* bop) final;
+
+  void dispatch(Expr* expr) final;
+
+  void markAsConcretized(
+      IterDomain* broadcast_root_domain,
+      IterDomain* concrete_root_domain);
+
+  bool insertRootDomainToConcreteDomainSet(
+      IterDomain* new_root_id,
+      std::unordered_set<IterDomain*>& id_set);
+
+ private:
+  //! Maps each root broadcast domain to its original root broadcast
+  //! domains. There can be multiple original domains due to, e.g.,
+  //! binary ops with broadcast domains in both inputs.
+  std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>
+      broadcast_origin_map_;
+  //! Map all broadcast domains to concrete root domains
+  std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>
+      broadcast_to_concrete_map_;
+
+  std::unique_ptr<ExactLogicalDomainMap> exact_map_;
+};
+
+} // namespace nvfuser
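
To illustrate what "concretized" means here, a sketch of a fusion where a broadcast domain later meets a real domain through an add. This is illustrative only: `makeSymbolicTensor` is assumed from nvfuser's test utilities, while `broadcast` and `add` come from the ops headers shipped in this wheel.

// Hypothetical sketch: a broadcast that is added to a 2D tensor becomes
// concretized, and the analysis should report that.
#include <device_lower/analysis/trivial_broadcast.h>
#include <fusion.h>
#include <fusion_guard.h>
#include <ops/alias.h>
#include <ops/arith.h>

using namespace nvfuser;

void broadcastConcretizationExample() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1); // [i0] (assumed test helper)
  TensorView* tv1 = makeSymbolicTensor(2); // [i1, i2]
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  TensorView* tv2 = broadcast(tv0, {true, false}); // [b0, i0]
  TensorView* tv3 = add(tv2, tv1); // b0 is concretized against i1
  fusion.addOutput(tv3);

  ConcretizedBroadcastDomains info(&fusion);
  // Expected: true, since b0 maps to the concrete domain i1 of tv1.
  bool concretized = info.isConcretized(tv2->axis(0));
  (void)concretized;
}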
nvfuser/include/nvfuser/device_lower/id_model_options.h
@@ -0,0 +1,135 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <id_model/utils.h>
+
+#include <sstream>
+
+namespace nvfuser {
+
+class IdModelOptions {
+ public:
+  IdModelOptions()
+      : build_id_model_(isOptionEnabled(EnableOption::IdModel)),
+        consumer_index_(
+            isIdModelOptionEnabled(IdModelEnableOption::ConsumerIndex)),
+        producer_index_(
+            isIdModelOptionEnabled(IdModelEnableOption::ProducerIndex)),
+        inline_predicate_(
+            isIdModelOptionEnabled(IdModelEnableOption::InlinePredicate)),
+        unswitch_predicate_(
+            isIdModelOptionEnabled(IdModelEnableOption::UnswitchPredicate)),
+        loop_(isIdModelOptionEnabled(IdModelEnableOption::Loop)) {
+    ensureConsistency();
+  }
+
+  bool buildIdModel() const {
+    return build_id_model_;
+  }
+
+  void setBuildIdModel(bool b) {
+    build_id_model_ = b;
+    ensureConsistency();
+  }
+
+  bool buildTensorIndexer() const {
+    return build_tensor_indexer_;
+  }
+
+  void setBuildTensorIndexer(bool b) {
+    build_tensor_indexer_ = b;
+    ensureConsistency();
+  }
+
+  bool consumerIndex() const {
+    return consumer_index_;
+  }
+
+  void setConsumerIndex(bool b) {
+    consumer_index_ = b;
+    ensureConsistency();
+  }
+
+  bool producerIndex() const {
+    return producer_index_;
+  }
+
+  void setProducerIndex(bool b) {
+    producer_index_ = b;
+    ensureConsistency();
+  }
+
+  bool inlinePredicate() const {
+    return inline_predicate_;
+  }
+
+  void setInlinePredicate(bool b) {
+    inline_predicate_ = b;
+    ensureConsistency();
+  }
+
+  bool unswitchPredicate() const {
+    return unswitch_predicate_;
+  }
+
+  void setUnswitchPredicate(bool b) {
+    unswitch_predicate_ = b;
+    ensureConsistency();
+  }
+
+  bool loop() const {
+    return loop_;
+  }
+
+  void setLoop(bool b) {
+    loop_ = b;
+    ensureConsistency();
+  }
+
+  std::string toString() const {
+    auto bool2str = [](bool b) { return b ? "true" : "false"; };
+
+    std::stringstream ss;
+    ss << "build_id_model=" << bool2str(build_id_model_)
+       << ", build_tensor_indexer=" << bool2str(build_tensor_indexer_)
+       << ", consumer_index=" << bool2str(consumer_index_)
+       << ", producer_index=" << bool2str(producer_index_)
+       << ", inline_predicate=" << bool2str(inline_predicate_)
+       << ", unswitch_predicate=" << bool2str(unswitch_predicate_)
+       << ", loop=" << bool2str(loop_);
+    return ss.str();
+  }
+
+ private:
+  void ensureConsistency() {
+    // TensorIndexer is required if these options are enabled
+    build_tensor_indexer_ = build_tensor_indexer_ || consumer_index_ ||
+        producer_index_ || inline_predicate_ || unswitch_predicate_ || loop_;
+    // Similarly, IdModel needs to be built if TensorIndexer is used
+    build_id_model_ = build_id_model_ || build_tensor_indexer_;
+  }
+
+ private:
+  // Build IdModel
+  bool build_id_model_ = false;
+  // Build TensorIndexer
+  bool build_tensor_indexer_ = false;
+  // Globally enables consumer indexing.
+  bool consumer_index_ = false;
+  // Globally enables producer indexing.
+  bool producer_index_ = false;
+  // Globally enables inline predicate
+  bool inline_predicate_ = false;
+  // Globally enables unswitch predicate
+  bool unswitch_predicate_ = false;
+  // Generate loops using IdModel
+  bool loop_ = false;
+};
+
+} // namespace nvfuser
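
A small usage sketch of the consistency rule above: enabling any fine-grained option transitively forces `build_tensor_indexer_` and `build_id_model_` via `ensureConsistency()`. Defaults are assumed to come from the environment-driven enable options read in the constructor.

// Hypothetical sketch, not part of the wheel.
#include <device_lower/id_model_options.h>

#include <iostream>

using namespace nvfuser;

int main() {
  IdModelOptions options; // seeded from EnableOption/IdModelEnableOption

  // Requesting consumer indexing implies TensorIndexer, which in turn
  // implies IdModel.
  options.setConsumerIndex(true);
  std::cout << options.toString() << "\n";
  // e.g. build_id_model=true, build_tensor_indexer=true,
  //      consumer_index=true, producer_index=false, ...
  return 0;
}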