nvfuser_cu121_torch25-0.2.25.dev20250201-cp310-cp310-manylinux_2_28_x86_64.whl
- nvfuser/_C.cpython-310-x86_64-linux-gnu.so +0 -0
- nvfuser/__init__.py +618 -0
- nvfuser/__init__.pyi +4 -0
- nvfuser/contrib/__init__.py +9 -0
- nvfuser/contrib/nn/__init__.py +13 -0
- nvfuser/contrib/nn/normalization.py +725 -0
- nvfuser/include/nvfuser/alias_analysis.h +116 -0
- nvfuser/include/nvfuser/bfs.h +929 -0
- nvfuser/include/nvfuser/codegen.h +26 -0
- nvfuser/include/nvfuser/compute_at.h +28 -0
- nvfuser/include/nvfuser/compute_at_map.h +394 -0
- nvfuser/include/nvfuser/contiguity.h +351 -0
- nvfuser/include/nvfuser/cuda_utils.h +50 -0
- nvfuser/include/nvfuser/debug.h +50 -0
- nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
- nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
- nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
- nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
- nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
- nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
- nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
- nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
- nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
- nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
- nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
- nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
- nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
- nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
- nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
- nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
- nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
- nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
- nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
- nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
- nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
- nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
- nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
- nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
- nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
- nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
- nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
- nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
- nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
- nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
- nvfuser/include/nvfuser/device_lower/utils.h +382 -0
- nvfuser/include/nvfuser/device_lower/validation.h +74 -0
- nvfuser/include/nvfuser/disjoint_set.h +556 -0
- nvfuser/include/nvfuser/dispatch.h +334 -0
- nvfuser/include/nvfuser/driver_api.h +49 -0
- nvfuser/include/nvfuser/dynamic_transform.h +316 -0
- nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
- nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
- nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
- nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
- nvfuser/include/nvfuser/evaluator_common.h +295 -0
- nvfuser/include/nvfuser/exceptions.h +283 -0
- nvfuser/include/nvfuser/expr_evaluator.h +125 -0
- nvfuser/include/nvfuser/expr_simplifier.h +218 -0
- nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
- nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
- nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
- nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
- nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
- nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
- nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
- nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
- nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
- nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
- nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
- nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
- nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
- nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
- nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
- nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
- nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
- nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
- nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
- nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
- nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
- nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
- nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
- nvfuser/include/nvfuser/fusion.h +511 -0
- nvfuser/include/nvfuser/fusion_guard.h +37 -0
- nvfuser/include/nvfuser/fusion_profiler.h +311 -0
- nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
- nvfuser/include/nvfuser/global_allocator.h +27 -0
- nvfuser/include/nvfuser/grouped_reduction.h +47 -0
- nvfuser/include/nvfuser/host_ir/container.h +60 -0
- nvfuser/include/nvfuser/host_ir/executor.h +152 -0
- nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
- nvfuser/include/nvfuser/host_ir/lower.h +35 -0
- nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
- nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
- nvfuser/include/nvfuser/id_model/id_model.h +359 -0
- nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
- nvfuser/include/nvfuser/id_model/indexing.h +208 -0
- nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
- nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
- nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
- nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
- nvfuser/include/nvfuser/id_model/schedule.h +54 -0
- nvfuser/include/nvfuser/id_model/to_string.h +87 -0
- nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
- nvfuser/include/nvfuser/id_model/utils.h +176 -0
- nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
- nvfuser/include/nvfuser/index_compute.h +651 -0
- nvfuser/include/nvfuser/instrumentation.h +107 -0
- nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
- nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
- nvfuser/include/nvfuser/ir/builder.h +215 -0
- nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
- nvfuser/include/nvfuser/ir/cloner.h +185 -0
- nvfuser/include/nvfuser/ir/container.h +226 -0
- nvfuser/include/nvfuser/ir/graphviz.h +119 -0
- nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
- nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
- nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
- nvfuser/include/nvfuser/ir/iostream.h +98 -0
- nvfuser/include/nvfuser/ir/printer.h +57 -0
- nvfuser/include/nvfuser/ir/utils.h +801 -0
- nvfuser/include/nvfuser/iter_visitor.h +661 -0
- nvfuser/include/nvfuser/kernel.h +299 -0
- nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
- nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
- nvfuser/include/nvfuser/kernel_ir.h +1457 -0
- nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
- nvfuser/include/nvfuser/linked_hash_map.h +97 -0
- nvfuser/include/nvfuser/logical_domain_map.h +577 -0
- nvfuser/include/nvfuser/macros.h +23 -0
- nvfuser/include/nvfuser/mma_type.h +257 -0
- nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
- nvfuser/include/nvfuser/multidevice/communication.h +232 -0
- nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
- nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
- nvfuser/include/nvfuser/multidevice/executor.h +107 -0
- nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
- nvfuser/include/nvfuser/multidevice/utils.h +187 -0
- nvfuser/include/nvfuser/non_divisible_split.h +86 -0
- nvfuser/include/nvfuser/opaque_type.h +129 -0
- nvfuser/include/nvfuser/ops/alias.h +192 -0
- nvfuser/include/nvfuser/ops/all_ops.h +13 -0
- nvfuser/include/nvfuser/ops/arith.h +712 -0
- nvfuser/include/nvfuser/ops/composite.h +130 -0
- nvfuser/include/nvfuser/ops/indexing.h +55 -0
- nvfuser/include/nvfuser/ops/normalization.h +263 -0
- nvfuser/include/nvfuser/ops/utils.h +127 -0
- nvfuser/include/nvfuser/options.h +313 -0
- nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
- nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
- nvfuser/include/nvfuser/polymorphic_value.h +432 -0
- nvfuser/include/nvfuser/predicate_compute.h +213 -0
- nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
- nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
- nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
- nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
- nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
- nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
- nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
- nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
- nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
- nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
- nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
- nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
- nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
- nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
- nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
- nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
- nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
- nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
- nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
- nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
- nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
- nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
- nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
- nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
- nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
- nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
- nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
- nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
- nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
- nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
- nvfuser/include/nvfuser/scheduler/registry.h +97 -0
- nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
- nvfuser/include/nvfuser/scheduler/resize.h +41 -0
- nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
- nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
- nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
- nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
- nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
- nvfuser/include/nvfuser/scheduler/utils.h +771 -0
- nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
- nvfuser/include/nvfuser/serde/factory.h +55 -0
- nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
- nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
- nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
- nvfuser/include/nvfuser/serde/utils.h +34 -0
- nvfuser/include/nvfuser/struct.inl +127 -0
- nvfuser/include/nvfuser/swizzle.h +54 -0
- nvfuser/include/nvfuser/sys_utils.h +40 -0
- nvfuser/include/nvfuser/tensor_metadata.h +118 -0
- nvfuser/include/nvfuser/tma.h +124 -0
- nvfuser/include/nvfuser/transform_iter.h +522 -0
- nvfuser/include/nvfuser/transform_replay.h +297 -0
- nvfuser/include/nvfuser/transform_rfactor.h +33 -0
- nvfuser/include/nvfuser/transform_view.h +136 -0
- nvfuser/include/nvfuser/type.h +1125 -0
- nvfuser/include/nvfuser/type_promotion.h +61 -0
- nvfuser/include/nvfuser/utils.h +619 -0
- nvfuser/include/nvfuser/val_graph.h +446 -0
- nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
- nvfuser/include/nvfuser/validator_utils.h +92 -0
- nvfuser/include/nvfuser/vectorization_info.h +31 -0
- nvfuser/include/nvfuser/visibility.h +21 -0
- nvfuser/lib/libnvfuser_codegen.so +0 -0
- nvfuser/nvfuser_version.py +69 -0
- nvfuser/pytorch_utils.py +184 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
- nvfuser/utils.py +18 -0
- nvfuser/version.py +1 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +20 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
- nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/device_lower/utils.h
@@ -0,0 +1,382 @@
// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on

#pragma once

#include <exceptions.h>
#include <visibility.h>

#include <compute_at_map.h>
#include <ir/all_nodes.h>
#include <kernel_ir.h>
#include <parallel_type_bitmap.h>
#include <val_graph.h>

#include <bitset>
#include <map>

// Provides utilities for dealing with nested ForLoop and IfThenElse scopes

namespace nvfuser {

class ThreadPredicateMap;

using IterDomainMap = std::unordered_map<IterDomain*, IterDomain*>;

namespace scope_utils {

//! Create an **empty** ForLoop and copy the metadata.
ForLoop* cloneForLoop(ForLoop* for_loop);

//! Create an **empty** IfThenElse and copy the metadata.
kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite);

} // namespace scope_utils

namespace ir_utils {

// Sometimes we want to temporarily view a TensorView with another
// TensorDomain. This isn't a permanent transformation, but in indexing we want
// to index producers with a consumer set of indices, so we need to view the
// producer transformed like the consumer while we index. This will set the tv
// with td for the life of this context guard.
class TVDomainGuard {
 private:
  TensorView* tv_;
  TensorDomain* prev_domain_;

 public:
  explicit TVDomainGuard(TensorView* tv, TensorDomain* td);
  TVDomainGuard(const TVDomainGuard&) = delete;
  NVF_API TVDomainGuard(TVDomainGuard&&);

  //! A utility to access the TensorDomain before the temporary
  //! view. This is used to retrieve information, like swizzle
  //! information, that can only be reliably kept at the original domain.
  const TensorDomain* prevDomain() const {
    return prev_domain_;
  }

  NVF_API ~TVDomainGuard();
};

// Create a TVDomainGuard that temporarily views a TensorView with specified
// all-true or all-false contiguity.
NVF_API ir_utils::TVDomainGuard overrideContiguityGuard(
    TensorView* tv,
    bool contiguity);

// Create a TVDomainGuard that temporarily sets the allocation domain to
// getLogicalDomain() of a TensorView, with contiguity filled all-true or
// all-false.
ir_utils::TVDomainGuard allocateToLogicalDomainGuard(
    TensorView* tv,
    bool contiguity);

//! Return inputs of provided IterDomains that are IterDomains. A list
//! of input IterDomains can optionally be given. Otherwise,
//! IterDomains with no defining expression are returned.
std::vector<IterDomain*> iterDomainInputsOf(
    const std::vector<IterDomain*>& input_ids,
    const std::vector<IterDomain*>& all_inputs = {});

// Return inputs of provided IterDomains that are IterDomains, ordered as the
// second provided vector.
std::vector<IterDomain*> iterDomainInputsOfOrderedAs(
    const std::vector<IterDomain*>& of,
    const std::vector<IterDomain*>& order);

// Returns whether Val is a TensorView or TensorIndex.
bool isTV(const Val* const);

// Returns whether Expr is a TensorView or TensorIndex Expr.
NVF_API bool isTvOp(const Expr*);

//! Returns the IterDomain that maps to the thread dimension grouped
//! to warps. Returns nullopt if the reduction is not to be lowered to
//! a warp reduction.
std::optional<std::pair<IterDomain*, IterDomain*>> getMaybeWarpReductionDim(
    const Val* output,
    const Val* input);

bool isScalarOp(const Expr*);

bool isIterDomainOp(const Expr*);

//! Get TensorView potentially via kir::TensorIndex. Returns nullptr if
//! the cast fails.
TensorView* getTv(Val*);
const TensorView* getTv(const Val*);

//! Get only TensorViews, potentially via kir::TensorIndex.
std::vector<TensorView*> getTvs(const std::vector<Val*>& vals);

std::unordered_map<ParallelType, IterDomain*> getParallelDomains(
    const Val* val);

//! Returns true if the expression will be lowered to
//! an ldmatrix intrinsic.
bool isLdMatrixOp(const Expr* expr);

bool isStMatrixOp(const Expr* expr);

//! Returns true if the expression will be lowered to
//! a cp.async intrinsic.
bool isCpAsyncOp(const Expr* expr);

//! Returns true if the expression will be lowered to
//! a cp.async.bulk (a.k.a. TMA) intrinsic.
bool isCpAsyncBulkLoad(const Expr* expr);
bool isCpAsyncBulkStore(const Expr* expr);
bool isCpAsyncBulk(const Expr* expr);

//! Short-cut for detecting initialization for a cp.async op.
bool isCpAsyncInit(const Expr* expr);

//! Short-cut for matching a singleton expr in an if statement,
//! which likely becomes a predicated instruction in PTX, e.g.:
//!   if(...) {expr;}
//! Returns the expr if it is this pattern.
//! Returns nullptr if the pattern doesn't match.
std::optional<Expr*> getMaybePredicatedSingleton(Expr* expr);

//! Short-cut for checking if the expression loads from global memory.
bool isGlobalLoad(const Expr* expr);

//! Short-cut for checking if the given expression initializes buffers
//! for a global memory load.
bool isGlobalLoadInit(const Expr* expr);

//! Returns true if the given expression fills the output
//! tensor with a single scalar.
bool isTensorScalarFillOp(const Expr* expr);

//! Flattens all the scoped exprs, i.e. ForLoop and IfThenElse,
//! and returns all the exprs in all scopes in the original
//! linear textual order.
NVF_API std::vector<Expr*> flattenScopedExprs(
    const std::vector<Expr*>& loop_nests);

//! Returns all swizzle ops between the set of IterDomains
//! in `from` and `to`.
std::vector<Expr*> getAllSwizzlesBetween(
    std::vector<IterDomain*> from,
    std::vector<IterDomain*> to);

// Replace-value pass on Kernel IR.
// Replaces each use of any Val* that appears in the given `replacement_map`.
// Keeps the predicate carried by each expr.
//
// Warning: Blindly replaces all uses based on pointer identity.
// Warning: May invalidate indexing if replacing uses of allocated values.
std::vector<Expr*> replaceInputsInExpr(
    const std::vector<Expr*>& exprs,
    const std::unordered_map<Val*, Val*>& replacement_map);

} // namespace ir_utils

namespace lower_utils {

bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map);

// Allocate a global buffer for grid communication calls, i.e. grid reduce,
// grid welford reduce, grid broadcast.
kir::Allocate* allocGlobalBufferForGridComm(
    Val* buffer_size,
    DataType dtype,
    bool zero_init,
    bool resets_to_zero = false);

struct BasicAllocInfo {
  // The for loop that the initialization of this allocation must be
  // placed in, nullptr if not within a loop
  ForLoop* init_for_loop = nullptr;

  // Keep track of the actual allocation loop. This can be different
  // from init_for_loop only with unswitched shared memory allocations,
  // which are moved to outer loops to avoid duplicated allocations. This means
  // that the alloc position may be outside what's expected. Most applications
  // outside lower_allocation are likely looking for init_for_loop, which is
  // more directly related to how large an allocation is and how it's used
  // (see issue #1133).
  ForLoop* alloc_for_loop = nullptr;

  // The allocation position relative to buffer IDs; it could be outside the
  // compute-at position if it's shared memory with a compute-at inside an
  // unswitch.
  int64_t alloc_pos = 0;
};

// Fill the above allocation struct based on provided information. id_map is
// used if we're looking at a producer tensor but loops on a consumer tensor.
BasicAllocInfo getAllocInformation(
    const TensorView* tv,
    const std::vector<ForLoop*>& loops,
    const std::unordered_map<IterDomain*, IterDomain*>& id_map = {},
    bool use_id_map = false);

//! Returns true if the expression has a variant that takes a predicate
//! as an inline argument.
bool supportInlinePredicate(Expr* expr);

//! Test if an expression is a scalar expression.
bool isScalarExpr(Expr* expr);

//! Test if the provided IterDomain instance has an extent that matches the
//! maximum extent stored in the parallel dimension map for the parallel type
//! of the provided IterDomain object. `in_compute_warp` specifies that we are
//! checking an expression in the compute warp; if so, we need to get the
//! parallel type extent of the compute warp instead of the global parallel
//! type extent.
bool isExtentEqualToMaxParallelTypeExtent(
    const IterDomain* id,
    bool in_compute_warp = false);

//! Get the uint32_t index of a scalar TensorView. This is usually used for
//! indexing special items in shared memory, like mbarrier.
NVF_API Val* u32IndexScalarSmemTv(TensorView* tv);

//! Get the uint32_t index of a TensorIndex. This is usually used for
//! initializing a pipeline of mbarriers.
NVF_API Val* u32IndexScalarSmemTv(kir::TensorIndex* index);

//! Get the size of a global sync buffer needed to perform a grid reduction for
//! each axis in bitmap.
Val* getGridSyncBufferSize(const ParallelTypeBitmap& bitmap);

//! Returns the fusion outputs that require codegen.
//! The fusion outputs to be computed through the expression evaluator are
//! filtered out.
std::vector<Val*> getFusionOutputsRequiringCodegen(Fusion* fusion);

//! Get the number of threads in a tensor view. Note that this function
//! only cares about the given tensor view itself, not the entire fusion.
//! That is, for example, if the tensor view is [TIDx{3}], but the entire
//! fusion has blockDim.x = 128, this function will return 3 instead of 128.
Val* getNumThreadsInTensorView(TensorView* tv);

//! Get the unit dimensions of A and B for the given MmaOp.
std::array<UnitDim, 2> getMmaLayout(const MmaOp* expr);

// Returns true if expr is an expression that initializes a reduction
// buffer.
bool isReductionInitExpr(const Expr* expr);

// Return true if it is sufficient to predicate the end of the loop
// iteration. An aligned vectorized loop is one example where it is
// guaranteed to be valid by the validation checks. More generally,
// the divisible split set is used to find such loops. The divisible
// split set contains splits used in view transformations as well as
// those whose output domains are vectorized. View transformations
// guarantee that any split involved is divisible, whereas
// vectorization only guarantees that the overall root extent is
// divisible by the split factor. Thus, if a loop IterDomain is
// an output of a split included in the divisible view splits, we can
// just predicate the end of the loop iteration. If a loop IterDomain
// is an output of a divisible split due to vectorization, it is only
// valid when the loop IterDomain is mapped with the vectorized inner
// output IterDomain. If it is mapped with an outer IterDomain, since
// the split input IterDomain may be an output IterDomain of a
// non-divisible split, we still need to predicate each loop iteration
// value.
bool predicateAtEnd(ForLoop* loop);

// Given linear_g and domain, prove that linear_g is linear with respect to
// domain and return the stride. linear_g is linear with respect to domain if
// there exists a strided view of domain such that linear_g is one of the
// axes of that strided view. Usually, linear_g is a group in the loop domain
// of some tensor, and domain is the allocation domain of some tensor. In this
// case, if the index of linear_g is i, then this function proves that the
// index is a linear function of i, with the linear coefficient being the
// return value. Note that this function does the proof and stride calculation
// in a best-effort manner. It cannot cover all linear cases. If the return
// value is nullptr, it can be either because linear_g is not linear with
// respect to domain, or because linear_g is actually linear with respect to
// domain but it is too hard for this function to find a proof.
Val* proveLinearAndGetStride(
    const ValGraph& id_graph,
    const ValGroup& linear_g,
    const ValGroups& domain);

// Get the concrete loop domain of a given loop ID.
IterDomain* getConcreteLoopID(IterDomain* loop_id);

// Go through all expressions and compute a local ordering of loops. operator<
// is implemented based on the concrete_id_dependencies analysis done. If
// there's no dependency between two IDs then order doesn't matter; otherwise
// we can tell which is innermost by checking if there are any dependency
// relationships.
//
// Dependency relationships in concrete_id_dependencies have a "global" view
// of the fusion, so ordering can be resolved by only looking at IDs and the
// dependency map.
//
// For example, two expressions may have domains [I0] and [I1], yet we
// won't know the ordering unless we see a domain with [I0, I1]. This happened
// in the Indexing9 test (also see Indexing17) when merging T5 with
// the group containing T10 (cache of T5, which is the post-broadcast output)
// and T6 (the pre-broadcast output).
//   T5 had the domain [0, 1, 2, 3, 4] produce at 3
//   T6 had the domain [0, 3, 4] compute at 3
// Merging [0, 1, 2] and [0, 3, 4] resulted in the domain [0, 3, 4, 1, 2]
//
// If IDs are not in the filter, we don't care about their ordering and ignore
// them. This is because we're only focused on loops we will have to merge
// across groups. If the domain is not in a produce-at position in the producer
// edges, or a compute-at position in the consumer edges, the expressions we
// look at may not have a unique ordering.
//
// The optional kernel_scope_domain parameter is only used in
// expression sorting. It isn't in the CA map, but since we only have
// a single unique IterDomain, the concrete ID is just itself.
struct IterDomainDependencySorter {
  IterDomainDependencySorter(
      const std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>&
          concrete_id_dependencies,
      IterDomain* kernel_scope_domain = nullptr)
      : concrete_id_dependencies_(concrete_id_dependencies),
        kernel_scope_domain_(kernel_scope_domain) {}

  // Return true if id0 should be before id1.
  // Orders such that if x maps to {y}, x comes before y in the final ordering.
  inline bool operator()(IterDomain* id0, IterDomain* id1) {
    auto concrete_id_0 =
        id0 != kernel_scope_domain_ ? getConcreteLoopID(id0) : id0;
    auto concrete_id_1 =
        id1 != kernel_scope_domain_ ? getConcreteLoopID(id1) : id1;
    if (concrete_id_dependencies_.find(concrete_id_0) !=
        concrete_id_dependencies_.end()) {
      const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0);
      // If id0 depends on id1, it means id1 is inside id0, so id0 < id1.
      if (dependencies_0.count(concrete_id_1)) {
        return true;
      }
    }

    return false;
  }

  const std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>&
      concrete_id_dependencies_;
  const IterDomain* kernel_scope_domain_ = nullptr;
};

// Check if all the inputs of the given MmaOp are guarded by mbarrier.
bool allMmaInputsGuardedByMBarrier(const MmaOp* mma);

// Create a list of expressions that will be used to wait for async operations.
// For example, if op_type is AsyncOpType::WgMma, then the returned expressions
// will be:
//   wgmma.commit_group.sync.aligned
//   wgmma.wait_group.sync.aligned
std::vector<Expr*> getSyncExprs(
    AsyncOpType async_type,
    int64_t keep_stages = 0);

} // namespace lower_utils

} // namespace nvfuser
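The TVDomainGuard declared in device_lower/utils.h above is an RAII guard: the temporary TensorDomain is installed for the lifetime of the guard object and the previous domain is restored when the guard is destroyed. The sketch below only illustrates that scoping behavior using the declarations above; the surrounding function and the indexing placeholder are hypothetical and not part of nvfuser.

// Illustrative sketch, assuming the wheel's include/nvfuser directory is on
// the include path. indexAsIfContiguous() is a made-up example function.
#include <device_lower/utils.h>

void indexAsIfContiguous(nvfuser::TensorView* tv) {
  {
    // Temporarily view tv with all-true contiguity.
    auto guard =
        nvfuser::ir_utils::overrideContiguityGuard(tv, /*contiguity=*/true);
    // The pre-override domain stays reachable while the guard is alive, e.g.
    // for swizzle information that is only kept on the original domain.
    const nvfuser::TensorDomain* original = guard.prevDomain();
    (void)original;
    // ... compute indices against the overridden domain here ...
  }
  // Guard destroyed here: tv's original TensorDomain is restored.
}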
nvfuser/include/nvfuser/device_lower/validation.h
@@ -0,0 +1,74 @@
// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
#pragma once

#include <exceptions.h>

#include <ir/all_nodes.h>

namespace nvfuser {

class ContigIDs;

void validateIr(Fusion* fusion);

//! Validate vectorization and collect information on vectorization
//! used in code generation as well as runtime validation.
void validateAndCollectVectorizeInfo(Fusion* fusion);

//! Find the contig allocation domains that a vectorized loop domain
//! of a consumer TV depends on. Required for runtime validation.
void fillConsumerVectorizedContigAllocationDomains(
    const TensorView* consumer_tv,
    const ContigIDs& contig_finder);

//! Find the contig allocation domains that a vectorized loop domain
//! of a producer TV depends on. Required for runtime validation.
//! The producer must be transformed as the consumer.
void fillProducerVectorizedContigAllocationDomains(
    const TensorView* producer_tv,
    const TensorView* consumer_tv,
    const ContigIDs& contig_finder);

//! Validate data format and GPU arch compatibility of scheduled
//! mma operators on the fusion.
void validateMma(Fusion* fusion);

//! Validates swizzle ops to ensure consistent indexing:
//!  - Currently only allow swizzle ops on the right of the CA axis,
//!  - (Except ZShape) All swizzle ops have to be on const-sized ids,
//!  - Xor and Transpose swizzles have to have equal dimensions on the
//!    participating ids.
void validateSwizzle(Fusion* fusion);

//! Validate use of ParallelType::Group. It is currently only allowed
//! in ReductionOp and not in WelfordOp. Group has similar constraints
//! as Vectorize, e.g., it can only be used with IterDomains with
//! static extents. Differences are, e.g., that it has no constraints on
//! alignments and predicates. Each individual reduction has its own
//! predicate, so it is possible for only part of the grouped reductions to
//! be executed.
//!
//! Also, grouping is only enabled for persistent grid reductions, in
//! other words, grid allreduces. Note that no grid reduction without
//! broadcast is persistent anymore.
//!
//! A validated ReductionOp with ParallelType::Group is converted to
//! GroupedReductionOp.
void validateAndConvertIterDomainGrouping(Fusion* fusion);

//! Validate that the number of grouped reductions is within the limit.
void validateGroupedReductions(Fusion* fusion);

//! Validate that all of the lookup TVs are fusion inputs.
void validateLookupTV(Fusion* fusion);

//! Check that there are no reductions over unexpanded broadcasts.
void validateReductions(Fusion* fusion);

} // namespace nvfuser
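The declarations in device_lower/validation.h above are the lowering-time validation entry points. As a rough sketch of how they might be driven over a scheduled Fusion, the snippet below simply calls each one; the ordering is illustrative only, and the actual call sites (presumably inside the GPU lowering pass declared in device_lower/lower2device.h) may differ.

// Illustrative sketch only; the pass order shown is an assumption, not taken
// from nvfuser. Assumes include/nvfuser from the wheel is on the include path.
#include <device_lower/validation.h>

void runLoweringValidation(nvfuser::Fusion* fusion) {
  using namespace nvfuser;
  validateIr(fusion);                           // structural IR checks
  validateAndCollectVectorizeInfo(fusion);      // vectorization legality + info
  validateMma(fusion);                          // MmaOp data format / arch checks
  validateSwizzle(fusion);                      // swizzle-op indexing constraints
  validateAndConvertIterDomainGrouping(fusion); // ReductionOp -> GroupedReductionOp
  validateGroupedReductions(fusion);            // grouped-reduction count limit
  validateLookupTV(fusion);                     // lookup TVs must be fusion inputs
  validateReductions(fusion);                   // no reductions over unexpanded broadcasts
}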