nvfuser_cu121_torch25-0.2.25.dev20250201-cp310-cp310-manylinux_2_28_x86_64.whl

Files changed (242)
  1. nvfuser/_C.cpython-310-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +20 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/mma_type.h
@@ -0,0 +1,257 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <macros.h>
+
+ #include <exceptions.h>
+ #include <type.h>
+ #include <visibility.h>
+
+ #include <cstring>
+ #include <ostream>
+ #include <string>
+ #include <string_view>
+ #include <vector>
+
+ #include <cstdint>
+
+ namespace nvfuser {
+
+ constexpr std::string_view MATMUL_LOG_PREFIX = "[MATMUL DEBUG] ";
+
+ //! Named descriptors of domains in matmul
+ enum class MatmulDimRole { M = 0, N, K, Batch };
+
+ std::string toString(MatmulDimRole role);
+
+ //! Named descriptors of TensorView roles in fusion
+ //! OPERAND_A - an input to the fusion that is a producer of a matmul "A" input
+ //! OPERAND_B - an input to the fusion that is a producer of a matmul "B" input
+ //! OUTPUT - fusion outputs that have the matmul as a dependency
+ //! EPILOGUE_INPUT - an input to the fusion that is a producer of an
+ //!   OUTPUT, but not of an MMA input
+ //!
+ //! Note: bias vector tensors will be assigned to the EPILOGUE_INPUT role.
+ enum class MatmulTensorRole {
+   OPERAND_A = 0,
+   OPERAND_B,
+   OUTPUT,
+   EPILOGUE_INPUT
+ };
+
+ //! The expected number of occurrences of core TensorView roles in fusion
+ static constexpr size_t MATMUL_CORE_ROLES_EXPECTED_COUNT = 1;
+
+ //! Utility data structure for recording gemm tiles
+ struct GemmTile {
+   int64_t m, n, k;
+   GemmTile(int64_t m_, int64_t n_, int64_t k_) : m(m_), n(n_), k(k_) {}
+
+   bool operator==(const GemmTile& other) const {
+     return m == other.m && n == other.n && k == other.k;
+   }
+
+   GemmTile operator/(const GemmTile& other) const {
+     return GemmTile(m / other.m, n / other.n, k / other.k);
+   }
+
+   std::vector<int64_t> toVector() const {
+     return {m, n, k};
+   }
+ };
+
+ //! Utility data structure for recording matmul tile options
+ struct MatMulTileOptions {
+   GemmTile cta_tile = GemmTile(128, 128, 32);
+   GemmTile warp_tile = GemmTile(64, 64, 32);
+
+   MatMulTileOptions() = default;
+   MatMulTileOptions(GemmTile cta_tile_, GemmTile warp_tile_)
+       : cta_tile(cta_tile_), warp_tile(warp_tile_) {}
+
+   bool operator==(const MatMulTileOptions& other) const {
+     return cta_tile == other.cta_tile && warp_tile == other.warp_tile;
+   }
+ };
+
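As a quick illustration of how these two structs compose, a minimal sketch (not part of the shipped header): dividing the default CTA tile by the default warp tile with `GemmTile::operator/` gives the number of warp tiles per CTA along m, n, and k.

```cpp
#include <cassert>
#include <vector>

// With the defaults above (CTA 128x128x32, warp 64x64x32), element-wise
// division yields 2x2x1 warp tiles per CTA.
void tile_example() {
  nvfuser::MatMulTileOptions opts;
  nvfuser::GemmTile warps_per_cta = opts.cta_tile / opts.warp_tile;
  assert(warps_per_cta == nvfuser::GemmTile(2, 2, 1));
  assert(warps_per_cta.toVector() == (std::vector<int64_t>{2, 2, 1}));
}
```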
+ enum class MmaMacro : uint64_t;
+
+ struct MmaMacroEncode {
+   enum class Arch : uint16_t { NoMma, Volta, Turing, Ampere, Hopper } arch;
+   uint16_t m;
+   uint16_t n;
+   uint16_t k;
+
+   constexpr operator uint64_t() {
+     return (uint64_t)arch << 48 | (uint64_t)m << 32 | (uint64_t)n << 16 |
+         (uint64_t)k;
+   }
+
+   constexpr operator MmaMacro() {
+     return static_cast<MmaMacro>(static_cast<uint64_t>(*this));
+   }
+
+   constexpr MmaMacroEncode(MmaMacro macro)
+       : arch(Arch(toUnderlying(macro) >> 48)),
+         m((toUnderlying(macro) >> 32) & 0xFFFF),
+         n((toUnderlying(macro) >> 16) & 0xFFFF),
+         k(toUnderlying(macro) & 0xFFFF) {}
+
+   constexpr MmaMacroEncode(Arch arch_, uint16_t m_, uint16_t n_, uint16_t k_)
+       : arch(arch_), m(m_), n(n_), k(k_) {}
+ };
+
+ static_assert(sizeof(MmaMacroEncode) == sizeof(uint64_t));
+
+ //! Type of mma intrinsic macro to use
+ //! This determines which mma intrinsic from the runtime string is generated
+ //! to implement the mma op. The current plan is to have exactly one macro
+ //! for each (arch, datatype, operand layout) triple, though multiple
+ //! possibilities exist for some cases, e.g. for Turing and fp16 one can use
+ //! 16_8_8 or 16_8_16.
+ //! We will consider adding more choices that the scheduler can pick from
+ //! when our perf targets become more fine grained, which is more likely in
+ //! latency bound kernels.
+
+ #define MACRO(arch, m, n, k) \
+   arch##_##m##_##n##_##k = MmaMacroEncode(MmaMacroEncode::Arch::arch, m, n, k)
+
+ enum class MmaMacro : uint64_t {
+   NoMMA = 0,
+
+   MACRO(Turing, 16, 8, 8),
+   MACRO(Turing, 16, 8, 16),
+   MACRO(Turing, 16, 16, 16),
+
+   MACRO(Ampere, 16, 8, 16),
+   MACRO(Ampere, 16, 16, 16),
+
+   MACRO(Hopper, 64, 8, 16),
+   MACRO(Hopper, 64, 16, 16),
+   MACRO(Hopper, 64, 24, 16),
+   MACRO(Hopper, 64, 32, 16),
+   MACRO(Hopper, 64, 40, 16),
+   MACRO(Hopper, 64, 48, 16),
+   MACRO(Hopper, 64, 56, 16),
+   MACRO(Hopper, 64, 64, 16),
+   MACRO(Hopper, 64, 72, 16),
+   MACRO(Hopper, 64, 80, 16),
+   MACRO(Hopper, 64, 88, 16),
+   MACRO(Hopper, 64, 96, 16),
+   MACRO(Hopper, 64, 104, 16),
+   MACRO(Hopper, 64, 112, 16),
+   MACRO(Hopper, 64, 120, 16),
+   MACRO(Hopper, 64, 128, 16),
+   MACRO(Hopper, 64, 136, 16),
+   MACRO(Hopper, 64, 144, 16),
+   MACRO(Hopper, 64, 152, 16),
+   MACRO(Hopper, 64, 160, 16),
+   MACRO(Hopper, 64, 168, 16),
+   MACRO(Hopper, 64, 176, 16),
+   MACRO(Hopper, 64, 184, 16),
+   MACRO(Hopper, 64, 192, 16),
+   MACRO(Hopper, 64, 200, 16),
+   MACRO(Hopper, 64, 208, 16),
+   MACRO(Hopper, 64, 216, 16),
+   MACRO(Hopper, 64, 224, 16),
+   MACRO(Hopper, 64, 232, 16),
+   MACRO(Hopper, 64, 240, 16),
+   MACRO(Hopper, 64, 248, 16),
+   MACRO(Hopper, 64, 256, 16),
+ };
+
+ #undef MACRO
+
+ //! [Operand Layout Convention]
+ //! Operand layout, T=transposed/row_major, N=normal/col_major
+ //! Ordered by position of K
+ //! NT : K,M x K,N -> M,N
+ //! TT : M,K x K,N -> M,N
+ //! TN : M,K x N,K -> M,N
+ //! NN : K,M x N,K -> M,N
+ enum class MmaLayout { NT = 0, TT, TN, NN };
+
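A small illustrative helper (not from the header) spelling out which operand shapes each `MmaLayout` value denotes per the convention above, where the first letter describes A, the second describes B, and dimensions are listed outer-to-inner:

```cpp
// Human-readable operand shapes for each layout, per the comment above.
inline const char* operandShapes(nvfuser::MmaLayout layout) {
  switch (layout) {
    case nvfuser::MmaLayout::NT:
      return "A: [K,M], B: [K,N]"; // K outermost in both operands
    case nvfuser::MmaLayout::TT:
      return "A: [M,K], B: [K,N]";
    case nvfuser::MmaLayout::TN:
      return "A: [M,K], B: [N,K]"; // K innermost in both operands
    case nvfuser::MmaLayout::NN:
      return "A: [K,M], B: [N,K]";
  }
  return "";
}
```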
+ //! Indicates which dimension is innermost in the allocation domain of an
+ //! operand
+ enum class UnitDim { K, M_or_N };
+
+ //! Utility to annotate which input of mma this option struct describes
+ enum class MmaOperand { A, B };
+
+ //! GPU arch check for macro type
+ inline bool isTuring(MmaMacro macro) {
+   return MmaMacroEncode(macro).arch == MmaMacroEncode::Arch::Turing;
+ }
+
+ inline bool isAmpere(MmaMacro macro) {
+   return MmaMacroEncode(macro).arch == MmaMacroEncode::Arch::Ampere;
+ }
+
+ inline bool isHopper(MmaMacro macro) {
+   return MmaMacroEncode(macro).arch == MmaMacroEncode::Arch::Hopper;
+ }
+
+ //! Get the m size from macro type
+ inline int64_t getM(MmaMacro macro) {
+   return MmaMacroEncode(macro).m;
+ }
+
+ //! Get the n size from macro type
+ inline int64_t getN(MmaMacro macro) {
+   return MmaMacroEncode(macro).n;
+ }
+
+ //! Get the k size from macro type
+ inline int64_t getK(MmaMacro macro) {
+   return MmaMacroEncode(macro).k;
+ }
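A usage sketch for the accessors above (illustrative only), using an enumerator defined earlier in this header:

```cpp
#include <cassert>

void query_macro() {
  // Ampere_16_8_16 was defined via the MACRO helper above.
  auto macro = nvfuser::MmaMacro::Ampere_16_8_16;
  assert(nvfuser::isAmpere(macro) && !nvfuser::isHopper(macro));
  assert(nvfuser::getM(macro) == 16);
  assert(nvfuser::getN(macro) == 8);
  assert(nvfuser::getK(macro) == 16);
}
```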
+
+ // Unpacked constants from macro type:
+ //   exact numbers are defined by each individual instruction.
+ int getOutputRegisterSize(MmaMacro macro);
+ int getInputARegisterSize(MmaMacro macro);
+ int getInputBRegisterSize(MmaMacro macro);
+
+ // Unpack MMA op shape
+ GemmTile getMmaOpShape(MmaMacro macro);
+
+ // Warning: The values of the enum class must match the matrix descriptor as
+ // specified in:
+ // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor
+ // Do not edit the values of the enum class unless you know what you are doing.
+ enum class MmaInputSmemSwizzle {
+   None = 0,
+   B128 = 1,
+   B64 = 2,
+   B32 = 3,
+ };
+
+ constexpr int64_t core_matrix_width_bytes = 16;
+
+ int64_t getBytesFromSwizzle(MmaInputSmemSwizzle swizzle);
+ MmaInputSmemSwizzle getSwizzleFromBytes(int64_t bytes);
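The definitions of `getBytesFromSwizzle` and `getSwizzleFromBytes` live in the compiled library, not in this header. A plausible mapping implied by the enum names (an assumption, not the shipped implementation) would be:

```cpp
// Assumed mapping (not from the wheel): BNNN selects an NNN-byte swizzle
// period; None is assumed to fall back to the 16-byte core matrix width.
inline int64_t bytesFromSwizzleSketch(nvfuser::MmaInputSmemSwizzle swizzle) {
  switch (swizzle) {
    case nvfuser::MmaInputSmemSwizzle::B128:
      return 128;
    case nvfuser::MmaInputSmemSwizzle::B64:
      return 64;
    case nvfuser::MmaInputSmemSwizzle::B32:
      return 32;
    case nvfuser::MmaInputSmemSwizzle::None:
    default:
      return nvfuser::core_matrix_width_bytes; // assumption for None
  }
}
```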
+
+ // MMA stringify utils
+ NVF_API std::string toString(MmaLayout input_layout);
+ std::string toString(const GemmTile& tile);
+ NVF_API std::string toString(const MatMulTileOptions& opts);
+ NVF_API std::string toString(MmaMacro macro);
+ NVF_API std::string toString(MmaInputSmemSwizzle swizzle);
+ inline std::ostream& operator<<(
+     std::ostream& os,
+     MmaInputSmemSwizzle input_layout) {
+   os << toString(input_layout);
+   return os;
+ }
+
+ // MMA hash utils
+ NVF_API size_t hash(MmaMacro macro);
+ size_t hash(MmaLayout input_layout);
+ size_t hash(const GemmTile& tile);
+ NVF_API size_t hash(const MatMulTileOptions& opts);
+
+ } // namespace nvfuser
nvfuser/include/nvfuser/multidevice/c10d_mock.h
@@ -0,0 +1,175 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <ATen/core/TensorBody.h>
+ #include <ATen/core/ivalue.h>
+ #include <c10/util/intrusive_ptr.h>
+
+ namespace c10d {
+
+ inline void setDebugLevelFromEnvironment() {}
+
+ class Work : public torch::CustomClassHolder {
+  public:
+   void wait() {}
+ };
+
+ struct ReduceOp : torch::CustomClassHolder {
+   enum RedOpType {
+     SUM,
+     AVG,
+     PRODUCT,
+     MIN,
+     MAX,
+     BAND,
+     BOR,
+     BXOR,
+     UNUSED,
+   };
+
+   ReduceOp() = default;
+   ReduceOp(RedOpType op) : op_(op) {}
+
+   RedOpType op_ = UNUSED;
+ };
+
+ struct ReduceScatterOptions {
+   ReduceOp reduceOp = ReduceOp::UNUSED;
+ };
+
+ struct ScatterOptions {
+   int64_t rootRank = 0;
+ };
+
+ struct AllgatherOptions {};
+
+ struct GatherOptions {
+   int64_t rootRank = 0;
+ };
+
+ struct BroadcastOptions {
+   int64_t rootRank = 0;
+ };
+
+ struct AllreduceOptions {
+   ReduceOp reduceOp = ReduceOp::UNUSED;
+ };
+
+ struct ReduceOptions {
+   ReduceOp reduceOp = ReduceOp::UNUSED;
+   int64_t rootRank = 0;
+ };
+
+ struct BarrierOptions {
+   std::vector<int64_t> device_ids;
+ };
+
+ class Backend : public torch::CustomClassHolder {
+  public:
+   void startCoalescing() {}
+
+   c10::intrusive_ptr<Work> endCoalescing() {
+     return c10::make_intrusive<Work>();
+   }
+
+   const std::string getBackendName() const {
+     return "";
+   };
+
+   c10::intrusive_ptr<Work> barrier(
+       const BarrierOptions& opts = BarrierOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> send(
+       std::vector<at::Tensor>& tensors,
+       int dstRank,
+       int tag) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> recv(
+       std::vector<at::Tensor>& tensors,
+       int srcRank,
+       int tag) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> allgather(
+       std::vector<std::vector<at::Tensor>>& outputTensors,
+       std::vector<at::Tensor>& inputTensors,
+       const AllgatherOptions& opts = AllgatherOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> _allgather_base(
+       at::Tensor& outputBuffer,
+       at::Tensor& inputBuffer,
+       const AllgatherOptions& opts = AllgatherOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> gather(
+       std::vector<std::vector<at::Tensor>>& outputTensors,
+       std::vector<at::Tensor>& inputTensors,
+       const GatherOptions& opts = GatherOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> reduce_scatter(
+       std::vector<at::Tensor>& outputTensors,
+       std::vector<std::vector<at::Tensor>>& inputTensors,
+       const ReduceScatterOptions& opts = ReduceScatterOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> _reduce_scatter_base(
+       at::Tensor& outputBuffer,
+       at::Tensor& inputBuffer,
+       const ReduceScatterOptions& opts = ReduceScatterOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> scatter(
+       std::vector<at::Tensor>& outputTensors,
+       std::vector<std::vector<at::Tensor>>& inputTensors,
+       const ScatterOptions& opts = ScatterOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> broadcast(
+       std::vector<at::Tensor>& tensors,
+       const BroadcastOptions& opts = BroadcastOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> allreduce(
+       std::vector<at::Tensor>& tensors,
+       const AllreduceOptions& opts = AllreduceOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   c10::intrusive_ptr<Work> reduce(
+       std::vector<at::Tensor>& tensors,
+       const ReduceOptions& opts = ReduceOptions()) {
+     return c10::make_intrusive<Work>();
+   }
+
+   int getSize() const {
+     return 0;
+   }
+ };
+
+ struct TCPStoreOptions {
+   static constexpr uint16_t kDefaultPort = 0;
+ };
+
+ class TCPStore : public torch::CustomClassHolder {};
+
+ } // namespace c10d
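This header is a compile-time stand-in for PyTorch's `c10d` API, used when nvfuser is built without `NVFUSER_DISTRIBUTED` (see the `#ifdef` in `multidevice/communication.h` below): every collective is a no-op that returns a fresh `Work`, and `wait()` returns immediately. A minimal sketch of what calling code sees:

```cpp
#include <multidevice/c10d_mock.h>

// Sketch: the mock lets single-process builds exercise the same call shapes
// as real process-group code, with no communication actually happening.
void mock_roundtrip(c10d::Backend& backend, std::vector<at::Tensor>& tensors) {
  c10d::AllreduceOptions opts;
  opts.reduceOp = c10d::ReduceOp(c10d::ReduceOp::SUM);
  c10::intrusive_ptr<c10d::Work> work = backend.allreduce(tensors, opts);
  work->wait(); // no-op: nothing was posted
}
```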
nvfuser/include/nvfuser/multidevice/communication.h
@@ -0,0 +1,232 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <ir/base_nodes.h>
+ #include <ir/builder.h>
+ #include <ir/interface_nodes.h>
+ #include <multidevice/communicator.h>
+ #include <multidevice/device_mesh.h>
+ #include <multidevice/multidevice.h>
+ #ifdef NVFUSER_DISTRIBUTED
+ #include <torch/csrc/distributed/c10d/Types.hpp>
+ #else
+ #include <multidevice/c10d_mock.h>
+ #endif
+ #include <type.h>
+ #include <visibility.h>
+
+ namespace nvfuser {
+
+ enum class CommunicationType {
+   Gather,
+   Allgather,
+   Scatter,
+   Reduce,
+   Allreduce,
+   ReduceScatter,
+   Broadcast,
+   SendRecv
+ };
+
+ std::ostream& operator<<(std::ostream& os, const CommunicationType& type);
+
+ using RedOpType = c10d::ReduceOp::RedOpType;
+
+ // The class "Communication" represents an MPI-style communication operation
+ // to be executed on the network. The base class Communication should not be
+ // used directly but through its derived classes: Broadcast, Gather, Scatter,
+ // Allgather, and SendRecv. Other collectives will be added later.
+ class Communication : public Expr {
+  public:
+   using Expr::Expr;
+   // Only specify `root` for types that have root.
+   // Only specify `red_op` for reduction types.
+   // Only specify `scattered_axis` for ReduceScatter.
+   Communication(
+       IrBuilderPasskey passkey,
+       CommunicationType type,
+       TensorView* out,
+       TensorView* in,
+       Team team, // All devices involved in this communication. It must
+                  // include `root`. It can be a subset of `root`+`mesh` in
+                  // case of 2D sharding.
+       DeviceIdxType root = -1,
+       RedOpType red_op = RedOpType::UNUSED,
+       int64_t scattered_axis = -1);
+
+   Communication(const Communication& other) = delete;
+   Communication& operator=(const Communication& other) = delete;
+   Communication(Communication&& other) = delete;
+   Communication& operator=(Communication&& other) = delete;
+
+   NVFUSER_DECLARE_CLONE_AND_CREATE
+
+   std::string toString(int indent_size = 0) const override;
+   std::string toInlineString(int indent_size = 0) const override;
+   const char* getOpString() const override {
+     return "Communication";
+   }
+
+   CommunicationType type() const {
+     return attribute<CommunicationType>(0);
+   }
+
+   TensorView* out() const {
+     return output(0)->as<TensorView>();
+   }
+
+   TensorView* in() const {
+     return input(0)->as<TensorView>();
+   }
+
+   const Team& team() const {
+     return attribute<Team>(1);
+   }
+
+   // A convenience helper so the user doesn't need to convert size_t to
+   // int64_t.
+   int64_t team_size() const {
+     return static_cast<int64_t>(team().size());
+   }
+
+   DeviceIdxType root() const {
+     return attribute<DeviceIdxType>(2);
+   }
+
+   RedOpType reduceOp() const {
+     return attribute<RedOpType>(3);
+   }
+
+   int64_t scatteredAxis() const {
+     return attribute<int64_t>(4);
+   }
+
+   // PyTorch's process group expects the root to be specified as an integer
+   // between 0 and world_size-1. We choose it to be the device's relative
+   // index within the team.
+   int64_t getRootRelativeIndex();
+
+  private:
+   void validate();
+ };
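Per the comment on `getRootRelativeIndex`, the rank handed to the process group is the root's position within `team`. A minimal sketch of that lookup (an assumption about the out-of-line definition, which is not in this header; it presumes `Team` is a vector of `DeviceIdxType`):

```cpp
#include <algorithm>
#include <iterator>

// Assumed equivalent of Communication::getRootRelativeIndex() (not the
// shipped definition): find the root's position inside the team.
inline int64_t rootRelativeIndexSketch(
    const nvfuser::Team& team,
    nvfuser::DeviceIdxType root) {
  auto it = std::find(team.begin(), team.end(), root);
  return static_cast<int64_t>(std::distance(team.begin(), it));
}
```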
+
+ enum class P2PCommunicationType { SEND, RECV };
+
+ std::ostream& operator<<(std::ostream& os, const P2PCommunicationType& type);
+
+ class P2PCommunication : public Expr {
+  public:
+   using Expr::Expr;
+
+   P2PCommunication(
+       IrBuilderPasskey passkey,
+       P2PCommunicationType type,
+       TensorView* buffer,
+       Val* peer);
+
+   P2PCommunication(const P2PCommunication& other) = delete;
+   P2PCommunication& operator=(const P2PCommunication& other) = delete;
+   P2PCommunication(P2PCommunication&& other) = delete;
+   P2PCommunication& operator=(P2PCommunication&& other) = delete;
+
+   NVFUSER_DECLARE_CLONE_AND_CREATE
+
+   std::string toString(int indent_size = 0) const override;
+   std::string toInlineString(int indent_size = 0) const override;
+   const char* getOpString() const override {
+     return "P2PCommunication";
+   }
+
+   P2PCommunicationType type() const {
+     return attribute<P2PCommunicationType>(0);
+   }
+
+   TensorView* buffer() const {
+     return input(0)->as<TensorView>();
+   }
+
+   Val* peer() const {
+     return attributeVal(1);
+   }
+ };
+
+ // The method "post" triggers the execution of the communication. This call
+ // is non-blocking. The communication can be posted multiple times.
+ // It is assumed that the current device_index (given by
+ // communicator.deviceId()) belongs to the team of the communication,
+ // otherwise an error is thrown.
+ //
+ // NOTE: pytorch's NCCL process group API needs <team_size> buffers on root
+ // for scatter/gather operations.
+ // (*) Broadcast
+ //     Copies the root's src buffer to each device's dst buffer.
+ //     Requirements:
+ //       - the root is set and belongs to the team
+ //       - the root has one src buffer, and no or one dst buffer
+ //       - non-roots have no src buffer and one dst buffer
+ //       - all buffers have the same size
+ // (*) Gather
+ //     Copies each device's src buffer to the root's respective dst
+ //     buffer. The order of the sender devices matches the order of the
+ //     root's buffers.
+ //     Requirements:
+ //       - the root is set and belongs to the team
+ //       - the root has one src buffer and <team_size> dst buffers
+ //       - non-roots have one src buffer and no dst buffer
+ //       - all buffers have the same size
+ // (*) Allgather
+ //     Copies each device's src buffer to each device's respective dst
+ //     buffer. The order of the devices matches the order of the buffers.
+ //     Requirements:
+ //       - all devices have one src buffer and <team_size> dst buffers
+ //       - all buffers have the same size
+ // (*) Scatter
+ //     Copies each of the root's src buffers to each device's dst buffer.
+ //     The order of the buffers matches the order of the receiver devices.
+ //     Requirements:
+ //       - the root is set and belongs to the team
+ //       - the root has <team_size> src buffers and one dst buffer
+ //       - non-roots have no src buffer and one dst buffer
+ //       - all buffers have the same size
+ // (*) Reduce
+ //     Reduces the src buffers to the root's dst buffer.
+ //     Requirements:
+ //       - the root is set and belongs to the team
+ //       - the root has one src buffer and one dst buffer
+ //       - non-roots have one src buffer and no dst buffer
+ //       - all buffers have the same size
+ // (*) Allreduce
+ //     Reduces the src buffers to each device's dst buffer.
+ //     Requirements:
+ //       - all devices have one src buffer and one dst buffer
+ //       - all buffers have the same size
+ // (*) ReduceScatter
+ //     Reduces all the src buffers and shards the result to the dst buffers.
+ //     Requirements:
+ //       - all devices have <team_size> src buffers and one dst buffer
+ //       - all buffers have the same size
+ // (*) SendRecv
+ //     Copies the sender's src buffer to the receiver's dst buffer.
+ //     It is equivalent to a Broadcast with a team of size 2.
+ c10::intrusive_ptr<c10d::Work> postSingleCommunication(
+     Communication* communication,
+     DeviceIdxType my_device_index,
+     c10d::Backend* backend,
+     at::Tensor input_tensor,
+     at::Tensor output_tensor);
+
+ c10::intrusive_ptr<c10d::Work> postSingleCommunication(
+     P2PCommunication* communication,
+     DeviceIdxType my_device_index,
+     DeviceIdxType peer,
+     c10d::Backend* backend,
+     at::Tensor buffer);
+
+ } // namespace nvfuser
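A hypothetical call site for the first `postSingleCommunication` overload (all names here are stand-ins, not from the wheel). Posting is non-blocking and repeatable, and the calling device must belong to the communication's team:

```cpp
#include <multidevice/communication.h>

// Post a collective and block until it finishes (sketch, not shipped code).
c10::intrusive_ptr<c10d::Work> postAndWait(
    nvfuser::Communication* comm,
    nvfuser::DeviceIdxType my_device_index, // must belong to comm->team()
    c10d::Backend* backend,
    at::Tensor input,
    at::Tensor output) {
  auto work = nvfuser::postSingleCommunication(
      comm, my_device_index, backend, input, output);
  work->wait(); // posting is asynchronous; wait() blocks until completion
  return work;
}
```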