nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242)
  1. nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
@@ -0,0 +1,179 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <ATen/core/TensorBody.h>
+ #include <ATen/core/ivalue.h>
+ #include <c10/util/intrusive_ptr.h>
+
+ #include <exceptions.h>
+ #include <multidevice/multidevice.h>
+ #ifdef NVFUSER_DISTRIBUTED
+ #include <torch/csrc/distributed/c10d/Backend.hpp>
+ #include <torch/csrc/distributed/c10d/TCPStore.hpp>
+ #include <torch/csrc/distributed/c10d/Work.hpp>
+ #else
+ #include <multidevice/c10d_mock.h>
+ #endif
+ #include <visibility.h>
+
+ namespace nvfuser {
+
+ // This file implements the class Communicator, which sets up the inter-process
+ // Backend. This class contains inter-process information, such as the rank and
+ // the world size, as well as the Process Group that can be called to perform
+ // inter-process communications.
+ //
+ // Each process is associated with a unique deviceId and device. The actual MPI
+ // rank remains private to the class and should not be used by the user. The
+ // communicator class privately holds the mappings ranks <-> device IDs <->
+ // device.
+
+ using RankType = DeviceIdxType;
+
+ // Supported backends. TODO: gloo untested
+ enum class CommunicatorBackend { kNccl, kUcc, kGloo };
+
+ std::ostream& operator<<(std::ostream& out, const CommunicatorBackend& cb);
+
+ #ifdef USE_C10D_NCCL
+ constexpr CommunicatorBackend comm_backend_default = CommunicatorBackend::kNccl;
+ #else
+ constexpr CommunicatorBackend comm_backend_default = CommunicatorBackend::kUcc;
+ #endif
+ constexpr int comm_server_local_rank_default = 0;
+
+ class Communicator {
+  public:
+   static Communicator& getInstance() {
+     // Using a singleton this way isn't best practice. Ideally, we'd like to
+     // ```
+     // static Communicator communicator;
+     // ```
+     // and let the destructor clean it up at program exit after `main` returns.
+     // This however would cause a "driver shutting down" error, likely because
+     // another static variable destructor shuts down the CUDA driver before
+     // ~Communicator. Note that the order of static variable destruction
+     // across translation units is undefined.
+     //
+     // Therefore, we `new Communicator()` as a raw pointer and let the user
+     // call Communicator::getInstance().cleanup() to clean up the Communicator
+     // explicitly before the end of `main`. For example, the cleanup method is
+     // called via MultiDeviceTestEnvironment::TearDown in C++ unit tests and
+     // nvfuser._cleanup() in Python.
+     static auto* communicator = new Communicator();
+     return *communicator;
+   }
+
+   Communicator(const Communicator&) = delete;
+   Communicator& operator=(const Communicator&) = delete;
+   ~Communicator() = delete;
+   // As explained in `getInstance`, the user of this class is expected to call
+   // this method to clean up the singleton. It can only be called once.
+   void cleanup();
+
+   // returns whether the distributed config is available
+   auto is_available() const {
+     return is_available_;
+   }
+
+   // returns the number of processes in the communicator
+   auto size() const {
+     return size_;
+   }
+
+   // returns the local number of processes in the communicator (within the node)
+   auto local_size() const {
+     return local_size_;
+   }
+
+   // sets the communicator's default backend
+   void setDefaultBackend(CommunicatorBackend backend) {
+     default_backend_ = backend;
+   }
+
+   // performs a blocking barrier in the communicator
+   void barrier(std::optional<CommunicatorBackend> backend = std::nullopt);
+
+   // returns the backend associated with a team.
+   // The argument "prefix" is prepended to the key used to retrieve preexisting
+   // backends. Prefix is used to distinguish between different backends with
+   // the same team.
+   c10d::Backend* getBackendForTeam(
+       const Team& team,
+       std::optional<CommunicatorBackend> backend,
+       const std::string& prefix = "");
+
+   // returns the device associated with the current process
+   auto device() const {
+     return at::Device("cuda:" + std::to_string(local_rank_));
+   }
+
+   // returns the device Id associated with the current process
+   DeviceIdxType deviceId() const {
+     return rankToDiD(rank_);
+   }
+
+   // returns the local rank associated with the current process,
+   // i.e. the rank within a machine/node as opposed to the rank within the
+   // world.
+   RankType local_rank() const {
+     return local_rank_;
+   }
+
+   // returns the world backend for the given communicator backend, or the
+   // default backend if not specified.
+   c10d::Backend* getWorld(
+       std::optional<CommunicatorBackend> backend = std::nullopt);
+
+   // returns whether a backend is available for creation
+   bool isBackendAvailable(CommunicatorBackend backend) const {
+     if (backend == CommunicatorBackend::kUcc) {
+       return ucc_available_;
+     } else if (backend == CommunicatorBackend::kNccl) {
+       return nccl_available_;
+     }
+     return false;
+   }
+
+  private:
+   Communicator(
+       CommunicatorBackend backend = comm_backend_default,
+       RankType server_local_rank = comm_server_local_rank_default);
+
+   // returns the rank corresponding to a device index
+   RankType dIdToRank(DeviceIdxType d_id) const {
+     return static_cast<RankType>(d_id);
+   }
+
+   // returns the device index corresponding to a rank
+   DeviceIdxType rankToDiD(RankType rank) const {
+     return static_cast<DeviceIdxType>(rank);
+   }
+
+   CommunicatorBackend getBackend(std::optional<CommunicatorBackend> backend) {
+     return backend.value_or(default_backend_);
+   }
+
+   bool is_available_;
+   CommunicatorBackend default_backend_;
+   RankType rank_;
+   int64_t size_;
+   RankType local_rank_;
+   int64_t local_size_;
+   std::string master_addr_;
+   int master_port_;
+   bool ucc_available_;
+   bool nccl_available_;
+   // stores the world's store used for the backend init
+   c10::intrusive_ptr<c10d::TCPStore> store_;
+   // cache for the created backends. The keys are strings generated from Teams
+   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
+ };
+
+ } // namespace nvfuser
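
Note: the comments in getInstance() and cleanup() imply a specific usage pattern. Below is a minimal sketch of a hypothetical driver (the surrounding main() is an assumption, not part of the wheel; the member functions are those declared above):

#include <multidevice/communicator.h>

int main() {
  using namespace nvfuser;
  Communicator& comm = Communicator::getInstance();
  if (comm.is_available()) {
    int64_t world_size = comm.size();   // processes in the communicator
    RankType local = comm.local_rank(); // rank within this node
    DeviceIdxType id = comm.deviceId(); // device index of this process
    (void)world_size;
    (void)local;
    (void)id;
    comm.barrier(); // blocking barrier on the default backend
  }
  // The destructor is deleted, so cleanup() must be called explicitly
  // before main() returns to avoid racing CUDA driver shutdown.
  comm.cleanup();
  return 0;
}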
@@ -0,0 +1,95 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+
+ #pragma once
+
+ #include <vector>
+
+ #include <exceptions.h>
+ #include <multidevice/multidevice.h>
+ #include <type.h>
+ #include <visibility.h>
+
+ namespace nvfuser {
+
+ // The class DeviceMesh represents a set of (unique) devices on which a
+ // Pipeline Stage will be executed. For now, we only support flat meshes, but
+ // later we will add support for n-dimensional meshes.
+ class DeviceMesh final {
+  public:
+   // https://google.github.io/styleguide/cppguide.html#Implicit_Conversions
+   //
+   // Not using `explicit` for the constructor that takes a vector would lead
+   // to contention between operator<<(std::vector) defined in
+   // c10/util/Logging.h and operator<<(DeviceMesh) defined later in this file,
+   // which would be resolved arbitrarily by the compiler.
+   //
+   // There is no such contention for std::initializer_list, so I chose to
+   // allow implicit conversion for that. This allows users to write
+   // `DeviceMesh mesh = {1, 2};`, which is more concise.
+   explicit DeviceMesh(std::vector<DeviceIdxType> devices = {});
+   DeviceMesh(std::initializer_list<DeviceIdxType> devices);
+   DeviceMesh(const DeviceMesh&) = default;
+   DeviceMesh(DeviceMesh&&) = default;
+   DeviceMesh& operator=(const DeviceMesh&) = default;
+   DeviceMesh& operator=(DeviceMesh&&) = default;
+
+   // Creates a device mesh of [0 .. num_devices-1]. I didn't make it a
+   // constructor because single-element initializer lists would be directed to
+   // use that instead of the constructor for vectors.
+   static DeviceMesh createForNumDevices(int64_t num_devices);
+
+   // Returns the number of devices in the mesh
+   int64_t size() const {
+     return static_cast<int64_t>(vector_.size());
+   }
+
+   int64_t size(ParallelType parallel_type) const;
+
+   // Returns a vector containing the device indices of the mesh
+   const std::vector<DeviceIdxType>& vector() const {
+     return vector_;
+   }
+
+   // Returns whether a device is present in the mesh
+   bool has(const DeviceIdxType device) const {
+     return std::find(vector_.begin(), vector_.end(), device) != vector_.end();
+   }
+
+   // Returns the index of device in the mesh, or -1 if device is not present.
+   int64_t idxOf(const DeviceIdxType device) const {
+     auto it = std::find(vector_.begin(), vector_.end(), device);
+     if (it != vector_.end()) {
+       return std::distance(vector_.begin(), it);
+     }
+     return -1;
+   }
+
+   // Returns the device at a particular index in the mesh
+   DeviceIdxType at(int64_t index) const {
+     return vector_.at(index);
+   }
+
+   bool operator==(const DeviceMesh& other) const {
+     return vector_ == other.vector();
+   }
+
+   bool operator!=(const DeviceMesh& other) const {
+     return vector_ != other.vector();
+   }
+
+  private:
+   void setDevices(std::vector<DeviceIdxType> devices);
+
+   // stores the list of device indices
+   std::vector<DeviceIdxType> vector_;
+ };
+
+ std::ostream& operator<<(std::ostream& out, const DeviceMesh& mesh);
+
+ } // namespace nvfuser
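
Note: the constructor comments above translate into the following construction patterns. A small sketch using only the members declared in this header (the wrapper function is illustrative):

#include <multidevice/device_mesh.h>

void deviceMeshExamples() {
  using namespace nvfuser;
  // Implicit conversion is allowed only for initializer lists, so the
  // concise form compiles:
  DeviceMesh mesh = {1, 2};
  // A vector requires the explicit constructor:
  DeviceMesh from_vec(std::vector<DeviceIdxType>{0, 1, 2, 3});
  // Factory for the contiguous mesh [0 .. 3]; a constructor would clash
  // with single-element initializer lists:
  DeviceMesh four = DeviceMesh::createForNumDevices(4);

  bool present = mesh.has(2);       // true: device 2 is in the mesh
  int64_t idx = four.idxOf(7);      // -1: device 7 is not in the mesh
  DeviceIdxType first = four.at(0); // 0
  (void)present;
  (void)idx;
  (void)first;
}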
@@ -0,0 +1,107 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <c10/core/DeviceType.h>
+ #include <exceptions.h>
+ #include <fusion.h>
+ #include <fusion_segmenter.h>
+ #include <host_ir/executor.h>
+ #include <ir/cloner.h>
+ #include <multidevice/communication.h>
+ #include <multidevice/communicator.h>
+ #include <multidevice/multidevice.h>
+
+ namespace nvfuser {
+
+ /*
+ The MultiDeviceExecutor executes a Fusion in a multi-device setting.
+ It is instantiated from a Fusion and a Communicator.
+
+ The Fusion must be scheduled prior to the instantiation of the
+ MultiDeviceExecutor. One can use the multidevice scheduling API to specify
+ the desired tensor sharding. It is composed of two aspects:
+ *) Set each tensor's DeviceMesh, through TensorView::setDeviceMesh
+ *) parallelize each tensor axis, possibly with the multidevice sharding
+    parallel type ParallelType::DIDx
+
+ We make the following assumptions on the Fusion:
+ - Only one (non-reduction) axis is allowed to be parallelized
+   with ParallelType::DIDx. Moreover, this axis cannot be split/merged.
+ - We only support 1D device meshes for now
+ - We only support TensorViews in communication segments.
+
+ Summary of the different steps performed by the MultiDeviceExecutor:
+ I. At instantiation:
+ - resharding "Set" exprs are automatically inserted in the fusion where a
+   network communication is needed. See the function insertReshardings.
+ - the Fusion is segmented into segments which can be of two types:
+   1) compute segments, composed of non-Resharding expressions only,
+      which can be executed purely on a single device
+   or
+   2) communication segments, composed of exactly one resharding expression,
+      which can be either a "Set" or a "Reduce" Expr.
+ - the runtime order of execution of the different segments is computed in
+   prepareRuntimeOrder
+
+ II. At runtime, through the method runWithInput:
+ - allocateRecvBuffers allocates on each device the necessary buffers to
+   store the data received from network communications
+ - Each (compute or comm) segment is executed separately, in order:
+   1) each compute segment is transformed into a fusion, compiled and executed
+      on a single device, see postKernel
+   2) each comm segment is lowered into a series of communications (defined in
+      multidevice/communications.h) and posted on the stream.
+      "Wait" primitives are also posted on the stream.
+
+ TODOS:
+ *) the MultiDeviceExecutor should be integrated into FusionExecutorCache.
+ *) The different steps should be divided into compilation, allocation,
+    runtime, etc. This will be done along the way once we have a better
+    symbolic representation of the multidevice modules
+ *) Allocation of buffers needs to be reimplemented
+ *) Need to work on auto-scheduling, in particular, to combine inter-/intra-
+    device scheduling.
+ */
+
+ class MultiDeviceExecutor {
+  public:
+   MultiDeviceExecutor(
+       std::unique_ptr<Fusion> fusion,
+       Communicator& comm,
+       hir::HostIrEvaluatorParams params = hir::HostIrEvaluatorParams());
+
+   // Runs the fusion on several devices with the given global inputs
+   std::vector<at::Tensor> runWithInput(const std::vector<c10::IValue>& inputs);
+
+   // Returns the Communicator
+   Communicator* comm() const {
+     return &comm_;
+   }
+
+   // Checks if the runtime is valid and returns an error msg.
+   // An empty message means that the runtime is valid.
+   std::string validate() const {
+     return host_ir_executor_->canRun();
+   }
+
+   //! Print to default debugging output stream
+   std::ostream& print(std::ostream& os = debug());
+
+   const auto& getFusionExecutorCaches() {
+     return host_ir_executor_->getFusionExecutorCaches();
+   };
+
+  private:
+   // holds the Communicator to be used for execution
+   Communicator& comm_;
+   // holds the HostIrEvaluator used for execution
+   std::unique_ptr<hir::HostIrEvaluator> host_ir_executor_;
+ };
+
+ } // namespace nvfuser
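
Note: a condensed sketch of the instantiate-then-run flow described in the block comment above. The fusion is assumed to be already scheduled with the multidevice API; the wrapper function and error handling are assumptions, while the constructor, validate(), and runWithInput() are the members declared in this header:

#include <multidevice/executor.h>

#include <memory>
#include <stdexcept>

std::vector<at::Tensor> runSharded(
    std::unique_ptr<nvfuser::Fusion> fusion,
    const std::vector<c10::IValue>& inputs) {
  using namespace nvfuser;
  // Each TensorView must already have a DeviceMesh set and at most one
  // non-reduction axis parallelized with ParallelType::DIDx.
  Communicator& comm = Communicator::getInstance();
  MultiDeviceExecutor executor(std::move(fusion), comm);
  // validate() returns an empty string iff the runtime can run.
  std::string error = executor.validate();
  if (!error.empty()) {
    throw std::runtime_error(error);
  }
  return executor.runWithInput(inputs);
}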
@@ -0,0 +1,18 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+
+ #pragma once
+
+ #include <c10/core/Device.h>
+
+ namespace nvfuser {
+ using DeviceIdxType = int64_t;
+ using DimensionType = int;
+ using DeviceType = c10::Device;
+ using Team = std::vector<DeviceIdxType>;
+ } // namespace nvfuser
@@ -0,0 +1,187 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <c10/util/ArrayRef.h>
+
+ #include <compute_at_map.h>
+ #include <fusion.h>
+ #include <id_model/id_model.h>
+ #include <ir/interface_nodes.h>
+ #include <multidevice/multidevice.h>
+ #include <visibility.h>
+
+ namespace nvfuser {
+
+ // Returns true iff nvFuser was compiled with distributed APIs enabled.
+ NVF_API bool distributedEnabled();
+
+ // For a resharding expression, either a set or reduce, returns the root IDs
+ // that change sharding:
+ // (1) sharded root IterDomains that are added by the expression,
+ //     i.e. sharded IterDomains present in the output but not the input.
+ // (2) sharded root IterDomains that are removed by the expression,
+ //     i.e. sharded IterDomains present in the input but not the output.
+ // TODO: Analyze loop domain for unsharded/sharded IDs and return their
+ // parent root IDs.
+ std::pair<std::vector<IterDomain*>, std::vector<IterDomain*>> getShardingChanges(
+     TensorView* producer,
+     TensorView* consumer);
+
+ // Returns whether a TensorView has a non-reduction axis parallelized on DIDx.
+ // Checks that the other non-reduction axes are not parallelized on DIDx.
+ bool isSharded(const TensorView*);
+
+ // Returns the number of device dimensions in a TensorView's loop domain.
+ int64_t numDeviceDims(const TensorView*);
+
+ // Returns the subset of tvs whose elements have a different multi-device
+ // sharding than ref
+ template <typename TvIterator>
+ std::unordered_set<TensorView*> getTvsWithDifferentSharding(
+     TensorView* ref,
+     TvIterator tvs) {
+   std::unordered_set<TensorView*> ret;
+   const auto& reference_dom = ref->getLoopDomain();
+   FusionGuard fg(ref->fusion());
+   auto ca_map = ComputeAtMap(FusionGuard::getCurFusion());
+   std::unordered_map<IterDomain*, IterDomain*> concrete_to_reference_map;
+   for (auto id : reference_dom) {
+     auto ca_id =
+         ca_map.getConcreteMappedID(id, IdMappingMode::PERMISSIVE_RESIZE);
+     concrete_to_reference_map[ca_id] = id;
+   }
+
+   for (TensorView* tv : tvs) {
+     if (ref->getDeviceMesh().vector() != tv->getDeviceMesh().vector()) {
+       ret.insert(tv);
+       continue;
+     }
+     for (auto id : tv->getLoopDomain()) {
+       auto ca_id =
+           ca_map.getConcreteMappedID(id, IdMappingMode::PERMISSIVE_RESIZE);
+       if (concrete_to_reference_map.count(ca_id) > 0) {
+         auto ref_id = concrete_to_reference_map.at(ca_id);
+         if ((ref_id->isDeviceDim() || id->isDeviceDim()) &&
+             ref_id->getParallelType() != id->getParallelType()) {
+           ret.insert(tv);
+           break;
+         }
+       }
+     }
+   }
+   return ret;
+ }
+
+ // Returns whether an Expr embeds multi-device resharding
+ bool isResharding(const Expr* expr);
+
+ // Returns whether two tensors have different shardings. Expects a
+ // producer/consumer relationship between the arguments.
+ bool haveDifferentShardings(
+     const TensorView* producer,
+     const TensorView* consumer,
+     const IdModel& id_model);
+
+ // Returns whether a resharding expr reshards an inner axis
+ bool isInnerResharding(Expr* expr);
+
+ // Shards all tensors in tvs like reference
+ void shardAllLike(TensorView* ref, std::vector<TensorView*> tvs);
+
+ // Shards all TVs between from and to, AND between TVs created inside the
+ // fusion and to. This is required because (1) expressions like rng_uniform
+ // create a TV inside the fusion that is not on a path from user-visible TVs,
+ // (2) multi-output expressions may have output tensors that are not along a
+ // path to the fusion output and would not be reachable otherwise, and (3)
+ // sharding propagation checks that all TVs in the fusion are assigned a
+ // device mesh, regardless of whether they are reachable. To keep the checks
+ // simple, we require that all TVs in the fusion are assigned a mesh.
+ void shardBetween(
+     const std::vector<TensorView*>& from,
+     const std::vector<TensorView*>& to,
+     TensorView* ref);
+ // Same as above but using the outputs of the from and to expressions
+ // to form the from and to TVs.
+ void shardBetween(
+     const std::vector<Expr*>& from,
+     const std::vector<Expr*>& to,
+     TensorView* ref);
+
+ // Returns the devices involved in an expr
+ std::set<DeviceIdxType> involvedDevices(Expr* expr);
+
+ // Returns the number of device indices present across all
+ // device meshes in the Fusion
+ int64_t requestedNumberOfDevices(Fusion*);
+
+ // removes the multi-device scheduling annotations
+ void unshard(Fusion*);
+ void unshard(TensorView*);
+
+ // Returns the index of the sharded logical axis that produces the allocation
+ // IterDomain sharded on `parallel_type`. If `tv` isn't sharded on the parallel
+ // type, returns -1.
+ //
+ // This is used to correlate `tv` and its corresponding at::Tensor, e.g., by
+ // `unshardedSizes` and `shardTensor`. `at::Tensor::sizes` and
+ // `tv->getLogicalDomain()` map one-to-one modulo reduction. However, a size in
+ // `at::Tensor::sizes` is a factor of the corresponding logical IterDomain's
+ // extent if that IterDomain is sharded.
+ int64_t getShardedLogicalAxis(const TensorView* tv, ParallelType parallel_type);
+
+ // Shards the input tensor along `axis`. How the tensor gets sliced along
+ // `axis` is determined by `mesh` and `device_id`. Returns the sharded tensor.
+ at::Tensor shardTensor(
+     at::Tensor tensor,
+     int64_t axis,
+     const DeviceMesh& mesh,
+     DeviceIdxType device_id);
+
+ // Reorders a TensorView so that the DID-parallelized axes are in front.
+ void reorderDIDToFront(TensorView*);
+
+ // Given a TensorView and the shape of a sharded tensor of which certain
+ // dimensions are partially allocated, returns the global shape that'll be used
+ // to bind to the TensorView's logical domain. This is to solve #3282 so we can
+ // bind a sharded tensor to a TensorView that has a DID-parallel loop domain.
+ //
+ // For example, when `tv` is
+ //   logical: iM, iN
+ //   allocation: iDIDx{D}, iN/D, iM
+ // and `sizes` is [2, 3], the returned shape will be [2, 3D]. This is because,
+ // according to the allocation domain, iM is fully allocated and iN is sharded
+ // and thus partially allocated.
+ //
+ // If the TensorView is not sharded, this function returns `sizes`.
+ //
+ // Limitations:
+ // - The function assumes that there are no Merges from logical to the
+ //   DID-parallel IterDomains in allocation. Otherwise, it's unclear which
+ //   logical dimension this DID-parallelization should be attributed to.
+ // - The function assumes that all Splits from logical to the DID-parallel
+ //   IterDomains in allocation are even. This is because there are currently no
+ //   ways to pass in the global shape.
+ //
+ // Despite these limitations, I took this approach as a shortcut to fix #3282,
+ // which blocked many other tasks. I'm however open to other better, long-term
+ // solutions. Some alternatives considered in #3282 are:
+ // - Try to bind `at::Tensor`s to allocation domains instead of logical. Many
+ //   `*Op::evaluate` methods (e.g.
+ //   https://github.com/NVIDIA/Fuser/blob/2415d904d1e9a5da7ca6fb1a55d3045bbd510341/csrc/ir/nodes.cpp#L4321-L4329)
+ //   assume the input/output `at::Tensor`s have the same dimension order as the
+ //   logical domain. Doing so would require changing them all.
+ // - Try to pass into FusionExecutorCache both logical (global) shapes and
+ //   allocated (local) tensors for sharded TensorViews. The logical shapes would
+ //   have to be passed through FusionKernelRuntime, FusionExecutor,
+ //   ExpressionEvaluator, and so on, which is an API overhaul.
+ std::vector<int64_t> unshardedSizes(
+     const TensorView* tv,
+     c10::IntArrayRef sizes);
+
+ } // namespace nvfuser
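
Note: to make the worked example in the unshardedSizes() comment concrete, here is a hypothetical call site. The tv is assumed to match that example, with logical domain (iM, iN) and allocation domain (iDIDx{D}, iN/D, iM):

#include <multidevice/utils.h>

std::vector<int64_t> globalShape(const nvfuser::TensorView* tv) {
  // The locally allocated tensor has shape [2, 3]: iM is fully allocated
  // (2 elements) and iN is sharded D ways (3 elements per device).
  std::vector<int64_t> global = nvfuser::unshardedSizes(tv, {2, 3});
  // Per the contract documented above, global == {2, 3 * D}.
  return global;
}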
@@ -0,0 +1,86 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <exceptions.h>
+ #include <visibility.h>
+
+ #include <ir/all_nodes.h>
+ #include <iter_visitor.h>
+
+ namespace nvfuser {
+
+ //! See doc/reading/divisibility-of-split.md#predication
+ //! If an IterDomain is split and its inner output domain is
+ //! eventually split too, the second split must be divisible or the
+ //! inner domain must be predicated. This class finds Split
+ //! expressions that need to be divisible or predicated.
+ //!
+ //! Second splits are not limited to direct output domains of
+ //! first splits but include indirect descendant domains as well.
+ //!
+ //! Predicating non-divisible split domains does not work if split
+ //! output domains are vectorized, i.e., when ParallelType::Vectorize is
+ //! applied to an inner domain of splits. If such a split is non-divisible,
+ //! predicating the input domain of the non-divisible split would predicate
+ //! out the vectorized operation entirely, since we do not generate a
+ //! fall-back non-vectorized else path. A runtime check is done for those
+ //! domains instead.
+ class NVF_API NonDivisibleSplitInfo : public IterVisitor {
+  public:
+   void build(Fusion* fusion);
+
+   const auto& splitsToPredicate() const {
+     return splits_to_predicate_;
+   }
+
+   const auto& splitsToValidate() const {
+     return splits_to_validate_;
+   }
+
+  private:
+   using IterVisitor::handle;
+
+   void handle(Split* split) override;
+
+   void handle(Merge* merge) override;
+
+   //! True if reachable from inner domains of splits
+   bool isReachableFromInnerDomains(IterDomain* id) const;
+
+   //! Forward propagate the reachability information
+   void propagateReachability(Split* split, bool is_protected);
+
+   //! Forward propagate the reachability information
+   void propagateReachability(Merge* merge);
+
+   void clearReachability();
+
+   //! Returns the extent of a split output domain if it's not proven to
+   //! be divisible.
+   Val* getMaybeNonDivisibleExtent(Split* split) const;
+
+   //! Remove redundant predicates as divisibility may be validated at
+   //! run time
+   void removeRedundancy();
+
+   //! Add validations to GpuLower::current()->validations()
+   void addValidations();
+
+  private:
+   //! Split expressions whose input domain must be predicated
+   std::unordered_map<TensorView*, std::vector<Split*>> splits_to_predicate_;
+   //! Split expressions whose divisibility must be validated at run time
+   std::unordered_set<Split*> splits_to_validate_;
+
+   //! Temporarily used for analyzing each tensor
+   TensorView* current_tv_ = nullptr;
+   std::unordered_set<IterDomain*> inner_domains_;
+ };
+
+ } // namespace nvfuser
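
Note: a hypothetical scheduling fragment illustrating the chained-split condition the class detects. TensorViewBuilder, set(), and split() are assumed from nvFuser's C++ scheduling API; only the two splits matter:

#include <fusion.h>
#include <ops/alias.h>

void nonDivisibleSplitExample() {
  using namespace nvfuser;
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = TensorViewBuilder().ndims(1).build();
  fusion.addInput(tv0);
  TensorView* tv1 = set(tv0);
  fusion.addOutput(tv1);
  tv1->split(0, 4); // first split: extent -> (ceilDiv(extent, 4), 4)
  // Second split of the first split's inner output by 3. Since 4 % 3 != 0,
  // this split is non-divisible: NonDivisibleSplitInfo records it either in
  // splitsToPredicate() (its input domain must be predicated) or, when
  // divisibility can only be decided from concrete runtime extents, in
  // splitsToValidate().
  tv1->split(1, 3);
}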