nvfuser-cu121-torch25 0.2.25.dev20250201__cp310-cp310-manylinux_2_28_x86_64.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. nvfuser/_C.cpython-310-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +20 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
@@ -0,0 +1,577 @@
1
+ // clang-format off
2
+ /*
3
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
4
+ * All rights reserved.
5
+ * SPDX-License-Identifier: BSD-3-Clause
6
+ */
7
+ // clang-format on
8
+ #pragma once
9
+
10
+ #include <disjoint_set.h>
11
+ #include <exceptions.h>
12
+ #include <ir/all_nodes.h>
13
+ #include <iter_visitor.h>
14
+ #include <utils.h>
15
+ #include <visibility.h>
16
+
17
+ namespace nvfuser {
18
+
19
+ //! Generic interface for mapping logical domains of a producer-consumer pair.
20
+ class LogicalDomainMap : public PolymorphicBase {
21
+ public:
22
+ //! Return a map from a producer TensorDomain to a consumer
23
+ //! TensorDomain
24
+ //!
25
+ //! \param producer A producer TensorDomain
26
+ //! \param consumer A consumer TensorDomain
27
+ //! \param dims_to_map Maps only producer logical domains in this set
28
+ std::unordered_map<IterDomain*, IterDomain*> mapProducerToConsumer(
29
+ const TensorDomain* producer,
30
+ const TensorDomain* consumer,
31
+ const std::unordered_set<IterDomain*>& dims_to_map) const;
32
+
33
+ //! Return a map from a producer TensorDomain to a consumer
34
+ //! TensorDomain
35
+ //!
36
+ //! \param producer A producer TensorDomain
37
+ //! \param consumer A consumer TensorDomain
38
+ std::unordered_map<IterDomain*, IterDomain*> mapProducerToConsumer(
39
+ const TensorDomain* producer,
40
+ const TensorDomain* consumer) const;
41
+
42
+ //! Return a map from a consumer TensorDomain to a producer
43
+ //! TensorDomain
44
+ //!
45
+ //! \param consumer A consumer TensorDomain
46
+ //! \param producer A producer TensorDomain
47
+ //! \param dims_to_map Maps only consumer root domains in this set
48
+ std::unordered_map<IterDomain*, IterDomain*> mapConsumerToProducer(
49
+ const TensorDomain* consumer,
50
+ const TensorDomain* producer,
51
+ const std::unordered_set<IterDomain*>& dims_to_map) const;
52
+
53
+ //! Return a map from a consumer TensorDomain to a producer
54
+ //! TensorDomain
55
+ //!
56
+ //! \param consumer A consumer TensorDomain
57
+ //! \param producer A producer TensorDomain
58
+ std::unordered_map<IterDomain*, IterDomain*> mapConsumerToProducer(
59
+ const TensorDomain* consumer,
60
+ const TensorDomain* producer) const;
61
+
62
+ protected:
63
+ //! Return a map between logical IterDomains of a producer-consumer
64
+ //! pair.
65
+ //!
66
+ //! \param producer A producer TensorDomain
67
+ //! \param consumer A consumer TensorDomain
68
+ //! \param dims_to_map Maps only from IterDomains in this set
69
+ //! \param producer_to_consumer Maps from producer to consumer if true
70
+ virtual std::unordered_map<IterDomain*, IterDomain*> map(
71
+ const TensorDomain* producer,
72
+ const TensorDomain* consumer,
73
+ const std::unordered_set<IterDomain*>& dims_to_map,
74
+ bool producer_to_consumer) const = 0;
75
+ };
76
+
77
+ //! Maps logical domains of a producer-consumer pair. This class only
78
+ //! looks at the given pair of TensorViews and does not take into
79
+ //! consideration the constraints of the computeAt transformation,
80
+ //! i.e., unable to compute the same tensors multiple times. This
81
+ //! should not be used for transformations implementing computeAt, but
82
+ //! should be valid otherwise.
83
+ class PairwiseLogicalDomainMap : public LogicalDomainMap {
84
+ public:
85
+ //! When require_same_extent is false, domains that may have
86
+ //! different extents are also mapped. For example, IDs of lookup
87
+ //! tensors in gather may have larger extents than the corresponding
88
+ //! IDs of the output and index tensors. This relaxation is
89
+ //! necessary when indexing into lookup tensors as producers.
90
+ //!
91
+ //! \param producer The producer tensor of a producer-consumer pair.
92
+ //! \param consumer The consumer tensor of a producer-consumer pair.
93
+ PairwiseLogicalDomainMap(
94
+ const TensorView* producer,
95
+ const TensorView* consumer);
96
+
97
+ PairwiseLogicalDomainMap& mapBroadcast(bool b) {
98
+ map_broadcast_ = b;
99
+ return *this;
100
+ }
101
+
102
+ //! If b is true: map symbolic domains with other IterDomains even if their
103
+ //! extents don't match. If b is false (default): map symbolic domains with
104
+ //! other IterDomains only if their extents match.
105
+ PairwiseLogicalDomainMap& mapSymbolic(bool b) {
106
+ map_symbolic_ = b;
107
+ return *this;
108
+ }
109
+
110
+ PairwiseLogicalDomainMap& mapDifferentExtents(bool b) {
111
+ map_different_extents_ = b;
112
+ return *this;
113
+ }
114
+
115
+ PairwiseLogicalDomainMap& mapIndexedDomains(bool b) {
116
+ map_indexed_domains_ = b;
117
+ return *this;
118
+ }
119
+
120
+ const TensorView* producerTv() const {
121
+ return producer_tv_;
122
+ }
123
+
124
+ const TensorView* consumerTv() const {
125
+ return consumer_tv_;
126
+ }
127
+
128
+ std::string toString() const;
129
+
130
+ // Helper methods on top of LogicalDomainMap::mapProducerToConsumer and
131
+ // LogicalDomainMap::mapConsumerToProducer. This way, the caller doesn't have
132
+ // to specify the producer domain and the consumer domain, which is redundant
133
+ // and error-prone.
134
+ std::unordered_map<IterDomain*, IterDomain*> mapProducerToConsumer(
135
+ const std::unordered_set<IterDomain*>* dims_to_map = nullptr) const;
136
+ std::unordered_map<IterDomain*, IterDomain*> mapConsumerToProducer(
137
+ const std::unordered_set<IterDomain*>* dims_to_map = nullptr) const;
138
+
139
+ protected:
140
+ std::unordered_map<IterDomain*, IterDomain*> map(
141
+ const TensorDomain* producer,
142
+ const TensorDomain* consumer,
143
+ const std::unordered_set<IterDomain*>& dims_to_map,
144
+ bool producer_to_consumer) const override;
145
+
146
+ private:
147
+ const TensorView* producer_tv_ = nullptr;
148
+ const TensorView* consumer_tv_ = nullptr;
149
+
150
+ //! Options to allow more permissive mappings
151
+
152
+ //! Map broadcast and non-broadcast domains. Note that this is on by
153
+ //! default
154
+ bool map_broadcast_ = true;
155
+ //! Map symbolic domains with other IterDomains, even if their extents don't
156
+ //! match. Note that this is off by default, in which case they are mapped
157
+ //! only if their extents match.
158
+ bool map_symbolic_ = false;
159
+ //! Map domains that may have different extents, e.g., torchGather
160
+ bool map_different_extents_ = false;
161
+ //! Map domains that are indirectly accessed, e.g., indexSelect
162
+ bool map_indexed_domains_ = false;
163
+ };
164
+
165
+ //! Represents an iteration domain of a TensorDomain. Only used for
166
+ //! logical domain mapping.
167
+ //!
168
+ //! Note that an IterDomain object may be reused
169
+ //! across multiple TensorDomains, but an IterDomain in a
170
+ //! TensorDomain may not be necessarily mappable to the same
171
+ //! IterDomain used in a different TensorDomain. Thus, for the purpose
172
+ //! of logical domain mapping, an iteration domain needs to be identified
173
+ //! with an IterDomain and its TensorDomain.
174
+ class DomainKey {
175
+ public:
176
+ DomainKey() = default;
177
+ DomainKey(
178
+ const TensorDomain* td,
179
+ const IterDomain* id,
180
+ const IterDomain* concrete_id = nullptr)
181
+ : td_(td), id_(id), concrete_id_(concrete_id) {}
182
+ const TensorDomain* td() const {
183
+ return td_;
184
+ }
185
+ const IterDomain* id() const {
186
+ return id_;
187
+ }
188
+ const IterDomain* concreteId() const {
189
+ return concrete_id_;
190
+ }
191
+ bool operator==(const DomainKey& other) const {
192
+ return td() == other.td() && id() == other.id() &&
193
+ concreteId() == other.concreteId();
194
+ }
195
+ bool operator!=(const DomainKey& other) const {
196
+ return !(*this == other);
197
+ }
198
+
199
+ std::string toString() const;
200
+
201
+ private:
202
+ const TensorDomain* td_ = nullptr;
203
+ const IterDomain* id_ = nullptr;
204
+ const IterDomain* concrete_id_ = nullptr;
205
+ };
206
+
207
+ struct DomainKeyHash {
208
+ std::size_t operator()(const DomainKey& key) const {
209
+ return std::hash<const TensorDomain*>{}(key.td()) ^
210
+ std::hash<const IterDomain*>{}(key.id());
211
+ }
212
+ };
213
+
214
+ using DomainKeySet = std::unordered_set<DomainKey, DomainKeyHash>;
215
+
216
+ template <typename Mapped>
217
+ using DomainKeyMap = std::unordered_map<DomainKey, Mapped, DomainKeyHash>;
218
+
219
+ class ComputeAtLogicalDomainMap;
220
+
221
+ //! A helper class to find all DomainKeys that are consumers of
222
+ //! reduction outputs. Such consumer IterDomains may not be mapped to
223
+ //! the producer reduction domain since the corresponding reduction
224
+ //! loop must be closed before any of the consumers can appear.
225
+ class UnmappableReductionDomains : private IterVisitor {
226
+ public:
227
+ UnmappableReductionDomains();
228
+ ~UnmappableReductionDomains() override = default;
229
+
230
+ //! Returns true when mapping consumer domains would cause a
231
+ //! reduction output domain to be mapped with a consumer domain of
232
+ //! the redution. It needs to be avoided as computing consumers of
233
+ //! reduction outputs within the corresponding reduction loop is not
234
+ //! possible. This routine is used to build logical domain mappings.
235
+ bool isReductionOutputMapped(
236
+ const DomainKeySet& consumer_domains,
237
+ const ComputeAtLogicalDomainMap& logical_map) const;
238
+
239
+ std::string toString() const;
240
+
241
+ private:
242
+ using IterVisitor::handle;
243
+ void handle(ReductionOp* op) override;
244
+ void handle(GroupedReductionOp* op) override;
245
+ void handle(WelfordOp* op) override;
246
+ void handle(MmaOp* op) override;
247
+
248
+ void handleReductionOutput(TensorView* out_tv);
249
+
250
+ private:
251
+ //! Map from Reduction output DomainKeys to consumer DomainKeys
252
+ DomainKeyMap<DomainKeySet> reduction_domains_;
253
+ //! Map from Reduction output DomainKeys to producer DomainKeys
254
+ DomainKeyMap<DomainKeySet> reduction_domain_inputs_;
255
+ };
256
+
257
+ //! Models logical-domain mappings for computeAt
258
+ //!
259
+ //! Two iteration domains are mapped when computeAt of one iteration
260
+ //! domain is possible at another iteration domain. Consider a simple
261
+ //! example:
262
+ //! T2 [i0,i1] = T1[i2,i3] + T0[i4,i5]
263
+ //! This will create mappings between i0, i2 and i4.
264
+ //!
265
+ //! Note that with views, there can be multiple domains mapped with
266
+ //! the same domain. Thus, obtaining one-to-one maps can
267
+ //! fail. Currently, the only use of this class is getMappableDims,
268
+ //! which just grabs any domain that is mappable, which works no
269
+ //! matter view is used or not.
270
+ class NVF_API ComputeAtLogicalDomainMap : public LogicalDomainMap {
271
+ friend class ComputeAtLogicalDomainMapBuilder;
272
+
273
+ public:
274
+ //! Builds a mapping table by analyzing the current
275
+ //! fusion. Overwrite a previous table if any.
276
+ //!
277
+ //! \param map_through_reduction If set
278
+ //! true, will disable UnmappableReductionDomains check.
279
+ //! This is only for re-using logic in detecting
280
+ //! normalization fusions, which deviates slightly from
281
+ //! intended use of this class. Should always be true
282
+ //! in compute_at use cases.
283
+ void build(bool map_through_reduction = false);
284
+
285
+ //! Returns if key(td_a, id_a) and key(td_b, id_b) are mapped to eachother
286
+ //! (equivalent), or are the same key.
287
+ //!
288
+ //! \param td_a A TensorDomain
289
+ //! \param id_a An IterDomain in td_a
290
+ //! \param td_b Another TensorDomain
291
+ //! \param id_b An IterDomain in td_b
292
+ //! \returns Boolean representing if they are mapped
293
+ bool canMap(
294
+ const TensorDomain* td_a,
295
+ const IterDomain* id_a,
296
+ const TensorDomain* td_b,
297
+ const IterDomain* id_b) const;
298
+
299
+ //! Make a TensorDomain an alias of another TensorDomain
300
+ //!
301
+ //! This is for the computeAt transformation, where TensorViews are
302
+ //! updated with new TensorDomains. Since they keep using the same
303
+ //! logical doamins, the logical mapping remains valid but needs to
304
+ //! reflect the use of new TensorDomains as aliases of the existing
305
+ //! ones.
306
+ //!
307
+ //! \param td An existing TensorDomain
308
+ //! \param td_alias An alias of td
309
+ void setAlias(const TensorDomain* td, const TensorDomain* td_alias);
310
+
311
+ //! Return a map between TensorDomains
312
+ //!
313
+ //! Unlike the other map functions, two TensorDomains do not need to
314
+ //! be a producer-consumer pair. Since they may not be a
315
+ //! producer-consumer pair, this function requires proper domains, which may
316
+ //! be root or logical domains. Also, no error check is done as we do not
317
+ //! assume producer-consumer relationship.
318
+ //!
319
+ //! Note that an exception is thrown when a domain is found to be
320
+ //! mapped to multiple domains, which can happen with views.
321
+ //!
322
+ //! \param from_td A TensorDomain from which a map is created
323
+ //! \param from_dom A root/logical domain of from_td
324
+ //! \param to_td A TensorDomain to which a map is created
325
+ //! \param to_dom A root/logical domain of to_td
326
+ std::unordered_map<IterDomain*, IterDomain*> mapBestEffort(
327
+ const TensorDomain* from_td,
328
+ const std::vector<IterDomain*>& from_dom,
329
+ const TensorDomain* to_td,
330
+ const std::vector<IterDomain*>& to_dom) const;
331
+
332
+ // Returns an unordered set of all iter domains in producer and consumer that
333
+ // can map to eachother
334
+ std::unordered_set<IterDomain*> getMappableDims(
335
+ const TensorDomain* producer,
336
+ const TensorDomain* consumer) const;
337
+
338
+ std::string toString() const;
339
+
340
+ //! Returns true if id in td is concretized
341
+ bool isConcretized(const TensorDomain* td, const IterDomain* id) const;
342
+
343
+ private:
344
+ //! Returns if key_a and key(td_b, id_b) are mapped to eachother (equivalent),
345
+ //! or are the same key.
346
+ //!
347
+ //! \param key_a A DomainKey
348
+ //! \param td_b Another TensorDomain
349
+ //! \param id_b An IterDomain in td_b
350
+ //! \returns Boolean representing if they are mapped
351
+ bool canMap(
352
+ const DomainKey& key_a,
353
+ const TensorDomain* td_b,
354
+ const IterDomain* id_b) const;
355
+
356
+ //! Returns if key_a and key_b are mapped to each other (equivalent), or are
357
+ //! the same key. Returns false if two keys are not known to be mapped.
358
+ bool canMap(const DomainKey& key_a, const DomainKey& key_b) const;
359
+
360
+ //! Returns the set of (non-broadcast) DomainKeys that id in td is
361
+ //! broadcasted to. Can result in more than one "concrete" DomainKey.
362
+ std::vector<DomainKey> getConcretizedKeys(
363
+ const TensorDomain* td,
364
+ const IterDomain* id) const;
365
+
366
+ //! Returns the set of (non-broadcast) iter domains that id in td is
367
+ //! broadcasted to. Can result in more than one "concrete" iter domain.
368
+ std::unordered_set<const IterDomain*>& getConcretizedDomains(
369
+ const TensorDomain* td,
370
+ const IterDomain* id);
371
+
372
+ //! Return a map between logical IterDomains of a producer-consumer
373
+ //! pair.
374
+ //!
375
+ //! \param producer A producer TensorDomain
376
+ //! \param consumer A consumer TensorDomain
377
+ //! \param dims_to_map Maps only from IterDomains in this set
378
+ //! \param producer_to_consumer Maps from producer to consumer if true
379
+ std::unordered_map<IterDomain*, IterDomain*> map(
380
+ const TensorDomain* producer,
381
+ const TensorDomain* consumer,
382
+ const std::unordered_set<IterDomain*>& dims_to_map,
383
+ bool producer_to_consumer) const override;
384
+
385
+ private:
386
+ //! Disjoint set of all mapped <TD, ID> keys to determine axes equivalency
387
+ DisjointSets<DomainKey, DomainKeyHash> eq_set_;
388
+
389
+ //! All IterDomains in the mapping that are a broadcast ID
390
+ DomainKeyMap<std::unordered_set<const IterDomain*>> bcast_map_;
391
+
392
+ //! Broadcast iter domain that does not match dimensions in its produer,
393
+ //! meaning it is a brand new domain in its TensorDomain.
394
+ DomainKeySet new_broadcast_domains_;
395
+
396
+ //! Broadcast iter domain that does not match dimensions in its consumer,
397
+ //! meaning it is a removed domain in its TensorDomain.
398
+ DomainKeySet removed_broadcast_domains_;
399
+
400
+ //! Keep track of window axes so that the map function can ignore them.
401
+ std::unordered_set<IterDomain*> window_axes_;
402
+ };
403
+
404
//! Create a DisjointSets of logical IterDomains by traversing the
//! current fusion entirely. IterDomains that can be mapped to each
//! other with computeAt are grouped into the same subset in the
//! DisjointSets.
class ComputeAtLogicalDomainMapBuilder : private BackwardVisitor {
 public:
  //! Builds the given map by traversing the current fusion backward.
  //!
  //! \param logical_map The map to populate; mutated in place
  //! \param map_through_reduction Disables the UnmappableReductionDomains
  //!   check; should always be false for compute_at use cases
  explicit ComputeAtLogicalDomainMapBuilder(
      ComputeAtLogicalDomainMap& logical_map,
      bool map_through_reduction = false);

 private:
  //! Initialize the bcast map for fusion outputs
  void initializeBcastMap(const TensorView* tv, const IterDomain* id);

  //! Set a pair of producer-consumer domain keys as mappable
  void setMapped(const DomainKey& producer, const DomainKey& consumer);

  //! Records two domains are invalid to map
  void setInvalid(const DomainKey& key1, const DomainKey& key2);

  //! Check that no pair of domains in the given set has been recorded
  //! as invalid to map
  bool isInvalid(const DomainKeySet& domains) const;

  //! Track a pair of producer-consumer domains as potentially mappable. Inserts
  //! entries into pending_map_, but does not add anything into the logical_map_
  //! (added when handle is called on a TensorView). Maybe mapped will, however,
  //! immediately propagate broadcast iter domains.
  void setMaybeMapped(
      const TensorDomain* producer_td,
      const IterDomain* producer_id,
      const TensorDomain* consumer_td,
      const IterDomain* consumer_id);

  //! Queue a producer-consumer key pair in pending_map_
  void addToPendingList(const DomainKey& producer, const DomainKey& consumer);

  //! Map pointwise IterDomains from inputs of expressions to outputs.
  //! Do not map reduction IterDomains in inputs.
  void mapPointwiseLikeOp(Expr* e);

  using BackwardVisitor::handle;

  void dispatch(Expr* e) override;

  // The handlers below dispatch on concrete Expr types during the
  // backward traversal. Most op types map their input and output
  // IterDomains pointwise via mapPointwiseLikeOp; the remaining ones
  // (RNGOp, ViewAsScalar, BroadcastOp, SqueezeOp, TensorView) are
  // declared here and defined out of line.

  void handle(UnaryOp* uop) override {
    mapPointwiseLikeOp(uop);
  }

  void handle(BinaryOp* bop) override {
    mapPointwiseLikeOp(bop);
  }

  void handle(TernaryOp* top) override {
    mapPointwiseLikeOp(top);
  }

  void handle(RNGOp* top) override;

  void handle(SelectOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(IndexSelectOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(TorchGatherOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(ReductionOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(GroupedReductionOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(WelfordOp* wop) override {
    mapPointwiseLikeOp(wop);
  }

  void handle(LoadStoreOp* ldst) override {
    mapPointwiseLikeOp(ldst);
  }

  void handle(MmaOp* wop) override {
    mapPointwiseLikeOp(wop);
  }

  void handle(ViewOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(ViewAsScalar* op) override;

  void handle(BroadcastOp* op) override;

  void handle(SqueezeOp* op) override;

  void handle(ExpandOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(RepeatOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(PadOp* op) override {
    // For compute-at, padded id should be mapped
    mapPointwiseLikeOp(op);
  }

  void handle(SliceOp* op) override {
    mapPointwiseLikeOp(op);
  }

  void handle(CatOp* op) override {
    // For compute-at, concat id should be mapped
    mapPointwiseLikeOp(op);
  }

  void handle(TensorView* tv) override;

  //! Maps all pending mappings.
  //! This is called for each of TensorViews in a backward traversal,
  //! recursively building mappings from the output tensors to the
  //! input tensors.
  void mapAllPendingMappings(const DomainKey& key);

  //! Maps all pending mappings for id of td. When id is a broadcast,
  //! mapping is done separately for each concrete domain.
  void mapAllPendingMappings(const TensorDomain* td, IterDomain* id);

  //! Check if it is safe to map all domains in the given set together
  //! (see setInvalid / isInvalid)
  bool safeToMap(const DomainKeySet& domains);

 private:
  //! The map being built; owned by the caller of the constructor
  ComputeAtLogicalDomainMap& logical_map_;
  //! Keep track of what we want to try and map
  DomainKeyMap<DomainKeySet> pending_map_;
  //! Expressions already processed by the traversal
  std::unordered_set<Expr*> visited_;
  //! Helper class to find invalid mappings due to reductions
  UnmappableReductionDomains incompatible_domains_;
  //! Running vector of domain pairs that are invalid to map
  std::vector<std::pair<DomainKey, DomainKey>> invalid_mappings_;

  //! Disable UnmappableReductions check, should
  //! always be false for compute_at use cases
  bool map_through_reduction_ = false;
};
553
+
554
//! Maps logical domains of an entire fusion. Does not map broadcast
//! domains with non-broadcast domains.
class NVF_API ExactLogicalDomainMap : public LogicalDomainMap {
 public:
  //! Builds the exact map for the given fusion.
  // NOTE(review): single-argument constructor is not explicit — confirm
  // implicit conversion from Fusion* is intended.
  ExactLogicalDomainMap(Fusion* fusion);

  //! Whether the two IterDomains are exactly mapped to each other
  bool areMapped(const IterDomain* id_a, const IterDomain* id_b) const;

  std::string toString() const;

  //! The underlying disjoint sets of mapped IterDomains
  const DisjointSets<const IterDomain*>& getMappedSets() const;

 protected:
  //! Return a map between logical IterDomains of a producer-consumer
  //! pair.
  //!
  //! \param producer A producer TensorDomain
  //! \param consumer A consumer TensorDomain
  //! \param dims_to_map Maps only from IterDomains in this set
  //! \param producer_to_consumer Maps from producer to consumer if true
  std::unordered_map<IterDomain*, IterDomain*> map(
      const TensorDomain* producer,
      const TensorDomain* consumer,
      const std::unordered_set<IterDomain*>& dims_to_map,
      bool producer_to_consumer) const override;

 private:
  //! Disjoint sets of exactly-mapped IterDomains
  DisjointSets<const IterDomain*> eq_sets_;
};
576
+
577
+ } // namespace nvfuser
// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on

#pragma once

// Standard-library / language-level feature detection helpers.

// <bits/c++config.h> is the libstdc++ internal header that defines the
// __GLIBCXX__ version macro; guard the include because other standard
// libraries (e.g. libc++, MSVC STL) do not provide it.
#if __has_include(<bits/c++config.h>)
#include <bits/c++config.h>
#endif

// Defined when the detected libstdc++ is recent enough (release date
// 2023-07-14 or later, per __GLIBCXX__) to, as the macro name states,
// support std::unordered_set with an incomplete value type.
// NOTE(review): left undefined (not 0) otherwise — check with #ifdef,
// not #if equality against 0.
#if defined(__GLIBCXX__) && __GLIBCXX__ >= 20230714
#define STD_UNORDERED_SET_SUPPORTS_INCOMPLETE_TYPE 1
#endif

// IS_CPP20: 1 when compiling as C++20 or later, 0 otherwise
// (202002L is the __cplusplus value for C++20).
#if __cplusplus < 202002L
#define IS_CPP20 0
#else
#define IS_CPP20 1
#endif