nvfuser-cu121-torch25 0.2.25.dev20250201__cp310-cp310-manylinux_2_28_x86_64.whl
Sign up to get free protection for your applications and to get access to all the features.
- nvfuser/_C.cpython-310-x86_64-linux-gnu.so +0 -0
- nvfuser/__init__.py +618 -0
- nvfuser/__init__.pyi +4 -0
- nvfuser/contrib/__init__.py +9 -0
- nvfuser/contrib/nn/__init__.py +13 -0
- nvfuser/contrib/nn/normalization.py +725 -0
- nvfuser/include/nvfuser/alias_analysis.h +116 -0
- nvfuser/include/nvfuser/bfs.h +929 -0
- nvfuser/include/nvfuser/codegen.h +26 -0
- nvfuser/include/nvfuser/compute_at.h +28 -0
- nvfuser/include/nvfuser/compute_at_map.h +394 -0
- nvfuser/include/nvfuser/contiguity.h +351 -0
- nvfuser/include/nvfuser/cuda_utils.h +50 -0
- nvfuser/include/nvfuser/debug.h +50 -0
- nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
- nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
- nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
- nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
- nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
- nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
- nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
- nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
- nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
- nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
- nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
- nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
- nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
- nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
- nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
- nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
- nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
- nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
- nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
- nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
- nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
- nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
- nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
- nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
- nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
- nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
- nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
- nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
- nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
- nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
- nvfuser/include/nvfuser/device_lower/utils.h +382 -0
- nvfuser/include/nvfuser/device_lower/validation.h +74 -0
- nvfuser/include/nvfuser/disjoint_set.h +556 -0
- nvfuser/include/nvfuser/dispatch.h +334 -0
- nvfuser/include/nvfuser/driver_api.h +49 -0
- nvfuser/include/nvfuser/dynamic_transform.h +316 -0
- nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
- nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
- nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
- nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
- nvfuser/include/nvfuser/evaluator_common.h +295 -0
- nvfuser/include/nvfuser/exceptions.h +283 -0
- nvfuser/include/nvfuser/expr_evaluator.h +125 -0
- nvfuser/include/nvfuser/expr_simplifier.h +218 -0
- nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
- nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
- nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
- nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
- nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
- nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
- nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
- nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
- nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
- nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
- nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
- nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
- nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
- nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
- nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
- nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
- nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
- nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
- nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
- nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
- nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
- nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
- nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
- nvfuser/include/nvfuser/fusion.h +511 -0
- nvfuser/include/nvfuser/fusion_guard.h +37 -0
- nvfuser/include/nvfuser/fusion_profiler.h +311 -0
- nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
- nvfuser/include/nvfuser/global_allocator.h +27 -0
- nvfuser/include/nvfuser/grouped_reduction.h +47 -0
- nvfuser/include/nvfuser/host_ir/container.h +60 -0
- nvfuser/include/nvfuser/host_ir/executor.h +152 -0
- nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
- nvfuser/include/nvfuser/host_ir/lower.h +35 -0
- nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
- nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
- nvfuser/include/nvfuser/id_model/id_model.h +359 -0
- nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
- nvfuser/include/nvfuser/id_model/indexing.h +208 -0
- nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
- nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
- nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
- nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
- nvfuser/include/nvfuser/id_model/schedule.h +54 -0
- nvfuser/include/nvfuser/id_model/to_string.h +87 -0
- nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
- nvfuser/include/nvfuser/id_model/utils.h +176 -0
- nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
- nvfuser/include/nvfuser/index_compute.h +651 -0
- nvfuser/include/nvfuser/instrumentation.h +107 -0
- nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
- nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
- nvfuser/include/nvfuser/ir/builder.h +215 -0
- nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
- nvfuser/include/nvfuser/ir/cloner.h +185 -0
- nvfuser/include/nvfuser/ir/container.h +226 -0
- nvfuser/include/nvfuser/ir/graphviz.h +119 -0
- nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
- nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
- nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
- nvfuser/include/nvfuser/ir/iostream.h +98 -0
- nvfuser/include/nvfuser/ir/printer.h +57 -0
- nvfuser/include/nvfuser/ir/utils.h +801 -0
- nvfuser/include/nvfuser/iter_visitor.h +661 -0
- nvfuser/include/nvfuser/kernel.h +299 -0
- nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
- nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
- nvfuser/include/nvfuser/kernel_ir.h +1457 -0
- nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
- nvfuser/include/nvfuser/linked_hash_map.h +97 -0
- nvfuser/include/nvfuser/logical_domain_map.h +577 -0
- nvfuser/include/nvfuser/macros.h +23 -0
- nvfuser/include/nvfuser/mma_type.h +257 -0
- nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
- nvfuser/include/nvfuser/multidevice/communication.h +232 -0
- nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
- nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
- nvfuser/include/nvfuser/multidevice/executor.h +107 -0
- nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
- nvfuser/include/nvfuser/multidevice/utils.h +187 -0
- nvfuser/include/nvfuser/non_divisible_split.h +86 -0
- nvfuser/include/nvfuser/opaque_type.h +129 -0
- nvfuser/include/nvfuser/ops/alias.h +192 -0
- nvfuser/include/nvfuser/ops/all_ops.h +13 -0
- nvfuser/include/nvfuser/ops/arith.h +712 -0
- nvfuser/include/nvfuser/ops/composite.h +130 -0
- nvfuser/include/nvfuser/ops/indexing.h +55 -0
- nvfuser/include/nvfuser/ops/normalization.h +263 -0
- nvfuser/include/nvfuser/ops/utils.h +127 -0
- nvfuser/include/nvfuser/options.h +313 -0
- nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
- nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
- nvfuser/include/nvfuser/polymorphic_value.h +432 -0
- nvfuser/include/nvfuser/predicate_compute.h +213 -0
- nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
- nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
- nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
- nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
- nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
- nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
- nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
- nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
- nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
- nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
- nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
- nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
- nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
- nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
- nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
- nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
- nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
- nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
- nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
- nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
- nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
- nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
- nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
- nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
- nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
- nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
- nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
- nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
- nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
- nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
- nvfuser/include/nvfuser/scheduler/registry.h +97 -0
- nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
- nvfuser/include/nvfuser/scheduler/resize.h +41 -0
- nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
- nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
- nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
- nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
- nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
- nvfuser/include/nvfuser/scheduler/utils.h +771 -0
- nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
- nvfuser/include/nvfuser/serde/factory.h +55 -0
- nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
- nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
- nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
- nvfuser/include/nvfuser/serde/utils.h +34 -0
- nvfuser/include/nvfuser/struct.inl +127 -0
- nvfuser/include/nvfuser/swizzle.h +54 -0
- nvfuser/include/nvfuser/sys_utils.h +40 -0
- nvfuser/include/nvfuser/tensor_metadata.h +118 -0
- nvfuser/include/nvfuser/tma.h +124 -0
- nvfuser/include/nvfuser/transform_iter.h +522 -0
- nvfuser/include/nvfuser/transform_replay.h +297 -0
- nvfuser/include/nvfuser/transform_rfactor.h +33 -0
- nvfuser/include/nvfuser/transform_view.h +136 -0
- nvfuser/include/nvfuser/type.h +1125 -0
- nvfuser/include/nvfuser/type_promotion.h +61 -0
- nvfuser/include/nvfuser/utils.h +619 -0
- nvfuser/include/nvfuser/val_graph.h +446 -0
- nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
- nvfuser/include/nvfuser/validator_utils.h +92 -0
- nvfuser/include/nvfuser/vectorization_info.h +31 -0
- nvfuser/include/nvfuser/visibility.h +21 -0
- nvfuser/lib/libnvfuser_codegen.so +0 -0
- nvfuser/nvfuser_version.py +69 -0
- nvfuser/pytorch_utils.py +184 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
- nvfuser/utils.py +18 -0
- nvfuser/version.py +1 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +20 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
- nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
@@ -0,0 +1,111 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
#include <scheduler/all_schedulers.h>
|
10
|
+
|
11
|
+
namespace nvfuser {

class TensorView;
class ComputeAtLogicalDomainMap;
class ComputeAtMap;
class ExpressionEvaluator;
class KernelArgumentHolder;

//! Shared compile-time/runtime acceptance checks used by the scheduler
//! registry when deciding whether a fusion can be handled by a scheduler.
namespace registry_utils {

//! Returns true if the two output TensorViews have equivalent reduction
//! patterns according to the given logical domain map, i.e. they can be
//! scheduled together by a single reduction heuristic.
bool checkPatternEquivalence(
    TensorView* out_tv0,
    TensorView* out_tv1,
    const ComputeAtLogicalDomainMap& logical_map);

// Reusing some code from lowering specifically in lower_trivial_broadcast.cpp
// ConcretizedBroadcastDomains::maybeNonUniquelyConcretized this checks if
// there's a broadcast iteration domain that's being broadcasted to seemingly
// different extents, meaning we don't know in the kernel if the dimension is
// being broadcasted to one size multiple times or different sizes. This is a
// hard to optimize problem and likely indicates we shouldn't be fusing.
bool hasNonUniqueBcast(Fusion* fusion);

// TODO: remove this requirement entirely
//! Returns true if the fusion must be rejected by the given scheduler type
//! because of memory-promotion related limitations.
bool rejectScheduleForMemoryPromotion(
    Fusion* fusion,
    SchedulerType scheduler_type);

//! Returns true if the fusion's producer-consumer graph is connected, i.e.
//! it does not consist of multiple independent sub-graphs.
bool isConnectedFusionGraph(Fusion* fusion);

// Returns if a fusion cannot transformed into a consistent format since we
// can't transform forward through view operations, for example:
//
//   tv0[I0, I1, I2]
//   tv1[I0*I1, I2] = view(tv0)
//   tv2[I0, I1*I2] = view(tv0)
//
// If we start transform propagation at either tv1 or tv2, it would require
// "replaying forward" through the other. If we started at tv1 we'd have to be
// able to take tv2[I0, I1*I2] and transform it to [I0*I1, I2], however this
// would "undo" the view transformation which we do not support today.
//
// Returns true if a scenario like above is found in the fusion.
bool requiresForwardViewReplay(Fusion* fusion, ComputeAtMap& ca_map);

// Returns if view interferes with how we want to treat the reference, being
// at least a 2D reduction schedule but maybe a 3D reduction schedule.
bool reductionInterferingView(
    Fusion* fusion,
    const ComputeAtMap& ca_map,
    TensorView* reduction_reference);

// Check inputs, outputs and intermediates.
// Intermediates are contiguous, so strides are not necessary.
// Strides are required for inputs and also maybe for outputs as
// they may be non-contiguous. However, in our current interface,
// output strides are not available, so if there's any outputs that
// are non contiguous, need to fall back to 64-bit indexing.
PrimDataType getIndexTypeOfKernel(
    Fusion* fusion,
    const std::vector<TensorView*>& all_tvs,
    const KernelArgumentHolder& inputs,
    ExpressionEvaluator& ee);

//! Static topology checks shared by reduction-like schedulers: each method
//! inspects the fusion graph for a pattern that the schedulers cannot
//! handle and returns true if that problematic pattern is present.
class SchedulerTopologyChecker {
 public:
  // Checks if any broadcasts are resolved after a reduction that don't follow
  // the normalization pattern.
  static bool hasNonNormalizePostReductionBCast(Fusion* fusion);

  // Checks if any broadcasts are resolved after a reduction, this shouldn't
  // be accepted in the single reduction or multi-reduction scheduler.
  static bool hasPostReductionBCast(Fusion* fusion);

  // Checks if there's any unsupported operations post reduction. If outer
  // reduction we can fuse some pointwise ops if they don't require
  // broadcasting (checked in hasPostReductionBCast). For inner reductions we
  // cannot fuse any binary like operation (includes operations like shift
  // that we're not fusing right now) involving "new" inputs (not going
  // through a reduction).
  static bool supportedPostReductionFusion(
      Fusion* fusion,
      std::vector<TensorView*> reduction_tvs);

  // Checks if there's any gather-like ops that result in non-resolved
  // broadcast domains and then get squeezed before reaching reduction
  // TVs. The reduction scheduler uses reduction TVs as a scheduling
  // reference, so that won't be able to schedule the broadcast ID if
  // squeezed and its corresponding index-accessed producer ID, and
  // any IDs that the producer ID depends on.
  //
  // This analysis has some similarity as DomainMap. Can be
  // consolidated?
  static bool hasGatherToBroadcastBeforeReduction(
      Fusion* fusion,
      const std::vector<TensorView*>& reduction_tvs);
};

} // namespace registry_utils

} // namespace nvfuser
|
@@ -0,0 +1,41 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <scheduler/heuristic.h>
|
11
|
+
#include <scheduler/registry.h>
|
12
|
+
|
13
|
+
namespace nvfuser {

class Fusion;
class SchedulerRuntimeInfo;
class HeuristicDataCache;

//! Scheduler entry for fusions dominated by resize-based operations
//! (e.g. slice/pad-style index-range changes). Implements the standard
//! SchedulerEntry interface: compile-time acceptance, runtime acceptance,
//! heuristic computation, and scheduling.
class ResizeScheduler : public SchedulerEntry {
 public:
  //! Compile-time-only acceptance check; defined in the scheduler's .cpp.
  bool canScheduleCompileTime(Fusion* fusion) override;

  //! Runtime acceptance check. Always accepts: all rejection logic for this
  //! scheduler lives in canScheduleCompileTime, so no runtime input
  //! information is needed.
  bool canScheduleRunTime(
      Fusion* fusion,
      SchedulerRuntimeInfo& runtime_info,
      HeuristicDataCache* data_cache = nullptr) override {
    return true;
  }

  //! Builds the ResizeParams heuristics for the given fusion and runtime
  //! inputs; defined in the scheduler's .cpp.
  std::unique_ptr<HeuristicParams> computeHeuristics(
      Fusion* fusion,
      SchedulerRuntimeInfo& runtime_info,
      HeuristicDataCache* data_cache) override;

  //! Applies the scheduling transformations described by params to fusion.
  void schedule(Fusion* fusion, const HeuristicParams* params) override;

  //! Tag used by the registry to identify this scheduler.
  constexpr static SchedulerType schedulerType() {
    return SchedulerType::Resize;
  }
};

} // namespace nvfuser
|
@@ -0,0 +1,67 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <c10/util/hash.h>
|
11
|
+
#include <ir/interface_nodes.h>
|
12
|
+
#include <scheduler/heuristic.h>
|
13
|
+
#include <utils.h>
|
14
|
+
|
15
|
+
#include <sstream>
|
16
|
+
|
17
|
+
namespace nvfuser {
|
18
|
+
|
19
|
+
class ResizeParams : public HeuristicParams {
|
20
|
+
public:
|
21
|
+
ResizeParams() : HeuristicParams(SchedulerType::Resize) {};
|
22
|
+
|
23
|
+
// Split grid x dimension
|
24
|
+
bool split_grid_x_dim = false;
|
25
|
+
|
26
|
+
int64_t largest_input = -1;
|
27
|
+
|
28
|
+
int64_t vectorization_factor = 1;
|
29
|
+
|
30
|
+
static constexpr int64_t max_gdimx = (1L << 31) - 1L;
|
31
|
+
|
32
|
+
using HeuristicParams::HeuristicParams;
|
33
|
+
|
34
|
+
// Warning: Does not check launch parameters!
|
35
|
+
bool sameAs(const HeuristicParams* other_base) const override {
|
36
|
+
auto other = dynamic_cast<const ResizeParams*>(other_base);
|
37
|
+
if (other == nullptr) {
|
38
|
+
return false;
|
39
|
+
}
|
40
|
+
bool attr_equal = other->cparams == cparams &&
|
41
|
+
other->split_grid_x_dim == split_grid_x_dim &&
|
42
|
+
other->largest_input == largest_input &&
|
43
|
+
other->vectorization_factor == vectorization_factor;
|
44
|
+
return attr_equal;
|
45
|
+
}
|
46
|
+
|
47
|
+
std::string toString() const override {
|
48
|
+
std::stringstream ss;
|
49
|
+
ss << "\n===== Resize Parameters ========\n"
|
50
|
+
<< (tag.empty() ? "" : "Tag: ") << tag << " Resize Characteristics:\n"
|
51
|
+
<< " split grid x dim: " << split_grid_x_dim << "\n"
|
52
|
+
<< " index of largest input: " << largest_input << "\n"
|
53
|
+
<< " vectorization factor: " << vectorization_factor << "\n";
|
54
|
+
ss << "====================================\n";
|
55
|
+
return ss.str();
|
56
|
+
}
|
57
|
+
|
58
|
+
size_t hash() const override {
|
59
|
+
return c10::get_hash(split_grid_x_dim);
|
60
|
+
}
|
61
|
+
|
62
|
+
std::unique_ptr<HeuristicParams> clone() const override {
|
63
|
+
return std::make_unique<ResizeParams>(*this);
|
64
|
+
}
|
65
|
+
};
|
66
|
+
|
67
|
+
} // namespace nvfuser
|
@@ -0,0 +1,166 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
#include <cstddef>
|
10
|
+
#include <cstdint>
|
11
|
+
|
12
|
+
#include <expr_evaluator.h>
|
13
|
+
#include <fusion.h>
|
14
|
+
#include <runtime/executor_kernel_arg.h>
|
15
|
+
#include <utils.h>
|
16
|
+
#include <visibility.h>
|
17
|
+
|
18
|
+
namespace nvfuser {

class ExpressionEvaluator;

//! SchedulerRuntimeInfo is the abstraction introduced in
//! this PR for passing runtime input dependent information
//! to the schedulers and kernel caches.
//!
//! Note:
//! if any additional info needed, or maybe just the inputs themselves it
//! could just be added to this class, and they will be distributed to the
//! segmenter and schedulers.
//! It is important that input id encoding should be up to date with any change
//! of this class to avoid launching compiled kernels with illegal inputs.

class SchedulerRuntimeInfo : public NonCopyable {
 public:
  // Max vector size we will consider, in bytes,
  // currently set to 16B = 128b
  static constexpr int64_t max_alignment_size_in_byte = 16;

  //! Create runtime info for given fusion and input. Creating and binding
  //! evaluator is optional. The evaluator is used to manage intermediate
  //! integers in the fusion. We need them for segmenter and schedulers,
  //! but we don't need them when we are just using this class to provide
  //! additional encoding for kernel cache lookup.
  //!
  //! The index type of forced_index_type is used if given, no matter
  //! how large the actual arguments and fusion tensors
  //! are. CORRECTNESS IS NOT GUARANTEED.
  SchedulerRuntimeInfo(
      Fusion* complete_fusion,
      KernelArgumentHolder args,
      PrecomputedValues* precomputed_values = nullptr,
      const std::vector<TensorView*>& all_tvs = {},
      std::optional<PrimDataType> forced_index_type = std::nullopt);

  //! Convenience overload taking ATen IValues directly.
  NVF_API SchedulerRuntimeInfo(
      Fusion* complete_fusion,
      const at::ArrayRef<c10::IValue>& aten_inputs);

  //! Lookup for the alignment sizes of the given tv. Currently only returns
  //! actual alignment info for input tensors to the complete fusion,
  //! and for other intermediate/fuser-allocated tensors will
  //! return max_alignment_size_in_byte.
  size_t getAlignmentSize(TensorView* tv);

  //! Returns sizes of tensor dimensions in same order as allocation domain,
  //! ignoring any IterType::Reduction domains in the allocation domain. This
  //! only works for complete Fusion inputs whose allocation domain is a
  //! permutation of their root domain and will raise an exception otherwise.
  const std::vector<int64_t>& getInputAllocationSizes(TensorView* tv) const {
    NVF_ERROR(
        isInputTv(tv),
        "TensorView ",
        tv->toString(),
        " is not an input or its logical domain is not a permutation of its ",
        "allocation domain");
    auto sizes_it = input_sizes_.find(tv);
    // Every fusion input should have been registered at construction time.
    NVF_ERROR(sizes_it != input_sizes_.end());
    return sizes_it->second;
  }

  //! Returns strides of tensor in same order as allocation domain, in elements
  //! instead of bytes. Only works for complete Fusion inputs whose allocation
  //! domain is a permutation of their root domain and will raise an exception
  //! otherwise.
  const std::vector<int64_t>& getInputAllocationStrides(TensorView* tv) const {
    NVF_ERROR(
        isInputTv(tv),
        "TensorView ",
        tv->toString(),
        " is not an input or its logical domain is not a permutation of its ",
        "allocation domain");
    auto strides_it = input_strides_elements_.find(tv);
    // Every fusion input should have been registered at construction time.
    NVF_ERROR(strides_it != input_strides_elements_.end());
    return strides_it->second;
  }

  // Computes alignment size in bytes for provided ptr address
  static size_t computeAlignmentSize(size_t ptr_address);

  // Return the runtime pointer value for provided tensor view
  size_t ptrOf(TensorView* tv) const;

  //! Index type (32- or 64-bit) selected for the kernel at construction.
  PrimDataType getIndexType() const {
    return index_type_;
  }

  //! The complete (unsegmented) fusion this runtime info describes.
  Fusion* fusion() {
    return complete_fusion_;
  }

  //! Evaluator bound to the fusion inputs. Only valid when the constructor
  //! was asked to create one; errors otherwise.
  ExpressionEvaluator& expressionEvaluator() {
    NVF_ERROR(expression_evaluator_ != nullptr);
    return *expression_evaluator_;
  }

 private:
  // Build and bind full fusion inputs to an expression evaluator
  std::unique_ptr<ExpressionEvaluator> getExpressionEvaluator(
      const KernelArgumentHolder& inputs,
      PrecomputedValues* precomputed_values);

  // True when tv is one of the complete fusion's inputs (linear scan over
  // the input list).
  bool isInputTv(TensorView* tv) const {
    return std::find(
               complete_fusion_->inputs().begin(),
               complete_fusion_->inputs().end(),
               tv) != complete_fusion_->inputs().end();
  }

 private:
  // Returns the offset of tv in the inputs ignoring non tensor views. Used to
  // access input_sizes, input_strides, input_ptr
  int offsetTensorPos(TensorView* tv);

  // Expression evaluator used to probe sizes in the fusion IR
  std::unique_ptr<ExpressionEvaluator> expression_evaluator_ = nullptr;

  // Fusion reference that this runtime info is associated with
  Fusion* complete_fusion_ = nullptr;

  // Copy of aten input pointer addresses
  // TODO: Support output tensor pointers
  std::unordered_map<Val*, size_t> input_ptrs_;

  // Copy of aten input tensor sizes ordered like the TensorView's allocation
  // domain
  std::unordered_map<Val*, std::vector<int64_t>> input_sizes_;

  // Copy of aten input tensor strides (in elements) ordered like the
  // TensorView's allocation domain
  std::unordered_map<Val*, std::vector<int64_t>> input_strides_elements_;

  // Copy of aten input tensor strides (in bytes) for only discontiguous
  // dimensions
  std::unordered_map<Val*, std::vector<size_t>> input_discontig_strides_;

  // Cache for getAlignmentSize
  std::unordered_map<TensorView*, size_t> alignment_map_;

  // Found index mode kernel needs to be run in
  PrimDataType index_type_ = PrimDataType::Int;

  // TODO: Remove
  std::unordered_map<TensorView*, size_t> vectorword_map_;
};

} // namespace nvfuser
|
@@ -0,0 +1,80 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <visibility.h>
|
11
|
+
#include <array>
|
12
|
+
#include <ostream>
|
13
|
+
#include <string>
|
14
|
+
|
15
|
+
namespace nvfuser {
|
16
|
+
|
17
|
+
//! Each SchedulerType maps to a scheduler in distinct CPP files.
|
18
|
+
//! For instance, SchedulerType::PointWise maps to PointWiseScheduler in
|
19
|
+
//! pointwise.cpp.
|
20
|
+
//!
|
21
|
+
//! Each of the scheduler needs to provide 3 interface functions:
|
22
|
+
//!
|
23
|
+
//! 1. canScheduleCompileTime(Fusion* fusion) :
|
24
|
+
//!
|
25
|
+
//! This function contains compiled-time checks on the graph itself
|
26
|
+
//! without runtime input information. Only `fusion` is given in the
|
27
|
+
//! argument to make sure only compile-time available info is needed in
|
28
|
+
//! the check.
|
29
|
+
//!
|
30
|
+
//! This function is to be called exactly once on each segmented group
|
31
|
+
//! created in a segmented fusion so this part will not contribute to
|
32
|
+
//! dynamic shape latency.
|
33
|
+
//!
|
34
|
+
//! 2. canScheduleRunTime(
|
35
|
+
//! Fusion* fusion,
|
36
|
+
//! SchedulerRuntimeInfo& runtime_info,
|
37
|
+
//! HeuristicDataCache* data_cache = nullptr):
|
38
|
+
//! This function contains all canSchedule checks that will have to
|
39
|
+
//! involve runtime input information, and will be run both by the
|
40
|
+
//! segmenter and the kernel cache. The latency of this function will
|
41
|
+
//! contribute to dynamic shape latency so `data_cache` should be used as
|
42
|
+
//! much as possible to save re-computation.
|
43
|
+
//!
|
44
|
+
//! 3. schedule(fusion):
|
45
|
+
//!
|
46
|
+
//! This function will be called when compiling a kernel. It should apply
|
47
|
+
//! scheduling to the given fusion
|
48
|
+
|
49
|
+
enum class SchedulerType {
  // Sentinel: no scheduler selected (e.g. no heuristic accepted the fusion).
  None,
  // Fusion requiring no real computation — presumably forwards/reshapes
  // inputs without a kernel body (name-based; confirm in no_op.cpp).
  NoOp,
  // Element-wise fusions; maps to PointWiseScheduler in pointwise.cpp
  // (per the file-level comment above).
  PointWise,
  // Matrix-multiplication fusions.
  Matmul,
  // Non-persistent reduction fusions.
  Reduction,
  // Persistent-buffer reductions over the inner dimension.
  InnerPersistent,
  // Persistent reductions over both inner and outer dimensions.
  InnerOuterPersistent,
  // Persistent-buffer reductions over the outer dimension.
  OuterPersistent,
  // Transpose-dominated fusions (see scheduler/transpose.h for strategy).
  Transpose,
  // Evaluate expressions at runtime without generating a kernel
  // (name-based; confirm in expr_eval.cpp).
  ExprEval,
  // Resize/slice-pad style fusions.
  Resize
};
|
62
|
+
|
63
|
+
//! Define a schedule table to loop over all the heuristics in priority order.
|
64
|
+
// `inline` (C++17) gives this header-defined constant a single definition
// across all translation units; a plain namespace-scope `constexpr` variable
// has internal linkage, so every TU that includes this header would otherwise
// get its own copy with a distinct address.
inline constexpr std::array<SchedulerType, 10> all_heuristics_in_priority_order =
    {SchedulerType::ExprEval,
     SchedulerType::NoOp,
     SchedulerType::Matmul,
     SchedulerType::Reduction,
     SchedulerType::Resize,
     SchedulerType::Transpose,
     SchedulerType::PointWise,
     SchedulerType::InnerPersistent,
     SchedulerType::OuterPersistent,
     SchedulerType::InnerOuterPersistent};
|
75
|
+
|
76
|
+
std::string toString(SchedulerType sh);
|
77
|
+
|
78
|
+
NVF_API std::ostream& operator<<(std::ostream& os, SchedulerType sh);
|
79
|
+
|
80
|
+
} // namespace nvfuser
|
@@ -0,0 +1,114 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <ATen/core/ivalue.h>
|
11
|
+
#include <exceptions.h>
|
12
|
+
#include <fusion.h>
|
13
|
+
#include <scheduler/registry.h>
|
14
|
+
#include <scheduler/transpose_heuristic.h>
|
15
|
+
#include <visibility.h>
|
16
|
+
|
17
|
+
#define SUPPORT_SPLITTING_INNERMOST_DIM 0
|
18
|
+
|
19
|
+
namespace nvfuser {
|
20
|
+
|
21
|
+
// Note [Transpose scheduling]
|
22
|
+
//
|
23
|
+
// The target of transpose scheduling is to get coalesced global memory access
|
24
|
+
// to as much input and output tensors as possible. For a DAG with only pure
|
25
|
+
// pointwise operators, the scheduling is very simple because the inner most
|
26
|
+
// dimension of all input and output tensors are all mapped together in the
|
27
|
+
// ComputeAtMap, i.e., there is essentially only one inner most dimension. In
|
28
|
+
// such case, we just vectorize that inner most dimension and bind it to
|
29
|
+
// threadIdx.x identically for all input and output tensors. In the case where
|
30
|
+
// transposes are present in the DAG, the inner most dimensions of different
|
31
|
+
// inputs and outputs might not match. And there is no fixed pattern on which
|
32
|
+
// input/output tensors should share the same inner most dimension with which.
|
33
|
+
// Consider the following example DAGs ([T] represents transpose, all tensors
|
34
|
+
// are 2D):
|
35
|
+
//
|
36
|
+
// t0 t1 t0 t1 t0 t1 t0 t1 t0
|
37
|
+
// \ | \ / \ | \ | |
|
38
|
+
// \ [T] [T] [T] \ [T] t2 [T] [T]
|
39
|
+
// \ / \ / \ / \ / \ / \ |
|
40
|
+
// t2 t2 t2 t3 t3 t4 t5 [T]
|
41
|
+
// |
|
42
|
+
// t1
|
43
|
+
//
|
44
|
+
// In order to support all these cases in a general way, the following
|
45
|
+
// perspective is very important: What we are looking for is to bind threadIdx.x
|
46
|
+
// differently for different inputs and outputs, so there has to be some tensor
|
47
|
+
// somewhere in the DAG that we write and read with different threadIdx.x
|
48
|
+
// bindings. The tensor of binding swap can be any tensor on the path that
|
49
|
+
// connects inputs/outputs with different inner most dimension, especially, it
|
50
|
+
// does not necessarily have to be the tensor of the transpose operator. In
|
51
|
+
// other words, thanks to our indexing system which is already taking care of the
|
52
|
+
// correctness of transpose, the scheduler can freely choose where to realize
|
53
|
+
// these transposes as different threadIdx.x bindings. This observation greatly
|
54
|
+
// simplifies our scheduling.
|
55
|
+
//
|
56
|
+
// Our scheduling strategy is as follows: We first split the inputs and outputs
|
57
|
+
// of the fusion into two groups according to their inner most dimension. The
|
58
|
+
// inner most dimensions of tensors in the same group are mapped to each other,
|
59
|
+
// and they are not mapped to the inner most dimension of tensors in a different
|
60
|
+
// group. Depending on the transpose pattern, there can be more than two groups,
|
61
|
+
// if this is the case, we only consider the two largest groups, and the tensors
|
62
|
+
// in the remaining groups will just be accessed unvectorized and uncoalesced.
|
63
|
+
// We call the largest group as `group1` and the second largest group as
|
64
|
+
// `group2`. When we have the groups, we will make a 2D tiling [I1, I2] ->
|
65
|
+
// [I1/tile1, tile1, I2/tile2, tile2] on the inner most dimensions of group1 and
|
66
|
+
// group2. If I1 and I2 are too small to make a 32x32 tile, such as in the
|
67
|
+
// fusion of transpose(T1[1024, 2, 1024, 2], {1, 3}), we merge in other
|
68
|
+
// dimensions to make a virtual I1 and I2. The details of how we create virtual
|
69
|
+
// I1 and I2 are described in note [Supporting small transpose dimensions].
|
70
|
+
//
|
71
|
+
// Each tile [tile1, tile2] will be handled by a block, and the tensors that
|
72
|
+
// have mismatched threadIdx.x bindings will use shared memory. The outer IDs of
|
73
|
+
// the tiling split will be merged with non-tiled IDs and then binded to
|
74
|
+
// blockIdx.x for the entire DAG, regardless of which group a tensor belongs to.
|
75
|
+
// For the inner tile IDs [tile1, tile2], we need to transform and parallelize
|
76
|
+
// group 1 and group 2 differently. The intermediate tensors can be transformed
|
77
|
+
// and parallelized consistently either with group 1 or group 2. Here, since
|
78
|
+
// group 1 is larger than group 2, we decide to only transform and parallelize
|
79
|
+
// the cached inputs of group 2 together with group 2, and keep the rest of the
|
80
|
+
// DAG consistent with group 1.
|
81
|
+
//
|
82
|
+
// If you would like to see an example of how to manually schedule a complicated
|
83
|
+
// DAG using this idea, refer to:
|
84
|
+
// FusionManualScheduleTransposeComplexDAG1_CUDA
|
85
|
+
|
86
|
+
class SchedulerRuntimeInfo;
|
87
|
+
class HeuristicDataCache;
|
88
|
+
|
89
|
+
//! Utility for canSchedule interface to check if this fusion has at least two
|
90
|
+
//! groups, each with a fully broadcasted reference tensor.
|
91
|
+
NVF_API bool hasAtLeastTwoValidGroups(Fusion* fusion);
|
92
|
+
|
93
|
+
//! Scheduler for transpose-dominated fusions. Implements the 2D-tiling
//! strategy described in note [Transpose scheduling] above: split the
//! fusion's inputs/outputs into groups by inner-most dimension, tile the
//! two largest groups, and swap threadIdx.x bindings through shared memory.
class TransposeScheduler : public SchedulerEntry {
 public:
  //! Compile-time eligibility check on the graph alone — no runtime input
  //! information is available here. Called once per segmented group, so it
  //! does not contribute to dynamic-shape latency (see the interface
  //! contract documented for SchedulerType).
  bool canScheduleCompileTime(Fusion* fusion) override;

  //! Runtime eligibility check using input metadata in `runtime_info`.
  //! Run by both the segmenter and the kernel cache; `data_cache` should be
  //! used to memoize work, since this path contributes to dynamic-shape
  //! latency.
  bool canScheduleRunTime(
      Fusion* fusion,
      SchedulerRuntimeInfo& runtime_info,
      HeuristicDataCache* data_cache = nullptr) override;

  //! Computes the transpose heuristic parameters (tile sizes, vectorization,
  //! etc.) for this fusion and runtime input configuration.
  std::unique_ptr<HeuristicParams> computeHeuristics(
      Fusion* fusion,
      SchedulerRuntimeInfo& runtime_info,
      HeuristicDataCache* data_cache) override;

  //! Applies the transpose scheduling transformations to `fusion` using the
  //! previously computed `params`. Called when compiling a kernel.
  void schedule(Fusion* fusion, const HeuristicParams* params) override;

  //! Identifies this scheduler in dispatch tables such as
  //! all_heuristics_in_priority_order.
  constexpr static SchedulerType schedulerType() {
    return SchedulerType::Transpose;
  }
};
|
113
|
+
|
114
|
+
} // namespace nvfuser
|