nvfuser-cu121-torch25 0.2.25.dev20250201__cp312-cp312-manylinux_2_28_x86_64.whl
Sign up to get free protection for your applications and to get access to all the features.
- nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
- nvfuser/__init__.py +618 -0
- nvfuser/__init__.pyi +4 -0
- nvfuser/contrib/__init__.py +9 -0
- nvfuser/contrib/nn/__init__.py +13 -0
- nvfuser/contrib/nn/normalization.py +725 -0
- nvfuser/include/nvfuser/alias_analysis.h +116 -0
- nvfuser/include/nvfuser/bfs.h +929 -0
- nvfuser/include/nvfuser/codegen.h +26 -0
- nvfuser/include/nvfuser/compute_at.h +28 -0
- nvfuser/include/nvfuser/compute_at_map.h +394 -0
- nvfuser/include/nvfuser/contiguity.h +351 -0
- nvfuser/include/nvfuser/cuda_utils.h +50 -0
- nvfuser/include/nvfuser/debug.h +50 -0
- nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
- nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
- nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
- nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
- nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
- nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
- nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
- nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
- nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
- nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
- nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
- nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
- nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
- nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
- nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
- nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
- nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
- nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
- nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
- nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
- nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
- nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
- nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
- nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
- nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
- nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
- nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
- nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
- nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
- nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
- nvfuser/include/nvfuser/device_lower/utils.h +382 -0
- nvfuser/include/nvfuser/device_lower/validation.h +74 -0
- nvfuser/include/nvfuser/disjoint_set.h +556 -0
- nvfuser/include/nvfuser/dispatch.h +334 -0
- nvfuser/include/nvfuser/driver_api.h +49 -0
- nvfuser/include/nvfuser/dynamic_transform.h +316 -0
- nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
- nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
- nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
- nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
- nvfuser/include/nvfuser/evaluator_common.h +295 -0
- nvfuser/include/nvfuser/exceptions.h +283 -0
- nvfuser/include/nvfuser/expr_evaluator.h +125 -0
- nvfuser/include/nvfuser/expr_simplifier.h +218 -0
- nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
- nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
- nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
- nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
- nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
- nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
- nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
- nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
- nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
- nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
- nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
- nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
- nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
- nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
- nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
- nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
- nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
- nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
- nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
- nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
- nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
- nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
- nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
- nvfuser/include/nvfuser/fusion.h +511 -0
- nvfuser/include/nvfuser/fusion_guard.h +37 -0
- nvfuser/include/nvfuser/fusion_profiler.h +311 -0
- nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
- nvfuser/include/nvfuser/global_allocator.h +27 -0
- nvfuser/include/nvfuser/grouped_reduction.h +47 -0
- nvfuser/include/nvfuser/host_ir/container.h +60 -0
- nvfuser/include/nvfuser/host_ir/executor.h +152 -0
- nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
- nvfuser/include/nvfuser/host_ir/lower.h +35 -0
- nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
- nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
- nvfuser/include/nvfuser/id_model/id_model.h +359 -0
- nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
- nvfuser/include/nvfuser/id_model/indexing.h +208 -0
- nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
- nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
- nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
- nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
- nvfuser/include/nvfuser/id_model/schedule.h +54 -0
- nvfuser/include/nvfuser/id_model/to_string.h +87 -0
- nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
- nvfuser/include/nvfuser/id_model/utils.h +176 -0
- nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
- nvfuser/include/nvfuser/index_compute.h +651 -0
- nvfuser/include/nvfuser/instrumentation.h +107 -0
- nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
- nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
- nvfuser/include/nvfuser/ir/builder.h +215 -0
- nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
- nvfuser/include/nvfuser/ir/cloner.h +185 -0
- nvfuser/include/nvfuser/ir/container.h +226 -0
- nvfuser/include/nvfuser/ir/graphviz.h +119 -0
- nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
- nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
- nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
- nvfuser/include/nvfuser/ir/iostream.h +98 -0
- nvfuser/include/nvfuser/ir/printer.h +57 -0
- nvfuser/include/nvfuser/ir/utils.h +801 -0
- nvfuser/include/nvfuser/iter_visitor.h +661 -0
- nvfuser/include/nvfuser/kernel.h +299 -0
- nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
- nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
- nvfuser/include/nvfuser/kernel_ir.h +1457 -0
- nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
- nvfuser/include/nvfuser/linked_hash_map.h +97 -0
- nvfuser/include/nvfuser/logical_domain_map.h +577 -0
- nvfuser/include/nvfuser/macros.h +23 -0
- nvfuser/include/nvfuser/mma_type.h +257 -0
- nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
- nvfuser/include/nvfuser/multidevice/communication.h +232 -0
- nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
- nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
- nvfuser/include/nvfuser/multidevice/executor.h +107 -0
- nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
- nvfuser/include/nvfuser/multidevice/utils.h +187 -0
- nvfuser/include/nvfuser/non_divisible_split.h +86 -0
- nvfuser/include/nvfuser/opaque_type.h +129 -0
- nvfuser/include/nvfuser/ops/alias.h +192 -0
- nvfuser/include/nvfuser/ops/all_ops.h +13 -0
- nvfuser/include/nvfuser/ops/arith.h +712 -0
- nvfuser/include/nvfuser/ops/composite.h +130 -0
- nvfuser/include/nvfuser/ops/indexing.h +55 -0
- nvfuser/include/nvfuser/ops/normalization.h +263 -0
- nvfuser/include/nvfuser/ops/utils.h +127 -0
- nvfuser/include/nvfuser/options.h +313 -0
- nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
- nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
- nvfuser/include/nvfuser/polymorphic_value.h +432 -0
- nvfuser/include/nvfuser/predicate_compute.h +213 -0
- nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
- nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
- nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
- nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
- nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
- nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
- nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
- nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
- nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
- nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
- nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
- nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
- nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
- nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
- nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
- nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
- nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
- nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
- nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
- nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
- nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
- nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
- nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
- nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
- nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
- nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
- nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
- nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
- nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
- nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
- nvfuser/include/nvfuser/scheduler/registry.h +97 -0
- nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
- nvfuser/include/nvfuser/scheduler/resize.h +41 -0
- nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
- nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
- nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
- nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
- nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
- nvfuser/include/nvfuser/scheduler/utils.h +771 -0
- nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
- nvfuser/include/nvfuser/serde/factory.h +55 -0
- nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
- nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
- nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
- nvfuser/include/nvfuser/serde/utils.h +34 -0
- nvfuser/include/nvfuser/struct.inl +127 -0
- nvfuser/include/nvfuser/swizzle.h +54 -0
- nvfuser/include/nvfuser/sys_utils.h +40 -0
- nvfuser/include/nvfuser/tensor_metadata.h +118 -0
- nvfuser/include/nvfuser/tma.h +124 -0
- nvfuser/include/nvfuser/transform_iter.h +522 -0
- nvfuser/include/nvfuser/transform_replay.h +297 -0
- nvfuser/include/nvfuser/transform_rfactor.h +33 -0
- nvfuser/include/nvfuser/transform_view.h +136 -0
- nvfuser/include/nvfuser/type.h +1125 -0
- nvfuser/include/nvfuser/type_promotion.h +61 -0
- nvfuser/include/nvfuser/utils.h +619 -0
- nvfuser/include/nvfuser/val_graph.h +446 -0
- nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
- nvfuser/include/nvfuser/validator_utils.h +92 -0
- nvfuser/include/nvfuser/vectorization_info.h +31 -0
- nvfuser/include/nvfuser/visibility.h +21 -0
- nvfuser/lib/libnvfuser_codegen.so +0 -0
- nvfuser/nvfuser_version.py +69 -0
- nvfuser/pytorch_utils.py +184 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
- nvfuser/utils.py +18 -0
- nvfuser/version.py +1 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
- nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
@@ -0,0 +1,349 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <compute_at_map.h>
|
11
|
+
#include <device_lower/analysis/divisible_split.h>
|
12
|
+
#include <exceptions.h>
|
13
|
+
#include <fusion.h>
|
14
|
+
#include <ir/all_nodes.h>
|
15
|
+
#include <scheduler/tools/maxinfo_propagator.h>
|
16
|
+
#include <visibility.h>
|
17
|
+
// TODO: Move to cpp file.
|
18
|
+
#include <ir/builder.h>
|
19
|
+
|
20
|
+
#include <sstream>
|
21
|
+
#include <unordered_map>
|
22
|
+
#include <utility>
|
23
|
+
#include <vector>
|
24
|
+
|
25
|
+
namespace nvfuser {
|
26
|
+
|
27
|
+
class SchedulerRuntimeInfo;
|
28
|
+
class HeuristicDataCache;
|
29
|
+
|
30
|
+
namespace vectorize_helper {
|
31
|
+
|
32
|
+
// Projects IterDomains through the fusion starting at provided reference. IDs
|
33
|
+
// in the reference are expected to be "contiguous", simply means dimensions
|
34
|
+
// that the iter domains are consecutive and next to each other in the
|
35
|
+
// reference. This property is not enforced, but mapping can have some
|
36
|
+
// unpredictable results if they are not. The reason we want contiguity here
|
37
|
+
// is this class is primarily used for vectorization analysis. Domains may be
|
38
|
+
// inserted or removed while propagating through the fusion and this class has
|
39
|
+
// to be sensitive to that.
|
40
|
+
//
|
41
|
+
// For example:
|
42
|
+
// Input: T0[i0, i2]
|
43
|
+
// Reference: T5[i0, i1, i2]
|
44
|
+
// If we want to base the vectorization size on the reference being contiguous
|
45
|
+
// in a 1D scheduler, we'd start the process on the reference with {i0, i1,
|
46
|
+
// i2}. When we propagate to the input what we would still like is: {i0, i1,
|
47
|
+
// i2} to signify to us that the root domains in the input that map to the
|
48
|
+
// reference are not contiguous. So when we think of vector word, if we want
|
49
|
+
// the input to be included in the vectorized dimensions, we can only check
|
50
|
+
// multiples based on i2, not i0*i1*i2 like the reference would indicate.
|
51
|
+
//
|
52
|
+
// Another example:
|
53
|
+
// Input:[i1, i0, i2]
|
54
|
+
// Reference [i0, i1, i2]
|
55
|
+
// Similarly as above when we propagate from the reference to the Input we'd
|
56
|
+
// like {i0, i1, i2}, which is the order of the reference, not the input. This
|
57
|
+
// is because we can compare that with the input domains to understand it's
|
58
|
+
// not ordered consistently, so once again we can only take into consideration
|
59
|
+
// vectorization based on i2.
|
60
|
+
//
|
61
|
+
// Another example:
|
62
|
+
// Input:[i0, i1, i2]
|
63
|
+
// Intermediate: [i1, i0, i2]
|
64
|
+
// Reference [i0, i1, i2]
|
65
|
+
// Keeping the ordering relative to the reference also allows us to look
|
66
|
+
// though transpose operations without missing out in a case like this that
|
67
|
+
// the reference and input are consistently ordered so we can look at i0*i1*i2
|
68
|
+
// for our vector multiple even though there are transposes in between them.
|
69
|
+
//
|
70
|
+
// The tricky part of this class is what happens through combinations of view
|
71
|
+
// and transpose. IterDomains are projected for example:
|
72
|
+
// tv0[2*3, 5*7, 11]
|
73
|
+
// tv1[2*3, 5, 7*11] = view(tv0)
|
74
|
+
// With tv1 and 7*11 as the reference and ids. When we project from tv1 to
|
75
|
+
// tv0, we'd map the inner most 11, but we also want to map the 5*7 with an
|
76
|
+
// extent of 7. This can get tricky though as:
|
77
|
+
// tv0[2, 3*5*7, 11]
|
78
|
+
// tv1[2*3, 5, 7*11] = view(tv0)
|
79
|
+
// with tv1 and [2*3, 7*11] as the reference and ids. tv0's 2 and 11 dim are
|
80
|
+
// easily identified as being mapped. The 3*5*7 dimension however, is
|
81
|
+
// partially mapped on the left and right side. Since this class is intended to
|
82
|
+
// line up "inner dimensions" of tensors through out the graph for the purpose
|
83
|
+
// of unrolling and vectorization, it only tracks partial dimensions as they are
|
84
|
+
// on the right hand side of iteration domains. For example in the last case we
|
85
|
+
// would only identify tv0's 3*5*7 dimension as being a mapping with extent 7.
|
86
|
+
// If we further had:
|
87
|
+
// tv0[5*7*11]
|
88
|
+
// tv1[5*7, 11] = view(tv0)
|
89
|
+
// tv2[5, 7*11] = view(tv1)
|
90
|
+
// with tv2 and [7*11] as the reference and ids (this could be a valid example
|
91
|
+
// from the pointwise scheduler).
|
92
|
+
// (1) tv1 would:
|
93
|
+
// map on 5*7 with extent 7
|
94
|
+
// map on 11 with extent 11.
|
95
|
+
// (1) tv0 would:
|
96
|
+
// map on 5*7*11 with size 7*11
|
97
|
+
//
|
98
|
+
// Finally if we have:
|
99
|
+
// tv0[3, 5, 7]
|
100
|
+
// tv1[7, 5, 3] = view(tv0)
|
101
|
+
// tv2[3, 5, 7] = view(tv1)
|
102
|
+
// with tv2 mapping on 5, 7
|
103
|
+
// We use fractional, symbolic, and conditional mappings so tv1:
|
104
|
+
// maps on 3 with extent 3
|
105
|
+
// maps on 5 with extent 5
|
106
|
+
// maps on 7 with extent (5*7)/(5*3)
|
107
|
+
// Then tv0:
|
108
|
+
// maps on 7 with extent 7
|
109
|
+
// maps on 5 with extent 5
|
110
|
+
//
|
111
|
+
// This class is responsible for both computing the spanning tree and running
|
112
|
+
// the spanning tree.
|
113
|
+
//
|
114
|
+
// In other words this class implements:
|
115
|
+
// MaxInfoSpanningTree::computeInfoC2P
|
116
|
+
// and
|
117
|
+
// MaxInfoSpanningTree::Propagator::propagateC2P
|
118
|
+
//
|
119
|
+
// The challenge here is the information we need for
|
120
|
+
// MaxInfoSpanningTree::computeInfoC2P is the same information we need to
|
121
|
+
// compute for MaxInfoSpanningTree::Propagator::propagateC2P
|
122
|
+
//
|
123
|
+
// We could compute both of these passes at the same time, only saving the
|
124
|
+
// result produced from processing the edge that's chosen from
|
125
|
+
// MaxInfoSpanningTree::computeInfoC2P while processing based on
|
126
|
+
// MaxInfoSpanningTree::Propagator::propagateC2P. However, this would require
|
127
|
+
// refactoring of MaxInfoSpanningTree so for right now this class just uses
|
128
|
+
// two passes.
|
129
|
+
//
|
130
|
+
// MaxInfoSpanningTree::computeInfoC2P runs first with recording_=false and
|
131
|
+
// will effectively compute the values of projected_root_ids_ and
|
132
|
+
// projected_logical_ids_. However it will compute these by running all edges
|
133
|
+
// between expressions. Therefore,
|
134
|
+
// MaxInfoSpanningTree::Propagator::propagateC2P later simply calls
|
135
|
+
// MaxInfoSpanningTree::computeInfoC2P with recording_=true where it will
|
136
|
+
// actually record the computed information since it will be then projected
|
137
|
+
// through the DAG maximizing saving information.
|
138
|
+
class NVF_API ContiguousInnerDimensionsMapper
|
139
|
+
: public MaxInfoSpanningTree,
|
140
|
+
MaxInfoSpanningTree::Propagator {
|
141
|
+
public:
|
142
|
+
ContiguousInnerDimensionsMapper() = delete;
|
143
|
+
|
144
|
+
static ContiguousInnerDimensionsMapper map(
|
145
|
+
TensorView* reference,
|
146
|
+
const std::vector<IterDomain*>& ids,
|
147
|
+
std::shared_ptr<const ComputeAtMap> ca_map,
|
148
|
+
const std::unordered_set<Split*>& divisible_splits);
|
149
|
+
|
150
|
+
static ContiguousInnerDimensionsMapper map(
|
151
|
+
TensorView* reference,
|
152
|
+
const std::vector<IterDomain*>& ids) {
|
153
|
+
auto ca_map = std::make_shared<ComputeAtMap>(reference->fusion());
|
154
|
+
auto divisible_splits =
|
155
|
+
getAllDivisibleSplits(reference->fusion(), ca_map.get());
|
156
|
+
return ContiguousInnerDimensionsMapper::map(
|
157
|
+
reference, ids, ca_map, divisible_splits);
|
158
|
+
}
|
159
|
+
|
160
|
+
bool hasMappedDims(TensorView* tv) const {
|
161
|
+
return tv_infos_.find(tv) != tv_infos_.end();
|
162
|
+
}
|
163
|
+
|
164
|
+
const std::vector<IterDomain*>& mappedRootIds(TensorView* tv) const {
|
165
|
+
NVF_ERROR(
|
166
|
+
tv_infos_.find(tv) != tv_infos_.end(),
|
167
|
+
"TensorView not found: ",
|
168
|
+
tv->toString());
|
169
|
+
return std::dynamic_pointer_cast<const MappedDomain>(tv_infos_.at(tv))
|
170
|
+
->mapped_root_ids_;
|
171
|
+
}
|
172
|
+
|
173
|
+
const std::vector<IterDomain*>& mappedLogicalIds(TensorView* tv) const {
|
174
|
+
NVF_ERROR(
|
175
|
+
tv_infos_.find(tv) != tv_infos_.end(),
|
176
|
+
"TensorView not found: ",
|
177
|
+
tv->toString());
|
178
|
+
return std::dynamic_pointer_cast<const MappedDomain>(tv_infos_.at(tv))
|
179
|
+
->mapped_logical_ids_;
|
180
|
+
}
|
181
|
+
|
182
|
+
Val* getProjectedExtent(IterDomain* id) const {
|
183
|
+
if (projected_extent_.find(id) == projected_extent_.end()) {
|
184
|
+
NVF_THROW("Not projected: ", id->toString());
|
185
|
+
}
|
186
|
+
return projected_extent_.at(id);
|
187
|
+
}
|
188
|
+
|
189
|
+
std::unordered_map<TensorView*, Val*> getTvToContigMergeOfInnerSizeMap();
|
190
|
+
|
191
|
+
private:
|
192
|
+
ContiguousInnerDimensionsMapper(
|
193
|
+
TensorView* reference,
|
194
|
+
const std::vector<IterDomain*>& ids,
|
195
|
+
std::shared_ptr<const ComputeAtMap> ca_map,
|
196
|
+
const std::unordered_set<Split*>& divisible_splits);
|
197
|
+
|
198
|
+
class MappedDomain : public Information {
|
199
|
+
public:
|
200
|
+
MappedDomain() = default;
|
201
|
+
|
202
|
+
static std::shared_ptr<MappedDomain> build(
|
203
|
+
std::vector<IterDomain*> root_ids,
|
204
|
+
std::vector<IterDomain*> logical_ids,
|
205
|
+
bool is_c2p) {
|
206
|
+
auto ptr = std::make_shared<MappedDomain>();
|
207
|
+
ptr->mapped_root_ids_ = root_ids;
|
208
|
+
ptr->mapped_logical_ids_ = logical_ids;
|
209
|
+
ptr->is_c2p_ = is_c2p;
|
210
|
+
return ptr;
|
211
|
+
}
|
212
|
+
|
213
|
+
operator bool() const final {
|
214
|
+
return !mapped_root_ids_.empty() || !mapped_logical_ids_.empty();
|
215
|
+
}
|
216
|
+
|
217
|
+
bool operator<(const Information& other_info) const final {
|
218
|
+
auto other_mapped_domain = dynamic_cast<const MappedDomain&>(other_info);
|
219
|
+
|
220
|
+
if (is_c2p_) {
|
221
|
+
return mapped_logical_ids_.size() <
|
222
|
+
other_mapped_domain.mapped_logical_ids_.size();
|
223
|
+
}
|
224
|
+
return mapped_root_ids_.size() <
|
225
|
+
other_mapped_domain.mapped_root_ids_.size();
|
226
|
+
}
|
227
|
+
|
228
|
+
std::vector<IterDomain*> mapped_root_ids_;
|
229
|
+
std::vector<IterDomain*> mapped_logical_ids_;
|
230
|
+
// Information is not symmetric between c2p and p2c, track which direction
|
231
|
+
// the computation is in for the < operator
|
232
|
+
bool is_c2p_ = true;
|
233
|
+
};
|
234
|
+
|
235
|
+
// TODO: make pe a lanmda function so it is not evaluated if not needed
|
236
|
+
void addProjectedExtent(IterDomain* id, Val* pe) {
|
237
|
+
if (!recording_) {
|
238
|
+
return;
|
239
|
+
}
|
240
|
+
|
241
|
+
NVF_ERROR(
|
242
|
+
projected_extent_.count(id) == 0,
|
243
|
+
"Already registered: ",
|
244
|
+
id->toString(),
|
245
|
+
", existing: ",
|
246
|
+
projected_extent_.at(id)->toInlineString(),
|
247
|
+
", new: ",
|
248
|
+
pe->toInlineString());
|
249
|
+
|
250
|
+
projected_extent_[id] = pe;
|
251
|
+
}
|
252
|
+
|
253
|
+
// Return a boolean predicate indicating if the given ID is fully projected.
|
254
|
+
Val* isFullyProjected(IterDomain* id);
|
255
|
+
|
256
|
+
// From the projected extent (PE) of I1 and I2, update the PE of I1*I2.
|
257
|
+
template <typename MergeOrSplit>
|
258
|
+
void combinePE(const MergeOrSplit* merge_or_split, bool outer_maps);
|
259
|
+
// From the projected extent (PE) of I1*I2, update the PE of I1 and I2.
|
260
|
+
template <typename MergeOrSplit>
|
261
|
+
void distributePE(const MergeOrSplit* merge_or_split);
|
262
|
+
|
263
|
+
// Returns the projected inner size. Contiguous inner dimensions are merged.
|
264
|
+
Val* getContigMergeOfInnerSize(TensorView* of_tv);
|
265
|
+
|
266
|
+
// MaxInfoSpanningTree functions
|
267
|
+
std::shared_ptr<Information> computeInfoC2P(
|
268
|
+
TensorView* from,
|
269
|
+
TensorView* to,
|
270
|
+
std::shared_ptr<Information> from_info) final;
|
271
|
+
|
272
|
+
std::shared_ptr<Information> computeInfoP2C(
|
273
|
+
TensorView* from,
|
274
|
+
TensorView* to,
|
275
|
+
std::shared_ptr<Information> from_info) final;
|
276
|
+
|
277
|
+
std::shared_ptr<Information> computeInfoSibling(
|
278
|
+
TensorView* from,
|
279
|
+
TensorView* to,
|
280
|
+
std::shared_ptr<Information> from_info) final;
|
281
|
+
|
282
|
+
// Projection from root<->logical domains
|
283
|
+
std::vector<IterDomain*> projectId(
|
284
|
+
const std::vector<IterDomain*>& from,
|
285
|
+
const std::vector<IterDomain*>& to);
|
286
|
+
|
287
|
+
// Propagator functions
|
288
|
+
void propagateC2P(TensorView* from, TensorView* to) final;
|
289
|
+
void propagateP2C(TensorView* from, TensorView* to) final;
|
290
|
+
void propagateSibling(TensorView* from, TensorView* to) final;
|
291
|
+
|
292
|
+
// Initialized to false, series of compute... calls will be performed to find
|
293
|
+
// the spanning tree. Then propagate... calls will call the compute... calls.
|
294
|
+
// recording_ starts as false, and stays that way during the first series of
|
295
|
+
// compute... calls. As soon as the first propagate... calls are called,
|
296
|
+
// recording_ will perpetually stay on.
|
297
|
+
bool recording_ = false;
|
298
|
+
|
299
|
+
std::shared_ptr<const ComputeAtMap> ca_map_;
|
300
|
+
const std::unordered_set<Split*>& divisible_splits_;
|
301
|
+
|
302
|
+
// Mapped root dimensions for each TensorView as we propogate. These
|
303
|
+
// mappings are in the order of the reference.
|
304
|
+
|
305
|
+
std::unordered_map<
|
306
|
+
TensorView*,
|
307
|
+
std::shared_ptr<MaxInfoSpanningTree::Information>>
|
308
|
+
tv_infos_;
|
309
|
+
|
310
|
+
std::unordered_map<IterDomain*, Val*> projected_extent_;
|
311
|
+
};
|
312
|
+
|
313
|
+
// Returns the vectorization factor determined for reference_tv.
// logical_reorder_map is provided to assume reference_tv will be reordered per
// the map, hence changing the order of IterDomain in the reference
int64_t getVectorizationFactor(
    SchedulerRuntimeInfo& runtime_info,
    TensorView* reference_tv,
    HeuristicDataCache* data_cache,
    int64_t break_point,
    const std::unordered_map<int64_t, int64_t>& logical_reorder = {});

// Returns the vectorization factor for a transpose group, capped at
// max_vectorization and computed against every TensorView in vec_tv.
int64_t getVectorizationFactorTransposeGroup(
    SchedulerRuntimeInfo& runtime_info,
    TensorView* reference,
    int64_t inner_most_dim,
    const std::vector<int64_t>& dims_to_merge,
    const std::vector<TensorView*>& vec_tv,
    int64_t max_vectorization);

//! Find the break point for vectorization. Here, we vectorize either
//! the innermost reduction or iteration domains. We use the producer
//! of the reduction as a reference of the vectorization
//! analysis.
//
//! Since this is for the reduction and normalization schedulers, the
//! producer of the reduction should not have reduction domains,
//! except when it's a fusion input, in which case the reduction
//! domains of the producer should just be ignored.
//
//! \param reduction_consumer
//! \param reduction_producer
//! \param consumer_innermost_ndims Innermost consumer domains to vectorize
int64_t getVectorizationBreakPointOfReductionProducer(
    TensorView* reduction_consumer,
    TensorView* reduction_producer,
    int64_t consumer_innermost_ndims);
|
347
|
+
|
348
|
+
} // namespace vectorize_helper
|
349
|
+
} // namespace nvfuser
|
@@ -0,0 +1,55 @@
|
|
1
|
+
// clang-format off
|
2
|
+
/*
|
3
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
|
4
|
+
* All rights reserved.
|
5
|
+
* SPDX-License-Identifier: BSD-3-Clause
|
6
|
+
*/
|
7
|
+
// clang-format on
|
8
|
+
#pragma once
|
9
|
+
|
10
|
+
#include <exceptions.h>
#include <type.h>

#include <functional>
#include <vector>
|
13
|
+
|
14
|
+
namespace nvfuser::serde {
|
15
|
+
|
16
|
+
// Flatbuffer enum are represented as an unscoped enumeration, so we can map
|
17
|
+
// them to an Integer type. This Factory class contains a vector that maps from
|
18
|
+
// an enum integer to its corresponding parser function.
|
19
|
+
//
|
20
|
+
// All parser functions have the same signature. We use lambdas to support
|
21
|
+
// functions that require extra arguments.
|
22
|
+
|
23
|
+
template <typename SerdeBuffer, typename BaseTypePtr>
|
24
|
+
class Factory {
|
25
|
+
public:
|
26
|
+
// A function pointer that creates a BaseType object given a Buffer
|
27
|
+
typedef std::function<BaseTypePtr(const SerdeBuffer*)> SerdeParser;
|
28
|
+
|
29
|
+
Factory(size_t num_parsers) : parsers_(num_parsers, nullptr) {};
|
30
|
+
|
31
|
+
template <typename SerdeEnum>
|
32
|
+
void registerParser(SerdeEnum serde_type, SerdeParser parser) {
|
33
|
+
auto serde_integer = nvfuser::toUnderlying(serde_type);
|
34
|
+
NVF_ERROR(
|
35
|
+
serde_integer >= 0 && serde_integer < (int)parsers_.size(),
|
36
|
+
"RegisterParser: Invalid serde type: ",
|
37
|
+
serde_integer);
|
38
|
+
parsers_.at(serde_integer) = parser;
|
39
|
+
}
|
40
|
+
|
41
|
+
template <typename SerdeEnum>
|
42
|
+
BaseTypePtr parse(SerdeEnum serde_type, const SerdeBuffer* buffer) {
|
43
|
+
auto serde_integer = nvfuser::toUnderlying(serde_type);
|
44
|
+
NVF_ERROR(
|
45
|
+
serde_integer >= 0 && serde_integer < (int)parsers_.size(),
|
46
|
+
"Deserialize: Invalid serde type: ",
|
47
|
+
serde_integer);
|
48
|
+
return parsers_.at(serde_integer)(buffer);
|
49
|
+
}
|
50
|
+
|
51
|
+
private:
|
52
|
+
std::vector<SerdeParser> parsers_;
|
53
|
+
};
|
54
|
+
|
55
|
+
} // namespace nvfuser::serde
|