PyPI - nvfuser-cu121-torch25 - Versions diffs - 0.2.25.dev20250201__cp312-cp312-manylinux_2_28_x86_64.whl - Mend

nvfuser-cu121-torch25 0.2.25.dev20250201__cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242) hide show

nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
nvfuser/__init__.py +618 -0
nvfuser/__init__.pyi +4 -0
nvfuser/contrib/__init__.py +9 -0
nvfuser/contrib/nn/__init__.py +13 -0
nvfuser/contrib/nn/normalization.py +725 -0
nvfuser/include/nvfuser/alias_analysis.h +116 -0
nvfuser/include/nvfuser/bfs.h +929 -0
nvfuser/include/nvfuser/codegen.h +26 -0
nvfuser/include/nvfuser/compute_at.h +28 -0
nvfuser/include/nvfuser/compute_at_map.h +394 -0
nvfuser/include/nvfuser/contiguity.h +351 -0
nvfuser/include/nvfuser/cuda_utils.h +50 -0
nvfuser/include/nvfuser/debug.h +50 -0
nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
nvfuser/include/nvfuser/device_lower/utils.h +382 -0
nvfuser/include/nvfuser/device_lower/validation.h +74 -0
nvfuser/include/nvfuser/disjoint_set.h +556 -0
nvfuser/include/nvfuser/dispatch.h +334 -0
nvfuser/include/nvfuser/driver_api.h +49 -0
nvfuser/include/nvfuser/dynamic_transform.h +316 -0
nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
nvfuser/include/nvfuser/evaluator_common.h +295 -0
nvfuser/include/nvfuser/exceptions.h +283 -0
nvfuser/include/nvfuser/expr_evaluator.h +125 -0
nvfuser/include/nvfuser/expr_simplifier.h +218 -0
nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
nvfuser/include/nvfuser/fusion.h +511 -0
nvfuser/include/nvfuser/fusion_guard.h +37 -0
nvfuser/include/nvfuser/fusion_profiler.h +311 -0
nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
nvfuser/include/nvfuser/global_allocator.h +27 -0
nvfuser/include/nvfuser/grouped_reduction.h +47 -0
nvfuser/include/nvfuser/host_ir/container.h +60 -0
nvfuser/include/nvfuser/host_ir/executor.h +152 -0
nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
nvfuser/include/nvfuser/host_ir/lower.h +35 -0
nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
nvfuser/include/nvfuser/id_model/id_model.h +359 -0
nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
nvfuser/include/nvfuser/id_model/indexing.h +208 -0
nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
nvfuser/include/nvfuser/id_model/schedule.h +54 -0
nvfuser/include/nvfuser/id_model/to_string.h +87 -0
nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
nvfuser/include/nvfuser/id_model/utils.h +176 -0
nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
nvfuser/include/nvfuser/index_compute.h +651 -0
nvfuser/include/nvfuser/instrumentation.h +107 -0
nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
nvfuser/include/nvfuser/ir/builder.h +215 -0
nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
nvfuser/include/nvfuser/ir/cloner.h +185 -0
nvfuser/include/nvfuser/ir/container.h +226 -0
nvfuser/include/nvfuser/ir/graphviz.h +119 -0
nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
nvfuser/include/nvfuser/ir/iostream.h +98 -0
nvfuser/include/nvfuser/ir/printer.h +57 -0
nvfuser/include/nvfuser/ir/utils.h +801 -0
nvfuser/include/nvfuser/iter_visitor.h +661 -0
nvfuser/include/nvfuser/kernel.h +299 -0
nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
nvfuser/include/nvfuser/kernel_ir.h +1457 -0
nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
nvfuser/include/nvfuser/linked_hash_map.h +97 -0
nvfuser/include/nvfuser/logical_domain_map.h +577 -0
nvfuser/include/nvfuser/macros.h +23 -0
nvfuser/include/nvfuser/mma_type.h +257 -0
nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
nvfuser/include/nvfuser/multidevice/communication.h +232 -0
nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
nvfuser/include/nvfuser/multidevice/executor.h +107 -0
nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
nvfuser/include/nvfuser/multidevice/utils.h +187 -0
nvfuser/include/nvfuser/non_divisible_split.h +86 -0
nvfuser/include/nvfuser/opaque_type.h +129 -0
nvfuser/include/nvfuser/ops/alias.h +192 -0
nvfuser/include/nvfuser/ops/all_ops.h +13 -0
nvfuser/include/nvfuser/ops/arith.h +712 -0
nvfuser/include/nvfuser/ops/composite.h +130 -0
nvfuser/include/nvfuser/ops/indexing.h +55 -0
nvfuser/include/nvfuser/ops/normalization.h +263 -0
nvfuser/include/nvfuser/ops/utils.h +127 -0
nvfuser/include/nvfuser/options.h +313 -0
nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
nvfuser/include/nvfuser/polymorphic_value.h +432 -0
nvfuser/include/nvfuser/predicate_compute.h +213 -0
nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
nvfuser/include/nvfuser/scheduler/registry.h +97 -0
nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
nvfuser/include/nvfuser/scheduler/resize.h +41 -0
nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
nvfuser/include/nvfuser/scheduler/utils.h +771 -0
nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
nvfuser/include/nvfuser/serde/factory.h +55 -0
nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
nvfuser/include/nvfuser/serde/utils.h +34 -0
nvfuser/include/nvfuser/struct.inl +127 -0
nvfuser/include/nvfuser/swizzle.h +54 -0
nvfuser/include/nvfuser/sys_utils.h +40 -0
nvfuser/include/nvfuser/tensor_metadata.h +118 -0
nvfuser/include/nvfuser/tma.h +124 -0
nvfuser/include/nvfuser/transform_iter.h +522 -0
nvfuser/include/nvfuser/transform_replay.h +297 -0
nvfuser/include/nvfuser/transform_rfactor.h +33 -0
nvfuser/include/nvfuser/transform_view.h +136 -0
nvfuser/include/nvfuser/type.h +1125 -0
nvfuser/include/nvfuser/type_promotion.h +61 -0
nvfuser/include/nvfuser/utils.h +619 -0
nvfuser/include/nvfuser/val_graph.h +446 -0
nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
nvfuser/include/nvfuser/validator_utils.h +92 -0
nvfuser/include/nvfuser/vectorization_info.h +31 -0
nvfuser/include/nvfuser/visibility.h +21 -0
nvfuser/lib/libnvfuser_codegen.so +0 -0
nvfuser/nvfuser_version.py +69 -0
nvfuser/pytorch_utils.py +184 -0
nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
nvfuser/utils.py +18 -0
nvfuser/version.py +1 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0

nvfuser/include/nvfuser/python_frontend/fusion_cache.h ADDED Viewed

@@ -0,0 +1,298 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <exceptions.h>
+#include <visibility.h>
+#include <python_frontend/fusion_record.h>
+#include <runtime/fusion_executor_cache.h>
+#include <scheduler/compile_time_info.h>
+#include <scheduler/registry.h>
+#include <memory>
+#include <mutex>
+namespace nvfuser::python_frontend {
+//! \struct UserSchedule
+//! \brief A container to hold a scheduled Fusion IR as well as an executor
+//! to contain the corresponding generated kernel.
+struct UserSchedule {
+  UserSchedule(int64_t fusion_id, int64_t device_id);
+  //! Runtime information for schedulers; runtimeInfo() checks it is non-null
+  std::unique_ptr<SchedulerRuntimeInfo> runtime_info;
+  //! The scheduler heuristic for this UserSchedule
+  std::unique_ptr<SchedulerEntry> scheduler;
+  //! The parameters for scheduler heuristic.
+  std::unique_ptr<HeuristicParams> heuristic_params;
+  //! The compile-time data cache.
+  std::unique_ptr<HeuristicDataCache> data_cache;
+  //! Concretized, Scheduled Fusion IR; fusion() checks it is non-null
+  std::unique_ptr<Fusion> scheduled_fusion;
+  //! Generated kernel container
+  std::unique_ptr<KernelExecutor> executor;
+  //! ID of fusion in python frontend fusion cache (in-class default: -1)
+  int64_t fusion_id_ = -1;
+  //! device ID for this user schedule (in-class default: -1)
+  int64_t device_id_ = -1;
+  //! Get a non-owning pointer to the scheduler runtime info; NVF_ERROR checks it is set
+  SchedulerRuntimeInfo* runtimeInfo() {
+    NVF_ERROR(
+        runtime_info != nullptr,
+        "Requires SchedulerRuntimeInfo to use heuristic schedulers");
+    return runtime_info.get();
+  }
+  //! Get a non-owning pointer to the scheduled Fusion; NVF_ERROR checks it is set
+  Fusion* fusion() {
+    NVF_ERROR(
+        scheduled_fusion != nullptr,
+        "Requires Fusion to use heuristic schedulers");
+    return scheduled_fusion.get();
+  }
+  //! Return whether we can schedule FusionDefinition with the given heuristic.
+  bool canSchedule(const SchedulerType& heuristic);
+  //! Return if we can schedule FusionDefinition with heuristic along with any
+  //! debug messages from canScheduleRejectReason.
+  std::tuple<bool, std::string> canScheduleDebug(
+      const SchedulerType& scheduler_type);
+  //! Create scheduler and get heuristic parameters for fusion.
+  HeuristicParams* computeHeuristics(SchedulerType scheduler_type);
+  //! Schedule fusion with selected heuristics and scheduler.
+  void schedule();
+  //! Schedule fusion with heuristic.
+  void scheduleWithType(SchedulerType scheduler_type);
+};
+//! \struct FusionSchedules
+//! \brief A container for auto generated and user defined schedules
+//! that correspond to compiled kernels for each complete Fusion Definition.
+struct FusionSchedules {
+  FusionSchedules(int64_t fusion_id = 0);
+  Fusion* preschedFusion();  // presumably the unscheduled Fusion IR held by auto_gen_schedules — TODO confirm
+  //! Schedules Automatically generated by nvFuser for dynamic inputs. (default)
+  //! NOTE: The FusionExecutorCache also holds the Unscheduled Fusion IR
+  std::unique_ptr<FusionExecutorCache> auto_gen_schedules;
+  //! Schedules defined by the user for specific input sizes.
+  //! They are also generated per device as all devices may not be the same.
+  //! Key:   Input Encoding hash of Fusion inputs as is created by the
+  //!        InputsIdLookup struct found inside of the FusionCache.
+  //! Value: A map keyed by device_id of User Defined Fusion Schedules.
+  std::unordered_map<size_t, std::unordered_map<int, UserSchedule>>
+      user_def_schedules;
+  //! Keeps a pointer to the last scheduled Fusion IR for printing
+  Fusion* last_user_def_scheduled_ir;  // NOTE(review): no in-class initializer — confirm the constructor sets it
+  //! Keeps a pointer to the last executed executor for printing its cuda kernel
+  KernelExecutor* last_user_def_executor;  // NOTE(review): no in-class initializer — confirm the constructor sets it
+  //! For thread-Safe locking of Fusion Schedules
+  std::mutex scheds_lock;
+  //! ID of fusion in python frontend fusion cache (in-class default: -1)
+  int64_t fusion_id_ = -1;
+  //! Fusion IDs of input arguments for FusionState
+  std::vector<int64_t> inputs_fid_;
+  //! IDs for Extents for TensorView input arguments for FusionState
+  std::vector<int64_t> extents_fid_;
+  //! Fusion IDs of output arguments for FusionState
+  std::vector<int64_t> outputs_fid_;
+  //! Map Fusion Val to its corresponding FusionDefinition index
+  std::unordered_map<const Val*, int64_t> map_value_to_fid_;
+};
+//! \struct TrieNode
+//! \brief Is the container for a Node in a prefix tree or trie
+//! where each node represents a statement in a fusion definition and
+//! the leaf Nodes represent a complete Fusion that is cached.
+struct TrieNode {
+  TrieNode(
+      RecordFunctor* rec,
+      TrieNode* _parent = nullptr,
+      size_t _fusion_id = 0);
+  // Queries whether the entry denotes a leaf node, which also represents
+  // the end of a Fusion entry in the cache.
+  bool isTerminal() const;
+  //! getException returns the cached exception raised during construction of
+  //! Fusion. It returns std::nullopt if no error was thrown. This function is
+  //! called at the end of FusionDefinition::finalizeDefinition to avoid
+  //! silently using a bad FusionDefinition cached in FusionCache.
+  std::optional<std::string> getException();
+  //! setException is called to record the exception message thrown during
+  //! construction of Fusion.
+  void setException(const char* e);
+  //! Serialize TrieNode using flatbuffers
+  NVF_API flatbuffers::Offset<serde::TrieNode> serialize(
+      flatbuffers::FlatBufferBuilder& builder,
+      const std::map<RecordFunctor*, size_t>&
+          map_record_functor_to_trie_node_id);
+  //! An entry's primary data is the record it holds
+  std::unique_ptr<RecordFunctor> record;
+  //! A hash map of the children for the current node.
+  //! The hash map hashes a pointer to a RecordFunctor because
+  //! the hash function is virtual.
+  std::unordered_map<RecordFunctor*, std::unique_ptr<TrieNode>> children;
+  //! An index into FusionCache's vector of nvFuser objects that hold an
+  //! unscheduled Fusion.  The id is only valid if the entry is terminal.
+  size_t fusion_id;
+  //! Count of times the Entry is traversed
+  size_t visits;
+  //! Parent node for printing
+  TrieNode* parent;
+  //! For thread-Safe locking of a node
+  std::mutex trie_node_lock;
+  //! exception is used to track if we failed to create a valid fusion for
+  //! FusionDefinition at this given TrieNode
+  std::optional<std::string> exception = std::nullopt;
+};
+//! \class FusionCache
+//! \brief A singleton class used in the nvFuser python interface
+//! to manage the caching of fusions.
+//!
+//! The fusion cache implements a prefix tree (trie) of records in order to
+//! cache fusions.  A leaf of the tree with a terminal node contains a
+//! container for caching the kernels generated for specific fusions.
+//!
+//! \todo
+//! Add the ability to evict a fusion.  There is currently a max number
+//! of fusions that is checked to prevent a runaway case.
+//!
+//! \note
+//! Thread-Safety is assured by the Python GIL.  If a no-GIL python is used
+//! then further scrutiny needs to be applied to the mutexes used to limit
+//! access to the singleton pointer, node creation, and user schedule
+//! creation.  Otherwise, the Python GIL provides a natural thread based mutex
+//! that does not allow for multiple threads to interact.
+class FusionCache {
+  //! The constructor is private given the FusionCache is only constructed
+  //! as a singleton.
+  FusionCache(size_t max_fusions, std::optional<int64_t> selected_device);
+ public:
+  //! Copy and Assignment of the FusionCache is not supported
+  //! clang-tidy: deleted member function should be public
+  FusionCache(const FusionCache&) = delete;
+  FusionCache& operator=(const FusionCache&) = delete;
+  //! The public methods up to the C++-only note below are the python interface
+  //! Gets a pointer to the singleton and creates a new one if necessary
+  NVF_API static FusionCache* get(
+      size_t max_fusions = 16384,
+      std::optional<int64_t> selected_device = std::nullopt,
+      bool load_from_default_workspace = true);
+  //! Number of fusions cached
+  NVF_API size_t numFusions() const;
+  //! Get device associated with this FusionCache
+  NVF_API std::optional<int64_t> deviceId() const;
+  //! print cache contents
+  NVF_API void print(std::ostream& os) const;
+  //! print cache stats
+  NVF_API void stats(std::ostream& os) const;
+  //! Reset Cache to an empty state
+  NVF_API static void reset();
+  //! Serialize Fusion Cache using flatbuffers
+  NVF_API void serialize(std::string filename) const;
+  //! Deserialize Fusion Cache using flatbuffers
+  NVF_API void deserialize(std::string filename);
+  //! The rest of the public methods are only used in C++
+  //! Thread-Unsafe: Queries the current trie node to see if a record matches
+  //! one of its children
+  NVF_API std::optional<TrieNode*> queryChildren(
+      TrieNode* node,
+      RecordFunctor* rec) const;
+  //! Query a Fusion's Schedules based on fusion id or cache id
+  FusionSchedules* queryFusionSchedules(size_t fusion_id) const;
+  //! Determine if a user schedule exists for given inputs.
+  bool existUserSchedule(
+      const FusionSchedules* scheds,
+      const at::ArrayRef<c10::IValue>& inputs,
+      int device);
+  //! Lookup the User Schedule Id and return null if one does not exist.
+  //! NOTE: this method cannot be const because the InputsIdLookup can
+  //! cause a modification to that data member for cache eviction.
+  std::optional<size_t> queryUserScheduleId(
+      const FusionSchedules* scheds,
+      const at::ArrayRef<c10::IValue>& inputs);
+  //! Lookup the User Schedule based on Id
+  const UserSchedule& queryUserSchedule(
+      const FusionSchedules* scheds,
+      size_t id,
+      int device) const;
+  //! Thread-Safe: Creates a child node for the current cache entry. Returns a
+  //! TrieNode pointer (presumably the new child — TODO confirm), not a fusion_id
+  NVF_API TrieNode* createChild(TrieNode* node, RecordFunctor* rec);
+  //! Create a User Schedule for the given inputs and device; NOTE(review): confirm overwrite_existing_schedule semantics
+  UserSchedule* createUserSchedule(
+      FusionSchedules* scheds,
+      const at::ArrayRef<c10::IValue>& inputs,
+      int device,
+      bool overwrite_existing_schedule = false);
+  //! Get the root Trie ptr
+  NVF_API TrieNode* rootTriePtr();
+ private:
+  //! The static pointer to the FusionCache
+  static FusionCache* singleton_;
+  //! Lock for accessing the singleton by multiple threads
+  static std::mutex singleton_lock_;
+  //! The max allowed number of fusions in the cache
+  size_t max_fusions_;
+  //! A separate process is created for each device in a distributed setting.
+  //! Each FusionCache becomes associated with a device.
+  std::optional<int64_t> device_id_;
+  //! The root (start) of the prefix tree to start a cache look up of a given
+  //! fusion definition.
+  std::unique_ptr<TrieNode> root_;
+  //! A vector of nvFuser Fusion IR fusions.
+  std::vector<std::unique_ptr<FusionSchedules>> fusions_;
+  //! A vector of Terminal trie nodes for Stats collection
+  std::vector<TrieNode*> terminal_nodes_;
+  //! Items specifically to aid user defined schedules. These data members
+  //! are for the mechanics of user schedule usage and don't make sense as
+  //! part of an abstraction.
+  // Inputs for user defined schedules are encoded into an integer Id
+  // NOTE: I would prefer this be per FusionSchedules object but the container
+  // is not allowed to be copied or moved.
+  InputsIdLookup user_def_input_encodings_;
+};
+//! Serialize Fusion Cache to common workspace
+//! /tmp/nvfuser_kernel_db/nvf_serde_[cuda_major]_[cuda_minor]_[nvrtc_major]_[nvrtc_minor]
+//!
+//! ```python
+//! # Use atexit to automatically call serialize on program exit
+//! import atexit
+//! atexit.register(nvfuser.serialize)
+//! ```
+NVF_API void serialize();
+} // namespace nvfuser::python_frontend

nvfuser/include/nvfuser/python_frontend/fusion_definition.h ADDED Viewed

@@ -0,0 +1,372 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <functional>
+#include <iostream>
+#include <unordered_map>
+#include <exceptions.h>
+#include <python_frontend/distributed_tensor.h>
+#include <python_frontend/fusion_state.h>
+#include <python_frontend/segmentation.h>
+#include <visibility.h>
+namespace nvfuser::python_frontend {
+class FusionCache;
+class FusionDefinition;
+class FusionInterface;
+class FusionState;
+struct RecordFunctor;
+class SegmentationState;
+struct TrieNode;
+struct UserSchedule;
+//! This is a helper function used to print a python formatted
+//! Fusion IR DataType when printing a fusion definition.
+NVF_API const char* dtypeToPyString(PrimDataType t);
+//! The Tensor and Scalar classes are used to define separate function
+//! signatures in the FusionDefinition to identify the appropriate Operator
+//! function.
+//!
+//! Example:
+//!
+//!   add(Tensor* arg1, Tensor* arg2) -> Tensor*
+//!   add(Tensor* arg1, Val* arg2) -> Tensor*
+//!   add(Val* arg1, Val* arg2) -> Val*
+struct Tensor {
+  Tensor(size_t _index, size_t _dims, FusionDefinition* _fd)
+      : index(_index), dims(_dims), fusion_definition(_fd) {}
+  size_t operator()() const {  // calling the object yields its index
+    return index;
+  }
+  bool operator==(const Tensor& other) const {  // equal only if index, dims and fusion_definition all match
+    if (index != other.index) {
+      return false;
+    }
+    if (dims != other.dims) {
+      return false;
+    }
+    if (fusion_definition != other.fusion_definition) {
+      return false;
+    }
+    return true;
+  }
+  bool operator!=(const Tensor& other) const {
+    return !(*this == other);
+  }
+  //! A unique index to identify each recorded state item.
+  size_t index;
+  size_t dims;  // presumably the number of tensor dimensions — TODO confirm
+  //! Pointer to the FusionDefinition used to create this tensor
+  //! The FusionDefinition pointer is necessary to enable special
+  //! dunder operations (ie __add__()) from the python API.
+  FusionDefinition* fusion_definition;
+};
+struct Scalar {
+  Scalar(size_t _index, FusionDefinition* _fd)
+      : index(_index), fusion_definition(_fd) {}
+  size_t operator()() const {  // calling the object yields its index
+    return index;
+  }
+  bool operator==(const Scalar& other) const {  // equal only if index and fusion_definition both match
+    if (index != other.index) {
+      return false;
+    }
+    if (fusion_definition != other.fusion_definition) {
+      return false;
+    }
+    return true;
+  }
+  bool operator!=(const Scalar& other) const {
+    return !(*this == other);
+  }
+  //! A unique index to identify each recorded state item.
+  size_t index;
+  //! Pointer to the FusionDefinition used to create this scalar
+  //! The FusionDefinition pointer is necessary to enable special
+  //! dunder operations (ie __add__()) from the python API.
+  FusionDefinition* fusion_definition;
+};
+struct Vector {
+  Vector(size_t _index, size_t _size, FusionDefinition* _fd)
+      : index(_index), size(_size), fusion_definition(_fd) {}
+  size_t operator()() const {  // calling the object yields its index
+    return index;
+  }
+  bool operator==(const Vector& other) const {  // equal only if index, size and fusion_definition all match
+    if (index != other.index) {
+      return false;
+    }
+    if (size != other.size) {
+      return false;
+    }
+    if (fusion_definition != other.fusion_definition) {
+      return false;
+    }
+    return true;
+  }
+  bool operator!=(const Vector& other) const {
+    return !(*this == other);
+  }
+  //! A unique index to identify each recorded state item.
+  size_t index;
+  //! Elements in the vector
+  size_t size;
+  //! Pointer to the FusionDefinition used to create this vector
+  FusionDefinition* fusion_definition;
+};
+//! FusionDefinition defines the C++ side of a Python Context manager to
+//! encapsulate the definition of fusion operations.
+//!
+//! The FusionDefinition records the state definitions and operations prior
+//! to exiting the context manager.  Upon exit, the operations are queried
+//! in a cache and the recorded records are used to build an nvFuser Fusion
+//! object if the definition missed in the cache.
+//!
+//! The nested Operators class was designed to allow the user to query all the
+//! available Operators in the FusionDefinition via python help.
+//!
+//! Example:
+//!   help(FusionDefinition.Operators)
+class NVF_API FusionDefinition : public FusionState {
+ public:
+  FusionDefinition(std::optional<size_t> id, size_t max_length = 256);
+  // The copy/move/assign constructors/operators are removed
+  FusionDefinition(const FusionDefinition& fd) = delete;
+  FusionDefinition(FusionDefinition&& fd) = delete;
+  FusionDefinition& operator=(const FusionDefinition& fd) = delete;
+  FusionDefinition& operator=(FusionDefinition&& fd) = delete;
+  //! Enter Python Context Manager -- Reset trie for new cache lookup
+  NVF_API FusionDefinition* setupDefinition();
+  //! Exit Python Context Manager -- Triggers Fusion IR build if it is not
+  //! cached
+  NVF_API void finalizeDefinition();
+  //! Check that a user schedule exists for FusionDefinition and input
+  //! arguments on device.
+  NVF_API bool existSchedule(const at::ArrayRef<c10::IValue>& inputs);
+  //! Setup user scheduling of a fusion
+  //! Copies fusion object and sets up FusionGuard
+  NVF_API void setupSchedule(
+      const at::ArrayRef<c10::IValue>& inputs,
+      bool overwrite_existing_schedule = false);
+  //! Finalized use scheduling of a fusion
+  //! resets FusionGuard, lowers IR to a kernel, compiles kernel
+  NVF_API void finalizeSchedule(const at::ArrayRef<c10::IValue>& inputs);
+  //! A hook that gets called right before
+  //! FusionDefinition.multidevice_schedule.
+  NVF_API void setupMultideviceSchedule();
+  //! A hook that gets called right after FusionDefinition.multidevice_schedule.
+  NVF_API void finalizeMultideviceSchedule();
+  //! Prints a python function representing the definition
+  NVF_API void print(std::ostream& os) const;
+  //! Executes a fusion if a valid definition or cache lookup occurred prior.
+  //!
+  //! This method returns a list of `DistributedTensor`s. Each
+  //! `DistributedTensor` is either the local view of a distributed tensor
+  //! (when the mesh is non-empty) or a non-distributed tensor
+  //! (when the mesh is empty).
+  //!
+  //! Alternatives considered:
+  //! 1. Return std::vector<std::variant<at::Tensor, DistributedTensor>>.
+  //! Because DistributedTensor can also represent a non-distributed tensor, I
+  //! chose the current API for simplicity -- C++ is more verbose than Python
+  //! when dealing with dynamic types.
+  //! 2. Return std::variant<std::vector<at::Tensor>,
+  //! std::vector<DistributedTensor>>. Same reason.
+  //! 3. Store output shardings (i.e. the mesh and the mesh-to-tensor-axis
+  //! mapping) to a field of FusionDefinition and retrieve it using another
+  //! method. This would be similar to getDebugOutput. I didn't choose that
+  //! because it introduced a new state in the class that could get out of sync.
+  NVF_API std::vector<DistributedTensor> execute(
+      const at::ArrayRef<c10::IValue>& inputs,
+      std::optional<int8_t> device,
+      bool override_user_schedule,
+      bool capture_debug_output,
+      bool profile,
+      std::vector<std::string> _enable_options,
+      std::vector<std::string> _disable_options) const;
+  //! Return debugging output captured through execution with
+  //! capture_debug_output=true
+  std::optional<std::string> getDebugOutput() const {
+    // Hand the caller its own copy of the cached capture; the member itself
+    // is left untouched.
+    const std::optional<std::string>& captured = debug_output_;
+    return captured;
+  }
+  // Returns the tolerance values based on reduction sizes.
+  NVF_API std::vector<std::pair<double, double>> getValTolerances(
+      const at::ArrayRef<c10::IValue>& inputs);
+  //! Return the unscheduled Fusion IR
+  NVF_API std::string fusionIr();
+  //! Return the user scheduled Fusion IR
+  NVF_API std::string userScheduleIr();
+  //! Return the Cuda code for the last executed set of inputs
+  NVF_API std::string lastCudaCode(
+      bool intrinsic_code,
+      bool override_user_schedule) const;
+  //! Return the Cuda code for the given inputs
+  NVF_API std::string cudaCodeFor(
+      const at::ArrayRef<c10::IValue>& inputs,
+      bool intrinsic_code,
+      bool override_user_schedule) const;
+  //! Return the scheduled Fusion IR for the last executed set of inputs
+  NVF_API std::string lastScheduledFusionIr(
+      bool tensor_transforms,
+      bool override_user_schedule) const;
+  //! Return the scheduled Fusion IR for the given inputs
+  NVF_API std::string scheduledFusionIrFor(
+      const at::ArrayRef<c10::IValue>& inputs,
+      bool tensor_transforms,
+      bool override_user_schedule) const;
+  //! Return fusion id of defined FusionDefinition
+  NVF_API std::optional<size_t> id() const;
+  //! Prints the Prescheduled Fusion IR representation
+  void printMathIr();
+  //! Returns true once this FusionDefinition has a fusion id, i.e. id()
+  //! holds a value. Const-qualified because it only queries the const id()
+  //! accessor, so it is also callable through const references/pointers.
+  bool completed() const {
+    return id().has_value();
+  }
+  //! Return a prescheduled Fusion object
+  Fusion* preschedFusion();
+  //! Return UserSchedule struct if it exists
+  UserSchedule* userSchedule();
+  //! These methods are used to record the FusionDefinition for cache lookup
+  //! Defines a Tensor State Record
+  NVF_API Tensor addTensor(TensorView* tv);
+  //! Defines a Scalar State Record
+  NVF_API Scalar defineScalar();
+  //! Defines a Tensor State Record
+  NVF_API Tensor defineTensor(size_t dims);
+  //! Defines a Vector State Record
+  NVF_API Vector defineVector(size_t size);
+  //! Defines a Record that records the operation required to
+  //! build the corresponding Fusion IR operation on cache miss.
+  NVF_API void defineRecord(RecordFunctor* record);
+  //! Gets a Record State object
+  NVF_API State recordingState(size_t index) const;
+  //! Get all Tensors in FusionState.
+  NVF_API std::vector<Tensor> tensors();
+  //! Run segmentation algorithm on FusionDefinition. Returns the number of
+  //! segments.
+  NVF_API int64_t setupSegmentation(const at::ArrayRef<c10::IValue>& inputs);
+  //! Given an empty FusionDefinition and a segment id, buildSegment creates the
+  //! CPP Fusion, translates it to the python FusionDefinition, then return a
+  //! mapping from segment fusion state indices to the original fusion state
+  //! indices.
+  NVF_API std::unordered_map<int64_t, int64_t> buildSegment(
+      FusionDefinition& segment_fd,
+      int64_t segment_id);
+  //! After creating segments, destroy SegmentationState.
+  NVF_API void finalizeSegmentation();
+ private:
+  //! Returns the FusionCache Ptr that holds the cache of Fusions
+  FusionCache* fusionCache() const;
+  //! Composite operations can create hidden TensorViews in the CPP fusion
+  //! These TensorViews are not visible from python definition. This function
+  //! finds and adds them to FusionDefinition
+  void findHiddenTensorViews(Fusion* fusion);
+  //! Update Symbolic FusionStates after DynamicTransform pass
+  void updateSymbolicStates(
+      const std::unordered_map<Val*, Val*>& symbolic_to_concretized_map);
+  // Check that the NvFuser TensorView and the Python Tensor dimensions match.
+  // Apply after buildFusionIr
+  void verifyTensorDimensions();
+  //! Holds the defined maximum length of a FusionDefinition in order to
+  //! prevent a runaway error. The user should feel free to increase this
+  //! number as appropriate.
+  size_t max_length_;
+  //! Fusion Cache Id for Scheduled Fusion.
+  std::optional<size_t> fusion_id_;
+  //! A pointer to the FusionCache.
+  FusionCache* fusion_cache_;
+  //! Current pointer to node in FusionCache.
+  TrieNode* trie_node_;
+  // Bookkeeping data members for user created schedules
+  //! Data member for holding previous fusion container when manually setting
+  //! the fusion guard.
+  Fusion* prev_fusion_;
+  //! Data member for holding the current user schedule object
+  UserSchedule* user_sched_;
+  //! Number of recording_states_ before applying user schedule
+  int64_t num_recording_states_presched_ = 0;
+  //! Data member that creates SegmentedFusion from cloned, prescheduled Fusion
+  //! then translates the segments to python FusionDefinitions.
+  std::unique_ptr<SegmentationState> segmentation_state_;
+ public:
+  //! The Operators are not directly defined in this header.  They are defined
+  //! in the python bindings through lambda functions so the user only needs to
+  //! define new operators in one place.
+  //! Operators define what operations are fused.
+  struct Operators {
+    //! FusionDefinition this operator set was constructed with.
+    FusionDefinition* fusion_definition;
+    Operators(FusionDefinition* fd) : fusion_definition{fd} {}
+    //! Operators are usable only while the bound FusionDefinition reports
+    //! that it has not completed.
+    bool validUse() const {
+      const bool definition_done = fusion_definition->completed();
+      return !definition_done;
+    }
+  };
+  //! The SchedOperators are not directly defined in this header.  They are
+  //! defined in the python bindings through lambda functions so the user only
+  //! needs to define new operators in one place.
+  //! SchedOperators allow the user to define how a fusion should be blocked
+  //! for execution.
+  struct SchedOperators {
+    //! FusionDefinition this scheduling operator set was constructed with.
+    FusionDefinition* fusion_definition;
+    SchedOperators(FusionDefinition* fd) : fusion_definition{fd} {}
+    //! Scheduling operators are usable only once the bound FusionDefinition
+    //! reports that it has completed.
+    bool validUse() const {
+      const bool definition_done = fusion_definition->completed();
+      return definition_done;
+    }
+  };
+  Operators ops;
+  SchedOperators sched;
+ private:
+  mutable std::optional<std::string> debug_output_ = std::nullopt;
+};
+} // namespace nvfuser::python_frontend