nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242)
  1. nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
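The wheel ships the `nvfuser` Python package: native bindings in `nvfuser/_C`, the codegen library `nvfuser/lib/libnvfuser_codegen.so`, and the C++ headers under `nvfuser/include/`, built against torch 2.5 and CUDA 12.1 (per the package name). A minimal smoke test after installing the wheel might look like the sketch below; it is illustrative only and assumes a working install with at least one visible CUDA device.

# Illustrative smoke test, not part of the wheel contents listed above.
# Assumes the wheel is installed into an environment with a matching
# torch 2.5 / CUDA 12.1 build and at least one CUDA device.
import torch
import nvfuser

print(torch.cuda.is_available())  # should print True on a working install
print(nvfuser.version())          # e.g. "0.2.25.dev20250201"; comparable as a Version, see version() below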
nvfuser/__init__.py ADDED
@@ -0,0 +1,618 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

import logging
import os
import re
import sys
from typing import Callable
import warnings

import torch

# This is needed when libnvfuser.so is patched and doesn't have the pytorch library location available.
pytorch_lib_dir = os.path.join(os.path.dirname(torch.__file__), "lib")
if pytorch_lib_dir not in sys.path:
    sys.path.append(pytorch_lib_dir)

# Import _C here explicitly; otherwise a failure in this Python script would
# surface later as a confusing complaint that `_C` is not defined for
# `_C._FusionDefinition`.
from . import _C
from ._C import *  # noqa: F401,F403

from . import contrib  # noqa: F401


logger = logging.getLogger("nvfuser")


# Register automatic serialization of Nvfuser cache hierarchy and cuda kernels.
def enable_automatic_serialization():
    import atexit

    atexit.register(_C.serialize)

    # A separate process is created for each device in a distributed setting.
    # Each FusionCache becomes associated with a single device.
    # Automatic serialization saves a separate cache for each device.
    # Set the FusionCache id to the ddp local rank.
    env_var_ddp_local_rank = os.environ.get("LOCAL_RANK", None)
    if env_var_ddp_local_rank is not None:
        env_var_ddp_local_rank = int(env_var_ddp_local_rank)
    _C.FusionCache.get(max_fusions := 8192, env_var_ddp_local_rank)


# Unregister automatic serialization of Nvfuser cache hierarchy and cuda kernels.
def disable_automatic_serialization():
    import atexit

    atexit.unregister(_C.serialize)

class FusionDefinition(_C._FusionDefinition):
    def __init__(self, id=None, max_length=1024):
        super(FusionDefinition, self).__init__(id, max_length)
        self.profiled = False

    def segment(self, inputs):
        """
        Decompose this FusionDefinition into a sequence of segment
        FusionDefinitions.

        This function runs the nvfuser segmentation algorithm and translates the
        segments into their corresponding FusionDefinitions.

        Args:
            inputs (List[Union[Tensor, Scalar]]): A list of inputs to fusion.

        Returns:
            List[FusionDefinition]: The FusionDefinitions corresponding to the
                sub-fusion segments of this FusionDefinition.
        """
        num_segments = self._setup_segmentation(inputs)
        if num_segments == 1:
            self._finalize_segmentation()
            return []

        # Track all segments for this FusionDefinition
        self.segments = []

        # Track map_segment_fid_to_original_fid for each segment
        self.segment_index_space_maps = {}

        # Track the last segment a value is used as an input
        self.map_value_to_last_used_segment = {}

        for idx in range(num_segments):
            new_fd = FusionDefinition()
            map_segment_fid_to_original_fid = self._build_segment(new_fd, idx)

            for segment_input in new_fd.inputs():
                original_input = map_segment_fid_to_original_fid[segment_input]
                self.map_value_to_last_used_segment[original_input] = idx

            self.segment_index_space_maps[new_fd] = map_segment_fid_to_original_fid
            self.segments.append(new_fd)
        self._finalize_segmentation()
        return self.segments

    def __enter__(self):
        return self._setup_definition()

    def __exit__(self, type, value, traceback):
        try:
            self._finalize_definition()
        except Exception as err:
            logger.exception(self._repro_error_str("defining"))
            raise

    def definition(self):
        raise NotImplementedError("definition() should be implemented by child class!")

    def _execute_segments(self, input_arguments, *, device=None, profile=False):
        """
        Run the sequence of FusionDefinition segments to generate the results
        of this FusionDefinition.

        This FusionDefinition acts as an argument manager. It gathers input
        arguments for the segments and stores their output results. After
        running a segment, any redundant intermediate values, which are
        unnecessary for any other segments, are deleted to save memory.

        Args:
            inputs (List[Union[Tensor, Scalar]]): A list of inputs to fusion.

        Kwargs:
            device (Optional[Union[int, str, torch.device]]): This is a hint to run
                the Fusion on the given CUDA device. This is not typically
                necessary, as the device is usually inferred from the locations
                of input tensors. However, for some fusion definitions, no
                tensors will be input (for example when all tensors are
                generated with `full` or `uniform` ops). In these cases, we
                must either tell NVFuser where to run the resulting kernel, or
                let it default to 0. Note that passing this option while
                providing input tensors that lie on another device is an error.
            profile (bool): Captures a CUPTI based profile of a fusion.

        Returns:
            List[Tensor]: The output results for this FusionDefinition.
        """
        assert len(self.segments) > 0
        assert len(self.segments) == len(self.segment_index_space_maps)

        input_arguments_with_extents = [*input_arguments]
        for a in input_arguments:
            if type(a) is torch.Tensor:
                input_arguments_with_extents.extend(a.size())

        # Map input arguments to original fid
        map_original_fid_to_value = {
            fd_state: argument
            for fd_state, argument in zip(
                self.inputs() + self.extents(), input_arguments_with_extents
            )
        }

        # Run all segments in correct order
        for idx, segment in enumerate(self.segments):
            segment_to_original_map = self.segment_index_space_maps[segment]

            # Gather segment input arguments
            segment_arguments = [
                map_original_fid_to_value[segment_to_original_map[fd_state]]
                for fd_state in segment.inputs()
            ]

            # Run segment
            segment_outputs = segment.execute(
                segment_arguments, device=device, profile=profile
            )

            # Update original fusion definition indices to outputs
            for fd_state, output in zip(segment.outputs(), segment_outputs):
                map_original_fid_to_value[segment_to_original_map[fd_state]] = output

            # Destroy any arguments that are not used by future segments
            for segment_input in segment.inputs():
                original_input = segment_to_original_map[segment_input]
                if (
                    original_input not in self.outputs()
                    and self.map_value_to_last_used_segment[original_input] == idx
                ):
                    del map_original_fid_to_value[original_input]

        # Map output fid to actual results
        return [map_original_fid_to_value[fd_state] for fd_state in self.outputs()]

    def execute(
        self,
        inputs,
        *,
        device=None,
        override_user_schedule=False,
        capture_debug_output=False,
        print_repro=False,
        profile=False,
        save_repro_inputs=False,
        _enable_options: list[str] = [],
        _disable_options: list[str] = [],
    ) -> list[torch.Tensor | DistributedTensor]:
        """
        Executes an nvFuser set of kernels for a given Fusion

        The FusionDefinition will be executed on a single CUDA device.
        Typically, which device to run on is determined by the devices where
        the input tensors reside. However, if the Fusion is defined such that
        none of the inputs are tensors, we are not able to infer a device from
        the inputs. For example, the following FusionDefinition will be unable
        to unambiguously infer the device of its output:

            with FusionDefinition() as fd:
                tv1 = fd.ops.full([5])
                fd.add_output(tv1)

        In that case, we default to selecting the first CUDA
        device, i.e. `torch.device("cuda:0")`. This method enables selecting an
        alternative preferred device.

        Args:
            inputs (List[Union[Tensor, Scalar]]): A list of inputs to fusion.

        Kwargs:
            device (Optional[Union[int, str, torch.device]]): This is a hint to run
                the Fusion on the given CUDA device. This is not typically
                necessary, as the device is usually inferred from the locations
                of input tensors. However, for some fusion definitions, no
                tensors will be input (for example when all tensors are
                generated with `full` or `uniform` ops). In these cases, we
                must either tell NVFuser where to run the resulting kernel, or
                let it default to 0. Note that passing this option while
                providing input tensors that lie on another device is an error.
            override_user_schedule (bool): For a user defined schedule,
                override with auto-generated schedule (default: False)
            capture_debug_output (bool): Whether to capture any printed
                debugging information as a string. If True, the string can be
                retrieved after execution using :meth:`debug_output`. If False,
                then that method will return None when called.
            print_repro (bool): Prints a reproduction script to stdout.
            profile (bool): Captures a CUPTI based profile of a fusion.
            save_repro_inputs (bool): Saves the inputs for last_repro_script() to
                provide a reproduction script.
            _enable_options/_disable_options (list): NVFUSER_ENABLE/DISABLE options to use.
                This is an alternative to environment variables.
                Note: Currently, we do not cache/store these options in the FusionCache,
                so kernels may be inadvertently reused when executing the same fusion
                definition with different sets of options. Reset the FusionCache manually
                to avoid kernel reuse between different sets of options.

        Returns:
            List[Tensor]
        """
        self.profiled = profile

        if device is not None:
            if not isinstance(device, torch.device):
                device = torch.device(device)
            assert (
                device.type == "cuda"
            ), "If device argument is passed it must be a CUDA device"
            device = device.index

        # if definition is not defined by a context manager, try a child class
        if self.id() is None:
            self._setup_definition()
            self.definition()
            self._finalize_definition()

        defined_multidevice_schedule = hasattr(
            self, "multidevice_schedule"
        ) and isinstance(self.multidevice_schedule, Callable)
        defined_schedule = hasattr(self, "schedule") and isinstance(
            self.schedule, Callable
        )
        assert not (
            defined_multidevice_schedule and defined_schedule
        ), "I haven't tested what happens if both are defined. We don't plan to support this use case, although it may just work."

        if defined_multidevice_schedule:
            # Unlike `schedule`, `multidevice_schedule` is designed for inter-device
            # scheduling. The scheduling is done before concretization and therefore
            # before pre-segmentation. `schedule` however assumes the FusionDefinition
            # has been concretized and pre-segmented, and therefore requires
            # `_setup_schedule` and `_finalize_schedule` to be called before and after.
            #
            # Note: there's a plan to embed multidevice schedules into FusionDefinition
            # as annotating nodes. This may eventually replace `multidevice_schedule`.
            self._setup_multidevice_schedule()
            self.multidevice_schedule()
            self._finalize_multidevice_schedule()

        # If a schedule is defined by a child class and no schedule exists yet for
        # these inputs, make one.
        if defined_schedule:
            # Schedule the fusion if a schedule does not exist yet, or when profiling
            if profile or not self._exist_schedule(inputs):
                self._setup_schedule(inputs, overwrite_existing_schedule=profile)
                self.schedule()
                self._finalize_schedule(inputs)

        if save_repro_inputs:
            from torch._subclasses.fake_tensor import FakeTensorMode

            fake_mode = FakeTensorMode()
            self.fake_inputs = [fake_mode.from_tensor(inp) for inp in inputs]

        if hasattr(self, "segments") and len(self.segments) > 0:
            return self._execute_segments(inputs, device=device, profile=profile)

        try:
            if print_repro:
                print(self.repro_script_for(inputs))
            if len(_enable_options) or len(_disable_options):
                warnings.warn(
                    "Reset the FusionCache manually to avoid reusing kernels when re-executing the fusion definition with different options."
                )

            out_tensors: list[DistributedTensor] = self._execute(
                inputs,
                device=device,
                override_user_schedule=override_user_schedule,
                capture_debug_output=capture_debug_output,
                profile=profile,
                _enable_options=_enable_options,
                _disable_options=_disable_options,
            )
            for i, out_tensor in enumerate(out_tensors):
                if out_tensor.mesh.size == 0:
                    out_tensors[i] = out_tensor.local
            return out_tensors
        except Exception as err:
            logger.exception(self._repro_error_str("executing", inputs))
            raise

    def debug_output(self):
        """
        Retrieve string of captured debug information from the previous execution.

        Note that `capture_debug_output=True` must be passed to `execute()` in
        order to enable capturing this output. Otherwise, this method will
        return `None`.

        Returns:
            Optional[String] : the captured debug output for the previous call
                to execute(). If the `capture_debug_output` argument to that call
                was False, returns None. Otherwise, returns the output as a string.
        """
        return self._debug_output()

    def from_pytorch(self, tensor, static_sizes=False):
        """
        Defines an nvfuser input tensor from a pytorch tensor and defaults
        to defining a symbolic tensor for dynamic shape usage.

        Args:
            tensor (torch.Tensor): Input tensor to nvFuser
            static_sizes (bool) : Interprets sizes as static rather than
                as symbolic for dynamic shape usage

        Returns:
            nvfuser.Tensor
        """
        try:
            from .pytorch_utils import torch_dtype_to_nvfuser_dtype
        except ImportError:
            raise ImportError("Unable to import pytorch_utils!")

        if not tensor.is_cuda and len(tensor.size()) != 0:
            raise ValueError("CPU non-scalar tensor is not supported!")

        return self.define_tensor(
            sizes=tensor.size(),
            strides=tensor.stride(),
            dtype=torch_dtype_to_nvfuser_dtype(tensor.dtype),
            static_sizes=static_sizes,
            is_cpu=tensor.is_cpu,
        )

    def fusion_ir(self):
        """
        Returns the unscheduled Fusion IR for the given definition that corresponds to all scheduled inputs.

        Returns:
            String
        """
        return self._fusion_ir()

    def last_cuda_code(self, intrinsic_code=False, **kwargs):
        """
        Returns the Cuda Code for the last executed set of inputs

        Args:
            intrinsic_code (Bool): Include all the additional code required to run kernel(s). (default: False)

        Kwargs:
            override_user_schedule (Bool): For a user defined schedule, override with auto-generated schedule (default: False)

        Returns:
            String
        """
        override_user_schedule = kwargs.pop("override_user_schedule", False)
        return self._last_cuda_code(intrinsic_code, override_user_schedule)

    def cuda_code_for(self, inputs, intrinsic_code=False, **kwargs):
        """
        Returns the Cuda Code for the given inputs

        Args:
            inputs (List[Union[Tensor, Scalar]]): A list of inputs to fusion.
            intrinsic_code (Bool): Include all the additional code required to run kernel(s). (default: False)

        Kwargs:
            override_user_schedule (Bool): For a user defined schedule, override with auto-generated schedule (default: False)

        Returns:
            String
        """
        override_user_schedule = kwargs.pop("override_user_schedule", False)
        return self._cuda_code_for(inputs, intrinsic_code, override_user_schedule)

    def last_scheduled_fusion_ir(self, tensor_transforms=False, **kwargs):
        """
        Returns the Scheduled Fusion IR for the last executed set of inputs

        Args:
            tensor_transforms (Bool): Include tensor transforms that were applied through scheduling. (default: False)

        Kwargs:
            override_user_schedule (Bool): For a user defined schedule, override with auto-generated schedule (default: False)

        Returns:
            String
        """
        override_user_schedule = kwargs.pop("override_user_schedule", False)
        return self._last_scheduled_fusion_ir(tensor_transforms, override_user_schedule)

    def scheduled_fusion_ir_for(self, inputs, tensor_transforms=False, **kwargs):
        """
        Returns the Scheduled Fusion IR for the given inputs

        Args:
            inputs (List[Union[Tensor, Scalar]]): A list of inputs to fusion.
            tensor_transforms (Bool): Include tensor transforms that were applied through scheduling. (default: False)

        Kwargs:
            override_user_schedule (Bool): For a user defined schedule, override with auto-generated schedule (default: False)

        Returns:
            String
        """
        override_user_schedule = kwargs.pop("override_user_schedule", False)
        return self._scheduled_fusion_ir_for(
            inputs, tensor_transforms, override_user_schedule
        )

    def profile(self):
        """
        Returns the FusionProfile object from the CUPTI based FusionProfiler

        Returns:
            FusionProfile
        """
        if not self.profiled:
            raise ValueError(
                "The execute() method was not previously called with profiling enabled!"
            )

        fp = self._profile()

        if fp.fusion_id < 0:
            raise ValueError(
                "Something went wrong with Fusion Profiling as an illegal fusion_id was returned! "
                + str(fp.fusion_id)
            )
        if fp.segments < 1:
            raise ValueError(
                "Something went wrong with Fusion Profiling as no kernel segments were profiled! "
                + str(fp.segments)
            )

        return fp

    def last_repro_script(self) -> str:
        assert (
            self.fake_inputs is not None
        ), "fd.last_repro_script() cannot provide a repro because fd.execute(inputs, save_repro_inputs=True) was not executed!"
        script = self.repro_script_for(self.fake_inputs)
        return script

    def repro_script_for(self, inputs: list | None = None) -> str:
        msg = "# CUDA devices:\n"
        for i in range(torch.cuda.device_count()):
            msg += f"# {i}: {torch.cuda.get_device_name(i)}\n"
        msg += (
            f"# torch version: {torch.__version__}\n"
            f"# cuda version: {torch.version.cuda}\n"
            f"# nvfuser version: {version()}\n"
            "import torch\n"
            "from nvfuser import FusionDefinition, DataType\n"
            f"{self}"
            "with FusionDefinition() as fd:\n"
            f" nvfuser_fusion_id{self.id()}(fd)\n"
        )
        if inputs is not None:
            msg += "\ninputs = [\n"
            for i in inputs:
                if isinstance(i, torch.Tensor):
                    if i.is_contiguous():
                        msg += f" torch.testing.make_tensor({tuple(i.size())}, dtype={i.dtype}, device='{i.device}'),\n"
                    else:
                        # max linear index determines number of elements to generate
                        sz = 1
                        for szi, stri in zip(i.size(), i.stride()):
                            if szi == 0:
                                sz = 0
                                break
                            sz += (szi - 1) * stri
                        if i.dtype.is_floating_point:
                            msg += (
                                f" torch.randn({sz}, dtype={i.dtype}, device='{i.device}')"
                                f".as_strided({tuple(i.size())}, {tuple(i.stride())}),\n"
                            )
                        else:
                            upper_bound = 2 if i.dtype == torch.bool else 10
                            msg += (
                                f" torch.randint(0, {upper_bound}, ({sz},), dtype={i.dtype}, device='{i.device}')"
                                f".as_strided({tuple(i.size())}, {tuple(i.stride())}),\n"
                            )
                else:
                    input_as_string = str(i)
                    # `nan` and `inf` are stringified as is, which are not
                    # defined in Python. So we replace them with `float("nan")`
                    # and `float("inf")`. `-inf` is replaced with
                    # `-float("inf")`, which equals `float("-inf")`.
                    input_as_string = re.sub(
                        r"\binf\b", 'float("inf")', input_as_string
                    )
                    input_as_string = re.sub(
                        r"\bnan\b", 'float("nan")', input_as_string
                    )
                    msg += f" {input_as_string},\n"
            msg += "]"
            msg += "\nfd.execute(inputs)\n"

        return msg

    def _repro_error_str(self, section: str, inputs: list | None = None):
        msg = (
            f"An error occurred while {section} nvFuser FusionDefinition {self.id()}.\n"
            "If you believe this is a bug or need assistance, please file an issue at "
            "https://github.com/NVIDIA/Fuser/issues/new\n"
            f"Here's a script to reproduce the error:\n"
            "```python\n"
        )
        msg += self.repro_script_for(inputs)
        msg += "```\n"
        return msg

    def validate(
        self,
        inputs: list[torch.Tensor],
        reference_outputs: list[torch.Tensor],
        kwargs=None,
    ):
        """
        Validates the fusion outputs against the provided reference outputs, using
        variable tolerances determined based on datatype and reduction size.

        Args:
            inputs: A list of inputs expected by the fusion definition
            reference_outputs: A list of reference outputs to validate against
        """
        fusion_outputs = self.execute(inputs)
        assert len(fusion_outputs) == len(
            reference_outputs
        ), f"Expected {len(fusion_outputs)} reference outputs for validation."

        tolerance_values = self.getValTolerances(inputs)
        assert len(tolerance_values) == len(
            fusion_outputs
        ), f"Missing tolerance values, expected {len(fusion_outputs)}, got {len(tolerance_values)}"

        for inx, fusion_output in enumerate(fusion_outputs):
            atol, rtol = tolerance_values[inx]
            reference_output = reference_outputs[inx]

            assert (
                reference_output.shape == fusion_output.shape
            ), "Mismatch in reference and fusion output dimensions"
            if torch.is_floating_point(fusion_output) or torch.is_complex(
                fusion_output
            ):
                assert torch.allclose(
                    fusion_output, reference_output, atol=atol, rtol=rtol
                ), (
                    f"Max error: {torch.abs(torch.max(fusion_output - reference_output))}, "
                    f"Absolute tolerance: {atol}, Relative tolerance: {rtol}"
                )
            else:
                assert torch.equal(
                    fusion_output, reference_output
                ), "Mismatch in reference and fusion output values, datatype is not float/complex."


from .nvfuser_version import __version__


def version():
    r"""Returns the nvfuser version as a string in the format 'm.n.p+git[7d-sha]'.

    We strip the git[7d-sha] and convert the string to
    `nvfuser_version.Version` for comparison. e.g. you can use it as:
        import nvfuser
        print(nvfuser.version())              # 0.0.1+git21df524
        nvfuser.version() == '0.0.1'          # True
        nvfuser.version() > '0.0.0'           # True

        from nvfuser_version import Version
        nvfuser.version() < Version('1.0.0')  # True
    """
    return __version__
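Taken together, the module above defines the user-facing workflow: build a fusion inside a `with FusionDefinition()` block (or in a `definition()` method on a subclass), call `execute()` on concrete inputs, and optionally `validate()` against a PyTorch reference. The sketch below illustrates that flow; `fd.ops.add` and `fd.add_output` are assumed from the `nvfuser._C` bindings (they are not defined in this `__init__.py`), and the shapes are arbitrary.

# Illustrative sketch of the FusionDefinition workflow defined above.
# Assumes fd.ops.add and fd.add_output from the nvfuser._C bindings and a CUDA device.
import torch
from nvfuser import FusionDefinition

inputs = [
    torch.randn(4, 8, device="cuda"),
    torch.randn(4, 8, device="cuda"),
]

with FusionDefinition() as fd:
    t0 = fd.from_pytorch(inputs[0])  # symbolic input tensors (dynamic shapes)
    t1 = fd.from_pytorch(inputs[1])
    t2 = fd.ops.add(t0, t1)
    fd.add_output(t2)

# execute() infers the target device from the input tensors.
(out,) = fd.execute(inputs)

# validate() re-runs the fusion and compares against a reference using
# dtype-aware tolerances (see validate() above).
fd.validate(inputs, [inputs[0] + inputs[1]])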
nvfuser/__init__.pyi ADDED
@@ -0,0 +1,4 @@
from typing import List


def compute_contiguity(sizes, strides) -> List[bool]: ...
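The stub declares `compute_contiguity`, which maps a tensor's sizes and strides to per-dimension contiguity flags. A hedged example of the call shape, assuming the binding accepts plain Python sequences; the all-True result is what one would expect for a fully contiguous layout, and the exact collapsing rules for broadcast or size-1 dimensions are not shown here.

# Hedged example for compute_contiguity; signature taken from the stub above.
import torch
from nvfuser import compute_contiguity

t = torch.empty(4, 8, 16)
print(compute_contiguity(list(t.size()), list(t.stride())))  # expected: [True, True, True]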
nvfuser/contrib/__init__.py ADDED
@@ -0,0 +1,9 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
from . import nn


__all__ = [
    "nn",
]
nvfuser/contrib/nn/__init__.py ADDED
@@ -0,0 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
from .normalization import InstanceNorm1dNVFuser
from .normalization import InstanceNorm2dNVFuser
from .normalization import InstanceNorm3dNVFuser


__all__ = [
    "InstanceNorm1dNVFuser",
    "InstanceNorm2dNVFuser",
    "InstanceNorm3dNVFuser",
]
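nvfuser.contrib.nn re-exports the fused instance-normalization modules implemented in normalization.py. A hedged usage sketch follows, assuming InstanceNorm2dNVFuser mirrors the constructor arguments of torch.nn.InstanceNorm2d.

# Hedged sketch: using the fused instance norm as a drop-in module.
# Assumes the constructor mirrors torch.nn.InstanceNorm2d
# (num_features, eps, momentum, affine, track_running_stats) and a CUDA device.
import torch
from nvfuser.contrib.nn import InstanceNorm2dNVFuser

norm = InstanceNorm2dNVFuser(num_features=16, affine=True).cuda()
x = torch.randn(8, 16, 32, 32, device="cuda")
y = norm(x)
print(y.shape)  # torch.Size([8, 16, 32, 32])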