nvfuser-cu121-torch25 0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl

Files changed (242)
  1. nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/ops/composite.h
@@ -0,0 +1,130 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <exceptions.h>
+ #include <visibility.h>
+
+ #include <ir/interface_nodes.h>
+ #include <type.h>
+
+ //
+ // The operations defined in this header are intended as user-facing functions.
+ // The user will provide the necessary input TensorViews and the function will
+ // create the correct intermediate nodes and return the output TensorViews.
+ //
+
+ namespace nvfuser {
+
+ struct ForwardDropoutResult {
+   TensorView* output = nullptr;
+   TensorView* mask = nullptr;
+ };
+
+ NVF_API ForwardDropoutResult dropout(TensorView* x, Val* prob);
+
+ NVF_API ForwardDropoutResult dropout(TensorView* x, Val* prob, Val* scale);
+
+ NVF_API TensorView* dropout_backward(
+     TensorView* dy,
+     TensorView* mask,
+     Val* scale);
+
+ NVF_API TensorView* triu(TensorView* tv, Val* offset);
+
+ struct LstmResult {
+   TensorView* cell = nullptr;
+   TensorView* hidden = nullptr;
+ };
+
+ NVF_API LstmResult lstm(
+     TensorView* prev_cell,
+     TensorView* in_x,
+     TensorView* forget_x,
+     TensorView* cell_x,
+     TensorView* out_x);
+
+ // Linear function which takes in two tensors of shapes input[*, in_features],
+ // weight[out_features, in_features] / [in_features] and an optional bias of
+ // shape [out_features] or a 0-D scalar. Bias can only be given if weight is a
+ // 2-D tensor.
+ TensorView* linear(TensorView* input, TensorView* weight, TensorView* bias);
+ // This is an implementation detail to reflect when linear is called
+ // without a bias. This calls the above function. We use this function
+ // since it simplifies creating a Python API which takes optional arguments.
+ // Other options include using lambdas or creating a new RecordFunctor for
+ // Linear.
+ TensorView* linear(TensorView* input, TensorView* weight);
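// Editor's illustration (not part of composite.h): a minimal sketch of how the
// two linear() overloads might be used when defining a fusion. It assumes the
// Fusion, FusionGuard, and TensorViewBuilder APIs from this package's other
// headers (fusion.h, ir/interface_nodes.h); the function name and shapes below
// are illustrative only.
#include <fusion.h>
#include <ops/all_ops.h>

void define_linear_example(nvfuser::Fusion* fusion) {
  using namespace nvfuser;
  FusionGuard fg(fusion);
  // input[*, in_features], weight[out_features, in_features], bias[out_features]
  TensorView* x = TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
  TensorView* w = TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
  TensorView* b = TensorViewBuilder().ndims(1).dtype(DataType::Float).build();
  fusion->addInput(x);
  fusion->addInput(w);
  fusion->addInput(b);
  TensorView* y_bias = linear(x, w, b); // [*, out_features]
  TensorView* y_nobias = linear(x, w);  // bias-less overload; calls the 3-arg form
  fusion->addOutput(y_bias);
  fusion->addOutput(y_nobias);
}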
63
+
+ NVF_API TensorView* sign(TensorView* x);
+ NVF_API Val* sign(Val* x);
+ TensorView* softplus(TensorView* x, Val* beta, Val* threshold);
+ NVF_API TensorView* gelu(TensorView* x);
+ NVF_API TensorView* gelu_backward(TensorView* dy, TensorView* x);
+ TensorView* tanh_gelu(TensorView* x);
+ TensorView* tanh_gelu_backward(TensorView* dy, TensorView* x);
+ TensorView* tanh_backward(TensorView* dy, TensorView* tanh_x);
+ TensorView* leaky_relu(TensorView* x, Val* negative_slope);
+
+ NVF_API TensorView* view_as_real(TensorView* x);
+
+ // Matmul function which takes in tensors with the shapes
+ // A[*, M, K] / A[K] and B[*, K, N] / B[K], but the tensors may have different
+ // layouts via strides. This has the same functionality as torch.matmul
+ TensorView* matmul(TensorView* tv_a, TensorView* tv_b);
+
+ // Scaled Dot Product Flash Attention Forward Result
+ struct SdpfaFwdResult {
+   TensorView* output = nullptr;
+   TensorView* log_sumexp = nullptr;
+   TensorView* philox_seed = nullptr;
+   TensorView* philox_offset = nullptr;
+ };
+
+ // Scaled Dot Product Flash Attention Forward API.
+ // Returns the same output as at::_scaled_dot_product_flash_attention
+ SdpfaFwdResult sdpfa_fwd(
+     TensorView* query,
+     TensorView* key,
+     TensorView* value,
+     Val* dropout_p,
+     Val* is_causal,
+     Val* scale);
+
+ // Scaled Dot Product Flash Attention Backward Result
+ struct SdpfaBwdResult {
+   TensorView* grad_query = nullptr;
+   TensorView* grad_key = nullptr;
+   TensorView* grad_value = nullptr;
+ };
+
+ // Scaled Dot Product Flash Attention Backward API.
+ // Returns the same output as at::_scaled_dot_product_flash_attention_backward
+ SdpfaBwdResult sdpfa_bwd(
+     TensorView* grad_output,
+     TensorView* query,
+     TensorView* key,
+     TensorView* value,
+     TensorView* output,
+     TensorView* log_sumexp,
+     Val* dropout_p,
+     Val* is_causal,
+     TensorView* philox_seed,
+     TensorView* philox_offset,
+     Val* scale);
+
+ TensorView* embedding_fwd(
+     TensorView* input,
+     TensorView* weight,
+     Val* padding_idx,
+     Val* max_norm,
+     Val* norm_type,
+     Val* scale_grad_by_freq,
+     Val* sparse);
+
+ } // namespace nvfuser
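Taken together, these declarations form the graph-building surface: the caller supplies input TensorViews and receives output TensorViews that can be registered as fusion outputs. The snippet below is a minimal sketch of that pattern using matmul and gelu; it assumes the Fusion, FusionGuard, and TensorViewBuilder APIs from this package's other headers (the function name, dtypes, and shapes are illustrative), and it omits scheduling and execution.

#include <fusion.h>
#include <ops/all_ops.h>

// Sketch: A[M, K] x B[K, N] followed by a GELU epilogue in one fusion definition.
void define_matmul_gelu_example(nvfuser::Fusion* fusion) {
  using namespace nvfuser;
  FusionGuard fg(fusion);
  TensorView* a = TensorViewBuilder().ndims(2).dtype(DataType::Half).build();
  TensorView* b = TensorViewBuilder().ndims(2).dtype(DataType::Half).build();
  fusion->addInput(a);
  fusion->addInput(b);
  TensorView* mm = matmul(a, b); // same broadcasting behavior as torch.matmul
  TensorView* out = gelu(mm);    // elementwise op consumes the matmul result
  fusion->addOutput(out);
}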
nvfuser/include/nvfuser/ops/indexing.h
@@ -0,0 +1,55 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <exceptions.h>
+ #include <visibility.h>
+
+ #include <ir/interface_nodes.h>
+ #include <type.h>
+
+ namespace nvfuser {
+
+ NVF_API TensorView* select(TensorView* tv, int64_t dim, Val* index);
+
+ // torch.index_select
+ NVF_API TensorView* indexSelect(
+     TensorView* input,
+     int64_t dim,
+     TensorView* index);
+
+ // torch.gather
+ NVF_API TensorView* torchGather(
+     TensorView* input,
+     int64_t dim,
+     TensorView* index);
+
+ // torch.scatter
+ TensorView* scatterOp(
+     ScatterOpType type,
+     TensorView* self,
+     int64_t dim,
+     TensorView* index,
+     TensorView* src);
+
+ NVF_API TensorView* scatter(
+     TensorView* self,
+     int64_t dim,
+     TensorView* index,
+     TensorView* src);
+
+ //! numpy.take_along_axis
+ //! (https://numpy.org/doc/stable/reference/generated/numpy.take_along_axis.html)
+ //! Note the order of the parameters follows the numpy order, which is
+ //! different from torchGather.
+ NVF_API TensorView* takeAlongAxis(
+     TensorView* input,
+     TensorView* index,
+     int64_t dim);
+
+ } // namespace nvfuser
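The comment on takeAlongAxis is worth emphasizing: torchGather follows the torch.gather argument order (input, dim, index), while takeAlongAxis follows the numpy order (input, index, dim). A minimal sketch of the difference, assuming the same Fusion/TensorViewBuilder setup as above (names and shapes illustrative):

#include <fusion.h>
#include <ops/all_ops.h>

void define_gather_example(nvfuser::Fusion* fusion) {
  using namespace nvfuser;
  FusionGuard fg(fusion);
  TensorView* inp = TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
  TensorView* idx = TensorViewBuilder().ndims(2).dtype(DataType::Int).build();
  fusion->addInput(inp);
  fusion->addInput(idx);
  TensorView* g = torchGather(inp, /*dim=*/1, idx);   // torch.gather order
  TensorView* t = takeAlongAxis(inp, idx, /*dim=*/1); // numpy order: index before dim
  fusion->addOutput(g);
  fusion->addOutput(t);
}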
nvfuser/include/nvfuser/ops/normalization.h
@@ -0,0 +1,263 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <exceptions.h>
+ #include <visibility.h>
+
+ #include <ir/interface_nodes.h>
+ #include <type.h>
+
+ #include <tuple>
+ #include <vector>
+
+ //
+ // The operations defined in this header are intended as user-facing functions.
+ // The user will provide the necessary input TensorViews and the function will
+ // create the correct intermediate nodes and return the output TensorViews.
+ //
+
+ namespace nvfuser {
+
+ struct ForwardNormResult {
+   TensorView* output = nullptr;
+   TensorView* mean = nullptr;
+   TensorView* invstd = nullptr;
+ };
+
+ struct BackwardNormResult {
+   TensorView* grad_input = nullptr;
+   TensorView* grad_weight = nullptr;
+   TensorView* grad_bias = nullptr;
+ };
+
+ struct ForwardRMSNormResult {
+   TensorView* output = nullptr;
+   TensorView* invstd = nullptr;
+ };
+
+ struct BackwardRMSNormResult {
+   TensorView* grad_input = nullptr;
+   TensorView* grad_weight = nullptr;
+ };
+
+ struct VarMeanResult {
+   TensorView* var = nullptr;
+   TensorView* mean = nullptr;
+ };
+
+ } // namespace nvfuser
+
+ namespace std {
+
+ // Make these results behave like a std::tuple
+ using nvfuser::BackwardNormResult;
+ using nvfuser::BackwardRMSNormResult;
+ using nvfuser::ForwardNormResult;
+ using nvfuser::ForwardRMSNormResult;
+ using nvfuser::TensorView;
+ using nvfuser::VarMeanResult;
+
+ template <int i>
+ constexpr TensorView* get(const ForwardNormResult& results) {
+   if (i == 0) {
+     return results.output;
+   }
+   if (i == 1) {
+     return results.mean;
+   }
+   if (i == 2) {
+     return results.invstd;
+   }
+   return nullptr;
+ }
+
+ template <int i>
+ constexpr TensorView* get(const BackwardNormResult& results) {
+   if (i == 0) {
+     return results.grad_input;
+   }
+   if (i == 1) {
+     return results.grad_weight;
+   }
+   if (i == 2) {
+     return results.grad_bias;
+   }
+   return nullptr;
+ }
+
+ template <int i>
+ constexpr TensorView* get(const ForwardRMSNormResult& results) {
+   if (i == 0) {
+     return results.output;
+   }
+   if (i == 1) {
+     return results.invstd;
+   }
+   return nullptr;
+ }
+
+ template <int i>
+ constexpr TensorView* get(const BackwardRMSNormResult& results) {
+   if (i == 0) {
+     return results.grad_input;
+   }
+   if (i == 1) {
+     return results.grad_weight;
+   }
+   return nullptr;
+ }
+
+ template <int i>
+ constexpr TensorView* get(const VarMeanResult& results) {
+   if (i == 0) {
+     return results.var;
+   }
+   if (i == 1) {
+     return results.mean;
+   }
+   return nullptr;
+ }
+
+ } // namespace std
+
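// Editor's illustration (not part of normalization.h): the get() overloads above
// give the result structs positional, tuple-style access in addition to the
// named members; both forms refer to the same TensorViews.
inline nvfuser::TensorView* forward_norm_output(
    const nvfuser::ForwardNormResult& r) {
  // std::get<0>(r) resolves to the overload defined above and is equivalent to
  // r.output; indices 1 and 2 select r.mean and r.invstd respectively.
  return std::get<0>(r);
}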
129
+ namespace nvfuser {
+
+ TensorView* mean(TensorView* x, const std::vector<int64_t>& dims, bool keepdim);
+
+ NVF_API TensorView* variance(
+     TensorView* x,
+     const std::vector<int64_t>& dims,
+     bool unbiased,
+     bool keepdim);
+
+ NVF_API TensorView* variance(
+     TensorView* x,
+     const std::vector<int64_t>& dims,
+     int64_t correction,
+     bool keepdim);
+
+ NVF_API VarMeanResult variance_mean(
+     TensorView* x,
+     const std::vector<int64_t>& dims,
+     int64_t correction,
+     bool keepdim);
+
+ NVF_API TensorView* standard_deviation(
+     TensorView* x,
+     const std::vector<int64_t>& dims,
+     bool unbiased,
+     bool keepdim);
+
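// Editor's illustration (not part of normalization.h): the two variance()
// overloads differ only in how the divisor is specified. `unbiased` is the
// boolean torch-style flag, while `correction` is the generalized Bessel
// correction (divisor N - correction). Assuming these follow torch.var
// semantics, unbiased=true corresponds to correction=1 and unbiased=false to
// correction=0, so the two calls below should define the same computation.
inline nvfuser::TensorView* unbiased_variance_example(
    nvfuser::TensorView* x,
    const std::vector<int64_t>& dims) {
  auto* via_flag =
      nvfuser::variance(x, dims, /*unbiased=*/true, /*keepdim=*/false);
  auto* via_correction =
      nvfuser::variance(x, dims, /*correction=*/int64_t{1}, /*keepdim=*/false);
  (void)via_correction;
  return via_flag;
}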
157
+ NVF_API TensorView* softmax(TensorView* x, int64_t dim);
+
+ NVF_API TensorView* softmax_backward(
+     TensorView* dy,
+     TensorView* y,
+     const int64_t dim);
+
+ NVF_API TensorView* log_softmax(TensorView* x, int64_t dim);
+
+ NVF_API TensorView* log_softmax_backward(
+     TensorView* dy,
+     TensorView* y,
+     const int64_t dim);
+
+ NVF_API ForwardNormResult layer_norm(
+     TensorView* x,
+     const std::vector<int64_t>& norm_shape,
+     TensorView* weight,
+     TensorView* bias,
+     Val* eps);
+
+ NVF_API ForwardNormResult layer_norm(
+     TensorView* x,
+     const int64_t kNormShapeNumDims,
+     TensorView* weight,
+     TensorView* bias,
+     Val* eps);
+
+ NVF_API ForwardRMSNormResult rms_norm(
+     TensorView* x,
+     const std::vector<int64_t>& norm_shape,
+     TensorView* weight,
+     Val* eps);
+
+ NVF_API ForwardRMSNormResult rms_norm(
+     TensorView* x,
+     const int64_t kNormShapeNumDims,
+     TensorView* weight,
+     Val* eps);
+
+ NVF_API BackwardNormResult layer_norm_backward(
+     TensorView* dy,
+     TensorView* x,
+     const std::vector<int64_t>& norm_shape,
+     TensorView* mean,
+     TensorView* rstd,
+     TensorView* weight,
+     TensorView* bias,
+     const std::vector<bool>& output_mask);
+
+ NVF_API BackwardRMSNormResult rms_norm_backward(
+     TensorView* dy,
+     TensorView* x,
+     const std::vector<int64_t>& norm_shape,
+     TensorView* rstd,
+     TensorView* weight,
+     const std::vector<bool>& output_mask);
+
+ NVF_API ForwardNormResult batch_norm(
+     TensorView* x,
+     TensorView* weight,
+     TensorView* bias,
+     TensorView* running_mean,
+     TensorView* running_var,
+     const bool kTraining,
+     Val* momentum,
+     Val* eps,
+     bool channels_last = false);
+
+ NVF_API BackwardNormResult batch_norm_backward(
+     TensorView* x,
+     TensorView* dy,
+     TensorView* weight,
+     TensorView* running_mean,
+     TensorView* running_var,
+     TensorView* save_mean,
+     TensorView* save_invstd,
+     const bool kTraining,
+     Val* eps,
+     const std::vector<bool>& output_mask,
+     bool channels_last = false);
+
+ NVF_API ForwardNormResult instance_norm(
+     TensorView* x,
+     TensorView* weight,
+     TensorView* bias,
+     TensorView* running_mean,
+     TensorView* running_var,
+     const bool kUseInputStats, // kTraining?
+     Val* momentum,
+     Val* eps,
+     bool channels_last = false);
+
+ NVF_API BackwardNormResult instance_norm_backward(
+     TensorView* x,
+     TensorView* dy,
+     TensorView* weight,
+     TensorView* running_mean,
+     TensorView* running_var,
+     TensorView* save_mean,
+     TensorView* save_invstd,
+     const bool kTraining,
+     Val* eps,
+     const std::vector<bool>& output_mask,
+     bool channels_last = false);
+
+ } // namespace nvfuser
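As with the composite ops, these functions build the normalization subgraph into the active fusion and return the saved statistics alongside the output. Below is a minimal layer_norm sketch, assuming Fusion, FusionGuard, TensorViewBuilder, and IrBuilder::create<Val> from this package's other headers (fusion.h, ir/interface_nodes.h, ir/builder.h); names and shapes are illustrative.

#include <fusion.h>
#include <ir/builder.h>
#include <ops/all_ops.h>

void define_layer_norm_example(nvfuser::Fusion* fusion) {
  using namespace nvfuser;
  FusionGuard fg(fusion);
  // x: [batch, hidden]; weight, bias: [hidden]; normalize over the last dim.
  TensorView* x = TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
  TensorView* w = TensorViewBuilder().ndims(1).dtype(DataType::Float).build();
  TensorView* b = TensorViewBuilder().ndims(1).dtype(DataType::Float).build();
  fusion->addInput(x);
  fusion->addInput(w);
  fusion->addInput(b);
  Val* eps = IrBuilder::create<Val>(1e-5);
  ForwardNormResult r = layer_norm(x, /*kNormShapeNumDims=*/1, w, b, eps);
  fusion->addOutput(r.output);
  fusion->addOutput(r.mean);   // saved statistics, e.g., for the backward pass
  fusion->addOutput(r.invstd);
}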
nvfuser/include/nvfuser/ops/utils.h
@@ -0,0 +1,127 @@
+ // clang-format off
+ /*
+  * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+  * All rights reserved.
+  * SPDX-License-Identifier: BSD-3-Clause
+  */
+ // clang-format on
+ #pragma once
+
+ #include <exceptions.h>
+ #include <ir/base_nodes.h>
+ #include <ir/interface_nodes.h>
+ #include <scheduler/matmul_utils.h>
+ #include <type.h>
+ #include <visibility.h>
+
+ #include <vector>
+
+ namespace nvfuser {
+
+ enum class AttnRole { Q = 0, K, V, Mask };
+
+ namespace ops {
+
+ TensorView* maybe_broadcast_inner_to_rank(TensorView* t, size_t rank);
+
+ // A utility function that broadcasts the index TensorView to the rank of the
+ // other TensorView.
+ TensorView* maybeBroadcastIndexTv(TensorView* t, size_t dim, size_t rank);
+
+ // A utility function that checks if the index tv is already broadcast to the
+ // correct shape for index_select.
+ bool isIndexAlreadyBroadcast(
+     const std::vector<IterDomain*>& index_domain,
+     size_t dim,
+     size_t rank);
+
+ Val* simplifiedInt(Val* val);
+
+ // If one size is nullptr, return the other. If both are symbolic, just return
+ // v1. If one is concrete, prefer that one (simplified). If both are concrete,
+ // make sure they're the same size.
+ Val* promoteSize(Val* v1, Val* v2);
+
+ // Will return a new value of type val with the DataType dtype.
+ Val* newScalar(ValType vtype, DataType dtype);
+
+ IterType promoteIterType(IterType type1, IterType type2);
+
+ // For MatmulOp, the input iterdomains at a given index do not necessarily map
+ // to the output iterdomain at that index. This function aligns the input
+ // iterdomains to the output and returns a vector where each element is the
+ // input iterdomain corresponding to the output iterdomain at that index. If an
+ // element is nullptr, there is no input-output mapping at that index.
+ // Based on the input dimensions, the following cases are possible:
+ // 1. A/B is 1D: [M, K] x [K] -> [M] (Mapping A: {id_M}, Mapping B: {nullptr})
+ //    or [K] x [N, K] -> [N] (Mapping A: {nullptr}, Mapping B: {id_N})
+ // 2. A and B are 2D: [M, K] x [K, N] -> [M, N] (Mapping A: {id_M, nullptr},
+ //    Mapping B: {nullptr, id_N})
+ // 3. A/B are at least 1D and one of them is > 2D: [B, M, K] x [K, N] ->
+ //    [B, M, N] (Mapping A: {id_B, id_M, nullptr}, Mapping B: {nullptr,
+ //    nullptr, id_N})
+ // Args:
+ // 1. input_domain: root/logical domain without reductions for any input to
+ //    MatmulOp
+ // 2. input_position: specifies if the input is A / B (0 or 1)
+ // 3. out_size: MatmulOp output dimension (input and output may not be the
+ //    same size).
+ std::vector<IterDomain*> mapMatmulOpIterDomains(
+     const std::vector<IterDomain*>& input_domain,
+     int64_t input_position,
+     size_t out_size);
+
+ // For LinearOp, the output is the same as the first input (A[*, in_features])
+ // for all but the last dimension. If the second input is 2D
+ // (B[out_features, in_features]), the last dimension of the output is
+ // out_features. If bias is 1D (bias[out_features]), it maps to the last
+ // dimension of the output.
+ // Args:
+ // 1. input_domain: root/logical domain without reductions for any input to
+ //    LinearOp
+ // 2. input_position: specifies if the input is A / B / Bias (0, 1, or 2)
+ //    (MatmulTensorRole::Input_A/Input_B/Input_C)
+ // 3. out_size: LinearOp output dimension (input and output may not be the
+ //    same size).
+ std::vector<IterDomain*> mapLinearOpIterDomains(
+     const std::vector<IterDomain*>& input_domain,
+     int64_t input_position,
+     size_t out_size,
+     bool k_bcast);
+
+ // Takes a vector of aligned input iterdomains to create the output iterdomain.
+ // This is used if the input iterdomains are not trivially mapped to the output
+ // iterdomains, e.g., for MatmulOp. If given, the forced_iter_type argument will
+ // be the output IterType regardless of the inputs; otherwise the output
+ // IterType is inferred from ids.
+ IterDomain* newOutputIterDomain(
+     const std::vector<IterDomain*>& ids,
+     const std::optional<IterType> force_iter_type = std::nullopt);
+
+ // Takes a vector of `Val*`s and assumes they are all aligned to create the
+ // output tensorview, e.g., for BinaryOp. `vals` can contain scalars, e.g., when
+ // creating the output TensorView for `tv0 + scalar`. This is for convenience
+ // and scalars will be ignored.
+ std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals);
+
+ TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype);
+
+ std::vector<Val*> maybeBroadcast(const std::vector<Val*>& vals);
+
+ NVF_API Val* newValLike(Val* val, DataType dtype);
+
+ // Returns the minimum init value for reduction:
+ //   -inf for floating types;
+ //   lowest value for integer types;
+ //   false for bool.
+ Val* getMinimumValue(DataType v);
+
+ // Returns the maximum init value for reduction:
+ //   inf for floating types;
+ //   highest value for integer types;
+ //   true for bool.
+ Val* getMaximumValue(DataType v);
+
+ std::vector<unsigned int> canonicalizeAxes(
+     const std::vector<int64_t>& axes,
+     int64_t ndims);
+
+ } // namespace ops
+ } // namespace nvfuser
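These utilities are internal helpers used by the op implementations above rather than user-facing entry points. To make the intent of canonicalizeAxes concrete: it maps possibly negative, Python-style axis indices into non-negative positions in [0, ndims). The standalone sketch below is an editor's illustration of that contract, not nvfuser's implementation.

#include <cstdint>
#include <stdexcept>
#include <vector>

// Wrap each axis into [0, ndims): -1 becomes ndims - 1, and out-of-range
// axes are rejected.
std::vector<unsigned int> canonicalize_axes_sketch(
    const std::vector<int64_t>& axes,
    int64_t ndims) {
  std::vector<unsigned int> result;
  result.reserve(axes.size());
  for (int64_t axis : axes) {
    const int64_t wrapped = axis < 0 ? axis + ndims : axis;
    if (wrapped < 0 || wrapped >= ndims) {
      throw std::out_of_range("axis out of range");
    }
    result.push_back(static_cast<unsigned int>(wrapped));
  }
  return result;
}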