nvfuser_cu121_torch25-0.2.25.dev20250201-cp312-cp312-manylinux_2_28_x86_64.whl
- nvfuser/_C.cpython-312-x86_64-linux-gnu.so +0 -0
- nvfuser/__init__.py +618 -0
- nvfuser/__init__.pyi +4 -0
- nvfuser/contrib/__init__.py +9 -0
- nvfuser/contrib/nn/__init__.py +13 -0
- nvfuser/contrib/nn/normalization.py +725 -0
- nvfuser/include/nvfuser/alias_analysis.h +116 -0
- nvfuser/include/nvfuser/bfs.h +929 -0
- nvfuser/include/nvfuser/codegen.h +26 -0
- nvfuser/include/nvfuser/compute_at.h +28 -0
- nvfuser/include/nvfuser/compute_at_map.h +394 -0
- nvfuser/include/nvfuser/contiguity.h +351 -0
- nvfuser/include/nvfuser/cuda_utils.h +50 -0
- nvfuser/include/nvfuser/debug.h +50 -0
- nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
- nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
- nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
- nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
- nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
- nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
- nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
- nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
- nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
- nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
- nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
- nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
- nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
- nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
- nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
- nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
- nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
- nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
- nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
- nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
- nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
- nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
- nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
- nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
- nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
- nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
- nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
- nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
- nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
- nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
- nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
- nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
- nvfuser/include/nvfuser/device_lower/utils.h +382 -0
- nvfuser/include/nvfuser/device_lower/validation.h +74 -0
- nvfuser/include/nvfuser/disjoint_set.h +556 -0
- nvfuser/include/nvfuser/dispatch.h +334 -0
- nvfuser/include/nvfuser/driver_api.h +49 -0
- nvfuser/include/nvfuser/dynamic_transform.h +316 -0
- nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
- nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
- nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
- nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
- nvfuser/include/nvfuser/evaluator_common.h +295 -0
- nvfuser/include/nvfuser/exceptions.h +283 -0
- nvfuser/include/nvfuser/expr_evaluator.h +125 -0
- nvfuser/include/nvfuser/expr_simplifier.h +218 -0
- nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
- nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
- nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
- nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
- nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
- nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
- nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
- nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
- nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
- nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
- nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
- nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
- nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
- nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
- nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
- nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
- nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
- nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
- nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
- nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
- nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
- nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
- nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
- nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
- nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
- nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
- nvfuser/include/nvfuser/fusion.h +511 -0
- nvfuser/include/nvfuser/fusion_guard.h +37 -0
- nvfuser/include/nvfuser/fusion_profiler.h +311 -0
- nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
- nvfuser/include/nvfuser/global_allocator.h +27 -0
- nvfuser/include/nvfuser/grouped_reduction.h +47 -0
- nvfuser/include/nvfuser/host_ir/container.h +60 -0
- nvfuser/include/nvfuser/host_ir/executor.h +152 -0
- nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
- nvfuser/include/nvfuser/host_ir/lower.h +35 -0
- nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
- nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
- nvfuser/include/nvfuser/id_model/id_model.h +359 -0
- nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
- nvfuser/include/nvfuser/id_model/indexing.h +208 -0
- nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
- nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
- nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
- nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
- nvfuser/include/nvfuser/id_model/schedule.h +54 -0
- nvfuser/include/nvfuser/id_model/to_string.h +87 -0
- nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
- nvfuser/include/nvfuser/id_model/utils.h +176 -0
- nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
- nvfuser/include/nvfuser/index_compute.h +651 -0
- nvfuser/include/nvfuser/instrumentation.h +107 -0
- nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
- nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
- nvfuser/include/nvfuser/ir/builder.h +215 -0
- nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
- nvfuser/include/nvfuser/ir/cloner.h +185 -0
- nvfuser/include/nvfuser/ir/container.h +226 -0
- nvfuser/include/nvfuser/ir/graphviz.h +119 -0
- nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
- nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
- nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
- nvfuser/include/nvfuser/ir/iostream.h +98 -0
- nvfuser/include/nvfuser/ir/printer.h +57 -0
- nvfuser/include/nvfuser/ir/utils.h +801 -0
- nvfuser/include/nvfuser/iter_visitor.h +661 -0
- nvfuser/include/nvfuser/kernel.h +299 -0
- nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
- nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
- nvfuser/include/nvfuser/kernel_ir.h +1457 -0
- nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
- nvfuser/include/nvfuser/linked_hash_map.h +97 -0
- nvfuser/include/nvfuser/logical_domain_map.h +577 -0
- nvfuser/include/nvfuser/macros.h +23 -0
- nvfuser/include/nvfuser/mma_type.h +257 -0
- nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
- nvfuser/include/nvfuser/multidevice/communication.h +232 -0
- nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
- nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
- nvfuser/include/nvfuser/multidevice/executor.h +107 -0
- nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
- nvfuser/include/nvfuser/multidevice/utils.h +187 -0
- nvfuser/include/nvfuser/non_divisible_split.h +86 -0
- nvfuser/include/nvfuser/opaque_type.h +129 -0
- nvfuser/include/nvfuser/ops/alias.h +192 -0
- nvfuser/include/nvfuser/ops/all_ops.h +13 -0
- nvfuser/include/nvfuser/ops/arith.h +712 -0
- nvfuser/include/nvfuser/ops/composite.h +130 -0
- nvfuser/include/nvfuser/ops/indexing.h +55 -0
- nvfuser/include/nvfuser/ops/normalization.h +263 -0
- nvfuser/include/nvfuser/ops/utils.h +127 -0
- nvfuser/include/nvfuser/options.h +313 -0
- nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
- nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
- nvfuser/include/nvfuser/polymorphic_value.h +432 -0
- nvfuser/include/nvfuser/predicate_compute.h +213 -0
- nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
- nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
- nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
- nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
- nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
- nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
- nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
- nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
- nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
- nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
- nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
- nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
- nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
- nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
- nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
- nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
- nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
- nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
- nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
- nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
- nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
- nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
- nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
- nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
- nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
- nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
- nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
- nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
- nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
- nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
- nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
- nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
- nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
- nvfuser/include/nvfuser/scheduler/registry.h +97 -0
- nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
- nvfuser/include/nvfuser/scheduler/resize.h +41 -0
- nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
- nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
- nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
- nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
- nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
- nvfuser/include/nvfuser/scheduler/utils.h +771 -0
- nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
- nvfuser/include/nvfuser/serde/factory.h +55 -0
- nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
- nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
- nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
- nvfuser/include/nvfuser/serde/utils.h +34 -0
- nvfuser/include/nvfuser/struct.inl +127 -0
- nvfuser/include/nvfuser/swizzle.h +54 -0
- nvfuser/include/nvfuser/sys_utils.h +40 -0
- nvfuser/include/nvfuser/tensor_metadata.h +118 -0
- nvfuser/include/nvfuser/tma.h +124 -0
- nvfuser/include/nvfuser/transform_iter.h +522 -0
- nvfuser/include/nvfuser/transform_replay.h +297 -0
- nvfuser/include/nvfuser/transform_rfactor.h +33 -0
- nvfuser/include/nvfuser/transform_view.h +136 -0
- nvfuser/include/nvfuser/type.h +1125 -0
- nvfuser/include/nvfuser/type_promotion.h +61 -0
- nvfuser/include/nvfuser/utils.h +619 -0
- nvfuser/include/nvfuser/val_graph.h +446 -0
- nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
- nvfuser/include/nvfuser/validator_utils.h +92 -0
- nvfuser/include/nvfuser/vectorization_info.h +31 -0
- nvfuser/include/nvfuser/visibility.h +21 -0
- nvfuser/lib/libnvfuser_codegen.so +0 -0
- nvfuser/nvfuser_version.py +69 -0
- nvfuser/pytorch_utils.py +184 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
- nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
- nvfuser/utils.py +18 -0
- nvfuser/version.py +1 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +16 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
- nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
- nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
nvfuser/include/nvfuser/polymorphic_value.h
@@ -0,0 +1,432 @@

// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
#pragma once

#include <exceptions.h>
#include <any>
#include <complex>
#include <cstddef>
#include <functional>
#include <numeric>
#include <ostream>
#include <unordered_map>

#include <ATen/ATen.h>

#ifndef DYNAMIC_TYPE_CHECK
#define DYNAMIC_TYPE_CHECK NVF_ERROR
#endif

#include <dynamic_type/dynamic_type.h>
#include <macros.h>
#include <opaque_type.h>

namespace nvfuser {

struct DataType;

// Use a single pointer type to represent all pointers, otherwise we would need
// exponential compilation time for all pointer types in PolymorphicValue.
class Pointer {
  std::byte* ptr_;
  int64_t size_;

 public:
  template <typename T>
  Pointer(T* ptr) : ptr_(reinterpret_cast<std::byte*>(ptr)), size_(sizeof(T)) {}

  inline Pointer(void* ptr, DataType dtype);

  Pointer() : ptr_(nullptr), size_(-1) {}

  int64_t size() const {
    return size_;
  }

  template <typename T>
  explicit operator T*() const {
    return reinterpret_cast<T*>(ptr_);
  }

  Pointer& operator+=(int64_t offset) {
    ptr_ += offset * size_;
    return *this;
  }

  Pointer& operator-=(int64_t offset) {
    ptr_ -= offset * size_;
    return *this;
  }

  Pointer& operator++() {
    ptr_ += size_;
    return *this;
  }

  Pointer& operator--() {
    ptr_ -= size_;
    return *this;
  }

  Pointer operator++(int) {
    Pointer tmp = *this;
    ++*this;
    return tmp;
  }

  Pointer operator--(int) {
    Pointer tmp = *this;
    --*this;
    return tmp;
  }

  Pointer operator+(int64_t offset) const {
    Pointer tmp = *this;
    tmp += offset;
    return tmp;
  }

  Pointer operator-(int64_t offset) const {
    Pointer tmp = *this;
    tmp -= offset;
    return tmp;
  }

  int64_t operator-(const Pointer& other) const {
    NVF_ERROR(size_ == other.size_);
    return (ptr_ - other.ptr_) / (int64_t)size_;
  }

  bool operator==(const Pointer& other) const {
    return ptr_ == other.ptr_;
  }

  bool operator==(std::nullptr_t) const {
    return ptr_ == nullptr;
  }

  bool operator!=(const Pointer& other) const {
    return ptr_ != other.ptr_;
  }

  bool operator!=(std::nullptr_t) const {
    return ptr_ != nullptr;
  }

  bool operator<(const Pointer& other) const {
    return ptr_ < other.ptr_;
  }

  bool operator>(const Pointer& other) const {
    return ptr_ > other.ptr_;
  }

  bool operator<=(const Pointer& other) const {
    return ptr_ <= other.ptr_;
  }

  bool operator>=(const Pointer& other) const {
    return ptr_ >= other.ptr_;
  }

  bool operator!() const {
    return !ptr_;
  }

  explicit operator bool() const {
    return ptr_;
  }

  explicit operator int64_t() const {
    return reinterpret_cast<int64_t>(ptr_);
  }

  explicit operator unsigned() const {
    return (unsigned)(int64_t)(*this);
  }

  explicit operator size_t() const {
    return reinterpret_cast<size_t>(ptr_);
  }
};

inline Pointer operator+(int64_t offset, const Pointer& ptr) {
  return ptr + offset;
}

inline std::ostream& operator<<(std::ostream& os, const Pointer& ptr) {
  os << (void*)ptr;
  return os;
}

struct Struct;
class Accessor;
struct StructType;

// See Note [Struct Support in PolymorphicValue] for documentation.
class StructHandle {
  std::shared_ptr<Struct> struct_ptr_;

 public:
  StructHandle(std::shared_ptr<Struct> struct_ptr)
      : struct_ptr_(std::move(struct_ptr)) {}
  StructHandle& operator=(std::shared_ptr<Struct> struct_ptr) {
    struct_ptr_ = std::move(struct_ptr);
    return *this;
  }

  StructHandle(const StructHandle& other) = default;
  StructHandle(StructHandle&& other) = default;
  StructHandle& operator=(const StructHandle& other) = default;
  StructHandle& operator=(StructHandle&& other) = default;

  bool operator==(const StructHandle& other) const;

  template <typename T>
  bool is() const {
    return std::dynamic_pointer_cast<T>(struct_ptr_) != nullptr;
  }

  template <typename T>
  inline T& as() const {
    return *std::dynamic_pointer_cast<T>(struct_ptr_);
  }

  inline StructType type() const;

  template <typename Ret, typename Class>
  inline std::enable_if_t<std::is_base_of_v<Struct, Class>, Ret&> operator->*(
      Ret Class::* member) const {
    return as<Class>().*member;
  }

  inline Accessor operator->*(const std::string& key) const;
};

using PolymorphicValue = dynamic_type::DynamicType<
    dynamic_type::Containers<std::vector>,
    StructHandle,
    Pointer,
    Opaque,
    at::Tensor,
    std::complex<double>,
    double,
    int64_t,
    bool>;

namespace PolymorphicValue_functions {

NVF_API std::string toString(const PolymorphicValue& v);

template <typename T>
inline bool isNan(const T& a) {
  return std::isnan(a);
}

// For example, `nan+i` and `nan-i` are treated equal because both are NaNs.
// This is consistent with pytorch's implementation:
// https://github.com/pytorch/pytorch/blob/6d8e0c4b5a3be8201cab731dfd1e6513162cf25c/c10/util/complex_utils.h#L43.
template <typename T>
inline bool isNan(const std::complex<T>& a) {
  return std::isnan(a.real()) || std::isnan(a.imag());
}

// NaNs are treated equal.
template <typename T>
inline bool isSameNanSensitive(const T& a, const T& b) {
  if (isNan(a) && isNan(b)) {
    return true;
  }
  return a == b;
}

inline bool isSame(const PolymorphicValue& a, const PolymorphicValue& b) {
  if (a.type() != b.type()) {
    return false;
  }
  if (a.is<at::Tensor>()) {
    return (a.as<at::Tensor>().is_same(b.as<at::Tensor>()));
  }
  if (a.is<double>()) {
    return isSameNanSensitive(a.as<double>(), b.as<double>());
  }
  if (a.is<std::complex<double>>()) {
    return isSameNanSensitive(
        a.as<std::complex<double>>(), b.as<std::complex<double>>());
  }
  return a == b;
}

inline PolymorphicValue signbit(const PolymorphicValue& a) {
  if (a.is<int64_t>()) {
    return PolymorphicValue(std::signbit(a.as<int64_t>()));
  }
  if (a.is<double>()) {
    return PolymorphicValue(std::signbit(a.as<double>()));
  }
  if (a.is<at::Tensor>()) {
    return PolymorphicValue(a.as<at::Tensor>().signbit());
  }
  NVF_THROW("PolymorphicValue signbit not implemented for ", a.type().name());
}

inline PolymorphicValue fmod(
    const PolymorphicValue& a,
    const PolymorphicValue& b) {
  // TODO: relax the type check
  NVF_ERROR(
      a.is<at::Tensor>() || a.type() == b.type(),
      "fmod is not implemented for mismatch dtypes");
  if (a.is<int64_t>()) {
    if (b.is<int64_t>()) {
      return PolymorphicValue(std::fmod(a.as<int64_t>(), b.as<int64_t>()));
    }
    if (b.is<double>()) {
      return PolymorphicValue(std::fmod(a.as<int64_t>(), b.as<double>()));
    }
  }
  if (a.is<double>()) {
    if (b.is<int64_t>()) {
      return PolymorphicValue(std::fmod(a.as<double>(), b.as<int64_t>()));
    }
    if (b.is<double>()) {
      return PolymorphicValue(std::fmod(a.as<double>(), b.as<double>()));
    }
  }
  if (a.is<at::Tensor>()) {
    if (b.is<int64_t>()) {
      return PolymorphicValue(a.as<at::Tensor>().fmod(b.as<int64_t>()));
    }
    if (b.is<double>()) {
      return PolymorphicValue(a.as<at::Tensor>().fmod(b.as<double>()));
    }
    if (b.is<at::Tensor>()) {
      return PolymorphicValue(a.as<at::Tensor>().fmod(b.as<at::Tensor>()));
    }
  }
  NVF_THROW(
      "PolymorphicValue fmod not implemented for ",
      a.type().name(),
      " , ",
      b.type().name());
}

inline PolymorphicValue ceildiv(
    const PolymorphicValue& a,
    const PolymorphicValue& b) {
  if (a.is<int64_t>() && b.is<int64_t>()) {
    auto aa = a.as<int64_t>();
    auto bb = b.as<int64_t>();
    if (bb > 0) {
      return PolymorphicValue((aa + bb - 1) / bb);
    } else {
      return PolymorphicValue((aa + bb + 1) / bb);
    }
  }
  return PolymorphicValue(std::ceil((a / b).as<double>()));
}

inline PolymorphicValue max(
    const PolymorphicValue& a,
    const PolymorphicValue& b) {
  return PolymorphicValue(a > b ? a : b);
}

inline PolymorphicValue min(
    const PolymorphicValue& a,
    const PolymorphicValue& b) {
  return PolymorphicValue(a < b ? a : b);
}

inline PolymorphicValue gcd(
    const PolymorphicValue& a,
    const PolymorphicValue& b) {
  return PolymorphicValue(std::gcd(a.as<int64_t>(), b.as<int64_t>()));
}

inline PolymorphicValue abs(const PolymorphicValue& a) {
  if (a.is<int64_t>()) {
    return PolymorphicValue(std::abs(a.as<int64_t>()));
  }
  if (a.is<double>()) {
    return PolymorphicValue(std::abs(a.as<double>()));
  }
  if (a.is<bool>()) {
    return a;
  }
  if (a.is<std::complex<double>>()) {
    return std::abs(a.as<std::complex<double>>());
  }
  if (a.is<at::Tensor>()) {
    return a.as<at::Tensor>().abs();
  }
  NVF_THROW("PolymorphicValue abs not implemented for ", a.type().name());
}

inline PolymorphicValue erf(const PolymorphicValue& a) {
  if (a.is<at::Tensor>()) {
    return PolymorphicValue(a.as<at::Tensor>().erf());
  }
  NVF_THROW("PolymorphicValue erf not implemented for ", a.type().name());
}

// Convert scalars, vector of scalars, vector of vector of scalars, etc., into
// an at::Tensor. device argument allows for the creation of CPU Scalars.
inline PolymorphicValue toTensor(
    const PolymorphicValue& x,
    at::DeviceType device_type = at::kCUDA,
    int8_t device_index = 0) {
  if (x.is<at::Tensor>()) {
    return x;
  }
  auto options = at::TensorOptions().device(device_type, device_index);
  if (x.is<int64_t>()) {
    return PolymorphicValue(
        at::tensor(x.as<int64_t>(), options.dtype(at::kLong)).squeeze());
  }
  if (x.is<double>()) {
    return PolymorphicValue(
        at::tensor(x.as<double>(), options.dtype(at::kDouble)).squeeze());
  }
  if (x.is<bool>()) {
    return PolymorphicValue(
        at::tensor(x.as<bool>(), options.dtype(at::kBool)).squeeze());
  }
  if (x.is<std::complex<double>>()) {
    return PolymorphicValue(
        at::tensor(
            (c10::complex<double>)x.as<std::complex<double>>(),
            options.dtype(at::kComplexDouble))
            .squeeze());
  }
  if (x.is<std::vector>()) {
    auto vec = x.as<std::vector>();
    std::vector<at::Tensor> tensors;
    tensors.reserve(vec.size());
    for (const auto& elem : vec) {
      tensors.push_back(toTensor(elem).as<at::Tensor>());
    }
    return PolymorphicValue(at::stack(tensors));
  }
  NVF_THROW("PolymorphicValue toTensor not implemented for ", x.type().name());
}

// Convert PolymorphicValue to c10::Scalar.
inline c10::Scalar toScalar(const PolymorphicValue& x) {
  if (x.is<std::complex<double>>()) {
    return (c10::complex<double>)x.as<std::complex<double>>();
  } else {
    return (c10::Scalar)x;
  }
}

} // namespace PolymorphicValue_functions

} // namespace nvfuser

#include <struct.inl>
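To illustrate the header above, here is a minimal usage sketch, not part of the wheel, showing how PolymorphicValue and the helpers in PolymorphicValue_functions could be exercised. It assumes the packaged include directory is on the include path (the headers themselves use bare includes such as <polymorphic_value.h>) and that the code links against libnvfuser_codegen.

// Minimal usage sketch (assumption: bare nvfuser include paths are configured).
#include <polymorphic_value.h>

#include <cmath>

namespace pvf = nvfuser::PolymorphicValue_functions;
using nvfuser::PolymorphicValue;

void polymorphic_value_demo() {
  // A PolymorphicValue holds one of the alternatives listed in the DynamicType
  // above (int64_t, double, bool, std::complex<double>, at::Tensor, ...).
  PolymorphicValue i(int64_t(7));
  PolymorphicValue d(2.5);

  // Integer ceildiv branch: (7 + 3 - 1) / 3 == 3.
  PolymorphicValue q = pvf::ceildiv(i, PolymorphicValue(int64_t(3)));

  // abs and signbit dispatch on the dynamically held type.
  PolymorphicValue a = pvf::abs(PolymorphicValue(-1.5));
  PolymorphicValue neg = pvf::signbit(PolymorphicValue(-1.5));

  // NaNs compare equal under isSame (NaN-sensitive comparison).
  bool nan_equal = pvf::isSame(
      PolymorphicValue(std::nan("")), PolymorphicValue(std::nan("")));

  // Scalars can be wrapped into an at::Tensor; at::kCPU creates a CPU scalar.
  PolymorphicValue t = pvf::toTensor(d, at::kCPU);

  (void)q;
  (void)a;
  (void)neg;
  (void)nan_equal;
  (void)t;
}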
nvfuser/include/nvfuser/predicate_compute.h
@@ -0,0 +1,213 @@

// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
#pragma once

#include <device_lower/analysis/thread_predicate.h>
#include <device_lower/utils.h>
#include <exceptions.h>
#include <index_compute.h>
#include <kernel_ir.h>
#include <logical_domain_map.h>

namespace nvfuser {

class PredicateCompute {
 public:
  // ignore_internal_syncthread_ops will prevent creation of predicates on
  // block/grid broadcast/reduce as these have syncthread calls within them
  // so all threads need to execute the function.
  static Val* getInlinePredicate(
      const Expr* expr,
      const std::vector<ForLoop*>& loops,
      const std::unordered_set<ForLoop*>& rotated_loops,
      Val* thread_pred,
      PredicateType pred_type);
};

//! Parallelized domains may need to be predicated with threading
//! indices and IterDomain extents. For example, if a domain is
//! parallelized by TIDx, when TIDx is not exact, i.e., it can be
//! larger than the extents of domains parallelized by TIDx,
//! threadIdx.x may be larger than the IterDomain extent. This can be
//! harmless for Local tensors, however, for it would
//! result in out-of-bounds access for Shared tensors as they are
//! allocated based on tensor shapes rather than threading
//! dimensions.
class ParallelizedDomainPredicate {
 public:
  //! Predicate information for parallelized domains
  class PredicateInfo {
   public:
    explicit PredicateInfo(ParallelType pt) : pt_(pt) {}

    //! Adds a domain that is parallized by the same parallel type
    bool addDomain(IterDomain* id);

    const std::vector<IterDomain*>& ids() const {
      return ids_;
    }

    //! Generates a predicate Val from predicate information
    Val* getPredicate() const;

   private:
    ParallelType pt_;
    //! Domains parallelized by the same parallel type
    std::vector<IterDomain*> ids_;
  };

  //! Returns a predicate Val for parallelied domains of an expression.
  static Val* getPredicate(
      const Expr* expr,
      const std::vector<ForLoop*>& loops);

  //! Returns predicate information for parallelied domains of an
  //! expression.
  static std::unordered_map<ParallelType, PredicateInfo> getPredicateMap(
      const Expr* expr,
      const std::vector<ForLoop*>& loops,
      ForLoop* unswitched_loop = nullptr);
};

//! Keys to identify unique unswitch predicates. Just consists of a
//! predicated concrete domain if not parallelized. If parallelized,
//! pick one for each different parallelization. When the same
//! parallel type is used for different concrete domains, they are
//! considered different predicates and are included in the unswitch
//! condition lists.
class UnswitchPredicateKey {
 public:
  UnswitchPredicateKey();

  // Parameter loop_ids represents the loop domains used for the
  // predicated domain
  UnswitchPredicateKey(
      IterDomain* predicated_consumer_id,
      TensorView* consumer_tv,
      IterDomain* predicated_concrete_id,
      std::unordered_set<IterDomain*> loop_ids);

  bool operator==(const UnswitchPredicateKey& other) const {
    return predicated_concrete_id_ == other.predicated_concrete_id_ &&
        parallel_concrete_ids_ == other.parallel_concrete_ids_;
  }

  const auto& predicatedId() const {
    return predicated_concrete_id_;
  }

  const auto& parallelConcreteIds() const {
    return parallel_concrete_ids_;
  }

  IterDomain* parallelId(ParallelType pt) const {
    auto it = parallelConcreteIds().find(pt);
    if (it == parallelConcreteIds().end()) {
      return nullptr;
    } else {
      return it->second;
    }
  }

  std::string toString() const;

 private:
  //! Predicated concrete domain
  IterDomain* predicated_concrete_id_ = nullptr;
  //! Dependent loop domains
  std::unordered_set<IterDomain*> loop_ids_;
  //! Store parallelized concrete domains
  std::unordered_map<ParallelType, IterDomain*> parallel_concrete_ids_;
};

struct UnswitchPredicateKeyHash {
  std::size_t operator()(const UnswitchPredicateKey& key) const;
};

// Generate predicates for loops that are unswitched, unrolled or
// vectorized loops
class UnswitchPredicate {
 public:
  // Get a predicate for a loop that is unswitched, unrolled or
  // vectorized. The outer_loops parameter represents the outer loops
  // of the unswitched/unrolled/vectorized loop.
  static Val* get(
      const std::vector<ForLoop*>& outer_loops,
      ForLoop* unrolled_loop);

 private:
  //! Predicate information for each UnswitchPredicateKey.
  struct MergedPredicates {
    //! Predicate information for the start and stop predicates.
    struct Info {
      //! Most restrictive static predicate. Nullptr if no static
      //! predicate found.
      Val* static_pred = nullptr;
      //! The offset value of static_pred
      PolymorphicValue static_offset = 0L;
      //! List of dynamic predicates.
      std::vector<Val*> dynamic_preds;
      //! Circular buffer loop stage if applicable. The predicate
      //! generated in the main loop where no epilogue is generated
      //! needs to be used.
      CircularBufferLoopStage loop_stage =
          CircularBufferLoopStage::NotApplicable;
    };
    UnswitchPredicateKey predicate_key;
    Info start;
    Info stop;
  };

  UnswitchPredicate(std::vector<ForLoop*> outer_loops, ForLoop* unrolled_loop);

  void predicateOn(Expr*);

  void openLoop(ForLoop*);

  void openIte(kir::IfThenElse*);

  //! Generates the final predicates from the predicated_keys map
  void finalize();

  //! Merge predicates as much as possible. If a predicate offset is
  //! static, only pick the most restrictive one, e.g., the one with the
  //! minimum offset for the start predication.
  void mergeUnswitchPredicates(
      Val* predicate,
      Val* offset,
      CircularBufferLoopStage loop_stage,
      MergedPredicates::Info& merged_predicate_info,
      bool is_start);

  //! Adds new predicates for parallelized domains
  void addParallelizedDomainPredicates(Expr*);

 private:
  //! Track which iter domains have been predicated
  std::unordered_set<UnswitchPredicateKey, UnswitchPredicateKeyHash>
      predicated_keys_;

  //! The predicates that have been recorded but not yet finalized
  std::vector<MergedPredicates> pending_predicates_;

  //! Track which parallelized domains have been predicated
  std::unordered_map<ParallelType, ParallelizedDomainPredicate::PredicateInfo>
      parallelized_dom_predicates_;

  //! The predicates that have been generated.
  std::vector<Val*> predicates_;

  std::vector<ForLoop*> for_loops_;

  // Keep track of the loop in which the currently visiting expr is a rotated.
  std::unordered_set<ForLoop*> rotated_loop_;

  ForLoop* unrolled_loop_;
};

} // namespace nvfuser
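The comment on ParallelizedDomainPredicate above concerns shared-memory tensors whose allocation follows the tensor shape rather than blockDim. As a hand-written CUDA illustration (not nvFuser-generated code and not part of the wheel), the kind of guard such an inline predicate lowers to looks roughly like this when blockDim.x can exceed the iteration-domain extent:

// Hand-written CUDA sketch of predicated shared-memory access.
__global__ void scale_kernel(const float* in, float* out, int extent) {
  // The shared buffer is sized from the tensor shape (here 128 elements),
  // not from blockDim.x.
  __shared__ float smem[128];

  // TIDx is "not exact": the kernel may be launched with blockDim.x > extent,
  // so shared-memory accesses are predicated to stay in bounds.
  if (threadIdx.x < extent) {
    smem[threadIdx.x] = in[threadIdx.x];
  }
  __syncthreads();
  if (threadIdx.x < extent) {
    out[threadIdx.x] = 2.0f * smem[threadIdx.x];
  }
}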
nvfuser/include/nvfuser/python_frontend/distributed_tensor.h
@@ -0,0 +1,50 @@

// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on

#pragma once

#include <ATen/core/TensorBody.h>

#include <multidevice/device_mesh.h>
#include <type.h>

namespace nvfuser::python_frontend {

// A class that represents a distributed tensor. It wraps a local tensor, a
// mesh, and a mapping from mesh axes to tensor axes. If the mesh is empty,
// it degenerates into a non-distributed tensor.
class DistributedTensor {
 public:
  explicit DistributedTensor(
      at::Tensor local_tensor,
      DeviceMesh mesh = DeviceMesh())
      : local_(std::move(local_tensor)), mesh_(std::move(mesh)) {}
  DistributedTensor(const DistributedTensor&) = delete;
  DistributedTensor& operator=(const DistributedTensor&) = delete;
  DistributedTensor(DistributedTensor&&) = default;
  DistributedTensor& operator=(DistributedTensor&&) = default;

  const DeviceMesh& mesh() const {
    return mesh_;
  }

  at::Tensor local() const {
    return local_;
  }

  void setAxisIsShardedOn(int64_t axis, ParallelType parallel_type);

  int64_t axisShardedOn(ParallelType parallel_type) const;

 private:
  at::Tensor local_;
  DeviceMesh mesh_;
  std::unordered_map<ParallelType, int64_t> axis_sharded_on_;
};

} // namespace nvfuser::python_frontend
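A minimal sketch of how DistributedTensor might be used, under two assumptions not confirmed by this listing: that DeviceMesh is constructible from a list of device indices, and that ParallelType::DIDx denotes the device-parallel mesh axis.

// Minimal usage sketch; DeviceMesh({0, 1, 2, 3}) and ParallelType::DIDx are
// assumptions about the multidevice API, not confirmed by this listing.
#include <python_frontend/distributed_tensor.h>

using nvfuser::DeviceMesh;
using nvfuser::ParallelType;
using nvfuser::python_frontend::DistributedTensor;

DistributedTensor wrap_shard(at::Tensor local_shard) {
  // Wrap the per-rank shard together with the mesh it is distributed over.
  DistributedTensor dt(local_shard, DeviceMesh({0, 1, 2, 3}));

  // Record that tensor axis 0 is sharded over the DIDx mesh axis, then query
  // it back.
  dt.setAxisIsShardedOn(/*axis=*/0, ParallelType::DIDx);
  int64_t sharded_axis = dt.axisShardedOn(ParallelType::DIDx);
  (void)sharded_axis;

  // DistributedTensor is move-only (its copy constructor is deleted), so it
  // is returned by move.
  return dt;
}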