triton-windows 3.3.1.post19__cp313-cp313-win_amd64.whl → 3.5.0.post21__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of triton-windows might be problematic. Click here for more details.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +11 -2
- triton/_filecheck.py +97 -0
- triton/_internal_testing.py +95 -18
- triton/_utils.py +112 -21
- triton/backends/__init__.py +20 -23
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +161 -119
- triton/backends/amd/driver.c +118 -46
- triton/backends/amd/driver.py +274 -96
- triton/backends/compiler.py +7 -21
- triton/backends/driver.py +13 -0
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +163 -106
- triton/backends/nvidia/driver.c +166 -101
- triton/backends/nvidia/driver.py +384 -202
- triton/compiler/__init__.py +5 -2
- triton/compiler/code_generator.py +439 -231
- triton/compiler/compiler.py +152 -84
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +5 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +102 -0
- triton/experimental/gluon/language/__init__.py +119 -0
- triton/experimental/gluon/language/_core.py +490 -0
- triton/experimental/gluon/language/_layouts.py +583 -0
- triton/experimental/gluon/language/_math.py +20 -0
- triton/experimental/gluon/language/_semantic.py +380 -0
- triton/experimental/gluon/language/_standard.py +80 -0
- triton/experimental/gluon/language/amd/__init__.py +4 -0
- triton/experimental/gluon/language/amd/_layouts.py +96 -0
- triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
- triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
- triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
- triton/experimental/gluon/language/extra/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
- triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +45 -0
- triton/knobs.py +546 -0
- triton/language/__init__.py +50 -19
- triton/language/core.py +909 -572
- triton/language/extra/cuda/__init__.py +10 -7
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +394 -394
- triton/language/extra/cuda/utils.py +21 -21
- triton/language/extra/hip/__init__.py +3 -1
- triton/language/extra/hip/libdevice.py +120 -104
- triton/language/extra/hip/utils.py +35 -0
- triton/language/extra/libdevice.py +4 -0
- triton/language/math.py +65 -66
- triton/language/random.py +12 -2
- triton/language/semantic.py +1757 -1768
- triton/language/standard.py +127 -62
- triton/language/target_info.py +54 -0
- triton/runtime/_allocation.py +15 -3
- triton/runtime/_async_compile.py +55 -0
- triton/runtime/autotuner.py +117 -60
- triton/runtime/build.py +83 -17
- triton/runtime/cache.py +61 -47
- triton/runtime/driver.py +25 -47
- triton/runtime/interpreter.py +95 -50
- triton/runtime/jit.py +445 -248
- triton/runtime/tcc/include/_mingw.h +8 -10
- triton/runtime/tcc/include/assert.h +5 -0
- triton/runtime/tcc/include/errno.h +1 -1
- triton/runtime/tcc/include/float.h +21 -3
- triton/runtime/tcc/include/iso646.h +36 -0
- triton/runtime/tcc/include/limits.h +5 -0
- triton/runtime/tcc/include/malloc.h +2 -2
- triton/runtime/tcc/include/math.h +21 -261
- triton/runtime/tcc/include/stdalign.h +16 -0
- triton/runtime/tcc/include/stdarg.h +5 -70
- triton/runtime/tcc/include/stdatomic.h +171 -0
- triton/runtime/tcc/include/stddef.h +7 -19
- triton/runtime/tcc/include/stdlib.h +15 -4
- triton/runtime/tcc/include/stdnoreturn.h +7 -0
- triton/runtime/tcc/include/sys/stat.h +2 -2
- triton/runtime/tcc/include/sys/types.h +5 -0
- triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
- triton/runtime/tcc/include/tccdefs.h +342 -0
- triton/runtime/tcc/include/tgmath.h +89 -0
- triton/runtime/tcc/include/uchar.h +33 -0
- triton/runtime/tcc/include/unistd.h +1 -0
- triton/runtime/tcc/include/winapi/qos.h +72 -0
- triton/runtime/tcc/include/winapi/shellapi.h +59 -0
- triton/runtime/tcc/include/winapi/winbase.h +9 -2
- triton/runtime/tcc/include/winapi/wincon.h +8 -0
- triton/runtime/tcc/include/winapi/windows.h +1 -1
- triton/runtime/tcc/include/winapi/winnls.h +778 -0
- triton/runtime/tcc/include/winapi/winnt.h +9 -7
- triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
- triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
- triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
- triton/runtime/tcc/lib/libtcc1.a +0 -0
- triton/runtime/tcc/lib/python314.def +1800 -0
- triton/runtime/tcc/lib/python314t.def +1809 -0
- triton/runtime/tcc/libtcc.dll +0 -0
- triton/runtime/tcc/tcc.exe +0 -0
- triton/testing.py +16 -12
- triton/tools/compile.py +62 -14
- triton/tools/disasm.py +3 -4
- triton/tools/extra/cuda/compile.c +1 -0
- triton/tools/extra/hip/compile.cpp +66 -0
- triton/tools/extra/hip/compile.h +13 -0
- triton/tools/ragged_tma.py +92 -0
- triton/tools/tensor_descriptor.py +34 -0
- triton/windows_utils.py +52 -81
- {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +8 -4
- triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
- triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
- triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
- triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
- triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
- triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
- triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
- triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
- triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
- triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
- triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
- triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
- triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
- triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
- triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
- triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
- triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
- triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
- triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
- triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
- triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
- triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
- triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
- triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
- triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
- triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
- triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
- triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
- triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
- triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
- triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
- triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
- triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
- triton/backends/amd/include/hip/channel_descriptor.h +0 -39
- triton/backends/amd/include/hip/device_functions.h +0 -38
- triton/backends/amd/include/hip/driver_types.h +0 -468
- triton/backends/amd/include/hip/hip_bf16.h +0 -36
- triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
- triton/backends/amd/include/hip/hip_common.h +0 -100
- triton/backends/amd/include/hip/hip_complex.h +0 -38
- triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
- triton/backends/amd/include/hip/hip_deprecated.h +0 -95
- triton/backends/amd/include/hip/hip_ext.h +0 -161
- triton/backends/amd/include/hip/hip_fp16.h +0 -36
- triton/backends/amd/include/hip/hip_fp8.h +0 -33
- triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
- triton/backends/amd/include/hip/hip_hcc.h +0 -24
- triton/backends/amd/include/hip/hip_math_constants.h +0 -36
- triton/backends/amd/include/hip/hip_profile.h +0 -27
- triton/backends/amd/include/hip/hip_runtime.h +0 -75
- triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
- triton/backends/amd/include/hip/hip_texture_types.h +0 -29
- triton/backends/amd/include/hip/hip_vector_types.h +0 -41
- triton/backends/amd/include/hip/hip_version.h +0 -17
- triton/backends/amd/include/hip/hiprtc.h +0 -421
- triton/backends/amd/include/hip/library_types.h +0 -78
- triton/backends/amd/include/hip/math_functions.h +0 -42
- triton/backends/amd/include/hip/surface_types.h +0 -63
- triton/backends/amd/include/hip/texture_types.h +0 -194
- triton/backends/amd/include/hsa/Brig.h +0 -1131
- triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
- triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
- triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
- triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
- triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
- triton/backends/amd/include/hsa/hsa.h +0 -5738
- triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
- triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
- triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
- triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
- triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
- triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
- triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
- triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
- triton/backends/amd/include/roctracer/roctracer.h +0 -779
- triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
- triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
- triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
- triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
- triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
- triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
- triton/backends/amd/include/roctracer/roctx.h +0 -229
- triton/language/_utils.py +0 -21
- triton/language/extra/cuda/_experimental_tma.py +0 -106
- triton/runtime/tcc/lib/libtcc1-64.a +0 -0
- triton/tools/experimental_descriptor.py +0 -32
- triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
- triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
- {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
|
@@ -1,293 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* MIT License
|
|
3
|
-
*
|
|
4
|
-
* Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
|
5
|
-
*
|
|
6
|
-
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
-
* of this software and associated documentation files (the "Software"), to deal
|
|
8
|
-
* in the Software without restriction, including without limitation the rights
|
|
9
|
-
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
-
* copies of the Software, and to permit persons to whom the Software is
|
|
11
|
-
* furnished to do so, subject to the following conditions:
|
|
12
|
-
*
|
|
13
|
-
* The above copyright notice and this permission notice shall be included in
|
|
14
|
-
* all copies or substantial portions of the Software.
|
|
15
|
-
*
|
|
16
|
-
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
-
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
-
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
-
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
-
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
-
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
-
* SOFTWARE.
|
|
23
|
-
*/
|
|
24
|
-
|
|
25
|
-
/*!\file
|
|
26
|
-
* \brief hip_bfloat16.h provides struct for hip_bfloat16 typedef
|
|
27
|
-
*/
|
|
28
|
-
|
|
29
|
-
#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
|
|
30
|
-
#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
|
|
31
|
-
|
|
32
|
-
#include "host_defines.h"
|
|
33
|
-
#if defined(__HIPCC_RTC__)
|
|
34
|
-
#define __HOST_DEVICE__ __device__
|
|
35
|
-
#else
|
|
36
|
-
#define __HOST_DEVICE__ __host__ __device__
|
|
37
|
-
#endif
|
|
38
|
-
|
|
39
|
-
#if __cplusplus < 201103L || !defined(__HIPCC__)
|
|
40
|
-
|
|
41
|
-
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
|
|
42
|
-
// include a minimal definition of hip_bfloat16
|
|
43
|
-
|
|
44
|
-
#include <stdint.h>
|
|
45
|
-
/*! \brief Struct to represent a 16 bit brain floating point number. */
|
|
46
|
-
typedef struct
|
|
47
|
-
{
|
|
48
|
-
uint16_t data;
|
|
49
|
-
} hip_bfloat16;
|
|
50
|
-
|
|
51
|
-
#else // __cplusplus < 201103L || !defined(__HIPCC__)
|
|
52
|
-
|
|
53
|
-
#include <hip/hip_runtime.h>
|
|
54
|
-
|
|
55
|
-
#pragma clang diagnostic push
|
|
56
|
-
#pragma clang diagnostic ignored "-Wshadow"
|
|
57
|
-
struct hip_bfloat16
|
|
58
|
-
{
|
|
59
|
-
__hip_uint16_t data;
|
|
60
|
-
|
|
61
|
-
enum truncate_t
|
|
62
|
-
{
|
|
63
|
-
truncate
|
|
64
|
-
};
|
|
65
|
-
|
|
66
|
-
__HOST_DEVICE__ hip_bfloat16() = default;
|
|
67
|
-
|
|
68
|
-
// round upper 16 bits of IEEE float to convert to bfloat16
|
|
69
|
-
explicit __HOST_DEVICE__ hip_bfloat16(float f)
|
|
70
|
-
: data(float_to_bfloat16(f))
|
|
71
|
-
{
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
explicit __HOST_DEVICE__ hip_bfloat16(float f, truncate_t)
|
|
75
|
-
: data(truncate_float_to_bfloat16(f))
|
|
76
|
-
{
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
|
|
80
|
-
__HOST_DEVICE__ operator float() const
|
|
81
|
-
{
|
|
82
|
-
union
|
|
83
|
-
{
|
|
84
|
-
uint32_t int32;
|
|
85
|
-
float fp32;
|
|
86
|
-
} u = {uint32_t(data) << 16};
|
|
87
|
-
return u.fp32;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
__HOST_DEVICE__ hip_bfloat16 &operator=(const float& f)
|
|
91
|
-
{
|
|
92
|
-
data = float_to_bfloat16(f);
|
|
93
|
-
return *this;
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f)
|
|
97
|
-
{
|
|
98
|
-
hip_bfloat16 output;
|
|
99
|
-
output.data = float_to_bfloat16(f);
|
|
100
|
-
return output;
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t)
|
|
104
|
-
{
|
|
105
|
-
hip_bfloat16 output;
|
|
106
|
-
output.data = truncate_float_to_bfloat16(f);
|
|
107
|
-
return output;
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
private:
|
|
111
|
-
static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f)
|
|
112
|
-
{
|
|
113
|
-
union
|
|
114
|
-
{
|
|
115
|
-
float fp32;
|
|
116
|
-
uint32_t int32;
|
|
117
|
-
} u = {f};
|
|
118
|
-
if(~u.int32 & 0x7f800000)
|
|
119
|
-
{
|
|
120
|
-
// When the exponent bits are not all 1s, then the value is zero, normal,
|
|
121
|
-
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
|
|
122
|
-
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
|
|
123
|
-
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
|
|
124
|
-
// least significant bits of the float mantissa are greater than 0x8000,
|
|
125
|
-
// or if they are equal to 0x8000 and the least significant bit of the
|
|
126
|
-
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
|
|
127
|
-
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
|
|
128
|
-
// has the value 0x7f, then incrementing it causes it to become 0x00 and
|
|
129
|
-
// the exponent is incremented by one, which is the next higher FP value
|
|
130
|
-
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
|
|
131
|
-
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
|
|
132
|
-
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
|
|
133
|
-
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
|
|
134
|
-
// incrementing it causes it to become an exponent of 0xFF and a mantissa
|
|
135
|
-
// of 0x00, which is Inf, the next higher value to the unrounded value.
|
|
136
|
-
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
|
|
137
|
-
}
|
|
138
|
-
else if(u.int32 & 0xffff)
|
|
139
|
-
{
|
|
140
|
-
// When all of the exponent bits are 1, the value is Inf or NaN.
|
|
141
|
-
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
|
|
142
|
-
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
|
|
143
|
-
// bit being 1. Signaling NaN is indicated by the most significant
|
|
144
|
-
// mantissa bit being 0 but some other bit(s) being 1. If any of the
|
|
145
|
-
// lower 16 bits of the mantissa are 1, we set the least significant bit
|
|
146
|
-
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
|
|
147
|
-
// the bloat16's mantissa bits are all 0.
|
|
148
|
-
u.int32 |= 0x10000; // Preserve signaling NaN
|
|
149
|
-
}
|
|
150
|
-
return __hip_uint16_t(u.int32 >> 16);
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
// Truncate instead of rounding, preserving SNaN
|
|
154
|
-
static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f)
|
|
155
|
-
{
|
|
156
|
-
union
|
|
157
|
-
{
|
|
158
|
-
float fp32;
|
|
159
|
-
uint32_t int32;
|
|
160
|
-
} u = {f};
|
|
161
|
-
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
|
|
162
|
-
}
|
|
163
|
-
};
|
|
164
|
-
#pragma clang diagnostic pop
|
|
165
|
-
|
|
166
|
-
typedef struct
|
|
167
|
-
{
|
|
168
|
-
__hip_uint16_t data;
|
|
169
|
-
} hip_bfloat16_public;
|
|
170
|
-
|
|
171
|
-
static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
|
|
172
|
-
"hip_bfloat16 is not a standard layout type, and thus is "
|
|
173
|
-
"incompatible with C.");
|
|
174
|
-
|
|
175
|
-
static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
|
|
176
|
-
"hip_bfloat16 is not a trivial type, and thus is "
|
|
177
|
-
"incompatible with C.");
|
|
178
|
-
#if !defined(__HIPCC_RTC__)
|
|
179
|
-
static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
|
|
180
|
-
&& offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
|
|
181
|
-
"internal hip_bfloat16 does not match public hip_bfloat16");
|
|
182
|
-
|
|
183
|
-
inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
|
|
184
|
-
{
|
|
185
|
-
return os << float(bf16);
|
|
186
|
-
}
|
|
187
|
-
#endif
|
|
188
|
-
|
|
189
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
|
|
190
|
-
{
|
|
191
|
-
return a;
|
|
192
|
-
}
|
|
193
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a)
|
|
194
|
-
{
|
|
195
|
-
a.data ^= 0x8000;
|
|
196
|
-
return a;
|
|
197
|
-
}
|
|
198
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
|
|
199
|
-
{
|
|
200
|
-
return hip_bfloat16(float(a) + float(b));
|
|
201
|
-
}
|
|
202
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
|
|
203
|
-
{
|
|
204
|
-
return hip_bfloat16(float(a) - float(b));
|
|
205
|
-
}
|
|
206
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
|
|
207
|
-
{
|
|
208
|
-
return hip_bfloat16(float(a) * float(b));
|
|
209
|
-
}
|
|
210
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
|
|
211
|
-
{
|
|
212
|
-
return hip_bfloat16(float(a) / float(b));
|
|
213
|
-
}
|
|
214
|
-
inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b)
|
|
215
|
-
{
|
|
216
|
-
return float(a) < float(b);
|
|
217
|
-
}
|
|
218
|
-
inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b)
|
|
219
|
-
{
|
|
220
|
-
return float(a) == float(b);
|
|
221
|
-
}
|
|
222
|
-
inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b)
|
|
223
|
-
{
|
|
224
|
-
return b < a;
|
|
225
|
-
}
|
|
226
|
-
inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
|
|
227
|
-
{
|
|
228
|
-
return !(a > b);
|
|
229
|
-
}
|
|
230
|
-
inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
|
|
231
|
-
{
|
|
232
|
-
return !(a == b);
|
|
233
|
-
}
|
|
234
|
-
inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
|
|
235
|
-
{
|
|
236
|
-
return !(a < b);
|
|
237
|
-
}
|
|
238
|
-
inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
|
|
239
|
-
{
|
|
240
|
-
return a = a + b;
|
|
241
|
-
}
|
|
242
|
-
inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
|
|
243
|
-
{
|
|
244
|
-
return a = a - b;
|
|
245
|
-
}
|
|
246
|
-
inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
|
|
247
|
-
{
|
|
248
|
-
return a = a * b;
|
|
249
|
-
}
|
|
250
|
-
inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
|
|
251
|
-
{
|
|
252
|
-
return a = a / b;
|
|
253
|
-
}
|
|
254
|
-
inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a)
|
|
255
|
-
{
|
|
256
|
-
return a += hip_bfloat16(1.0f);
|
|
257
|
-
}
|
|
258
|
-
inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a)
|
|
259
|
-
{
|
|
260
|
-
return a -= hip_bfloat16(1.0f);
|
|
261
|
-
}
|
|
262
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int)
|
|
263
|
-
{
|
|
264
|
-
hip_bfloat16 orig = a;
|
|
265
|
-
++a;
|
|
266
|
-
return orig;
|
|
267
|
-
}
|
|
268
|
-
inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int)
|
|
269
|
-
{
|
|
270
|
-
hip_bfloat16 orig = a;
|
|
271
|
-
--a;
|
|
272
|
-
return orig;
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
namespace std
|
|
276
|
-
{
|
|
277
|
-
constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a)
|
|
278
|
-
{
|
|
279
|
-
return !(~a.data & 0x7f80) && !(a.data & 0x7f);
|
|
280
|
-
}
|
|
281
|
-
constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a)
|
|
282
|
-
{
|
|
283
|
-
return !(~a.data & 0x7f80) && +(a.data & 0x7f);
|
|
284
|
-
}
|
|
285
|
-
constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a)
|
|
286
|
-
{
|
|
287
|
-
return !(a.data & 0x7fff);
|
|
288
|
-
}
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
#endif // __cplusplus < 201103L || !defined(__HIPCC__)
|
|
292
|
-
|
|
293
|
-
#endif // _HIP_BFLOAT16_H_
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
|
3
|
-
|
|
4
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
5
|
-
this software and associated documentation files (the "Software"), to deal in
|
|
6
|
-
the Software without restriction, including without limitation the rights to
|
|
7
|
-
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
8
|
-
of the Software, and to permit persons to whom the Software is furnished to do
|
|
9
|
-
so, subject to the following conditions:
|
|
10
|
-
|
|
11
|
-
The above copyright notice and this permission notice shall be included in all
|
|
12
|
-
copies or substantial portions of the Software.
|
|
13
|
-
|
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
17
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
18
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
19
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
20
|
-
SOFTWARE.
|
|
21
|
-
*/
|
|
22
|
-
|
|
23
|
-
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
|
|
24
|
-
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
|
|
25
|
-
|
|
26
|
-
#if defined(__clang__) && defined(__HIP__)
|
|
27
|
-
#define __HIP_CLANG_ONLY__ 1
|
|
28
|
-
#else
|
|
29
|
-
#define __HIP_CLANG_ONLY__ 0
|
|
30
|
-
#endif
|
|
31
|
-
|
|
32
|
-
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
|
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
3
|
-
|
|
4
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
5
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
6
|
-
in the Software without restriction, including without limitation the rights
|
|
7
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
9
|
-
furnished to do so, subject to the following conditions:
|
|
10
|
-
|
|
11
|
-
The above copyright notice and this permission notice shall be included in
|
|
12
|
-
all copies or substantial portions of the Software.
|
|
13
|
-
|
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
17
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
18
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
19
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
20
|
-
THE SOFTWARE.
|
|
21
|
-
*/
|
|
22
|
-
|
|
23
|
-
/* The header defines complex numbers and related functions*/
|
|
24
|
-
|
|
25
|
-
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
|
26
|
-
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
|
27
|
-
|
|
28
|
-
#if !defined(__HIPCC_RTC__)
|
|
29
|
-
#include "hip/amd_detail/amd_hip_vector_types.h"
|
|
30
|
-
#endif
|
|
31
|
-
|
|
32
|
-
#if defined(__HIPCC_RTC__)
|
|
33
|
-
#define __HOST_DEVICE__ __device__
|
|
34
|
-
#else
|
|
35
|
-
#define __HOST_DEVICE__ __host__ __device__
|
|
36
|
-
// TODO: Clang has a bug which allows device functions to call std functions
|
|
37
|
-
// when std functions are introduced into default namespace by using statement.
|
|
38
|
-
// math.h may be included after this bug is fixed.
|
|
39
|
-
#if __cplusplus
|
|
40
|
-
#include <cmath>
|
|
41
|
-
#else
|
|
42
|
-
#include "math.h"
|
|
43
|
-
#endif
|
|
44
|
-
#endif // !defined(__HIPCC_RTC__)
|
|
45
|
-
|
|
46
|
-
typedef float2 hipFloatComplex;
|
|
47
|
-
|
|
48
|
-
__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
|
49
|
-
|
|
50
|
-
__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
|
51
|
-
|
|
52
|
-
__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
|
53
|
-
hipFloatComplex z;
|
|
54
|
-
z.x = a;
|
|
55
|
-
z.y = b;
|
|
56
|
-
return z;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
|
60
|
-
hipFloatComplex ret;
|
|
61
|
-
ret.x = z.x;
|
|
62
|
-
ret.y = -z.y;
|
|
63
|
-
return ret;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
|
|
67
|
-
return z.x * z.x + z.y * z.y;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
|
71
|
-
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
|
75
|
-
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
|
79
|
-
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
|
83
|
-
float sqabs = hipCsqabsf(q);
|
|
84
|
-
hipFloatComplex ret;
|
|
85
|
-
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
|
86
|
-
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
|
87
|
-
return ret;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
typedef double2 hipDoubleComplex;
|
|
94
|
-
|
|
95
|
-
__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
|
|
96
|
-
|
|
97
|
-
__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
|
98
|
-
|
|
99
|
-
__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
|
100
|
-
hipDoubleComplex z;
|
|
101
|
-
z.x = a;
|
|
102
|
-
z.y = b;
|
|
103
|
-
return z;
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
|
107
|
-
hipDoubleComplex ret;
|
|
108
|
-
ret.x = z.x;
|
|
109
|
-
ret.y = -z.y;
|
|
110
|
-
return ret;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
|
|
114
|
-
return z.x * z.x + z.y * z.y;
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
|
118
|
-
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
|
122
|
-
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
|
126
|
-
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
|
130
|
-
double sqabs = hipCsqabs(q);
|
|
131
|
-
hipDoubleComplex ret;
|
|
132
|
-
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
|
133
|
-
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
|
134
|
-
return ret;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
|
|
138
|
-
|
|
139
|
-
typedef hipFloatComplex hipComplex;
|
|
140
|
-
|
|
141
|
-
__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
|
|
142
|
-
return make_hipFloatComplex(x, y);
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
|
|
146
|
-
return make_hipFloatComplex((float)z.x, (float)z.y);
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
|
|
150
|
-
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
|
|
154
|
-
float real = (p.x * q.x) + r.x;
|
|
155
|
-
float imag = (q.x * p.y) + r.y;
|
|
156
|
-
|
|
157
|
-
real = -(p.y * q.y) + real;
|
|
158
|
-
imag = (p.x * q.y) + imag;
|
|
159
|
-
|
|
160
|
-
return make_hipComplex(real, imag);
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
|
|
164
|
-
hipDoubleComplex r) {
|
|
165
|
-
double real = (p.x * q.x) + r.x;
|
|
166
|
-
double imag = (q.x * p.y) + r.y;
|
|
167
|
-
|
|
168
|
-
real = -(p.y * q.y) + real;
|
|
169
|
-
imag = (p.x * q.y) + imag;
|
|
170
|
-
|
|
171
|
-
return make_hipDoubleComplex(real, imag);
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|