triton-windows 3.2.0.post12__cp313-cp313-win_amd64.whl → 3.3.0a0.post12__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (68)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +3 -3
  3. triton/_internal_testing.py +59 -4
  4. triton/_utils.py +35 -0
  5. triton/backends/amd/compiler.py +121 -74
  6. triton/backends/amd/driver.py +77 -43
  7. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
  8. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
  9. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
  13. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
  15. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
  16. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
  17. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
  18. triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
  19. triton/backends/amd/include/hip/hip_ext.h +4 -2
  20. triton/backends/amd/include/hip/hip_fp8.h +33 -0
  21. triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
  22. triton/backends/amd/include/hip/hip_version.h +3 -3
  23. triton/backends/amd/include/hip/hiprtc.h +25 -25
  24. triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
  25. triton/backends/amd/include/hsa/hsa.h +11 -2
  26. triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
  27. triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
  28. triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
  29. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
  30. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
  31. triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
  32. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
  33. triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
  34. triton/backends/amd/lib/asanrtl.bc +0 -0
  35. triton/backends/compiler.py +25 -225
  36. triton/backends/driver.py +7 -2
  37. triton/backends/nvidia/bin/ptxas.exe +0 -0
  38. triton/backends/nvidia/compiler.py +135 -90
  39. triton/backends/nvidia/driver.c +0 -1
  40. triton/backends/nvidia/driver.py +135 -49
  41. triton/backends/nvidia/include/cuda.h +2162 -241
  42. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  43. triton/compiler/__init__.py +2 -2
  44. triton/compiler/code_generator.py +334 -231
  45. triton/compiler/compiler.py +77 -66
  46. triton/language/__init__.py +22 -5
  47. triton/language/core.py +448 -74
  48. triton/language/extra/cuda/_experimental_tma.py +3 -5
  49. triton/language/math.py +1 -1
  50. triton/language/random.py +2 -1
  51. triton/language/semantic.py +206 -52
  52. triton/language/standard.py +35 -18
  53. triton/runtime/_allocation.py +32 -0
  54. triton/runtime/autotuner.py +27 -32
  55. triton/runtime/build.py +1 -48
  56. triton/runtime/cache.py +6 -6
  57. triton/runtime/errors.py +10 -0
  58. triton/runtime/interpreter.py +179 -45
  59. triton/runtime/jit.py +149 -190
  60. triton/testing.py +39 -11
  61. triton/tools/compile.py +27 -20
  62. triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
  63. triton/tools/mxfp.py +301 -0
  64. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
  65. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
  66. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
  67. triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
  68. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
@@ -67,32 +67,32 @@ typedef enum hiprtcResult {
67
67
  */
68
68
 
69
69
  typedef enum hiprtcJIT_option {
70
- HIPRTC_JIT_MAX_REGISTERS = 0, ///< Maximum registers may be used in a thread, passed to compiler
71
- HIPRTC_JIT_THREADS_PER_BLOCK, ///< Number of thread per block
72
- HIPRTC_JIT_WALL_TIME, ///< Value for total wall clock time
73
- HIPRTC_JIT_INFO_LOG_BUFFER, ///< Pointer to the buffer with logged information
74
- HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES, ///< Size of the buffer in bytes for logged info
75
- HIPRTC_JIT_ERROR_LOG_BUFFER, ///< Pointer to the buffer with logged error(s)
76
- HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, ///< Size of the buffer in bytes for logged error(s)
77
- HIPRTC_JIT_OPTIMIZATION_LEVEL, ///< Value of optimization level for generated codes
78
- HIPRTC_JIT_TARGET_FROM_HIPCONTEXT, ///< The target context, which is the default
79
- HIPRTC_JIT_TARGET, ///< JIT target
80
- HIPRTC_JIT_FALLBACK_STRATEGY, ///< Fallback strategy
81
- HIPRTC_JIT_GENERATE_DEBUG_INFO, ///< Generate debug information
82
- HIPRTC_JIT_LOG_VERBOSE, ///< Generate log verbose
83
- HIPRTC_JIT_GENERATE_LINE_INFO, ///< Generate line number information
84
- HIPRTC_JIT_CACHE_MODE, ///< Set cache mode
85
- HIPRTC_JIT_NEW_SM3X_OPT, ///< @deprecated New SM3X option.
86
- HIPRTC_JIT_FAST_COMPILE, ///< Set fast compile
87
- HIPRTC_JIT_GLOBAL_SYMBOL_NAMES, ///< Array of device symbol names to be relocated to the host
88
- HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS, ///< Array of host addresses to be relocated to the device
89
- HIPRTC_JIT_GLOBAL_SYMBOL_COUNT, ///< Number of symbol count.
90
- HIPRTC_JIT_LTO, ///< @deprecated Enable link-time optimization for device code
91
- HIPRTC_JIT_FTZ, ///< @deprecated Set single-precision denormals.
92
- HIPRTC_JIT_PREC_DIV, ///< @deprecated Set single-precision floating-point division and
70
+ HIPRTC_JIT_MAX_REGISTERS = 0, ///< CUDA Only Maximum registers may be used in a thread, passed to compiler
71
+ HIPRTC_JIT_THREADS_PER_BLOCK, ///< CUDA Only Number of thread per block
72
+ HIPRTC_JIT_WALL_TIME, ///< CUDA Only Value for total wall clock time
73
+ HIPRTC_JIT_INFO_LOG_BUFFER, ///< CUDA Only Pointer to the buffer with logged information
74
+ HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES, ///< CUDA Only Size of the buffer in bytes for logged info
75
+ HIPRTC_JIT_ERROR_LOG_BUFFER, ///< CUDA Only Pointer to the buffer with logged error(s)
76
+ HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, ///< CUDA Only Size of the buffer in bytes for logged error(s)
77
+ HIPRTC_JIT_OPTIMIZATION_LEVEL, ///< Value of optimization level for generated codes, acceptable options -O0, -O1, -O2, -O3
78
+ HIPRTC_JIT_TARGET_FROM_HIPCONTEXT, ///< CUDA Only The target context, which is the default
79
+ HIPRTC_JIT_TARGET, ///< CUDA Only JIT target
80
+ HIPRTC_JIT_FALLBACK_STRATEGY, ///< CUDA Only Fallback strategy
81
+ HIPRTC_JIT_GENERATE_DEBUG_INFO, ///< CUDA Only Generate debug information
82
+ HIPRTC_JIT_LOG_VERBOSE, ///< CUDA Only Generate log verbose
83
+ HIPRTC_JIT_GENERATE_LINE_INFO, ///< CUDA Only Generate line number information
84
+ HIPRTC_JIT_CACHE_MODE, ///< CUDA Only Set cache mode
85
+ HIPRTC_JIT_NEW_SM3X_OPT, ///< @deprecated CUDA Only New SM3X option.
86
+ HIPRTC_JIT_FAST_COMPILE, ///< CUDA Only Set fast compile
87
+ HIPRTC_JIT_GLOBAL_SYMBOL_NAMES, ///< CUDA Only Array of device symbol names to be relocated to the host
88
+ HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS, ///< CUDA Only Array of host addresses to be relocated to the device
89
+ HIPRTC_JIT_GLOBAL_SYMBOL_COUNT, ///< CUDA Only Number of symbol count.
90
+ HIPRTC_JIT_LTO, ///< @deprecated CUDA Only Enable link-time optimization for device code
91
+ HIPRTC_JIT_FTZ, ///< @deprecated CUDA Only Set single-precision denormals.
92
+ HIPRTC_JIT_PREC_DIV, ///< @deprecated CUDA Only Set single-precision floating-point division and
93
93
  ///< reciprocals
94
- HIPRTC_JIT_PREC_SQRT, ///< @deprecated Set single-precision floating-point square root
95
- HIPRTC_JIT_FMA, ///< @deprecated Enable floating-point multiplies and adds/subtracts operations
94
+ HIPRTC_JIT_PREC_SQRT, ///< @deprecated CUDA Only Set single-precision floating-point square root
95
+ HIPRTC_JIT_FMA, ///< @deprecated CUDA Only Enable floating-point multiplies and adds/subtracts operations
96
96
  HIPRTC_JIT_NUM_OPTIONS, ///< Number of options
97
97
  HIPRTC_JIT_IR_TO_ISA_OPT_EXT = 10000, ///< Linker options to be passed on to compiler
98
98
  /// @note Only supported for the AMD platform.
@@ -75,7 +75,8 @@ enum {
75
75
  ELFABIVERSION_AMDGPU_HSA_V2 = 0,
76
76
  ELFABIVERSION_AMDGPU_HSA_V3 = 1,
77
77
  ELFABIVERSION_AMDGPU_HSA_V4 = 2,
78
- ELFABIVERSION_AMDGPU_HSA_V5 = 3
78
+ ELFABIVERSION_AMDGPU_HSA_V5 = 3,
79
+ ELFABIVERSION_AMDGPU_HSA_V6 = 4,
79
80
  };
80
81
 
81
82
  // AMDGPU specific e_flags.
@@ -87,6 +88,7 @@ enum : unsigned {
87
88
  EF_AMDGPU_MACH_NONE = 0x000,
88
89
 
89
90
  // AMDGCN-based processors.
91
+ // clang-format off
90
92
  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
91
93
  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
92
94
  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
@@ -127,14 +129,25 @@ enum : unsigned {
127
129
  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
128
130
  EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
129
131
  EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
132
+ EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048,
133
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049,
130
134
  EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a,
131
135
  EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b,
132
136
  EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
137
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d,
138
+ EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
133
139
  EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f,
140
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050,
141
+ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
142
+ EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
143
+ EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
144
+ EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
145
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055,
146
+ // clang-format on
134
147
 
135
148
  // First/last AMDGCN-based processors.
136
149
  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
137
- EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX942,
150
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC,
138
151
 
139
152
  // Indicates if the "xnack" target feature is enabled for all code contained
140
153
  // in the object.
@@ -160,8 +173,7 @@ enum : unsigned {
160
173
 
161
174
  // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values.
162
175
  //
163
- // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4,
164
- // ELFABIVERSION_AMDGPU_HSA_V5.
176
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
165
177
  EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
166
178
  // XNACK is not supported.
167
179
  EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
@@ -174,8 +186,7 @@ enum : unsigned {
174
186
 
175
187
  // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
176
188
  //
177
- // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4,
178
- // ELFABIVERSION_AMDGPU_HSA_V5.
189
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
179
190
  EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
180
191
  // SRAMECC is not supported.
181
192
  EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
@@ -185,6 +196,21 @@ enum : unsigned {
185
196
  EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
186
197
  // SRAMECC is on.
187
198
  EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
199
+
200
+ // Generic target versioning. This is contained in the list byte of EFLAGS.
201
+ EF_AMDGPU_GENERIC_VERSION = 0xff000000,
202
+ EF_AMDGPU_GENERIC_VERSION_OFFSET = 24,
203
+ EF_AMDGPU_GENERIC_VERSION_MIN = 1,
204
+ EF_AMDGPU_GENERIC_VERSION_MAX = 0xff,
205
+ };
206
+
207
+ // ELF Relocation types for AMDGPU.
208
+ enum : unsigned {
209
+ R_AMDGPU_ABS32_LO = 1,
210
+ R_AMDGPU_ABS32_HI = 2,
211
+ R_AMDGPU_ABS64 = 3,
212
+ R_AMDGPU_ABS32 = 6,
213
+ R_AMDGPU_RELATIVE64 = 13,
188
214
  };
189
215
 
190
216
  } // end namespace ELF
@@ -246,14 +272,14 @@ typedef enum {
246
272
  // ELF Symbol Flag Enumeration Values.
247
273
  #define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST
248
274
 
249
- // AMD GPU Relocation Type Enumeration Values.
250
- #define R_AMDGPU_NONE 0
251
- #define R_AMDGPU_32_LOW 1
252
- #define R_AMDGPU_32_HIGH 2
253
- #define R_AMDGPU_64 3
254
- #define R_AMDGPU_INIT_SAMPLER 4
255
- #define R_AMDGPU_INIT_IMAGE 5
256
- #define R_AMDGPU_RELATIVE64 13
275
+ // Legacy/V1 AMD GPU Relocation Type Enumeration Values.
276
+ #define R_AMDGPU_V1_NONE 0
277
+ #define R_AMDGPU_V1_32_LOW 1
278
+ #define R_AMDGPU_V1_32_HIGH 2
279
+ #define R_AMDGPU_V1_64 3
280
+ #define R_AMDGPU_V1_INIT_SAMPLER 4
281
+ #define R_AMDGPU_V1_INIT_IMAGE 5
282
+ #define R_AMDGPU_V1_RELATIVE64 13
257
283
 
258
284
  // AMD GPU Note Type Enumeration Values.
259
285
  #define NT_AMD_HSA_CODE_OBJECT_VERSION 1
@@ -598,10 +598,14 @@ typedef enum {
598
598
  * AqlProfile extension.
599
599
  */
600
600
  HSA_EXTENSION_AMD_AQLPROFILE = 0x202,
601
+ /**
602
+ * PC Sampling extension.
603
+ */
604
+ HSA_EXTENSION_AMD_PC_SAMPLING = 0x203,
601
605
  /**
602
606
  * Last AMD extension.
603
607
  */
604
- HSA_AMD_LAST_EXTENSION = 0x202
608
+ HSA_AMD_LAST_EXTENSION = 0x203
605
609
  } hsa_extension_t;
606
610
 
607
611
  /**
@@ -5656,7 +5660,12 @@ typedef enum {
5656
5660
  * undefined if the symbol is not an indirect function. The type of this
5657
5661
  * attribute is uint32_t.
5658
5662
  */
5659
- HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
5663
+ HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16,
5664
+ /**
5665
+ * Wavefront size used by the kernel. The value of this attribute is either
5666
+ * 32 or 64. The type of this attribute is uint32_t.
5667
+ */
5668
+ HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19
5660
5669
  } hsa_code_symbol_info_t;
5661
5670
 
5662
5671
  /**
@@ -44,39 +44,26 @@
44
44
  #define HSA_RUNTIME_INC_HSA_API_TRACE_H
45
45
 
46
46
  #include "hsa.h"
47
+ #include "hsa_api_trace_version.h"
47
48
  #ifdef AMD_INTERNAL_BUILD
48
49
  #include "hsa_ext_image.h"
49
50
  #include "hsa_ext_amd.h"
50
51
  #include "hsa_ext_finalize.h"
51
52
  #include "hsa_amd_tool.h"
53
+ #include "hsa_ven_amd_pc_sampling.h"
52
54
  #else
53
55
  #include "inc/hsa_ext_image.h"
54
56
  #include "inc/hsa_ext_amd.h"
55
57
  #include "inc/hsa_ext_finalize.h"
56
58
  #include "inc/hsa_amd_tool.h"
59
+ #include "inc/hsa_ven_amd_pc_sampling.h"
57
60
  #endif
58
61
 
59
62
  #include <string.h>
60
63
  #include <assert.h>
61
64
  #include <stddef.h>
62
65
 
63
- // Major Ids of the Api tables exported by Hsa Core Runtime
64
- #define HSA_API_TABLE_MAJOR_VERSION 0x03
65
- #define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02
66
- #define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02
67
- #define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02
68
- #define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02
69
- #define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01
70
- #define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01
71
-
72
- // Step Ids of the Api tables exported by Hsa Core Runtime
73
- #define HSA_API_TABLE_STEP_VERSION 0x00
74
- #define HSA_CORE_API_TABLE_STEP_VERSION 0x00
75
- #define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x01
76
- #define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00
77
- #define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00
78
- #define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00
79
- #define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00
66
+ // Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h
80
67
 
81
68
  // Min function used to copy Api Tables
82
69
  static inline uint32_t Min(const uint32_t a, const uint32_t b) {
@@ -191,6 +178,19 @@ struct ImageExtTable {
191
178
  decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn;
192
179
  };
193
180
 
181
+ // Table to export HSA PC Sampling Extension Apis
182
+ struct PcSamplingExtTable {
183
+ ApiTableVersion version;
184
+ decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn;
185
+ decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn;
186
+ decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn;
187
+ decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn;
188
+ decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn;
189
+ decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn;
190
+ decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn;
191
+ };
192
+
193
+
194
194
  // Table to export AMD Extension Apis
195
195
  struct AmdExtTable {
196
196
  ApiTableVersion version;
@@ -263,6 +263,8 @@ struct AmdExtTable {
263
263
  decltype(hsa_amd_vmem_get_alloc_properties_from_handle)*
264
264
  hsa_amd_vmem_get_alloc_properties_from_handle_fn;
265
265
  decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn;
266
+ decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn;
267
+ decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn;
266
268
  };
267
269
 
268
270
  // Table to export HSA Core Runtime Apis
@@ -464,6 +466,9 @@ struct HsaApiTable {
464
466
 
465
467
  // Table of function pointers for tools to use
466
468
  ToolsApiTable* tools_;
469
+
470
+ // Table of function pointers to AMD PC Sampling Extension
471
+ PcSamplingExtTable* pc_sampling_ext_;
467
472
  };
468
473
 
469
474
  // Structure containing instances of different api tables
@@ -474,6 +479,7 @@ struct HsaApiTableContainer {
474
479
  FinalizerExtTable finalizer_ext;
475
480
  ImageExtTable image_ext;
476
481
  ToolsApiTable tools;
482
+ PcSamplingExtTable pc_sampling_ext;
477
483
 
478
484
  // Default initialization of a container instance
479
485
  HsaApiTableContainer() {
@@ -505,6 +511,11 @@ struct HsaApiTableContainer {
505
511
  tools.version.minor_id = sizeof(ToolsApiTable);
506
512
  tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION;
507
513
  root.tools_ = &tools;
514
+
515
+ pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION;
516
+ pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable);
517
+ pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION;
518
+ root.pc_sampling_ext_ = &pc_sampling_ext;
508
519
  }
509
520
  };
510
521
 
@@ -562,5 +573,7 @@ static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) {
562
573
  copyElement(&dest->image_ext_->version, &src->image_ext_->version);
563
574
  if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id))
564
575
  copyElement(&dest->tools_->version, &src->tools_->version);
576
+ if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id))
577
+ copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version);
565
578
  }
566
579
  #endif
@@ -0,0 +1,68 @@
1
+ ////////////////////////////////////////////////////////////////////////////////
2
+ //
3
+ // The University of Illinois/NCSA
4
+ // Open Source License (NCSA)
5
+ //
6
+ // Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved.
7
+ //
8
+ // Developed by:
9
+ //
10
+ // AMD Research and AMD HSA Software Development
11
+ //
12
+ // Advanced Micro Devices, Inc.
13
+ //
14
+ // www.amd.com
15
+ //
16
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
17
+ // of this software and associated documentation files (the "Software"), to
18
+ // deal with the Software without restriction, including without limitation
19
+ // the rights to use, copy, modify, merge, publish, distribute, sublicense,
20
+ // and/or sell copies of the Software, and to permit persons to whom the
21
+ // Software is furnished to do so, subject to the following conditions:
22
+ //
23
+ // - Redistributions of source code must retain the above copyright notice,
24
+ // this list of conditions and the following disclaimers.
25
+ // - Redistributions in binary form must reproduce the above copyright
26
+ // notice, this list of conditions and the following disclaimers in
27
+ // the documentation and/or other materials provided with the distribution.
28
+ // - Neither the names of Advanced Micro Devices, Inc,
29
+ // nor the names of its contributors may be used to endorse or promote
30
+ // products derived from this Software without specific prior written
31
+ // permission.
32
+ //
33
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
36
+ // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
37
+ // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
38
+ // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
39
+ // DEALINGS WITH THE SOFTWARE.
40
+ //
41
+ ////////////////////////////////////////////////////////////////////////////////
42
+
43
+ #ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
44
+ #define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
45
+
46
+ // CODE IN THIS FILE **MUST** BE C-COMPATIBLE
47
+
48
+ // Major Ids of the Api tables exported by Hsa Core Runtime
49
+ #define HSA_API_TABLE_MAJOR_VERSION 0x03
50
+ #define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02
51
+ #define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02
52
+ #define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02
53
+ #define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02
54
+ #define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01
55
+ #define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01
56
+ #define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01
57
+
58
+ // Step Ids of the Api tables exported by Hsa Core Runtime
59
+ #define HSA_API_TABLE_STEP_VERSION 0x01
60
+ #define HSA_CORE_API_TABLE_STEP_VERSION 0x00
61
+ #define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x03
62
+ #define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00
63
+ #define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00
64
+ #define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00
65
+ #define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00
66
+ #define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00
67
+
68
+ #endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H
@@ -47,16 +47,19 @@
47
47
 
48
48
  #include "hsa.h"
49
49
  #include "hsa_ext_image.h"
50
+ #include "hsa_ven_amd_pc_sampling.h"
50
51
 
51
- /*
52
+ /**
52
53
  * - 1.0 - initial version
53
54
  * - 1.1 - dmabuf export
54
55
  * - 1.2 - hsa_amd_memory_async_copy_on_engine
55
56
  * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool
56
57
  * - 1.4 - Virtual Memory API
58
+ * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES
59
+ * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align
57
60
  */
58
61
  #define HSA_AMD_INTERFACE_VERSION_MAJOR 1
59
- #define HSA_AMD_INTERFACE_VERSION_MINOR 4
62
+ #define HSA_AMD_INTERFACE_VERSION_MINOR 6
60
63
 
61
64
  #ifdef __cplusplus
62
65
  extern "C" {
@@ -220,6 +223,11 @@ enum {
220
223
  * Exceeded number of VGPRs available on this agent
221
224
  */
222
225
  HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45,
226
+
227
+ /**
228
+ * Resource is busy or temporarily unavailable
229
+ */
230
+ HSA_STATUS_ERROR_RESOURCE_BUSY = 46,
223
231
  };
224
232
 
225
233
  /**
@@ -1175,7 +1183,11 @@ typedef enum hsa_amd_memory_pool_flag_s {
1175
1183
  * connection. Atomic memory operations on these memory buffers are not
1176
1184
  * guaranteed to be visible at system scope.
1177
1185
  */
1178
- HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1,
1186
+ HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0),
1187
+ /**
1188
+ * Allocates physically contiguous memory
1189
+ */
1190
+ HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1),
1179
1191
 
1180
1192
  } hsa_amd_memory_pool_flag_t;
1181
1193
 
@@ -2782,7 +2794,7 @@ hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* d
2782
2794
  */
2783
2795
  hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf);
2784
2796
 
2785
- /*
2797
+ /**
2786
2798
  * @brief Allocate a reserved address range
2787
2799
  *
2788
2800
  * Reserve a virtual address range. The size must be a multiple of the system page size.
@@ -2802,11 +2814,39 @@ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf);
2802
2814
  *
2803
2815
  * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
2804
2816
  * range of this size.
2817
+ *
2818
+ * Note that this API will be deprecated in a future release and replaced by
2819
+ * hsa_amd_vmem_address_reserve_align
2805
2820
  */
2806
2821
  hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address,
2807
2822
  uint64_t flags);
2808
2823
 
2809
- /*
2824
+ /**
2825
+ * @brief Allocate a reserved address range
2826
+ *
2827
+ * Reserve a virtual address range. The size must be a multiple of the system page size.
2828
+ * If it is not possible to allocate the address specified by @p address, then @p va will be
2829
+ * a different address range.
2830
+ * Address range should be released by calling hsa_amd_vmem_address_free.
2831
+ *
2832
+ * @param[out] va virtual address allocated
2833
+ * @param[in] size of address range requested
2834
+ * @param[in] address requested
2835
+ * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2
2836
+ * @param[in] flags currently unsupported
2837
+ *
2838
+ * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully
2839
+ *
2840
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
2841
+ * initialized.
2842
+ *
2843
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
2844
+ * range of this size.
2845
+ */
2846
+ hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address,
2847
+ uint64_t alignment, uint64_t flags);
2848
+
2849
+ /**
2810
2850
  * @brief Free a reserved address range
2811
2851
  *
2812
2852
  * Free a previously allocated address range. The size must match the size of a previously
@@ -2840,7 +2880,7 @@ typedef enum {
2840
2880
  MEMORY_TYPE_PINNED,
2841
2881
  } hsa_amd_memory_type_t;
2842
2882
 
2843
- /*
2883
+ /**
2844
2884
  * @brief Create a virtual memory handle
2845
2885
  *
2846
2886
  * Create a virtual memory handle within this pool
@@ -2869,7 +2909,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size,
2869
2909
  hsa_amd_memory_type_t type, uint64_t flags,
2870
2910
  hsa_amd_vmem_alloc_handle_t* memory_handle);
2871
2911
 
2872
- /*
2912
+ /**
2873
2913
  * @brief Release a virtual memory handle
2874
2914
  *
2875
2915
  * @param[in] memory handle that was previously allocated
@@ -2880,7 +2920,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size,
2880
2920
  */
2881
2921
  hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle);
2882
2922
 
2883
- /*
2923
+ /**
2884
2924
  * @brief Map a virtual memory handle
2885
2925
  *
2886
2926
  * Map a virtual memory handle to a reserved address range. The virtual address requested must be
@@ -2906,7 +2946,7 @@ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_hand
2906
2946
  hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset,
2907
2947
  hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags);
2908
2948
 
2909
- /*
2949
+ /**
2910
2950
  * @brief Unmap a virtual memory handle
2911
2951
  *
2912
2952
  * Unmap previously mapped virtual address range
@@ -2929,7 +2969,7 @@ typedef struct hsa_amd_memory_access_desc_s {
2929
2969
  hsa_agent_t agent_handle;
2930
2970
  } hsa_amd_memory_access_desc_t;
2931
2971
 
2932
- /*
2972
+ /**
2933
2973
  * @brief Make a memory mapping accessible
2934
2974
  *
2935
2975
  * Make previously mapped virtual address accessible to specific agents. @p size must be equal to
@@ -2958,7 +2998,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size,
2958
2998
  const hsa_amd_memory_access_desc_t* desc,
2959
2999
  size_t desc_cnt);
2960
3000
 
2961
- /*
3001
+ /**
2962
3002
  * @brief Get current access permissions for memory mapping
2963
3003
  *
2964
3004
  * Get access permissions for memory mapping for specific agent.
@@ -2979,7 +3019,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size,
2979
3019
  hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms,
2980
3020
  hsa_agent_t agent_handle);
2981
3021
 
2982
- /*
3022
+ /**
2983
3023
  * @brief Get an exportable shareable handle
2984
3024
  *
2985
3025
  * Get an exportable shareable handle for a memory_handle. This shareabl handle can then be used to
@@ -3002,7 +3042,7 @@ hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms,
3002
3042
  hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd,
3003
3043
  hsa_amd_vmem_alloc_handle_t handle,
3004
3044
  uint64_t flags);
3005
- /*
3045
+ /**
3006
3046
  * @brief Import a shareable handle
3007
3047
  *
3008
3048
  * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed
@@ -3022,7 +3062,7 @@ hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd,
3022
3062
  hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd,
3023
3063
  hsa_amd_vmem_alloc_handle_t* handle);
3024
3064
 
3025
- /*
3065
+ /**
3026
3066
  * @brief Returns memory handle for mapped memory
3027
3067
  *
3028
3068
  * Return a memory handle for previously mapped memory. The handle will be the same value of handle
@@ -3039,19 +3079,19 @@ hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd,
3039
3079
  hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle,
3040
3080
  void* addr);
3041
3081
 
3042
- /*
3043
- * @brief Returns the current allocation properties of a handle
3044
- *
3045
- * Returns the allocation properties of an existing handle
3046
- *
3047
- * @param[in] memory_handle memory handle to be queried
3048
- * @param[out] pool memory pool that owns this handle
3049
- * @param[out] memory type
3050
-
3051
- * @retval ::HSA_STATUS_SUCCESS
3052
- *
3053
- * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle
3054
- */
3082
+ /**
3083
+ * @brief Returns the current allocation properties of a handle
3084
+ *
3085
+ * Returns the allocation properties of an existing handle
3086
+ *
3087
+ * @param[in] memory_handle memory handle to be queried
3088
+ * @param[out] pool memory pool that owns this handle
3089
+ * @param[out] memory type
3090
+
3091
+ * @retval ::HSA_STATUS_SUCCESS
3092
+ *
3093
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle
3094
+ */
3055
3095
  hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle(
3056
3096
  hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool,
3057
3097
  hsa_amd_memory_type_t* type);
@@ -3083,6 +3123,22 @@ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle(
3083
3123
  */
3084
3124
  hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold);
3085
3125
 
3126
+ typedef enum {
3127
+ /*
3128
+ * Returns the agent that owns the underlying HW queue.
3129
+ * The type of this attribute is hsa_agent_t.
3130
+ */
3131
+ HSA_AMD_QUEUE_INFO_AGENT,
3132
+ /*
3133
+ * Returns the doorbell ID of the completion signal of the queue
3134
+ * The type of this attribute is uint64_t.
3135
+ */
3136
+ HSA_AMD_QUEUE_INFO_DOORBELL_ID,
3137
+ } hsa_queue_info_attribute_t;
3138
+
3139
+ hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
3140
+ void* value);
3141
+
3086
3142
  #ifdef __cplusplus
3087
3143
  } // end extern "C" block
3088
3144
  #endif