triton-windows 3.2.0.post12__cp312-cp312-win_amd64.whl → 3.3.0a0.post12__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (68) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +3 -3
  3. triton/_internal_testing.py +59 -4
  4. triton/_utils.py +35 -0
  5. triton/backends/amd/compiler.py +121 -74
  6. triton/backends/amd/driver.py +77 -43
  7. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
  8. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
  9. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
  13. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
  15. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
  16. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
  17. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
  18. triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
  19. triton/backends/amd/include/hip/hip_ext.h +4 -2
  20. triton/backends/amd/include/hip/hip_fp8.h +33 -0
  21. triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
  22. triton/backends/amd/include/hip/hip_version.h +3 -3
  23. triton/backends/amd/include/hip/hiprtc.h +25 -25
  24. triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
  25. triton/backends/amd/include/hsa/hsa.h +11 -2
  26. triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
  27. triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
  28. triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
  29. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
  30. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
  31. triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
  32. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
  33. triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
  34. triton/backends/amd/lib/asanrtl.bc +0 -0
  35. triton/backends/compiler.py +25 -225
  36. triton/backends/driver.py +7 -2
  37. triton/backends/nvidia/bin/ptxas.exe +0 -0
  38. triton/backends/nvidia/compiler.py +135 -90
  39. triton/backends/nvidia/driver.c +0 -1
  40. triton/backends/nvidia/driver.py +135 -49
  41. triton/backends/nvidia/include/cuda.h +2162 -241
  42. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  43. triton/compiler/__init__.py +2 -2
  44. triton/compiler/code_generator.py +334 -231
  45. triton/compiler/compiler.py +77 -66
  46. triton/language/__init__.py +22 -5
  47. triton/language/core.py +448 -74
  48. triton/language/extra/cuda/_experimental_tma.py +3 -5
  49. triton/language/math.py +1 -1
  50. triton/language/random.py +2 -1
  51. triton/language/semantic.py +206 -52
  52. triton/language/standard.py +35 -18
  53. triton/runtime/_allocation.py +32 -0
  54. triton/runtime/autotuner.py +27 -32
  55. triton/runtime/build.py +1 -48
  56. triton/runtime/cache.py +6 -6
  57. triton/runtime/errors.py +10 -0
  58. triton/runtime/interpreter.py +179 -45
  59. triton/runtime/jit.py +149 -190
  60. triton/testing.py +39 -11
  61. triton/tools/compile.py +27 -20
  62. triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
  63. triton/tools/mxfp.py +301 -0
  64. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
  65. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
  66. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
  67. /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
  68. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
@@ -102,7 +102,7 @@ typedef struct hipDeviceProp_t {
102
102
  char luid[8]; ///< 8-byte unique identifier. Only valid on windows
103
103
  unsigned int luidDeviceNodeMask; ///< LUID node mask
104
104
  size_t totalGlobalMem; ///< Size of global memory region (in bytes).
105
- size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes).
105
+ size_t sharedMemPerBlock; ///< Size of shared memory per block (in bytes).
106
106
  int regsPerBlock; ///< Registers per block.
107
107
  int warpSize; ///< Warp size.
108
108
  size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies
@@ -111,7 +111,8 @@ typedef struct hipDeviceProp_t {
111
111
  int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block.
112
112
  int maxGridSize[3]; ///< Max grid dimensions (XYZ).
113
113
  int clockRate; ///< Max clock frequency of the multiProcessors in khz.
114
- size_t totalConstMem; ///< Size of shared memory region (in bytes).
114
+ size_t totalConstMem; ///< Size of shared constant memory region on the device
115
+ ///< (in bytes).
115
116
  int major; ///< Major compute capability. On HCC, this is an approximation and features may
116
117
  ///< differ from CUDA CC. See the arch feature flags for portable ways to query
117
118
  ///< feature caps.
@@ -538,6 +539,12 @@ typedef enum hipDeviceAttribute_t {
538
539
  // Extended attributes for vendors
539
540
  } hipDeviceAttribute_t;
540
541
 
542
+ typedef enum hipDriverProcAddressQueryResult {
543
+ HIP_GET_PROC_ADDRESS_SUCCESS = 0,
544
+ HIP_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND = 1,
545
+ HIP_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2
546
+ } hipDriverProcAddressQueryResult;
547
+
541
548
  enum hipComputeMode {
542
549
  hipComputeModeDefault = 0,
543
550
  hipComputeModeExclusive = 1,
@@ -740,6 +747,9 @@ enum hipLimit_t {
740
747
  /** Memory allocated will be uncached. */
741
748
  #define hipDeviceMallocUncached 0x3
742
749
 
750
+ /** Memory allocated will be contiguous. */
751
+ #define hipDeviceMallocContiguous 0x4
752
+
743
753
  //Flags that can be used with hipHostRegister.
744
754
  /** Memory is Mapped and Portable.*/
745
755
  #define hipHostRegisterDefault 0x0
@@ -798,6 +808,8 @@ enum hipLimit_t {
798
808
  /** Implicit stream per application thread.*/
799
809
  #define hipStreamPerThread ((hipStream_t)2)
800
810
 
811
+ #define hipStreamLegacy ((hipStream_t)1)
812
+
801
813
  // Indicates that the external memory object is a dedicated resource
802
814
  #define hipExternalMemoryDedicated 0x1
803
815
  /**
@@ -973,7 +985,8 @@ typedef struct hipMemPoolProps {
973
985
  * Windows-specific LPSECURITYATTRIBUTES required when @p hipMemHandleTypeWin32 is specified
974
986
  */
975
987
  void* win32SecurityAttributes;
976
- unsigned char reserved[64]; ///< Reserved for future use, must be 0
988
+ size_t maxSize; ///< Maximum pool size. When set to 0, defaults to a system dependent value
989
+ unsigned char reserved[56]; ///< Reserved for future use, must be 0
977
990
  } hipMemPoolProps;
978
991
  /**
979
992
  * Opaque data structure for exporting a pool allocation
@@ -1269,13 +1282,7 @@ typedef struct hipMemAllocNodeParams {
1269
1282
  void* dptr; ///< Returned device address of the allocation
1270
1283
  } hipMemAllocNodeParams;
1271
1284
 
1272
- /**
1273
- * Kernel node attributeID
1274
- */
1275
- typedef enum hipKernelNodeAttrID {
1276
- hipKernelNodeAttributeAccessPolicyWindow = 1,
1277
- hipKernelNodeAttributeCooperative = 2,
1278
- } hipKernelNodeAttrID;
1285
+
1279
1286
  typedef enum hipAccessProperty {
1280
1287
  hipAccessPropertyNormal = 0,
1281
1288
  hipAccessPropertyStreaming = 1,
@@ -1288,10 +1295,39 @@ typedef struct hipAccessPolicyWindow {
1288
1295
  hipAccessProperty missProp;
1289
1296
  size_t num_bytes;
1290
1297
  } hipAccessPolicyWindow;
1291
- typedef union hipKernelNodeAttrValue {
1292
- hipAccessPolicyWindow accessPolicyWindow;
1293
- int cooperative;
1294
- } hipKernelNodeAttrValue;
1298
+
1299
+ /**
1300
+ * Launch Attribute ID
1301
+ */
1302
+ typedef enum hipLaunchAttributeID {
1303
+ hipLaunchAttributeAccessPolicyWindow = 1, /**< Valid for Streams, graph nodes, launches*/
1304
+ hipLaunchAttributeCooperative = 2, /**< Valid for graph nodes, launches */
1305
+ hipLaunchAttributePriority = 8, /**< Valid for graph node, streams, launches */
1306
+ } hipLaunchAttributeID;
1307
+
1308
+ /**
1309
+ * Launch Attribute Value
1310
+ */
1311
+ typedef union hipLaunchAttributeValue {
1312
+ hipAccessPolicyWindow accessPolicyWindow; /**< Value of launch attribute::
1313
+ hipLaunchAttributeAccessPolicyWindow. */
1314
+ int cooperative; /**< Value of launch attribute ::hipLaunchAttributeCooperative */
1315
+ int priority; /**< Value of launch attribute :: hipLaunchAttributePriority. Execution
1316
+ priority of kernel. */
1317
+ } hipLaunchAttributeValue;
1318
+
1319
+ /**
1320
+ * Kernel node attributeID
1321
+ */
1322
+ #define hipKernelNodeAttrID hipLaunchAttributeID
1323
+ #define hipKernelNodeAttributeAccessPolicyWindow hipLaunchAttributeAccessPolicyWindow
1324
+ #define hipKernelNodeAttributeCooperative hipLaunchAttributeCooperative
1325
+ #define hipKernelNodeAttributePriority hipLaunchAttributePriority
1326
+
1327
+ /**
1328
+ * Kernel node attribute value
1329
+ */
1330
+ #define hipKernelNodeAttrValue hipLaunchAttributeValue
1295
1331
 
1296
1332
  /**
1297
1333
  * Memset node params
@@ -1383,6 +1419,34 @@ enum hipGraphDebugDotFlags {
1383
1419
  hipGraphDebugDotFlagsHandles = 1
1384
1420
  << 10 /**< Adds node handles and every kernel function handle to output */
1385
1421
  };
1422
+
1423
+ /**
1424
+ * hipGraphInstantiateWithParams results
1425
+ */
1426
+ typedef enum hipGraphInstantiateResult {
1427
+ hipGraphInstantiateSuccess = 0, /**< Instantiation Success */
1428
+ hipGraphInstantiateError = 1, /**< Instantiation failed for an
1429
+ unexpected reason which is described in the return value of the function */
1430
+ hipGraphInstantiateInvalidStructure = 2, /**< Instantiation failed due
1431
+ to invalid structure, such as cycles */
1432
+ hipGraphInstantiateNodeOperationNotSupported = 3, /**< Instantiation for device launch failed
1433
+ because the graph contained an unsupported operation */
1434
+ hipGraphInstantiateMultipleDevicesNotSupported = 4, /**< Instantiation for device launch failed
1435
+ due to the nodes belonging to different contexts */
1436
+ }hipGraphInstantiateResult;
1437
+
1438
+ /**
1439
+ * Graph Instantiation parameters
1440
+ */
1441
+ typedef struct hipGraphInstantiateParams {
1442
+ hipGraphNode_t errNode_out; /**< The node which caused instantiation to fail, if any*/
1443
+ unsigned long long flags; /**< Instantiation flags */
1444
+ hipGraphInstantiateResult result_out; /**< Whether instantiation was successful.
1445
+ If it failed, the reason why */
1446
+ hipStream_t uploadStream; /**< Upload stream */
1447
+ } hipGraphInstantiateParams;
1448
+
1449
+
1386
1450
  /**
1387
1451
  * Memory allocation properties
1388
1452
  */
@@ -1557,6 +1621,44 @@ typedef struct hipGraphNodeParams {
1557
1621
 
1558
1622
  long long reserved2;
1559
1623
  } hipGraphNodeParams;
1624
+
1625
+ /**
1626
+ * This port activates when the kernel has finished executing.
1627
+ */
1628
+ #define hipGraphKernelNodePortDefault 0
1629
+
1630
+ /**
1631
+ * This port activates when all blocks of the kernel have begun execution.
1632
+ */
1633
+ #define hipGraphKernelNodePortLaunchCompletion 2
1634
+
1635
+ /**
1636
+ * This port activates when all blocks of the kernel have performed
1637
+ * hipTriggerProgrammaticLaunchCompletion() or have terminated.
1638
+ * It must be used with edge type hipGraphDependencyTypeProgrammatic.
1639
+ */
1640
+ #define hipGraphKernelNodePortProgrammatic 1
1641
+
1642
+ typedef enum hipGraphDependencyType {
1643
+ hipGraphDependencyTypeDefault = 0,
1644
+ hipGraphDependencyTypeProgrammatic = 1
1645
+ }hipGraphDependencyType;
1646
+
1647
+ typedef struct hipGraphEdgeData {
1648
+ unsigned char
1649
+ from_port; ///< This indicates when the dependency is triggered from the upstream node on the
1650
+ ///< edge. The meaning is specific to the node type. A value of 0 in all cases
1651
+ ///< means full completion of the upstream node, with memory visibility to the
1652
+ ///< downstream node or portion thereof (indicated by to_port). Only kernel nodes
1653
+ ///< define non-zero ports. A kernel node can use the following output port types:
1654
+ ///< hipGraphKernelNodePortDefault, hipGraphKernelNodePortProgrammatic, or
1655
+ ///< hipGraphKernelNodePortLaunchCompletion.
1656
+ unsigned char reserved[5]; ///< These bytes are unused and must be zeroed
1657
+ unsigned char
1658
+ to_port; ///< Currently no node types define non-zero ports. This field must be set to zero.
1659
+ unsigned char type; ///< This should be populated with a value from hipGraphDependencyType
1660
+ } hipGraphEdgeData;
1661
+
1560
1662
  // Doxygen end group GlobalDefs
1561
1663
  /**
1562
1664
  * @}
@@ -1585,6 +1687,7 @@ typedef struct hipGraphNodeParams {
1585
1687
  */
1586
1688
  // TODO-ctx - more description on error codes.
1587
1689
  hipError_t hipInit(unsigned int flags);
1690
+
1588
1691
  /**
1589
1692
  * @brief Returns the approximate HIP driver version.
1590
1693
  *
@@ -1755,6 +1858,18 @@ hipError_t hipDeviceReset(void);
1755
1858
  * @see #hipGetDevice, #hipGetDeviceCount
1756
1859
  */
1757
1860
  hipError_t hipSetDevice(int deviceId);
1861
+ /**
1862
+ * @brief Set a list of devices that can be used.
1863
+ *
1864
+ * @param[in] device_arr List of devices to try
1865
+ * @param[in] len Number of devices in specified list
1866
+ *
1867
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
1868
+ *
1869
+ * @see #hipGetDevice, #hipGetDeviceCount, #hipSetDevice, #hipGetDeviceProperties, #hipSetDeviceFlags, #hipChooseDevice
1870
+ *
1871
+ * */
1872
+ hipError_t hipSetValidDevices(int* device_arr, int len);
1758
1873
  /**
1759
1874
  * @brief Return the default device id for the calling host thread.
1760
1875
  *
@@ -2100,7 +2215,7 @@ hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
2100
2215
  /**
2101
2216
  * @brief Opens an interprocess event handle.
2102
2217
  *
2103
- * Opens an interprocess event handle exported from another process with cudaIpcGetEventHandle. The returned
2218
+ * Opens an interprocess event handle exported from another process with hipIpcGetEventHandle. The returned
2104
2219
  * hipEvent_t behaves like a locally created event with the hipEventDisableTiming flag specified. This event
2105
2220
  * need be freed with hipEventDestroy. Operations on the imported event after the exported event has been freed
2106
2221
  * with hipEventDestroy will result in undefined behavior. If the function is called within the same process where
@@ -2276,7 +2391,7 @@ hipError_t hipDrvGetErrorString(hipError_t hipError, const char** errorString);
2276
2391
  * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
2277
2392
  * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
2278
2393
  * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
2279
- * used by the stream, applicaiton must call hipStreamDestroy.
2394
+ * used by the stream, application must call hipStreamDestroy.
2280
2395
  *
2281
2396
  * @return #hipSuccess, #hipErrorInvalidValue
2282
2397
  *
@@ -2293,7 +2408,7 @@ hipError_t hipStreamCreate(hipStream_t* stream);
2293
2408
  * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
2294
2409
  * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
2295
2410
  * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
2296
- * used by the stream, applicaiton must call hipStreamDestroy. Flags controls behavior of the
2411
+ * used by the stream, application must call hipStreamDestroy. Flags controls behavior of the
2297
2412
  * stream. See #hipStreamDefault, #hipStreamNonBlocking.
2298
2413
  *
2299
2414
  *
@@ -2311,7 +2426,7 @@ hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
2311
2426
  * Create a new asynchronous stream with the specified priority. @p stream returns an opaque handle
2312
2427
  * that can be used to reference the newly created stream in subsequent hipStream* commands. The
2313
2428
  * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
2314
- * To release the memory used by the stream, applicaiton must call hipStreamDestroy. Flags controls
2429
+ * To release the memory used by the stream, application must call hipStreamDestroy. Flags controls
2315
2430
  * behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking.
2316
2431
  *
2317
2432
  *
@@ -2329,7 +2444,7 @@ hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags,
2329
2444
  * and greatest stream priority respectively. Stream priorities follow a convention where lower numbers
2330
2445
  * imply greater priorities. The range of meaningful stream priorities is given by
2331
2446
  * [*greatestPriority, *leastPriority]. If the user attempts to create a stream with a priority value
2332
- * that is outside the the meaningful range as specified by this API, the priority is automatically
2447
+ * that is outside the meaningful range as specified by this API, the priority is automatically
2333
2448
  * clamped to within the valid range.
2334
2449
  */
2335
2450
  hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
@@ -2401,8 +2516,8 @@ hipError_t hipStreamSynchronize(hipStream_t stream);
2401
2516
  * All future work submitted to @p stream will wait until @p event reports completion before
2402
2517
  * beginning execution.
2403
2518
  *
2404
- * This function only waits for commands in the current stream to complete. Notably,, this function
2405
- * does not impliciy wait for commands in the default stream to complete, even if the specified
2519
+ * This function only waits for commands in the current stream to complete. Notably, this function
2520
+ * does not implicitly wait for commands in the default stream to complete, even if the specified
2406
2521
  * stream is created with hipStreamNonBlocking = 0.
2407
2522
  *
2408
2523
  * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamDestroy
@@ -2688,7 +2803,7 @@ hipError_t hipEventCreate(hipEvent_t* event);
2688
2803
  *
2689
2804
  * If hipEventRecord() has been previously called on this event, then this call will overwrite any
2690
2805
  * existing state in event.
2691
- *
2806
+ *
2692
2807
  * If this function is called on an event that is currently being recorded, results are undefined
2693
2808
  * - either outstanding recording may save state into the event, and the order is not guaranteed.
2694
2809
  *
@@ -2730,7 +2845,6 @@ hipError_t hipEventDestroy(hipEvent_t event);
2730
2845
  * If hipEventRecord() has not been called on @p event, this function returns #hipSuccess when no
2731
2846
  * event is captured.
2732
2847
  *
2733
- * This function needs to support hipEventBlockingSync parameter.
2734
2848
  *
2735
2849
  * @param[in] event Event on which to wait.
2736
2850
  *
@@ -3252,7 +3366,7 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream,
3252
3366
  *
3253
3367
  * Inserts a memory allocation operation into @p stream.
3254
3368
  * A pointer to the allocated memory is returned immediately in *dptr.
3255
- * The allocation must not be accessed until the the allocation operation completes.
3369
+ * The allocation must not be accessed until the allocation operation completes.
3256
3370
  * The allocation comes from the memory pool associated with the stream's device.
3257
3371
  *
3258
3372
  * @note The default memory pool of a device contains device memory from that device.
@@ -3504,7 +3618,7 @@ hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool);
3504
3618
  *
3505
3619
  * Inserts an allocation operation into @p stream.
3506
3620
  * A pointer to the allocated memory is returned immediately in @p dev_ptr.
3507
- * The allocation must not be accessed until the the allocation operation completes.
3621
+ * The allocation must not be accessed until the allocation operation completes.
3508
3622
  * The allocation comes from the specified memory pool.
3509
3623
  *
3510
3624
  * @note The specified memory pool may be from a device different than that of the specified @p stream.
@@ -3915,6 +4029,68 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
3915
4029
  * hipMemHostAlloc, hipMemHostGetDevicePointer
3916
4030
  */
3917
4031
  hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
4032
+ /**
4033
+ * @brief Copies from one 1D array to device memory.
4034
+ *
4035
+ * @param[out] dstDevice Destination device pointer
4036
+ * @param[in] srcArray Source array
4037
+ * @param[in] srcOffset Offset in bytes of source array
4038
+ * @param[in] ByteCount Size of memory copy in bytes
4039
+ *
4040
+ * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
4041
+ * #hipErrorInvalidValue
4042
+ *
4043
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
4044
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
4045
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
4046
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
4047
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
4048
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
4049
+ */
4050
+ hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray, size_t srcOffset,
4051
+ size_t ByteCount);
4052
+ /**
4053
+ * @brief Copies from device memory to a 1D array.
4054
+ *
4055
+ * @param[out] dstArray Destination array
4056
+ * @param[in] dstOffset Offset in bytes of destination array
4057
+ * @param[in] srcDevice Source device pointer
4058
+ * @param[in] ByteCount Size of memory copy in bytes
4059
+ *
4060
+ * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
4061
+ * #hipErrorInvalidValue
4062
+ *
4063
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
4064
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
4065
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
4066
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
4067
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
4068
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
4069
+ */
4070
+ hipError_t hipMemcpyDtoA(hipArray_t dstArray, size_t dstOffset, hipDeviceptr_t srcDevice,
4071
+ size_t ByteCount);
4072
+
4073
+ /**
4074
+ * @brief Copies from one 1D array to another.
4075
+ *
4076
+ * @param[out] dstArray Destination array
4077
+ * @param[in] dstOffset Offset in bytes of destination array
4078
+ * @param[in] srcArray Source array
4079
+ * @param[in] srcOffset Offset in bytes of source array
4080
+ * @param[in] ByteCount Size of memory copy in bytes
4081
+ *
4082
+ * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
4083
+ * #hipErrorInvalidValue
4084
+ *
4085
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
4086
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
4087
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
4088
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
4089
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
4090
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
4091
+ */
4092
+ hipError_t hipMemcpyAtoA(hipArray_t dstArray, size_t dstOffset, hipArray_t srcArray,
4093
+ size_t srcOffset, size_t ByteCount);
3918
4094
  /**
3919
4095
  * @brief Copy data from Host to Device asynchronously
3920
4096
  *
@@ -3973,7 +4149,48 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h
3973
4149
  */
3974
4150
  hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
3975
4151
  hipStream_t stream);
3976
-
4152
+ /**
4153
+ * @brief Copies from one 1D array to host memory.
4154
+ *
4155
+ * @param[out] dstHost Destination pointer
4156
+ * @param[in] srcArray Source array
4157
+ * @param[in] srcOffset Offset in bytes of source array
4158
+ * @param[in] ByteCount Size of memory copy in bytes
4159
+ * @param[in] stream Stream identifier
4160
+ *
4161
+ * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
4162
+ * #hipErrorInvalidValue
4163
+ *
4164
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
4165
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
4166
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
4167
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
4168
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
4169
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
4170
+ */
4171
+ hipError_t hipMemcpyAtoHAsync(void* dstHost, hipArray_t srcArray, size_t srcOffset,
4172
+ size_t ByteCount, hipStream_t stream);
4173
+ /**
4174
+ * @brief Copies from host memory to a 1D array.
4175
+ *
4176
+ * @param[out] dstArray Destination array
4177
+ * @param[in] dstOffset Offset in bytes of destination array
4178
+ * @param[in] srcHost Source host pointer
4179
+ * @param[in] ByteCount Size of memory copy in bytes
4180
+ * @param[in] stream Stream identifier
4181
+ *
4182
+ * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
4183
+ * #hipErrorInvalidValue
4184
+ *
4185
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
4186
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
4187
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
4188
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
4189
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
4190
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
4191
+ */
4192
+ hipError_t hipMemcpyHtoAAsync(hipArray_t dstArray, size_t dstOffset, const void* srcHost,
4193
+ size_t ByteCount, hipStream_t stream);
3977
4194
  /**
3978
4195
  * @brief Returns a global pointer from a module.
3979
4196
  * Returns in *dptr and *bytes the pointer and size of the global of name name located in module hmod.
@@ -4002,6 +4219,8 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
4002
4219
  */
4003
4220
  hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol);
4004
4221
 
4222
+
4223
+
4005
4224
  /**
4006
4225
  * @brief Gets the size of the given symbol on the device.
4007
4226
  *
@@ -4013,14 +4232,38 @@ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol);
4013
4232
  */
4014
4233
  hipError_t hipGetSymbolSize(size_t* size, const void* symbol);
4015
4234
 
4235
+ /**
4236
+ * @brief Gets the pointer of requested HIP driver function.
4237
+ *
4238
+ * @param[in] symbol The Symbol name of the driver function to request.
4239
+ * @param[out] pfn Output pointer to the requested driver function.
4240
+ * @param[in] hipVersion The HIP version for the requested driver function symbol.
4241
+ * HIP version is defined as 100*version_major + version_minor. For example, in HIP 6.1, the
4242
+ * hipversion is 601, for the symbol function "hipGetDeviceProperties", the specified hipVersion 601
4243
+ * is greater than or equal to the version 600, the symbol function will be handled properly as backend
4244
+ * compatible function.
4245
+ *
4246
+ * @param[in] flags Currently only default flag is supported.
4247
+ * @param[out] symbolStatus Optional enumeration for returned status of searching for symbol driver
4248
+ * function based on the input hipVersion.
4249
+ *
4250
+ * Returns hipSuccess if the returned pfn is addressed to the pointer of found driver function.
4251
+ *
4252
+ * @return #hipSuccess, #hipErrorInvalidValue.
4253
+ */
4254
+ hipError_t hipGetProcAddress(const char* symbol, void** pfn, int hipVersion, uint64_t flags,
4255
+ hipDriverProcAddressQueryResult* symbolStatus);
4256
+
4016
4257
  /**
4017
4258
  * @brief Copies data to the given symbol on the device.
4018
4259
  * Symbol HIP APIs allow a kernel to define a device-side data symbol which can be accessed on
4019
4260
  * the host side. The symbol can be in __constant or device space.
4020
4261
  * Note that the symbol name needs to be encased in the HIP_SYMBOL macro.
4021
4262
  * This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize.
4022
- * For detail usage, see the example at
4023
- * https://github.com/ROCm/HIP/blob/develop/docs/user_guide/hip_porting_guide.md
4263
+ * For detailed usage, see the
4264
+ * <a href="https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_porting_guide.html#memcpytosymbol">memcpyToSymbol example</a>
4265
+ * in the HIP Porting Guide.
4266
+ *
4024
4267
  *
4025
4268
  * @param[out] symbol pointer to the device symbol
4026
4269
  * @param[in] src pointer to the source address
@@ -4520,6 +4763,27 @@ hipError_t hipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, co
4520
4763
  hipError_t hipMemcpy2DToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
4521
4764
  size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
4522
4765
  hipStream_t stream __dparm(0));
4766
+ /**
4767
+ * @brief Copies data between host and device.
4768
+ *
4769
+ * @param[in] dst Destination memory address
4770
+ * @param[in] wOffsetDst Destination starting X offset
4771
+ * @param[in] hOffsetDst Destination starting Y offset
4772
+ * @param[in] src Source memory address
4773
+ * @param[in] wOffsetSrc Source starting X offset
4774
+ * @param[in] hOffsetSrc Source starting Y offset (columns in bytes)
4775
+ * @param[in] width Width of matrix transfer (columns in bytes)
4776
+ * @param[in] height Height of matrix transfer (rows)
4777
+ * @param[in] kind Type of transfer
4778
+ *
4779
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidMemcpyDirection
4780
+ *
4781
+ * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
4782
+ * hipMemcpyAsync
4783
+ */
4784
+ hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
4785
+ hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc,
4786
+ size_t width, size_t height, hipMemcpyKind kind);
4523
4787
  /**
4524
4788
  * @brief Copies data between host and device.
4525
4789
  *
@@ -4734,7 +4998,7 @@ hipError_t hipDeviceDisablePeerAccess(int peerDeviceId);
4734
4998
  * @param [out] psize - Size of allocation
4735
4999
  * @param [in] dptr- Device Pointer
4736
5000
  *
4737
- * @returns #hipSuccess, #hipErrorInvalidDevicePointer
5001
+ * @returns #hipSuccess, #hipErrorNotFound
4738
5002
  *
4739
5003
  * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
4740
5004
  * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
@@ -5225,6 +5489,16 @@ hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func
5225
5489
  * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction
5226
5490
  */
5227
5491
  hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc);
5492
+ /**
5493
+ * @brief Gets pointer to device entry function that matches entry function symbolPtr.
5494
+ *
5495
+ * @param [out] functionPtr Device entry function
5496
+ * @param [in] symbolPtr Pointer to device entry function to search for
5497
+ *
5498
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction
5499
+ *
5500
+ */
5501
+ hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr);
5228
5502
  /**
5229
5503
  * @brief returns the handle of the texture reference with the name from the module.
5230
5504
  *
@@ -5646,12 +5920,26 @@ hipError_t hipLaunchKernel(const void* function_address,
5646
5920
  /**
5647
5921
  * @brief Enqueues a host function call in a stream.
5648
5922
  *
5649
- * @param [in] stream - stream to enqueue work to.
5650
- * @param [in] fn - function to call once operations enqueued preceeding are complete.
5923
+ * @param [in] stream - The stream to enqueue work in.
5924
+ * @param [in] fn - The function to call once enqueued preceding operations are complete.
5651
5925
  * @param [in] userData - User-specified data to be passed to the function.
5926
+ *
5652
5927
  * @returns #hipSuccess, #hipErrorInvalidResourceHandle, #hipErrorInvalidValue,
5653
5928
  * #hipErrorNotSupported
5654
- * @warning : This API is marked as beta, meaning, while this is feature complete,
5929
+ *
5930
+ * The host function to call in this API will be executed after the preceding operations in
5931
+ * the stream are complete. The function is a blocking operation that blocks operations in the
5932
+ * stream that follow it, until the function returns.
5933
+ * Event synchronization and internal callback functions make sure enqueued operations will
5934
+ * execute in order, in the stream.
5935
+ *
5936
+ * The host function must not make any HIP API calls. The host function is non-reentrant. It must
5937
+ * not perform synchronization with any operation that may depend on other processing execution
5938
+ * but is not enqueued to run earlier in the stream.
5939
+ *
5940
+ * Host functions that are enqueued respectively in different non-blocking streams can run concurrently.
5941
+ *
5942
+ * @warning This API is marked as beta, meaning, while this is feature complete,
5655
5943
  * it is still open to changes and may have outstanding issues.
5656
5944
  */
5657
5945
  hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData);
@@ -6181,7 +6469,7 @@ hipError_t hipGetTextureAlignmentOffset(
6181
6469
  DEPRECATED(DEPRECATED_MSG)
6182
6470
  hipError_t hipUnbindTexture(const textureReference* tex);
6183
6471
  /**
6184
- * @brief Gets the the address for a texture reference.
6472
+ * @brief Gets the address for a texture reference.
6185
6473
  *
6186
6474
  * @param [out] dev_ptr Pointer of device address.
6187
6475
  * @param [in] texRef Pointer of texture reference.
@@ -6564,6 +6852,30 @@ int hipGetStreamDeviceId(hipStream_t stream);
6564
6852
  */
6565
6853
  hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode);
6566
6854
 
6855
+ /**
6856
+ * @brief Begins graph capture on a stream to an existing graph.
6857
+ *
6858
+ * @param [in] stream - Stream to initiate capture.
6859
+ * @param [in] graph - Graph to capture into.
6860
+ * @param [in] dependencies - Dependencies of the first node captured in the stream. Can be NULL if
6861
+ * numDependencies is 0.
6862
+ * @param [in] dependencyData - Optional array of data associated with each dependency.
6863
+ * @param [in] numDependencies - Number of dependencies.
6864
+ * @param [in] mode - Controls the interaction of this capture sequence with other API calls that
6865
+ are not safe.
6866
+ *
6867
+ * @returns #hipSuccess, #hipErrorInvalidValue
6868
+ *
6869
+ * @warning : param "const hipGraphEdgeData* dependencyData" is currently not supported and has to
6870
+ be passed as nullptr. This API is marked as beta, meaning, while this is feature complete, it is still
6871
+ open to changes and may have outstanding issues.
6872
+ *
6873
+ */
6874
+ hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph,
6875
+ const hipGraphNode_t* dependencies,
6876
+ const hipGraphEdgeData* dependencyData,
6877
+ size_t numDependencies, hipStreamCaptureMode mode);
6878
+
6567
6879
  /**
6568
6880
  * @brief Ends capture on a stream, returning the captured graph.
6569
6881
  *
@@ -6902,6 +7214,19 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
6902
7214
  hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
6903
7215
  unsigned long long flags);
6904
7216
 
7217
+ /**
7218
+ * @brief Creates an executable graph from a graph.
7219
+ *
7220
+ * @param [out] pGraphExec - pointer to instantiated executable graph that is created.
7221
+ * @param [in] graph - instance of graph to instantiate.
7222
+ * @param [in] instantiateParams - Graph Instantiate Params
7223
+ * @returns #hipSuccess, #hipErrorInvalidValue
7224
+ *
7225
+ * @warning : This API is marked as beta, meaning, while this is feature complete,
7226
+ * it is still open to changes and may have outstanding issues.
7227
+ */
7228
+ hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
7229
+ hipGraphInstantiateParams *instantiateParams);
6905
7230
  /**
6906
7231
  * @brief launches an executable graph in a stream
6907
7232
  *
@@ -6926,6 +7251,22 @@ hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream);
6926
7251
  */
6927
7252
  hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream);
6928
7253
 
7254
+ /**
7255
+ * @brief Creates a node of the type described by nodeParams and adds it to a graph.
7256
+ *
7257
+ * @param [out] pGraphNode - pointer to graph node to create.
7258
+ * @param [in] graph - instance of graph to add the created node.
7259
+ * @param [in] pDependencies - pointer to the dependencies on the kernel execution node.
7260
+ * @param [in] numDependencies - the number of the dependencies.
7261
+ * @param [in] nodeParams - pointer to the parameters for the node.
7262
+ * @returns #hipSuccess, #hipErrorInvalidValue.
7263
+ * @warning : This API is marked as beta, meaning, while this is feature complete,
7264
+ * it is still open to changes and may have outstanding issues.
7265
+ */
7266
+ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
7267
+ const hipGraphNode_t *pDependencies, size_t numDependencies,
7268
+ hipGraphNodeParams *nodeParams);
7269
+
6929
7270
  /**
6930
7271
  * @brief Destroys an executable graph
6931
7272
  *
@@ -8906,6 +9247,7 @@ static inline hipError_t hipMallocManaged(T** devPtr, size_t size,
8906
9247
  return hipMallocManaged((void**)devPtr, size, flags);
8907
9248
  }
8908
9249
 
9250
+
8909
9251
  #endif
8910
9252
  #endif
8911
9253
  // doxygen end HIP API
@@ -4,9 +4,9 @@
4
4
  #define HIP_VERSION_H
5
5
 
6
6
  #define HIP_VERSION_MAJOR 6
7
- #define HIP_VERSION_MINOR 1
8
- #define HIP_VERSION_PATCH 40091
9
- #define HIP_VERSION_GITHASH "a8dbc0c19"
7
+ #define HIP_VERSION_MINOR 2
8
+ #define HIP_VERSION_PATCH 41134
9
+ #define HIP_VERSION_GITHASH "65d174c3e"
10
10
  #define HIP_VERSION_BUILD_ID 0
11
11
  #define HIP_VERSION_BUILD_NAME ""
12
12
  #define HIP_VERSION (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)