llama_cpp 0.12.0 → 0.12.2

This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -8,8 +8,13 @@
  #include <limits>
  #include <stdint.h>
  #include <stdio.h>
+ #include <string>
  #include <vector>
-
+ #include <map>
+ #include <array>
+ #include "ggml-cuda.h"
+ #include "ggml.h"
+ #include "ggml-backend-impl.h"
 
  #if defined(GGML_USE_HIPBLAS)
  #include <hip/hip_runtime.h>
@@ -77,6 +82,7 @@
  #define cudaMemcpyKind hipMemcpyKind
  #define cudaMemset hipMemset
  #define cudaMemsetAsync hipMemsetAsync
+ #define cudaMemGetInfo hipMemGetInfo
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
  #define cudaSetDevice hipSetDevice
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -112,10 +118,9 @@
 
  #endif // defined(GGML_USE_HIPBLAS)
 
- #include "ggml-cuda.h"
- #include "ggml.h"
- #include "ggml-backend-impl.h"
+ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
+ #define CC_PASCAL 600
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
  #define CC_VOLTA 700
  #define CC_OFFSET_AMD 1000000
@@ -183,7 +188,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
  c = __builtin_amdgcn_sdot4(a, b, c, false);
- #elif defined(__gfx1100__)
+ #elif defined(RDNA3)
  c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
  #elif defined(__gfx1010__) || defined(__gfx900__)
  int tmp1;
@@ -477,6 +482,23 @@ typedef struct {
  } block_q6_K;
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
+ #define QR2_XXS 8
+ #define QI2_XXS (QK_K / (4*QR2_XXS))
+ typedef struct {
+ half d;
+ uint16_t qs[QK_K/8];
+ } block_iq2_xxs;
+ static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+ #define QR2_XS 8
+ #define QI2_XS (QK_K / (4*QR2_XS))
+ typedef struct {
+ half d;
+ uint16_t qs[QK_K/8];
+ uint8_t scales[QK_K/32];
+ } block_iq2_xs;
+ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
+
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
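A quick size check for the two block types added above, as a sketch only (assuming QK_K == 256 and a 2-byte ggml_fp16_t, which is what the static_asserts encode):

    // sketch: byte sizes implied by the static_asserts above, assuming QK_K == 256
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int QK_K = 256;
        const size_t iq2_xxs = 2 + (QK_K/8)*sizeof(uint16_t);           // d + qs          = 2 + 64     = 66 bytes
        const size_t iq2_xs  = 2 + (QK_K/8)*sizeof(uint16_t) + QK_K/32; // d + qs + scales = 2 + 64 + 8 = 74 bytes
        printf("iq2_xxs: %zu bytes, %.4f bits per weight\n", iq2_xxs, 8.0*iq2_xxs/QK_K); // 2.0625
        printf("iq2_xs:  %zu bytes, %.4f bits per weight\n", iq2_xs,  8.0*iq2_xs/QK_K);  // 2.3125
        return 0;
    }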
@@ -501,6 +523,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_ACC_BLOCK_SIZE 256
  #define CUDA_IM2COL_BLOCK_SIZE 256
 
+ #define CUDA_Q8_0_NE_ALIGN 2048
+
  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
  #define GGML_CUDA_DMMV_X 32
@@ -544,19 +568,16 @@ static void ggml_cuda_set_device(const int device) {
 
  static int g_device_count = -1;
  static int g_main_device = 0;
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+ static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
 
  struct cuda_device_capabilities {
  int cc; // compute capability
+ size_t smpb; // max. shared memory per block
  bool vmm; // virtual memory support
  size_t vmm_granularity; // granularity of virtual memory
  };
 
- static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
-
- static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 0; // disabled by default
- static size_t g_scratch_offset = 0;
+ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
 
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
@@ -585,6 +606,19 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  return a;
  }
 
+ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+ }
+ return a;
+ #else
+ (void) a;
+ bad_arch();
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ }
+
  static __device__ __forceinline__ float warp_reduce_max(float x) {
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -593,6 +627,19 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
  return x;
  }
 
+ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ }
+ return x;
+ #else
+ (void) x;
+ bad_arch();
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+ }
+
  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
  return b;
  GGML_UNUSED(a);
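The half2 warp_reduce_sum / warp_reduce_max overloads added above reuse the XOR-butterfly pattern of the existing float versions; a host-side sketch of that pattern (assuming a 32-lane warp), for illustration only:

    // sketch: host simulation of the __shfl_xor_sync butterfly used by the warp reductions
    #include <cstdio>

    int main() {
        float lane[32];
        for (int i = 0; i < 32; ++i) lane[i] = (float) i;              // stand-in for per-lane values

        for (int mask = 16; mask > 0; mask >>= 1) {                    // 5 steps: 16, 8, 4, 2, 1
            float partner[32];
            for (int i = 0; i < 32; ++i) partner[i] = lane[i ^ mask];  // what __shfl_xor_sync hands each lane
            for (int i = 0; i < 32; ++i) lane[i] += partner[i];        // __hadd2 in the half2 version
        }
        printf("lane 0 holds %.0f\n", lane[0]);                        // 496 = 0 + 1 + ... + 31, same in every lane
        return 0;
    }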
@@ -1058,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
  #endif // GGML_CUDA_F16
  }
 
+ template<typename dst_t>
+ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+ const int i = blockIdx.x;
+
+ // assume 32 threads
+ const int tid = threadIdx.x;
+ const int il = tid/8;
+ const int ir = tid%8;
+ const int ib = 8*i + ir;
+ if (ib >= nb32) {
+ return;
+ }
+
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+ const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+ const float d = __half2float(x->d);
+ const float dm = -8*d;
+
+ const uint8_t * q = x->qs + 4*il;
+
+ for (int l = 0; l < 4; ++l) {
+ y[l+ 0] = d * (q[l] & 0xF) + dm;
+ y[l+16] = d * (q[l] >> 4) + dm;
+ }
+ }
+
+ template<typename dst_t>
+ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+ const int i = blockIdx.x;
+
+ // assume 32 threads
+ const int tid = threadIdx.x;
+ const int il = tid/8;
+ const int ir = tid%8;
+ const int ib = 8*i + ir;
+ if (ib >= nb32) {
+ return;
+ }
+
+ dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+ const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+ const float2 d = __half22float2(x->dm);
+
+ const uint8_t * q = x->qs + 4*il;
+
+ for (int l = 0; l < 4; ++l) {
+ y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+ y[l+16] = d.x * (q[l] >> 4) + d.y;
+ }
+ }
+
  //================================== k-quants
 
  template<typename dst_t>
@@ -1292,6 +1394,281 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
1292
1394
  #endif
1293
1395
  }
1294
1396
 
1397
+ static const __device__ uint64_t iq2xxs_grid[256] = {
1398
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1399
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
1400
+ 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
1401
+ 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
1402
+ 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
1403
+ 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
1404
+ 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
1405
+ 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
1406
+ 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
1407
+ 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
1408
+ 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
1409
+ 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
1410
+ 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
1411
+ 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
1412
+ 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
1413
+ 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
1414
+ 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
1415
+ 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
1416
+ 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
1417
+ 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
1418
+ 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
1419
+ 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
1420
+ 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
1421
+ 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
1422
+ 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
1423
+ 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
1424
+ 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
1425
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
1426
+ 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
1427
+ 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
1428
+ 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
1429
+ 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
1430
+ 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
1431
+ 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
1432
+ 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
1433
+ 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
1434
+ 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
1435
+ 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
1436
+ 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
1437
+ 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
1438
+ 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
1439
+ 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
1440
+ 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
1441
+ 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
1442
+ 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
1443
+ 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
1444
+ 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
1445
+ 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
1446
+ 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
1447
+ 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
1448
+ 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
1449
+ 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
1450
+ 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
1451
+ 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
1452
+ 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
1453
+ 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
1454
+ 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
1455
+ 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
1456
+ 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
1457
+ 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
1458
+ 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
1459
+ 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
1460
+ 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
1461
+ 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
1462
+ };
1463
+
1464
+ static const __device__ uint64_t iq2xs_grid[512] = {
1465
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1466
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1467
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1468
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1469
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1470
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
1471
+ 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
1472
+ 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
1473
+ 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
1474
+ 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
1475
+ 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
1476
+ 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
1477
+ 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
1478
+ 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
1479
+ 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
1480
+ 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
1481
+ 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
1482
+ 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
1483
+ 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
1484
+ 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
1485
+ 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
1486
+ 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
1487
+ 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
1488
+ 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
1489
+ 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
1490
+ 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
1491
+ 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
1492
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
1493
+ 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
1494
+ 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
1495
+ 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
1496
+ 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
1497
+ 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
1498
+ 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
1499
+ 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
1500
+ 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
1501
+ 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
1502
+ 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
1503
+ 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
1504
+ 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
1505
+ 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
1506
+ 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
1507
+ 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
1508
+ 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
1509
+ 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
1510
+ 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
1511
+ 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
1512
+ 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
1513
+ 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
1514
+ 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
1515
+ 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
1516
+ 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
1517
+ 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
1518
+ 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
1519
+ 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
1520
+ 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
1521
+ 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
1522
+ 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
1523
+ 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
1524
+ 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
1525
+ 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
1526
+ 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
1527
+ 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
1528
+ 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
1529
+ 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
1530
+ 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
1531
+ 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
1532
+ 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
1533
+ 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
1534
+ 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
1535
+ 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
1536
+ 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
1537
+ 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
1538
+ 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
1539
+ 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
1540
+ 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
1541
+ 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
1542
+ 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
1543
+ 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
1544
+ 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
1545
+ 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
1546
+ 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
1547
+ 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
1548
+ 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
1549
+ 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
1550
+ 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
1551
+ 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
1552
+ 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
1553
+ 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
1554
+ 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
1555
+ 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
1556
+ 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
1557
+ 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
1558
+ 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
1559
+ 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
1560
+ 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
1561
+ 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
1562
+ 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
1563
+ 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
1564
+ 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
1565
+ 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
1566
+ 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
1567
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
1568
+ 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
1569
+ 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
1570
+ 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
1571
+ 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
1572
+ 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
1573
+ 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
1574
+ 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
1575
+ 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
1576
+ 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
1577
+ 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
1578
+ 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
1579
+ 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
1580
+ 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
1581
+ 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
1582
+ 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
1583
+ 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
1584
+ 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
1585
+ 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
1586
+ 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
1587
+ 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
1588
+ 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
1589
+ 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
1590
+ 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
1591
+ 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
1592
+ 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1593
+ };
1594
+
1595
+ static const __device__ uint8_t ksigns_iq2xs[128] = {
1596
+ 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
1597
+ 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
1598
+ 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
1599
+ 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
1600
+ 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
1601
+ 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
1602
+ 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
1603
+ 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
1604
+ };
1605
+
1606
+ static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
1607
+
1608
+ inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
1609
+ switch (type) {
1610
+ case GGML_TYPE_Q4_0:
1611
+ case GGML_TYPE_Q4_1:
1612
+ case GGML_TYPE_Q5_0:
1613
+ case GGML_TYPE_Q5_1:
1614
+ case GGML_TYPE_Q8_0:
1615
+ case GGML_TYPE_Q2_K:
1616
+ case GGML_TYPE_Q3_K:
1617
+ case GGML_TYPE_Q4_K:
1618
+ case GGML_TYPE_Q5_K:
1619
+ case GGML_TYPE_Q6_K:
1620
+ return true;
1621
+ default:
1622
+ return false;
1623
+ }
1624
+ }
1625
+
1626
+ template<typename dst_t>
1627
+ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1628
+
1629
+ const int i = blockIdx.x;
1630
+ const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
1631
+
1632
+ const int tid = threadIdx.x;
1633
+ #if QK_K == 256
1634
+ const int il = tid/8; // 0...3
1635
+ const int ib = tid%8; // 0...7
1636
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
1637
+ const uint16_t * q2 = x[i].qs + 4*ib;
1638
+ const uint8_t * aux8 = (const uint8_t *)q2;
1639
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
1640
+ const uint32_t aux32 = q2[2] | (q2[3] << 16);
1641
+ const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
1642
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
1643
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
1644
+ #else
1645
+ assert(false);
1646
+ #endif
1647
+
1648
+ }
1649
+
1650
+ template<typename dst_t>
1651
+ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1652
+
1653
+ const int i = blockIdx.x;
1654
+ const block_iq2_xs * x = (const block_iq2_xs *) vx;
1655
+
1656
+ const int tid = threadIdx.x;
1657
+ #if QK_K == 256
1658
+ const int il = tid/8; // 0...3
1659
+ const int ib = tid%8; // 0...7
1660
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
1661
+ const uint16_t * q2 = x[i].qs + 4*ib;
1662
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
1663
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
1664
+ const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
1665
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
1666
+ #else
1667
+ assert(false);
1668
+ #endif
1669
+
1670
+ }
1671
+
1295
1672
  static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1296
1673
 
1297
1674
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -1872,14 +2249,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
  v.y = x[ib + iqs + 1];
  }
 
- static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
- const float * x = (const float *) vx;
-
- // automatic half -> float type cast if dfloat == float
- v.x = x[ib + iqs + 0];
- v.y = x[ib + iqs + 1];
- }
-
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1983,7 +2352,7 @@ static __global__ void k_get_rows_float(
 
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
- const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
+ const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
 
  if (i >= k) {
  return;
@@ -2002,6 +2371,58 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
  y[iybs + iqs + y_offset] = v.y;
  }
 
+ template <typename src_t, typename dst_t>
+ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+
+ const src_t * x = (src_t *) vx;
+
+ y[i] = x[i];
+ }
+
+ template <bool need_check>
+ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+ #if __CUDA_ARCH__ >= CC_PASCAL
+ constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+ const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+ const int * x0 = ((int *) vx) + blockIdx.x * nint;
+ half2 * y2 = (half2 *) (y + i0);
+
+ __shared__ int vals[nint];
+
+ #pragma unroll
+ for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+ if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+ break;
+ }
+
+ const int ix = ix0 + threadIdx.x;
+ vals[ix] = x0[ix];
+ }
+
+ #pragma unroll
+ for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+ if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+ return;
+ }
+
+ const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+ const half d = *b0;
+ const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+ y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+ }
+ #else
+ (void) vx; (void) y; (void) k;
+ bad_arch();
+ #endif // __CUDA_ARCH__ >= CC_PASCAL
+ }
+
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
@@ -3820,6 +4241,91 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3820
4241
  return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3821
4242
  }
3822
4243
 
4244
+ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
4245
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4246
+ #if QK_K == 256
4247
+ const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
4248
+
4249
+ #if QR2_XXS == 8
4250
+ const int ib32 = iqs;
4251
+ const uint16_t * q2 = bq2->qs + 4*ib32;
4252
+ const uint8_t * aux8 = (const uint8_t *)q2;
4253
+ const int8_t * q8 = bq8_1[ib32].qs;
4254
+ uint32_t aux32 = q2[2] | (q2[3] << 16);
4255
+ int sumi = 0;
4256
+ for (int l = 0; l < 4; ++l) {
4257
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
4258
+ const uint8_t signs = ksigns_iq2xs[aux32 & 127];
4259
+ for (int j = 0; j < 8; ++j) {
4260
+ sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
4261
+ }
4262
+ q8 += 8;
4263
+ aux32 >>= 7;
4264
+ }
4265
+ const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
4266
+ return d * sumi;
4267
+ #else
4268
+ // iqs is 0...15
4269
+ const int ib32 = iqs/2;
4270
+ const int il = iqs%2;
4271
+ const uint16_t * q2 = bq2->qs + 4*ib32;
4272
+ const uint8_t * aux8 = (const uint8_t *)q2;
4273
+ const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
4274
+ const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
4275
+ const uint32_t aux32 = q2[2] | (q2[3] << 16);
4276
+ const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
4277
+ const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
4278
+ const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
4279
+ const int8_t * q8 = bq8_1[ib32].qs + 16*il;
4280
+ int sumi1 = 0, sumi2 = 0;
4281
+ for (int j = 0; j < 8; ++j) {
4282
+ sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
4283
+ sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
4284
+ }
4285
+ return d * (sumi1 + sumi2);
4286
+ #endif
4287
+ #else
4288
+ assert(false);
4289
+ return 0.f;
4290
+ #endif
4291
+ }
4292
+
4293
+ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
4294
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4295
+ #if QK_K == 256
4296
+ const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
4297
+
4298
+ const int ib32 = iqs;
4299
+ const uint16_t * q2 = bq2->qs + 4*ib32;
4300
+ const int8_t * q8 = bq8_1[ib32].qs;
4301
+ const uint8_t ls1 = bq2->scales[ib32] & 0xf;
4302
+ const uint8_t ls2 = bq2->scales[ib32] >> 4;
4303
+ int sumi1 = 0;
4304
+ for (int l = 0; l < 2; ++l) {
4305
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
4306
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
4307
+ for (int j = 0; j < 8; ++j) {
4308
+ sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
4309
+ }
4310
+ q8 += 8;
4311
+ }
4312
+ int sumi2 = 0;
4313
+ for (int l = 2; l < 4; ++l) {
4314
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
4315
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
4316
+ for (int j = 0; j < 8; ++j) {
4317
+ sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
4318
+ }
4319
+ q8 += 8;
4320
+ }
4321
+ const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
4322
+ return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
4323
+ #else
4324
+ assert(false);
4325
+ return 0.f;
4326
+ #endif
4327
+ }
4328
+
3823
4329
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3824
4330
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3825
4331
  static __device__ __forceinline__ void mul_mat_q(
@@ -5201,142 +5707,300 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
5201
5707
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
5202
5708
  }
5203
5709
 
5204
- static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
5710
+ template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
5711
+ static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
5712
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5713
+ const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
5714
+ const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
5715
+
5205
5716
  const int tid = threadIdx.x;
5206
5717
  const int rowx = blockIdx.x;
5207
5718
  const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
5208
5719
 
5209
- const int block_size = blockDim.x;
5720
+ const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
5210
5721
 
5211
5722
  const int warp_id = threadIdx.x / WARP_SIZE;
5212
5723
  const int lane_id = threadIdx.x % WARP_SIZE;
5213
5724
 
5214
- __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
5725
+ extern __shared__ half data_soft_max_f16[];
5726
+ half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
5727
+ // (shared memory) buffer to cache values between iterations:
5728
+ half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
5729
+ // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
5730
+ // in that case col_smem == col_data must be enforced to avoid race conditions
5215
5731
 
5216
- float max_val = -INFINITY;
5732
+ half2 max_val = make_half2(-INFINITY, -INFINITY);
5217
5733
 
5218
- for (int col = tid; col < ncols; col += block_size) {
5219
- const int ix = rowx*ncols + col;
5220
- const int iy = rowy*ncols + col;
5221
- max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
5734
+ #pragma unroll
5735
+ for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
5736
+ const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
5737
+ const int col_smem = vals_smem ? col0 + tid : col_data;
5738
+
5739
+ const int ix = rowx*ncols_data + col_data;
5740
+ const int iy = rowy*ncols_data + col_data;
5741
+
5742
+ half2 val;
5743
+ if (need_check && col_data + 0 >= ncols_data) {
5744
+ val.x = -INFINITY;
5745
+ } else {
5746
+ val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
5747
+ }
5748
+ if (need_check && col_data + WARP_SIZE >= ncols_data) {
5749
+ val.y = -INFINITY;
5750
+ } else {
5751
+ val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
5752
+ }
5753
+ if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
5754
+ vals[col_smem] = val;
5755
+ }
5756
+ max_val = __hmax2(max_val, val);
5222
5757
  }
5223
5758
 
5224
5759
  // find the max value in the block
5225
5760
  max_val = warp_reduce_max(max_val);
5226
5761
  if (block_size > WARP_SIZE) {
5227
5762
  if (warp_id == 0) {
5228
- buf[lane_id] = -INFINITY;
5763
+ buf_iw[lane_id] = -INFINITY;
5229
5764
  }
5230
5765
  __syncthreads();
5231
5766
 
5232
5767
  if (lane_id == 0) {
5233
- buf[warp_id] = max_val;
5768
+ buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
5234
5769
  }
5235
5770
  __syncthreads();
5236
5771
 
5237
- max_val = buf[lane_id];
5772
+ max_val = __half2half2(buf_iw[lane_id]);
5238
5773
  max_val = warp_reduce_max(max_val);
5774
+ } else {
5775
+ max_val = __half2half2(__hmax(max_val.x, max_val.y));
5239
5776
  }
5240
5777
 
5241
- float tmp = 0.f;
5778
+ half2 tmp = make_half2(0.0f, 0.0f); // partial sums
5779
+
5780
+ #pragma unroll
5781
+ for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
5782
+ const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
5783
+
5784
+ if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
5785
+ break;
5786
+ }
5787
+
5788
+ const half2 val = h2exp(vals[col_smem] - max_val);
5242
5789
 
5243
- for (int col = tid; col < ncols; col += block_size) {
5244
- const int ix = rowx*ncols + col;
5245
- const int iy = rowy*ncols + col;
5246
- const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
5247
5790
  tmp += val;
5248
- dst[ix] = val;
5791
+ vals[col_smem] = val;
5249
5792
  }
5250
5793
 
5251
5794
  // find the sum of exps in the block
5252
5795
  tmp = warp_reduce_sum(tmp);
5253
5796
  if (block_size > WARP_SIZE) {
5254
5797
  if (warp_id == 0) {
5255
- buf[lane_id] = 0.f;
5798
+ buf_iw[lane_id] = 0.0f;
5256
5799
  }
5257
5800
  __syncthreads();
5258
5801
 
5259
5802
  if (lane_id == 0) {
5260
- buf[warp_id] = tmp;
5803
+ buf_iw[warp_id] = tmp.x + tmp.y;
5261
5804
  }
5262
5805
  __syncthreads();
5263
5806
 
5264
- tmp = buf[lane_id];
5807
+ tmp = __half2half2(buf_iw[lane_id]);
5265
5808
  tmp = warp_reduce_sum(tmp);
5809
+ } else {
5810
+ tmp = __half2half2(tmp.x + tmp.y);
5266
5811
  }
5267
5812
 
5268
- const float inv_tmp = 1.f / tmp;
5269
-
5270
- for (int col = tid; col < ncols; col += block_size) {
5271
- const int i = rowx*ncols + col;
5272
- dst[i] *= inv_tmp;
5273
- }
5274
- }
5813
+ const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
5275
5814
 
5276
- static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
5277
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
5815
+ #pragma unroll
5816
+ for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
5817
+ const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
5818
+ const int col_smem = vals_smem ? col0 + tid : col_data;
5278
5819
 
5279
- if (i >= k) {
5280
- return;
5281
- }
5820
+ const int idst = rowx*ncols_data + col_data;
5821
+ const half2 result = vals[col_smem] * inv_sum;
5282
5822
 
5283
- dst[i] = scale * x[i];
5284
- }
5823
+ if (need_check && col_data + 0 >= ncols_data) {
5824
+ return;
5825
+ }
5826
+ dst[idst] = result.x;
5285
5827
 
5286
- static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
5287
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
5828
+ if (need_check && col_data + WARP_SIZE >= ncols_data) {
5829
+ return;
5830
+ }
5288
5831
 
5289
- if (i >= k) {
5290
- return;
5832
+ dst[idst + WARP_SIZE] = result.y;
5291
5833
  }
5292
-
5293
- dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
5834
+ #else
5835
+ (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
5836
+ bad_arch();
5837
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5294
5838
  }
5295
5839
 
5296
- static __global__ void im2col_f32_f16(
5297
- const float * x, half * dst,
5298
- int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
5299
- int s0, int s1, int p0, int p1, int d0, int d1) {
5300
- const int i = threadIdx.x + blockIdx.x * blockDim.x;
5301
- if (i >= pelements) {
5302
- return;
5303
- }
5840
+ template <bool vals_smem, int ncols_template, int block_size_template>
5841
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
5842
+ const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
5304
5843
 
5305
- const int ksize = OW * (KH > 1 ? KW : 1);
5306
- const int kx = i / ksize;
5307
- const int kd = kx * ksize;
5308
- const int ky = (i - kd) / OW;
5309
- const int ix = i % OW;
5844
+ const int tid = threadIdx.x;
5845
+ const int rowx = blockIdx.x;
5846
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
5310
5847
 
5311
- const int64_t iiw = ix * s0 + kx * d0 - p0;
5312
- const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
5848
+ const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
5313
5849
 
5314
- const int64_t offset_dst =
5315
- (blockIdx.y * OW + ix) * CHW +
5316
- (blockIdx.z * (KW * KH) + ky * KW + kx);
5850
+ const int warp_id = threadIdx.x / WARP_SIZE;
5851
+ const int lane_id = threadIdx.x % WARP_SIZE;
5317
5852
 
5318
- if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5319
- dst[offset_dst] = __float2half(0.0f);
5320
- } else {
5321
- const int64_t offset_src = blockIdx.z * offset_delta;
5322
- dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
5323
- }
5324
- }
5853
+ extern __shared__ float data_soft_max_f32[];
5854
+ float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
5855
+ // shared memory buffer to cache values between iterations:
5856
+ float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
5325
5857
 
5326
- template<int qk, int qr, dequantize_kernel_t dq>
5327
- static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5328
- const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5858
+ float max_val = -INFINITY;
5329
5859
 
5330
- GGML_TENSOR_BINARY_OP_LOCALS
5860
+ #pragma unroll
5861
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
5862
+ const int col = col0 + tid;
5331
5863
 
5332
- const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
5333
- const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
5334
- const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5864
+ if (ncols_template == 0 && col >= ncols) {
5865
+ break;
5866
+ }
5335
5867
 
5336
- // strides in elements
5337
- //const size_t s0 = nb0 / ggml_element_size(dst);
5338
- const size_t s1 = nb1 / ggml_element_size(dst);
5339
- const size_t s2 = nb2 / ggml_element_size(dst);
5868
+ const int ix = rowx*ncols + col;
5869
+ const int iy = rowy*ncols + col;
5870
+
5871
+ const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
5872
+ vals[col] = val;
5873
+ max_val = max(max_val, val);
5874
+ }
5875
+
5876
+ // find the max value in the block
5877
+ max_val = warp_reduce_max(max_val);
5878
+ if (block_size > WARP_SIZE) {
5879
+ if (warp_id == 0) {
5880
+ buf_iw[lane_id] = -INFINITY;
5881
+ }
5882
+ __syncthreads();
5883
+
5884
+ if (lane_id == 0) {
5885
+ buf_iw[warp_id] = max_val;
5886
+ }
5887
+ __syncthreads();
5888
+
5889
+ max_val = buf_iw[lane_id];
5890
+ max_val = warp_reduce_max(max_val);
5891
+ }
5892
+
5893
+ float tmp = 0.0f; // partial sum
5894
+
5895
+ #pragma unroll
5896
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
5897
+ const int col = col0 + tid;
5898
+
5899
+ if (ncols_template == 0 && col >= ncols) {
5900
+ break;
5901
+ }
5902
+
5903
+ const float val = expf(vals[col] - max_val);
5904
+ tmp += val;
5905
+ vals[col] = val;
5906
+ }
5907
+
5908
+ // find the sum of exps in the block
5909
+ tmp = warp_reduce_sum(tmp);
5910
+ if (block_size > WARP_SIZE) {
5911
+ if (warp_id == 0) {
5912
+ buf_iw[lane_id] = 0.0f;
5913
+ }
5914
+ __syncthreads();
5915
+
5916
+ if (lane_id == 0) {
5917
+ buf_iw[warp_id] = tmp;
5918
+ }
5919
+ __syncthreads();
5920
+
5921
+ tmp = buf_iw[lane_id];
5922
+ tmp = warp_reduce_sum(tmp);
5923
+ }
5924
+
5925
+ const float inv_sum = 1.0f / tmp;
5926
+
5927
+ #pragma unroll
5928
+ for (int col0 = 0; col0 < ncols; col0 += block_size) {
5929
+ const int col = col0 + tid;
5930
+
5931
+ if (ncols_template == 0 && col >= ncols) {
5932
+ return;
5933
+ }
5934
+
5935
+ const int idst = rowx*ncols + col;
5936
+ dst[idst] = vals[col] * inv_sum;
5937
+ }
5938
+ }
5939
+
5940
+ static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
5941
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
5942
+
5943
+ if (i >= k) {
5944
+ return;
5945
+ }
5946
+
5947
+ dst[i] = scale * x[i];
5948
+ }
5949
+
5950
+ static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
5951
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
5952
+
5953
+ if (i >= k) {
5954
+ return;
5955
+ }
5956
+
5957
+ dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
5958
+ }
5959
+
5960
+ static __global__ void im2col_f32_f16(
5961
+ const float * x, half * dst,
5962
+ int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
5963
+ int s0, int s1, int p0, int p1, int d0, int d1) {
5964
+ const int i = threadIdx.x + blockIdx.x * blockDim.x;
5965
+ if (i >= pelements) {
5966
+ return;
5967
+ }
5968
+
5969
+ const int ksize = OW * (KH > 1 ? KW : 1);
5970
+ const int kx = i / ksize;
5971
+ const int kd = kx * ksize;
5972
+ const int ky = (i - kd) / OW;
5973
+ const int ix = i % OW;
5974
+
5975
+ const int64_t iiw = ix * s0 + kx * d0 - p0;
5976
+ const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
5977
+
5978
+ const int64_t offset_dst =
5979
+ (blockIdx.y * OW + ix) * CHW +
5980
+ (blockIdx.z * (KW * KH) + ky * KW + kx);
5981
+
5982
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5983
+ dst[offset_dst] = __float2half(0.0f);
5984
+ } else {
5985
+ const int64_t offset_src = blockIdx.z * offset_delta;
5986
+ dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
5987
+ }
5988
+ }
5989
+
5990
+ template<int qk, int qr, dequantize_kernel_t dq>
5991
+ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5992
+ const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
5993
+
5994
+ GGML_TENSOR_BINARY_OP_LOCALS
5995
+
5996
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
5997
+ const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
5998
+ const dim3 block_nums(block_num_x, ne10, ne11*ne12);
5999
+
6000
+ // strides in elements
6001
+ //const size_t s0 = nb0 / ggml_element_size(dst);
6002
+ const size_t s1 = nb1 / ggml_element_size(dst);
6003
+ const size_t s2 = nb2 / ggml_element_size(dst);
5340
6004
  const size_t s3 = nb3 / ggml_element_size(dst);
5341
6005
 
5342
6006
  const size_t s10 = nb10 / ggml_element_size(src1);
@@ -5609,10 +6273,21 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
 
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
  static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+ const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }
 
+ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+ if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+ const bool need_check = false;
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+ } else {
+ const bool need_check = true;
+ dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+ }
+ }
+
  template<typename dst_t>
  static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
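A consistency check of the constants behind the dequantize_block_q8_0_f16 kernel that dequantize_block_q8_0_f16_cuda above launches, as a sketch (assuming QK8_0 == 32 and sizeof(block_q8_0) == 34, i.e. one fp16 scale plus 32 int8 quants): each CUDA block stages the raw bytes of CUDA_Q8_0_NE_ALIGN elements in shared memory, and the extra WARP_SIZE ints in nint cover exactly the 64 fp16 scales.

    // sketch: shared-memory staging size implied by the q8_0 -> f16 kernel
    #include <cstdio>

    int main() {
        const int QK8_0              = 32;
        const int WARP_SIZE          = 32;
        const int CUDA_Q8_0_NE_ALIGN = 2048;
        const int block_q8_0_bytes   = 2 + QK8_0;                                 // fp16 d + int8 qs[32] = 34
        const int raw_bytes = CUDA_Q8_0_NE_ALIGN / QK8_0 * block_q8_0_bytes;      // 64 blocks * 34 = 2176
        const int nint      = CUDA_Q8_0_NE_ALIGN / (int) sizeof(int) + WARP_SIZE; // 512 + 32 = 544 ints
        printf("raw bytes: %d, staged: %d ints = %d bytes\n",
               raw_bytes, nint, nint * (int) sizeof(int));                        // 2176 == 2176
        return 0;
    }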
@@ -5633,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
  #endif
  }
 
+ template<typename dst_t>
+ static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb32 = k / 32;
+ const int nb = (k + 255) / 256;
+ dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+ }
+
+ template<typename dst_t>
+ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb32 = k / 32;
+ const int nb = (k + 255) / 256;
+ dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+ }
+
  template<typename dst_t>
  static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
  const int nb = k / QK_K;
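The dequantize_row_q4_0_cuda / dequantize_row_q4_1_cuda wrappers added above launch the new kernels with one 32-thread block per 256 values (8 quant blocks of 32 weights); a sketch of the index math those kernels use, with a hypothetical row length and thread, assuming QK4_0 == 32:

    // sketch: thread-to-data mapping assumed by dequantize_block_q4_0/q4_1
    #include <cstdio>

    int main() {
        const int k    = 4096;            // example row length (hypothetical)
        const int nb32 = k / 32;          // 128 quant blocks in the row
        const int nb   = (k + 255) / 256; // 16 CUDA blocks, launched as <<<nb, 32>>>

        const int i = 3, tid = 13;        // one example (blockIdx.x, threadIdx.x)
        const int il = tid/8, ir = tid%8; // il: 4-byte slice within a block, ir: which of the 8 blocks
        const int ib = 8*i + ir;          // global quant block index
        printf("nb32=%d nb=%d: this thread writes y[%d..] and y[%d..] from block %d\n",
               nb32, nb, 256*i + 32*ir + 4*il, 256*i + 32*ir + 4*il + 16, ib);
        return 0;
    }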
@@ -5659,17 +6348,40 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
  #endif
  }
 
+ template<typename dst_t>
+ static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb = k / QK_K;
+ dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
+ }
+
+ template<typename dst_t>
+ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb = k / QK_K;
+ dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
+ }
+
+ template <typename src_t, typename dst_t>
+ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+ convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ }
+
  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ int id;
  switch (type) {
  case GGML_TYPE_Q4_0:
- return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ return dequantize_row_q4_0_cuda;
  case GGML_TYPE_Q4_1:
- return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ return dequantize_row_q4_1_cuda;
  case GGML_TYPE_Q5_0:
  return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
  case GGML_TYPE_Q5_1:
  return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
  case GGML_TYPE_Q8_0:
+ CUDA_CHECK(cudaGetDevice(&id));
+ if (g_device_caps[id].cc >= CC_PASCAL) {
+ return dequantize_block_q8_0_f16_cuda;
+ }
  return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
  case GGML_TYPE_Q2_K:
  return dequantize_row_q2_K_cuda;
@@ -5681,8 +6393,12 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
  return dequantize_row_q5_K_cuda;
  case GGML_TYPE_Q6_K:
  return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_IQ2_XXS:
+ return dequantize_row_iq2_xxs_cuda;
+ case GGML_TYPE_IQ2_XS:
+ return dequantize_row_iq2_xs_cuda;
  case GGML_TYPE_F32:
- return dequantize_block_cuda<1, 1, convert_f32>;
+ return convert_unary_cuda<float>;
  default:
  return nullptr;
  }
@@ -5691,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  switch (type) {
  case GGML_TYPE_Q4_0:
- return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+ return dequantize_row_q4_0_cuda;
  case GGML_TYPE_Q4_1:
- return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+ return dequantize_row_q4_1_cuda;
  case GGML_TYPE_Q5_0:
  return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
  case GGML_TYPE_Q5_1:
@@ -5710,8 +6426,12 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  return dequantize_row_q5_K_cuda;
  case GGML_TYPE_Q6_K:
  return dequantize_row_q6_K_cuda;
+ case GGML_TYPE_IQ2_XXS:
+ return dequantize_row_iq2_xxs_cuda;
+ case GGML_TYPE_IQ2_XS:
+ return dequantize_row_iq2_xs_cuda;
  case GGML_TYPE_F16:
- return dequantize_block_cuda<1, 1, convert_f16>;
+ return convert_unary_cuda<half>;
  default:
  return nullptr;
  }
@@ -5904,6 +6624,24 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
5904
6624
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
5905
6625
  }
5906
6626
 
6627
+ static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6628
+ GGML_ASSERT(ncols % QK_K == 0);
6629
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6630
+ const dim3 block_nums(block_num_y, 1, 1);
6631
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6632
+ mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
6633
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6634
+ }
6635
+
6636
+ static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6637
+ GGML_ASSERT(ncols % QK_K == 0);
6638
+ const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6639
+ const dim3 block_nums(block_num_y, 1, 1);
6640
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6641
+ mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
6642
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6643
+ }
6644
+
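// A rough sketch of the launch geometry shared by the mul_mat_vec_q launchers above,
// including the new iq2_xxs/iq2_xs ones: each thread block covers GGML_CUDA_MMV_Y rows
// with one warp per row, so the grid size is the row count rounded up to a multiple of
// GGML_CUDA_MMV_Y. Constants are re-declared locally for illustration.
#include <cstdio>

int main() {
    const int WARP_SIZE       = 32;
    const int GGML_CUDA_MMV_Y = 1;   // assumed default; configurable at build time
    const int nrows           = 4097;

    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; // ceil-div
    printf("grid = (%d,1,1), block = (%d,%d,1)\n", block_num_y, WARP_SIZE, GGML_CUDA_MMV_Y);
    return 0;
}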
5907
6645
  static void ggml_mul_mat_q4_0_q8_1_cuda(
5908
6646
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
5909
6647
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -6543,12 +7281,90 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
6543
7281
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
6544
7282
  }
6545
7283
 
7284
+ static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
7285
+ int nth = WARP_SIZE;
7286
+ while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
7287
+ const dim3 block_dims(nth, 1, 1);
7288
+ const dim3 block_nums(nrows_x, 1, 1);
7289
+ const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
7290
+ static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
7291
+ if (shmem <= g_device_caps[g_main_device].smpb) {
7292
+ switch (ncols_x) {
7293
+ case 32:
7294
+ soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7295
+ break;
7296
+ case 64:
7297
+ soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7298
+ break;
7299
+ case 128:
7300
+ soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7301
+ break;
7302
+ case 256:
7303
+ soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7304
+ break;
7305
+ case 512:
7306
+ soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7307
+ break;
7308
+ case 1024:
7309
+ soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7310
+ break;
7311
+ case 2048:
7312
+ soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7313
+ break;
7314
+ case 4096:
7315
+ soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7316
+ break;
7317
+ default:
7318
+ soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7319
+ break;
7320
+ }
7321
+ } else {
7322
+ const size_t shmem_low = WARP_SIZE*sizeof(half);
7323
+ soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7324
+ }
7325
+ }
7326
+
6546
7327
  static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
6547
7328
  int nth = WARP_SIZE;
6548
7329
  while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
6549
7330
  const dim3 block_dims(nth, 1, 1);
6550
7331
  const dim3 block_nums(nrows_x, 1, 1);
6551
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7332
+ const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
7333
+ static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
7334
+ if (shmem < g_device_caps[g_main_device].smpb) {
7335
+ switch (ncols_x) {
7336
+ case 32:
7337
+ soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7338
+ break;
7339
+ case 64:
7340
+ soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7341
+ break;
7342
+ case 128:
7343
+ soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7344
+ break;
7345
+ case 256:
7346
+ soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7347
+ break;
7348
+ case 512:
7349
+ soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7350
+ break;
7351
+ case 1024:
7352
+ soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7353
+ break;
7354
+ case 2048:
7355
+ soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7356
+ break;
7357
+ case 4096:
7358
+ soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7359
+ break;
7360
+ default:
7361
+ soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7362
+ break;
7363
+ }
7364
+ } else {
7365
+ const size_t shmem_low = WARP_SIZE*sizeof(float);
7366
+ soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
7367
+ }
6552
7368
  }
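// Worked example for the shared-memory sizing used by soft_max_f32_cuda above, assuming
// GGML_PAD rounds up to a multiple of its second argument and a typical 48 KiB
// sharedMemPerBlock limit (the real limit comes from g_device_caps[...].smpb).
#include <cstddef>
#include <cstdio>

static size_t pad_to(size_t x, size_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const size_t WARP_SIZE = 32;
    const size_t smpb      = 48 * 1024;           // assumed per-block shared memory limit
    for (size_t ncols : {1024u, 4096u, 32000u}) { // 32000 ~ a large vocab row
        const size_t shmem = (pad_to(ncols, WARP_SIZE) + WARP_SIZE) * sizeof(float);
        printf("ncols=%zu -> shmem=%zu bytes -> %s\n", ncols, shmem,
               shmem < smpb ? "specialized kernel" : "fallback (shmem = WARP_SIZE floats)");
    }
    return 0;
}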
6553
7369
 
6554
7370
  static void im2col_f32_f16_cuda(const float* x, half* dst,
@@ -6799,11 +7615,11 @@ struct cuda_pool_alloc {
6799
7615
 
6800
7616
  static bool g_cublas_loaded = false;
6801
7617
 
6802
- bool ggml_cublas_loaded(void) {
7618
+ GGML_CALL bool ggml_cublas_loaded(void) {
6803
7619
  return g_cublas_loaded;
6804
7620
  }
6805
7621
 
6806
- void ggml_init_cublas() {
7622
+ GGML_CALL void ggml_init_cublas() {
6807
7623
  static bool initialized = false;
6808
7624
 
6809
7625
  if (!initialized) {
@@ -6856,16 +7672,18 @@ void ggml_init_cublas() {
6856
7672
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
6857
7673
  fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
6858
7674
 
6859
- g_tensor_split[id] = total_vram;
7675
+ g_default_tensor_split[id] = total_vram;
6860
7676
  total_vram += prop.totalGlobalMem;
7677
+
6861
7678
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
6862
7679
  g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
6863
7680
  #else
6864
7681
  g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
6865
7682
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7683
+ g_device_caps[id].smpb = prop.sharedMemPerBlock;
6866
7684
  }
6867
7685
  for (int id = 0; id < g_device_count; ++id) {
6868
- g_tensor_split[id] /= total_vram;
7686
+ g_default_tensor_split[id] /= total_vram;
6869
7687
  }
6870
7688
 
6871
7689
  for (int id = 0; id < g_device_count; ++id) {
@@ -6889,31 +7707,7 @@ void ggml_init_cublas() {
6889
7707
  }
6890
7708
  }
6891
7709
 
6892
- void ggml_cuda_set_tensor_split(const float * tensor_split) {
6893
- if (tensor_split == nullptr) {
6894
- return;
6895
- }
6896
- bool all_zero = true;
6897
- for (int i = 0; i < g_device_count; ++i) {
6898
- if (tensor_split[i] != 0.0f) {
6899
- all_zero = false;
6900
- break;
6901
- }
6902
- }
6903
- if (all_zero) {
6904
- return;
6905
- }
6906
- float split_sum = 0.0f;
6907
- for (int i = 0; i < g_device_count; ++i) {
6908
- g_tensor_split[i] = split_sum;
6909
- split_sum += tensor_split[i];
6910
- }
6911
- for (int i = 0; i < g_device_count; ++i) {
6912
- g_tensor_split[i] /= split_sum;
6913
- }
6914
- }
6915
-
6916
- void * ggml_cuda_host_malloc(size_t size) {
7710
+ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
6917
7711
  if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
6918
7712
  return nullptr;
6919
7713
  }
@@ -6931,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
6931
7725
  return ptr;
6932
7726
  }
6933
7727
 
6934
- void ggml_cuda_host_free(void * ptr) {
7728
+ GGML_CALL void ggml_cuda_host_free(void * ptr) {
6935
7729
  CUDA_CHECK(cudaFreeHost(ptr));
6936
7730
  }
6937
7731
 
@@ -7364,11 +8158,11 @@ static void ggml_cuda_op_mul_mat_q(
7364
8158
  (void) src1_ddf_i;
7365
8159
  }
7366
8160
 
7367
- static int64_t get_row_rounding(ggml_type type) {
8161
+ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
7368
8162
  int64_t min_compute_capability = INT_MAX;
7369
8163
  int64_t max_compute_capability = INT_MIN;
7370
8164
  for (int id = 0; id < g_device_count; ++id) {
7371
- if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8165
+ if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
7372
8166
  if (min_compute_capability > g_device_caps[id].cc) {
7373
8167
  min_compute_capability = g_device_caps[id].cc;
7374
8168
  }
@@ -7396,6 +8190,8 @@ static int64_t get_row_rounding(ggml_type type) {
7396
8190
  case GGML_TYPE_Q4_K:
7397
8191
  case GGML_TYPE_Q5_K:
7398
8192
  case GGML_TYPE_Q6_K:
8193
+ case GGML_TYPE_IQ2_XXS:
8194
+ case GGML_TYPE_IQ2_XS:
7399
8195
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
7400
8196
  default:
7401
8197
  GGML_ASSERT(false);
@@ -7416,6 +8212,8 @@ static int64_t get_row_rounding(ggml_type type) {
7416
8212
  case GGML_TYPE_Q3_K:
7417
8213
  case GGML_TYPE_Q4_K:
7418
8214
  case GGML_TYPE_Q5_K:
8215
+ case GGML_TYPE_IQ2_XXS:
8216
+ case GGML_TYPE_IQ2_XS:
7419
8217
  return max_compute_capability >= CC_VOLTA ? 128 : 64;
7420
8218
  case GGML_TYPE_Q6_K:
7421
8219
  return 64;
@@ -7425,6 +8223,21 @@ static int64_t get_row_rounding(ggml_type type) {
7425
8223
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7426
8224
  }
7427
8225
 
8226
+ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
8227
+ const int64_t nrows = ggml_nrows(tensor);
8228
+ const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
8229
+
8230
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
8231
+ *row_low -= *row_low % rounding;
8232
+
8233
+ if (id == g_device_count - 1) {
8234
+ *row_high = nrows;
8235
+ } else {
8236
+ *row_high = nrows*tensor_split[id + 1];
8237
+ *row_high -= *row_high % rounding;
8238
+ }
8239
+ }
8240
+
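// A numeric sketch of get_row_split above: tensor_split holds cumulative fractions
// (device i owns rows in [split[i], split[i+1]) of the total), and both boundaries are
// rounded down to the type-dependent rounding so every slice starts on a tile boundary.
// Values below are made up for illustration.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows    = 4096;
    const int64_t rounding = 128;                 // e.g. what get_row_rounding might return
    const float   split[3] = {0.0f, 0.6f, 1.0f};  // two devices: 60% / 40%

    for (int id = 0; id < 2; ++id) {
        int64_t row_low  = id == 0 ? 0 : (int64_t)(nrows*split[id]);
        row_low         -= row_low % rounding;
        int64_t row_high = id == 1 ? nrows : (int64_t)(nrows*split[id + 1]);
        if (id != 1) row_high -= row_high % rounding;
        printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
    }
    return 0;
}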
7428
8241
  static void ggml_cuda_op_mul_mat_vec_q(
7429
8242
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7430
8243
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -7466,6 +8279,12 @@ static void ggml_cuda_op_mul_mat_vec_q(
7466
8279
  case GGML_TYPE_Q6_K:
7467
8280
  mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
7468
8281
  break;
8282
+ case GGML_TYPE_IQ2_XXS:
8283
+ mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8284
+ break;
8285
+ case GGML_TYPE_IQ2_XS:
8286
+ mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8287
+ break;
7469
8288
  default:
7470
8289
  GGML_ASSERT(false);
7471
8290
  break;
@@ -7873,7 +8692,21 @@ static void ggml_cuda_op_soft_max(
7873
8692
  float scale = 1.0f;
7874
8693
  memcpy(&scale, dst->op_params, sizeof(float));
7875
8694
 
7876
- soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
8695
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
8696
+ #ifdef GGML_CUDA_F16
8697
+ const bool use_f16_soft_max = true;
8698
+ #else
8699
+ const bool use_f16_soft_max = false;
8700
+ #endif // GGML_CUDA_F16
8701
+ #else
8702
+ const bool use_f16_soft_max = false;
8703
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
8704
+
8705
+ if (use_f16_soft_max) {
8706
+ soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
8707
+ } else {
8708
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
8709
+ }
7877
8710
 
7878
8711
  (void) dst;
7879
8712
  }
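// A compile-time sketch of the path selection above: the f16 soft max is only taken on
// CUDA (non-HIP) builds whose runtime is new enough for the half-precision max
// intrinsics and that were compiled with GGML_CUDA_F16; every other combination keeps
// the f32 kernel. Illustrative restatement, not the authoritative logic.
static constexpr bool soft_max_uses_f16(bool hip_build, bool runtime_has_hmax, bool built_with_f16) {
    return !hip_build && runtime_has_hmax && built_with_f16;
}
static_assert(!soft_max_uses_f16(true,  true,  true ), "HIP builds stay on f32");
static_assert( soft_max_uses_f16(false, true,  true ), "CUDA + new runtime + F16 -> f16");
static_assert(!soft_max_uses_f16(false, false, true ), "old runtime stays on f32");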
@@ -8022,6 +8855,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
8022
8855
  peer_access_enabled = enable_peer_access;
8023
8856
  }
8024
8857
 
8858
+ // FIXME: move this somewhere else
8859
+ struct ggml_backend_cuda_split_buffer_type_context {
8860
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
8861
+ };
8862
+
8025
8863
  static void ggml_cuda_op_mul_mat(
8026
8864
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
8027
8865
  const bool convert_src1_to_q8_1) {
@@ -8073,6 +8911,14 @@ static void ggml_cuda_op_mul_mat(
8073
8911
  GGML_ASSERT(!(split && ne03 > 1));
8074
8912
  GGML_ASSERT(!(split && ne02 < ne12));
8075
8913
 
8914
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
8915
+ if (split) {
8916
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
8917
+ // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
8918
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
8919
+ tensor_split = buft_ctx->tensor_split;
8920
+ }
8921
+
8076
8922
  struct dev_data {
8077
8923
  cuda_pool_alloc<char> src0_dd_alloc;
8078
8924
  cuda_pool_alloc<float> src1_ddf_alloc;
@@ -8100,17 +8946,17 @@ static void ggml_cuda_op_mul_mat(
8100
8946
  // for multi GPU, get the row boundaries from tensor split
8101
8947
  // and round to mul_mat_q tile sizes
8102
8948
  if (split) {
8103
- const int64_t rounding = get_row_rounding(src0->type);
8949
+ const int64_t rounding = get_row_rounding(src0->type, tensor_split);
8104
8950
 
8105
8951
  if (id != 0) {
8106
- dev[id].row_low = ne01*g_tensor_split[id];
8952
+ dev[id].row_low = ne01*tensor_split[id];
8107
8953
  if (dev[id].row_low < ne01) {
8108
8954
  dev[id].row_low -= dev[id].row_low % rounding;
8109
8955
  }
8110
8956
  }
8111
8957
 
8112
8958
  if (id != g_device_count - 1) {
8113
- dev[id].row_high = ne01*g_tensor_split[id + 1];
8959
+ dev[id].row_high = ne01*tensor_split[id + 1];
8114
8960
  if (dev[id].row_high < ne01) {
8115
8961
  dev[id].row_high -= dev[id].row_high % rounding;
8116
8962
  }
@@ -8396,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
8396
9242
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
8397
9243
  }
8398
9244
 
8399
- bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
9245
+ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
8400
9246
  if (!g_cublas_loaded) return false;
8401
9247
 
8402
9248
  const int64_t ne10 = src1->ne[0];
@@ -8656,10 +9502,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
8656
9502
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
8657
9503
 
8658
9504
  int64_t min_compute_capability = INT_MAX;
8659
- for (int id = 0; id < g_device_count; ++id) {
8660
- if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8661
- min_compute_capability = g_device_caps[id].cc;
9505
+
9506
+ if (split) {
9507
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
9508
+ auto & tensor_split = buft_ctx->tensor_split;
9509
+ for (int id = 0; id < g_device_count; ++id) {
9510
+ if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
9511
+ min_compute_capability = g_device_caps[id].cc;
9512
+ }
8662
9513
  }
9514
+ } else {
9515
+ min_compute_capability = g_device_caps[g_main_device].cc;
8663
9516
  }
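// Sketch of the capability choice above: for split tensors the slowest device that
// actually owns a slice (its cumulative split fraction increases at the next entry)
// bounds which kernels can be used for the whole matmul; for non-split tensors only
// the main device's capability matters. Data below is illustrative.
#include <algorithm>
#include <climits>
#include <cstdio>

int main() {
    const int   cc[2]    = {860, 610};            // e.g. an Ampere and a Pascal card
    const float split[3] = {0.0f, 0.7f, 1.0f};    // both devices hold a slice

    int min_cc = INT_MAX;
    for (int id = 0; id < 2; ++id) {
        if (split[id] < split[id + 1]) {          // device id owns at least one row
            min_cc = std::min(min_cc, cc[id]);
        }
    }
    printf("min compute capability across participating devices: %d\n", min_cc); // 610
    return 0;
}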
8664
9517
 
8665
9518
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -8682,6 +9535,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
8682
9535
 
8683
9536
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8684
9537
 
9538
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
9539
+
8685
9540
  // debug helpers
8686
9541
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
8687
9542
  //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -8696,7 +9551,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
8696
9551
  } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
8697
9552
  // KQV single-batch
8698
9553
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
8699
- } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
9554
+ } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
8700
9555
  // KQ + KQV multi-batch
8701
9556
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
8702
9557
  } else if (src0->type == GGML_TYPE_F32) {
@@ -9158,299 +10013,41 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
9158
10013
  return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
9159
10014
  }
9160
10015
 
9161
- void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
9162
- const int64_t nrows = ggml_nrows(tensor);
10016
+ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
10017
+ if (main_device >= g_device_count) {
10018
+ fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
10019
+ main_device, g_device_count, g_main_device);
10020
+ return;
10021
+ }
9163
10022
 
9164
- const int64_t ne0 = tensor->ne[0];
10023
+ if (g_main_device != main_device && g_device_count > 1) {
10024
+ g_main_device = main_device;
10025
+ //cudaDeviceProp prop;
10026
+ //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
10027
+ //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
10028
+ }
10029
+ }
9165
10030
 
9166
- const size_t nb1 = tensor->nb[1];
10031
+ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
10032
+ if (!g_cublas_loaded) return false;
10033
+
10034
+ ggml_cuda_func_t func;
10035
+ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
10036
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
10037
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
9167
10038
 
9168
- ggml_backend_type backend = tensor->backend;
9169
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
9170
- memset(extra, 0, sizeof(*extra));
10039
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
10040
+ return false;
10041
+ }
9171
10042
 
9172
- for (int id = 0; id < g_device_count; ++id) {
9173
- if (backend == GGML_BACKEND_GPU && id != g_main_device) {
9174
- continue;
10043
+ if (tensor->op == GGML_OP_MUL_MAT) {
10044
+ if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
10045
+ #ifndef NDEBUG
10046
+ fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
10047
+ #endif
10048
+ return false;
9175
10049
  }
9176
-
9177
- ggml_cuda_set_device(id);
9178
-
9179
- int64_t row_low, row_high;
9180
- if (backend == GGML_BACKEND_GPU) {
9181
- row_low = 0;
9182
- row_high = nrows;
9183
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
9184
- const int64_t rounding = get_row_rounding(tensor->type);
9185
-
9186
- row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
9187
- row_low -= row_low % rounding;
9188
-
9189
- if (id == g_device_count - 1) {
9190
- row_high = nrows;
9191
- } else {
9192
- row_high = nrows*g_tensor_split[id + 1];
9193
- row_high -= row_high % rounding;
9194
- }
9195
- } else {
9196
- GGML_ASSERT(false);
9197
- }
9198
- if (row_low == row_high) {
9199
- continue;
9200
- }
9201
-
9202
- int64_t nrows_split = row_high - row_low;
9203
-
9204
- const size_t offset_split = row_low*nb1;
9205
- size_t size = ggml_nbytes_split(tensor, nrows_split);
9206
- const size_t original_size = size;
9207
-
9208
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
9209
- if (ne0 % MATRIX_ROW_PADDING != 0) {
9210
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
9211
- }
9212
-
9213
- char * buf;
9214
- CUDA_CHECK(cudaMalloc(&buf, size));
9215
- char * buf_host = (char *)data + offset_split;
9216
-
9217
- // set padding to 0 to avoid possible NaN values
9218
- if (size > original_size) {
9219
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
9220
- }
9221
-
9222
- CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
9223
-
9224
- extra->data_device[id] = buf;
9225
-
9226
- if (backend == GGML_BACKEND_GPU_SPLIT) {
9227
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9228
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
9229
- }
9230
- }
9231
- }
9232
-
9233
- tensor->extra = extra;
9234
- }
9235
-
9236
- void ggml_cuda_free_data(struct ggml_tensor * tensor) {
9237
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
9238
- return;
9239
- }
9240
-
9241
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9242
-
9243
- for (int id = 0; id < g_device_count; ++id) {
9244
- ggml_cuda_set_device(id);
9245
- if (extra->data_device[id] != nullptr) {
9246
- CUDA_CHECK(cudaFree(extra->data_device[id]));
9247
- }
9248
-
9249
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9250
- if (extra->events[id][is] != nullptr) {
9251
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
9252
- }
9253
- }
9254
- }
9255
-
9256
- delete extra;
9257
- }
9258
-
9259
- static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
9260
- static size_t g_temp_tensor_extra_index = 0;
9261
-
9262
- static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
9263
- if (g_temp_tensor_extras == nullptr) {
9264
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
9265
- }
9266
-
9267
- size_t alloc_index = g_temp_tensor_extra_index;
9268
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
9269
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
9270
- memset(extra, 0, sizeof(*extra));
9271
-
9272
- return extra;
9273
- }
9274
-
9275
- static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
9276
- if (scratch && g_scratch_size == 0) {
9277
- return;
9278
- }
9279
-
9280
- tensor->backend = GGML_BACKEND_GPU;
9281
-
9282
- // recursively assign CUDA buffers until a compute tensor is found
9283
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
9284
- const ggml_op src0_op = tensor->src[0]->op;
9285
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
9286
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
9287
- }
9288
- }
9289
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
9290
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
9291
- }
9292
-
9293
- if (scratch && no_alloc) {
9294
- return;
9295
- }
9296
-
9297
- ggml_tensor_extra_gpu * extra;
9298
-
9299
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
9300
- tensor->op == GGML_OP_VIEW ||
9301
- force_inplace;
9302
- const size_t size = ggml_nbytes(tensor);
9303
-
9304
- ggml_cuda_set_device(g_main_device);
9305
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
9306
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
9307
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
9308
- size_t offset = 0;
9309
- if (tensor->op == GGML_OP_VIEW) {
9310
- memcpy(&offset, tensor->op_params, sizeof(size_t));
9311
- }
9312
- extra = ggml_cuda_alloc_temp_tensor_extra();
9313
- extra->data_device[g_main_device] = src0_ddc + offset;
9314
- } else if (tensor->op == GGML_OP_CPY) {
9315
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
9316
- void * src1_ddv = src1_extra->data_device[g_main_device];
9317
- extra = ggml_cuda_alloc_temp_tensor_extra();
9318
- extra->data_device[g_main_device] = src1_ddv;
9319
- } else if (scratch) {
9320
- GGML_ASSERT(size <= g_scratch_size);
9321
- if (g_scratch_offset + size > g_scratch_size) {
9322
- g_scratch_offset = 0;
9323
- }
9324
-
9325
- char * data = (char *) g_scratch_buffer;
9326
- if (data == nullptr) {
9327
- CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
9328
- g_scratch_buffer = data;
9329
- }
9330
- extra = ggml_cuda_alloc_temp_tensor_extra();
9331
- extra->data_device[g_main_device] = data + g_scratch_offset;
9332
-
9333
- g_scratch_offset += size;
9334
-
9335
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
9336
- } else { // allocate new buffers outside of scratch
9337
- void * data;
9338
- CUDA_CHECK(cudaMalloc(&data, size));
9339
- CUDA_CHECK(cudaMemset(data, 0, size));
9340
- extra = new ggml_tensor_extra_gpu;
9341
- memset(extra, 0, sizeof(*extra));
9342
- extra->data_device[g_main_device] = data;
9343
- }
9344
-
9345
- tensor->extra = extra;
9346
- }
9347
-
9348
- void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
9349
- if (g_scratch_size == 0) {
9350
- return;
9351
- }
9352
- if (g_scratch_buffer == nullptr) {
9353
- ggml_cuda_set_device(g_main_device);
9354
- CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
9355
- }
9356
-
9357
- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
9358
-
9359
- const bool inplace = tensor->view_src != nullptr;
9360
-
9361
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
9362
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
9363
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
9364
- size_t view_offset = 0;
9365
- if (tensor->op == GGML_OP_VIEW) {
9366
- memcpy(&view_offset, tensor->op_params, sizeof(size_t));
9367
- }
9368
- extra->data_device[g_main_device] = src0_ddc + view_offset;
9369
- } else {
9370
- extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
9371
- }
9372
-
9373
- tensor->extra = extra;
9374
- }
9375
-
9376
- void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
9377
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9378
- GGML_ASSERT(ggml_is_contiguous(tensor));
9379
-
9380
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9381
- ggml_cuda_set_device(g_main_device);
9382
- CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
9383
- }
9384
-
9385
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
9386
- ggml_cuda_assign_buffers_impl(tensor, true, false, false);
9387
- }
9388
-
9389
- void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
9390
- ggml_cuda_assign_buffers_impl(tensor, true, false, true);
9391
- }
9392
-
9393
- void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
9394
- ggml_cuda_assign_buffers_impl(tensor, false, false, false);
9395
- }
9396
-
9397
- void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
9398
- ggml_cuda_assign_buffers_impl(tensor, false, true, false);
9399
- }
9400
-
9401
- void ggml_cuda_set_main_device(const int main_device) {
9402
- if (main_device >= g_device_count) {
9403
- fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
9404
- main_device, g_device_count, g_main_device);
9405
- return;
9406
- }
9407
-
9408
- if (g_main_device != main_device && g_device_count > 1) {
9409
- g_main_device = main_device;
9410
- cudaDeviceProp prop;
9411
- CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
9412
- fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
9413
- }
9414
- }
9415
-
9416
- void ggml_cuda_set_scratch_size(const size_t scratch_size) {
9417
- // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
9418
- // it still won't always work as expected, but it's better than nothing
9419
- if (scratch_size > g_scratch_size) {
9420
- ggml_cuda_free_scratch();
9421
- }
9422
- g_scratch_size = std::max(g_scratch_size, scratch_size);
9423
- }
9424
-
9425
- void ggml_cuda_free_scratch() {
9426
- if (g_scratch_buffer == nullptr) {
9427
- return;
9428
- }
9429
-
9430
- CUDA_CHECK(cudaFree(g_scratch_buffer));
9431
- g_scratch_buffer = nullptr;
9432
- }
9433
-
9434
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
9435
- if (!g_cublas_loaded) return false;
9436
-
9437
- ggml_cuda_func_t func;
9438
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
9439
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
9440
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
9441
-
9442
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
9443
- return false;
9444
- }
9445
-
9446
- if (tensor->op == GGML_OP_MUL_MAT) {
9447
- if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
9448
- #ifndef NDEBUG
9449
- fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
9450
- #endif
9451
- return false;
9452
- }
9453
- }
10050
+ }
9454
10051
 
9455
10052
  switch (tensor->op) {
9456
10053
  case GGML_OP_REPEAT:
@@ -9589,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
9589
10186
  return true;
9590
10187
  }
9591
10188
 
9592
- int ggml_cuda_get_device_count() {
10189
+ GGML_CALL int ggml_cuda_get_device_count() {
9593
10190
  int device_count;
9594
10191
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
9595
10192
  return 0;
@@ -9597,7 +10194,7 @@ int ggml_cuda_get_device_count() {
9597
10194
  return device_count;
9598
10195
  }
9599
10196
 
9600
- void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
10197
+ GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
9601
10198
  cudaDeviceProp prop;
9602
10199
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
9603
10200
  snprintf(description, description_size, "%s", prop.name);
@@ -9609,21 +10206,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
9609
10206
 
9610
10207
  #define UNUSED GGML_UNUSED
9611
10208
 
10209
+ struct ggml_backend_cuda_context {
10210
+ int device;
10211
+ std::string name;
10212
+ };
10213
+
9612
10214
  // cuda buffer
9613
10215
 
9614
- struct ggml_backend_buffer_context_cuda {
10216
+ struct ggml_backend_cuda_buffer_context {
9615
10217
  int device;
9616
10218
  void * dev_ptr = nullptr;
9617
10219
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
9618
10220
  size_t temp_tensor_extra_index = 0;
10221
+ std::string name;
9619
10222
 
9620
- ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
10223
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
10224
+ device(device), dev_ptr(dev_ptr),
10225
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10226
+ }
9621
10227
 
9622
- ~ggml_backend_buffer_context_cuda() {
10228
+ ~ggml_backend_cuda_buffer_context() {
9623
10229
  delete[] temp_tensor_extras;
9624
10230
  }
9625
10231
 
9626
10232
  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
10233
+ // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
9627
10234
  if (temp_tensor_extras == nullptr) {
9628
10235
  temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
9629
10236
  }
@@ -9637,19 +10244,28 @@ struct ggml_backend_buffer_context_cuda {
9637
10244
  }
9638
10245
  };
9639
10246
 
9640
- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
9641
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10247
+ GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
10248
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10249
+ return ctx->name.c_str();
10250
+ }
10251
+
10252
+ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
10253
+ return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
10254
+ }
10255
+
10256
+ GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10257
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
9642
10258
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
9643
10259
  delete ctx;
9644
10260
  }
9645
10261
 
9646
- static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
9647
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10262
+ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
10263
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
9648
10264
  return ctx->dev_ptr;
9649
10265
  }
9650
10266
 
9651
- static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
9652
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10267
+ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10268
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
9653
10269
 
9654
10270
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
9655
10271
  assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -9678,76 +10294,106 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
9678
10294
  CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
9679
10295
  }
9680
10296
  }
9681
-
9682
- UNUSED(buffer);
9683
10297
  }
9684
10298
 
9685
- static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10299
+ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
9686
10300
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9687
10301
 
9688
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10302
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
9689
10303
 
9690
10304
  ggml_cuda_set_device(ctx->device);
9691
10305
  CUDA_CHECK(cudaDeviceSynchronize());
9692
-
9693
10306
  CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
10307
+ CUDA_CHECK(cudaDeviceSynchronize());
9694
10308
  }
9695
10309
 
9696
- static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10310
+ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
9697
10311
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
9698
10312
 
9699
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10313
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
9700
10314
 
9701
10315
  ggml_cuda_set_device(ctx->device);
9702
10316
  CUDA_CHECK(cudaDeviceSynchronize());
9703
-
9704
10317
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
10318
+ CUDA_CHECK(cudaDeviceSynchronize());
9705
10319
  }
9706
10320
 
9707
- static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
9708
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10321
+ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10322
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
10323
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10324
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10325
+
10326
+ ggml_cuda_set_device(src_ctx->device);
10327
+ CUDA_CHECK(cudaDeviceSynchronize());
10328
+ ggml_cuda_set_device(dst_ctx->device);
10329
+ CUDA_CHECK(cudaDeviceSynchronize());
10330
+ CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
10331
+ CUDA_CHECK(cudaDeviceSynchronize());
10332
+
10333
+ return true;
10334
+ }
10335
+ return false;
10336
+ }
10337
+
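// A minimal sketch of the same-backend copy path added above: when both buffers live on
// CUDA devices, the data is moved with a plain device-to-device cudaMemcpy after
// synchronizing both devices. Pointers and sizes below are placeholders.
#include <cuda_runtime.h>
#include <cstddef>

static bool copy_d2d(void * dst, int dst_device, const void * src, int src_device, size_t nbytes) {
    if (cudaSetDevice(src_device) != cudaSuccess) return false;
    cudaDeviceSynchronize();                       // make sure pending writes to src landed
    if (cudaSetDevice(dst_device) != cudaSuccess) return false;
    cudaDeviceSynchronize();
    if (cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) != cudaSuccess) return false;
    cudaDeviceSynchronize();                       // the buffer API here is synchronous
    return true;
}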
10338
+ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10339
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
9709
10340
 
9710
10341
  ggml_cuda_set_device(ctx->device);
9711
10342
  CUDA_CHECK(cudaDeviceSynchronize());
9712
-
9713
10343
  CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
10344
+ CUDA_CHECK(cudaDeviceSynchronize());
9714
10345
  }
9715
10346
 
9716
- static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
10347
+ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
10348
+ /* .get_name = */ ggml_backend_cuda_buffer_get_name,
9717
10349
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
9718
10350
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
9719
10351
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
9720
10352
  /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
9721
10353
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
9722
- /* .cpy_tensor_from = */ NULL,
9723
- /* .cpy_tensor_to = */ NULL,
10354
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
9724
10355
  /* .clear = */ ggml_backend_cuda_buffer_clear,
10356
+ /* .reset = */ NULL,
9725
10357
  };
9726
10358
 
9727
10359
  // cuda buffer type
10360
+ struct ggml_backend_cuda_buffer_type_context {
10361
+ int device;
10362
+ std::string name;
10363
+ };
9728
10364
 
9729
- static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
9730
- int device = (int) (intptr_t) buft->context;
10365
+ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
10366
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
9731
10367
 
9732
- ggml_cuda_set_device(device);
10368
+ return ctx->name.c_str();
10369
+ }
10370
+
10371
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10372
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10373
+
10374
+ ggml_cuda_set_device(buft_ctx->device);
9733
10375
 
9734
10376
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
9735
10377
 
9736
10378
  void * dev_ptr;
9737
- CUDA_CHECK(cudaMalloc(&dev_ptr, size));
10379
+ cudaError_t err = cudaMalloc(&dev_ptr, size);
10380
+ if (err != cudaSuccess) {
10381
+ fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
10382
+ return nullptr;
10383
+ }
9738
10384
 
9739
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
10385
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
9740
10386
 
9741
- return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
10387
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
9742
10388
  }
9743
10389
 
9744
- static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10390
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
9745
10391
  return 128;
9746
10392
 
9747
10393
  UNUSED(buft);
9748
10394
  }
9749
10395
 
9750
- static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
10396
+ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
9751
10397
  int64_t row_low = 0;
9752
10398
  int64_t row_high = ggml_nrows(tensor);
9753
10399
  int64_t nrows_split = row_high - row_low;
@@ -9767,22 +10413,33 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
9767
10413
  UNUSED(buft);
9768
10414
  }
9769
10415
 
9770
- static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
9771
- return ggml_backend_is_cuda(backend);
10416
+ GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10417
+ if (!ggml_backend_is_cuda(backend)) {
10418
+ return false;
10419
+ }
9772
10420
 
9773
- UNUSED(buft);
10421
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10422
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10423
+
10424
+ return buft_ctx->device == cuda_ctx->device;
9774
10425
  }
9775
10426
 
9776
10427
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
10428
+ /* .get_name = */ ggml_backend_cuda_buffer_type_name,
9777
10429
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
9778
10430
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
9779
10431
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
9780
10432
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
9781
- /* .is_host = */ nullptr,
10433
+ /* .is_host = */ NULL,
9782
10434
  };
9783
10435
 
9784
- ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9785
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
10436
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10437
+ // FIXME: this is not thread safe
10438
+ if (device >= ggml_backend_cuda_get_device_count()) {
10439
+ return nullptr;
10440
+ }
10441
+
10442
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
9786
10443
 
9787
10444
  static bool ggml_backend_cuda_buffer_type_initialized = false;
9788
10445
 
@@ -9790,7 +10447,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9790
10447
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
9791
10448
  ggml_backend_cuda_buffer_types[i] = {
9792
10449
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
9793
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
10450
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
9794
10451
  };
9795
10452
  }
9796
10453
  ggml_backend_cuda_buffer_type_initialized = true;
@@ -9799,13 +10456,311 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9799
10456
  return &ggml_backend_cuda_buffer_types[device];
9800
10457
  }
9801
10458
 
10459
+ // cuda split buffer
10460
+
10461
+ struct ggml_backend_cuda_split_buffer_context {
10462
+ ~ggml_backend_cuda_split_buffer_context() {
10463
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
10464
+ for (int id = 0; id < g_device_count; ++id) {
10465
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
10466
+ if (extra->events[id][is] != nullptr) {
10467
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
10468
+ }
10469
+ }
10470
+ if (extra->data_device[id] != nullptr) {
10471
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
10472
+ }
10473
+ }
10474
+ delete extra;
10475
+ }
10476
+ }
10477
+
10478
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
10479
+ };
10480
+
10481
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
10482
+ return GGML_CUDA_NAME "_Split";
10483
+
10484
+ UNUSED(buffer);
10485
+ }
10486
+
10487
+ // unused at the moment
10488
+ //static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
10489
+ // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
10490
+ //}
10491
+
10492
+ GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10493
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10494
+ delete ctx;
10495
+ }
10496
+
10497
+ GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
10498
+ // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
10499
+ return (void *)0x1000;
10500
+
10501
+ UNUSED(buffer);
10502
+ }
10503
+
10504
+ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10505
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
10506
+
10507
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10508
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10509
+
10510
+ const int64_t ne0 = tensor->ne[0];
10511
+
10512
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
10513
+
10514
+ ctx->tensor_extras.push_back(extra);
10515
+
10516
+ for (int id = 0; id < g_device_count; ++id) {
10517
+ int64_t row_low, row_high;
10518
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10519
+
10520
+ int64_t nrows_split = row_high - row_low;
10521
+ if (nrows_split == 0) {
10522
+ continue;
10523
+ }
10524
+
10525
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10526
+ const size_t original_size = size;
10527
+
10528
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10529
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10530
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10531
+ }
10532
+
10533
+ // FIXME: do not crash if cudaMalloc fails
10534
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
10535
+ ggml_cuda_set_device(id);
10536
+ char * buf;
10537
+ CUDA_CHECK(cudaMalloc(&buf, size));
10538
+
10539
+ // set padding to 0 to avoid possible NaN values
10540
+ if (size > original_size) {
10541
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
10542
+ }
10543
+
10544
+ extra->data_device[id] = buf;
10545
+
10546
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
10547
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
10548
+ }
10549
+ }
10550
+ tensor->backend = GGML_BACKEND_GPU_SPLIT;
10551
+ tensor->extra = extra;
10552
+ }
10553
+
10554
+ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10555
+ // split tensors must always be set in their entirety at once
10556
+ GGML_ASSERT(offset == 0);
10557
+ GGML_ASSERT(size == ggml_nbytes(tensor));
10558
+
10559
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10560
+
10561
+ const int64_t ne0 = tensor->ne[0];
10562
+ const size_t nb1 = tensor->nb[1];
10563
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
10564
+
10565
+ for (int id = 0; id < g_device_count; ++id) {
10566
+ int64_t row_low, row_high;
10567
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10568
+
10569
+ int64_t nrows_split = row_high - row_low;
10570
+ if (nrows_split == 0) {
10571
+ continue;
10572
+ }
10573
+
10574
+ const size_t offset_split = row_low*nb1;
10575
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10576
+ const size_t original_size = size;
10577
+
10578
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10579
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10580
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10581
+ }
10582
+
10583
+ const char * buf_host = (const char *)data + offset_split;
10584
+ CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
10585
+ }
10586
+ }
10587
+
10588
+ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10589
+ // split tensors must always be set in their entirety at once
10590
+ GGML_ASSERT(offset == 0);
10591
+ GGML_ASSERT(size == ggml_nbytes(tensor));
10592
+
10593
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10594
+
10595
+ const int64_t ne0 = tensor->ne[0];
10596
+ const size_t nb1 = tensor->nb[1];
10597
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
10598
+
10599
+ for (int id = 0; id < g_device_count; ++id) {
10600
+ int64_t row_low, row_high;
10601
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10602
+
10603
+ int64_t nrows_split = row_high - row_low;
10604
+ if (nrows_split == 0) {
10605
+ continue;
10606
+ }
10607
+
10608
+ const size_t offset_split = row_low*nb1;
10609
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10610
+ const size_t original_size = size;
10611
+
10612
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10613
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10614
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10615
+ }
10616
+
10617
+ char * buf_host = (char *)data + offset_split;
10618
+ CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
10619
+ }
10620
+ }
10621
+
10622
+ GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10623
+ UNUSED(buffer);
10624
+ UNUSED(value);
10625
+ }
10626
+
10627
+ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
10628
+ /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
10629
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
10630
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
10631
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
10632
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
10633
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
10634
+ /* .cpy_tensor = */ NULL,
10635
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
10636
+ /* .reset = */ NULL,
10637
+ };
10638
+
10639
+ // cuda split buffer type
10640
+
10641
+ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
10642
+ return GGML_CUDA_NAME "_Split";
10643
+
10644
+ UNUSED(buft);
10645
+ }
10646
+
10647
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10648
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
10649
+ // instead, we allocate them for each tensor separately in init_tensor
10650
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
10651
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
10652
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
10653
+
10654
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
10655
+ }
10656
+
10657
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10658
+ return 128;
10659
+
10660
+ UNUSED(buft);
10661
+ }
10662
+
10663
+ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10664
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
10665
+
10666
+ size_t total_size = 0;
10667
+
10668
+ const int64_t ne0 = tensor->ne[0];
10669
+
10670
+ for (int id = 0; id < g_device_count; ++id) {
10671
+ int64_t row_low, row_high;
10672
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
10673
+
10674
+ int64_t nrows_split = row_high - row_low;
10675
+ if (nrows_split == 0) {
10676
+ continue;
10677
+ }
10678
+
10679
+ total_size += ggml_nbytes_split(tensor, nrows_split);
10680
+
10681
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10682
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10683
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10684
+ }
10685
+ }
10686
+
10687
+ return total_size;
10688
+ }
10689
+
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+     return ggml_backend_is_cuda(backend);
+
+     UNUSED(buft);
+ }
+
+ GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+     return false;
+
+     UNUSED(buft);
+ }
+
+ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
+     /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
+     /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
+     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
+     /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
+     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
+ };
+
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+     // FIXME: this is not thread safe
+     static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+
+     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
+
+     bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
+     if (all_zero) {
+         tensor_split_arr = g_default_tensor_split;
+     } else {
+         float split_sum = 0.0f;
+         for (int i = 0; i < g_device_count; ++i) {
+             tensor_split_arr[i] = split_sum;
+             split_sum += tensor_split[i];
+         }
+         for (int i = 0; i < g_device_count; ++i) {
+             tensor_split_arr[i] /= split_sum;
+         }
+     }
+
+     auto it = buft_map.find(tensor_split_arr);
+     if (it != buft_map.end()) {
+         return &it->second;
+     }
+
+     struct ggml_backend_buffer_type buft {
+         /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
+         /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
+     };
+
+     auto result = buft_map.emplace(tensor_split_arr, buft);
+     return &result.first->second;
+ }
+
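For reference, the normalization above converts the caller's per-device proportions into cumulative start fractions before caching the buffer type in buft_map. A standalone sketch with made-up inputs (two devices, a 3:1 split):

    // Standalone sketch of the tensor_split normalization above.
    // Device count and split values are example assumptions.
    #include <array>
    #include <cstdio>

    int main() {
        const int   n_devices       = 2;
        const float tensor_split[2] = {3.0f, 1.0f}; // e.g. a 3:1 VRAM ratio

        std::array<float, 2> cumulative = {};
        float split_sum = 0.0f;
        for (int i = 0; i < n_devices; ++i) {
            cumulative[i] = split_sum;   // start fraction of device i (pre-normalization)
            split_sum += tensor_split[i];
        }
        for (int i = 0; i < n_devices; ++i) {
            cumulative[i] /= split_sum;
        }
        // prints 0.00 and 0.75: device 0 owns rows [0, 0.75), device 1 owns [0.75, 1.0)
        for (int i = 0; i < n_devices; ++i) {
            printf("device %d starts at fraction %.2f\n", i, cumulative[i]);
        }
        return 0;
    }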
  // host buffer type

- static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+     return GGML_CUDA_NAME "_Host";
+
+     UNUSED(buft);
+ }
+
+ GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+     return GGML_CUDA_NAME "_Host";
+
+     UNUSED(buffer);
+ }
+
+ GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_cuda_host_free(buffer->context);
  }

- static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
      void * ptr = ggml_cuda_host_malloc(size);

      if (ptr == nullptr) {
@@ -9813,17 +10768,18 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
          return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
      }

-     // FIXME: this is a hack to avoid having to implement a new buffer type
      ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
      buffer->buft = buft;
+     buffer->iface.get_name    = ggml_backend_cuda_host_buffer_name;
      buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

      return buffer;
  }

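For reference, a hedged usage sketch of the pinned host buffer type above, assuming the public ggml-backend API of this version (ggml_backend_buft_alloc_buffer, ggml_backend_buffer_get_base, ggml_backend_buffer_free); note that when ggml_cuda_host_malloc fails, the code above silently falls back to a plain CPU buffer:

    // Hedged usage sketch: allocate page-locked host memory through the CUDA
    // host buffer type, touch it, and free it. API names are assumed to match
    // the ggml-backend/ggml-cuda headers shipped with this version.
    #include <cstring>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        const size_t size = 16 * 1024 * 1024; // 16 MiB staging area (arbitrary)

        ggml_backend_buffer_type_t buft = ggml_backend_cuda_host_buffer_type();
        ggml_backend_buffer_t buffer    = ggml_backend_buft_alloc_buffer(buft, size);

        // base pointer is pinned if ggml_cuda_host_malloc succeeded, otherwise
        // it is ordinary CPU memory (see the fallback above)
        void * base = ggml_backend_buffer_get_base(buffer);
        memset(base, 0, size);

        ggml_backend_buffer_free(buffer);
        return 0;
    }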
- ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
      static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
          /* .iface = */ {
+             /* .get_name       = */ ggml_backend_cuda_host_buffer_type_name,
              /* .alloc_buffer   = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
              /* .get_alignment  = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
              /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -9838,31 +10794,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {

  // backend

- struct ggml_backend_context_cuda {
-     int device;
- };
-
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-     return GGML_CUDA_NAME;
+ GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

-     UNUSED(backend);
+     return cuda_ctx->name.c_str();
  }

- static void ggml_backend_cuda_free(ggml_backend_t backend) {
-     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

      delete cuda_ctx;
      delete backend;
  }

- static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

      return ggml_backend_cuda_buffer_type(cuda_ctx->device);
  }

- static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

      GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
      GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -9870,8 +10822,8 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
      CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
  }

- static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

      GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
      GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -9879,39 +10831,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
      CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
  }

- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-
-     UNUSED(backend);
- }
-
- static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
-     GGML_ASSERT(!"not implemented");
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

-     return nullptr;
+     if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
+         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
+         return true;
+     }

-     UNUSED(backend);
-     UNUSED(cgraph);
+     return false;
  }

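For reference, cpy_tensor_async above only claims the copy when the destination is this device's CUDA buffer and the source is a CUDA buffer; otherwise it returns false so the caller can fall back to a synchronous path. A standalone CUDA sketch of the underlying pattern, an asynchronous device-to-device memcpy queued on a stream (error checking written out here instead of the CUDA_CHECK macro):

    // Minimal standalone illustration of an async device-to-device copy on a
    // stream, the same CUDA primitive used by ggml_backend_cuda_cpy_tensor_async.
    #include <cstdio>
    #include <cuda_runtime.h>

    #define CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); return 1; } } while (0)

    int main() {
        const size_t n_bytes = 1024 * 1024;
        void * src = nullptr;
        void * dst = nullptr;
        cudaStream_t stream;

        CHECK(cudaMalloc(&src, n_bytes));
        CHECK(cudaMalloc(&dst, n_bytes));
        CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        // queue the copy; it can overlap with work on other streams
        CHECK(cudaMemcpyAsync(dst, src, n_bytes, cudaMemcpyDeviceToDevice, stream));

        // the copy is only guaranteed complete after the stream is synchronized
        CHECK(cudaStreamSynchronize(stream));

        CHECK(cudaStreamDestroy(stream));
        CHECK(cudaFree(src));
        CHECK(cudaFree(dst));
        return 0;
    }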
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-     GGML_ASSERT(!"not implemented");
+ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

-     UNUSED(backend);
-     UNUSED(plan);
- }
-
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-     GGML_ASSERT(!"not implemented");
+     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));

      UNUSED(backend);
-     UNUSED(plan);
  }

- static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

      ggml_cuda_set_main_device(cuda_ctx->device);

@@ -9921,55 +10861,35 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
      for (int i = 0; i < cgraph->n_nodes; i++) {
          ggml_tensor * node = cgraph->nodes[i];

-         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
              continue;
+         }

-         assert(node->backend == GGML_BACKEND_GPU);
+ #ifndef NDEBUG
+         assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
          assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
          assert(node->extra != nullptr);

          for (int j = 0; j < GGML_MAX_SRC; j++) {
              if (node->src[j] != nullptr) {
-                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                 assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
                  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
                  assert(node->src[j]->extra != nullptr);
              }
          }
+ #endif

          bool ok = ggml_cuda_compute_forward(&params, node);
          if (!ok) {
              fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
          }
          GGML_ASSERT(ok);
-
- #if 0
-         if (node->type == GGML_TYPE_F32) {
-             cudaDeviceSynchronize();
-             std::vector<float> tmp(ggml_nelements(node), 0.0f);
-             cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-             printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                 ggml_type_name(node->src[0]->type),
-                 node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                 node->src[0]->name,
-                 node->src[1] ? node->src[1]->name : "none");
-             double sum = 0.0;
-             double sq_sum = 0.0;
-             for (int i = 0; i < ggml_nelements(node); i++) {
-                 printf("%f ", tmp[i]);
-                 sum += tmp[i];
-                 sq_sum += tmp[i]*tmp[i];
-             }
-             printf("\n");
-             printf("sum: %f, ", sum);
-             printf("sq_sum: %f\n", sq_sum);
-         }
- #endif
      }

-     UNUSED(backend);
+     return true;
  }

- static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
      switch (op->op) {
          case GGML_OP_UNARY:
              switch (ggml_get_unary_op(op)) {
@@ -10080,23 +11000,22 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
      UNUSED(backend);
  }

- static ggml_backend_i cuda_backend_i = {
+ static ggml_backend_i ggml_backend_cuda_interface = {
      /* .get_name                = */ ggml_backend_cuda_name,
      /* .free                    = */ ggml_backend_cuda_free,
      /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
      /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
      /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-     /* .cpy_tensor_from_async   = */ NULL,
-     /* .cpy_tensor_to_async     = */ NULL,
+     /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
      /* .synchronize             = */ ggml_backend_cuda_synchronize,
-     /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
-     /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
-     /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+     /* .graph_plan_create       = */ NULL,
+     /* .graph_plan_free         = */ NULL,
+     /* .graph_plan_compute      = */ NULL,
      /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
      /* .supports_op             = */ ggml_backend_cuda_supports_op,
  };

- ggml_backend_t ggml_backend_cuda_init(int device) {
+ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
      ggml_init_cublas(); // TODO: remove from ggml.c

      if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -10107,32 +11026,48 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
      // not strictly necessary, but it may reduce the overhead of the first graph_compute
      ggml_cuda_set_main_device(device);

-     ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
-         /* .device = */ device
+     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
+         /* .device = */ device,
+         /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
      };

      ggml_backend_t cuda_backend = new ggml_backend {
-         /* .interface = */ cuda_backend_i,
+         /* .interface = */ ggml_backend_cuda_interface,
          /* .context   = */ ctx
      };

      return cuda_backend;
  }

- bool ggml_backend_is_cuda(ggml_backend_t backend) {
-     return backend->iface.get_name == ggml_backend_cuda_name;
+ GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
+     return backend && backend->iface.get_name == ggml_backend_cuda_name;
+ }
+
+ GGML_CALL int ggml_backend_cuda_get_device_count() {
+     return ggml_cuda_get_device_count();
+ }
+
+ GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+     ggml_cuda_get_device_description(device, description, description_size);
+ }
+
+ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+     ggml_cuda_set_device(device);
+
+     CUDA_CHECK(cudaMemGetInfo(free, total));
  }

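For reference, a hedged usage sketch of the public entry points added above, assuming the declarations in ggml-cuda.h and ggml_backend_free from ggml-backend.h:

    // Hedged usage sketch: enumerate CUDA devices via the new query functions,
    // then create and destroy a backend on device 0.
    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        const int n_devices = ggml_backend_cuda_get_device_count();

        for (int i = 0; i < n_devices; ++i) {
            char   description[128];
            size_t free_mem  = 0;
            size_t total_mem = 0;

            ggml_backend_cuda_get_device_description(i, description, sizeof(description));
            ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);

            printf("device %d: %s, %zu/%zu MiB free\n",
                   i, description, free_mem / (1024 * 1024), total_mem / (1024 * 1024));
        }

        ggml_backend_t backend = ggml_backend_cuda_init(0);
        if (backend != nullptr) {
            ggml_backend_free(backend);
        }
        return 0;
    }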
- static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+ // backend registry
+ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
      ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
      return cuda_backend;

      UNUSED(params);
  }

- extern "C" int ggml_backend_cuda_reg_devices();
+ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();

- int ggml_backend_cuda_reg_devices() {
+ GGML_CALL int ggml_backend_cuda_reg_devices() {
      int device_count = ggml_cuda_get_device_count();
      //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
      for (int i = 0; i < device_count; i++) {