llama_cpp 0.13.0 → 0.14.1

@@ -2,6 +2,15 @@
  #include "ggml.h"
  #include "ggml-backend-impl.h"

+ #if defined(GGML_USE_HIPBLAS)
+ #define GGML_COMMON_DECL_HIP
+ #define GGML_COMMON_IMPL_HIP
+ #else
+ #define GGML_COMMON_DECL_CUDA
+ #define GGML_COMMON_IMPL_CUDA
+ #endif
+ #include "ggml-common.h"
+
  #include <algorithm>
  #include <assert.h>
  #include <atomic>
@@ -63,6 +72,7 @@
  #define cudaEventCreateWithFlags hipEventCreateWithFlags
  #define cudaEventDisableTiming hipEventDisableTiming
  #define cudaEventRecord hipEventRecord
+ #define cudaEventSynchronize hipEventSynchronize
  #define cudaEvent_t hipEvent_t
  #define cudaEventDestroy hipEventDestroy
  #define cudaFree hipFree
@@ -72,6 +82,7 @@
  #define cudaGetDeviceProperties hipGetDeviceProperties
  #define cudaGetErrorString hipGetErrorString
  #define cudaGetLastError hipGetLastError
+ #define cudaLaunchHostFunc hipLaunchHostFunc
  #ifdef GGML_HIP_UMA
  #define cudaMalloc hipMallocManaged
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -95,6 +106,7 @@
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
  #define cudaStreamFireAndForget hipStreamFireAndForget
  #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamPerThread hipStreamPerThread
  #define cudaStreamSynchronize hipStreamSynchronize
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
@@ -172,6 +184,7 @@
  #endif

  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
@@ -196,6 +209,18 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
  return __vsubss4(a, b);
  }

+ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+ unsigned int c;
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+ #pragma unroll
+ for (int i = 0; i < 4; ++i) {
+ vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+ }
+ return c;
+ }
+
  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
  c = __builtin_amdgcn_sdot4(a, b, c, false);
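The `__vcmpeq4` added above gives HIP builds the per-byte SIMD comparison that CUDA exposes as a built-in. A minimal host-side sketch of the same semantics (plain C++; `vcmpeq4_ref` is a hypothetical name used only for illustration, not part of the diff):

#include <cstdint>
#include <cstdio>

// Reference byte-wise compare: each result byte is 0xff where the corresponding
// bytes of a and b are equal, 0x00 where they differ.
static uint32_t vcmpeq4_ref(uint32_t a, uint32_t b) {
    uint32_t c = 0;
    for (int i = 0; i < 4; ++i) {
        const uint32_t ba = (a >> (8*i)) & 0xff;
        const uint32_t bb = (b >> (8*i)) & 0xff;
        c |= (ba == bb ? 0xffu : 0x00u) << (8*i);
    }
    return c;
}

int main() {
    printf("%08x\n", vcmpeq4_ref(0x01020304u, 0x01ff0304u)); // prints ff00ffff
    return 0;
}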
@@ -343,66 +368,6 @@ typedef void (*ggml_cuda_op_flatten_t)(
343
368
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
344
369
  const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
345
370
 
346
- // QK = number of values after dequantization
347
- // QR = QK / number of values before dequantization
348
- // QI = number of 32 bit integers before dequantization
349
-
350
- #define QK4_0 32
351
- #define QR4_0 2
352
- #define QI4_0 (QK4_0 / (4 * QR4_0))
353
- typedef struct {
354
- half d; // delta
355
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
356
- } block_q4_0;
357
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
358
-
359
- #define QK4_1 32
360
- #define QR4_1 2
361
- #define QI4_1 (QK4_1 / (4 * QR4_1))
362
- typedef struct {
363
- half2 dm; // dm.x = delta, dm.y = min
364
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
365
- } block_q4_1;
366
- static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
367
-
368
- #define QK5_0 32
369
- #define QR5_0 2
370
- #define QI5_0 (QK5_0 / (4 * QR5_0))
371
- typedef struct {
372
- half d; // delta
373
- uint8_t qh[4]; // 5-th bit of quants
374
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
375
- } block_q5_0;
376
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
377
-
378
- #define QK5_1 32
379
- #define QR5_1 2
380
- #define QI5_1 (QK5_1 / (4 * QR5_1))
381
- typedef struct {
382
- half2 dm; // dm.x = delta, dm.y = min
383
- uint8_t qh[4]; // 5-th bit of quants
384
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
385
- } block_q5_1;
386
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
387
-
388
- #define QK8_0 32
389
- #define QR8_0 1
390
- #define QI8_0 (QK8_0 / (4 * QR8_0))
391
- typedef struct {
392
- half d; // delta
393
- int8_t qs[QK8_0]; // quants
394
- } block_q8_0;
395
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
396
-
397
- #define QK8_1 32
398
- #define QR8_1 1
399
- #define QI8_1 (QK8_1 / (4 * QR8_1))
400
- typedef struct {
401
- half2 ds; // ds.x = delta, ds.y = sum
402
- int8_t qs[QK8_0]; // quants
403
- } block_q8_1;
404
- static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
405
-
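The block layouts deleted above are not dropped from the build: the new `#include "ggml-common.h"` at the top of the file presumably supplies them now. For reference, the QK/QR/QI bookkeeping from the removed header comment works out as follows for q4_0:

// QK4_0 = 32            -> 32 dequantized values per block
// QR4_0 = 2              -> each quantized byte packs 2 values (4-bit nibbles)
// QI4_0 = 32/(4*2) = 4   -> the quant data spans 4 32-bit integers (16 bytes)
// so sizeof(block_q4_0) = sizeof(ggml_fp16_t) + QK4_0/2 = 2 + 16 = 18 bytes,
// which is exactly what the removed static_assert checked.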
406
371
  typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
407
372
  typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
408
373
  typedef void (*load_tiles_cuda_t)(
@@ -412,130 +377,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
412
377
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
413
378
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
414
379
 
415
- //================================= k-quants
416
-
417
- #ifdef GGML_QKK_64
418
- #define QK_K 64
419
- #define K_SCALE_SIZE 4
420
- #else
421
- #define QK_K 256
422
- #define K_SCALE_SIZE 12
423
- #endif
424
-
425
- #define QR2_K 4
426
- #define QI2_K (QK_K / (4*QR2_K))
427
- typedef struct {
428
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
429
- uint8_t qs[QK_K/4]; // quants
430
- half2 dm; // super-block scale for quantized scales/mins
431
- } block_q2_K;
432
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
433
-
434
- #define QR3_K 4
435
- #define QI3_K (QK_K / (4*QR3_K))
436
- typedef struct {
437
- uint8_t hmask[QK_K/8]; // quants - high bit
438
- uint8_t qs[QK_K/4]; // quants - low 2 bits
439
- #ifdef GGML_QKK_64
440
- uint8_t scales[2]; // scales, quantized with 8 bits
441
- #else
442
- uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
443
- #endif
444
- half d; // super-block scale
445
- } block_q3_K;
446
- //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
447
-
448
- #define QR4_K 2
449
- #define QI4_K (QK_K / (4*QR4_K))
450
- #ifdef GGML_QKK_64
451
- typedef struct {
452
- half dm[2]; // super-block scales/mins
453
- uint8_t scales[2]; // 4-bit block scales/mins
454
- uint8_t qs[QK_K/2]; // 4--bit quants
455
- } block_q4_K;
456
- static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
457
- #else
458
- typedef struct {
459
- half2 dm; // super-block scale for quantized scales/mins
460
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
461
- uint8_t qs[QK_K/2]; // 4--bit quants
462
- } block_q4_K;
463
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
464
- #endif
465
-
466
- #define QR5_K 2
467
- #define QI5_K (QK_K / (4*QR5_K))
468
- #ifdef GGML_QKK_64
469
- typedef struct {
470
- half d; // super-block scale
471
- int8_t scales[QK_K/16]; // block scales
472
- uint8_t qh[QK_K/8]; // quants, high bit
473
- uint8_t qs[QK_K/2]; // quants, low 4 bits
474
- } block_q5_K;
475
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
476
- #else
477
- typedef struct {
478
- half2 dm; // super-block scale for quantized scales/mins
479
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
480
- uint8_t qh[QK_K/8]; // quants, high bit
481
- uint8_t qs[QK_K/2]; // quants, low 4 bits
482
- } block_q5_K;
483
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
484
- #endif
485
-
486
- #define QR6_K 2
487
- #define QI6_K (QK_K / (4*QR6_K))
488
- typedef struct {
489
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
490
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
491
- int8_t scales[QK_K/16]; // scales
492
- half d; // delta
493
- } block_q6_K;
494
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
495
-
496
- #define QR2_XXS 8
497
- #define QI2_XXS (QK_K / (4*QR2_XXS))
498
- typedef struct {
499
- half d;
500
- uint16_t qs[QK_K/8];
501
- } block_iq2_xxs;
502
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
503
-
504
- #define QR2_XS 8
505
- #define QI2_XS (QK_K / (4*QR2_XS))
506
- typedef struct {
507
- half d;
508
- uint16_t qs[QK_K/8];
509
- uint8_t scales[QK_K/32];
510
- } block_iq2_xs;
511
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
512
-
513
- #define QR3_XXS 8
514
- #define QI3_XXS (QK_K / (4*QR3_XXS))
515
- typedef struct {
516
- half d;
517
- uint8_t qs[3*(QK_K/8)];
518
- } block_iq3_xxs;
519
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
520
-
521
- #define QR1_S 8
522
- #define QI1_S (QK_K / (4*QR1_S))
523
- typedef struct {
524
- half d;
525
- uint8_t qs[QK_K/8];
526
- uint8_t scales[QK_K/16];
527
- } block_iq1_s;
528
- static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
529
-
530
- #define QK4_NL 32
531
- #define QR4_NL 2
532
- #define QI4_NL (QK4_NL / (4*QR4_NL))
533
- typedef struct {
534
- half d;
535
- uint8_t qs[QK4_NL/2];
536
- } block_iq4_nl;
537
- static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
538
-
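Likewise for the removed k-quant and IQ structures, which are also expected to come from ggml-common.h now. As a size check on the super-block layout, take q4_K with QK_K = 256:

// sizeof(block_q4_K) = 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2
//                    = 4 + 12 + 128 = 144 bytes per 256 values,
// i.e. 4.5 bits per weight, matching the removed static_assert.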
539
380
  #define WARP_SIZE 32
540
381
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
541
382
 
@@ -559,6 +400,8 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
  #define CUDA_UPSCALE_BLOCK_SIZE 256
  #define CUDA_CONCAT_BLOCK_SIZE 256
  #define CUDA_PAD_BLOCK_SIZE 256
+ #define CUDA_ARANGE_BLOCK_SIZE 256
+ #define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
  #define CUDA_ACC_BLOCK_SIZE 256
  #define CUDA_IM2COL_BLOCK_SIZE 256
  #define CUDA_POOL2D_BLOCK_SIZE 256
@@ -661,18 +504,20 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  return a;
  }

- //static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
- //#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
- //#pragma unroll
- // for (int mask = 16; mask > 0; mask >>= 1) {
- // a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
- // }
- // return a;
- //#else
- // (void) a;
- // NO_DEVICE_CODE;
- //#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
- //}
+ #ifdef GGML_CUDA_F16
+ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+ }
+ return a;
+ #else
+ (void) a;
+ NO_DEVICE_CODE;
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ }
+ #endif // GGML_CUDA_F16

  static __device__ __forceinline__ float warp_reduce_max(float x) {
  #pragma unroll
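The previously commented-out `warp_reduce_sum(half2)` is re-enabled above, now guarded by `GGML_CUDA_F16`. The xor-shuffle it uses is the standard butterfly reduction; a sketch of the same pattern for a plain `float` (mirroring the float overload already in this file, shown only to illustrate the idea):

static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        // lanes i and i^mask exchange values and add, halving the stride each step
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x; // after 5 steps every lane holds the sum over all 32 lanes
}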
@@ -931,17 +776,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
931
776
  nidx +
932
777
  blockIdx.y * ne0 +
933
778
  blockIdx.z * ne0 * gridDim.y;
934
- dst[offset_dst] = x[offset_src];
779
+ dst[offset_dst] = x[offset_src];
935
780
  } else {
936
781
  int offset_src =
937
782
  nidx +
938
783
  blockIdx.y * ne0 +
939
784
  (blockIdx.z - ne02) * ne0 * gridDim.y;
940
- dst[offset_dst] = y[offset_src];
785
+ dst[offset_dst] = y[offset_src];
941
786
  }
942
787
  }
943
788
 
944
- static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
789
+ static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
790
+ // blockIdx.z: idx of ne02*ne03
791
+ // blockIdx.y: idx of ne01*scale_factor, aka ne1
792
+ // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
793
+ // ne00xne01: ne00 * ne01
945
794
  int ne0 = ne00 * scale_factor;
946
795
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
947
796
  if (nidx >= ne0) {
@@ -953,7 +802,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
953
802
  int offset_src =
954
803
  i00 +
955
804
  i01 * ne00 +
956
- blockIdx.z * nb02;
805
+ blockIdx.z * ne00xne01;
957
806
  int offset_dst =
958
807
  nidx +
959
808
  blockIdx.y * ne0 +
@@ -961,7 +810,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
961
810
  dst[offset_dst] = x[offset_src];
962
811
  }
963
812
 
964
- static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
813
+ static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
814
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
815
+ // blockIdx.y: idx of ne1
816
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
965
817
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
966
818
  if (nidx >= ne0) {
967
819
  return;
@@ -972,19 +824,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
972
824
  nidx +
973
825
  blockIdx.y * ne0 +
974
826
  blockIdx.z * ne0 * gridDim.y;
975
- if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
827
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
976
828
  int offset_src =
977
829
  nidx +
978
830
  blockIdx.y * ne00 +
979
831
  blockIdx.z * ne00 * ne01;
980
- dst[offset_dst] = x[offset_src];
832
+ dst[offset_dst] = x[offset_src];
981
833
  } else {
982
834
  dst[offset_dst] = 0.0f;
983
835
  }
984
836
  }
985
837
 
838
+ static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
839
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
840
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
841
+ if (nidx >= ne0) {
842
+ return;
843
+ }
844
+ dst[nidx] = start + step * nidx;
845
+ }
846
+
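arange_f32 is a plain one-thread-per-element kernel. Assuming the host launcher elsewhere in this file rounds up to whole blocks like the other *_f32 kernels (the launcher is not part of this hunk), the launch geometry is:

// const int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
// arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
// each thread writes dst[i] = start + step*i; the bounds check above discards the
// spare threads of the final, partially filled block.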
847
+ static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
848
+ // blockIDx.y: idx of timesteps->ne[0]
849
+ // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
850
+ int i = blockIdx.y;
851
+ int j = threadIdx.x + blockIdx.x * blockDim.x;
852
+ float * embed_data = (float *)((char *)dst + i*nb1);
853
+
854
+ if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
855
+ embed_data[dim] = 0.f;
856
+ }
857
+
858
+ int half = dim / 2;
859
+ if (j >= half) {
860
+ return;
861
+ }
862
+
863
+ float timestep = timesteps[i];
864
+ float freq = (float)expf(-logf(max_period) * j / half);
865
+ float arg = timestep * freq;
866
+ embed_data[j] = cosf(arg);
867
+ embed_data[j + half] = sinf(arg);
868
+ }
869
+
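timestep_embedding_f32 computes the usual sinusoidal embedding: freq_j = expf(-logf(max_period) * j / half), with cosines in the first half of each output row and sines in the second. A small worked example:

// dim = 4, max_period = 10000, timestep t = 50:
//   half   = 2
//   freq_0 = expf(-logf(10000) * 0 / 2) = 1
//   freq_1 = expf(-logf(10000) * 1 / 2) = 1/sqrt(10000) = 0.01
//   row    = [cosf(50*1), cosf(50*0.01), sinf(50*1), sinf(50*0.01)]
// An odd dim gets one trailing zero, written by the j == (dim + 1) / 2 branch.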
986
870
  template <int block_size>
987
871
  static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
872
+ // blockIdx.x: num_groups idx
873
+ // threadIdx.x: block_size idx
988
874
  int start = blockIdx.x * group_size;
989
875
  int end = start + group_size;
990
876
 
@@ -1467,420 +1353,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
1467
1353
  #endif
1468
1354
  }
1469
1355
 
1470
- static const __device__ uint64_t iq2xxs_grid[256] = {
1471
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1472
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
1473
- 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
1474
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
1475
- 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
1476
- 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
1477
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
1478
- 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
1479
- 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
1480
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
1481
- 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
1482
- 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
1483
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
1484
- 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
1485
- 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
1486
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
1487
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
1488
- 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
1489
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
1490
- 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
1491
- 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
1492
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
1493
- 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
1494
- 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
1495
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
1496
- 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
1497
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
1498
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
1499
- 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
1500
- 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
1501
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
1502
- 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
1503
- 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
1504
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
1505
- 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
1506
- 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
1507
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
1508
- 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
1509
- 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
1510
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
1511
- 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
1512
- 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
1513
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
1514
- 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
1515
- 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
1516
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
1517
- 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
1518
- 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
1519
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
1520
- 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
1521
- 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
1522
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
1523
- 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
1524
- 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
1525
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
1526
- 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
1527
- 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
1528
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
1529
- 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
1530
- 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
1531
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
1532
- 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
1533
- 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
1534
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
1535
- };
1536
-
1537
- static const __device__ uint64_t iq2xs_grid[512] = {
1538
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1539
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1540
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1541
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1542
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1543
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
1544
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
1545
- 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
1546
- 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
1547
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
1548
- 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
1549
- 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
1550
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
1551
- 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
1552
- 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
1553
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
1554
- 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
1555
- 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
1556
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
1557
- 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
1558
- 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
1559
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
1560
- 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
1561
- 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
1562
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
1563
- 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
1564
- 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
1565
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
1566
- 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
1567
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
1568
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
1569
- 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
1570
- 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
1571
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
1572
- 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
1573
- 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
1574
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
1575
- 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
1576
- 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
1577
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
1578
- 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
1579
- 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
1580
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
1581
- 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
1582
- 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
1583
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
1584
- 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
1585
- 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
1586
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
1587
- 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
1588
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
1589
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
1590
- 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
1591
- 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
1592
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
1593
- 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
1594
- 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
1595
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
1596
- 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
1597
- 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
1598
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
1599
- 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
1600
- 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
1601
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
1602
- 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
1603
- 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
1604
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
1605
- 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
1606
- 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
1607
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
1608
- 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
1609
- 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
1610
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
1611
- 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
1612
- 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
1613
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
1614
- 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
1615
- 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
1616
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
1617
- 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
1618
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
1619
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
1620
- 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
1621
- 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
1622
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
1623
- 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
1624
- 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
1625
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
1626
- 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
1627
- 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
1628
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
1629
- 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
1630
- 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
1631
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
1632
- 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
1633
- 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
1634
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
1635
- 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
1636
- 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
1637
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
1638
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
1639
- 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
1640
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
1641
- 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
1642
- 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
1643
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
1644
- 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
1645
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
1646
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
1647
- 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
1648
- 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
1649
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
1650
- 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
1651
- 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
1652
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
1653
- 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
1654
- 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
1655
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
1656
- 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
1657
- 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
1658
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
1659
- 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
1660
- 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
1661
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
1662
- 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
1663
- 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
1664
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
1665
- 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1666
- };
1667
-
1668
- static const __device__ uint32_t iq3xxs_grid[256] = {
1669
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
1670
- 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
1671
- 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
1672
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
1673
- 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
1674
- 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
1675
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
1676
- 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
1677
- 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
1678
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
1679
- 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
1680
- 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
1681
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
1682
- 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
1683
- 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
1684
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
1685
- 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
1686
- 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
1687
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
1688
- 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
1689
- 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
1690
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
1691
- 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
1692
- 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
1693
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
1694
- 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
1695
- 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
1696
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
1697
- 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
1698
- 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
1699
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
1700
- 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
1701
- };
1702
-
1703
- static const __device__ uint64_t iq1s_grid[512] = {
1704
- 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
1705
- 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
1706
- 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
1707
- 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
1708
- 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
1709
- 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
1710
- 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
1711
- 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
1712
- 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
1713
- 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
1714
- 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
1715
- 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
1716
- 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
1717
- 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
1718
- 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
1719
- 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
1720
- 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
1721
- 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
1722
- 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
1723
- 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
1724
- 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
1725
- 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
1726
- 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
1727
- 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
1728
- 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
1729
- 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
1730
- 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
1731
- 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
1732
- 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
1733
- 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
1734
- 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
1735
- 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
1736
- 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
1737
- 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
1738
- 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
1739
- 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
1740
- 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
1741
- 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
1742
- 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
1743
- 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
1744
- 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
1745
- 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
1746
- 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
1747
- 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
1748
- 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
1749
- 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
1750
- 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
1751
- 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
1752
- 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
1753
- 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
1754
- 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
1755
- 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
1756
- 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
1757
- 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
1758
- 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
1759
- 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
1760
- 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
1761
- 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
1762
- 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
1763
- 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
1764
- 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
1765
- 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
1766
- 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
1767
- 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
1768
- 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
1769
- 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
1770
- 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
1771
- 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
1772
- 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
1773
- 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
1774
- 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
1775
- 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
1776
- 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
1777
- 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
1778
- 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
1779
- 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
1780
- 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
1781
- 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
1782
- 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
1783
- 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
1784
- 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
1785
- 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
1786
- 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
1787
- 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
1788
- 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
1789
- 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
1790
- 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
1791
- 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
1792
- 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
1793
- 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
1794
- 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
1795
- 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
1796
- 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
1797
- 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
1798
- 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
1799
- 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
1800
- 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
1801
- 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
1802
- 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
1803
- 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
1804
- 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
1805
- 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
1806
- 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
1807
- 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
1808
- 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
1809
- 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
1810
- 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
1811
- 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
1812
- 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
1813
- 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
1814
- 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
1815
- 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
1816
- 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
1817
- 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
1818
- 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
1819
- 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
1820
- 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
1821
- 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
1822
- 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
1823
- 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
1824
- 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
1825
- 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
1826
- 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
1827
- 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
1828
- 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
1829
- 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
1830
- 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
1831
- 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
1832
- };
1833
-
1834
- static const __device__ uint8_t ksigns_iq2xs[128] = {
1835
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
1836
- 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
1837
- 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
1838
- 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
1839
- 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
1840
- 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
1841
- 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
1842
- 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
1843
- };
1844
-
1845
- //#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1846
- static const __device__ uint64_t ksigns64[128] = {
1847
- 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
1848
- 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
1849
- 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
1850
- 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
1851
- 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
1852
- 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
1853
- 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
1854
- 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
1855
- 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
1856
- 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
1857
- 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
1858
- 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
1859
- 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
1860
- 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
1861
- 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
1862
- 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
1863
- 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
1864
- 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
1865
- 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
1866
- 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
1867
- 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
1868
- 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
1869
- 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
1870
- 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
1871
- 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
1872
- 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
1873
- 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
1874
- 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
1875
- 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
1876
- 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
1877
- 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
1878
- 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
1879
- };
1880
- //#endif
1881
-
1882
- static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
1883
-
1884
1356
  inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
1885
1357
  switch (type) {
1886
1358
  case GGML_TYPE_Q4_0:
@@ -1945,6 +1417,27 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
1945
1417
 
1946
1418
  }
1947
1419
 
1420
+ template<typename dst_t>
1421
+ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1422
+
1423
+ const int i = blockIdx.x;
1424
+ const block_iq2_s * x = (const block_iq2_s *) vx;
1425
+
1426
+ const int tid = threadIdx.x;
1427
+ #if QK_K == 256
1428
+ const int il = tid/8; // 0...3
1429
+ const int ib = tid%8; // 0...7
1430
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
1431
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
1432
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
1433
+ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
1434
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
1435
+ #else
1436
+ assert(false);
1437
+ #endif
1438
+
1439
+ }
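In the new dequantize_block_iq2_s above, each group of 8 outputs is a sign-flipped row of iq2s_grid scaled by the block scale and a 4-bit sub-block scale; reading the arithmetic off the kernel:

// grid index = qs[4*ib+il] | ((qh[ib] << (8-2*il)) & 0x300)   -> 10-bit index into iq2s_grid
// scale      = x[i].d * (0.5f + s) * 0.25f = x[i].d * (2*s + 1) / 8, s = 4-bit nibble of scales[ib]
//              e.g. s = 3 gives 0.875f * x[i].d
// signs      = one kmask_iq2xs bit per output, flipping grid[j] to -grid[j]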
1440
+
1948
1441
  template<typename dst_t>
1949
1442
  static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1950
1443
 
@@ -1973,6 +1466,32 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
1973
1466
 
1974
1467
  }
1975
1468
 
1469
+ template<typename dst_t>
1470
+ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1471
+
1472
+ const int i = blockIdx.x;
1473
+ const block_iq3_s * x = (const block_iq3_s *) vx;
1474
+
1475
+ const int tid = threadIdx.x;
1476
+ #if QK_K == 256
1477
+ const int il = tid/8; // 0...3
1478
+ const int ib = tid%8; // 0...7
1479
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
1480
+ const uint8_t * qs = x[i].qs + 8*ib;
1481
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
1482
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
1483
+ const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
1484
+ const uint8_t signs = x[i].signs[4*ib + il];
1485
+ for (int j = 0; j < 4; ++j) {
1486
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
1487
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
1488
+ }
1489
+ #else
1490
+ assert(false);
1491
+ #endif
1492
+
1493
+ }
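dequantize_block_iq3_s follows the same shape: two 9-bit grid lookups per group of 8 outputs (one qh bit extends each 8-bit qs index) and an odd-integer scale:

// d = x[i].d * (1 + 2*s), s = 4-bit sub-block scale   -> multipliers 1, 3, 5, ..., 31
// grid1/grid2 indices: qs[2*il+0|1] | (one bit of qh[ib] shifted into bit 8)
// signs come from x[i].signs, one kmask_iq2xs bit per output value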
1494
+
1976
1495
  template<typename dst_t>
1977
1496
  static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1978
1497
 
@@ -1984,11 +1503,15 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
1984
1503
  const int il = tid/8; // 0...3
1985
1504
  const int ib = tid%8; // 0...7
1986
1505
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
1987
- const int i8 = 4*ib+il;
1988
- uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
1989
- const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
1990
- const float d = (float)x[i].d * (2*(h & 7) + 1);
1991
- for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
1506
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
1507
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
1508
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
1509
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
1510
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
1511
+ grid32[0] &= 0x0f0f0f0f;
1512
+ for (int j = 0; j < 8; ++j) {
1513
+ y[j] = d * (q[j] + delta);
1514
+ }
1992
1515
  #else
1993
1516
  assert(false);
1994
1517
  #endif
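The rewritten IQ1_S path above packs everything for a group of 8 values into qs plus one 16-bit qh word; reading the layout off the kernel (a summary of the code above, not of any external spec):

// qh[ib] bit 15       -> sign of the shift: delta = -1 - IQ1S_DELTA or -1 + IQ1S_DELTA
// qh[ib] bits 12..14  -> 3-bit scale, used as d = x[i].d * (2*s + 1)
// qh[ib] bits 3*il..  -> 3 bits appended above qs[4*ib+il] to form an 11-bit index
//                        into iq1s_grid_gpu; each 32-bit entry packs eight 4-bit values
// output: y[j] = d * (q[j] + delta)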
@@ -2016,6 +1539,25 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
2016
1539
 
2017
1540
  }
2018
1541
 
1542
+ #if QK_K != 64
1543
+ template<typename dst_t>
1544
+ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1545
+ const int i = blockIdx.x;
1546
+ const block_iq4_xs * x = (const block_iq4_xs *)vx;
1547
+
1548
+ const int tid = threadIdx.x;
1549
+ const int il = tid/8; // 0...3
1550
+ const int ib = tid%8; // 0...7
1551
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
1552
+ const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
1553
+ const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
1554
+ for (int j = 0; j < 4; ++j) {
1555
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
1556
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
1557
+ }
1558
+ }
1559
+ #endif
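dequantize_block_iq4_xs rebuilds a 6-bit signed sub-block scale from two packed fields before applying the iq4_nl value table:

// low 4 bits  from scales_l[ib/2] (one nibble per sub-block)
// high 2 bits from scales_h >> 2*ib
// scale = (low | (high << 4)) - 32     e.g. low = 10, high = 1  ->  26 - 32 = -6
// y = scale * x[i].d * kvalues_iq4nl[nibble of qs]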
1560
+
2019
1561
  static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
2020
1562
 
2021
1563
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -2112,10 +1654,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
2112
1654
  #endif
2113
1655
 
2114
1656
  // sum up partial sums and write back result
2115
- #pragma unroll
2116
- for (int mask = 16; mask > 0; mask >>= 1) {
2117
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
2118
- }
1657
+ tmp = warp_reduce_sum(tmp);
2119
1658
 
2120
1659
  if (threadIdx.x == 0) {
2121
1660
  dst[row] = tmp;
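This and the following mat-vec hunks swap the hand-rolled XOR-butterfly shuffle loops for warp_reduce_sum (and, in quantize_q8_1 further below, warp_reduce_max). The helpers are defined earlier in the file and are not visible in this diff; functionally they are the same warp-wide reduction the removed loops performed, presumably along these lines:

    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);  // pairwise exchange across 32 lanes
        }
        return x;
    }

    static __device__ __forceinline__ float warp_reduce_max(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
        }
        return x;
    }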
@@ -2216,10 +1755,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
2216
1755
  #endif
2217
1756
 
2218
1757
  // sum up partial sums and write back result
2219
- #pragma unroll
2220
- for (int mask = 16; mask > 0; mask >>= 1) {
2221
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
2222
- }
1758
+ tmp = warp_reduce_sum(tmp);
2223
1759
 
2224
1760
  if (threadIdx.x == 0) {
2225
1761
  dst[row] = tmp;
@@ -2352,10 +1888,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
2352
1888
  #endif
2353
1889
 
2354
1890
  // sum up partial sums and write back result
2355
- #pragma unroll
2356
- for (int mask = 16; mask > 0; mask >>= 1) {
2357
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
2358
- }
1891
+ tmp = warp_reduce_sum(tmp);
2359
1892
 
2360
1893
  if (tid == 0) {
2361
1894
  dst[row] = tmp;
@@ -2468,10 +2001,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
2468
2001
  #endif
2469
2002
 
2470
2003
  // sum up partial sums and write back result
2471
- #pragma unroll
2472
- for (int mask = 16; mask > 0; mask >>= 1) {
2473
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
2474
- }
2004
+ tmp = warp_reduce_sum(tmp);
2475
2005
 
2476
2006
  if (threadIdx.x == 0) {
2477
2007
  dst[row] = tmp;
@@ -2578,10 +2108,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
2578
2108
  #endif
2579
2109
 
2580
2110
  // sum up partial sums and write back result
2581
- #pragma unroll
2582
- for (int mask = 16; mask > 0; mask >>= 1) {
2583
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
2584
- }
2111
+ tmp = warp_reduce_sum(tmp);
2585
2112
 
2586
2113
  if (tid == 0) {
2587
2114
  dst[row] = tmp;
@@ -2616,11 +2143,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
2616
2143
  float amax = fabsf(xi);
2617
2144
  float sum = xi;
2618
2145
 
2619
- #pragma unroll
2620
- for (int mask = 16; mask > 0; mask >>= 1) {
2621
- amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
2622
- sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
2623
- }
2146
+ amax = warp_reduce_max(amax);
2147
+ sum = warp_reduce_sum(sum);
2624
2148
 
2625
2149
  const float d = amax / 127;
2626
2150
  const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
@@ -3827,7 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
3827
3351
  #pragma unroll
3828
3352
  for (int i = 0; i < QR2_K; ++ i) {
3829
3353
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
3830
- d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
3354
+ d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
3831
3355
  }
3832
3356
 
3833
3357
  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
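The __low2half -> __low2float changes in this and the following vec_dot hunks do not change the value that ends up in d8: ds is a half2 packing {d, s}, and the old code pulled out d as a __half that was then implicitly widened to float on assignment. The new code makes that conversion explicit in one step, schematically:

    static __device__ __forceinline__ float q8_1_block_scale(const block_q8_1 & b) {
        // ds packs {d, s} as a half2; take the low element (d) directly as float
        return __low2float(b.ds);   // previously: (float) __low2half(b.ds)
    }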
@@ -3949,7 +3473,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
3949
3473
  #pragma unroll
3950
3474
  for (int i = 0; i < QR3_K; ++i) {
3951
3475
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
3952
- d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
3476
+ d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
3953
3477
  }
3954
3478
 
3955
3479
  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -4118,7 +3642,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
4118
3642
 
4119
3643
  for (int i = 0; i < QR4_K; ++i) {
4120
3644
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
4121
- d8[i] = __low2half(bq8i->ds);
3645
+ d8[i] = __low2float(bq8i->ds);
4122
3646
 
4123
3647
  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
4124
3648
  u[2*i+0] = q8[0];
@@ -4483,7 +4007,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
4483
4007
  #pragma unroll
4484
4008
  for (int i = 0; i < QR6_K; ++i) {
4485
4009
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
4486
- d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
4010
+ d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
4487
4011
  }
4488
4012
 
4489
4013
  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -4682,6 +4206,54 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
4682
4206
  #endif
4683
4207
  }
4684
4208
 
4209
+ // TODO
4210
+ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
4211
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4212
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
4213
+ #if QK_K == 256
4214
+ const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
4215
+
4216
+ const int ib32 = iqs;
4217
+ const int8_t * q8 = bq8_1[ib32].qs;
4218
+ const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
4219
+ const uint8_t ls1 = bq2->scales[ib32] & 0xf;
4220
+ const uint8_t ls2 = bq2->scales[ib32] >> 4;
4221
+ int sumi1 = 0;
4222
+ for (int l = 0; l < 2; ++l) {
4223
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
4224
+ const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
4225
+ const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
4226
+ const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
4227
+ const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
4228
+ sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
4229
+ sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
4230
+ q8 += 8;
4231
+ }
4232
+ int sumi2 = 0;
4233
+ for (int l = 2; l < 4; ++l) {
4234
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
4235
+ const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
4236
+ const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
4237
+ const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
4238
+ const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
4239
+ sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
4240
+ sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
4241
+ q8 += 8;
4242
+ }
4243
+ const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
4244
+ return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
4245
+ #else
4246
+ (void) ksigns64;
4247
+ assert(false);
4248
+ return 0.f;
4249
+ #endif
4250
+ #else
4251
+ (void) ksigns64;
4252
+ assert(false);
4253
+ return 0.f;
4254
+ #endif
4255
+ }
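vec_dot_iq2_s_q8_1 above applies the per-value signs without branching: the 4-bit sign nibble is broadcast into all four byte lanes, masked so that bit i survives only in byte i, and compared lane-wise with __vcmpeq4 to get a 0xff/0x00 byte mask; (x ^ m) - m then negates exactly the flagged int8 lanes, since 0xff acts as -1 per byte (x ^ 0xff = ~x, and ~x - (-1) = -x). vec_dot_iq3_s_q8_1 below uses the same trick. Condensed into a helper, the pattern is:

    static __device__ __forceinline__ int apply_signs4(int grid, uint8_t sign_nibble) {
        const unsigned int mask = __vcmpeq4((sign_nibble * 0x01010101u) & 0x08040201u, 0x08040201u);
        // e.g. sign_nibble = 0b0101 -> mask = 0x00ff00ff -> int8 lanes 0 and 2 get negated
        return __vsub4(grid ^ mask, mask);
    }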
4256
+
4685
4257
  static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
4686
4258
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4687
4259
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -4717,43 +4289,70 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
4717
4289
  #endif
4718
4290
  }
4719
4291
 
4292
+ // TODO: don't use lookup table for signs
4293
+ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
4294
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4295
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
4296
+ #if QK_K == 256
4297
+ const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
4298
+
4299
+ const int ib32 = iqs;
4300
+ const uint8_t * qs = bq2->qs + 8*ib32;
4301
+ const int8_t * q8 = bq8_1[ib32].qs;
4302
+ int sumi = 0;
4303
+ for (int l = 0; l < 4; ++l) {
4304
+ const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
4305
+ const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
4306
+ uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
4307
+ uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
4308
+ const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
4309
+ const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
4310
+ sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
4311
+ sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
4312
+ q8 += 8;
4313
+ }
4314
+ const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
4315
+ return d * sumi;
4316
+ #else
4317
+ assert(false);
4318
+ return 0.f;
4319
+ #endif
4320
+ #else
4321
+ assert(false);
4322
+ return 0.f;
4323
+ #endif
4324
+ }
4325
+
4720
4326
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
4721
4327
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4722
4328
  #if QK_K == 256
4723
4329
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
4724
4330
 
4725
4331
  const int ib32 = iqs;
4726
- int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
4727
- const uint8_t h1 = bq1->scales[2*ib32+0];
4728
- const uint8_t h2 = bq1->scales[2*ib32+1];
4332
+ int sumi = 0;
4729
4333
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
4730
4334
  const int * q8 = (const int *)bq8_1[ib32].qs;
4731
- const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
4732
- const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
4733
- const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
4734
- const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
4735
- for (int j = 0; j < 2; ++j) {
4736
- sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
4737
- sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
4738
- sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
4739
- sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
4335
+ for (int l = 0; l < 4; ++l) {
4336
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
4337
+ int grid0 = grid[0] & 0x0f0f0f0f;
4338
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
4339
+ sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
4740
4340
  }
4741
4341
  #else
4742
- const int8_t * q8 = bq8_1[ib32].qs;
4743
- const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
4744
- const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
4745
- const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
4746
- const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
4747
- for (int j = 0; j < 8; ++j) {
4748
- sumi1 += q8[j+ 0] * grid1[j];
4749
- sumi2 += q8[j+ 8] * grid2[j];
4750
- sumi3 += q8[j+16] * grid3[j];
4751
- sumi4 += q8[j+24] * grid4[j];
4342
+ const int8_t * q8 = bq8_1[ib32].qs;
4343
+ for (int l = 0; l < 4; ++l) {
4344
+ const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
4345
+ for (int j = 0; j < 4; ++j) {
4346
+ sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
4347
+ }
4348
+ q8 += 8;
4752
4349
  }
4753
4350
  #endif
4754
- const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
4755
- return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
4756
- sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
4351
+ const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
4352
+ const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
4353
+ const float d = d1q * __low2float (bq8_1[ib32].ds);
4354
+ const float m = d1q * __high2float(bq8_1[ib32].ds);
4355
+ return d * sumi + m * delta;
4757
4356
  #else
4758
4357
  assert(false);
4759
4358
  return 0.f;
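Why the rewritten IQ1_S dot product above ends in d * sumi + m * delta: each weight in the group is d1q * (g[j] + delta), where g[j] is the unpacked grid nibble, and each activation is approximately d8 * q8[j], so

    sum_j (d8 * q8[j]) * d1q * (g[j] + delta)
        = d1q * d8 * sum_j q8[j] * g[j]   +   d1q * delta * (d8 * sum_j q8[j])
        = d * sumi                        +   m * delta

with d = d1q * __low2float(ds) and m = d1q * __high2float(ds). This works because a q8_1 block stores ds = {d8, s8}, where s8 is the block sum of the source floats (roughly d8 times the sum of its int8 quants), so the delta term needs no extra reduction over q8 at dot-product time.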
@@ -4810,6 +4409,75 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
4810
4409
  return d * (sumi1 + sumi2);
4811
4410
  }
4812
4411
 
4412
+ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
4413
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4414
+
4415
+ #if QK_K == 256
4416
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
4417
+
4418
+ const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
4419
+ const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
4420
+
4421
+ //// iqs is 0...7
4422
+ //const int ib64 = iqs/2;
4423
+ //const int il = iqs%2;
4424
+ //const int32_t * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
4425
+ //const int32_t * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
4426
+ //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
4427
+ //const uint32_t * q4_2 = q4_1 + 4;
4428
+ //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
4429
+ //const int8_t ls2 = (bq4->scales_l[ib64] >> 4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
4430
+ //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
4431
+ //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
4432
+ //int v1, v2;
4433
+ //int sumi1 = 0, sumi2 = 0;
4434
+ //for (int j = 0; j < 2; ++j) {
4435
+ // get_int_from_table_16(q4_1[j], values, v1, v2);
4436
+ // sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
4437
+ // get_int_from_table_16(q4_2[j], values, v1, v2);
4438
+ // sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
4439
+ //}
4440
+ //return d1 * sumi1 + d2 * sumi2;
4441
+
4442
+ // iqs is 0...7
4443
+ const int ib32 = iqs;
4444
+ const int32_t * q8 = (const int *)bq8_1[ib32].qs;
4445
+ const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
4446
+ const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
4447
+ const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
4448
+ int v1, v2;
4449
+ int sumi1 = 0, sumi2 = 0;
4450
+ for (int j = 0; j < 4; ++j) {
4451
+ get_int_from_table_16(q4[j], values, v1, v2);
4452
+ sumi1 = __dp4a(v1, q8[j+0], sumi1);
4453
+ sumi2 = __dp4a(v2, q8[j+4], sumi2);
4454
+ }
4455
+ return d * (sumi1 + sumi2);
4456
+
4457
+ //// iqs is 0...15
4458
+ //const int ib32 = iqs/2;
4459
+ //const int il = iqs%2;
4460
+ //const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il;
4461
+ //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
4462
+ //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
4463
+ //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
4464
+ //int v1, v2;
4465
+ //int sumi1 = 0, sumi2 = 0;
4466
+ //for (int j = 0; j < 2; ++j) {
4467
+ // get_int_from_table_16(q4[j], values, v1, v2);
4468
+ // sumi1 = __dp4a(v1, q8[j+0], sumi1);
4469
+ // sumi2 = __dp4a(v2, q8[j+4], sumi2);
4470
+ //}
4471
+ //return d * (sumi1 + sumi2);
4472
+ #else
4473
+ assert(false);
4474
+ return 0.f;
4475
+ #endif
4476
+ #else
4477
+ return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
4478
+ #endif
4479
+ }
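The active branch above leans on get_int_from_table_16, which is defined elsewhere in the file and not shown in this diff. Judging from how it is used here and in vec_dot_iq4_nl_q8_1, it expands a 32-bit word of eight packed 4-bit indices into two 32-bit words of table values (low nibbles into the first, high nibbles into the second) so both can be fed straight to __dp4a. A hedged re-implementation of that presumed behaviour:

    static __device__ __forceinline__ void table16_expand(uint32_t q4, const uint8_t * values,
                                                          int & v1, int & v2) {
        uint32_t a = 0, b = 0;
        for (int k = 0; k < 4; ++k) {
            const uint32_t byte = (q4 >> 8*k) & 0xff;
            a |= (uint32_t) values[byte & 0xf] << (8*k);   // low nibble of byte k  -> byte k of v1
            b |= (uint32_t) values[byte >>  4] << (8*k);   // high nibble of byte k -> byte k of v2
        }
        v1 = (int) a;
        v2 = (int) b;
    }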
4480
+
4813
4481
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
4814
4482
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
4815
4483
  static __device__ __forceinline__ void mul_mat_q(
@@ -4876,7 +4544,7 @@ static __device__ __forceinline__ void mul_mat_q(
4876
4544
  *dsi_dst = *dsi_src;
4877
4545
  } else {
4878
4546
  float * dfi_dst = (float *) dsi_dst;
4879
- *dfi_dst = __low2half(*dsi_src);
4547
+ *dfi_dst = __low2float(*dsi_src);
4880
4548
  }
4881
4549
  }
4882
4550
 
@@ -5730,10 +5398,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
5730
5398
  }
5731
5399
 
5732
5400
  // sum up partial sums and write back result
5733
- #pragma unroll
5734
- for (int mask = 16; mask > 0; mask >>= 1) {
5735
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
5736
- }
5401
+ tmp = warp_reduce_sum(tmp);
5737
5402
 
5738
5403
  if (tid == 0) {
5739
5404
  #ifdef GGML_CUDA_F16
@@ -5783,10 +5448,7 @@ static __global__ void mul_mat_p021_f16_f32(
5783
5448
  const int idst = channel*nrows_dst + row_dst;
5784
5449
 
5785
5450
  // sum up partial sums and write back result
5786
- #pragma unroll
5787
- for (int mask = 16; mask > 0; mask >>= 1) {
5788
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
5789
- }
5451
+ tmp = warp_reduce_sum(tmp);
5790
5452
 
5791
5453
  if (threadIdx.x == 0) {
5792
5454
  dst[idst] = tmp;
@@ -5829,10 +5491,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
5829
5491
  }
5830
5492
 
5831
5493
  // sum up partial sums and write back result
5832
- #pragma unroll
5833
- for (int mask = 16; mask > 0; mask >>= 1) {
5834
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
5835
- }
5494
+ tmp = warp_reduce_sum(tmp);
5836
5495
 
5837
5496
  if (threadIdx.x == 0) {
5838
5497
  dst[idst] = tmp;
@@ -5872,7 +5531,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
5872
5531
  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
5873
5532
  const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
5874
5533
  const int nb12, const int nb13) {
5875
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
5534
+ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
5876
5535
 
5877
5536
  if (i >= ne) {
5878
5537
  return;
@@ -5880,17 +5539,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
5880
5539
 
5881
5540
  // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
5882
5541
  // then combine those indices with the corresponding byte offsets to get the total offsets
5883
- const int i03 = i/(ne00 * ne01 * ne02);
5884
- const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
5885
- const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
5886
- const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
5887
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
5888
-
5889
- const int i13 = i/(ne10 * ne11 * ne12);
5890
- const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
5891
- const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
5892
- const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
5893
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
5542
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
5543
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
5544
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
5545
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
5546
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
5547
+
5548
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
5549
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
5550
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
5551
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
5552
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
5894
5553
 
5895
5554
  cpy_1(cx + x_offset, cdst + dst_offset);
5896
5555
  }
@@ -6216,11 +5875,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
6216
5875
  int ixj = col ^ j;
6217
5876
  if (ixj > col) {
6218
5877
  if ((col & k) == 0) {
6219
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
5878
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
6220
5879
  swap(dst_row[col], dst_row[ixj]);
6221
5880
  }
6222
5881
  } else {
6223
- if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
5882
+ if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
6224
5883
  swap(dst_row[col], dst_row[ixj]);
6225
5884
  }
6226
5885
  }
@@ -6328,6 +5987,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
6328
5987
  // find the sum of exps in the block
6329
5988
  tmp = warp_reduce_sum(tmp);
6330
5989
  if (block_size > WARP_SIZE) {
5990
+ __syncthreads();
6331
5991
  if (warp_id == 0) {
6332
5992
  buf_iw[lane_id] = 0.0f;
6333
5993
  }
@@ -6379,23 +6039,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
6379
6039
 
6380
6040
  template <typename T>
6381
6041
  static __global__ void im2col_kernel(
6382
- const float * x, T * dst, int batch_offset,
6383
- int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
6042
+ const float * x, T * dst, int64_t batch_offset,
6043
+ int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
6384
6044
  int s0, int s1, int p0, int p1, int d0, int d1) {
6385
- const int i = threadIdx.x + blockIdx.x * blockDim.x;
6045
+ const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
6386
6046
  if (i >= pelements) {
6387
6047
  return;
6388
6048
  }
6389
6049
 
6390
- const int ksize = OW * (KH > 1 ? KW : 1);
6391
- const int kx = i / ksize;
6392
- const int kd = kx * ksize;
6393
- const int ky = (i - kd) / OW;
6394
- const int ix = i % OW;
6050
+ const int64_t ksize = OW * (KH > 1 ? KW : 1);
6051
+ const int64_t kx = i / ksize;
6052
+ const int64_t kd = kx * ksize;
6053
+ const int64_t ky = (i - kd) / OW;
6054
+ const int64_t ix = i % OW;
6395
6055
 
6396
- const int oh = blockIdx.y;
6397
- const int batch = blockIdx.z / IC;
6398
- const int ic = blockIdx.z % IC;
6056
+ const int64_t oh = blockIdx.y;
6057
+ const int64_t batch = blockIdx.z / IC;
6058
+ const int64_t ic = blockIdx.z % IC;
6399
6059
 
6400
6060
  const int64_t iiw = ix * s0 + kx * d0 - p0;
6401
6061
  const int64_t iih = oh * s1 + ky * d1 - p1;
@@ -6721,19 +6381,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
6721
6381
  concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
6722
6382
  }
6723
6383
 
6724
- static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int scale_factor, cudaStream_t stream) {
6384
+ static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
6385
+ const int scale_factor, cudaStream_t stream) {
6725
6386
  int ne0 = (ne00 * scale_factor);
6726
6387
  int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
6727
- dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
6388
+ dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
6728
6389
  upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
6729
6390
  }
6730
6391
 
6731
6392
  static void pad_f32_cuda(const float * x, float * dst,
6732
- const int ne00, const int ne01, const int ne02,
6733
- const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
6393
+ const int ne00, const int ne01, const int ne02, const int ne03,
6394
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
6734
6395
  int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
6735
- dim3 gridDim(num_blocks, ne1, ne2);
6736
- pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
6396
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
6397
+ pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
6398
+ }
6399
+
6400
+ static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
6401
+ int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
6402
+ arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
6403
+ }
6404
+
6405
+ static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
6406
+ const int dim, const int max_period, cudaStream_t stream) {
6407
+ int half_ceil = (dim + 1) / 2;
6408
+ int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
6409
+ dim3 gridDim(num_blocks, ne00, 1);
6410
+ timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
6737
6411
  }
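timestep_embedding_f32_cuda launches one grid row per input timestep (gridDim.y = ne00) and (dim + 1) / 2 threads' worth of frequency slots per row; the kernel body itself is not part of this diff. Presumably it computes the conventional sinusoidal timestep embedding (cosine half first, then sine half, with a trailing zero when dim is odd), along the lines of:

    half     = dim / 2
    freq_j   = exp(-ln(max_period) * j / half)        for j = 0 .. half-1
    emb[i][j]        = cos(t_i * freq_j)
    emb[i][j + half] = sin(t_i * freq_j)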
6738
6412
 
6739
6413
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
@@ -6843,12 +6517,24 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
6843
6517
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
6844
6518
  }
6845
6519
 
6520
+ template<typename dst_t>
6521
+ static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6522
+ const int nb = k / QK_K;
6523
+ dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
6524
+ }
6525
+
6846
6526
  template<typename dst_t>
6847
6527
  static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6848
6528
  const int nb = k / QK_K;
6849
6529
  dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
6850
6530
  }
6851
6531
 
6532
+ template<typename dst_t>
6533
+ static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6534
+ const int nb = k / QK_K;
6535
+ dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
6536
+ }
6537
+
6852
6538
  template<typename dst_t>
6853
6539
  static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6854
6540
  const int nb = k / QK_K;
@@ -6861,6 +6547,16 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
6861
6547
  dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
6862
6548
  }
6863
6549
 
6550
+ template<typename dst_t>
6551
+ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6552
+ const int nb = (k + QK_K - 1) / QK_K;
6553
+ #if QK_K == 64
6554
+ dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
6555
+ #else
6556
+ dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
6557
+ #endif
6558
+ }
6559
+
6864
6560
  template <typename src_t, typename dst_t>
6865
6561
  static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
6866
6562
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6898,12 +6594,18 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
6898
6594
  return dequantize_row_iq2_xxs_cuda;
6899
6595
  case GGML_TYPE_IQ2_XS:
6900
6596
  return dequantize_row_iq2_xs_cuda;
6597
+ case GGML_TYPE_IQ2_S:
6598
+ return dequantize_row_iq2_s_cuda;
6901
6599
  case GGML_TYPE_IQ3_XXS:
6902
6600
  return dequantize_row_iq3_xxs_cuda;
6903
6601
  case GGML_TYPE_IQ1_S:
6904
6602
  return dequantize_row_iq1_s_cuda;
6905
6603
  case GGML_TYPE_IQ4_NL:
6906
6604
  return dequantize_row_iq4_nl_cuda;
6605
+ case GGML_TYPE_IQ4_XS:
6606
+ return dequantize_row_iq4_xs_cuda;
6607
+ case GGML_TYPE_IQ3_S:
6608
+ return dequantize_row_iq3_s_cuda;
6907
6609
  case GGML_TYPE_F32:
6908
6610
  return convert_unary_cuda<float>;
6909
6611
  default:
@@ -6937,12 +6639,18 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
6937
6639
  return dequantize_row_iq2_xxs_cuda;
6938
6640
  case GGML_TYPE_IQ2_XS:
6939
6641
  return dequantize_row_iq2_xs_cuda;
6642
+ case GGML_TYPE_IQ2_S:
6643
+ return dequantize_row_iq2_s_cuda;
6940
6644
  case GGML_TYPE_IQ3_XXS:
6941
6645
  return dequantize_row_iq3_xxs_cuda;
6942
6646
  case GGML_TYPE_IQ1_S:
6943
6647
  return dequantize_row_iq1_s_cuda;
6944
6648
  case GGML_TYPE_IQ4_NL:
6945
6649
  return dequantize_row_iq4_nl_cuda;
6650
+ case GGML_TYPE_IQ4_XS:
6651
+ return dequantize_row_iq4_xs_cuda;
6652
+ case GGML_TYPE_IQ3_S:
6653
+ return dequantize_row_iq3_s_cuda;
6946
6654
  case GGML_TYPE_F16:
6947
6655
  return convert_unary_cuda<half>;
6948
6656
  default:
@@ -7764,10 +7472,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
7764
7472
 
7765
7473
  const dim3 block_dims(ncols, 1, 1);
7766
7474
  const dim3 block_nums(1, nrows, 1);
7767
- if (order == GGML_SORT_ASC) {
7768
- k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7769
- } else if (order == GGML_SORT_DESC) {
7770
- k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7475
+ if (order == GGML_SORT_ORDER_ASC) {
7476
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7477
+ } else if (order == GGML_SORT_ORDER_DESC) {
7478
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7771
7479
  } else {
7772
7480
  GGML_ASSERT(false);
7773
7481
  }
@@ -7832,8 +7540,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
7832
7540
 
7833
7541
  template <typename T>
7834
7542
  static void im2col_cuda(const float* x, T* dst,
7835
- int IW, int IH, int OW, int OH, int KW, int KH, int IC,
7836
- int batch, int batch_offset, int offset_delta,
7543
+ int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
7544
+ int64_t batch, int64_t batch_offset, int64_t offset_delta,
7837
7545
  int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
7838
7546
  const int parallel_elements = OW * KW * KH;
7839
7547
  const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
@@ -7916,8 +7624,8 @@ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual
7916
7624
  *actual_size = look_ahead_size;
7917
7625
  g_cuda_pool_size[device] += look_ahead_size;
7918
7626
  #ifdef DEBUG_CUDA_MALLOC
7919
- fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
7920
- (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
7627
+ fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
7628
+ (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
7921
7629
  #endif
7922
7630
  return ptr;
7923
7631
  }
@@ -8003,7 +7711,7 @@ static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual
8003
7711
  g_cuda_pool_used[device] += size;
8004
7712
 
8005
7713
  #ifdef DEBUG_CUDA_MALLOC
8006
- printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
7714
+ printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
8007
7715
  #endif
8008
7716
 
8009
7717
  return ptr;
@@ -8013,7 +7721,7 @@ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
8013
7721
  scoped_spin_lock lock(g_cuda_pool_lock);
8014
7722
 
8015
7723
  #ifdef DEBUG_CUDA_MALLOC
8016
- printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
7724
+ printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
8017
7725
  #endif
8018
7726
 
8019
7727
  g_cuda_pool_used[device] -= size;
@@ -8199,11 +7907,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
8199
7907
 
8200
7908
  cudaMemcpyKind kind;
8201
7909
  char * src_ptr;
8202
- if (src->backend == GGML_BACKEND_CPU) {
7910
+ if (src->backend == GGML_BACKEND_TYPE_CPU) {
8203
7911
  kind = cudaMemcpyHostToDevice;
8204
7912
  src_ptr = (char *) src->data;
8205
- } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
8206
- GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
7913
+ } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
7914
+ GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
8207
7915
  kind = cudaMemcpyDeviceToDevice;
8208
7916
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
8209
7917
  int id;
@@ -8512,7 +8220,7 @@ static void ggml_cuda_op_group_norm(
8512
8220
 
8513
8221
  int num_groups = dst->op_params[0];
8514
8222
  int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
8515
- group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
8223
+ group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
8516
8224
 
8517
8225
  (void) src1;
8518
8226
  (void) dst;
@@ -8545,7 +8253,7 @@ static void ggml_cuda_op_upscale(
8545
8253
 
8546
8254
  const int scale_factor = dst->op_params[0];
8547
8255
 
8548
- upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
8256
+ upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
8549
8257
 
8550
8258
  (void) src1;
8551
8259
  (void) dst;
@@ -8561,8 +8269,49 @@ static void ggml_cuda_op_pad(
8561
8269
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
8562
8270
 
8563
8271
  pad_f32_cuda(src0_dd, dst_dd,
8564
- src0->ne[0], src0->ne[1], src0->ne[2],
8565
- dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
8272
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
8273
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
8274
+
8275
+ (void) src1;
8276
+ (void) dst;
8277
+ (void) src1_dd;
8278
+ }
8279
+
8280
+ static void ggml_cuda_op_arange(
8281
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
8282
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
8283
+
8284
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
8285
+
8286
+ float start;
8287
+ float stop;
8288
+ float step;
8289
+ memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
8290
+ memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
8291
+ memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
8292
+
8293
+ int64_t steps = (int64_t)ceil((stop - start) / step);
8294
+ GGML_ASSERT(ggml_nelements(dst) == steps);
8295
+
8296
+ arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
8297
+
8298
+ (void) src0;
8299
+ (void) src1;
8300
+ (void) src0_dd;
8301
+ (void) src1_dd;
8302
+ }
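ggml_cuda_op_arange reads start/stop/step from dst->op_params and only asserts that the destination was already sized consistently; the element count follows the usual arange convention. A quick worked example of that check:

    start = 0, stop = 10, step = 3
    steps = ceil((10 - 0) / 3) = ceil(3.33...) = 4    ->  dst = { 0, 3, 6, 9 }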
8303
+
8304
+ static void ggml_cuda_op_timestep_embedding(
8305
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
8306
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
8307
+
8308
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
8309
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
8310
+
8311
+ const int dim = dst->op_params[0];
8312
+ const int max_period = dst->op_params[1];
8313
+
8314
+ timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
8566
8315
 
8567
8316
  (void) src1;
8568
8317
  (void) dst;
@@ -8608,7 +8357,7 @@ static void ggml_cuda_op_mul_mat_q(
8608
8357
 
8609
8358
  // the main device has a larger memory buffer to hold the results from all GPUs
8610
8359
  // nrows_dst == nrows of the matrix that the kernel writes into
8611
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8360
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
8612
8361
 
8613
8362
  switch (src0->type) {
8614
8363
  case GGML_TYPE_Q4_0:
@@ -8685,9 +8434,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
8685
8434
  case GGML_TYPE_Q6_K:
8686
8435
  case GGML_TYPE_IQ2_XXS:
8687
8436
  case GGML_TYPE_IQ2_XS:
8437
+ case GGML_TYPE_IQ2_S:
8688
8438
  case GGML_TYPE_IQ3_XXS:
8689
8439
  case GGML_TYPE_IQ1_S:
8690
8440
  case GGML_TYPE_IQ4_NL:
8441
+ case GGML_TYPE_IQ4_XS:
8442
+ case GGML_TYPE_IQ3_S:
8691
8443
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
8692
8444
  default:
8693
8445
  GGML_ASSERT(false);
@@ -8710,9 +8462,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
8710
8462
  case GGML_TYPE_Q5_K:
8711
8463
  case GGML_TYPE_IQ2_XXS:
8712
8464
  case GGML_TYPE_IQ2_XS:
8465
+ case GGML_TYPE_IQ2_S:
8713
8466
  case GGML_TYPE_IQ3_XXS:
8714
8467
  case GGML_TYPE_IQ1_S:
8715
8468
  case GGML_TYPE_IQ4_NL:
8469
+ case GGML_TYPE_IQ4_XS:
8470
+ case GGML_TYPE_IQ3_S:
8716
8471
  return max_compute_capability >= CC_VOLTA ? 128 : 64;
8717
8472
  case GGML_TYPE_Q6_K:
8718
8473
  return 64;
@@ -8755,7 +8510,7 @@ static void ggml_cuda_op_mul_mat_vec_q(
8755
8510
 
8756
8511
  // the main device has a larger memory buffer to hold the results from all GPUs
8757
8512
  // nrows_dst == nrows of the matrix that the kernel writes into
8758
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8513
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
8759
8514
 
8760
8515
  switch (src0->type) {
8761
8516
  case GGML_TYPE_Q4_0:
@@ -8806,6 +8561,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
8806
8561
  mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
8807
8562
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8808
8563
  break;
8564
+ case GGML_TYPE_IQ2_S:
8565
+ mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
8566
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8567
+ break;
8809
8568
  case GGML_TYPE_IQ3_XXS:
8810
8569
  mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
8811
8570
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
@@ -8818,6 +8577,14 @@ static void ggml_cuda_op_mul_mat_vec_q(
8818
8577
  mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
8819
8578
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8820
8579
  break;
8580
+ case GGML_TYPE_IQ4_XS:
8581
+ mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
8582
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8583
+ break;
8584
+ case GGML_TYPE_IQ3_S:
8585
+ mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
8586
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8587
+ break;
8821
8588
  default:
8822
8589
  GGML_ASSERT(false);
8823
8590
  break;
@@ -8927,7 +8694,7 @@ static void ggml_cuda_op_mul_mat_cublas(
8927
8694
 
8928
8695
  // the main device has a larger memory buffer to hold the results from all GPUs
8929
8696
  // ldc == nrows of the matrix that cuBLAS writes into
8930
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8697
+ int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
8931
8698
 
8932
8699
  const int compute_capability = g_device_caps[id].cc;
8933
8700
 
@@ -9275,7 +9042,7 @@ static void ggml_cuda_op_soft_max(
9275
9042
  const bool use_src2 = src2 != nullptr;
9276
9043
 
9277
9044
  if (use_src2) {
9278
- const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
9045
+ const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
9279
9046
 
9280
9047
  if (src2_on_device) {
9281
9048
  ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
@@ -9333,16 +9100,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
9333
9100
  const bool use_src1 = src1 != nullptr;
9334
9101
  const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
9335
9102
 
9336
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
9337
- GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
9103
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9104
+ GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9338
9105
 
9339
9106
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
9340
9107
  ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
9341
9108
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9342
9109
 
9343
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
9344
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
9345
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
9110
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
9111
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
9112
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
9346
9113
 
9347
9114
  // dd = data device
9348
9115
  float * src0_ddf = nullptr;
@@ -9386,7 +9153,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
9386
9153
  CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
9387
9154
  }
9388
9155
 
9389
- if (dst->backend == GGML_BACKEND_CPU) {
9156
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9390
9157
  CUDA_CHECK(cudaDeviceSynchronize());
9391
9158
  }
9392
9159
  }
@@ -9467,8 +9234,8 @@ static void ggml_cuda_op_mul_mat(
9467
9234
  const int nb2 = dst->nb[2];
9468
9235
  const int nb3 = dst->nb[3];
9469
9236
 
9470
- GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
9471
- GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
9237
+ GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9238
+ GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9472
9239
  GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
9473
9240
 
9474
9241
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -9484,20 +9251,20 @@ static void ggml_cuda_op_mul_mat(
9484
9251
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
9485
9252
  ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9486
9253
 
9487
- const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
9254
+ const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
9488
9255
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
9489
9256
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
9490
9257
 
9491
9258
  const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
9492
9259
 
9493
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
9260
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
9494
9261
  GGML_ASSERT(!(split && ne02 > 1));
9495
9262
  GGML_ASSERT(!(split && ne03 > 1));
9496
9263
  GGML_ASSERT(!(split && ne02 < ne12));
9497
9264
 
9498
9265
  std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
9499
9266
  if (split) {
9500
- // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
9267
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
9501
9268
  // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
9502
9269
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
9503
9270
  tensor_split = buft_ctx->tensor_split;
@@ -9555,8 +9322,8 @@ static void ggml_cuda_op_mul_mat(
9555
9322
 
9556
9323
  used_devices++;
9557
9324
 
9558
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
9559
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
9325
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9326
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9560
9327
 
9561
9328
  ggml_cuda_set_device(id);
9562
9329
  cudaStream_t stream = g_cudaStreams[id][0];
@@ -9607,8 +9374,8 @@ static void ggml_cuda_op_mul_mat(
9607
9374
  continue;
9608
9375
  }
9609
9376
 
9610
- const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
9611
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
9377
+ const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9378
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
9612
9379
  const int64_t row_diff = dev[id].row_high - dev[id].row_low;
9613
9380
 
9614
9381
  ggml_cuda_set_device(id);
@@ -9633,12 +9400,12 @@ static void ggml_cuda_op_mul_mat(
9633
9400
 
9634
9401
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
9635
9402
  // in that case an offset on dst_ddf_i is needed
9636
- if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
9403
+ if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
9637
9404
  dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
9638
9405
  }
9639
9406
 
9640
9407
  // copy src0, src1 to device if necessary
9641
- if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
9408
+ if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
9642
9409
  if (id != g_main_device) {
9643
9410
  if (convert_src1_to_q8_1) {
9644
9411
  char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -9651,14 +9418,14 @@ static void ggml_cuda_op_mul_mat(
9651
9418
  src1_ncols*ne10*sizeof(float), stream));
9652
9419
  }
9653
9420
  }
9654
- } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
9421
+ } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
9655
9422
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
9656
9423
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
9657
9424
  } else {
9658
9425
  GGML_ASSERT(false);
9659
9426
  }
9660
9427
 
9661
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
9428
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
9662
9429
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
9663
9430
  CUDA_CHECK(cudaGetLastError());
9664
9431
  }
@@ -9676,10 +9443,10 @@ static void ggml_cuda_op_mul_mat(
9676
9443
  if (!dst_on_device) {
9677
9444
  void * dst_off_device;
9678
9445
  cudaMemcpyKind kind;
9679
- if (dst->backend == GGML_BACKEND_CPU) {
9446
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9680
9447
  dst_off_device = dst->data;
9681
9448
  kind = cudaMemcpyDeviceToHost;
9682
- } else if (dst->backend == GGML_BACKEND_GPU) {
9449
+ } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
9683
9450
  dst_off_device = dst_extra->data_device[g_main_device];
9684
9451
  kind = cudaMemcpyDeviceToDevice;
9685
9452
  } else {
@@ -9744,7 +9511,7 @@ static void ggml_cuda_op_mul_mat(
9744
9511
  }
9745
9512
  }
9746
9513
 
9747
- if (dst->backend == GGML_BACKEND_CPU) {
9514
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9748
9515
  ggml_cuda_set_device(g_main_device);
9749
9516
  CUDA_CHECK(cudaDeviceSynchronize());
9750
9517
  }
@@ -9829,6 +9596,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
9829
9596
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
9830
9597
  }
9831
9598
 
9599
+ static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9600
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
9601
+
9602
+ const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
9603
+
9604
+ // dd = data device
9605
+ float * src0_ddf = nullptr;
9606
+ float * src1_ddf = nullptr;
9607
+ float * dst_ddf = nullptr;
9608
+
9609
+ cuda_pool_alloc<float> dst_f;
9610
+
9611
+ ggml_cuda_set_device(g_main_device);
9612
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
9613
+
9614
+ if (dst_on_device) {
9615
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
9616
+ } else {
9617
+ dst_ddf = dst_f.alloc(ggml_nelements(dst));
9618
+ }
9619
+
9620
+ // do the computation
9621
+ ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
9622
+ CUDA_CHECK(cudaGetLastError());
9623
+
9624
+ // copy dst to host if necessary
9625
+ if (!dst_on_device) {
9626
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
9627
+ }
9628
+
9629
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
9630
+ CUDA_CHECK(cudaDeviceSynchronize());
9631
+ }
9632
+ }
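ggml_cuda_arange above gets a dedicated wrapper instead of going through ggml_cuda_op_flatten because ARANGE has no input tensors: the flatten helper unconditionally dereferences src0 to pick buffers and streams, which would be a null pointer here, so this wrapper only resolves (or temporarily allocates) the destination buffer. From graph-building code the op would be used roughly as follows (illustrative call; the exact host API is not shown in this diff):

    // hypothetical usage sketch; yields a 1-D F32 tensor { 0, 3, 6, 9 }
    struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 10.0f, 3.0f);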
9633
+
9634
+ static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9635
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
9636
+ }
9637
+
9832
9638
  static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9833
9639
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
9834
9640
  }
@@ -9850,7 +9656,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru
9850
9656
 
9851
9657
  static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
9852
9658
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
9853
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
9659
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9854
9660
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
9855
9661
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
9856
9662
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -9881,7 +9687,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
9881
9687
  GGML_ASSERT(!ggml_is_transposed(src0));
9882
9688
  GGML_ASSERT(!ggml_is_transposed(src1));
9883
9689
  GGML_ASSERT(!ggml_is_permuted(src0));
9884
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
9690
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9885
9691
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
9886
9692
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
9887
9693
 
@@ -9940,7 +9746,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
9940
9746
  GGML_ASSERT(!ggml_is_transposed(src0));
9941
9747
  GGML_ASSERT(!ggml_is_transposed(src1));
9942
9748
 
9943
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
9749
+ GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
9944
9750
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
9945
9751
 
9946
9752
  GGML_TENSOR_BINARY_OP_LOCALS
@@ -10086,11 +9892,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
10086
9892
 
10087
9893
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10088
9894
  const bool all_on_device =
10089
- (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
10090
- (src1->backend == GGML_BACKEND_GPU) &&
10091
- ( dst->backend == GGML_BACKEND_GPU);
9895
+ (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
9896
+ (src1->backend == GGML_BACKEND_TYPE_GPU) &&
9897
+ ( dst->backend == GGML_BACKEND_TYPE_GPU);
10092
9898
 
10093
- const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
9899
+ const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
10094
9900
 
10095
9901
  int64_t min_compute_capability = INT_MAX;
10096
9902
 
@@ -10240,7 +10046,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
10240
10046
  GGML_ASSERT(!ggml_is_transposed(src00));
10241
10047
  GGML_ASSERT(!ggml_is_transposed(src1));
10242
10048
 
10243
- GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
10049
+ GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
10244
10050
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
10245
10051
 
10246
10052
  const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
@@ -10384,7 +10190,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10384
10190
 
10385
10191
  cudaStream_t stream = g_cudaStreams[g_main_device][0];
10386
10192
 
10387
- if (ids->backend == GGML_BACKEND_GPU) {
10193
+ if (ids->backend == GGML_BACKEND_TYPE_GPU) {
10388
10194
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
10389
10195
  CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
10390
10196
  CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -10401,20 +10207,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10401
10207
  ggml_tensor src1_row = *src1;
10402
10208
  ggml_tensor dst_row = *dst;
10403
10209
 
10404
- src1_row.backend = GGML_BACKEND_GPU;
10405
- dst_row.backend = GGML_BACKEND_GPU;
10210
+ src1_row.backend = GGML_BACKEND_TYPE_GPU;
10211
+ dst_row.backend = GGML_BACKEND_TYPE_GPU;
10406
10212
 
10407
10213
  src1_row.extra = &src1_row_extra;
10408
10214
  dst_row.extra = &dst_row_extra;
10409
10215
 
10410
- char * src1_original = src1->backend == GGML_BACKEND_CPU ?
10216
+ char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
10411
10217
  (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
10412
- char * dst_original = dst->backend == GGML_BACKEND_CPU ?
10218
+ char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
10413
10219
  (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
10414
10220
 
10415
10221
  if (src1->ne[1] == 1) {
10416
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
10417
- GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
10222
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
10223
+ GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
10418
10224
 
10419
10225
  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
10420
10226
  //int32_t row_id;
@@ -10442,9 +10248,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10442
10248
  src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
10443
10249
  dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
10444
10250
 
10445
- const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
10251
+ const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
10446
10252
  cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
10447
- const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
10253
+ const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
10448
10254
  cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
10449
10255
 
10450
10256
  for (int32_t row_id = 0; row_id < n_as; ++row_id) {
@@ -10499,7 +10305,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
10499
10305
  }
10500
10306
  }
10501
10307
 
10502
- if (dst->backend == GGML_BACKEND_CPU) {
10308
+ if (dst->backend == GGML_BACKEND_TYPE_CPU) {
10503
10309
  CUDA_CHECK(cudaStreamSynchronize(stream));
10504
10310
  }
10505
10311
  }
@@ -10516,8 +10322,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
10516
10322
  const int64_t ne = ggml_nelements(src0);
10517
10323
  GGML_ASSERT(ne == ggml_nelements(src1));
10518
10324
 
10519
- GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
10520
- GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
10325
+ GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
10326
+ GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
10521
10327
 
10522
10328
  GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
10523
10329
  GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -10648,9 +10454,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10648
10454
  if (!g_cublas_loaded) return false;
10649
10455
 
10650
10456
  ggml_cuda_func_t func;
10651
- const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
10652
- || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
10653
- || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
10457
+ const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
10458
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
10459
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
10654
10460
 
10655
10461
  if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
10656
10462
  return false;
@@ -10729,6 +10535,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10729
10535
  case GGML_OP_PAD:
10730
10536
  func = ggml_cuda_pad;
10731
10537
  break;
10538
+ case GGML_OP_ARANGE:
10539
+ func = ggml_cuda_arange;
10540
+ break;
10541
+ case GGML_OP_TIMESTEP_EMBEDDING:
10542
+ func = ggml_cuda_timestep_embedding;
10543
+ break;
10732
10544
  case GGML_OP_LEAKY_RELU:
10733
10545
  func = ggml_cuda_leaky_relu;
10734
10546
  break;
@@ -10797,14 +10609,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10797
10609
  return false;
10798
10610
  }
10799
10611
 
10800
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
10612
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
10801
10613
  ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
10802
10614
  }
10803
10615
 
10804
10616
  if (params->ith != 0) {
10805
10617
  return true;
10806
10618
  }
10807
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10619
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
10808
10620
  return true;
10809
10621
  }
10810
10622
  func(tensor->src[0], tensor->src[1], tensor);
@@ -10832,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
10832
10644
  #define UNUSED GGML_UNUSED
10833
10645
 
10834
10646
  struct ggml_backend_cuda_context {
10647
+ explicit ggml_backend_cuda_context(int device) :
10648
+ device(device),
10649
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10650
+ }
10651
+
10652
+ ~ggml_backend_cuda_context() {
10653
+ if (copy_event != nullptr) {
10654
+ CUDA_CHECK(cudaEventDestroy(copy_event));
10655
+ }
10656
+ }
10657
+
10835
10658
  int device;
10836
10659
  std::string name;
10660
+ cudaEvent_t copy_event = nullptr;
10837
10661
  };
10838
10662
 
10839
10663
  // cuda buffer
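
Note: the hunk above gives ggml_backend_cuda_context a real constructor and a destructor, so the new copy_event member, which is created on demand by the asynchronous cross-backend copy further down, is released when the backend is freed. A minimal standalone illustration of a lazily created, destructor-owned event (not taken from the diff; plain CUDA runtime only):

    #include <cuda_runtime.h>

    // RAII-style holder: the event is created on first use and destroyed with its owner.
    struct lazy_cuda_event {
        cudaEvent_t ev = nullptr;

        cudaEvent_t get() {
            if (ev == nullptr) {
                // timing disabled: the event is only used for ordering, not profiling
                cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
            }
            return ev;
        }

        ~lazy_cuda_event() {
            if (ev != nullptr) {
                cudaEventDestroy(ev);
            }
        }
    };
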
@@ -10903,7 +10727,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
10903
10727
 
10904
10728
  extra->data_device[ctx->device] = tensor->data;
10905
10729
 
10906
- tensor->backend = GGML_BACKEND_GPU;
10730
+ tensor->backend = GGML_BACKEND_TYPE_GPU;
10907
10731
  tensor->extra = extra;
10908
10732
 
10909
10733
  if (ggml_is_quantized(tensor->type)) {
@@ -10918,42 +10742,40 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
10918
10742
  }
10919
10743
 
10920
10744
  GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10921
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10745
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
10922
10746
 
10923
10747
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10924
10748
 
10925
10749
  ggml_cuda_set_device(ctx->device);
10926
- CUDA_CHECK(cudaDeviceSynchronize());
10927
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
10928
- CUDA_CHECK(cudaDeviceSynchronize());
10750
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
10751
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
10929
10752
  }
10930
10753
 
10931
10754
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10932
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10755
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
10933
10756
 
10934
10757
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10935
10758
 
10936
10759
  ggml_cuda_set_device(ctx->device);
10937
- CUDA_CHECK(cudaDeviceSynchronize());
10938
- CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
10939
- CUDA_CHECK(cudaDeviceSynchronize());
10760
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
10761
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
10940
10762
  }
10941
10763
 
10942
10764
  GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10943
10765
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
10944
10766
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10945
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10946
-
10947
- ggml_cuda_set_device(src_ctx->device);
10948
- CUDA_CHECK(cudaDeviceSynchronize());
10949
- ggml_cuda_set_device(dst_ctx->device);
10950
- CUDA_CHECK(cudaDeviceSynchronize());
10951
- CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
10952
- CUDA_CHECK(cudaDeviceSynchronize());
10953
-
10767
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
10768
+ if (src_ctx->device == dst_ctx->device) {
10769
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
10770
+ } else {
10771
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
10772
+ }
10773
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
10954
10774
  return true;
10955
10775
  }
10956
10776
  return false;
10777
+
10778
+ UNUSED(buffer);
10957
10779
  }
10958
10780
 
10959
10781
  GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
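
Note: the set/get/cpy hunk above drops the blocking cudaDeviceSynchronize()/cudaMemcpy() pairs and instead enqueues each copy with cudaMemcpyAsync() on cudaStreamPerThread (or cudaMemcpyPeerAsync() for cross-device copies), then synchronizes only that stream, so a tensor copy no longer stalls every other stream on the device. A minimal standalone sketch of the same pattern (not from the diff; assumes only the CUDA runtime):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    // Copy host data to a device buffer using the implicit per-thread stream.
    static void copy_to_device(void * dst_dev, const void * src_host, size_t size) {
        cudaMemcpyAsync(dst_dev, src_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread);
        cudaStreamSynchronize(cudaStreamPerThread); // wait for this stream only
    }

    int main() {
        std::vector<float> host(1024, 1.0f);
        float * dev = nullptr;
        cudaMalloc((void **) &dev, host.size() * sizeof(float));
        copy_to_device(dev, host.data(), host.size() * sizeof(float));
        cudaFree(dev);
        printf("copy done\n");
        return 0;
    }
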
@@ -11164,7 +10986,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
11164
10986
  CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
11165
10987
  }
11166
10988
  }
11167
- tensor->backend = GGML_BACKEND_GPU_SPLIT;
10989
+ tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
11168
10990
  tensor->extra = extra;
11169
10991
  }
11170
10992
 
@@ -11198,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
11198
11020
  }
11199
11021
 
11200
11022
  const char * buf_host = (const char *)data + offset_split;
11201
- CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
11023
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
11024
+ }
11025
+
11026
+ for (int id = 0; id < g_device_count; ++id) {
11027
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11202
11028
  }
11203
11029
  }
11204
11030
 
@@ -11232,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
11232
11058
  }
11233
11059
 
11234
11060
  char * buf_host = (char *)data + offset_split;
11235
- CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
11061
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
11062
+ }
11063
+
11064
+ for (int id = 0; id < g_device_count; ++id) {
11065
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11236
11066
  }
11237
11067
  }
11238
11068
 
@@ -11411,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
11411
11241
  return &ggml_backend_cuda_buffer_type_host;
11412
11242
  }
11413
11243
 
11244
+ //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
11245
+ // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
11246
+ //}
11247
+
11414
11248
  // backend
11415
11249
 
11416
11250
  GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -11434,31 +11268,71 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
11434
11268
 
11435
11269
  GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
11436
11270
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11271
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
11437
11272
 
11438
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11439
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
11273
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11274
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11440
11275
 
11441
11276
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
11442
11277
  }
11443
11278
 
11444
11279
  GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
11445
11280
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11281
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
11446
11282
 
11447
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11448
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
11283
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11284
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
11449
11285
 
11450
11286
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
11451
11287
  }
11452
11288
 
11453
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
11454
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11289
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
11290
+ GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
11455
11291
 
11456
- if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
11457
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
11458
- return true;
11292
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
11293
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
11294
+
11295
+ if (!ggml_backend_buffer_is_cuda(src->buffer)) {
11296
+ return false;
11459
11297
  }
11460
11298
 
11461
- return false;
11299
+ if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
11300
+ return false;
11301
+ }
11302
+
11303
+ // device -> device
11304
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
11305
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
11306
+
11307
+ if (backend_src != backend_dst) {
11308
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
11309
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
11310
+
11311
+ GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
11312
+ GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
11313
+
11314
+ if (!cuda_ctx_src->copy_event) {
11315
+ ggml_cuda_set_device(cuda_ctx_src->device);
11316
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
11317
+ }
11318
+
11319
+ // copy on src stream
11320
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
11321
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11322
+ } else {
11323
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
11324
+ }
11325
+
11326
+ // record event on src stream
11327
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
11328
+
11329
+ // wait on dst stream for the copy to complete
11330
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
11331
+ } else {
11332
+ // src and dst are on the same backend
11333
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11334
+ }
11335
+ return true;
11462
11336
  }
11463
11337
 
11464
11338
  GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
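
Note: the rewritten cpy_tensor_async above now receives both the source and the destination backend. The copy is enqueued on the source backend's stream (cudaMemcpyPeerAsync when the devices differ), an event is recorded behind it, and the destination stream is made to wait on that event, so no host-side synchronization is required. A sketch of that ordering with hypothetical names (dst_dev, src_stream, copy_event, and so on; not the diff's code):

    #include <cuda_runtime.h>
    #include <cstddef>

    // Enqueue a device-to-device copy on src_stream and make dst_stream wait for it.
    static void ordered_copy(void * dst, int dst_dev, cudaStream_t dst_stream,
                             const void * src, int src_dev, cudaStream_t src_stream,
                             size_t nbytes, cudaEvent_t copy_event) {
        if (src_dev == dst_dev) {
            cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, src_stream);
        } else {
            cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, nbytes, src_stream);
        }
        cudaEventRecord(copy_event, src_stream);        // copy finished on src_stream ...
        cudaStreamWaitEvent(dst_stream, copy_event, 0); // ... before dst_stream proceeds
    }
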
@@ -11469,13 +11343,13 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
11469
11343
  UNUSED(backend);
11470
11344
  }
11471
11345
 
11472
- GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
11346
+ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
11473
11347
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11474
11348
 
11475
11349
  ggml_cuda_set_main_device(cuda_ctx->device);
11476
11350
 
11477
11351
  ggml_compute_params params = {};
11478
- params.type = GGML_TASK_COMPUTE;
11352
+ params.type = GGML_TASK_TYPE_COMPUTE;
11479
11353
  params.ith = 0;
11480
11354
  for (int i = 0; i < cgraph->n_nodes; i++) {
11481
11355
  ggml_tensor * node = cgraph->nodes[i];
@@ -11485,13 +11359,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
11485
11359
  }
11486
11360
 
11487
11361
  #ifndef NDEBUG
11488
- assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
11362
+ assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
11489
11363
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
11490
11364
  assert(node->extra != nullptr);
11491
11365
 
11492
11366
  for (int j = 0; j < GGML_MAX_SRC; j++) {
11493
11367
  if (node->src[j] != nullptr) {
11494
- assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
11368
+ assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
11495
11369
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
11496
11370
  assert(node->src[j]->extra != nullptr);
11497
11371
  }
@@ -11505,7 +11379,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
11505
11379
  GGML_ASSERT(ok);
11506
11380
  }
11507
11381
 
11508
- return true;
11382
+ return GGML_STATUS_SUCCESS;
11509
11383
  }
11510
11384
 
11511
11385
  GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
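
Note: with ggml_backend_cuda_graph_compute now reporting an enum ggml_status instead of a bool (see the hunks above), callers can distinguish success from the other status codes. A hypothetical caller, assuming the public ggml_backend_graph_compute() wrapper forwards the same enum in this version (illustrative helper, not from the diff):

    #include "ggml-backend.h"
    #include <cstdio>

    // Run a graph and report a non-success status.
    static bool compute_or_report(ggml_backend_t backend, struct ggml_cgraph * graph) {
        const enum ggml_status status = ggml_backend_graph_compute(backend, graph);
        if (status != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "graph compute failed with status %d\n", (int) status);
            return false;
        }
        return true;
    }
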
@@ -11541,7 +11415,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
11541
11415
  }
11542
11416
  ggml_type a_type = a->type;
11543
11417
  if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
11544
- a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
11418
+ a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
11419
+ a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
11545
11420
  if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
11546
11421
  return false;
11547
11422
  }
@@ -11623,6 +11498,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
11623
11498
  case GGML_OP_GROUP_NORM:
11624
11499
  case GGML_OP_UPSCALE:
11625
11500
  case GGML_OP_PAD:
11501
+ case GGML_OP_ARANGE:
11502
+ case GGML_OP_TIMESTEP_EMBEDDING:
11626
11503
  case GGML_OP_LEAKY_RELU:
11627
11504
  return true;
11628
11505
  default:
@@ -11632,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
11632
11509
  UNUSED(backend);
11633
11510
  }
11634
11511
 
11512
+ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
11513
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11514
+
11515
+ ggml_cuda_set_device(cuda_ctx->device);
11516
+
11517
+ cudaEvent_t event;
11518
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
11519
+
11520
+ return new ggml_backend_event {
11521
+ /* .backend = */ backend,
11522
+ /* .context = */ event,
11523
+ };
11524
+ }
11525
+
11526
+ static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
11527
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
11528
+
11529
+ delete event;
11530
+ }
11531
+
11532
+ static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
11533
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
11534
+
11535
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
11536
+ }
11537
+
11538
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
11539
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11540
+
11541
+ if (ggml_backend_is_cuda(event->backend)) {
11542
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
11543
+ } else {
11544
+ // untested
11545
+ auto wait_fn = [](void * user_data) {
11546
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
11547
+ ggml_backend_event_synchronize(event);
11548
+ };
11549
+
11550
+ CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
11551
+ }
11552
+ }
11553
+
11554
+ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
11555
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
11556
+ }
11557
+
11635
11558
  static ggml_backend_i ggml_backend_cuda_interface = {
11636
11559
  /* .get_name = */ ggml_backend_cuda_name,
11637
11560
  /* .free = */ ggml_backend_cuda_free,
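
Note: the new hooks added above back the ggml_backend event API with cudaEvent_t: event_new/event_free create and destroy a timing-disabled event, event_record records it on the device's main stream, event_wait inserts a stream wait (falling back to cudaLaunchHostFunc with a host callback when the event comes from a non-CUDA backend, marked "untested" in the diff), and event_synchronize blocks the host; the next hunk wires them into the interface table. A standalone sketch of the underlying CUDA primitives (not from the diff):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        cudaStream_t producer, consumer;
        cudaStreamCreateWithFlags(&producer, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&consumer, cudaStreamNonBlocking);

        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); // event_new
        cudaEventRecord(ev, producer);                         // event_record
        cudaStreamWaitEvent(consumer, ev, 0);                  // event_wait
        cudaEventSynchronize(ev);                              // event_synchronize
        cudaEventDestroy(ev);                                  // event_free

        cudaStreamDestroy(producer);
        cudaStreamDestroy(consumer);
        printf("event round-trip ok\n");
        return 0;
    }
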
@@ -11645,8 +11568,18 @@ static ggml_backend_i ggml_backend_cuda_interface = {
11645
11568
  /* .graph_plan_compute = */ NULL,
11646
11569
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
11647
11570
  /* .supports_op = */ ggml_backend_cuda_supports_op,
11571
+ /* .event_new = */ ggml_backend_cuda_event_new,
11572
+ /* .event_free = */ ggml_backend_cuda_event_free,
11573
+ /* .event_record = */ ggml_backend_cuda_event_record,
11574
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
11575
+ /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
11648
11576
  };
11649
11577
 
11578
+ static ggml_guid_t ggml_backend_cuda_guid() {
11579
+ static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
11580
+ return &guid;
11581
+ }
11582
+
11650
11583
  GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
11651
11584
  ggml_init_cublas(); // TODO: remove from ggml.c
11652
11585
 
@@ -11658,12 +11591,14 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
11658
11591
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
11659
11592
  ggml_cuda_set_main_device(device);
11660
11593
 
11661
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
11662
- /* .device = */ device,
11663
- /* .name = */ GGML_CUDA_NAME + std::to_string(device),
11664
- };
11594
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
11595
+ if (ctx == nullptr) {
11596
+ fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
11597
+ return nullptr;
11598
+ }
11665
11599
 
11666
11600
  ggml_backend_t cuda_backend = new ggml_backend {
11601
+ /* .guid = */ ggml_backend_cuda_guid(),
11667
11602
  /* .interface = */ ggml_backend_cuda_interface,
11668
11603
  /* .context = */ ctx
11669
11604
  };
@@ -11672,7 +11607,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
11672
11607
  }
11673
11608
 
11674
11609
  GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
11675
- return backend && backend->iface.get_name == ggml_backend_cuda_name;
11610
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
11676
11611
  }
11677
11612
 
11678
11613
  GGML_CALL int ggml_backend_cuda_get_device_count() {