llama_cpp 0.14.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,15 @@
2
2
  #include "ggml.h"
3
3
  #include "ggml-backend-impl.h"
4
4
 
5
+ #if defined(GGML_USE_HIPBLAS)
6
+ #define GGML_COMMON_DECL_HIP
7
+ #define GGML_COMMON_IMPL_HIP
8
+ #else
9
+ #define GGML_COMMON_DECL_CUDA
10
+ #define GGML_COMMON_IMPL_CUDA
11
+ #endif
12
+ #include "ggml-common.h"
13
+
5
14
  #include <algorithm>
6
15
  #include <assert.h>
7
16
  #include <atomic>
@@ -63,6 +72,7 @@
63
72
  #define cudaEventCreateWithFlags hipEventCreateWithFlags
64
73
  #define cudaEventDisableTiming hipEventDisableTiming
65
74
  #define cudaEventRecord hipEventRecord
75
+ #define cudaEventSynchronize hipEventSynchronize
66
76
  #define cudaEvent_t hipEvent_t
67
77
  #define cudaEventDestroy hipEventDestroy
68
78
  #define cudaFree hipFree
@@ -72,6 +82,7 @@
72
82
  #define cudaGetDeviceProperties hipGetDeviceProperties
73
83
  #define cudaGetErrorString hipGetErrorString
74
84
  #define cudaGetLastError hipGetLastError
85
+ #define cudaLaunchHostFunc hipLaunchHostFunc
75
86
  #ifdef GGML_HIP_UMA
76
87
  #define cudaMalloc hipMallocManaged
77
88
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -95,6 +106,7 @@
95
106
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
96
107
  #define cudaStreamFireAndForget hipStreamFireAndForget
97
108
  #define cudaStreamNonBlocking hipStreamNonBlocking
109
+ #define cudaStreamPerThread hipStreamPerThread
98
110
  #define cudaStreamSynchronize hipStreamSynchronize
99
111
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
100
112
  #define cudaStream_t hipStream_t
@@ -356,66 +368,6 @@ typedef void (*ggml_cuda_op_flatten_t)(
356
368
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
357
369
  const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
358
370
 
359
- // QK = number of values after dequantization
360
- // QR = QK / number of values before dequantization
361
- // QI = number of 32 bit integers before dequantization
362
-
363
- #define QK4_0 32
364
- #define QR4_0 2
365
- #define QI4_0 (QK4_0 / (4 * QR4_0))
366
- typedef struct {
367
- half d; // delta
368
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
369
- } block_q4_0;
370
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
371
-
372
- #define QK4_1 32
373
- #define QR4_1 2
374
- #define QI4_1 (QK4_1 / (4 * QR4_1))
375
- typedef struct {
376
- half2 dm; // dm.x = delta, dm.y = min
377
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
378
- } block_q4_1;
379
- static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
380
-
381
- #define QK5_0 32
382
- #define QR5_0 2
383
- #define QI5_0 (QK5_0 / (4 * QR5_0))
384
- typedef struct {
385
- half d; // delta
386
- uint8_t qh[4]; // 5-th bit of quants
387
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
388
- } block_q5_0;
389
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
390
-
391
- #define QK5_1 32
392
- #define QR5_1 2
393
- #define QI5_1 (QK5_1 / (4 * QR5_1))
394
- typedef struct {
395
- half2 dm; // dm.x = delta, dm.y = min
396
- uint8_t qh[4]; // 5-th bit of quants
397
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
398
- } block_q5_1;
399
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
400
-
401
- #define QK8_0 32
402
- #define QR8_0 1
403
- #define QI8_0 (QK8_0 / (4 * QR8_0))
404
- typedef struct {
405
- half d; // delta
406
- int8_t qs[QK8_0]; // quants
407
- } block_q8_0;
408
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
409
-
410
- #define QK8_1 32
411
- #define QR8_1 1
412
- #define QI8_1 (QK8_1 / (4 * QR8_1))
413
- typedef struct {
414
- half2 ds; // ds.x = delta, ds.y = sum
415
- int8_t qs[QK8_0]; // quants
416
- } block_q8_1;
417
- static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
418
-
419
371
  typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
420
372
  typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
421
373
  typedef void (*load_tiles_cuda_t)(
@@ -425,174 +377,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
425
377
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
426
378
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
427
379
 
428
- //================================= k-quants
429
-
430
- #ifdef GGML_QKK_64
431
- #define QK_K 64
432
- #define K_SCALE_SIZE 4
433
- #else
434
- #define QK_K 256
435
- #define K_SCALE_SIZE 12
436
- #endif
437
-
438
- #define QR2_K 4
439
- #define QI2_K (QK_K / (4*QR2_K))
440
- typedef struct {
441
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
442
- uint8_t qs[QK_K/4]; // quants
443
- half2 dm; // super-block scale for quantized scales/mins
444
- } block_q2_K;
445
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
446
-
447
- #define QR3_K 4
448
- #define QI3_K (QK_K / (4*QR3_K))
449
- typedef struct {
450
- uint8_t hmask[QK_K/8]; // quants - high bit
451
- uint8_t qs[QK_K/4]; // quants - low 2 bits
452
- #ifdef GGML_QKK_64
453
- uint8_t scales[2]; // scales, quantized with 8 bits
454
- #else
455
- uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
456
- #endif
457
- half d; // super-block scale
458
- } block_q3_K;
459
- //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
460
-
461
- #define QR4_K 2
462
- #define QI4_K (QK_K / (4*QR4_K))
463
- #ifdef GGML_QKK_64
464
- typedef struct {
465
- half dm[2]; // super-block scales/mins
466
- uint8_t scales[2]; // 4-bit block scales/mins
467
- uint8_t qs[QK_K/2]; // 4--bit quants
468
- } block_q4_K;
469
- static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
470
- #else
471
- typedef struct {
472
- half2 dm; // super-block scale for quantized scales/mins
473
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
474
- uint8_t qs[QK_K/2]; // 4--bit quants
475
- } block_q4_K;
476
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
477
- #endif
478
-
479
- #define QR5_K 2
480
- #define QI5_K (QK_K / (4*QR5_K))
481
- #ifdef GGML_QKK_64
482
- typedef struct {
483
- half d; // super-block scale
484
- int8_t scales[QK_K/16]; // block scales
485
- uint8_t qh[QK_K/8]; // quants, high bit
486
- uint8_t qs[QK_K/2]; // quants, low 4 bits
487
- } block_q5_K;
488
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
489
- #else
490
- typedef struct {
491
- half2 dm; // super-block scale for quantized scales/mins
492
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
493
- uint8_t qh[QK_K/8]; // quants, high bit
494
- uint8_t qs[QK_K/2]; // quants, low 4 bits
495
- } block_q5_K;
496
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
497
- #endif
498
-
499
- #define QR6_K 2
500
- #define QI6_K (QK_K / (4*QR6_K))
501
- typedef struct {
502
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
503
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
504
- int8_t scales[QK_K/16]; // scales
505
- half d; // delta
506
- } block_q6_K;
507
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
508
-
509
- #define QR2_XXS 8
510
- #define QI2_XXS (QK_K / (4*QR2_XXS))
511
- typedef struct {
512
- half d;
513
- uint16_t qs[QK_K/8];
514
- } block_iq2_xxs;
515
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
516
-
517
- #define QR2_XS 8
518
- #define QI2_XS (QK_K / (4*QR2_XS))
519
- typedef struct {
520
- half d;
521
- uint16_t qs[QK_K/8];
522
- uint8_t scales[QK_K/32];
523
- } block_iq2_xs;
524
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
525
-
526
- // 2.5625 bpw quants
527
- #define QR2_S 8
528
- #define QI2_S (QK_K / (4*QR2_S))
529
- typedef struct {
530
- half d;
531
- uint8_t qs[QK_K/4];
532
- uint8_t qh[QK_K/32];
533
- uint8_t scales[QK_K/32];
534
- } block_iq2_s;
535
- static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
536
-
537
- #define QR3_XXS 8
538
- #define QI3_XXS (QK_K / (4*QR3_XXS))
539
- typedef struct {
540
- half d;
541
- uint8_t qs[3*(QK_K/8)];
542
- } block_iq3_xxs;
543
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
544
-
545
- #define QR3_XS 8
546
- #define QI3_XS (QK_K / (4*QR3_XS))
547
- #if QK_K == 64
548
- #define IQ3S_N_SCALE 2
549
- #else
550
- #define IQ3S_N_SCALE QK_K/64
551
- #endif
552
- typedef struct {
553
- half d;
554
- uint8_t qs[QK_K/4];
555
- uint8_t qh[QK_K/32];
556
- uint8_t signs[QK_K/8];
557
- uint8_t scales[IQ3S_N_SCALE];
558
- } block_iq3_s;
559
- static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
560
-
561
- #define QR1_S 8
562
- #define QI1_S (QK_K / (4*QR1_S))
563
- typedef struct {
564
- half d;
565
- uint8_t qs[QK_K/8];
566
- uint8_t scales[QK_K/16];
567
- } block_iq1_s;
568
- static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
569
-
570
- #define QK4_NL 32
571
- #define QR4_NL 2
572
- #define QI4_NL (QK4_NL / (4*QR4_NL))
573
- typedef struct {
574
- half d;
575
- uint8_t qs[QK4_NL/2];
576
- } block_iq4_nl;
577
- static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
578
-
579
- #if QK_K == 64
580
- #define block_iq4_xs block_iq4_nl
581
- #define QR4_XS QR4_NL
582
- #define QI4_XS QI4_NL
583
- #else
584
- // QR4_XS = 8 is very slightly faster than QR4_XS = 4
585
- #define QR4_XS 8
586
- #define QI4_XS (QK_K / (4*QR4_XS))
587
- typedef struct {
588
- half d;
589
- uint16_t scales_h;
590
- uint8_t scales_l[QK_K/64];
591
- uint8_t qs[QK_K/2];
592
- } block_iq4_xs;
593
- static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
594
- #endif
595
-
596
380
  #define WARP_SIZE 32
597
381
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
598
382
 
@@ -1569,746 +1353,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
1569
1353
  #endif
1570
1354
  }
1571
1355
 
1572
- static const __device__ uint64_t iq2xxs_grid[256] = {
1573
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1574
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
1575
- 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
1576
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
1577
- 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
1578
- 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
1579
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
1580
- 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
1581
- 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
1582
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
1583
- 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
1584
- 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
1585
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
1586
- 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
1587
- 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
1588
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
1589
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
1590
- 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
1591
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
1592
- 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
1593
- 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
1594
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
1595
- 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
1596
- 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
1597
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
1598
- 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
1599
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
1600
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
1601
- 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
1602
- 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
1603
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
1604
- 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
1605
- 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
1606
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
1607
- 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
1608
- 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
1609
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
1610
- 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
1611
- 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
1612
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
1613
- 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
1614
- 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
1615
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
1616
- 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
1617
- 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
1618
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
1619
- 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
1620
- 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
1621
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
1622
- 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
1623
- 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
1624
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
1625
- 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
1626
- 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
1627
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
1628
- 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
1629
- 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
1630
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
1631
- 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
1632
- 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
1633
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
1634
- 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
1635
- 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
1636
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
1637
- };
1638
-
1639
- static const __device__ uint64_t iq2xs_grid[512] = {
1640
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1641
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1642
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1643
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1644
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1645
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
1646
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
1647
- 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
1648
- 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
1649
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
1650
- 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
1651
- 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
1652
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
1653
- 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
1654
- 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
1655
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
1656
- 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
1657
- 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
1658
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
1659
- 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
1660
- 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
1661
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
1662
- 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
1663
- 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
1664
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
1665
- 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
1666
- 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
1667
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
1668
- 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
1669
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
1670
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
1671
- 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
1672
- 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
1673
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
1674
- 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
1675
- 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
1676
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
1677
- 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
1678
- 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
1679
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
1680
- 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
1681
- 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
1682
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
1683
- 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
1684
- 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
1685
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
1686
- 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
1687
- 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
1688
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
1689
- 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
1690
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
1691
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
1692
- 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
1693
- 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
1694
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
1695
- 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
1696
- 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
1697
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
1698
- 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
1699
- 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
1700
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
1701
- 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
1702
- 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
1703
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
1704
- 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
1705
- 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
1706
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
1707
- 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
1708
- 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
1709
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
1710
- 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
1711
- 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
1712
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
1713
- 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
1714
- 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
1715
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
1716
- 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
1717
- 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
1718
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
1719
- 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
1720
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
1721
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
1722
- 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
1723
- 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
1724
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
1725
- 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
1726
- 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
1727
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
1728
- 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
1729
- 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
1730
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
1731
- 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
1732
- 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
1733
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
1734
- 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
1735
- 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
1736
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
1737
- 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
1738
- 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
1739
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
1740
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
1741
- 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
1742
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
1743
- 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
1744
- 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
1745
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
1746
- 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
1747
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
1748
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
1749
- 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
1750
- 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
1751
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
1752
- 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
1753
- 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
1754
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
1755
- 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
1756
- 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
1757
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
1758
- 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
1759
- 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
1760
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
1761
- 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
1762
- 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
1763
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
1764
- 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
1765
- 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
1766
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
1767
- 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1768
- };
1769
-
1770
- static const __device__ uint64_t iq2s_grid[1024] = {
1771
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1772
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1773
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1774
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1775
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1776
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
1777
- 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
1778
- 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
1779
- 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
1780
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
1781
- 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
1782
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
1783
- 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
1784
- 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
1785
- 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
1786
- 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
1787
- 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
1788
- 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
1789
- 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
1790
- 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
1791
- 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
1792
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
1793
- 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
1794
- 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
1795
- 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
1796
- 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
1797
- 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
1798
- 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
1799
- 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
1800
- 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
1801
- 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
1802
- 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
1803
- 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
1804
- 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
1805
- 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
1806
- 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
1807
- 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
1808
- 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
1809
- 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
1810
- 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
1811
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
1812
- 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
1813
- 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
1814
- 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
1815
- 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
1816
- 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
1817
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
1818
- 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
1819
- 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
1820
- 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
1821
- 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
1822
- 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
1823
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
1824
- 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
1825
- 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
1826
- 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
1827
- 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
1828
- 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
1829
- 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
1830
- 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
1831
- 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
1832
- 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
1833
- 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
1834
- 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
1835
- 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
1836
- 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
1837
- 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
1838
- 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
1839
- 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
1840
- 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
1841
- 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
1842
- 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
1843
- 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
1844
- 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
1845
- 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
1846
- 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
1847
- 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
1848
- 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
1849
- 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
1850
- 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
1851
- 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
1852
- 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
1853
- 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
1854
- 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
1855
- 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
1856
- 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
1857
- 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
1858
- 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
1859
- 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
1860
- 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
1861
- 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
1862
- 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
1863
- 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
1864
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
1865
- 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
1866
- 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
1867
- 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
1868
- 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
1869
- 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
1870
- 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
1871
- 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
1872
- 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
1873
- 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
1874
- 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
1875
- 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
1876
- 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
1877
- 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
1878
- 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
1879
- 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
1880
- 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
1881
- 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
1882
- 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
1883
- 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
1884
- 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
1885
- 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
1886
- 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
1887
- 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
1888
- 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
1889
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
1890
- 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
1891
- 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
1892
- 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
1893
- 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
1894
- 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
1895
- 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
1896
- 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
1897
- 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
1898
- 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
1899
- 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
1900
- 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
1901
- 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
1902
- 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
1903
- 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
1904
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
1905
- 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
1906
- 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
1907
- 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
1908
- 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
1909
- 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
1910
- 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
1911
- 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
1912
- 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
1913
- 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
1914
- 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
1915
- 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
1916
- 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
1917
- 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
1918
- 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
1919
- 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
1920
- 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
1921
- 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
1922
- 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
1923
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
1924
- 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
1925
- 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
1926
- 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
1927
- 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
1928
- 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
1929
- 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
1930
- 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
1931
- 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
1932
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
1933
- 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
1934
- 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
1935
- 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
1936
- 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
1937
- 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
1938
- 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
1939
- 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
1940
- 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
1941
- 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
1942
- 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
1943
- 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
1944
- 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
1945
- 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
1946
- 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
1947
- 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
1948
- 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
1949
- 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
1950
- 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
1951
- 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
1952
- 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
1953
- 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
1954
- 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
1955
- 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
1956
- 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
1957
- 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
1958
- 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
1959
- 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
1960
- 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
1961
- 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
1962
- 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
1963
- 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
1964
- 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
1965
- 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
1966
- 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
1967
- 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
1968
- 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
1969
- 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
1970
- 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
1971
- 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
1972
- 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
1973
- 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
1974
- 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
1975
- 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
1976
- 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
1977
- 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
1978
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
1979
- 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
1980
- 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
1981
- 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
1982
- 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
1983
- 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
1984
- 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
1985
- 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
1986
- 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
1987
- 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
1988
- 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
1989
- 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
1990
- 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
1991
- 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
1992
- 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
1993
- 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
1994
- 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
1995
- 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
1996
- 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
1997
- 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
1998
- 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
1999
- 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
2000
- 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
2001
- 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
2002
- 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
2003
- 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
2004
- 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
2005
- 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
2006
- 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
2007
- 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
2008
- 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
2009
- 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
2010
- 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
2011
- 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
2012
- 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
2013
- 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
2014
- 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
2015
- 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
2016
- 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
2017
- 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
2018
- 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
2019
- 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
2020
- 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
2021
- 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
2022
- 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
2023
- 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
2024
- 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
2025
- 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
2026
- 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
2027
- };
2028
-
2029
- static const __device__ uint32_t iq3xxs_grid[256] = {
2030
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
2031
- 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
2032
- 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
2033
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
2034
- 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
2035
- 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
2036
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
2037
- 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
2038
- 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
2039
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
2040
- 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
2041
- 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
2042
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
2043
- 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
2044
- 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
2045
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
2046
- 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
2047
- 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
2048
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
2049
- 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
2050
- 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
2051
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
2052
- 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
2053
- 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
2054
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
2055
- 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
2056
- 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
2057
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
2058
- 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
2059
- 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
2060
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
2061
- 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
2062
- };
2063
-
2064
- static const __device__ uint32_t iq3s_grid[512] = {
2065
- 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
2066
- 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
2067
- 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
2068
- 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
2069
- 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
2070
- 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
2071
- 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
2072
- 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
2073
- 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
2074
- 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
2075
- 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
2076
- 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
2077
- 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
2078
- 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
2079
- 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
2080
- 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
2081
- 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
2082
- 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
2083
- 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
2084
- 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
2085
- 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
2086
- 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
2087
- 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
2088
- 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
2089
- 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
2090
- 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
2091
- 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
2092
- 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
2093
- 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
2094
- 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
2095
- 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
2096
- 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
2097
- 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
2098
- 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
2099
- 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
2100
- 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
2101
- 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
2102
- 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
2103
- 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
2104
- 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
2105
- 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
2106
- 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
2107
- 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
2108
- 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
2109
- 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
2110
- 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
2111
- 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
2112
- 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
2113
- 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
2114
- 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
2115
- 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
2116
- 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
2117
- 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
2118
- 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
2119
- 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
2120
- 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
2121
- 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
2122
- 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
2123
- 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
2124
- 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
2125
- 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
2126
- 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
2127
- 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
2128
- 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
2129
- };
2130
-
2131
- static const __device__ uint64_t iq1s_grid[512] = {
2132
- 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
2133
- 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
2134
- 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
2135
- 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
2136
- 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
2137
- 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
2138
- 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
2139
- 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
2140
- 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
2141
- 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
2142
- 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
2143
- 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
2144
- 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
2145
- 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
2146
- 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
2147
- 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
2148
- 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
2149
- 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
2150
- 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
2151
- 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
2152
- 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
2153
- 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
2154
- 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
2155
- 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
2156
- 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
2157
- 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
2158
- 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
2159
- 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
2160
- 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
2161
- 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
2162
- 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
2163
- 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
2164
- 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
2165
- 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
2166
- 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
2167
- 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
2168
- 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
2169
- 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
2170
- 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
2171
- 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
2172
- 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
2173
- 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
2174
- 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
2175
- 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
2176
- 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
2177
- 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
2178
- 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
2179
- 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
2180
- 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
2181
- 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
2182
- 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
2183
- 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
2184
- 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
2185
- 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
2186
- 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
2187
- 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
2188
- 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
2189
- 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
2190
- 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
2191
- 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
2192
- 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
2193
- 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
2194
- 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
2195
- 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
2196
- 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
2197
- 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
2198
- 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
2199
- 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
2200
- 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
2201
- 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
2202
- 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
2203
- 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
2204
- 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
2205
- 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
2206
- 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
2207
- 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
2208
- 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
2209
- 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
2210
- 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
2211
- 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
2212
- 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
2213
- 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
2214
- 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
2215
- 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
2216
- 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
2217
- 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
2218
- 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
2219
- 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
2220
- 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
2221
- 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
2222
- 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
2223
- 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
2224
- 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
2225
- 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
2226
- 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
2227
- 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
2228
- 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
2229
- 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
2230
- 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
2231
- 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
2232
- 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
2233
- 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
2234
- 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
2235
- 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
2236
- 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
2237
- 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
2238
- 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
2239
- 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
2240
- 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
2241
- 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
2242
- 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
2243
- 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
2244
- 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
2245
- 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
2246
- 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
2247
- 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
2248
- 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
2249
- 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
2250
- 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
2251
- 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
2252
- 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
2253
- 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
2254
- 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
2255
- 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
2256
- 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
2257
- 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
2258
- 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
2259
- 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
2260
- };
2261
-
2262
- static const __device__ uint8_t ksigns_iq2xs[128] = {
2263
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
2264
- 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
2265
- 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
2266
- 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
2267
- 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
2268
- 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
2269
- 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
2270
- 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
2271
- };
2272
-
2273
- //#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2274
- static const __device__ uint64_t ksigns64[128] = {
2275
- 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
2276
- 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
2277
- 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
2278
- 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
2279
- 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
2280
- 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
2281
- 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
2282
- 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
2283
- 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
2284
- 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
2285
- 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
2286
- 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
2287
- 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
2288
- 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
2289
- 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
2290
- 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
2291
- 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
2292
- 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
2293
- 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
2294
- 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
2295
- 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
2296
- 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
2297
- 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
2298
- 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
2299
- 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
2300
- 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
2301
- 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
2302
- 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
2303
- 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
2304
- 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
2305
- 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
2306
- 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
2307
- };
2308
- //#endif
2309
-
2310
- static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2311
-
2312
1356
  inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
2313
1357
  switch (type) {
2314
1358
  case GGML_TYPE_Q4_0:
@@ -2459,11 +1503,15 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
2459
1503
  const int il = tid/8; // 0...3
2460
1504
  const int ib = tid%8; // 0...7
2461
1505
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
2462
- const int i8 = 4*ib+il;
2463
- uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
2464
- const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
2465
- const float d = (float)x[i].d * (2*(h & 7) + 1);
2466
- for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
1506
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
1507
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
1508
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
1509
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
1510
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
1511
+ grid32[0] &= 0x0f0f0f0f;
1512
+ for (int j = 0; j < 8; ++j) {
1513
+ y[j] = d * (q[j] + delta);
1514
+ }
2467
1515
  #else
2468
1516
  assert(false);
2469
1517
  #endif
@@ -4303,7 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
4303
3351
  #pragma unroll
4304
3352
  for (int i = 0; i < QR2_K; ++ i) {
4305
3353
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
4306
- d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
3354
+ d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
4307
3355
  }
4308
3356
 
4309
3357
  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -4425,7 +3473,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
4425
3473
  #pragma unroll
4426
3474
  for (int i = 0; i < QR3_K; ++i) {
4427
3475
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
4428
- d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
3476
+ d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
4429
3477
  }
4430
3478
 
4431
3479
  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -4594,7 +3642,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
4594
3642
 
4595
3643
  for (int i = 0; i < QR4_K; ++i) {
4596
3644
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
4597
- d8[i] = __low2half(bq8i->ds);
3645
+ d8[i] = __low2float(bq8i->ds);
4598
3646
 
4599
3647
  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
4600
3648
  u[2*i+0] = q8[0];
@@ -4959,7 +4007,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
4959
4007
  #pragma unroll
4960
4008
  for (int i = 0; i < QR6_K; ++i) {
4961
4009
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
4962
- d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
4010
+ d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
4963
4011
  }
4964
4012
 
4965
4013
  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -5275,44 +4323,36 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
5275
4323
  #endif
5276
4324
  }
5277
4325
 
5278
-
5279
4326
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
5280
4327
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
5281
4328
  #if QK_K == 256
5282
4329
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
5283
4330
 
5284
4331
  const int ib32 = iqs;
5285
- int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
5286
- const uint8_t h1 = bq1->scales[2*ib32+0];
5287
- const uint8_t h2 = bq1->scales[2*ib32+1];
4332
+ int sumi = 0;
5288
4333
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
5289
4334
  const int * q8 = (const int *)bq8_1[ib32].qs;
5290
- const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
5291
- const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
5292
- const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
5293
- const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
5294
- for (int j = 0; j < 2; ++j) {
5295
- sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
5296
- sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
5297
- sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
5298
- sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
4335
+ for (int l = 0; l < 4; ++l) {
4336
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
4337
+ int grid0 = grid[0] & 0x0f0f0f0f;
4338
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
4339
+ sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
5299
4340
  }
5300
4341
  #else
5301
- const int8_t * q8 = bq8_1[ib32].qs;
5302
- const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
5303
- const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
5304
- const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
5305
- const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
5306
- for (int j = 0; j < 8; ++j) {
5307
- sumi1 += q8[j+ 0] * grid1[j];
5308
- sumi2 += q8[j+ 8] * grid2[j];
5309
- sumi3 += q8[j+16] * grid3[j];
5310
- sumi4 += q8[j+24] * grid4[j];
4342
+ const int8_t * q8 = bq8_1[ib32].qs;
4343
+ for (int l = 0; l < 4; ++l) {
4344
+ const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
4345
+ for (int j = 0; j < 4; ++j) {
4346
+ sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
4347
+ }
4348
+ q8 += 8;
5311
4349
  }
5312
4350
  #endif
5313
- const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
5314
- return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
5315
- sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
4351
+ const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
4352
+ const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
4353
+ const float d = d1q * __low2float (bq8_1[ib32].ds);
4354
+ const float m = d1q * __high2float(bq8_1[ib32].ds);
4355
+ return d * sumi + m * delta;
5316
4356
  #else
5317
4357
  assert(false);
5318
4358
  return 0.f;
@@ -5504,7 +4544,7 @@ static __device__ __forceinline__ void mul_mat_q(
5504
4544
  *dsi_dst = *dsi_src;
5505
4545
  } else {
5506
4546
  float * dfi_dst = (float *) dsi_dst;
5507
- *dfi_dst = __low2half(*dsi_src);
4547
+ *dfi_dst = __low2float(*dsi_src);
5508
4548
  }
5509
4549
  }
5510
4550
 
@@ -11604,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
11604
10644
  #define UNUSED GGML_UNUSED
11605
10645
 
11606
10646
  struct ggml_backend_cuda_context {
10647
+ explicit ggml_backend_cuda_context(int device) :
10648
+ device(device),
10649
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10650
+ }
10651
+
10652
+ ~ggml_backend_cuda_context() {
10653
+ if (copy_event != nullptr) {
10654
+ CUDA_CHECK(cudaEventDestroy(copy_event));
10655
+ }
10656
+ }
10657
+
11607
10658
  int device;
11608
10659
  std::string name;
10660
+ cudaEvent_t copy_event = nullptr;
11609
10661
  };
11610
10662
 
11611
10663
  // cuda buffer
@@ -11695,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
11695
10747
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11696
10748
 
11697
10749
  ggml_cuda_set_device(ctx->device);
11698
- CUDA_CHECK(cudaDeviceSynchronize());
11699
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
11700
- CUDA_CHECK(cudaDeviceSynchronize());
10750
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
10751
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11701
10752
  }
11702
10753
 
11703
10754
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -11706,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
11706
10757
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11707
10758
 
11708
10759
  ggml_cuda_set_device(ctx->device);
11709
- CUDA_CHECK(cudaDeviceSynchronize());
11710
- CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
11711
- CUDA_CHECK(cudaDeviceSynchronize());
10760
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
10761
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11712
10762
  }
11713
10763
 
11714
10764
  GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
11715
10765
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
11716
10766
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
11717
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11718
-
11719
- ggml_cuda_set_device(src_ctx->device);
11720
- CUDA_CHECK(cudaDeviceSynchronize());
11721
- ggml_cuda_set_device(dst_ctx->device);
11722
- CUDA_CHECK(cudaDeviceSynchronize());
11723
- CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
11724
- CUDA_CHECK(cudaDeviceSynchronize());
11725
-
10767
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
10768
+ if (src_ctx->device == dst_ctx->device) {
10769
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
10770
+ } else {
10771
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
10772
+ }
10773
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11726
10774
  return true;
11727
10775
  }
11728
10776
  return false;
10777
+
10778
+ UNUSED(buffer);
11729
10779
  }
11730
10780
 
11731
10781
  GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -11970,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
11970
11020
  }
11971
11021
 
11972
11022
  const char * buf_host = (const char *)data + offset_split;
11973
- CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
11023
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
11024
+ }
11025
+
11026
+ for (int id = 0; id < g_device_count; ++id) {
11027
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11974
11028
  }
11975
11029
  }
11976
11030
 
@@ -12004,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
12004
11058
  }
12005
11059
 
12006
11060
  char * buf_host = (char *)data + offset_split;
12007
- CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
11061
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
11062
+ }
11063
+
11064
+ for (int id = 0; id < g_device_count; ++id) {
11065
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
12008
11066
  }
12009
11067
  }
12010
11068
 
@@ -12183,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
12183
11241
  return &ggml_backend_cuda_buffer_type_host;
12184
11242
  }
12185
11243
 
11244
+ //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
11245
+ // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
11246
+ //}
11247
+
12186
11248
  // backend
12187
11249
 
12188
11250
  GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -12206,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
12206
11268
 
12207
11269
  GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
12208
11270
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11271
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
12209
11272
 
12210
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11273
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
12211
11274
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
12212
11275
 
12213
11276
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -12215,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
12215
11278
 
12216
11279
  GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
12217
11280
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11281
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
12218
11282
 
12219
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11283
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
12220
11284
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
12221
11285
 
12222
11286
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
12223
11287
  }
12224
11288
 
12225
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
12226
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11289
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
11290
+ GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
12227
11291
 
12228
- if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
12229
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
12230
- return true;
11292
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
11293
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
11294
+
11295
+ if (!ggml_backend_buffer_is_cuda(src->buffer)) {
11296
+ return false;
12231
11297
  }
12232
11298
 
12233
- return false;
11299
+ if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
11300
+ return false;
11301
+ }
11302
+
11303
+ // device -> device
11304
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
11305
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
11306
+
11307
+ if (backend_src != backend_dst) {
11308
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
11309
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
11310
+
11311
+ GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
11312
+ GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
11313
+
11314
+ if (!cuda_ctx_src->copy_event) {
11315
+ ggml_cuda_set_device(cuda_ctx_src->device);
11316
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
11317
+ }
11318
+
11319
+ // copy on src stream
11320
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
11321
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11322
+ } else {
11323
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
11324
+ }
11325
+
11326
+ // record event on src stream
11327
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
11328
+
11329
+ // wait on dst stream for the copy to complete
11330
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
11331
+ } else {
11332
+ // src and dst are on the same backend
11333
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11334
+ }
11335
+ return true;
12234
11336
  }
12235
11337
 
12236
11338
  GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -12407,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
12407
11509
  UNUSED(backend);
12408
11510
  }
12409
11511
 
11512
+ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
11513
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11514
+
11515
+ ggml_cuda_set_device(cuda_ctx->device);
11516
+
11517
+ cudaEvent_t event;
11518
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
11519
+
11520
+ return new ggml_backend_event {
11521
+ /* .backend = */ backend,
11522
+ /* .context = */ event,
11523
+ };
11524
+ }
11525
+
11526
+ static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
11527
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
11528
+
11529
+ delete event;
11530
+ }
11531
+
11532
+ static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
11533
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
11534
+
11535
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
11536
+ }
11537
+
11538
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
11539
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11540
+
11541
+ if (ggml_backend_is_cuda(event->backend)) {
11542
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
11543
+ } else {
11544
+ // untested
11545
+ auto wait_fn = [](void * user_data) {
11546
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
11547
+ ggml_backend_event_synchronize(event);
11548
+ };
11549
+
11550
+ CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
11551
+ }
11552
+ }
11553
+
11554
+ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
11555
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
11556
+ }
11557
+
12410
11558
  static ggml_backend_i ggml_backend_cuda_interface = {
12411
11559
  /* .get_name = */ ggml_backend_cuda_name,
12412
11560
  /* .free = */ ggml_backend_cuda_free,
@@ -12420,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
12420
11568
  /* .graph_plan_compute = */ NULL,
12421
11569
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
12422
11570
  /* .supports_op = */ ggml_backend_cuda_supports_op,
11571
+ /* .event_new = */ ggml_backend_cuda_event_new,
11572
+ /* .event_free = */ ggml_backend_cuda_event_free,
11573
+ /* .event_record = */ ggml_backend_cuda_event_record,
11574
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
11575
+ /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
12423
11576
  };
12424
11577
 
12425
11578
  static ggml_guid_t ggml_backend_cuda_guid() {
@@ -12438,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
12438
11591
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
12439
11592
  ggml_cuda_set_main_device(device);
12440
11593
 
12441
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
12442
- /* .device = */ device,
12443
- /* .name = */ GGML_CUDA_NAME + std::to_string(device),
12444
- };
11594
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
11595
+ if (ctx == nullptr) {
11596
+ fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
11597
+ return nullptr;
11598
+ }
12445
11599
 
12446
11600
  ggml_backend_t cuda_backend = new ggml_backend {
12447
11601
  /* .guid = */ ggml_backend_cuda_guid(),