llama_cpp 0.14.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,15 @@
2
2
  #include "ggml.h"
3
3
  #include "ggml-backend-impl.h"
4
4
 
5
+ #if defined(GGML_USE_HIPBLAS)
6
+ #define GGML_COMMON_DECL_HIP
7
+ #define GGML_COMMON_IMPL_HIP
8
+ #else
9
+ #define GGML_COMMON_DECL_CUDA
10
+ #define GGML_COMMON_IMPL_CUDA
11
+ #endif
12
+ #include "ggml-common.h"
13
+
5
14
  #include <algorithm>
6
15
  #include <assert.h>
7
16
  #include <atomic>
@@ -63,6 +72,7 @@
63
72
  #define cudaEventCreateWithFlags hipEventCreateWithFlags
64
73
  #define cudaEventDisableTiming hipEventDisableTiming
65
74
  #define cudaEventRecord hipEventRecord
75
+ #define cudaEventSynchronize hipEventSynchronize
66
76
  #define cudaEvent_t hipEvent_t
67
77
  #define cudaEventDestroy hipEventDestroy
68
78
  #define cudaFree hipFree
@@ -72,6 +82,7 @@
72
82
  #define cudaGetDeviceProperties hipGetDeviceProperties
73
83
  #define cudaGetErrorString hipGetErrorString
74
84
  #define cudaGetLastError hipGetLastError
85
+ #define cudaLaunchHostFunc hipLaunchHostFunc
75
86
  #ifdef GGML_HIP_UMA
76
87
  #define cudaMalloc hipMallocManaged
77
88
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -95,6 +106,7 @@
95
106
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
96
107
  #define cudaStreamFireAndForget hipStreamFireAndForget
97
108
  #define cudaStreamNonBlocking hipStreamNonBlocking
109
+ #define cudaStreamPerThread hipStreamPerThread
98
110
  #define cudaStreamSynchronize hipStreamSynchronize
99
111
  #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
100
112
  #define cudaStream_t hipStream_t
@@ -356,66 +368,6 @@ typedef void (*ggml_cuda_op_flatten_t)(
356
368
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
357
369
  const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
358
370
 
359
- // QK = number of values after dequantization
360
- // QR = QK / number of values before dequantization
361
- // QI = number of 32 bit integers before dequantization
362
-
363
- #define QK4_0 32
364
- #define QR4_0 2
365
- #define QI4_0 (QK4_0 / (4 * QR4_0))
366
- typedef struct {
367
- half d; // delta
368
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
369
- } block_q4_0;
370
- static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
371
-
372
- #define QK4_1 32
373
- #define QR4_1 2
374
- #define QI4_1 (QK4_1 / (4 * QR4_1))
375
- typedef struct {
376
- half2 dm; // dm.x = delta, dm.y = min
377
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
378
- } block_q4_1;
379
- static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
380
-
381
- #define QK5_0 32
382
- #define QR5_0 2
383
- #define QI5_0 (QK5_0 / (4 * QR5_0))
384
- typedef struct {
385
- half d; // delta
386
- uint8_t qh[4]; // 5-th bit of quants
387
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
388
- } block_q5_0;
389
- static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
390
-
391
- #define QK5_1 32
392
- #define QR5_1 2
393
- #define QI5_1 (QK5_1 / (4 * QR5_1))
394
- typedef struct {
395
- half2 dm; // dm.x = delta, dm.y = min
396
- uint8_t qh[4]; // 5-th bit of quants
397
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
398
- } block_q5_1;
399
- static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
400
-
401
- #define QK8_0 32
402
- #define QR8_0 1
403
- #define QI8_0 (QK8_0 / (4 * QR8_0))
404
- typedef struct {
405
- half d; // delta
406
- int8_t qs[QK8_0]; // quants
407
- } block_q8_0;
408
- static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
409
-
410
- #define QK8_1 32
411
- #define QR8_1 1
412
- #define QI8_1 (QK8_1 / (4 * QR8_1))
413
- typedef struct {
414
- half2 ds; // ds.x = delta, ds.y = sum
415
- int8_t qs[QK8_0]; // quants
416
- } block_q8_1;
417
- static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
418
-
419
371
  typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
420
372
  typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
421
373
  typedef void (*load_tiles_cuda_t)(
@@ -425,174 +377,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
425
377
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
426
378
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
427
379
 
428
- //================================= k-quants
429
-
430
- #ifdef GGML_QKK_64
431
- #define QK_K 64
432
- #define K_SCALE_SIZE 4
433
- #else
434
- #define QK_K 256
435
- #define K_SCALE_SIZE 12
436
- #endif
437
-
438
- #define QR2_K 4
439
- #define QI2_K (QK_K / (4*QR2_K))
440
- typedef struct {
441
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
442
- uint8_t qs[QK_K/4]; // quants
443
- half2 dm; // super-block scale for quantized scales/mins
444
- } block_q2_K;
445
- static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
446
-
447
- #define QR3_K 4
448
- #define QI3_K (QK_K / (4*QR3_K))
449
- typedef struct {
450
- uint8_t hmask[QK_K/8]; // quants - high bit
451
- uint8_t qs[QK_K/4]; // quants - low 2 bits
452
- #ifdef GGML_QKK_64
453
- uint8_t scales[2]; // scales, quantized with 8 bits
454
- #else
455
- uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
456
- #endif
457
- half d; // super-block scale
458
- } block_q3_K;
459
- //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
460
-
461
- #define QR4_K 2
462
- #define QI4_K (QK_K / (4*QR4_K))
463
- #ifdef GGML_QKK_64
464
- typedef struct {
465
- half dm[2]; // super-block scales/mins
466
- uint8_t scales[2]; // 4-bit block scales/mins
467
- uint8_t qs[QK_K/2]; // 4--bit quants
468
- } block_q4_K;
469
- static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
470
- #else
471
- typedef struct {
472
- half2 dm; // super-block scale for quantized scales/mins
473
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
474
- uint8_t qs[QK_K/2]; // 4--bit quants
475
- } block_q4_K;
476
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
477
- #endif
478
-
479
- #define QR5_K 2
480
- #define QI5_K (QK_K / (4*QR5_K))
481
- #ifdef GGML_QKK_64
482
- typedef struct {
483
- half d; // super-block scale
484
- int8_t scales[QK_K/16]; // block scales
485
- uint8_t qh[QK_K/8]; // quants, high bit
486
- uint8_t qs[QK_K/2]; // quants, low 4 bits
487
- } block_q5_K;
488
- static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
489
- #else
490
- typedef struct {
491
- half2 dm; // super-block scale for quantized scales/mins
492
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
493
- uint8_t qh[QK_K/8]; // quants, high bit
494
- uint8_t qs[QK_K/2]; // quants, low 4 bits
495
- } block_q5_K;
496
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
497
- #endif
498
-
499
- #define QR6_K 2
500
- #define QI6_K (QK_K / (4*QR6_K))
501
- typedef struct {
502
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
503
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
504
- int8_t scales[QK_K/16]; // scales
505
- half d; // delta
506
- } block_q6_K;
507
- static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
508
-
509
- #define QR2_XXS 8
510
- #define QI2_XXS (QK_K / (4*QR2_XXS))
511
- typedef struct {
512
- half d;
513
- uint16_t qs[QK_K/8];
514
- } block_iq2_xxs;
515
- static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
516
-
517
- #define QR2_XS 8
518
- #define QI2_XS (QK_K / (4*QR2_XS))
519
- typedef struct {
520
- half d;
521
- uint16_t qs[QK_K/8];
522
- uint8_t scales[QK_K/32];
523
- } block_iq2_xs;
524
- static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
525
-
526
- // 2.5625 bpw quants
527
- #define QR2_S 8
528
- #define QI2_S (QK_K / (4*QR2_S))
529
- typedef struct {
530
- half d;
531
- uint8_t qs[QK_K/4];
532
- uint8_t qh[QK_K/32];
533
- uint8_t scales[QK_K/32];
534
- } block_iq2_s;
535
- static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
536
-
537
- #define QR3_XXS 8
538
- #define QI3_XXS (QK_K / (4*QR3_XXS))
539
- typedef struct {
540
- half d;
541
- uint8_t qs[3*(QK_K/8)];
542
- } block_iq3_xxs;
543
- static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
544
-
545
- #define QR3_XS 8
546
- #define QI3_XS (QK_K / (4*QR3_XS))
547
- #if QK_K == 64
548
- #define IQ3S_N_SCALE 2
549
- #else
550
- #define IQ3S_N_SCALE QK_K/64
551
- #endif
552
- typedef struct {
553
- half d;
554
- uint8_t qs[QK_K/4];
555
- uint8_t qh[QK_K/32];
556
- uint8_t signs[QK_K/8];
557
- uint8_t scales[IQ3S_N_SCALE];
558
- } block_iq3_s;
559
- static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
560
-
561
- #define QR1_S 8
562
- #define QI1_S (QK_K / (4*QR1_S))
563
- typedef struct {
564
- half d;
565
- uint8_t qs[QK_K/8];
566
- uint8_t scales[QK_K/16];
567
- } block_iq1_s;
568
- static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
569
-
570
- #define QK4_NL 32
571
- #define QR4_NL 2
572
- #define QI4_NL (QK4_NL / (4*QR4_NL))
573
- typedef struct {
574
- half d;
575
- uint8_t qs[QK4_NL/2];
576
- } block_iq4_nl;
577
- static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
578
-
579
- #if QK_K == 64
580
- #define block_iq4_xs block_iq4_nl
581
- #define QR4_XS QR4_NL
582
- #define QI4_XS QI4_NL
583
- #else
584
- // QR4_XS = 8 is very slightly faster than QR4_XS = 4
585
- #define QR4_XS 8
586
- #define QI4_XS (QK_K / (4*QR4_XS))
587
- typedef struct {
588
- half d;
589
- uint16_t scales_h;
590
- uint8_t scales_l[QK_K/64];
591
- uint8_t qs[QK_K/2];
592
- } block_iq4_xs;
593
- static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
594
- #endif
595
-
596
380
  #define WARP_SIZE 32
597
381
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
598
382
 
@@ -1569,746 +1353,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
1569
1353
  #endif
1570
1354
  }
1571
1355
 
1572
- static const __device__ uint64_t iq2xxs_grid[256] = {
1573
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1574
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
1575
- 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
1576
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
1577
- 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
1578
- 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
1579
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
1580
- 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
1581
- 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
1582
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
1583
- 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
1584
- 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
1585
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
1586
- 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
1587
- 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
1588
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
1589
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
1590
- 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
1591
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
1592
- 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
1593
- 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
1594
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
1595
- 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
1596
- 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
1597
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
1598
- 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
1599
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
1600
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
1601
- 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
1602
- 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
1603
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
1604
- 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
1605
- 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
1606
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
1607
- 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
1608
- 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
1609
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
1610
- 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
1611
- 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
1612
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
1613
- 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
1614
- 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
1615
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
1616
- 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
1617
- 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
1618
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
1619
- 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
1620
- 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
1621
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
1622
- 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
1623
- 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
1624
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
1625
- 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
1626
- 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
1627
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
1628
- 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
1629
- 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
1630
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
1631
- 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
1632
- 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
1633
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
1634
- 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
1635
- 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
1636
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
1637
- };
1638
-
1639
- static const __device__ uint64_t iq2xs_grid[512] = {
1640
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1641
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1642
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1643
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1644
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1645
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
1646
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
1647
- 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
1648
- 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
1649
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
1650
- 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
1651
- 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
1652
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
1653
- 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
1654
- 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
1655
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
1656
- 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
1657
- 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
1658
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
1659
- 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
1660
- 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
1661
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
1662
- 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
1663
- 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
1664
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
1665
- 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
1666
- 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
1667
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
1668
- 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
1669
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
1670
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
1671
- 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
1672
- 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
1673
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
1674
- 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
1675
- 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
1676
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
1677
- 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
1678
- 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
1679
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
1680
- 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
1681
- 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
1682
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
1683
- 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
1684
- 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
1685
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
1686
- 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
1687
- 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
1688
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
1689
- 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
1690
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
1691
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
1692
- 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
1693
- 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
1694
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
1695
- 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
1696
- 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
1697
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
1698
- 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
1699
- 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
1700
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
1701
- 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
1702
- 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
1703
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
1704
- 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
1705
- 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
1706
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
1707
- 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
1708
- 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
1709
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
1710
- 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
1711
- 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
1712
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
1713
- 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
1714
- 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
1715
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
1716
- 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
1717
- 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
1718
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
1719
- 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
1720
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
1721
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
1722
- 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
1723
- 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
1724
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
1725
- 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
1726
- 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
1727
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
1728
- 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
1729
- 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
1730
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
1731
- 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
1732
- 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
1733
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
1734
- 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
1735
- 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
1736
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
1737
- 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
1738
- 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
1739
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
1740
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
1741
- 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
1742
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
1743
- 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
1744
- 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
1745
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
1746
- 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
1747
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
1748
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
1749
- 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
1750
- 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
1751
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
1752
- 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
1753
- 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
1754
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
1755
- 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
1756
- 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
1757
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
1758
- 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
1759
- 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
1760
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
1761
- 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
1762
- 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
1763
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
1764
- 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
1765
- 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
1766
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
1767
- 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1768
- };
1769
-
1770
- static const __device__ uint64_t iq2s_grid[1024] = {
1771
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
1772
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
1773
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
1774
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
1775
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
1776
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
1777
- 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
1778
- 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
1779
- 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
1780
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
1781
- 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
1782
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
1783
- 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
1784
- 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
1785
- 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
1786
- 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
1787
- 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
1788
- 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
1789
- 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
1790
- 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
1791
- 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
1792
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
1793
- 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
1794
- 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
1795
- 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
1796
- 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
1797
- 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
1798
- 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
1799
- 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
1800
- 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
1801
- 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
1802
- 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
1803
- 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
1804
- 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
1805
- 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
1806
- 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
1807
- 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
1808
- 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
1809
- 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
1810
- 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
1811
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
1812
- 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
1813
- 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
1814
- 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
1815
- 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
1816
- 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
1817
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
1818
- 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
1819
- 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
1820
- 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
1821
- 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
1822
- 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
1823
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
1824
- 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
1825
- 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
1826
- 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
1827
- 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
1828
- 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
1829
- 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
1830
- 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
1831
- 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
1832
- 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
1833
- 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
1834
- 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
1835
- 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
1836
- 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
1837
- 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
1838
- 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
1839
- 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
1840
- 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
1841
- 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
1842
- 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
1843
- 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
1844
- 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
1845
- 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
1846
- 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
1847
- 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
1848
- 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
1849
- 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
1850
- 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
1851
- 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
1852
- 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
1853
- 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
1854
- 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
1855
- 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
1856
- 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
1857
- 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
1858
- 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
1859
- 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
1860
- 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
1861
- 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
1862
- 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
1863
- 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
1864
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
1865
- 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
1866
- 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
1867
- 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
1868
- 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
1869
- 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
1870
- 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
1871
- 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
1872
- 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
1873
- 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
1874
- 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
1875
- 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
1876
- 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
1877
- 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
1878
- 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
1879
- 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
1880
- 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
1881
- 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
1882
- 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
1883
- 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
1884
- 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
1885
- 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
1886
- 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
1887
- 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
1888
- 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
1889
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
1890
- 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
1891
- 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
1892
- 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
1893
- 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
1894
- 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
1895
- 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
1896
- 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
1897
- 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
1898
- 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
1899
- 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
1900
- 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
1901
- 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
1902
- 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
1903
- 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
1904
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
1905
- 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
1906
- 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
1907
- 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
1908
- 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
1909
- 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
1910
- 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
1911
- 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
1912
- 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
1913
- 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
1914
- 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
1915
- 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
1916
- 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
1917
- 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
1918
- 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
1919
- 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
1920
- 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
1921
- 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
1922
- 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
1923
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
1924
- 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
1925
- 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
1926
- 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
1927
- 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
1928
- 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
1929
- 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
1930
- 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
1931
- 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
1932
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
1933
- 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
1934
- 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
1935
- 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
1936
- 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
1937
- 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
1938
- 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
1939
- 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
1940
- 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
1941
- 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
1942
- 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
1943
- 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
1944
- 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
1945
- 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
1946
- 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
1947
- 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
1948
- 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
1949
- 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
1950
- 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
1951
- 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
1952
- 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
1953
- 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
1954
- 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
1955
- 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
1956
- 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
1957
- 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
1958
- 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
1959
- 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
1960
- 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
1961
- 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
1962
- 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
1963
- 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
1964
- 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
1965
- 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
1966
- 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
1967
- 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
1968
- 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
1969
- 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
1970
- 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
1971
- 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
1972
- 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
1973
- 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
1974
- 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
1975
- 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
1976
- 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
1977
- 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
1978
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
1979
- 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
1980
- 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
1981
- 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
1982
- 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
1983
- 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
1984
- 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
1985
- 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
1986
- 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
1987
- 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
1988
- 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
1989
- 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
1990
- 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
1991
- 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
1992
- 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
1993
- 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
1994
- 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
1995
- 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
1996
- 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
1997
- 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
1998
- 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
1999
- 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
2000
- 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
2001
- 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
2002
- 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
2003
- 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
2004
- 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
2005
- 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
2006
- 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
2007
- 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
2008
- 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
2009
- 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
2010
- 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
2011
- 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
2012
- 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
2013
- 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
2014
- 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
2015
- 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
2016
- 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
2017
- 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
2018
- 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
2019
- 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
2020
- 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
2021
- 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
2022
- 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
2023
- 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
2024
- 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
2025
- 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
2026
- 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
2027
- };
2028
-
2029
- static const __device__ uint32_t iq3xxs_grid[256] = {
2030
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
2031
- 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
2032
- 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
2033
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
2034
- 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
2035
- 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
2036
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
2037
- 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
2038
- 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
2039
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
2040
- 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
2041
- 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
2042
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
2043
- 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
2044
- 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
2045
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
2046
- 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
2047
- 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
2048
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
2049
- 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
2050
- 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
2051
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
2052
- 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
2053
- 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
2054
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
2055
- 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
2056
- 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
2057
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
2058
- 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
2059
- 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
2060
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
2061
- 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
2062
- };
2063
-
2064
- static const __device__ uint32_t iq3s_grid[512] = {
2065
- 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
2066
- 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
2067
- 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
2068
- 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
2069
- 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
2070
- 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
2071
- 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
2072
- 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
2073
- 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
2074
- 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
2075
- 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
2076
- 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
2077
- 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
2078
- 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
2079
- 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
2080
- 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
2081
- 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
2082
- 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
2083
- 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
2084
- 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
2085
- 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
2086
- 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
2087
- 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
2088
- 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
2089
- 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
2090
- 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
2091
- 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
2092
- 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
2093
- 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
2094
- 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
2095
- 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
2096
- 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
2097
- 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
2098
- 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
2099
- 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
2100
- 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
2101
- 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
2102
- 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
2103
- 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
2104
- 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
2105
- 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
2106
- 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
2107
- 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
2108
- 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
2109
- 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
2110
- 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
2111
- 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
2112
- 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
2113
- 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
2114
- 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
2115
- 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
2116
- 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
2117
- 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
2118
- 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
2119
- 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
2120
- 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
2121
- 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
2122
- 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
2123
- 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
2124
- 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
2125
- 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
2126
- 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
2127
- 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
2128
- 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
2129
- };
2130
-
2131
- static const __device__ uint64_t iq1s_grid[512] = {
2132
- 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
2133
- 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
2134
- 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
2135
- 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
2136
- 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
2137
- 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
2138
- 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
2139
- 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
2140
- 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
2141
- 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
2142
- 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
2143
- 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
2144
- 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
2145
- 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
2146
- 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
2147
- 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
2148
- 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
2149
- 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
2150
- 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
2151
- 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
2152
- 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
2153
- 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
2154
- 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
2155
- 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
2156
- 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
2157
- 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
2158
- 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
2159
- 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
2160
- 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
2161
- 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
2162
- 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
2163
- 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
2164
- 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
2165
- 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
2166
- 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
2167
- 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
2168
- 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
2169
- 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
2170
- 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
2171
- 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
2172
- 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
2173
- 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
2174
- 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
2175
- 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
2176
- 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
2177
- 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
2178
- 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
2179
- 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
2180
- 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
2181
- 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
2182
- 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
2183
- 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
2184
- 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
2185
- 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
2186
- 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
2187
- 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
2188
- 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
2189
- 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
2190
- 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
2191
- 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
2192
- 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
2193
- 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
2194
- 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
2195
- 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
2196
- 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
2197
- 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
2198
- 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
2199
- 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
2200
- 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
2201
- 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
2202
- 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
2203
- 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
2204
- 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
2205
- 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
2206
- 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
2207
- 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
2208
- 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
2209
- 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
2210
- 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
2211
- 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
2212
- 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
2213
- 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
2214
- 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
2215
- 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
2216
- 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
2217
- 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
2218
- 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
2219
- 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
2220
- 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
2221
- 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
2222
- 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
2223
- 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
2224
- 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
2225
- 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
2226
- 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
2227
- 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
2228
- 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
2229
- 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
2230
- 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
2231
- 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
2232
- 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
2233
- 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
2234
- 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
2235
- 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
2236
- 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
2237
- 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
2238
- 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
2239
- 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
2240
- 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
2241
- 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
2242
- 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
2243
- 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
2244
- 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
2245
- 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
2246
- 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
2247
- 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
2248
- 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
2249
- 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
2250
- 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
2251
- 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
2252
- 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
2253
- 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
2254
- 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
2255
- 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
2256
- 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
2257
- 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
2258
- 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
2259
- 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
2260
- };
2261
-
2262
- static const __device__ uint8_t ksigns_iq2xs[128] = {
2263
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
2264
- 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
2265
- 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
2266
- 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
2267
- 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
2268
- 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
2269
- 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
2270
- 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
2271
- };
2272
-
2273
- //#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2274
- static const __device__ uint64_t ksigns64[128] = {
2275
- 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
2276
- 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
2277
- 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
2278
- 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
2279
- 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
2280
- 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
2281
- 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
2282
- 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
2283
- 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
2284
- 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
2285
- 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
2286
- 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
2287
- 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
2288
- 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
2289
- 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
2290
- 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
2291
- 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
2292
- 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
2293
- 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
2294
- 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
2295
- 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
2296
- 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
2297
- 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
2298
- 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
2299
- 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
2300
- 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
2301
- 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
2302
- 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
2303
- 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
2304
- 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
2305
- 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
2306
- 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
2307
- };
2308
- //#endif
2309
-
2310
- static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2311
-
2312
1356
  inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
2313
1357
  switch (type) {
2314
1358
  case GGML_TYPE_Q4_0:
@@ -2459,11 +1503,15 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
2459
1503
  const int il = tid/8; // 0...3
2460
1504
  const int ib = tid%8; // 0...7
2461
1505
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
2462
- const int i8 = 4*ib+il;
2463
- uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
2464
- const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
2465
- const float d = (float)x[i].d * (2*(h & 7) + 1);
2466
- for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
1506
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
1507
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
1508
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
1509
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
1510
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
1511
+ grid32[0] &= 0x0f0f0f0f;
1512
+ for (int j = 0; j < 8; ++j) {
1513
+ y[j] = d * (q[j] + delta);
1514
+ }
2467
1515
  #else
2468
1516
  assert(false);
2469
1517
  #endif
@@ -4303,7 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
4303
3351
  #pragma unroll
4304
3352
  for (int i = 0; i < QR2_K; ++ i) {
4305
3353
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
4306
- d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
3354
+ d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
4307
3355
  }
4308
3356
 
4309
3357
  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -4425,7 +3473,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
4425
3473
  #pragma unroll
4426
3474
  for (int i = 0; i < QR3_K; ++i) {
4427
3475
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
4428
- d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
3476
+ d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
4429
3477
  }
4430
3478
 
4431
3479
  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -4594,7 +3642,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
4594
3642
 
4595
3643
  for (int i = 0; i < QR4_K; ++i) {
4596
3644
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
4597
- d8[i] = __low2half(bq8i->ds);
3645
+ d8[i] = __low2float(bq8i->ds);
4598
3646
 
4599
3647
  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
4600
3648
  u[2*i+0] = q8[0];
@@ -4959,7 +4007,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
4959
4007
  #pragma unroll
4960
4008
  for (int i = 0; i < QR6_K; ++i) {
4961
4009
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
4962
- d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
4010
+ d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
4963
4011
  }
4964
4012
 
4965
4013
  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -5275,44 +4323,36 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
5275
4323
  #endif
5276
4324
  }
5277
4325
 
5278
-
5279
4326
  static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
5280
4327
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
5281
4328
  #if QK_K == 256
5282
4329
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
5283
4330
 
5284
4331
  const int ib32 = iqs;
5285
- int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
5286
- const uint8_t h1 = bq1->scales[2*ib32+0];
5287
- const uint8_t h2 = bq1->scales[2*ib32+1];
4332
+ int sumi = 0;
5288
4333
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
5289
4334
  const int * q8 = (const int *)bq8_1[ib32].qs;
5290
- const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
5291
- const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
5292
- const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
5293
- const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
5294
- for (int j = 0; j < 2; ++j) {
5295
- sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
5296
- sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
5297
- sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
5298
- sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
4335
+ for (int l = 0; l < 4; ++l) {
4336
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
4337
+ int grid0 = grid[0] & 0x0f0f0f0f;
4338
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
4339
+ sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
5299
4340
  }
5300
4341
  #else
5301
- const int8_t * q8 = bq8_1[ib32].qs;
5302
- const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
5303
- const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
5304
- const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
5305
- const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
5306
- for (int j = 0; j < 8; ++j) {
5307
- sumi1 += q8[j+ 0] * grid1[j];
5308
- sumi2 += q8[j+ 8] * grid2[j];
5309
- sumi3 += q8[j+16] * grid3[j];
5310
- sumi4 += q8[j+24] * grid4[j];
4342
+ const int8_t * q8 = bq8_1[ib32].qs;
4343
+ for (int l = 0; l < 4; ++l) {
4344
+ const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
4345
+ for (int j = 0; j < 4; ++j) {
4346
+ sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
4347
+ }
4348
+ q8 += 8;
5311
4349
  }
5312
4350
  #endif
5313
- const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
5314
- return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
5315
- sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
4351
+ const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
4352
+ const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
4353
+ const float d = d1q * __low2float (bq8_1[ib32].ds);
4354
+ const float m = d1q * __high2float(bq8_1[ib32].ds);
4355
+ return d * sumi + m * delta;
5316
4356
  #else
5317
4357
  assert(false);
5318
4358
  return 0.f;
@@ -5504,7 +4544,7 @@ static __device__ __forceinline__ void mul_mat_q(
5504
4544
  *dsi_dst = *dsi_src;
5505
4545
  } else {
5506
4546
  float * dfi_dst = (float *) dsi_dst;
5507
- *dfi_dst = __low2half(*dsi_src);
4547
+ *dfi_dst = __low2float(*dsi_src);
5508
4548
  }
5509
4549
  }
5510
4550
 
@@ -11604,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
11604
10644
  #define UNUSED GGML_UNUSED
11605
10645
 
11606
10646
  struct ggml_backend_cuda_context {
10647
+ explicit ggml_backend_cuda_context(int device) :
10648
+ device(device),
10649
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10650
+ }
10651
+
10652
+ ~ggml_backend_cuda_context() {
10653
+ if (copy_event != nullptr) {
10654
+ CUDA_CHECK(cudaEventDestroy(copy_event));
10655
+ }
10656
+ }
10657
+
11607
10658
  int device;
11608
10659
  std::string name;
10660
+ cudaEvent_t copy_event = nullptr;
11609
10661
  };
11610
10662
 
11611
10663
  // cuda buffer
@@ -11695,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
11695
10747
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11696
10748
 
11697
10749
  ggml_cuda_set_device(ctx->device);
11698
- CUDA_CHECK(cudaDeviceSynchronize());
11699
- CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
11700
- CUDA_CHECK(cudaDeviceSynchronize());
10750
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
10751
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11701
10752
  }
11702
10753
 
11703
10754
  GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -11706,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
11706
10757
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11707
10758
 
11708
10759
  ggml_cuda_set_device(ctx->device);
11709
- CUDA_CHECK(cudaDeviceSynchronize());
11710
- CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
11711
- CUDA_CHECK(cudaDeviceSynchronize());
10760
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
10761
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11712
10762
  }
11713
10763
 
11714
10764
  GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
11715
10765
  if (ggml_backend_buffer_is_cuda(src->buffer)) {
11716
10766
  ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
11717
- ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
11718
-
11719
- ggml_cuda_set_device(src_ctx->device);
11720
- CUDA_CHECK(cudaDeviceSynchronize());
11721
- ggml_cuda_set_device(dst_ctx->device);
11722
- CUDA_CHECK(cudaDeviceSynchronize());
11723
- CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
11724
- CUDA_CHECK(cudaDeviceSynchronize());
11725
-
10767
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
10768
+ if (src_ctx->device == dst_ctx->device) {
10769
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
10770
+ } else {
10771
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
10772
+ }
10773
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11726
10774
  return true;
11727
10775
  }
11728
10776
  return false;
10777
+
10778
+ UNUSED(buffer);
11729
10779
  }
11730
10780
 
11731
10781
  GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -11970,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
11970
11020
  }
11971
11021
 
11972
11022
  const char * buf_host = (const char *)data + offset_split;
11973
- CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
11023
+ CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
11024
+ }
11025
+
11026
+ for (int id = 0; id < g_device_count; ++id) {
11027
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
11974
11028
  }
11975
11029
  }
11976
11030
 
@@ -12004,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
12004
11058
  }
12005
11059
 
12006
11060
  char * buf_host = (char *)data + offset_split;
12007
- CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
11061
+ CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
11062
+ }
11063
+
11064
+ for (int id = 0; id < g_device_count; ++id) {
11065
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
12008
11066
  }
12009
11067
  }
12010
11068
 
@@ -12183,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
12183
11241
  return &ggml_backend_cuda_buffer_type_host;
12184
11242
  }
12185
11243
 
11244
+ //static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
11245
+ // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
11246
+ //}
11247
+
12186
11248
  // backend
12187
11249
 
12188
11250
  GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -12206,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
12206
11268
 
12207
11269
  GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
12208
11270
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11271
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
12209
11272
 
12210
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11273
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
12211
11274
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
12212
11275
 
12213
11276
  CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
@@ -12215,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
12215
11278
 
12216
11279
  GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
12217
11280
  ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11281
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
12218
11282
 
12219
- GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
11283
+ GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
12220
11284
  GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
12221
11285
 
12222
11286
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
12223
11287
  }
12224
11288
 
12225
- GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
12226
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11289
+ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
11290
+ GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
12227
11291
 
12228
- if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
12229
- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
12230
- return true;
11292
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
11293
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
11294
+
11295
+ if (!ggml_backend_buffer_is_cuda(src->buffer)) {
11296
+ return false;
12231
11297
  }
12232
11298
 
12233
- return false;
11299
+ if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
11300
+ return false;
11301
+ }
11302
+
11303
+ // device -> device
11304
+ ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
11305
+ ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
11306
+
11307
+ if (backend_src != backend_dst) {
11308
+ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
11309
+ ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
11310
+
11311
+ GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
11312
+ GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
11313
+
11314
+ if (!cuda_ctx_src->copy_event) {
11315
+ ggml_cuda_set_device(cuda_ctx_src->device);
11316
+ CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
11317
+ }
11318
+
11319
+ // copy on src stream
11320
+ if (cuda_ctx_src->device == cuda_ctx_dst->device) {
11321
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11322
+ } else {
11323
+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
11324
+ }
11325
+
11326
+ // record event on src stream
11327
+ CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
11328
+
11329
+ // wait on dst stream for the copy to complete
11330
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
11331
+ } else {
11332
+ // src and dst are on the same backend
11333
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
11334
+ }
11335
+ return true;
12234
11336
  }
12235
11337
 
12236
11338
  GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -12407,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
12407
11509
  UNUSED(backend);
12408
11510
  }
12409
11511
 
11512
+ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
11513
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11514
+
11515
+ ggml_cuda_set_device(cuda_ctx->device);
11516
+
11517
+ cudaEvent_t event;
11518
+ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
11519
+
11520
+ return new ggml_backend_event {
11521
+ /* .backend = */ backend,
11522
+ /* .context = */ event,
11523
+ };
11524
+ }
11525
+
11526
+ static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
11527
+ CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
11528
+
11529
+ delete event;
11530
+ }
11531
+
11532
+ static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
11533
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
11534
+
11535
+ CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
11536
+ }
11537
+
11538
+ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
11539
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
11540
+
11541
+ if (ggml_backend_is_cuda(event->backend)) {
11542
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
11543
+ } else {
11544
+ // untested
11545
+ auto wait_fn = [](void * user_data) {
11546
+ ggml_backend_event_t event = (ggml_backend_event_t)user_data;
11547
+ ggml_backend_event_synchronize(event);
11548
+ };
11549
+
11550
+ CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
11551
+ }
11552
+ }
11553
+
11554
+ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
11555
+ CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
11556
+ }
11557
+
12410
11558
  static ggml_backend_i ggml_backend_cuda_interface = {
12411
11559
  /* .get_name = */ ggml_backend_cuda_name,
12412
11560
  /* .free = */ ggml_backend_cuda_free,
@@ -12420,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
12420
11568
  /* .graph_plan_compute = */ NULL,
12421
11569
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
12422
11570
  /* .supports_op = */ ggml_backend_cuda_supports_op,
11571
+ /* .event_new = */ ggml_backend_cuda_event_new,
11572
+ /* .event_free = */ ggml_backend_cuda_event_free,
11573
+ /* .event_record = */ ggml_backend_cuda_event_record,
11574
+ /* .event_wait = */ ggml_backend_cuda_event_wait,
11575
+ /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
12423
11576
  };
12424
11577
 
12425
11578
  static ggml_guid_t ggml_backend_cuda_guid() {
@@ -12438,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
12438
11591
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
12439
11592
  ggml_cuda_set_main_device(device);
12440
11593
 
12441
- ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
12442
- /* .device = */ device,
12443
- /* .name = */ GGML_CUDA_NAME + std::to_string(device),
12444
- };
11594
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
11595
+ if (ctx == nullptr) {
11596
+ fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
11597
+ return nullptr;
11598
+ }
12445
11599
 
12446
11600
  ggml_backend_t cuda_backend = new ggml_backend {
12447
11601
  /* .guid = */ ggml_backend_cuda_guid(),