llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -2,6 +2,15 @@
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 
+#if defined(GGML_USE_HIPBLAS)
+#define GGML_COMMON_DECL_HIP
+#define GGML_COMMON_IMPL_HIP
+#else
+#define GGML_COMMON_DECL_CUDA
+#define GGML_COMMON_IMPL_CUDA
+#endif
+#include "ggml-common.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
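This new include block is the mechanism behind most of this diff: the added `ggml-common.h` (+1830 lines in the file list) now owns the quantization block layouts and constant tables that each backend previously duplicated, and a backend opts into the right flavor by defining a `GGML_COMMON_DECL_*`/`GGML_COMMON_IMPL_*` macro before including it. A minimal sketch of that guarded-header pattern follows; the macro selection mirrors the diff above, but the header contents shown here are an illustration, not the real `ggml-common.h`:

```c
/* sketch.h -- hypothetical illustration of the ggml-common.h pattern;
 * the real header keys on GGML_COMMON_DECL_C / _CUDA / _HIP / _METAL etc. */
#include <stdint.h>

#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP)
#include <cuda_fp16.h>
typedef half ggml_half;            /* native device fp16 type */
#else
typedef uint16_t ggml_half;        /* opaque fp16 bits on the host */
#endif

/* one shared definition replaces the per-backend copies deleted below */
typedef struct {
    ggml_half d;                   /* delta (scale)          */
    uint8_t   qs[16];              /* 32 packed 4-bit quants */
} block_q4_0;
```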
@@ -63,6 +72,7 @@
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
@@ -72,6 +82,7 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#define cudaLaunchHostFunc hipLaunchHostFunc
 #ifdef GGML_HIP_UMA
 #define cudaMalloc hipMallocManaged
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -95,6 +106,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
@@ -172,6 +184,7 @@
 #endif
 
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
@@ -196,6 +209,18 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
     return __vsubss4(a, b);
 }
 
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+    }
+    return c;
+}
+
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
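The added `__vcmpeq4` fills in, for the HIP compatibility layer, the CUDA SIMD-in-a-word intrinsic that compares four packed bytes at once: each byte of the result is 0xff where the corresponding bytes of the inputs are equal. A portable reference of the same semantics, useful for sanity-checking against the device version (this helper is ours, not part of the diff):

```c
// Portable reference for the byte-wise compare performed by __vcmpeq4:
// result byte i is 0xff when byte i of a equals byte i of b, else 0x00.
static unsigned int vcmpeq4_ref(unsigned int a, unsigned int b) {
    unsigned int c = 0;
    for (int i = 0; i < 4; ++i) {
        unsigned char ba = (a >> (8 * i)) & 0xff;
        unsigned char bb = (b >> (8 * i)) & 0xff;
        if (ba == bb) {
            c |= 0xffu << (8 * i);
        }
    }
    return c;
}
```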
@@ -343,66 +368,6 @@ typedef void (*ggml_cuda_op_flatten_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
 
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct {
-    half d;                 // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct {
-    half2 dm;               // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct {
-    half d;                 // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct {
-    half2 dm;               // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct {
-    half d;                 // delta
-    int8_t qs[QK8_0];       // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct {
-    half2 ds;               // ds.x = delta, ds.y = sum
-    int8_t qs[QK8_0];       // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
 typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 typedef void (*load_tiles_cuda_t)(
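These block definitions are deleted rather than changed: they now come from `ggml-common.h` via the include added at the top of the file. The deleted `static_assert`s encode the storage cost of each format; as a worked check for q4_0, one fp16 delta plus 16 bytes of packed nibbles is 18 bytes per 32 weights, i.e. 4.5 bits per weight:

```c
// Worked check of the deleted q4_0 static_assert: sizeof(ggml_fp16_t) +
// QK4_0/2 = 2 + 16 = 18 bytes for a block of 32 weights.
#include <stdio.h>

int main(void) {
    const int QK4_0      = 32;
    const int block_size = 2 + QK4_0 / 2;   /* fp16 delta + packed nibbles */
    printf("q4_0: %d bytes / %d weights = %.2f bits per weight\n",
           block_size, QK4_0, 8.0 * block_size / QK4_0);   /* -> 4.50 */
    return 0;
}
```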
@@ -412,130 +377,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
 
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half2 dm;                // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct {
-    uint8_t hmask[QK_K/8];        // quants - high bit
-    uint8_t qs[QK_K/4];           // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2];            // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    half d;                       // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half dm[2];                // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                  // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                       // super-block scale
-    int8_t scales[QK_K/16];       // block scales
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                     // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t scales[QK_K/16];  // scales
-    half d;                  // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define QR2_XXS 8
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-#define QR2_XS 8
-#define QI2_XS (QK_K / (4*QR2_XS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-#define QR3_XXS 8
-#define QI3_XXS (QK_K / (4*QR3_XXS))
-typedef struct {
-    half d;
-    uint8_t qs[3*(QK_K/8)];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-#define QR1_S 8
-#define QI1_S (QK_K / (4*QR1_S))
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
-} block_iq1_s;
-static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
-
-#define QK4_NL 32
-#define QR4_NL 2
-#define QI4_NL (QK4_NL / (4*QR4_NL))
-typedef struct {
-    half d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
-
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -559,6 +400,8 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ARANGE_BLOCK_SIZE 256
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256
@@ -661,18 +504,20 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
     return a;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    (void) a;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
+#ifdef GGML_CUDA_F16
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#else
+    (void) a;
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+#endif // GGML_CUDA_F16
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
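`warp_reduce_sum(half2)` is the usual XOR-butterfly warp reduction: pairing lanes at distances 16, 8, 4, 2, 1 sums all 32 lanes in five `__shfl_xor_sync` steps, with `__hadd2` folding both halves of a `half2` per step. The function is now only compiled under `GGML_CUDA_F16`, and its own guard still requires Pascal (`CC_PASCAL`) or newer for the fp16 arithmetic. The same butterfly for plain `float`, as a sketch of the shape (the file defines an equivalent float version already):

```cuda
// XOR-butterfly warp reduction sketch, assuming a full 32-lane warp:
// after log2(32) = 5 exchanges every lane holds the sum of all 32 inputs.
__device__ __forceinline__ float warp_sum_sketch(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}
```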
@@ -931,17 +776,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
             nidx +
             blockIdx.y * ne0 +
             blockIdx.z * ne0 * gridDim.y;
-        dst[offset_dst] = x[offset_src];
+        dst[offset_dst] = x[offset_src];
     } else {
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
             (blockIdx.z - ne02) * ne0 * gridDim.y;
-        dst[offset_dst] = y[offset_src];
+        dst[offset_dst] = y[offset_src];
     }
 }
 
-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
+static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
+    // blockIdx.z: idx of ne02*ne03
+    // blockIdx.y: idx of ne01*scale_factor, aka ne1
+    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
+    // ne00xne01: ne00 * ne01
     int ne0 = ne00 * scale_factor;
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
@@ -953,7 +802,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     int offset_src =
         i00 +
         i01 * ne00 +
-        blockIdx.z * nb02;
+        blockIdx.z * ne00xne01;
     int offset_dst =
         nidx +
         blockIdx.y * ne0 +
@@ -961,7 +810,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     dst[offset_dst] = x[offset_src];
 }
 
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;
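`pad_f32` gains an `ne03` parameter so 4-D tensors pad correctly: since a CUDA grid only has three axes, the two outermost dimensions are flattened into `blockIdx.z`, as the new kernel comments note. A hedged sketch of the matching launch-side arithmetic (the host wrapper is not shown in this hunk, so the helper and variable names here are assumptions):

```cuda
// Illustrative host-side launcher (names are assumptions): one thread per
// destination element along ne0, with the two outermost dims folded into
// blockIdx.z to stay within CUDA's three grid axes.
static void pad_f32_cuda_sketch(const float * x, float * dst,
                                const int ne00, const int ne01, const int ne02, const int ne03,
                                const int ne0,  const int ne1,  const int ne2,  const int ne3,
                                cudaStream_t stream) {
    const int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
    dim3 grid_dims(num_blocks, ne1, ne2 * ne3);
    pad_f32<<<grid_dims, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(
        x, dst, ne0, ne00, ne01, ne02, ne03);
}
```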
@@ -972,19 +824,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
         int offset_src =
             nidx +
             blockIdx.y * ne00 +
             blockIdx.z * ne00 * ne01;
-        dst[offset_dst] = x[offset_src];
+        dst[offset_dst] = x[offset_src];
     } else {
         dst[offset_dst] = 0.0f;
     }
 }
 
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
+    // blockIDx.y: idx of timesteps->ne[0]
+    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = blockIdx.y;
+    int j = threadIdx.x + blockIdx.x * blockDim.x;
+    float * embed_data = (float *)((char *)dst + i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)expf(-logf(max_period) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = cosf(arg);
+    embed_data[j + half] = sinf(arg);
+}
+
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    // blockIdx.x: num_groups idx
+    // threadIdx.x: block_size idx
     int start = blockIdx.x * group_size;
     int end = start + group_size;
 
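The new `timestep_embedding_f32` kernel produces the sinusoidal embedding used by diffusion models: with half-dimension h = dim/2 and frequency f_j = exp(-ln(max_period) * j / h), column j receives cos(t * f_j) and column j+h receives sin(t * f_j), with a trailing zero for odd dims. A CPU reference of the same math, as a sketch (the helper name is ours, not the library's):

```cpp
// CPU reference for the embedding the kernel computes, one timestep at a
// time; freq decays geometrically from 1 down to 1/max_period.
#include <cmath>
#include <vector>

std::vector<float> timestep_embedding_ref(float timestep, int dim, int max_period) {
    std::vector<float> embed(dim, 0.0f);   // odd dims keep a trailing zero
    const int half = dim / 2;
    for (int j = 0; j < half; ++j) {
        const float freq = expf(-logf((float) max_period) * j / half);
        const float arg  = timestep * freq;
        embed[j]        = cosf(arg);
        embed[j + half] = sinf(arg);
    }
    return embed;
}
```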
@@ -1467,420 +1353,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #endif
 }
 
-static const __device__ uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const __device__ uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint32_t iq3xxs_grid[256] = {
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-};
-
-static const __device__ uint64_t iq1s_grid[512] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
-};
-
-static const __device__ uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-static const __device__ uint64_t ksigns64[128] = {
-    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
-    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
-    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
-    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
-    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
-    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
-    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
-    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
-    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
-    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
-    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
-    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
-    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
-    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
-    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
-    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
-    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
-    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
-    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
-    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
-    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
-    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
-    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
-    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
-    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
-    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
-    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
-    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
-    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
-    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
-    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
-    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
-};
-//#endif
-
-static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
 inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
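All of these device-side codebooks (`iq2xxs_grid`, `iq2xs_grid`, `iq3xxs_grid`, `iq1s_grid`, the sign tables) are deleted here because the newly included `ggml-common.h` provides them; the new iq2_s/iq3_s kernels below reference `iq2s_grid` and `iq3s_grid` without defining them in this file, which only works via that header. Incidentally, the sign table is not arbitrary data: `ksigns_iq2xs[i]` is the 7-bit index with its parity recorded in the MSB, which a small reference can regenerate:

```c
// Reference for the deleted ksigns_iq2xs table: the low 7 bits are the
// index itself and bit 7 is set when the index has odd parity.
#include <stdio.h>

static unsigned char ksigns_ref(unsigned int i) {
    return (unsigned char) (i | ((__builtin_popcount(i) & 1u) << 7));
}

int main(void) {
    printf("%u %u %u\n", ksigns_ref(0), ksigns_ref(1), ksigns_ref(127)); /* 0 129 255 */
    return 0;
}
```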
@@ -1945,6 +1417,27 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
 
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+#else
+    assert(false);
+#endif
+
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
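// Launch-shape note for the dequantize_block_iq2_s kernel added above: it is launched
// further down in this file as <<<nb, 32>>> with nb = k/QK_K, i.e. one block per
// 256-value super-block and 32 threads, each thread writing 8 outputs (32 * 8 = QK_K).
// A minimal host-side sketch of that call (mirrors dequantize_row_iq2_s_cuda below):
//
//   const int nb = k / QK_K;                               // number of super-blocks
//   dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);  // one warp per super-block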
@@ -1973,6 +1466,32 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq3_s * x = (const block_iq3_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
+    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
+    const uint8_t signs = x[i].signs[4*ib + il];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -1984,11 +1503,15 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const
-
-    const int8_t *
-
-
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
 #else
     assert(false);
 #endif
@@ -2016,6 +1539,25 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
 
 }
 
+#if QK_K != 64
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int i = blockIdx.x;
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
+    }
+}
+#endif
+
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -2112,10 +1654,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 #endif
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (threadIdx.x == 0) {
         dst[row] = tmp;
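// Background sketch (not shown in this diff): warp_reduce_sum(), which replaces the
// open-coded shuffle loops in these hunks, is presumably equivalent to the removed
// code, along the lines of:
//
//   static __device__ __forceinline__ float warp_reduce_sum(float x) {
//   #pragma unroll
//       for (int mask = 16; mask > 0; mask >>= 1) {
//           x += __shfl_xor_sync(0xffffffff, x, mask, 32); // XOR-butterfly exchange
//       }
//       return x;
//   }
//
// Each step adds the value from the lane whose ID differs in one bit, so after
// log2(32) = 5 steps every lane of the warp holds the full sum.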
@@ -2216,10 +1755,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 #endif
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (threadIdx.x == 0) {
         dst[row] = tmp;
@@ -2352,10 +1888,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 #endif
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (tid == 0) {
         dst[row] = tmp;
@@ -2468,10 +2001,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 #endif
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (threadIdx.x == 0) {
         dst[row] = tmp;
@@ -2578,10 +2108,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 #endif
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (tid == 0) {
         dst[row] = tmp;
@@ -2616,11 +2143,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     float amax = fabsf(xi);
     float sum = xi;
 
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
-        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
-    }
+    amax = warp_reduce_max(amax);
+    sum = warp_reduce_sum(sum);
 
     const float d = amax / 127;
     const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
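// The quantize_q8_1 hunk applies the same refactor to two reductions at once; a
// plausible warp_reduce_max, assumed by analogy with the removed fmaxf loop:
//
//   static __device__ __forceinline__ float warp_reduce_max(float x) {
//   #pragma unroll
//       for (int mask = 16; mask > 0; mask >>= 1) {
//           x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
//       }
//       return x;
//   }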
@@ -3827,7 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] =
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -3949,7 +3473,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] =
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -4118,7 +3642,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] =
+        d8[i] = __low2float(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -4483,7 +4007,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] =
+        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
     }
 
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
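// Reading note for the __low2float changes in these four hunks: block_q8_1 packs its
// scale and block sum into one half2 field ds, so __low2float(ds) extracts the scale
// and __high2float(ds) the sum (both halves are used by the iq1_s path further down).
// The truncated removed lines presumably read the same values via ds.x.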
@@ -4682,6 +4206,54 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
 #endif
 }
 
+// TODO
+static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+    const int ib32 = iqs;
+    const int8_t  * q8 = bq8_1[ib32].qs;
+    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >> 4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
+        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
+        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+}
+
 static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
4717
4289
|
#endif
|
4718
4290
|
}
|
4719
4291
|
|
4292
|
+
// TODO: don't use lookup table for signs
|
4293
|
+
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
4294
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
4295
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
4296
|
+
#if QK_K == 256
|
4297
|
+
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
4298
|
+
|
4299
|
+
const int ib32 = iqs;
|
4300
|
+
const uint8_t * qs = bq2->qs + 8*ib32;
|
4301
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
4302
|
+
int sumi = 0;
|
4303
|
+
for (int l = 0; l < 4; ++l) {
|
4304
|
+
const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
|
4305
|
+
const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
|
4306
|
+
uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
|
4307
|
+
uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
|
4308
|
+
const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
|
4309
|
+
const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
|
4310
|
+
sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
|
4311
|
+
sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
|
4312
|
+
q8 += 8;
|
4313
|
+
}
|
4314
|
+
const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
|
4315
|
+
return d * sumi;
|
4316
|
+
#else
|
4317
|
+
assert(false);
|
4318
|
+
return 0.f;
|
4319
|
+
#endif
|
4320
|
+
#else
|
4321
|
+
assert(false);
|
4322
|
+
return 0.f;
|
4323
|
+
#endif
|
4324
|
+
}
|
4325
|
+
|
4720
4326
|
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
4721
4327
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
4722
4328
|
#if QK_K == 256
|
4723
4329
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
4724
4330
|
|
4725
4331
|
const int ib32 = iqs;
|
4726
|
-
int
|
4727
|
-
const uint8_t h1 = bq1->scales[2*ib32+0];
|
4728
|
-
const uint8_t h2 = bq1->scales[2*ib32+1];
|
4332
|
+
int sumi = 0;
|
4729
4333
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
4730
4334
|
const int * q8 = (const int *)bq8_1[ib32].qs;
|
4731
|
-
|
4732
|
-
|
4733
|
-
|
4734
|
-
|
4735
|
-
|
4736
|
-
sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
|
4737
|
-
sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
|
4738
|
-
sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
|
4739
|
-
sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
|
4335
|
+
for (int l = 0; l < 4; ++l) {
|
4336
|
+
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
|
4337
|
+
int grid0 = grid[0] & 0x0f0f0f0f;
|
4338
|
+
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
|
4339
|
+
sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
|
4740
4340
|
}
|
4741
4341
|
#else
|
4742
|
-
const int8_t
|
4743
|
-
|
4744
|
-
|
4745
|
-
|
4746
|
-
|
4747
|
-
|
4748
|
-
|
4749
|
-
sumi2 += q8[j+ 8] * grid2[j];
|
4750
|
-
sumi3 += q8[j+16] * grid3[j];
|
4751
|
-
sumi4 += q8[j+24] * grid4[j];
|
4342
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
4343
|
+
for (int l = 0; l < 4; ++l) {
|
4344
|
+
const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
|
4345
|
+
for (int j = 0; j < 4; ++j) {
|
4346
|
+
sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
|
4347
|
+
}
|
4348
|
+
q8 += 8;
|
4752
4349
|
}
|
4753
4350
|
#endif
|
4754
|
-
const float
|
4755
|
-
|
4756
|
-
|
4351
|
+
const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
|
4352
|
+
const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
|
4353
|
+
const float d = d1q * __low2float (bq8_1[ib32].ds);
|
4354
|
+
const float m = d1q * __high2float(bq8_1[ib32].ds);
|
4355
|
+
return d * sumi + m * delta;
|
4757
4356
|
#else
|
4758
4357
|
assert(false);
|
4759
4358
|
return 0.f;
|
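// Arithmetic note on the rewritten iq1_s path: bits 12..14 of qh hold a 3-bit scale,
// so d1q = d * (2*((qh >> 12) & 7) + 1) maps it onto the odd multiples d*{1,3,...,15};
// bit 15 selects between delta = -1 - IQ1S_DELTA and -1 + IQ1S_DELTA, and the constant
// delta term is folded in through the q8_1 block sum as m * delta instead of being
// applied per element.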
@@ -4810,6 +4409,75 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
     return d * (sumi1 + sumi2);
 }
 
+static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+#if QK_K == 256
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+
+    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    //// iqs is 0...7
+    //const int ib64 = iqs/2;
+    //const int il = iqs%2;
+    //const int32_t * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
+    //const int32_t * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
+    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
+    //const uint32_t * q4_2 = q4_1 + 4;
+    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
+    //const int8_t ls2 = (bq4->scales_l[ib64] >> 4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
+    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
+    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
+    //int v1, v2;
+    //int sumi1 = 0, sumi2 = 0;
+    //for (int j = 0; j < 2; ++j) {
+    //    get_int_from_table_16(q4_1[j], values, v1, v2);
+    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
+    //    get_int_from_table_16(q4_2[j], values, v1, v2);
+    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
+    //}
+    //return d1 * sumi1 + d2 * sumi2;
+
+    // iqs is 0...7
+    const int ib32 = iqs;
+    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
+    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 4; ++j) {
+        get_int_from_table_16(q4[j], values, v1, v2);
+        sumi1 = __dp4a(v1, q8[j+0], sumi1);
+        sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    }
+    return d * (sumi1 + sumi2);
+
+    //// iqs is 0...15
+    //const int ib32 = iqs/2;
+    //const int il = iqs%2;
+    //const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il;
+    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
+    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    //int v1, v2;
+    //int sumi1 = 0, sumi2 = 0;
+    //for (int j = 0; j < 2; ++j) {
+    //    get_int_from_table_16(q4[j], values, v1, v2);
+    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
+    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    //}
+    //return d * (sumi1 + sumi2);
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
+#endif
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
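// Worked example for the 6-bit scale decode in vec_dot_iq4_xs_q8_1 (values chosen for
// illustration): for ib32 = 1 with scales_l[0] = 0xb7 and (scales_h >> 2) & 3 == 2,
// the low nibble is (0xb7 >> 4) & 0xf = 0xb, so ls = 0xb | (2 << 4) = 43 and the
// block scale becomes d * (43 - 32) = 11*d; subtracting 32 recenters the unsigned
// 6-bit field onto the signed range -32..31.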
@@ -4876,7 +4544,7 @@ static __device__ __forceinline__ void mul_mat_q(
         *dsi_dst = *dsi_src;
     } else {
         float * dfi_dst = (float *) dsi_dst;
-        *dfi_dst =
+        *dfi_dst = __low2float(*dsi_src);
     }
 }
 
@@ -5730,10 +5398,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
     }
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (tid == 0) {
 #ifdef GGML_CUDA_F16
@@ -5783,10 +5448,7 @@ static __global__ void mul_mat_p021_f16_f32(
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (threadIdx.x == 0) {
         dst[idst] = tmp;
@@ -5829,10 +5491,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);
 
     if (threadIdx.x == 0) {
         dst[idst] = tmp;
@@ -5872,7 +5531,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
     const int nb12, const int nb13) {
-    const
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
         return;
@@ -5880,17 +5539,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 
     // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
     // then combine those indices with the corresponding byte offsets to get the total offsets
-    const
-    const
-    const
-    const
-    const
-
-    const
-    const
-    const
-    const
-    const
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
 
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
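// The index arithmetic above inverts flattening; a hypothetical host-side check of
// the same decomposition (assumed shapes, plain C++):
//
//   int64_t i = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;   // flatten
//   int64_t j03 =  i / (ne00*ne01*ne02);                    // recover each index
//   int64_t j02 = (i - j03*ne00*ne01*ne02) / (ne00*ne01);
//   int64_t j01 = (i - j03*ne00*ne01*ne02 - j02*ne01*ne00) / ne00;
//   int64_t j00 =  i - j03*ne00*ne01*ne02 - j02*ne01*ne00 - j01*ne00;
//   // j03 == i03, j02 == i02, j01 == i01, j00 == i00
//
// Promoting everything to int64_t is the point of the hunk: for large tensors the
// products like ne00*ne01*ne02 can overflow a 32-bit int.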
@@ -6216,11 +5875,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
     int ixj = col ^ j;
     if (ixj > col) {
         if ((col & k) == 0) {
-            if (order ==
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
                 swap(dst_row[col], dst_row[ixj]);
             }
         } else {
-            if (order ==
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
                 swap(dst_row[col], dst_row[ixj]);
             }
         }
@@ -6328,6 +5987,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
     // find the sum of exps in the block
     tmp = warp_reduce_sum(tmp);
     if (block_size > WARP_SIZE) {
+        __syncthreads();
        if (warp_id == 0) {
             buf_iw[lane_id] = 0.0f;
         }
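// Synchronization note: without the added __syncthreads(), warp 0 could overwrite
// buf_iw for the sum phase while other warps are still reading the values the shared
// buffer held from the preceding reduction; the barrier orders the two uses.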
@@ -6379,23 +6039,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
 
 template <typename T>
 static __global__ void im2col_kernel(
-        const float * x, T * dst,
-
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
-    const
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= pelements) {
         return;
     }
 
-    const
-    const
-    const
-    const
-    const
+    const int64_t ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t kx = i / ksize;
+    const int64_t kd = kx * ksize;
+    const int64_t ky = (i - kd) / OW;
+    const int64_t ix = i % OW;
 
-    const
-    const
-    const
+    const int64_t oh = blockIdx.y;
+    const int64_t batch = blockIdx.z / IC;
+    const int64_t ic = blockIdx.z % IC;
 
     const int64_t iiw = ix * s0 + kx * d0 - p0;
     const int64_t iih = oh * s1 + ky * d1 - p1;
@@ -6721,19 +6381,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
     concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 }
 
-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
+                             const int scale_factor, cudaStream_t stream) {
     int ne0 = (ne00 * scale_factor);
     int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
     upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
 }
 
 static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02,
-    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
+                                        const int dim, const int max_period, cudaStream_t stream) {
+    int half_ceil = (dim + 1) / 2;
+    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne00, 1);
+    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
@@ -6843,12 +6517,24 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
|
|
6861
6547
|
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
6862
6548
|
}
|
6863
6549
|
|
6550
|
+
template<typename dst_t>
|
6551
|
+
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
6552
|
+
const int nb = (k + QK_K - 1) / QK_K;
|
6553
|
+
#if QK_K == 64
|
6554
|
+
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
6555
|
+
#else
|
6556
|
+
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
6557
|
+
#endif
|
6558
|
+
}
|
6559
|
+
|
6864
6560
|
template <typename src_t, typename dst_t>
|
6865
6561
|
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
6866
6562
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
@@ -6898,12 +6594,18 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq2_xxs_cuda;
         case GGML_TYPE_IQ2_XS:
             return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_cuda;
        case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
         default:
@@ -6937,12 +6639,18 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq2_xxs_cuda;
         case GGML_TYPE_IQ2_XS:
             return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_cuda;
         case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
         default:
@@ -7764,10 +7472,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
 
     const dim3 block_dims(ncols, 1, 1);
     const dim3 block_nums(1, nrows, 1);
-    if (order ==
-        k_argsort_f32_i32<
-    } else if (order ==
-        k_argsort_f32_i32<
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
         GGML_ASSERT(false);
     }
@@ -7832,8 +7540,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *
 
 template <typename T>
 static void im2col_cuda(const float* x, T* dst,
-
-
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
@@ -7916,8 +7624,8 @@ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual
     *actual_size = look_ahead_size;
     g_cuda_pool_size[device] += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__,
-        (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+        (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     return ptr;
 }
@@ -8003,7 +7711,7 @@ static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual
     g_cuda_pool_used[device] += size;
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx
+    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
 #endif
 
     return ptr;
@@ -8013,7 +7721,7 @@ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: freed %llu bytes at %llx\n",
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
 #endif
 
     g_cuda_pool_used[device] -= size;
@@ -8199,11 +7907,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 
     cudaMemcpyKind kind;
     char * src_ptr;
-    if (src->backend ==
+    if (src->backend == GGML_BACKEND_TYPE_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend ==
-        GGML_ASSERT(src->backend !=
+    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -8512,7 +8220,7 @@ static void ggml_cuda_op_group_norm(
 
     int num_groups = dst->op_params[0];
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
@@ -8545,7 +8253,7 @@ static void ggml_cuda_op_upscale(
 
     const int scale_factor = dst->op_params[0];
 
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
 
     (void) src1;
     (void) dst;
@@ -8561,8 +8269,49 @@ static void ggml_cuda_op_pad(
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
     pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_arange(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
+
+    (void) src0;
+    (void) src1;
+    (void) src0_dd;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_timestep_embedding(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
 
     (void) src1;
     (void) dst;
|
@@ -8608,7 +8357,7 @@ static void ggml_cuda_op_mul_mat_q(
|
|
8608
8357
|
|
8609
8358
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
8610
8359
|
// nrows_dst == nrows of the matrix that the kernel writes into
|
8611
|
-
const int64_t nrows_dst = dst->backend ==
|
8360
|
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
|
8612
8361
|
|
8613
8362
|
switch (src0->type) {
|
8614
8363
|
case GGML_TYPE_Q4_0:
|
@@ -8685,9 +8434,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8710,9 +8462,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8755,7 +8510,7 @@ static void ggml_cuda_op_mul_mat_vec_q(
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = dst->backend ==
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
@@ -8806,6 +8561,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         case GGML_TYPE_IQ3_XXS:
             mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
@@ -8818,6 +8577,14 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -8927,7 +8694,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend ==
+    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;
 
     const int compute_capability = g_device_caps[id].cc;
 
@@ -9275,7 +9042,7 @@ static void ggml_cuda_op_soft_max(
     const bool use_src2 = src2 != nullptr;
 
     if (use_src2) {
-        const bool src2_on_device = src2->backend ==
+        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
 
         if (src2_on_device) {
             ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
@@ -9333,16 +9100,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool use_src1 = src1 != nullptr;
     const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
 
-    GGML_ASSERT(!use_src1 || src1->backend !=
-    GGML_ASSERT( dst->backend !=
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
-    const bool src0_on_device = src0->backend ==
-    const bool src1_on_device = use_src1 && src1->backend ==
-    const bool  dst_on_device = dst->backend ==
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
+    const bool  dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
 
     // dd = data device
     float * src0_ddf = nullptr;
@@ -9386,7 +9153,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }
 
-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 }
@@ -9467,8 +9234,8 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    GGML_ASSERT(dst->backend !=
-    GGML_ASSERT(src1->backend !=
+    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -9484,20 +9251,20 @@ static void ggml_cuda_op_mul_mat(
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
-    const bool src0_on_device = src0->backend ==
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
 
     const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
-    const bool split = src0->backend ==
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
 
     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
     if (split) {
-        // TODO: check that src0->buffer->buft is a split buffer type, replace
+        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
         // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         tensor_split = buft_ctx->tensor_split;
@@ -9555,8 +9322,8 @@ static void ggml_cuda_op_mul_mat(
 
         used_devices++;
 
-        const bool src1_on_device = src1->backend ==
-        const bool  dst_on_device = dst->backend ==
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool  dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
 
         ggml_cuda_set_device(id);
         cudaStream_t stream = g_cudaStreams[id][0];
@@ -9607,8 +9374,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }
 
-        const bool src1_on_device = src1->backend ==
-        const bool  dst_on_device = dst->backend ==
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool  dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
         const int64_t row_diff = dev[id].row_high - dev[id].row_low;
 
         ggml_cuda_set_device(id);
@@ -9633,12 +9400,12 @@ static void ggml_cuda_op_mul_mat(
 
             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
-            if (dst->backend ==
+            if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
                 dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
             }
 
             // copy src0, src1 to device if necessary
-            if (src1->backend ==
+            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
                 if (id != g_main_device) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -9651,14 +9418,14 @@ static void ggml_cuda_op_mul_mat(
                             src1_ncols*ne10*sizeof(float), stream));
                 }
             }
-            } else if (src1->backend ==
+            } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
                 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                     src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && (src1->backend ==
+            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
                 quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -9676,10 +9443,10 @@ static void ggml_cuda_op_mul_mat(
             if (!dst_on_device) {
                 void * dst_off_device;
                 cudaMemcpyKind kind;
-                if (dst->backend ==
+                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                     dst_off_device = dst->data;
                     kind = cudaMemcpyDeviceToHost;
-                } else if (dst->backend ==
+                } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
                     dst_off_device = dst_extra->data_device[g_main_device];
                     kind = cudaMemcpyDeviceToDevice;
                 } else {
@@ -9744,7 +9511,7 @@ static void ggml_cuda_op_mul_mat(
         }
     }
 
-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_cuda_set_device(g_main_device);
         CUDA_CHECK(cudaDeviceSynchronize());
     }
@@ -9829,6 +9596,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }
 
+static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float * dst_ddf = nullptr;
+
+    cuda_pool_alloc<float> dst_f;
+
+    ggml_cuda_set_device(g_main_device);
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+    }
+
+    // do the computation
+    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
+static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -9850,7 +9656,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru
|
|
9850
9656
|
|
9851
9657
|
static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
9852
9658
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
9853
|
-
GGML_ASSERT(src0->backend !=
|
9659
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
9854
9660
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
9855
9661
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
9856
9662
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
@@ -9881,7 +9687,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|
9881
9687
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
9882
9688
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
9883
9689
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
9884
|
-
GGML_ASSERT(src0->backend !=
|
9690
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
9885
9691
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
9886
9692
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
9887
9693
|
|
@@ -9940,7 +9746,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
|
|
9940
9746
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
9941
9747
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
9942
9748
|
|
9943
|
-
GGML_ASSERT(src0->backend !=
|
9749
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
9944
9750
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
9945
9751
|
|
9946
9752
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -10086,11 +9892,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
|
|
10086
9892
|
|
10087
9893
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
10088
9894
|
const bool all_on_device =
|
10089
|
-
(src0->backend ==
|
10090
|
-
(src1->backend ==
|
10091
|
-
( dst->backend ==
|
9895
|
+
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
9896
|
+
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
9897
|
+
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
10092
9898
|
|
10093
|
-
const bool split = src0->backend ==
|
9899
|
+
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
10094
9900
|
|
10095
9901
|
int64_t min_compute_capability = INT_MAX;
|
10096
9902
|
|
@@ -10240,7 +10046,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
|
|
10240
10046
|
GGML_ASSERT(!ggml_is_transposed(src00));
|
10241
10047
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
10242
10048
|
|
10243
|
-
GGML_ASSERT(src00->backend !=
|
10049
|
+
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
10244
10050
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
10245
10051
|
|
10246
10052
|
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
|
@@ -10384,7 +10190,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     cudaStream_t stream = g_cudaStreams[g_main_device][0];
 
-    if (ids->backend == GGML_BACKEND_GPU) {
+    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
         CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
         CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -10401,20 +10207,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.backend = GGML_BACKEND_GPU;
-    dst_row.backend = GGML_BACKEND_GPU;
+    src1_row.backend = GGML_BACKEND_TYPE_GPU;
+    dst_row.backend = GGML_BACKEND_TYPE_GPU;
 
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
-    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
         (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+    char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
         (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
 
     if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;
@@ -10442,9 +10248,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
         src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
 
-        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
             cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
             cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
 
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
@@ -10499,7 +10305,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
         }
     }
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 }
@@ -10516,8 +10322,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
 
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -10648,9 +10454,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
     if (!g_cublas_loaded) return false;
 
     ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 
     if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
@@ -10729,6 +10535,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_PAD:
             func = ggml_cuda_pad;
            break;
+        case GGML_OP_ARANGE:
+            func = ggml_cuda_arange;
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            func = ggml_cuda_timestep_embedding;
+            break;
         case GGML_OP_LEAKY_RELU:
            func = ggml_cuda_leaky_relu;
            break;
@@ -10797,14 +10609,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         return false;
     }
 
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
         ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
     }
 
     if (params->ith != 0) {
         return true;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }
     func(tensor->src[0], tensor->src[1], tensor);
@@ -10832,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
 #define UNUSED GGML_UNUSED
 
 struct ggml_backend_cuda_context {
+    explicit ggml_backend_cuda_context(int device) :
+        device(device),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_cuda_context() {
+        if (copy_event != nullptr) {
+            CUDA_CHECK(cudaEventDestroy(copy_event));
+        }
+    }
+
     int device;
     std::string name;
+    cudaEvent_t copy_event = nullptr;
 };
 
 // cuda buffer
@@ -10903,7 +10727,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     extra->data_device[ctx->device] = tensor->data;
 
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 
     if (ggml_is_quantized(tensor->type)) {
@@ -10918,42 +10742,40 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-    CUDA_CHECK(
-    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-    CUDA_CHECK(
-    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
-        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-
-
-
-
-        CUDA_CHECK(
-        CUDA_CHECK(cudaDeviceSynchronize());
-
+        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
+        if (src_ctx->device == dst_ctx->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
+        } else {
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+        }
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
         return true;
     }
     return false;
+
+    UNUSED(buffer);
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
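The set/get/copy paths above move from blocking transfers plus `cudaDeviceSynchronize` to `cudaMemcpyAsync` on `cudaStreamPerThread`, synchronizing only that stream. A minimal standalone sketch of the pattern follows (illustrative only; error handling is reduced to a single check, and the buffer names are made up):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    float host[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float * dev = nullptr;
    if (cudaMalloc(&dev, sizeof(host)) != cudaSuccess) {
        return 1;
    }
    // cudaStreamPerThread is a per-host-thread implicit stream: concurrent
    // uploads from different threads do not serialize on the legacy default
    // stream, and the synchronize below waits only for this thread's work.
    cudaMemcpyAsync(dev, host, sizeof(host), cudaMemcpyHostToDevice, cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread);
    cudaFree(dev);
    printf("upload complete\n");
    return 0;
}
```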
@@ -11164,7 +10986,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
             CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
         }
     }
-    tensor->backend = GGML_BACKEND_GPU_SPLIT;
+    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
     tensor->extra = extra;
 }
 
@@ -11198,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
         }
 
         const char * buf_host = (const char *)data + offset_split;
-        CUDA_CHECK(
+        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
 }
 
@@ -11232,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
         }
 
         char * buf_host = (char *)data + offset_split;
-        CUDA_CHECK(
+        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
 }
 
@@ -11411,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     return &ggml_backend_cuda_buffer_type_host;
 }
 
+//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
+//    return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+//}
+
 // backend
 
 GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -11434,31 +11268,71 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
 
 GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t
-
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
 
-
-
-
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+        return false;
     }
 
-
+    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+
+    // device -> device
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+    if (backend_src != backend_dst) {
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
+        if (!cuda_ctx_src->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_src->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+        }
+
+        // copy on src stream
+        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+        } else {
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
+        }
+
+        // record event on src stream
+        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
+
+        // wait on dst stream for the copy to complete
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
+    } else {
+        // src and dst are on the same backend
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    }
+    return true;
 }
 
 GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
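The rewritten `ggml_backend_cuda_cpy_tensor_async` coordinates two streams without blocking the host: the copy is enqueued on the source stream, an event is recorded behind it, and the destination stream waits on that event. A hedged sketch of just that pattern, where `stream_src`, `stream_dst`, and `copy_event` are illustrative names rather than anything from the diff:

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Enqueue a device-to-device copy on the source stream and make the
// destination stream wait for it, asynchronously with respect to the host.
void copy_then_wait(void * dst, const void * src, size_t n,
                    cudaStream_t stream_src, cudaStream_t stream_dst,
                    cudaEvent_t copy_event) {
    cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_src); // copy on src stream
    cudaEventRecord(copy_event, stream_src);        // completion marker behind the copy
    cudaStreamWaitEvent(stream_dst, copy_event, 0); // dst stream blocks, the host does not
}
```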
@@ -11469,13 +11343,13 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -11485,13 +11359,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         }
 
 #ifndef NDEBUG
-        assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
+        assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
+                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                 assert(node->src[j]->extra != nullptr);
             }
@@ -11505,7 +11379,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         GGML_ASSERT(ok);
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
@@ -11541,7 +11415,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             }
             ggml_type a_type = a->type;
             if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-                a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
+                a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
+                a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
                 if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                     return false;
                 }
@@ -11623,6 +11498,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
@@ -11632,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
 }
 
+static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    cudaEvent_t event;
+    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+    return new ggml_backend_event {
+        /* .backend = */ backend,
+        /* .context = */ event,
+    };
+}
+
+static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+
+    delete event;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
+
+    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    if (ggml_backend_is_cuda(event->backend)) {
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
+    } else {
+        // untested
+        auto wait_fn = [](void * user_data) {
+            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+            ggml_backend_event_synchronize(event);
+        };
+
+        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+    }
+}
+
+static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
 static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name = */ ggml_backend_cuda_name,
     /* .free = */ ggml_backend_cuda_free,
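For events owned by a non-CUDA backend, `ggml_backend_cuda_event_wait` falls back to `cudaLaunchHostFunc` (mapped to `hipLaunchHostFunc` under HIP): the host callback runs once all prior work in the stream has completed, and later work in the stream waits for the callback to return. A minimal standalone illustration, with names made up for the example:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Host callback: executed by the CUDA runtime when the stream reaches this
// point. Host functions must not call CUDA APIs themselves.
static void on_marker(void * user_data) {
    printf("stream reached: %s\n", (const char *)user_data);
}

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaLaunchHostFunc(stream, on_marker, (void *)"checkpoint");
    cudaStreamSynchronize(stream); // also waits for the host callback
    cudaStreamDestroy(stream);
    return 0;
}
```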
@@ -11645,8 +11568,18 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
+    /* .event_new = */ ggml_backend_cuda_event_new,
+    /* .event_free = */ ggml_backend_cuda_event_free,
+    /* .event_record = */ ggml_backend_cuda_event_record,
+    /* .event_wait = */ ggml_backend_cuda_event_wait,
+    /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
 };
 
+static ggml_guid_t ggml_backend_cuda_guid() {
+    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
@@ -11658,12 +11591,14 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_cuda_set_main_device(device);
 
-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context
-
-
-
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    }
 
     ggml_backend_t cuda_backend = new ggml_backend {
+        /* .guid = */ ggml_backend_cuda_guid(),
         /* .interface = */ ggml_backend_cuda_interface,
         /* .context = */ ctx
     };
@@ -11672,7 +11607,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
 }
 
 GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend && backend->
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
 }
 
 GGML_CALL int ggml_backend_cuda_get_device_count() {
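Backend identification now goes through a 16-byte GUID instead of comparing interface function pointers, which stays stable across shared-library boundaries. Conceptually the match is a bytewise comparison; a hedged sketch with stand-in types, not ggml's actual definitions:

```cpp
#include <cstdint>
#include <cstring>

typedef uint8_t guid_t[16];

// Bytewise GUID comparison, analogous in spirit to ggml_guid_matches.
static bool guid_matches(const guid_t a, const guid_t b) {
    return memcmp(a, b, sizeof(guid_t)) == 0;
}
```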
|