llama_cpp 0.13.0 → 0.14.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
--- a/data/vendor/tmp/llama.cpp/ggml-cuda.cu
+++ b/data/vendor/tmp/llama.cpp/ggml-cuda.cu
@@ -2,6 +2,15 @@
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 
+#if defined(GGML_USE_HIPBLAS)
+#define GGML_COMMON_DECL_HIP
+#define GGML_COMMON_IMPL_HIP
+#else
+#define GGML_COMMON_DECL_CUDA
+#define GGML_COMMON_IMPL_CUDA
+#endif
+#include "ggml-common.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -63,6 +72,7 @@
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
@@ -72,6 +82,7 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#define cudaLaunchHostFunc hipLaunchHostFunc
 #ifdef GGML_HIP_UMA
 #define cudaMalloc hipMallocManaged
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
@@ -95,6 +106,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
@@ -172,6 +184,7 @@
 #endif
 
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
@@ -196,6 +209,18 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
     return __vsubss4(a, b);
 }
 
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+    }
+    return c;
+}
+
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
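The hunk above backfills CUDA's __vcmpeq4 SIMD-in-a-word intrinsic for HIP targets: it compares the four bytes of two 32-bit words and yields 0xff for each byte position that matches and 0x00 otherwise. A host-side C++ sketch of those semantics (illustrative only; vcmpeq4_ref is a hypothetical helper, not part of the diff):

    #include <cstdint>

    // Scalar reference for the byte-wise compare the device shim implements.
    static uint32_t vcmpeq4_ref(uint32_t a, uint32_t b) {
        uint32_t c = 0;
        for (int i = 0; i < 4; ++i) {
            const uint32_t ba = (a >> (8 * i)) & 0xff; // i-th byte of a
            const uint32_t bb = (b >> (8 * i)) & 0xff; // i-th byte of b
            if (ba == bb) {
                c |= 0xffu << (8 * i); // all-ones byte on a match
            }
        }
        return c; // e.g. vcmpeq4_ref(0x11223344, 0x11AA3344) == 0xff00ffff
    }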
@@ -343,66 +368,6 @@ typedef void (*ggml_cuda_op_flatten_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
 
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct {
-    half d;                // delta
-    uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct {
-    half2 dm;              // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2]; // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct {
-    half d;                // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct {
-    half2 dm;              // dm.x = delta, dm.y = min
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct {
-    half d;                // delta
-    int8_t qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct {
-    half2 ds;              // ds.x = delta, ds.y = sum
-    int8_t qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
 typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 typedef void (*load_tiles_cuda_t)(
@@ -412,130 +377,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
 
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half2 dm;                // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2];     // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    half d;                // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half dm[2];          // super-block scales/mins
-    uint8_t scales[2];   // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];  // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                  // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                  // super-block scale
-    int8_t scales[QK_K/16];  // block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                     // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct {
-    uint8_t ql[QK_K/2];     // quants, lower 4 bits
-    uint8_t qh[QK_K/4];     // quants, upper 2 bits
-    int8_t scales[QK_K/16]; // scales
-    half d;                 // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define QR2_XXS 8
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-#define QR2_XS 8
-#define QI2_XS (QK_K / (4*QR2_XS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-#define QR3_XXS 8
-#define QI3_XXS (QK_K / (4*QR3_XXS))
-typedef struct {
-    half d;
-    uint8_t qs[3*(QK_K/8)];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-#define QR1_S 8
-#define QI1_S (QK_K / (4*QR1_S))
-typedef struct {
-    half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
-} block_iq1_s;
-static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
-
-#define QK4_NL 32
-#define QR4_NL 2
-#define QI4_NL (QK4_NL / (4*QR4_NL))
-typedef struct {
-    half d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
-
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
@@ -559,6 +400,8 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ARANGE_BLOCK_SIZE 256
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256
@@ -661,18 +504,20 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
     return a;
 }
 
-
-
-
-
-
-
-
-
-
-
-
-//
+#ifdef GGML_CUDA_F16
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#else
+    (void) a;
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+#endif // GGML_CUDA_F16
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
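The half2 overload of warp_reduce_sum added above follows the same butterfly pattern as the existing float version: each __shfl_xor_sync step pairs lanes whose IDs differ in one bit, so after log2(32) = 5 steps every lane holds the full warp sum. A CPU sketch of that pattern over a plain 32-element array standing in for the warp (illustrative only):

    #include <cstdio>

    int main() {
        float lane[32];
        for (int i = 0; i < 32; ++i) lane[i] = 1.0f; // each lane contributes 1

        // Butterfly reduction: lane i pairs with lane i^mask at every step.
        for (int mask = 16; mask > 0; mask >>= 1) {
            float next[32];
            for (int i = 0; i < 32; ++i) {
                next[i] = lane[i] + lane[i ^ mask]; // stands in for __shfl_xor_sync
            }
            for (int i = 0; i < 32; ++i) lane[i] = next[i];
        }
        printf("%f\n", lane[0]); // prints 32.000000; every lane holds the sum
        return 0;
    }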
@@ -931,17 +776,21 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
             nidx +
             blockIdx.y * ne0 +
             blockIdx.z * ne0 * gridDim.y;
-
+        dst[offset_dst] = x[offset_src];
     } else {
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
             (blockIdx.z - ne02) * ne0 * gridDim.y;
-
+        dst[offset_dst] = y[offset_src];
     }
 }
 
-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int
+static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
+    // blockIdx.z: idx of ne02*ne03
+    // blockIdx.y: idx of ne01*scale_factor, aka ne1
+    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
+    // ne00xne01: ne00 * ne01
     int ne0 = ne00 * scale_factor;
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
@@ -953,7 +802,7 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     int offset_src =
         i00 +
         i01 * ne00 +
-        blockIdx.z *
+        blockIdx.z * ne00xne01;
     int offset_dst =
         nidx +
         blockIdx.y * ne0 +
@@ -961,7 +810,10 @@ static __global__ void upscale_f32(const float * x, float * dst, const int ne00,
     dst[offset_dst] = x[offset_src];
 }
 
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;
@@ -972,19 +824,53 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
         nidx +
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02) {
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
         int offset_src =
             nidx +
             blockIdx.y * ne00 +
             blockIdx.z * ne00 * ne01;
-
+        dst[offset_dst] = x[offset_src];
     } else {
         dst[offset_dst] = 0.0f;
     }
 }
 
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
+    // blockIDx.y: idx of timesteps->ne[0]
+    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = blockIdx.y;
+    int j = threadIdx.x + blockIdx.x * blockDim.x;
+    float * embed_data = (float *)((char *)dst + i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)expf(-logf(max_period) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = cosf(arg);
+    embed_data[j + half] = sinf(arg);
+}
+
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    // blockIdx.x: num_groups idx
+    // threadIdx.x: block_size idx
     int start = blockIdx.x * group_size;
     int end = start + group_size;
 
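The timestep_embedding_f32 kernel added above computes the standard sinusoidal timestep embedding used by diffusion models: for timestep t and frequency freq_j = exp(-ln(max_period) * j / half), it writes cos(t * freq_j) into the first half of each output row and sin(t * freq_j) into the second, zeroing one trailing slot when dim is odd. A host-side reference under those assumptions (a sketch, not the library API; timestep_embedding_ref is hypothetical):

    #include <cmath>
    #include <vector>

    // CPU sketch of the row layout written by timestep_embedding_f32:
    // [cos(t*f_0) .. cos(t*f_{half-1}), sin(t*f_0) .. sin(t*f_{half-1}), (0 pad)]
    std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
        const int half = dim / 2;
        std::vector<float> row(dim + (dim % 2), 0.0f); // trailing slot stays 0 for odd dim
        for (int j = 0; j < half; ++j) {
            const float freq = expf(-logf((float) max_period) * j / half);
            row[j]        = cosf(t * freq);
            row[j + half] = sinf(t * freq);
        }
        return row;
    }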
@@ -1467,420 +1353,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #endif
 }
 
-static const __device__ uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const __device__ uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint32_t iq3xxs_grid[256] = {
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-};
-
-static const __device__ uint64_t iq1s_grid[512] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
-};
-
-static const __device__ uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-static const __device__ uint64_t ksigns64[128] = {
-    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
-    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
-    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
-    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
-    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
-    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
-    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
-    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
-    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
-    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
-    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
-    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
-    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
-    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
-    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
-    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
-    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
-    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
-    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
-    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
-    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
-    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
-    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
-    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
-    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
-    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
-    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
-    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
-    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
-    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
-    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
-    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
-};
-//#endif
-
-static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
 inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -1945,6 +1417,27 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
 
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+#else
+    assert(false);
+#endif
+
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -1973,6 +1466,32 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds

 }

+template<typename dst_t>
+static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq3_s * x = (const block_iq3_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
+    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
+    const uint8_t signs = x[i].signs[4*ib + il];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

@@ -1984,11 +1503,15 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const
-
-    const int8_t *
-
-
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
 #else
     assert(false);
 #endif
@@ -2016,6 +1539,25 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst

 }

+#if QK_K != 64
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int i = blockIdx.x;
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
+    }
+}
+#endif
+
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -2112,10 +1654,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 #endif

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (threadIdx.x == 0) {
         dst[row] = tmp;
@@ -2216,10 +1755,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 #endif

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (threadIdx.x == 0) {
         dst[row] = tmp;
@@ -2352,10 +1888,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 #endif

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (tid == 0) {
         dst[row] = tmp;
@@ -2468,10 +2001,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 #endif

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (threadIdx.x == 0) {
         dst[row] = tmp;
@@ -2578,10 +2108,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
 #endif

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (tid == 0) {
         dst[row] = tmp;
@@ -2616,11 +2143,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     float amax = fabsf(xi);
     float sum = xi;

-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
-        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
-    }
+    amax = warp_reduce_max(amax);
+    sum = warp_reduce_sum(sum);

     const float d = amax / 127;
     const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
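The shuffle loops deleted in the hunks above are all instances of the same butterfly reduction, now centralized in the warp_reduce_sum / warp_reduce_max helpers defined earlier in ggml-cuda.cu. Reconstructed from the deleted lines, the helpers presumably have this shape (a sketch for orientation, not part of this diff):

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        // XOR butterfly: after log2(32) steps every lane holds the full warp sum
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}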
@@ -3827,7 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] =
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -3949,7 +3473,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] =
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -4118,7 +3642,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] =
+        d8[i] = __low2float(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -4483,7 +4007,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] =
+        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
     }

     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
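The d8[i] fix-ups above all read the low half of block_q8_1's ds field, which packs the block scale d and the sum s of the 32 int8 quants into one half2. A minimal sketch of the access pattern these hunks rely on (variable names are illustrative):

// ds stores (d, s) as a half2: low half = scale, high half = sum of the quants
const half2 ds = bq8_1[ib].ds;
const float d = __low2float(ds);  // what the d8[i] lines above extract
const float s = __high2float(ds); // used by vec_dot impls that need the block sum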
@@ -4682,6 +4206,54 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
 #endif
 }

+// TODO
+static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+    const int ib32 = iqs;
+    const int8_t * q8 = bq8_1[ib32].qs;
+    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >> 4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
+        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
+        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+}
+
 static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
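vec_dot_iq2_s_q8_1 above avoids the old ksigns64 lookup (the fallback branches still reference it only to silence unused-variable warnings): four packed sign bits are expanded into a per-byte 0x00/0xff mask and applied as a conditional negation. The trick in isolation (a sketch with an assumed helper name, using the same intrinsics as the hunk):

__device__ __forceinline__ int apply_signs4(int grid, uint8_t sbits) {
    // replicate the nibble into all 4 bytes, isolate bit k in byte k, then
    // compare for equality: m holds 0xff in each byte whose sign bit is set
    const uint32_t m = __vcmpeq4(((sbits & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
    // per byte, (x ^ m) - m == -x where m == 0xff, and x where m == 0
    return __vsub4(grid ^ m, m);
}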
@@ -4717,43 +4289,70 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #endif
 }

+// TODO: don't use lookup table for signs
+static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t * qs = bq2->qs + 8*ib32;
+    const int8_t * q8 = bq8_1[ib32].qs;
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
+        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
+        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
+        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
+        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
+    return d * sumi;
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
 static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

     const int ib32 = iqs;
-    int
-    const uint8_t h1 = bq1->scales[2*ib32+0];
-    const uint8_t h2 = bq1->scales[2*ib32+1];
+    int sumi = 0;
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const int * q8 = (const int *)bq8_1[ib32].qs;
-
-
-
-
-
-        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
-        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
-        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
-        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
+    for (int l = 0; l < 4; ++l) {
+        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+        int grid0 = grid[0] & 0x0f0f0f0f;
+        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
     }
 #else
-    const int8_t
-
-
-
-
-
-
-        sumi2 += q8[j+ 8] * grid2[j];
-        sumi3 += q8[j+16] * grid3[j];
-        sumi4 += q8[j+24] * grid4[j];
+    const int8_t * q8 = bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+        for (int j = 0; j < 4; ++j) {
+            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
+        }
+        q8 += 8;
     }
 #endif
-    const float
-
-
+    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
+    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
+    const float d = d1q * __low2float (bq8_1[ib32].ds);
+    const float m = d1q * __high2float(bq8_1[ib32].ds);
+    return d * sumi + m * delta;
 #else
     assert(false);
     return 0.f;
@@ -4810,6 +4409,75 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
     return d * (sumi1 + sumi2);
 }

+static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+#if QK_K == 256
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+
+    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    //// iqs is 0...7
+    //const int ib64 = iqs/2;
+    //const int il = iqs%2;
+    //const int32_t * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
+    //const int32_t * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
+    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
+    //const uint32_t * q4_2 = q4_1 + 4;
+    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
+    //const int8_t ls2 = (bq4->scales_l[ib64] >> 4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
+    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
+    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
+    //int v1, v2;
+    //int sumi1 = 0, sumi2 = 0;
+    //for (int j = 0; j < 2; ++j) {
+    //    get_int_from_table_16(q4_1[j], values, v1, v2);
+    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
+    //    get_int_from_table_16(q4_2[j], values, v1, v2);
+    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
+    //}
+    //return d1 * sumi1 + d2 * sumi2;
+
+    // iqs is 0...7
+    const int ib32 = iqs;
+    const int32_t * q8 = (const int *)bq8_1[ib32].qs;
+    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 4; ++j) {
+        get_int_from_table_16(q4[j], values, v1, v2);
+        sumi1 = __dp4a(v1, q8[j+0], sumi1);
+        sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    }
+    return d * (sumi1 + sumi2);
+
+    //// iqs is 0...15
+    //const int ib32 = iqs/2;
+    //const int il = iqs%2;
+    //const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il;
+    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
+    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    //int v1, v2;
+    //int sumi1 = 0, sumi2 = 0;
+    //for (int j = 0; j < 2; ++j) {
+    //    get_int_from_table_16(q4[j], values, v1, v2);
+    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
+    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    //}
+    //return d * (sumi1 + sumi2);
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
+#endif
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
@@ -4876,7 +4544,7 @@ static __device__ __forceinline__ void mul_mat_q(
         *dsi_dst = *dsi_src;
     } else {
         float * dfi_dst = (float *) dsi_dst;
-        *dfi_dst =
+        *dfi_dst = __low2float(*dsi_src);
     }
 }

@@ -5730,10 +5398,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
     }

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (tid == 0) {
 #ifdef GGML_CUDA_F16
@@ -5783,10 +5448,7 @@ static __global__ void mul_mat_p021_f16_f32(
     const int idst = channel*nrows_dst + row_dst;

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (threadIdx.x == 0) {
         dst[idst] = tmp;
@@ -5829,10 +5491,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }

     // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
+    tmp = warp_reduce_sum(tmp);

     if (threadIdx.x == 0) {
         dst[idst] = tmp;
@@ -5872,7 +5531,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
     const int nb12, const int nb13) {
-    const
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= ne) {
         return;
@@ -5880,17 +5539,17 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,

     // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
     // then combine those indices with the corresponding byte offsets to get the total offsets
-    const
-    const
-    const
-    const
-    const
-
-    const
-    const
-    const
-    const
-    const
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

     cpy_1(cx + x_offset, cdst + dst_offset);
 }
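The rewrite above widens every index in cpy_f32_f16 to int64_t so copies over tensors with more than INT_MAX elements cannot overflow; the decomposition itself is standard flat-index peeling. The same pattern in isolation (a sketch; ne0..ne3 are hypothetical extents):

// recover (i3, i2, i1, i0) from a flat index i over an ne0 x ne1 x ne2 x ne3 tensor
const int64_t i3 = i / (ne0*ne1*ne2);
const int64_t i2 = (i - i3*ne0*ne1*ne2) / (ne0*ne1);
const int64_t i1 = (i - i3*ne0*ne1*ne2 - i2*ne0*ne1) / ne0;
const int64_t i0 =  i - i3*ne0*ne1*ne2 - i2*ne0*ne1 - i1*ne0;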
@@ -6216,11 +5875,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
     int ixj = col ^ j;
     if (ixj > col) {
         if ((col & k) == 0) {
-            if (order ==
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
                 swap(dst_row[col], dst_row[ixj]);
             }
         } else {
-            if (order ==
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
                 swap(dst_row[col], dst_row[ixj]);
             }
         }
@@ -6328,6 +5987,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
     // find the sum of exps in the block
     tmp = warp_reduce_sum(tmp);
     if (block_size > WARP_SIZE) {
+        __syncthreads();
         if (warp_id == 0) {
             buf_iw[lane_id] = 0.0f;
         }
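The added __syncthreads() closes a race in the two-stage block reduction: warp 0 could start zeroing buf_iw while other warps were still using the values staged there by the previous reduction round. The general shape of the pattern being guarded (an illustrative sketch, not the full soft_max_f32 kernel):

float v = warp_reduce_sum(x);             // stage 1: each warp reduces its 32 lanes
if (block_size > WARP_SIZE) {
    __syncthreads();                      // all warps done with buf_iw from the last round
    if (warp_id == 0) buf_iw[lane_id] = 0.0f;
    __syncthreads();
    if (lane_id == 0) buf_iw[warp_id] = v;
    __syncthreads();
    v = warp_reduce_sum(buf_iw[lane_id]); // stage 2: combine the per-warp partials
}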
@@ -6379,23 +6039,23 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,

 template <typename T>
 static __global__ void im2col_kernel(
-        const float * x, T * dst,
-
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
-    const
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
     if (i >= pelements) {
         return;
     }

-    const
-    const
-    const
-    const
-    const
+    const int64_t ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t kx = i / ksize;
+    const int64_t kd = kx * ksize;
+    const int64_t ky = (i - kd) / OW;
+    const int64_t ix = i % OW;

-    const
-    const
-    const
+    const int64_t oh = blockIdx.y;
+    const int64_t batch = blockIdx.z / IC;
+    const int64_t ic = blockIdx.z % IC;

     const int64_t iiw = ix * s0 + kx * d0 - p0;
     const int64_t iih = oh * s1 + ky * d1 - p1;
@@ -6721,19 +6381,33 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, const
     concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 }

-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int
+static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
+                             const int scale_factor, cudaStream_t stream) {
     int ne0 = (ne00 * scale_factor);
     int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02);
+    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
     upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
 }

 static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02,
-    const int ne0, const int ne1, const int ne2, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02);
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
+                                        const int dim, const int max_period, cudaStream_t stream) {
+    int half_ceil = (dim + 1) / 2;
+    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne00, 1);
+    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
 }

 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
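The new launchers only set up grid geometry; the matching device kernels (arange_f32, timestep_embedding_f32) are defined elsewhere in this file and are not shown in this diff. For orientation, an arange kernel consistent with this launch configuration would look roughly like this (a sketch, not the upstream body):

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
    // one thread per output element
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= ne0) {
        return;
    }
    dst[i] = start + step * i;
}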
@@ -6843,12 +6517,24 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

+template<typename dst_t>
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

+template<typename dst_t>
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -6861,6 +6547,16 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }

+template<typename dst_t>
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+#if QK_K == 64
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+#else
+    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6898,12 +6594,18 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq2_xxs_cuda;
         case GGML_TYPE_IQ2_XS:
             return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_cuda;
         case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
         default:
@@ -6937,12 +6639,18 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq2_xxs_cuda;
         case GGML_TYPE_IQ2_XS:
             return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_cuda;
         case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
+        case GGML_TYPE_IQ3_S:
+            return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
         default:
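Both dispatch tables return a per-type dequantize function pointer, so a caller converts any supported quantized tensor without switching on the type itself. A typical call-site shape (illustrative only; the variable names are assumptions):

// pick the dequantizer for this tensor's type, then run it on the stream
const to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
to_fp32(src0_dd, dst_f32, ggml_nelements(src0), stream); // launches the matching kernel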
@@ -7764,10 +7472,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co

     const dim3 block_dims(ncols, 1, 1);
     const dim3 block_nums(1, nrows, 1);
-    if (order ==
-        k_argsort_f32_i32<
-    } else if (order ==
-        k_argsort_f32_i32<
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
         GGML_ASSERT(false);
     }
@@ -7832,8 +7540,8 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float *

 template <typename T>
 static void im2col_cuda(const float* x, T* dst,
-
-
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
@@ -7916,8 +7624,8 @@ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual
     *actual_size = look_ahead_size;
     g_cuda_pool_size[device] += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__,
-        (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+        (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     return ptr;
 }
@@ -8003,7 +7711,7 @@ static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual
     g_cuda_pool_used[device] += size;

 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx
+    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
 #endif

     return ptr;
@@ -8013,7 +7721,7 @@ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);

 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: freed %llu bytes at %llx\n",
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
 #endif

     g_cuda_pool_used[device] -= size;
@@ -8199,11 +7907,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(

     cudaMemcpyKind kind;
     char * src_ptr;
-    if (src->backend ==
+    if (src->backend == GGML_BACKEND_TYPE_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend ==
-        GGML_ASSERT(src->backend !=
+    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -8512,7 +8220,7 @@ static void ggml_cuda_op_group_norm(

     int num_groups = dst->op_params[0];
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);

     (void) src1;
     (void) dst;
@@ -8545,7 +8253,7 @@ static void ggml_cuda_op_upscale(

     const int scale_factor = dst->op_params[0];

-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);

     (void) src1;
     (void) dst;
@@ -8561,8 +8269,49 @@ static void ggml_cuda_op_pad(
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

     pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_arange(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
+
+    (void) src0;
+    (void) src1;
+    (void) src0_dd;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_timestep_embedding(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);

     (void) src1;
     (void) dst;
@@ -8608,7 +8357,7 @@ static void ggml_cuda_op_mul_mat_q(

     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = dst->backend ==
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

     switch (src0->type) {
         case GGML_TYPE_Q4_0:
@@ -8685,9 +8434,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8710,9 +8462,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ3_S:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8755,7 +8510,7 @@ static void ggml_cuda_op_mul_mat_vec_q(

     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = dst->backend ==
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

     switch (src0->type) {
         case GGML_TYPE_Q4_0:
@@ -8806,6 +8561,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_q_cuda<QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         case GGML_TYPE_IQ3_XXS:
             mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
@@ -8818,6 +8577,14 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -8927,7 +8694,7 @@ static void ggml_cuda_op_mul_mat_cublas(

     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend ==
+    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff;

     const int compute_capability = g_device_caps[id].cc;

@@ -9275,7 +9042,7 @@ static void ggml_cuda_op_soft_max(
     const bool use_src2 = src2 != nullptr;

     if (use_src2) {
-        const bool src2_on_device = src2->backend ==
+        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;

         if (src2_on_device) {
             ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
@@ -9333,16 +9100,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool use_src1 = src1 != nullptr;
     const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-    GGML_ASSERT(!use_src1 || src1->backend !=
-    GGML_ASSERT( dst->backend !=
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend ==
-    const bool src1_on_device = use_src1 && src1->backend ==
-    const bool dst_on_device = dst->backend ==
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

     // dd = data device
     float * src0_ddf = nullptr;
@@ -9386,7 +9153,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 }
@@ -9467,8 +9234,8 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

-    GGML_ASSERT(dst->backend !=
-    GGML_ASSERT(src1->backend !=
+    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -9484,20 +9251,20 @@ static void ggml_cuda_op_mul_mat(
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend ==
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);

     const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

-    const bool split = src0->backend ==
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));

     std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
     if (split) {
-        // TODO: check that src0->buffer->buft is a split buffer type, replace
+        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check
         // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         tensor_split = buft_ctx->tensor_split;
@@ -9555,8 +9322,8 @@ static void ggml_cuda_op_mul_mat(

         used_devices++;

-        const bool src1_on_device = src1->backend ==
-        const bool dst_on_device = dst->backend ==
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;

         ggml_cuda_set_device(id);
         cudaStream_t stream = g_cudaStreams[id][0];
@@ -9607,8 +9374,8 @@ static void ggml_cuda_op_mul_mat(
             continue;
         }

-        const bool src1_on_device = src1->backend ==
-        const bool dst_on_device = dst->backend ==
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
         const int64_t row_diff = dev[id].row_high - dev[id].row_low;

         ggml_cuda_set_device(id);
@@ -9633,12 +9400,12 @@ static void ggml_cuda_op_mul_mat(

             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
-            if (dst->backend ==
+            if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
                 dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
             }

             // copy src0, src1 to device if necessary
-            if (src1->backend ==
+            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
                 if (id != g_main_device) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -9651,14 +9418,14 @@ static void ggml_cuda_op_mul_mat(
                             src1_ncols*ne10*sizeof(float), stream));
                 }
             }
-            } else if (src1->backend ==
+            } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
                 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                     src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }

-            if (convert_src1_to_q8_1 && (src1->backend ==
+            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
                 quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -9676,10 +9443,10 @@ static void ggml_cuda_op_mul_mat(
             if (!dst_on_device) {
                 void * dst_off_device;
                 cudaMemcpyKind kind;
-                if (dst->backend ==
+                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                     dst_off_device = dst->data;
                     kind = cudaMemcpyDeviceToHost;
-                } else if (dst->backend ==
+                } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
                     dst_off_device = dst_extra->data_device[g_main_device];
                     kind = cudaMemcpyDeviceToDevice;
                 } else {
@@ -9744,7 +9511,7 @@ static void ggml_cuda_op_mul_mat(
         }
     }

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_cuda_set_device(g_main_device);
         CUDA_CHECK(cudaDeviceSynchronize());
     }
@@ -9829,6 +9596,45 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }

+static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float * dst_ddf = nullptr;
+
+    cuda_pool_alloc<float> dst_f;
+
+    ggml_cuda_set_device(g_main_device);
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+    }
+
+    // do the computation
+    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
+static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
+}
+
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -9850,7 +9656,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru

 static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend !=
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -9881,7 +9687,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend !=
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -9940,7 +9746,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src0->backend !=
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);

     GGML_TENSOR_BINARY_OP_LOCALS
@@ -10086,11 +9892,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend ==
-        (src1->backend ==
-        ( dst->backend ==
+        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
+        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
+        ( dst->backend == GGML_BACKEND_TYPE_GPU);

-    const bool split = src0->backend ==
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;

     int64_t min_compute_capability = INT_MAX;

@@ -10240,7 +10046,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src00));
     GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src00->backend !=
+    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

     const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
@@ -10384,7 +10190,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

     cudaStream_t stream = g_cudaStreams[g_main_device][0];

-    if (ids->backend ==
+    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
         CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
         CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -10401,20 +10207,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.backend = GGML_BACKEND_GPU;
-    dst_row.backend = GGML_BACKEND_GPU;
+    src1_row.backend = GGML_BACKEND_TYPE_GPU;
+    dst_row.backend = GGML_BACKEND_TYPE_GPU;
 
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
-    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
         (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+    char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
         (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
 
     if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
 
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;
@@ -10442,9 +10248,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
         src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
 
-        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
             cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
             cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
 
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
@@ -10499,7 +10305,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
         }
     }
 
-    if (dst->backend == GGML_BACKEND_CPU) {
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         CUDA_CHECK(cudaStreamSynchronize(stream));
     }
 }
@@ -10516,8 +10322,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
 
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -10648,9 +10454,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
     if (!g_cublas_loaded) return false;
 
    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
 
     if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
@@ -10729,6 +10535,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_PAD:
             func = ggml_cuda_pad;
             break;
+        case GGML_OP_ARANGE:
+            func = ggml_cuda_arange;
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            func = ggml_cuda_timestep_embedding;
+            break;
         case GGML_OP_LEAKY_RELU:
             func = ggml_cuda_leaky_relu;
             break;
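
The two new dispatch cases route GGML_OP_ARANGE and GGML_OP_TIMESTEP_EMBEDDING to their CUDA kernels. A hedged sketch of building those ops on the graph side; the arena size, timestep count, and the 320/10000 embedding parameters are arbitrary examples, not values from this diff:

    #include "ggml.h"

    // Sketch: build the two new ops; values materialize when the graph runs.
    static void arange_and_timestep_demo(void) {
        struct ggml_init_params ip = {
            /* .mem_size   = */ 16 * 1024 * 1024,  // arbitrary example arena
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        // graph node producing 0, 1, ..., 9 as F32
        struct ggml_tensor * r = ggml_arange(ctx, 0.0f, 10.0f, 1.0f);

        // sinusoidal embeddings: 4 timesteps, dim 320, max period 10000
        struct ggml_tensor * ts  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * emb = ggml_timestep_embedding(ctx, ts, 320, 10000);

        (void) r; (void) emb;
        ggml_free(ctx);
    }
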
@@ -10797,14 +10609,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         return false;
     }
 
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
         ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
     }
 
     if (params->ith != 0) {
         return true;
     }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }
     func(tensor->src[0], tensor->src[1], tensor);
@@ -10832,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description,
 #define UNUSED GGML_UNUSED
 
 struct ggml_backend_cuda_context {
+    explicit ggml_backend_cuda_context(int device) :
+        device(device),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_cuda_context() {
+        if (copy_event != nullptr) {
+            CUDA_CHECK(cudaEventDestroy(copy_event));
+        }
+    }
+
     int device;
     std::string name;
+    cudaEvent_t copy_event = nullptr;
 };
 
 // cuda buffer
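
The backend context now gets a real constructor and destructor, and owns an optional CUDA event (copy_event) that the async copy path further down creates lazily and the destructor releases. The same lazy-event idiom in isolation, with error handling stripped; every name here is illustrative, not from the diff:

    #include <cuda_runtime.h>

    struct lazy_event {
        cudaEvent_t ev = nullptr;

        cudaEvent_t get() {
            if (ev == nullptr) {
                // timing disabled: cheaper to record, sufficient for ordering
                cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
            }
            return ev;
        }

        ~lazy_event() {
            if (ev != nullptr) {
                cudaEventDestroy(ev);
            }
        }
    };
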
@@ -10903,7 +10727,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     extra->data_device[ctx->device] = tensor->data;
 
-    tensor->backend = GGML_BACKEND_GPU;
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 
     if (ggml_is_quantized(tensor->type)) {
@@ -10918,42 +10742,40 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-    CUDA_CHECK(
-    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-    CUDA_CHECK(
-    CUDA_CHECK(cudaDeviceSynchronize());
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
-        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-
-
-
-
-        CUDA_CHECK(
-        CUDA_CHECK(cudaDeviceSynchronize());
-
+        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
+        if (src_ctx->device == dst_ctx->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
+        } else {
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+        }
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
         return true;
     }
     return false;
+
+    UNUSED(buffer);
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
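
The synchronous buffer ops now queue on cudaStreamPerThread and wait only on that stream, rather than calling cudaDeviceSynchronize(), so a blocking copy issued from one host thread no longer serializes against work other threads have in flight on the same device. The pattern in isolation, as a hedged sketch rather than code from the diff:

    #include <cuda_runtime.h>

    // Copy on the calling thread's implicit stream and wait on just that
    // stream; kernels queued on other streams are not serialized.
    static cudaError_t copy_to_device(void * dst, const void * src, size_t n) {
        cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, cudaStreamPerThread);
        if (err != cudaSuccess) {
            return err;
        }
        return cudaStreamSynchronize(cudaStreamPerThread);
    }
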
@@ -11164,7 +10986,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
             CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
         }
     }
-    tensor->backend = GGML_BACKEND_GPU_SPLIT;
+    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
     tensor->extra = extra;
 }
 
@@ -11198,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
         }
 
         const char * buf_host = (const char *)data + offset_split;
-        CUDA_CHECK(
+        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
 }
 
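
The split-buffer path now issues one async copy per device slice and only afterwards synchronizes, so host-to-device transfers overlap across GPUs instead of completing one at a time. The same issue-then-sync shape reduced to a standalone sketch; the device count, pointers, and slice sizes are placeholders, and error checking is omitted:

    #include <cuda_runtime.h>

    // Phase 1: queue an async copy of each slice to its device.
    // Phase 2: wait once per device, after everything is in flight.
    static void scatter_slices(void ** dev_ptr, const char * host, const size_t * sz, int n_dev) {
        size_t off = 0;
        for (int id = 0; id < n_dev; ++id) {
            cudaSetDevice(id);
            cudaMemcpyAsync(dev_ptr[id], host + off, sz[id], cudaMemcpyHostToDevice, cudaStreamPerThread);
            off += sz[id];
        }
        for (int id = 0; id < n_dev; ++id) {
            cudaSetDevice(id);
            cudaStreamSynchronize(cudaStreamPerThread);
        }
    }
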
@@ -11232,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
         }
 
         char * buf_host = (char *)data + offset_split;
-        CUDA_CHECK(
+        CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     }
 }
 
@@ -11411,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     return &ggml_backend_cuda_buffer_type_host;
 }
 
+//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
+//    return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+//}
+
 // backend
 
 GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
@@ -11434,31 +11268,71 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer
 
 GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
 GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
 
-
-
-
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+        return false;
     }
 
-
+    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+
+    // device -> device
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+    if (backend_src != backend_dst) {
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
+        if (!cuda_ctx_src->copy_event) {
+            ggml_cuda_set_device(cuda_ctx_src->device);
+            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+        }
+
+        // copy on src stream
+        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+        } else {
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0]));
+        }
+
+        // record event on src stream
+        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0]));
+
+        // wait on dst stream for the copy to complete
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0));
+    } else {
+        // src and dst are on the same backend
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    }
+    return true;
 }
 
 GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
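
For copies between two different backends, the rewritten function records an event on the source stream and makes the destination stream wait on it, ordering the two queues without blocking the host. The same record/wait pattern in isolation; the streams, devices, and the pre-created event are placeholders, and error checking is omitted:

    #include <cuda_runtime.h>

    // Order work across two streams without a host-side sync: the copy is
    // queued on s_src, an event marks its completion, and s_dst waits on it.
    static void ordered_peer_copy(void * dst, int dst_dev, const void * src, int src_dev,
                                  size_t n, cudaStream_t s_src, cudaStream_t s_dst, cudaEvent_t ev) {
        cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, n, s_src);
        cudaEventRecord(ev, s_src);
        cudaStreamWaitEvent(s_dst, ev, 0);
    }
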
@@ -11469,13 +11343,13 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
 
     ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -11485,13 +11359,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         }
 
 #ifndef NDEBUG
-        assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
+        assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
+                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT);
                 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                 assert(node->src[j]->extra != nullptr);
             }
@@ -11505,7 +11379,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         GGML_ASSERT(ok);
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
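
graph_compute now reports an enum ggml_status instead of a bare bool, so callers can distinguish failure modes. A hedged caller-side sketch, assuming an already-initialized backend and graph (backend, gf) and that ggml_status_to_string from ggml.h is available in this vendored version:

    #include <stdio.h>
    #include "ggml-backend.h"

    static void compute_checked(ggml_backend_t backend, struct ggml_cgraph * gf) {
        enum ggml_status st = ggml_backend_graph_compute(backend, gf);
        if (st != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(st));
        }
    }
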
@@ -11541,7 +11415,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         }
         ggml_type a_type = a->type;
         if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-            a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL) {
+            a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
+            a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
             if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                 return false;
             }
@@ -11623,6 +11498,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
@@ -11632,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
 }
 
+static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    cudaEvent_t event;
+    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+    return new ggml_backend_event {
+        /* .backend = */ backend,
+        /* .context = */ event,
+    };
+}
+
+static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+
+    delete event;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
+
+    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    if (ggml_backend_is_cuda(event->backend)) {
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
+    } else {
+        // untested
+        auto wait_fn = [](void * user_data) {
+            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+            ggml_backend_event_synchronize(event);
+        };
+
+        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+    }
+}
+
+static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
 static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name = */ ggml_backend_cuda_name,
     /* .free = */ ggml_backend_cuda_free,
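
These five callbacks back the new public ggml_backend_event_* API, which lets one backend's queue wait on another's progress (the mechanism underneath pipeline parallelism). A hedged usage sketch against the public API; the two backend handles are assumed to be initialized elsewhere, e.g. two ggml_backend_cuda_init() results:

    #include "ggml-backend.h"

    static void sync_two_backends(ggml_backend_t backend_a, ggml_backend_t backend_b) {
        ggml_backend_event_t ev = ggml_backend_event_new(backend_a);

        // ... enqueue work on backend_a ...
        ggml_backend_event_record(ev);

        // backend_b's queue will not pass this point until the event fires
        ggml_backend_event_wait(backend_b, ev);
        // ... enqueue dependent work on backend_b ...

        ggml_backend_event_synchronize(ev); // optional host-side wait
        ggml_backend_event_free(ev);
    }
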
@@ -11645,8 +11568,18 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
+    /* .event_new = */ ggml_backend_cuda_event_new,
+    /* .event_free = */ ggml_backend_cuda_event_free,
+    /* .event_record = */ ggml_backend_cuda_event_record,
+    /* .event_wait = */ ggml_backend_cuda_event_wait,
+    /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
 };
 
+static ggml_guid_t ggml_backend_cuda_guid() {
+    static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
+    return &guid;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
@@ -11658,12 +11591,14 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_cuda_set_main_device(device);
 
-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
-        /* .device = */ device,
-        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
-    };
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        return nullptr;
+    }
 
     ggml_backend_t cuda_backend = new ggml_backend {
+        /* .guid      = */ ggml_backend_cuda_guid(),
         /* .interface = */ ggml_backend_cuda_interface,
         /* .context   = */ ctx
     };
@@ -11672,7 +11607,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
 }
 
 GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_cuda_name;
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
 }
 
 GGML_CALL int ggml_backend_cuda_get_device_count() {
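
Backends are now identified by a 16-byte GUID compared with ggml_guid_matches(), rather than by comparing function pointers, which is more robust across shared-library boundaries where the same function can appear at different addresses. A hedged sketch of the same check for a hypothetical backend; my_backend_guid and backend_is_mine are made-up names, and reading backend->guid assumes the internal ggml-backend-impl.h struct layout:

    #include "ggml.h"
    #include "ggml-backend-impl.h" // for the ggml_backend struct's guid field

    static ggml_guid_t my_backend_guid() {
        static ggml_guid guid = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                                  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10 };
        return &guid;
    }

    static bool backend_is_mine(ggml_backend_t backend) {
        return backend != NULL && ggml_guid_matches(backend->guid, my_backend_guid());
    }
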