llama_cpp 0.4.0 → 0.5.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
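The fallback above matters because not every HIP/Clang toolchain provides `__builtin_elementwise_sub_sat`, so the hunk guards it with `__has_builtin` and falls back to a manual clamp. A minimal host-side C++ sketch of the same clamp-to-int8 logic (an assumption for illustration: scalar and plain C++ rather than the on-device 4-lane `int8x4_t` version; `sub_sat_s8` is a hypothetical helper, not part of ggml):

```cpp
// Saturating int8 subtraction, mirroring the #else branch of __vsubss4.
#include <cstdint>
#include <cstdio>
#include <limits>

static int8_t sub_sat_s8(int8_t a, int8_t b) {
    int16_t tmp = static_cast<int16_t>(a) - static_cast<int16_t>(b); // widen so nothing wraps
    if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
    if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
    return static_cast<int8_t>(tmp);
}

int main() {
    printf("%d\n", sub_sat_s8(-128, 1));   // -128: would wrap to 127 without the clamp
    printf("%d\n", sub_sat_s8(100, -100)); //  127: 200 saturates at the int8 maximum
}
```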
@@ -306,11 +323,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half
+    half dm[2];            // super-block scales/mins
     uint8_t scales[2];     // 4-bit block scales/mins
     uint8_t qs[QK_K/2];    // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) ==
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm;              // super-block scale for quantized scales/mins
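The `dm[2]` member packs the super-block scale and min next to each other, so the QKK_64 struct's size can be checked against `sizeof(half2)`. A host-compilable sketch of that layout arithmetic (assumptions: `uint16_t` stands in for CUDA's `half`, which it matches in size, and the stand-in type names are hypothetical):

```cpp
// Layout check for the QKK_64 q4_K block:
// 2 fp16 values + 2 scale bytes + QK_K/2 quant bytes = 38 bytes, no padding.
#include <cstdint>

#define QK_K 64

typedef uint16_t half_t;                  // same size as CUDA half (assumption)
typedef struct { half_t x, y; } half2_t;  // same size as CUDA half2

typedef struct {
    half_t  dm[2];         // super-block scale and min
    uint8_t scales[2];     // 4-bit block scales/mins
    uint8_t qs[QK_K/2];    // 4-bit quants
} block_q4_K;

static_assert(sizeof(block_q4_K) == sizeof(half2_t) + QK_K/2 + 2,
              "wrong q4_K block size/padding");

int main() { return 0; }
```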
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-
-
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-
-
-
-
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean
-    var =
-    const float
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-
-
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-
-
-
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
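The pattern in this hunk is a two-level reduction: `warp_reduce_sum` runs an XOR butterfly across the 32 lanes of a warp, and when `block_size > WARP_SIZE` the per-warp results are staged through shared memory (`s_sum`) and reduced once more. Carrying the sum and the sum of squares in one `float2` lets a single pass produce both moments needed for the mean and variance. A host emulation of the butterfly step (an assumption for illustration; real lanes exchange values with `__shfl_xor_sync`):

```cpp
// Emulates the XOR-butterfly reduction of a 32-lane warp: after five steps
// (mask = 16, 8, 4, 2, 1) every lane holds the sum of all 32 inputs.
#include <cstdio>

int main() {
    float lane[32];
    for (int i = 0; i < 32; i++) lane[i] = float(i); // one value per "lane"

    for (int mask = 16; mask > 0; mask >>= 1) {      // mirrors the kernel's loop
        float next[32];
        for (int i = 0; i < 32; i++) {
            next[i] = lane[i] + lane[i ^ mask];      // stands in for __shfl_xor_sync
        }
        for (int i = 0; i < 32; i++) lane[i] = next[i];
    }
    printf("lane 0: %.0f, lane 31: %.0f\n", lane[0], lane[31]); // both 496 = 0+1+...+31
}
```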
@@ -737,8 +787,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
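With the struct change, `dm[0]` is the super-block scale `d` and `dm[1]` the super-block min `m`; each output is `y = d * scale * q - m * min`, where a block's 4-bit scale and min share one byte of `scales[]`. A small numeric sketch of that formula (hypothetical values, just to make the nibble unpacking concrete):

```cpp
// One q4_K (QKK_64) dequantization step: in the kernel above, the low nibble
// of the scales byte multiplies d, the high nibble multiplies m.
#include <cstdint>
#include <cstdio>

int main() {
    const float   d = 0.5f, m = 0.25f; // dm[0], dm[1] of a hypothetical block
    const uint8_t scale_min = 0x3A;    // scale = 0xA = 10, min = 0x3 = 3
    const uint8_t q = 0x7;             // one 4-bit quant

    const float y = d * (scale_min & 0xF) * q - m * (scale_min >> 4);
    printf("%.2f\n", y);               // 0.5*10*7 - 0.25*3 = 34.25
}
```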
@@ -1155,8 +1205,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2895,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->
-    const float dmin = bq4_K->
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2979,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
@@ -3119,7 +3173,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4180,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
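The launchers now pick the kernels' `block_size` template argument at runtime: rows narrower than 1024 columns keep the one-warp-per-row scheme, while wider rows get a full 1024-thread block, which is what activates the shared-memory stage inside the templated kernels above. A sketch of that dispatch shape (assumptions: simplified and host-only, with the kernel launch replaced by a print; `norm_rows` and `norm_f32_dispatch` are hypothetical names):

```cpp
// Runtime choice of a compile-time block size: the template parameter must be
// a constant, so each branch instantiates its own kernel variant.
#include <cstdio>

constexpr int WARP_SIZE = 32;

template <int block_size>
void norm_rows(int ncols) {
    // Placeholder for norm_f32<block_size><<<nrows, block_dims, 0, stream>>>.
    printf("norm_f32<%d> for ncols=%d\n", block_size, ncols);
}

void norm_f32_dispatch(int ncols) {
    if (ncols < 1024) {
        norm_rows<WARP_SIZE>(ncols); // one warp strides the whole row
    } else {
        norm_rows<1024>(ncols);      // full block per row, shared-memory reduce
    }
}

int main() {
    norm_f32_dispatch(512);  // instantiates norm_rows<32>
    norm_f32_dispatch(4096); // instantiates norm_rows<1024>
}
```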
@@ -4709,6 +4775,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4808,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4968,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(
-    const dim3 block_dims(1,
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4977,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                                const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
|
6328
6398
|
|
6329
6399
|
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6330
6400
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
6401
|
+
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
6331
6402
|
|
6332
6403
|
const int mode = ((int32_t *) dst->op_params)[2];
|
6333
6404
|
const bool is_glm = mode & 4;
|
6405
|
+
|
6334
6406
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
6335
6407
|
}
|
6336
6408
|
|