llama_cpp 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
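The hunk above adds a portable fallback for `__vsubss4` on HIP toolchains whose compiler lacks `__builtin_elementwise_sub_sat`: each byte is subtracted in 16-bit arithmetic and clamped to the int8 range. A host-side sketch of that clamping, for illustration only (`vsubss4_ref` is a made-up name, not part of the gem):

```cpp
// Host-side sketch of the per-byte saturating subtraction that the new
// __vsubss4 fallback implements; the real device code works on an
// ext_vector_type(4) of int8_t. vsubss4_ref is a made-up name.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

static int vsubss4_ref(int a, int b) {
    int8_t va[4], vb[4], vc[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int i = 0; i < 4; ++i) {
        int16_t tmp = int16_t(va[i]) - int16_t(vb[i]);
        if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        vc[i] = (int8_t)tmp;
    }
    int c;
    std::memcpy(&c, vc, 4);
    return c;
}

int main() {
    // 100 - (-100) saturates each byte to 127 (0x7f) instead of wrapping.
    const int8_t xa[4] = {100, 100, 100, 100};
    const int8_t xb[4] = {-100, -100, -100, -100};
    int a, b;
    std::memcpy(&a, xa, 4);
    std::memcpy(&b, xb, 4);
    std::printf("0x%08x\n", (unsigned)vsubss4_ref(a, b)); // prints 0x7f7f7f7f
    return 0;
}
```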
@@ -306,11 +323,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half
+    half    dm[2];             // super-block scales/mins
     uint8_t scales[2];         // 4-bit block scales/mins
     uint8_t qs[QK_K/2];        // 4-bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) ==
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm;                  // super-block scale for quantized scales/mins
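With `GGML_QKK_64` (where QK_K is 64), the new `dm[2]` layout stores the super-block scale and min as two adjacent `half` values, and the restored `static_assert` pins the struct at sizeof(half2) + QK_K/2 + 2 = 4 + 32 + 2 = 38 bytes. A compile-time sketch of that arithmetic using placeholder 2-byte types (the struct name is hypothetical, not from the gem):

```cpp
// Compile-time sketch of the q4_K block size under GGML_QKK_64 (QK_K == 64).
// `half`/`half2` are 2-/4-byte placeholders for the CUDA types.
#include <cstdint>

typedef uint16_t half;            // stand-in for CUDA __half (2 bytes)
struct half2 { half x, y; };      // stand-in for CUDA half2 (4 bytes)

constexpr int QK_K = 64;

struct block_q4_K_qkk64 {
    half    dm[2];                // super-block scale and min
    uint8_t scales[2];            // 4-bit block scales/mins
    uint8_t qs[QK_K/2];           // 4-bit quants
};

static_assert(sizeof(block_q4_K_qkk64) == sizeof(half2) + QK_K/2 + 2,
              "expected 4 + 32 + 2 = 38 bytes");
```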
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-
-
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-
-
-
-
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean
-    var =
-    const float
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-
-
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-
-
-
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
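`norm_f32` now carries the running sum and sum of squares in a single `float2`, reduces both across the warp with `__shfl_xor_sync` (and across warps through shared memory when `block_size > WARP_SIZE`), and derives the variance as E[x²] − E[x]². A host-side reference of the per-row math, sketch only (`norm_f32_ref` is a made-up name):

```cpp
// Host-side reference for the per-row math of the rewritten norm_f32 kernel:
// one pass accumulates sum and sum of squares, then var = E[x^2] - E[x]^2.
#include <cmath>

void norm_f32_ref(const float * x, float * dst, int nrows, int ncols, float eps = 1e-5f) {
    for (int row = 0; row < nrows; ++row) {
        float sum = 0.0f, sumsq = 0.0f;          // mean_var.x / mean_var.y in the kernel
        for (int col = 0; col < ncols; ++col) {
            const float xi = x[row*ncols + col];
            sum   += xi;
            sumsq += xi * xi;
        }
        const float mean    = sum / ncols;
        const float var     = sumsq / ncols - mean * mean;
        const float inv_std = 1.0f / std::sqrt(var + eps);
        for (int col = 0; col < ncols; ++col) {
            dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
        }
    }
}
```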
@@ -737,8 +787,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
@@ -1155,8 +1205,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2895,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->
-    const float dmin = bq4_K->
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2979,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
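Under `QK_K == 64` the q4_K super-block stores `dm` as two separate `half` values rather than a `half2`, so the tile loader above assembles the `half2` element-wise. An equivalent device-side sketch using the `__halves2half2` intrinsic from `cuda_fp16.h` (`make_dm` is a hypothetical helper, not part of ggml-cuda.cu):

```cuda
// Sketch: packing two separate half values into a half2, equivalent to the
// {bxi->dm[0], bxi->dm[1]} initializer in the QK_K != 256 branch above.
#include <cuda_fp16.h>

static __device__ __forceinline__ __half2 make_dm(const __half d, const __half m) {
    return __halves2half2(d, m);   // low half = d (scale), high half = m (min)
}
```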
@@ -3119,7 +3173,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4180,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
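The launchers above now pick the kernel instantiation from the row width: up to 1023 columns a single warp handles each row, otherwise a 1024-thread block is launched so the kernels' shared-memory reduction path is used. A stripped-down sketch of the same dispatch pattern (`my_kernel`/`my_kernel_cuda` are stand-ins, not ggml functions):

```cuda
// Sketch of the "one warp vs. full block" dispatch used by norm_f32_cuda and
// rms_norm_f32_cuda above.
#define WARP_SIZE 32

template <int block_size>
__global__ void my_kernel(const float * x, float * dst, const int ncols) {
    // block_size is a compile-time constant, so the per-row stride and any
    // "block_size > WARP_SIZE" shared-memory branch resolve at compile time.
    const int row = blockIdx.x;
    for (int col = threadIdx.x; col < ncols; col += block_size) {
        dst[row*ncols + col] = x[row*ncols + col];
    }
}

static void my_kernel_cuda(const float * x, float * dst, int ncols, int nrows, cudaStream_t stream) {
    if (ncols < 1024) {
        my_kernel<WARP_SIZE><<<nrows, WARP_SIZE, 0, stream>>>(x, dst, ncols);
    } else {
        my_kernel<1024><<<nrows, 1024, 0, stream>>>(x, dst, ncols);
    }
}
```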
@@ -4709,6 +4775,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4808,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4968,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(
-    const dim3 block_dims(1,
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
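The new `GGML_ASSERT(ncols % 2 == 0)` reflects that each RoPE thread rotates a pair of columns, which is also why the grid's x-dimension is a ceiling division by 2*CUDA_ROPE_BLOCK_SIZE. A worked example of that launch arithmetic (sketch; assumes CUDA_ROPE_BLOCK_SIZE is 256, and the row width of 4096 is arbitrary):

```cpp
// Sketch: the grid sizing performed by rope_f32_cuda, assuming
// CUDA_ROPE_BLOCK_SIZE == 256. The ncols value is an arbitrary example.
#include <cstdio>

int main() {
    const int CUDA_ROPE_BLOCK_SIZE = 256;
    const int ncols = 4096;   // must be even: each thread rotates one column pair
    // ceil(ncols / (2*CUDA_ROPE_BLOCK_SIZE)) blocks along x
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    std::printf("num_blocks_x = %d\n", num_blocks_x);   // (4096 + 511) / 512 = 8
    return 0;
}
```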
@@ -4908,7 +4977,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                                const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -6328,9 +6398,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 