llama_cpp 0.9.4 → 0.9.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +109 -58
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +68 -22
- data/ext/llama_cpp/src/ggml.h +10 -3
- data/ext/llama_cpp/src/llama.cpp +52 -38
- data/ext/llama_cpp/src/llama.h +1 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4fd4e1a5e4d7e2442ab43255996da3ce92f898f9876f1bda343e2433c5050dd7
+  data.tar.gz: dece2da6c9befa15e6990d18fb58e2bf13d8da6c62033969b6b5104f82df736d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 51a383690b6e90e9493e1f318e916dfd94a909f4e554afd8ea822d047f05e96be3e2f371e83f0da5a37a9837d9ae5ecc6992bb9d9c0fd60a9de521bcd148e8f7
+  data.tar.gz: 15bbe94edb232d1979f2907c6c3ab7325a1089f9dcdd5d4262d7f0955fd6183e6b01cfee16593165f6e9901991e765ea30740bc1a83cca8fad60df4417551e3b
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.4...v0.9.5)] - 2023-12-02
+
+- Bump bundled llama.cpp from b1555 to b1593.
+
 ## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25
 
 - Bump bundled llama.cpp from b1523 to b1555.
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->
+    size_t cur_max = (char*)addr - (char*)alloc->base + size;
     if (cur_max > alloc->max_size) {
         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,6 +502,31 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -577,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
 template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -624,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4610,8 +4619,8 @@ static __global__ void rope(
 
 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -4620,23 +4629,25 @@ static __global__ void rope_neox(
     }
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
-
-    const float cur_rot = -float(col)/ncols;
+    float cur_rot = inv_ndims * ic - ib;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 
     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
-    const float x1 = x[i +
+    const float x1 = x[i + n_dims/2];
 
-    dst[i + 0]
-    dst[i +
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
 static __global__ void rope_glm_f32(
@@ -4715,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 }
 
-
-
-
-    const int
-
-    const int
+static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = blockDim.x;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
 
     float max_val = -INFINITY;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int
-
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
    }
 
     // find the max value in the block
-
-
-
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val);
     }
 
     float tmp = 0.f;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int
-        const
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
        tmp += val;
-        dst[
+        dst[ix] = val;
     }
 
-    // sum
-
-
-
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float inv_tmp = 1.f / tmp;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i =
+        const int i = rowx*ncols + col;
         dst[i] *= inv_tmp;
     }
 }
@@ -5739,20 +5779,26 @@ static void rope_cuda(
 
 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
     if (pos == nullptr) {
         rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows,
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     } else {
         rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows,
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     }
 }
@@ -5784,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-
+static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth, 1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
-    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }
 
 static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -6707,15 +6755,14 @@ inline void ggml_cuda_op_rope(
         GGML_ASSERT(false);
         rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
-        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else {
@@ -6839,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
-
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -1028,20 +1028,27 @@ void ggml_metal_graph_compute(
     int nth = 32; // SIMD width
 
     if (ne00%4 == 0) {
+        while (nth < ne00/4 && nth < 256) {
+            nth *= 2;
+        }
         [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
     } else {
-
+        while (nth < ne00 && nth < 1024) {
             nth *= 2;
-        }
-        nth /= 2;
+        }
         [encoder setComputePipelineState:ctx->pipeline_soft_max];
     }
-
-    [
-
-    [encoder
-    [encoder
-    [encoder
+
+    const float scale = ((float *) dst->op_params)[0];
+
+    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+    [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+    [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+    [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
     [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
 } break;
@@ -1351,15 +1358,19 @@ void ggml_metal_graph_compute(
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-
+    int nth = 32; // SIMD width
+
+    while (nth < ne00/4 && nth < 1024) {
+        nth *= 2;
+    }
 
     [encoder setComputePipelineState:ctx->pipeline_rms_norm];
-    [encoder setBuffer:id_src0 offset:offs_src0
-    [encoder setBuffer:id_dst offset:offs_dst
-    [encoder setBytes:&ne00
-    [encoder setBytes:&nb01
-    [encoder setBytes:&eps
-    [encoder setThreadgroupMemoryLength:
+    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+    [encoder setBytes:&eps length:sizeof( float) atIndex:4];
+    [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
     const int64_t nrows = ggml_nrows(src0);
 
@@ -1433,7 +1444,8 @@ void ggml_metal_graph_compute(
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
-
+    // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
data/ext/llama_cpp/src/ggml-metal.metal
CHANGED
@@ -39,6 +39,8 @@ typedef struct {
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
 // general-purpose kernel for addition of two tensors
 // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
 // cons: not very efficient
@@ -180,10 +182,12 @@ kernel void kernel_gelu(
 
 kernel void kernel_soft_max(
     device const float * src0,
+    device const float * src1,
     device float * dst,
     constant int64_t & ne00,
     constant int64_t & ne01,
     constant int64_t & ne02,
+    constant float & scale,
     threadgroup float * buf [[threadgroup(0)]],
     uint tgpig[[threadgroup_position_in_grid]],
     uint tpitg[[thread_position_in_threadgroup]],
@@ -194,73 +198,77 @@ kernel void kernel_soft_max(
     const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
-    device const float * psrc0 =
-    device
+    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    device const float * pmask = src1 ? src1 + i01*ne00 : nullptr;
+    device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
 
     // parallel max
-    float lmax =
+    float lmax = -INFINITY;
 
-    for (int i00 = tpitg
-        lmax = MAX(lmax, psrc0[i00]);
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }
 
-
-
-
-
+    // find the max value in the block
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-
-            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-        }
-    }
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
 
     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp(psrc0[i00] -
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum += exp_psrc0;
-        // Remember the result of exp here. exp is expensive, so we really do not
-        // wish to compute it twice.
         pdst[i00] = exp_psrc0;
     }
 
     float sum = simd_sum(lsum);
-    if (
-
-
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-
-            buf[tpitg] += buf[tpitg + i];
-        }
-    }
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
 
-
+    const float inv_sum = 1.0f/sum;
 
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        pdst[i00]
+        pdst[i00] *= inv_sum;
     }
 }
 
 kernel void kernel_soft_max_4(
     device const float * src0,
+    device const float * src1,
     device float * dst,
     constant int64_t & ne00,
     constant int64_t & ne01,
     constant int64_t & ne02,
+    constant float & scale,
     threadgroup float * buf [[threadgroup(0)]],
     uint tgpig[[threadgroup_position_in_grid]],
     uint tpitg[[thread_position_in_threadgroup]],
@@ -271,64 +279,68 @@ kernel void kernel_soft_max_4(
     const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
-    device const float4 * psrc4 =
-    device
+    device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+    device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 =
+    float4 lmax4 = -INFINITY;
 
-    for (int i00 = tpitg
-        lmax4 = fmax(lmax4, psrc4[i00]);
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }
 
     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
-    }
 
-
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
 
-
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-        }
-    }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
 
     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp(psrc4[i00] -
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
 
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
     float sum = simd_sum(lsum);
-    if (
-
-
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-
-            buf[tpitg] += buf[tpitg + i];
-        }
-    }
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
 
-
+    const float inv_sum = 1.0f/sum;
 
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        pdst4[i00]
+        pdst4[i00] *= inv_sum;
     }
 }
 
@@ -435,14 +447,13 @@ kernel void kernel_rms_norm(
     constant int64_t & ne00,
     constant uint64_t & nb01,
     constant float & eps,
-    threadgroup float *
+    threadgroup float * buf [[threadgroup(0)]],
     uint tgpig[[threadgroup_position_in_grid]],
     uint tpitg[[thread_position_in_threadgroup]],
     uint sgitg[[simdgroup_index_in_threadgroup]],
     uint tiisg[[thread_index_in_simdgroup]],
    uint ntg[[threads_per_threadgroup]]) {
-    device const float4 * x
-    device const float * x_scalar = (device const float *) x;
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
 
     float4 sumf = 0;
     float all_sum = 0;
@@ -453,40 +464,30 @@ kernel void kernel_rms_norm(
     }
     all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
     all_sum = simd_sum(all_sum);
-    if (
-
-
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-    }
-    if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
-            sum[0] += x_scalar[i];
+        if (tiisg == 0) {
+            buf[sgitg] = all_sum;
         }
-        sum[0] /= ne00;
-    }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        all_sum = buf[tiisg];
+        all_sum = simd_sum(all_sum);
+    }
+
+    const float mean = all_sum/ne00;
     const float scale = 1.0f/sqrt(mean + eps);
 
     device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    device float * y_scalar = (device float *) y;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
-    if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
-            y_scalar[i00] = x_scalar[i00] * scale;
-        }
-    }
 }
 
 // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -576,7 +577,6 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 // quantizations where the block size is 32. It also does not
 // giard against the number of rows not being divisible by
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED
@@ -1,20 +1,18 @@
+#include "ggml.h"
 #include "ggml-opencl.h"
 
 #include <array>
 #include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <sstream>
 #include <vector>
-#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ggml.h"
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        struct ggml_tensor * mask,
+        float scale,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
 
     return result;
 }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * mask,
+        float scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 
 // ggml_soft_max_back
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-
+        //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
@@ -10551,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor *
-
-
-
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
 
@@ -10575,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+        ggml_vec_cpy_f32 (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            ggml_vec_acc_f32(nc, wp, mp);
+        }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(
+            assert(!isnan(wp[i]));
         }
 #endif
 
         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max,
+        ggml_vec_max_f32(nc, &max, wp);
 
         ggml_float sum = 0.0;
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (
-                ggml_fp16_t s = GGML_FP32_TO_FP16(
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
@@ -10622,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor *
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -13863,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -15689,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
         default:
             {
-
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 GGML_ASSERT(false);
             } break;
     }
@@ -15898,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+                cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 GGML_ASSERT(node->src[0]->ne[3] == 1);
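The CPU path above stages each row into wp[], applies the scale, adds the optional mask, subtracts the row max and normalizes. As a plain-C illustration of that row-wise computation (the function below is hypothetical, not part of the source, and it skips the f16 exp lookup table that ggml uses):

#include <math.h>

// one row of soft_max(x*scale + mask): stage, find max, exponentiate, normalize
static void soft_max_ext_row(const float * x, const float * mask, float * y, int n, float scale) {
    float max = -INFINITY;
    for (int i = 0; i < n; i++) {
        y[i] = x[i]*scale + (mask ? mask[i] : 0.0f); // same role as wp[] above
        if (y[i] > max) {
            max = y[i];
        }
    }
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        y[i] = expf(y[i] - max); // -INFINITY entries naturally become 0
        sum += y[i];
    }
    const float inv_sum = 1.0f/sum;
    for (int i = 0; i < n; i++) {
        y[i] *= inv_sum;
    }
}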
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
             fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             ggml_print_backtrace(); \
-
+            abort(); \
         } \
     } while (0)
 
@@ -1283,6 +1282,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * mask,
+            float scale);
+
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
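The header only declares the new fused entry point. A minimal caller-side sketch follows; it is illustrative rather than taken from the gem, and it assumes the usual ggml graph helpers bundled with llama.cpp b1593 (ggml_init, ggml_new_graph, ggml_graph_compute_with_ctx):

#include <math.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // attention scores (n_kv x n_tokens) and an additive mask with the same row length
    struct ggml_tensor * kq      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    ggml_set_f32(kq,      1.0f);
    ggml_set_f32(kq_mask, 0.0f);

    // one fused node instead of ggml_scale + ggml_add + ggml_soft_max
    const float kq_scale = 1.0f/sqrtf(64.0f); // e.g. 1/sqrt(n_embd_head)
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, probs);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    ggml_free(ctx);
    return 0;
}

This is the shape of the call that llm_build_kqv in llama.cpp switches to in this release when no ALiBi bias is needed.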
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -46,7 +46,6 @@
 #endif
 #include <windows.h>
 #include <io.h>
-#include <stdio.h> // for _fseeki64
 #endif
 
 #include <algorithm>
@@ -1113,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 //
 
 struct llama_state {
+    llama_state() {
+#ifdef GGML_USE_METAL
+        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+#endif
+    }
+
     // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
@@ -2634,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token
-    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
 static void llm_load_tensors(
@@ -3464,7 +3469,7 @@ static void llm_build_k_shift(
     struct ggml_cgraph * graph,
     llm_rope_type type,
     int64_t n_ctx,
-
+    int n_rot,
     float freq_base,
     float freq_scale,
     const llm_build_cb & cb) {
@@ -3496,7 +3501,7 @@ static void llm_build_k_shift(
         // we rotate only the first n_rot dimensions
         ggml_rope_custom_inplace(ctx,
                 ggml_view_3d(ctx, kv.k,
-
+                    n_embd_head, n_head_kv, n_ctx,
                     ggml_element_size(kv.k)*n_embd_head,
                     ggml_element_size(kv.k)*n_embd_gqa,
                     ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -3694,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        //
-
-
-
-
-
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);
+
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }
 
-
-
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);
 
-
-
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }
 
     // split cached v into n_head heads
     struct ggml_tensor * v =
@@ -5031,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
     { "kq_masked", OFFLOAD_FUNC_KQ },
     { "kq_soft_max", OFFLOAD_FUNC_V },
+    { "kq_soft_max_ext", OFFLOAD_FUNC_V },
     { "v", OFFLOAD_FUNC_V },
     { "kqv", OFFLOAD_FUNC_V },
     { "kqv_merged", OFFLOAD_FUNC_V },
@@ -5539,18 +5551,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT ||
-        model.arch == LLM_ARCH_MPT ||
-        model.arch == LLM_ARCH_STARCODER ||
-        model.arch == LLM_ARCH_STABLELM;
-
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() &&
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
 
@@ -6409,10 +6411,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
+        size_t n_src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;
 
@@ -6463,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        std::string src,
+        llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start);
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7016,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     // Replace the data in candidates with the new_candidates data
     std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
     candidates->size = new_candidates.size();
+    candidates->sorted = false;
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7112,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7319,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece
+    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8564,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -9701,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
+#ifdef GGML_USE_METAL
+    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#endif
 }
 
 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -185,7 +185,7 @@ extern "C" {
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
-        float yarn_ext_factor; // YaRN extrapolation mix factor,
+        float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
         float yarn_attn_factor; // YaRN magnitude scaling factor
         float yarn_beta_fast; // YaRN low correction dim
         float yarn_beta_slow; // YaRN high correction dim
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.4'
+  VERSION = '0.9.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1555'
+  LLAMA_CPP_VERSION = 'b1593'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.4
+  version: 0.9.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-12-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.4.
+rubygems_version: 3.4.22
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.