llama_cpp 0.9.4 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +109 -58
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +68 -22
- data/ext/llama_cpp/src/ggml.h +10 -3
- data/ext/llama_cpp/src/llama.cpp +52 -38
- data/ext/llama_cpp/src/llama.h +1 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4fd4e1a5e4d7e2442ab43255996da3ce92f898f9876f1bda343e2433c5050dd7
+  data.tar.gz: dece2da6c9befa15e6990d18fb58e2bf13d8da6c62033969b6b5104f82df736d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 51a383690b6e90e9493e1f318e916dfd94a909f4e554afd8ea822d047f05e96be3e2f371e83f0da5a37a9837d9ae5ecc6992bb9d9c0fd60a9de521bcd148e8f7
+  data.tar.gz: 15bbe94edb232d1979f2907c6c3ab7325a1089f9dcdd5d4262d7f0955fd6183e6b01cfee16593165f6e9901991e765ea30740bc1a83cca8fad60df4417551e3b
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.4...v0.9.5)] - 2023-12-02
+
+- Bump bundled llama.cpp from b1555 to b1593.
+
 ## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25
 
 - Bump bundled llama.cpp from b1523 to b1555.
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->
+    size_t cur_max = (char*)addr - (char*)alloc->base + size;
     if (cur_max > alloc->max_size) {
         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,6 +502,31 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
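The three warp_reduce_* helpers added above are hoisted near the top of the file so that both the norm kernels and the new fused soft-max kernel can share them. Each one is a butterfly reduction: XOR-ing the lane index with masks 16, 8, 4, 2, 1 pairs every lane with a different partner per step, so after five __shfl_xor_sync exchanges all 32 lanes of the warp hold the full sum (or max). A minimal standalone sketch of the same pattern (the demo kernel and names are illustrative, not part of the diff):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Butterfly sum across one 32-lane warp: after the loop, every lane holds the total.
    static __device__ __forceinline__ float warp_sum(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }

    __global__ void warp_sum_demo(float * out) {
        const float total = warp_sum(1.0f); // each of the 32 lanes contributes 1.0
        if (threadIdx.x == 0) {
            *out = total;                   // 32.0
        }
    }

    int main() {
        float * d_out = nullptr;
        float   h_out = 0.0f;
        cudaMalloc(&d_out, sizeof(float));
        warp_sum_demo<<<1, 32>>>(d_out);
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("warp sum = %.1f\n", h_out);  // prints 32.0
        cudaFree(d_out);
        return 0;
    }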
@@ -577,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
 template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -624,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4610,8 +4619,8 @@ static __global__ void rope(
 
 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -4620,23 +4629,25 @@ static __global__ void rope_neox(
     }
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
-
-    const float cur_rot = -float(col)/ncols;
+    float cur_rot = inv_ndims * ic - ib;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 
     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
-    const float x1 = x[i +
+    const float x1 = x[i + n_dims/2];
 
-    dst[i + 0]
-    dst[i +
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
 static __global__ void rope_glm_f32(
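The rewritten rope_neox supports a partial rotation (n_dims < ncols): ib selects which n_dims-wide block a column falls in and ic is the offset inside it, while theta_scale = freq_base^(-2/n_dims) and inv_ndims = -1/n_dims are precomputed on the host so the kernel saves one powf per element. A host-side sketch of the rotation applied to one (x0, x1) pair in the common full-rotation case (ib == 0), with the YaRN correction (rope_yarn) replaced by a plain cos/sin for clarity — the function name and simplification are mine, not from the diff:

    #include <cmath>

    // NeoX-style RoPE for one pair: element ic of an n_dims block pairs with ic + n_dims/2.
    // p = token position; freq_base and freq_scale as passed to the launcher.
    static void rope_neox_pair(float x0, float x1, int p, int ic, int n_dims,
                               float freq_base, float freq_scale,
                               float * y0, float * y1) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims);          // host precompute
        const float theta = p * freq_scale * powf(theta_scale, ic/2.0f);  // per-pair angle
        const float c = cosf(theta);
        const float s = sinf(theta);
        *y0 = x0*c - x1*s; // written to dst[i + 0]
        *y1 = x0*s + x1*c; // written to dst[i + n_dims/2]
    }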
@@ -4715,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 }
 
-
-
-
-    const int
-
-    const int
+static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = blockDim.x;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
 
     float max_val = -INFINITY;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int
-
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
     }
 
     // find the max value in the block
-
-
-
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val);
     }
 
     float tmp = 0.f;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int
-        const
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
         tmp += val;
-        dst[
+        dst[ix] = val;
     }
 
-    // sum
-
-
-
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float inv_tmp = 1.f / tmp;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i =
+        const int i = rowx*ncols + col;
         dst[i] *= inv_tmp;
     }
 }
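The rewritten kernel fuses what the graph previously expressed as three ops (scale, mask add, soft max) into a single pass: it computes soft_max(x*scale + mask) row by row, broadcasting the mask through rowy = rowx % nrows_y, and uses the shared-memory buf plus the warp_reduce_* helpers for a two-level (warp, then block) reduction. A scalar CPU reference of the same math, useful for checking the kernel (names are illustrative):

    #include <cmath>
    #include <cstddef>

    // dst = soft_max(x*scale + mask), row-wise; mask has nrows_y rows broadcast over x's rows.
    static void soft_max_ref(const float * x, const float * mask, float * dst,
                             int ncols, int nrows, int nrows_y, float scale) {
        for (int r = 0; r < nrows; ++r) {
            const float * xr = x + (size_t) r*ncols;
            const float * mr = mask ? mask + (size_t)(r % nrows_y)*ncols : NULL;
            float       * dr = dst + (size_t) r*ncols;

            // pass 1: running maximum of the fused input, for numerical stability
            float max_val = -INFINITY;
            for (int c = 0; c < ncols; ++c) {
                const float v = xr[c]*scale + (mr ? mr[c] : 0.0f);
                max_val = v > max_val ? v : max_val;
            }

            // pass 2: exponentiate once, remember the results, accumulate the sum
            float sum = 0.0f;
            for (int c = 0; c < ncols; ++c) {
                const float v = expf((xr[c]*scale + (mr ? mr[c] : 0.0f)) - max_val);
                dr[c] = v;
                sum  += v;
            }

            // pass 3: normalize
            const float inv_sum = 1.0f/sum;
            for (int c = 0; c < ncols; ++c) {
                dr[c] *= inv_sum;
            }
        }
    }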
@@ -5739,20 +5779,26 @@ static void rope_cuda(
 
 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
     if (pos == nullptr) {
         rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows,
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     } else {
         rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows,
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     }
 }
@@ -5784,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-
+static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth,     1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
-    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }
 
 static void im2col_f32_f16_cuda(const float * x, half * dst,
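Instead of a fixed launch shape, the launcher now sizes the block to the smallest power of two that covers the row, capped at CUDA_SOFT_MAX_BLOCK_SIZE, so short rows do not waste idle threads. The doubling loop is the usual next-power-of-two idiom; extracted as a sketch (function name is illustrative):

    // Smallest power of two >= n, starting at the warp size and capped at cap.
    static int pick_block_size(int n, int cap) {
        int nth = 32; // WARP_SIZE
        while (nth < n && nth < cap) {
            nth *= 2;
        }
        return nth;   // e.g. n = 100 -> 128; n = 5000 with cap = 1024 -> 1024
    }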
@@ -6707,15 +6755,14 @@ inline void ggml_cuda_op_rope(
         GGML_ASSERT(false);
         rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
-        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else {
@@ -6839,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
-
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
data/ext/llama_cpp/src/ggml-metal.m
CHANGED
@@ -1028,20 +1028,27 @@ void ggml_metal_graph_compute(
                         int nth = 32; // SIMD width
 
                         if (ne00%4 == 0) {
+                            while (nth < ne00/4 && nth < 256) {
+                                nth *= 2;
+                            }
                             [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                         } else {
-
+                            while (nth < ne00 && nth < 1024) {
                                 nth *= 2;
-                            }
-                            nth /= 2;
+                            }
                             [encoder setComputePipelineState:ctx->pipeline_soft_max];
                         }
-
-                        [
-
-                        [encoder
-                        [encoder
-                        [encoder
+
+                        const float scale = ((float *) dst->op_params)[0];
+
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
+                        [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
+                        [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                         [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
@@ -1351,15 +1358,19 @@ void ggml_metal_graph_compute(
                         float eps;
                         memcpy(&eps, dst->op_params, sizeof(float));
 
-
+                        int nth = 32; // SIMD width
+
+                        while (nth < ne00/4 && nth < 1024) {
+                            nth *= 2;
+                        }
 
                         [encoder setComputePipelineState:ctx->pipeline_rms_norm];
-                        [encoder setBuffer:id_src0 offset:offs_src0
-                        [encoder setBuffer:id_dst offset:offs_dst
-                        [encoder setBytes:&ne00
-                        [encoder setBytes:&nb01
-                        [encoder setBytes:&eps
-                        [encoder setThreadgroupMemoryLength:
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
                         const int64_t nrows = ggml_nrows(src0);
 
@@ -1433,7 +1444,8 @@ void ggml_metal_graph_compute(
                         const int n_past     = ((int32_t *) dst->op_params)[0];
                         const int n_dims     = ((int32_t *) dst->op_params)[1];
                         const int mode       = ((int32_t *) dst->op_params)[2];
-
+                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
 
                         float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                         memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
data/ext/llama_cpp/src/ggml-metal.metal
CHANGED
@@ -39,6 +39,8 @@ typedef struct {
     int8_t qs[QK8_0]; // quants
 } block_q8_0;
 
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
 // general-purpose kernel for addition of two tensors
 // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
 // cons: not very efficient
@@ -180,10 +182,12 @@ kernel void kernel_gelu(
 
 kernel void kernel_soft_max(
         device const float * src0,
+        device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
+        constant     float & scale,
         threadgroup float  * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
@@ -194,73 +198,77 @@ kernel void kernel_soft_max(
     const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
-    device const float * psrc0 =
-    device
+    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    device const float * pmask = src1 ? src1 + i01*ne00 : nullptr;
+    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
 
     // parallel max
-    float lmax =
+    float lmax = -INFINITY;
 
-    for (int i00 = tpitg
-        lmax = MAX(lmax, psrc0[i00]);
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }
 
-
-
-
-
+    // find the max value in the block
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-
-            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-        }
-    }
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
 
     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp(psrc0[i00] -
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum += exp_psrc0;
-        // Remember the result of exp here. exp is expensive, so we really do not
-        // wish to compute it twice.
         pdst[i00] = exp_psrc0;
     }
 
     float sum = simd_sum(lsum);
-    if (
-
-
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-
-            buf[tpitg] += buf[tpitg + i];
-        }
-    }
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
 
-
+    const float inv_sum = 1.0f/sum;
 
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        pdst[i00]
+        pdst[i00] *= inv_sum;
     }
 }
 
 kernel void kernel_soft_max_4(
         device const float * src0,
+        device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
+        constant     float & scale,
         threadgroup float  * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
@@ -271,64 +279,68 @@ kernel void kernel_soft_max_4(
     const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
-    device const float4 * psrc4 =
-    device
+    device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+    device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 =
+    float4 lmax4 = -INFINITY;
 
-    for (int i00 = tpitg
-        lmax4 = fmax(lmax4, psrc4[i00]);
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }
 
     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
-    }
 
-
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
 
-
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-        }
-    }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
 
     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp(psrc4[i00] -
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
 
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
     float sum = simd_sum(lsum);
-    if (
-
-
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-
-            buf[tpitg] += buf[tpitg + i];
-        }
-    }
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
 
-
+    const float inv_sum = 1.0f/sum;
 
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        pdst4[i00]
+        pdst4[i00] *= inv_sum;
     }
 }
@@ -435,14 +447,13 @@ kernel void kernel_rms_norm(
         constant   int64_t & ne00,
         constant  uint64_t & nb01,
         constant     float & eps,
-        threadgroup float  *
+        threadgroup float  * buf [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x
-    device const float  * x_scalar = (device const float *) x;
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
 
     float4 sumf = 0;
     float all_sum = 0;
@@ -453,40 +464,30 @@ kernel void kernel_rms_norm(
     }
     all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
     all_sum = simd_sum(all_sum);
-    if (
-
-
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
-
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-    }
-    if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
-            sum[0] += x_scalar[i];
+        if (tiisg == 0) {
+            buf[sgitg] = all_sum;
         }
-        sum[0] /= ne00;
-    }
 
-
+        threadgroup_barrier(mem_flags::mem_threadgroup);
 
-
+        all_sum = buf[tiisg];
+        all_sum = simd_sum(all_sum);
+    }
+
+    const float mean = all_sum/ne00;
     const float scale = 1.0f/sqrt(mean + eps);
 
     device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    device float * y_scalar = (device float *) y;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
-    if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
-            y_scalar[i00] = x_scalar[i00] * scale;
-        }
-    }
 }
 
 // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
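Besides adopting the same shared-buf two-level simdgroup reduction as the soft-max kernels, the reworked kernel_rms_norm drops the scalar tail loop (x_scalar/y_scalar), so rows are processed purely in float4 chunks. The math itself is unchanged: y_i = x_i / sqrt(mean(x^2) + eps). A scalar reference for one row (function name is illustrative):

    #include <math.h>

    // RMS norm over one row of n floats: y = x / sqrt(mean(x^2) + eps).
    static void rms_norm_ref(const float * x, float * y, int n, float eps) {
        float sumsq = 0.0f;
        for (int i = 0; i < n; ++i) {
            sumsq += x[i]*x[i];
        }
        const float scale = 1.0f/sqrtf(sumsq/n + eps);
        for (int i = 0; i < n; ++i) {
            y[i] = x[i]*scale;
        }
    }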
@@ -576,7 +577,6 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 //  quantizations where the block size is 32. It also does not
 //  giard against the number of rows not being divisible by
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED
@@ -1,20 +1,18 @@
+#include "ggml.h"
 #include "ggml-opencl.h"
 
 #include <array>
 #include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <sstream>
 #include <vector>
-#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ggml.h"
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale,
         bool                  inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;
 
     if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
 
     return result;
 }
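The scale travels to the backends through the tensor's op_params array as raw bytes, which is why every consumer reads it back with memcpy rather than a cast: memcpy preserves the bit pattern, while an int-to-float cast would convert the value. A sketch of the round trip (ggml_set_op_params is essentially a bounds-checked memcpy into result->op_params; the standalone array here is a stand-in):

    #include <stdint.h>
    #include <string.h>

    static int32_t op_params[16]; // stand-in for ggml_tensor::op_params

    static void pack_scale(float scale) {
        float params[] = { scale };
        memcpy(op_params, params, sizeof(params)); // bit-exact store
    }

    static float unpack_scale(void) {
        float scale = 1.0f;
        memcpy(&scale, op_params, sizeof(float));  // bit-exact load, as the backends do
        return scale;
    }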
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 
 // ggml_soft_max_back
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     // TODO: find the optimal values for these
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+        //src0->type == GGML_TYPE_F32 &&
         src1->type == GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
@@ -10551,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-              struct ggml_tensor *
-
-
-
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
|
|
10575
10602
|
const int ir0 = dr*ith;
|
10576
10603
|
const int ir1 = MIN(ir0 + dr, nr);
|
10577
10604
|
|
10605
|
+
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
10606
|
+
|
10578
10607
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
10579
|
-
float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10580
|
-
float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10608
|
+
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10609
|
+
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10610
|
+
|
10611
|
+
// broadcast the mask across rows
|
10612
|
+
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
10613
|
+
|
10614
|
+
ggml_vec_cpy_f32 (nc, wp, sp);
|
10615
|
+
ggml_vec_scale_f32(nc, wp, scale);
|
10616
|
+
if (mp) {
|
10617
|
+
ggml_vec_acc_f32(nc, wp, mp);
|
10618
|
+
}
|
10581
10619
|
|
10582
10620
|
#ifndef NDEBUG
|
10583
10621
|
for (int i = 0; i < nc; ++i) {
|
10584
10622
|
//printf("p[%d] = %f\n", i, p[i]);
|
10585
|
-
assert(!isnan(
|
10623
|
+
assert(!isnan(wp[i]));
|
10586
10624
|
}
|
10587
10625
|
#endif
|
10588
10626
|
|
10589
10627
|
float max = -INFINITY;
|
10590
|
-
ggml_vec_max_f32(nc, &max,
|
10628
|
+
ggml_vec_max_f32(nc, &max, wp);
|
10591
10629
|
|
10592
10630
|
ggml_float sum = 0.0;
|
10593
10631
|
|
10594
10632
|
uint16_t scvt;
|
10595
10633
|
for (int i = 0; i < nc; i++) {
|
10596
|
-
if (
|
10634
|
+
if (wp[i] == -INFINITY) {
|
10597
10635
|
dp[i] = 0.0f;
|
10598
10636
|
} else {
|
10599
|
-
// const float val = (
|
10600
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(
|
10637
|
+
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
|
10638
|
+
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
|
10601
10639
|
memcpy(&scvt, &s, sizeof(scvt));
|
10602
10640
|
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
10603
10641
|
sum += (ggml_float)val;
|
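The new wp pointer gives each worker thread a private scratch row inside params->wdata, so the scaled-and-masked values are materialized once and reused by the max, exp, and sum passes. Rows are padded by CACHE_LINE_SIZE_F32 floats so adjacent threads never write to the same cache line (avoiding false sharing); the buffer itself is sized by the matching GGML_OP_SOFT_MAX case added to ggml_graph_plan further down. The addressing, as a sketch with an assumed constant value (ggml derives it from the cache line size in floats, 64/4 = 16 on most targets):

    #include <stddef.h>

    #define CACHE_LINE_SIZE_F32 16 // assumption: 64-byte cache line / sizeof(float)

    // Thread ith gets nc floats of scratch plus one cache line of padding.
    static float * thread_scratch(float * wdata, int nc, int ith) {
        return wdata + (size_t)(nc + CACHE_LINE_SIZE_F32)*ith;
    }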
@@ -10622,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-              struct ggml_tensor *
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -13863,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
@@ -15689,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
         default:
             {
-
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 GGML_ASSERT(false);
             } break;
     }
@@ -15898,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
+            case GGML_OP_SOFT_MAX:
+                {
+                    n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+                } break;
             case GGML_OP_CONV_TRANSPOSE_1D:
                 {
                     GGML_ASSERT(node->src[0]->ne[3] == 1);
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
             fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             ggml_print_backtrace(); \
-
+            abort(); \
         } \
     } while (0)
 
@@ -1283,6 +1282,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
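For graph builders this collapses the former scale → add-mask → soft_max chain into a single node. A hedged usage sketch mirroring how llm_build_kqv in llama.cpp calls it (the wrapper function name is mine; surrounding graph setup is omitted):

    #include "ggml.h"
    #include <math.h>

    // Sketch: build the fused attention soft-max node.
    static struct ggml_tensor * build_attn_probs(
            struct ggml_context * ctx,
            struct ggml_tensor  * kq,       // raw attention scores, K^T * Q
            struct ggml_tensor  * kq_mask,  // optional mask, may be NULL; broadcast across rows
            int                   n_embd_head) {
        return ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));
    }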
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -46,7 +46,6 @@
 #endif
 #include <windows.h>
 #include <io.h>
-#include <stdio.h> // for _fseeki64
 #endif
 
 #include <algorithm>
@@ -1113,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 //
 
 struct llama_state {
+    llama_state() {
+#ifdef GGML_USE_METAL
+        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+#endif
+    }
+
     // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
@@ -2634,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token
-    if (vocab.linefeed_id != -1)    { LLAMA_LOG_INFO( "%s: LF token
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id != -1)    { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
 static void llm_load_tensors(
@@ -3464,7 +3469,7 @@ static void llm_build_k_shift(
        struct ggml_cgraph * graph,
        llm_rope_type type,
        int64_t n_ctx,
-
+       int     n_rot,
        float freq_base,
        float freq_scale,
        const llm_build_cb & cb) {
@@ -3496,7 +3501,7 @@ static void llm_build_k_shift(
         // we rotate only the first n_rot dimensions
         ggml_rope_custom_inplace(ctx,
                 ggml_view_3d(ctx, kv.k,
-
+                    n_embd_head, n_head_kv, n_ctx,
                     ggml_element_size(kv.k)*n_embd_head,
                     ggml_element_size(kv.k)*n_embd_gqa,
                     ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -3694,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        //
-
-
-
-
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);
+
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }
 
-
-
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);
 
-
-
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }
 
     // split cached v into n_head heads
     struct ggml_tensor * v =
@@ -5031,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kq_scaled_alibi",            OFFLOAD_FUNC_KQ  },
     { "kq_masked",                  OFFLOAD_FUNC_KQ  },
     { "kq_soft_max",                OFFLOAD_FUNC_V   },
+    { "kq_soft_max_ext",            OFFLOAD_FUNC_V   },
     { "v",                          OFFLOAD_FUNC_V   },
     { "kqv",                        OFFLOAD_FUNC_V   },
     { "kqv_merged",                 OFFLOAD_FUNC_V   },
@@ -5539,18 +5551,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA    ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON   ||
-        model.arch == LLM_ARCH_REFACT   ||
-        model.arch == LLM_ARCH_MPT      ||
-        model.arch == LLM_ARCH_STARCODER ||
-        model.arch == LLM_ARCH_STABLELM;
-
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() &&
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
 
@@ -6409,10 +6411,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char         * src,
+        size_t               n_src,
         llama_partial_utf8   partial_start) {
     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char          * pos      = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t              value    = partial_start.value;
     int                   n_remain = partial_start.n_remain;
 
@@ -6463,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        std::string src,
+        llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start);
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
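The new overload simply forwards the string's byte length so decode_utf8 can reserve the code-point vector up front (the inline comment notes that for common English text, bytes and code points are roughly one-to-one). The lookup table it relies on maps the high nibble of a lead byte to the UTF-8 sequence length, with 0 marking an invalid lead such as a continuation byte. A sketch of that classification (helper name is illustrative):

    // High nibble of the lead byte -> sequence length; 0 = continuation/invalid lead.
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };

    static int utf8_seq_len(unsigned char lead) {
        return lookup[lead >> 4]; // 'A' (0x41) -> 1, 0xC3 -> 2, 0xE2 -> 3, 0xF0 -> 4, 0x80 -> 0
    }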
@@ -7016,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     // Replace the data in candidates with the new_candidates data
     std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
     candidates->size = new_candidates.size();
+    candidates->sorted = false;
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7112,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7319,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece
+    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8564,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -9701,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
+#ifdef GGML_USE_METAL
+    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#endif
 }
 
 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -185,7 +185,7 @@ extern "C" {
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
         float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor,
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
         float    yarn_attn_factor; // YaRN magnitude scaling factor
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_slow;   // YaRN high correction dim
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.4'
+  VERSION = '0.9.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1555'
+  LLAMA_CPP_VERSION = 'b1593'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.4
+  version: 0.9.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-12-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.4.
+rubygems_version: 3.4.22
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.