llama_cpp 0.9.4 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0fe656f26d7680d1b96c6949d40f4f615209c1c752b45ef145ac0f68b4af1d26
- data.tar.gz: fb4d3c5b54a854edeeaf070b5497ba6656a5cff59b6b911b638551462004efb3
+ metadata.gz: 4fd4e1a5e4d7e2442ab43255996da3ce92f898f9876f1bda343e2433c5050dd7
+ data.tar.gz: dece2da6c9befa15e6990d18fb58e2bf13d8da6c62033969b6b5104f82df736d
  SHA512:
- metadata.gz: 6dc8bc34fcb2635e5fa99c31f134dca12af4c48a0c3f1effbbf209e6e3156f1f95bf133ed33c2eabc6e9f7988d668dcbdb0545a3807b38969680618ba8774848
- data.tar.gz: 591d9ed44ed3b3a40424d3903659ad868afff727a2cfaffefd6222ba54f8a51fbfbab109ceea22a9a6bd3ca4661fb3947ca8f3f179ac2d0ad8cf8ba917b30ffe
+ metadata.gz: 51a383690b6e90e9493e1f318e916dfd94a909f4e554afd8ea822d047f05e96be3e2f371e83f0da5a37a9837d9ae5ecc6992bb9d9c0fd60a9de521bcd148e8f7
+ data.tar.gz: 15bbe94edb232d1979f2907c6c3ab7325a1089f9dcdd5d4262d7f0955fd6183e6b01cfee16593165f6e9901991e765ea30740bc1a83cca8fad60df4417551e3b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.9.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.4...v0.9.5)] - 2023-12-02
+
+ - Bump bundled llama.cpp from b1555 to b1593.
+
  ## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25

  - Bump bundled llama.cpp from b1523 to b1555.
@@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {

  #ifdef GGML_ALLOCATOR_DEBUG
  add_allocated_tensor(alloc, tensor);
- size_t cur_max = (char*)addr - (char*)alloc->data + size;
+ size_t cur_max = (char*)addr - (char*)alloc->base + size;
  if (cur_max > alloc->max_size) {
  printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,6 +502,31 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+ }
+ return x;
+ }
+
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ }
+ return x;
+ }
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -577,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

- static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
- a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
- }
- return a;
- }
-
  template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -624,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __device__ __forceinline__ float warp_reduce_sum(float x) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x += __shfl_xor_sync(0xffffffff, x, mask, 32);
- }
- return x;
- }
-
  template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
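The `warp_reduce_*` helpers consolidated above all follow the same XOR-butterfly pattern: at each of the five steps every lane combines its value with the lane whose index differs in one bit, so after log2(32) exchanges all 32 lanes hold the full result. A minimal CPU model of that pattern, written in plain C purely for illustration (the real code exchanges registers with `__shfl_xor_sync` and never touches memory):

```c
#include <stdio.h>

// CPU stand-in for warp_reduce_sum: 32 "lanes" butterfly-reduce a sum.
int main(void) {
    float lane[32], next[32];
    for (int i = 0; i < 32; ++i) lane[i] = (float) i;          // lane i starts with the value i

    for (int mask = 16; mask > 0; mask >>= 1) {                // same loop shape as warp_reduce_sum
        for (int i = 0; i < 32; ++i) next[i] = lane[i] + lane[i ^ mask];
        for (int i = 0; i < 32; ++i) lane[i] = next[i];
    }

    printf("every lane now holds %.0f (expected %d)\n", lane[0], 31 * 32 / 2);
    return 0;
}
```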
@@ -4610,8 +4619,8 @@ static __global__ void rope(

  template<typename T, bool has_pos>
  static __global__ void rope_neox(
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
  ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -4620,23 +4629,25 @@ static __global__ void rope_neox(
  }

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int i = row*ncols + col/2;
+ const int ib = col / n_dims;
+ const int ic = col % n_dims;
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
- const float cur_rot = -float(col)/ncols;
+ float cur_rot = inv_ndims * ic - ib;

  const int p = has_pos ? pos[i2] : 0;
- const float theta_base = p*powf(freq_base, cur_rot);
+ const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

  float cos_theta, sin_theta;
  rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
- const float x1 = x[i + ncols/2];
+ const float x1 = x[i + n_dims/2];

- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
  }

  static __global__ void rope_glm_f32(
@@ -4715,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
  }

- // the CUDA soft max implementation differs from the CPU implementation
- // instead of doubles floats are used
- static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int block_size = blockDim.y;
- const int tid = threadIdx.y;
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+ const int tid = threadIdx.x;
+ const int rowx = blockIdx.x;
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+ const int block_size = blockDim.x;
+
+ const int warp_id = threadIdx.x / WARP_SIZE;
+ const int lane_id = threadIdx.x % WARP_SIZE;
+
+ __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];

  float max_val = -INFINITY;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- max_val = max(max_val, x[i]);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
  }

  // find the max value in the block
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ max_val = warp_reduce_max(max_val);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = -INFINITY;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = max_val;
+ }
+ __syncthreads();
+
+ max_val = buf[lane_id];
+ max_val = warp_reduce_max(max_val);
  }

  float tmp = 0.f;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- const float val = expf(x[i] - max_val);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
  tmp += val;
- dst[i] = val;
+ dst[ix] = val;
  }

- // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ // find the sum of exps in the block
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = 0.f;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = tmp;
+ }
+ __syncthreads();
+
+ tmp = buf[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float inv_tmp = 1.f / tmp;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
+ const int i = rowx*ncols + col;
  dst[i] *= inv_tmp;
  }
  }
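Per row, the rewritten kernel computes the numerically stable softmax of `x*scale + mask`, with the mask row broadcast via `rowy = rowx % nrows_y`. A scalar reference for a single row, kept as a plain-C sketch (names are local to this example, and the row broadcasting is omitted):

```c
#include <math.h>
#include <stdio.h>

// Reference for one row: dst = softmax(x*scale + mask), mask optional.
static void soft_max_row_ref(const float * x, const float * mask, float * dst,
                             int ncols, float scale) {
    float max_val = -INFINITY;
    for (int i = 0; i < ncols; ++i) {
        const float v = x[i]*scale + (mask ? mask[i] : 0.0f);
        if (v > max_val) max_val = v;
    }
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        dst[i] = expf((x[i]*scale + (mask ? mask[i] : 0.0f)) - max_val);
        sum += dst[i];
    }
    const float inv_sum = 1.0f/sum;
    for (int i = 0; i < ncols; ++i) {
        dst[i] *= inv_sum;
    }
}

int main(void) {
    const float x[4]    = { 1.0f, 2.0f, 3.0f, 4.0f };
    const float mask[4] = { 0.0f, 0.0f, -INFINITY, 0.0f };   // masked-out position
    float dst[4];
    soft_max_row_ref(x, mask, dst, 4, 0.5f);
    printf("%f %f %f %f\n", dst[0], dst[1], dst[2], dst[3]); // dst[2] == 0
    return 0;
}
```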
@@ -5739,20 +5779,26 @@ static void rope_cuda(

  template<typename T>
  static void rope_neox_cuda(
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
  float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
  ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float inv_ndims = -1.0f / n_dims;
+
  if (pos == nullptr) {
  rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  } else {
  rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  }
  }
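The two constants hoisted to the host here fold the per-column exponent out of the kernel: `freq_base` no longer needs to be raised to a data-dependent power per element. In the notation of the launcher and kernel arguments above (a sketch of the relation, not new behaviour):

```latex
\theta_{\mathrm{scale}} = \mathrm{freq\_base}^{-2/n_{\mathrm{dims}}},\qquad
\theta_{\mathrm{base}}(p,\mathrm{col}) = p \cdot \mathrm{freq\_scale} \cdot \theta_{\mathrm{scale}}^{\,\mathrm{col}/2}
 = p \cdot \mathrm{freq\_scale} \cdot \mathrm{freq\_base}^{-\mathrm{col}/n_{\mathrm{dims}}},\qquad
\mathrm{inv\_ndims} = -\frac{1}{n_{\mathrm{dims}}}
```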
@@ -5784,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

- static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(1, WARP_SIZE, 1);
+ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+ int nth = WARP_SIZE;
+ while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ const dim3 block_dims(nth, 1, 1);
  const dim3 block_nums(nrows_x, 1, 1);
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
  }

  static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -6707,15 +6755,14 @@ inline void ggml_cuda_op_rope(
  GGML_ASSERT(false);
  rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
- GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
  rope_neox_cuda(
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else if (src0->type == GGML_TYPE_F16) {
  rope_neox_cuda(
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else {
@@ -6839,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
  const int64_t ne00 = src0->ne[0];
- const int64_t nrows = ggml_nrows(src0);
+ const int64_t nrows_x = ggml_nrows(src0);
+ const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;

- soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+ float scale = 1.0f;
+ memcpy(&scale, dst->op_params, sizeof(float));
+
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);

- (void) src1;
  (void) dst;
- (void) src1_dd;
  }

  inline void ggml_cuda_op_scale(
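The CUDA op above reads the scaling factor back out of the destination tensor's `op_params` with `memcpy`; the Metal and CPU paths further down do the same. A self-contained model of that round trip, with a stand-in struct instead of the real `ggml_tensor` (only the mechanism is shown):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_tensor { int32_t op_params[16]; };        // stand-in for ggml_tensor::op_params

int main(void) {
    struct fake_tensor t = {0};

    const float scale_in = 0.125f;                    // e.g. 1/sqrtf(n_embd_head) for d = 64
    memcpy(t.op_params, &scale_in, sizeof(float));    // producer side (ggml_soft_max_impl)

    float scale_out = 1.0f;
    memcpy(&scale_out, t.op_params, sizeof(float));   // consumer side (CPU/CUDA/Metal ops)
    printf("scale = %f\n", scale_out);
    return 0;
}
```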
@@ -1028,20 +1028,27 @@ void ggml_metal_graph_compute(
  int nth = 32; // SIMD width

  if (ne00%4 == 0) {
+ while (nth < ne00/4 && nth < 256) {
+ nth *= 2;
+ }
  [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
  } else {
- do {
+ while (nth < ne00 && nth < 1024) {
  nth *= 2;
- } while (nth <= ne00 && nth <= 1024);
- nth /= 2;
+ }
  [encoder setComputePipelineState:ctx->pipeline_soft_max];
  }
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
- [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
- [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
- [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+
+ const float scale = ((float *) dst->op_params)[0];
+
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+ [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+ [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

  [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;
@@ -1351,15 +1358,19 @@ void ggml_metal_graph_compute(
  float eps;
  memcpy(&eps, dst->op_params, sizeof(float));

- const int nth = MIN(512, ne00);
+ int nth = 32; // SIMD width
+
+ while (nth < ne00/4 && nth < 1024) {
+ nth *= 2;
+ }

  [encoder setComputePipelineState:ctx->pipeline_rms_norm];
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
- [encoder setBytes:&eps length:sizeof( float) atIndex:4];
- [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+ [encoder setBytes:&eps length:sizeof( float) atIndex:4];
+ [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

  const int64_t nrows = ggml_nrows(src0);

@@ -1433,7 +1444,8 @@ void ggml_metal_graph_compute(
  const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+ // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
@@ -39,6 +39,8 @@ typedef struct {
  int8_t qs[QK8_0]; // quants
  } block_q8_0;

+ #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
  // general-purpose kernel for addition of two tensors
  // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
  // cons: not very efficient
@@ -180,10 +182,12 @@ kernel void kernel_gelu(

  kernel void kernel_soft_max(
  device const float * src0,
+ device const float * src1,
  device float * dst,
  constant int64_t & ne00,
  constant int64_t & ne01,
  constant int64_t & ne02,
+ constant float & scale,
  threadgroup float * buf [[threadgroup(0)]],
  uint tgpig[[threadgroup_position_in_grid]],
  uint tpitg[[thread_position_in_threadgroup]],
@@ -194,73 +198,77 @@ kernel void kernel_soft_max(
  const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
  const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);

- device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
- device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+ device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+ device const float * pmask = src1 ? src1 + i01*ne00 : nullptr;
+ device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

  // parallel max
- float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
+ float lmax = -INFINITY;

- for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
- lmax = MAX(lmax, psrc0[i00]);
+ for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
  }

- float max = simd_max(lmax);
- if (tiisg == 0) {
- buf[sgitg] = max;
- }
+ // find the max value in the block
+ float max_val = simd_max(lmax);
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = -INFINITY;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
- }
- }
+ if (tiisg == 0) {
+ buf[sgitg] = max_val;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- max = buf[0];
+ max_val = buf[tiisg];
+ max_val = simd_max(max_val);
+ }

  // parallel sum
  float lsum = 0.0f;
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- const float exp_psrc0 = exp(psrc0[i00] - max);
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
  lsum += exp_psrc0;
- // Remember the result of exp here. exp is expensive, so we really do not
- // wish to compute it twice.
  pdst[i00] = exp_psrc0;
  }

  float sum = simd_sum(lsum);
- if (tiisg == 0) {
- buf[sgitg] = sum;
- }
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = 0.0f;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] += buf[tpitg + i];
- }
- }
+ if (tiisg == 0) {
+ buf[sgitg] = sum;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ sum = buf[tiisg];
+ sum = simd_sum(sum);
+ }

- sum = buf[0];
+ const float inv_sum = 1.0f/sum;

  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- pdst[i00] /= sum;
+ pdst[i00] *= inv_sum;
  }
  }

  kernel void kernel_soft_max_4(
  device const float * src0,
+ device const float * src1,
  device float * dst,
  constant int64_t & ne00,
  constant int64_t & ne01,
  constant int64_t & ne02,
+ constant float & scale,
  threadgroup float * buf [[threadgroup(0)]],
  uint tgpig[[threadgroup_position_in_grid]],
  uint tpitg[[thread_position_in_threadgroup]],
@@ -271,64 +279,68 @@ kernel void kernel_soft_max_4(
  const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
  const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);

- device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
- device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+ device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+ device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+ device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

  // parallel max
- float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
+ float4 lmax4 = -INFINITY;

- for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
- lmax4 = fmax(lmax4, psrc4[i00]);
+ for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
  }

  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
- float max = simd_max(lmax);
- if (tiisg == 0) {
- buf[sgitg] = max;
- }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ float max_val = simd_max(lmax);
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = -INFINITY;
+ }

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
- }
- }
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (tiisg == 0) {
+ buf[sgitg] = max_val;
+ }
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- max = buf[0];
+ max_val = buf[tiisg];
+ max_val = simd_max(max_val);
+ }

  // parallel sum
  float4 lsum4 = 0.0f;
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- const float4 exp_psrc4 = exp(psrc4[i00] - max);
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
  lsum4 += exp_psrc4;
  pdst4[i00] = exp_psrc4;
  }

  const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
  float sum = simd_sum(lsum);
- if (tiisg == 0) {
- buf[sgitg] = sum;
- }
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = 0.0f;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] += buf[tpitg + i];
- }
- }
+ if (tiisg == 0) {
+ buf[sgitg] = sum;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ sum = buf[tiisg];
+ sum = simd_sum(sum);
+ }

- sum = buf[0];
+ const float inv_sum = 1.0f/sum;

  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- pdst4[i00] /= sum;
+ pdst4[i00] *= inv_sum;
  }
  }

@@ -435,14 +447,13 @@ kernel void kernel_rms_norm(
  constant int64_t & ne00,
  constant uint64_t & nb01,
  constant float & eps,
- threadgroup float * sum [[threadgroup(0)]],
+ threadgroup float * buf [[threadgroup(0)]],
  uint tgpig[[threadgroup_position_in_grid]],
  uint tpitg[[thread_position_in_threadgroup]],
  uint sgitg[[simdgroup_index_in_threadgroup]],
  uint tiisg[[thread_index_in_simdgroup]],
  uint ntg[[threads_per_threadgroup]]) {
- device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
- device const float * x_scalar = (device const float *) x;
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);

  float4 sumf = 0;
  float all_sum = 0;
@@ -453,40 +464,30 @@ kernel void kernel_rms_norm(
  }
  all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
  all_sum = simd_sum(all_sum);
- if (tiisg == 0) {
- sum[sgitg] = all_sum;
- }
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = 0.0f;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- sum[tpitg] += sum[tpitg + i];
- }
- }
- if (tpitg == 0) {
- for (int i = 4 * (ne00 / 4); i < ne00; i++) {
- sum[0] += x_scalar[i];
+ if (tiisg == 0) {
+ buf[sgitg] = all_sum;
  }
- sum[0] /= ne00;
- }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- const float mean = sum[0];
+ all_sum = buf[tiisg];
+ all_sum = simd_sum(all_sum);
+ }
+
+ const float mean = all_sum/ne00;
  const float scale = 1.0f/sqrt(mean + eps);

  device float4 * y = (device float4 *) (dst + tgpig*ne00);
- device float * y_scalar = (device float *) y;
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
  y[i00] = x[i00] * scale;
  }
- if (tpitg == 0) {
- for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
- y_scalar[i00] = x_scalar[i00] * scale;
- }
- }
  }

  // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -576,7 +577,6 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
  // putting them in the kernel cause a significant performance penalty
  #define N_DST 4 // each SIMD group works on 4 rows
  #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
- #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
  //Note: This is a template, but strictly speaking it only applies to
  // quantizations where the block size is 32. It also does not
  // giard against the number of rows not being divisible by
@@ -1,20 +1,18 @@
+ #include "ggml.h"
  #include "ggml-opencl.h"

  #include <array>
  #include <atomic>
+ #include <cstdio>
+ #include <cstdlib>
+ #include <cstring>
+ #include <limits>
  #include <sstream>
  #include <vector>
- #include <limits>

  #define CL_TARGET_OPENCL_VERSION 110
  #include <clblast.h>

- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
-
- #include "ggml.h"
-
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
  static struct ggml_tensor * ggml_soft_max_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale,
  bool inplace) {
+ GGML_ASSERT(ggml_is_contiguous(a));
+ if (mask) {
+ GGML_ASSERT(ggml_is_contiguous(mask));
+ GGML_ASSERT(mask->ne[2] == 1);
+ GGML_ASSERT(mask->ne[3] == 1);
+ GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+ }
+
  bool is_node = false;

  if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+ float params[] = { scale };
+ ggml_set_op_params(result, params, sizeof(params));
+
  result->op = GGML_OP_SOFT_MAX;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
+ result->src[1] = mask;

  return result;
  }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
  struct ggml_tensor * ggml_soft_max(
  struct ggml_context * ctx,
  struct ggml_tensor * a) {
- return ggml_soft_max_impl(ctx, a, false);
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
  }

  struct ggml_tensor * ggml_soft_max_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a) {
- return ggml_soft_max_impl(ctx, a, true);
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+ }
+
+ struct ggml_tensor * ggml_soft_max_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale) {
+ return ggml_soft_max_impl(ctx, a, mask, scale, false);
  }

  // ggml_soft_max_back
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
  // TODO: find the optimal values for these
  if (ggml_is_contiguous(src0) &&
  ggml_is_contiguous(src1) &&
- src0->type == GGML_TYPE_F32 &&
+ //src0->type == GGML_TYPE_F32 &&
  src1->type == GGML_TYPE_F32 &&
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {

@@ -10551,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
  static void ggml_compute_forward_soft_max_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
- struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_is_contiguous(src0));
- GGML_ASSERT(ggml_is_contiguous(dst));
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ assert(ggml_is_contiguous(dst));
+ assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }

+ float scale = 1.0f;
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
  // TODO: handle transposed/permuted matrices

  const int ith = params->ith;
  const int nth = params->nth;

+ const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
  const int nc = src0->ne[0];
  const int nr = ggml_nrows(src0);

@@ -10575,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
  const int ir0 = dr*ith;
  const int ir1 = MIN(ir0 + dr, nr);

+ float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
  for (int i1 = ir0; i1 < ir1; i1++) {
- float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
- float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+ float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+ float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+
+ // broadcast the mask across rows
+ float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+ ggml_vec_cpy_f32 (nc, wp, sp);
+ ggml_vec_scale_f32(nc, wp, scale);
+ if (mp) {
+ ggml_vec_acc_f32(nc, wp, mp);
+ }

  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
  //printf("p[%d] = %f\n", i, p[i]);
- assert(!isnan(sp[i]));
+ assert(!isnan(wp[i]));
  }
  #endif

  float max = -INFINITY;
- ggml_vec_max_f32(nc, &max, sp);
+ ggml_vec_max_f32(nc, &max, wp);

  ggml_float sum = 0.0;

  uint16_t scvt;
  for (int i = 0; i < nc; i++) {
- if (sp[i] == -INFINITY) {
+ if (wp[i] == -INFINITY) {
  dp[i] = 0.0f;
  } else {
- // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
- ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+ // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
  memcpy(&scvt, &s, sizeof(scvt));
  const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
  sum += (ggml_float)val;
@@ -10622,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
  static void ggml_compute_forward_soft_max(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
- struct ggml_tensor * dst) {
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_soft_max_f32(params, src0, dst);
+ ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
  } break;
  default:
  {
@@ -13863,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  } break;
  case GGML_OP_SOFT_MAX:
  {
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+ ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
  } break;
  case GGML_OP_SOFT_MAX_BACK:
  {
@@ -15689,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  {
  n_tasks = 1;
  } break;
- case GGML_OP_COUNT:
- {
- GGML_ASSERT(false);
- } break;
  default:
  {
- printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+ fprintf(stderr, "%s: op not implemented: ", __func__);
+ if (node->op < GGML_OP_COUNT) {
+ fprintf(stderr, "%s\n", ggml_op_name(node->op));
+ } else {
+ fprintf(stderr, "%d\n", node->op);
+ }
  GGML_ASSERT(false);
  } break;
  }
@@ -15898,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
  }
  } break;
+ case GGML_OP_SOFT_MAX:
+ {
+ n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+ } break;
  case GGML_OP_CONV_TRANSPOSE_1D:
  {
  GGML_ASSERT(node->src[0]->ne[3] == 1);
@@ -244,11 +244,10 @@
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- fflush(stderr); \
  fflush(stdout); \
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
  ggml_print_backtrace(); \
- exit(1); \
+ abort(); \
  } \
  } while (0)

@@ -1283,6 +1282,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // fused soft_max(a*scale + mask)
+ // mask is optional
+ GGML_API struct ggml_tensor * ggml_soft_max_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale);
+
  GGML_API struct ggml_tensor * ggml_soft_max_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
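A minimal host-side sketch of the new entry point, assuming only the bundled `ggml.h` from llama.cpp b1593; the tensor sizes and fill values are arbitrary, and the mask argument may also be NULL:

```c
#include <math.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 64*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_kv = 8, n_tokens = 4, n_embd_head = 64;

    struct ggml_tensor * kq      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_kv, n_tokens);
    struct ggml_tensor * kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_kv, n_tokens);
    ggml_set_f32(kq, 0.25f);
    ggml_set_f32(kq_mask, 0.0f);

    // fused soft_max(kq*scale + kq_mask) in a single graph node
    struct ggml_tensor * probs =
        ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, probs);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);
    return 0;
}
```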
@@ -46,7 +46,6 @@
  #endif
  #include <windows.h>
  #include <io.h>
- #include <stdio.h> // for _fseeki64
  #endif

  #include <algorithm>
@@ -1113,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  //
  //
  struct llama_state {
+ llama_state() {
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+ #endif
+ }
+
  // We save the log callback globally
  ggml_log_callback log_callback = llama_log_callback_default;
  void * log_callback_user_data = nullptr;
@@ -2634,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  }

  // general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  static void llm_load_tensors(
@@ -3464,7 +3469,7 @@ static void llm_build_k_shift(
  struct ggml_cgraph * graph,
  llm_rope_type type,
  int64_t n_ctx,
- int64_t n_rot,
+ int n_rot,
  float freq_base,
  float freq_scale,
  const llm_build_cb & cb) {
@@ -3496,7 +3501,7 @@ static void llm_build_k_shift(
  // we rotate only the first n_rot dimensions
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k,
- n_rot, n_head_kv, n_ctx,
+ n_embd_head, n_head_kv, n_ctx,
  ggml_element_size(kv.k)*n_embd_head,
  ggml_element_size(kv.k)*n_embd_gqa,
  ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -3694,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
  if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);
+
+ if (max_alibi_bias > 0.0f) {
+ // TODO: n_head or n_head_kv
+ // TODO: K-shift is likely not working
+ // TODO: change to ggml_add
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);
+ }

- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);

- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+ cb(kq, "kq_soft_max_ext", il);
+ }

  // split cached v into n_head heads
  struct ggml_tensor * v =
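On the non-ALiBi path the three former graph nodes (scale, add mask, softmax) collapse into the single `kq_soft_max_ext` node, i.e. the usual masked attention weights (sketched below, with M the additive `kq_mask` and d = n_embd_head):

```latex
\mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d}} + M\right)
```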
@@ -5031,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
  { "kq_masked", OFFLOAD_FUNC_KQ },
  { "kq_soft_max", OFFLOAD_FUNC_V },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_V },
  { "v", OFFLOAD_FUNC_V },
  { "kqv", OFFLOAD_FUNC_V },
  { "kqv_merged", OFFLOAD_FUNC_V },
@@ -5539,18 +5551,8 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }

- // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
- const bool full_offload_supported =
- model.arch == LLM_ARCH_LLAMA ||
- model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT ||
- model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER ||
- model.arch == LLM_ARCH_STABLELM;
-
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
- if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+ if (ggml_cpu_has_cublas() && fully_offloaded) {
  n_threads = 1;
  }

@@ -6409,10 +6411,13 @@ struct llama_grammar_candidate {
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  const char * src,
+ size_t n_src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
  const char * pos = src;
  std::vector<uint32_t> code_points;
+ // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+ code_points.reserve(n_src + 1);
  uint32_t value = partial_start.value;
  int n_remain = partial_start.n_remain;

@@ -6463,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  }

+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ std::string src,
+ llama_partial_utf8 partial_start
+ ) {
+ return decode_utf8(src.c_str(), src.size(), partial_start);
+ }
+
  // returns true iff pos points to the end of one of the definitions of a rule
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  switch (pos->type) {
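The new overload simply forwards the string's length so the code-point vector can be reserved up front. For orientation, a standalone C sketch of the same high-nibble length-table idea (an illustrative reimplementation, not the function above: it drops the partial-sequence resume state and just returns -1 on an invalid or truncated sequence):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Decode UTF-8 to code points using the high-nibble length table from decode_utf8.
static int utf8_to_codepoints(const char * src, size_t n_src, uint32_t * out) {
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
    int n_out = 0;
    for (size_t i = 0; i < n_src; ) {
        const uint8_t b   = (uint8_t) src[i];
        const int     len = lookup[b >> 4];
        if (len == 0 || i + len > n_src) return -1;            // continuation byte first, or truncated
        uint32_t value = b & (0xFFu >> len);                   // payload bits of the lead byte
        for (int k = 1; k < len; ++k) {
            value = (value << 6) | ((uint8_t) src[i + k] & 0x3Fu);
        }
        out[n_out++] = value;
        i += len;
    }
    return n_out;
}

int main(void) {
    const char * s = "héllo";                                  // 6 bytes, 5 code points
    uint32_t cps[16];
    const int n = utf8_to_codepoints(s, strlen(s), cps);
    printf("%d code points, second is U+%04X\n", n, (unsigned) cps[1]);  // U+00E9
    return 0;
}
```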
@@ -7016,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  // Replace the data in candidates with the new_candidates data
  std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
  candidates->size = new_candidates.size();
+ candidates->sorted = false;

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7112,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -7319,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  const std::string piece = llama_token_to_piece(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8564,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(

  #ifdef GGML_USE_METAL
  if (model->n_gpu_layers > 0) {
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
  ctx->ctx_metal = ggml_metal_init(1);
  if (!ctx->ctx_metal) {
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -9701,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #endif
  }

  static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -185,7 +185,7 @@ extern "C" {
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
- float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
+ float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
  float yarn_attn_factor; // YaRN magnitude scaling factor
  float yarn_beta_fast; // YaRN low correction dim
  float yarn_beta_slow; // YaRN high correction dim
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.9.4'
+ VERSION = '0.9.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1555'
+ LLAMA_CPP_VERSION = 'b1593'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.9.4
+ version: 0.9.5
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-11-25 00:00:00.000000000 Z
+ date: 2023-12-02 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.4.20
+ rubygems_version: 3.4.22
  signing_key:
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.