llama_cpp 0.9.4 → 0.9.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0fe656f26d7680d1b96c6949d40f4f615209c1c752b45ef145ac0f68b4af1d26
- data.tar.gz: fb4d3c5b54a854edeeaf070b5497ba6656a5cff59b6b911b638551462004efb3
+ metadata.gz: 4fd4e1a5e4d7e2442ab43255996da3ce92f898f9876f1bda343e2433c5050dd7
+ data.tar.gz: dece2da6c9befa15e6990d18fb58e2bf13d8da6c62033969b6b5104f82df736d
  SHA512:
- metadata.gz: 6dc8bc34fcb2635e5fa99c31f134dca12af4c48a0c3f1effbbf209e6e3156f1f95bf133ed33c2eabc6e9f7988d668dcbdb0545a3807b38969680618ba8774848
- data.tar.gz: 591d9ed44ed3b3a40424d3903659ad868afff727a2cfaffefd6222ba54f8a51fbfbab109ceea22a9a6bd3ca4661fb3947ca8f3f179ac2d0ad8cf8ba917b30ffe
+ metadata.gz: 51a383690b6e90e9493e1f318e916dfd94a909f4e554afd8ea822d047f05e96be3e2f371e83f0da5a37a9837d9ae5ecc6992bb9d9c0fd60a9de521bcd148e8f7
+ data.tar.gz: 15bbe94edb232d1979f2907c6c3ab7325a1089f9dcdd5d4262d7f0955fd6183e6b01cfee16593165f6e9901991e765ea30740bc1a83cca8fad60df4417551e3b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## [[0.9.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.4...v0.9.5)] - 2023-12-02
+
+ - Bump bundled llama.cpp from b1555 to b1593.
+
  ## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25

  - Bump bundled llama.cpp from b1523 to b1555.
@@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {

  #ifdef GGML_ALLOCATOR_DEBUG
  add_allocated_tensor(alloc, tensor);
- size_t cur_max = (char*)addr - (char*)alloc->data + size;
+ size_t cur_max = (char*)addr - (char*)alloc->base + size;
  if (cur_max > alloc->max_size) {
  printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_CLAMP_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
  #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,6 +502,31 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+ }
+ return x;
+ }
+
+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ static __device__ __forceinline__ float warp_reduce_max(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ }
+ return x;
+ }
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -577,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] * x[i];
  }

- static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
- a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
- }
- return a;
- }
-
  template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -624,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __device__ __forceinline__ float warp_reduce_sum(float x) {
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x += __shfl_xor_sync(0xffffffff, x, mask, 32);
- }
- return x;
- }
-
  template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4610,8 +4619,8 @@ static __global__ void rope(

  template<typename T, bool has_pos>
  static __global__ void rope_neox(
- const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
- float ext_factor, float attn_factor, rope_corr_dims corr_dims
+ const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+ float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
  ) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -4620,23 +4629,25 @@ static __global__ void rope_neox(
  }

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int i = row*ncols + col/2;
+ const int ib = col / n_dims;
+ const int ic = col % n_dims;
+
+ const int i = row*ncols + ib*n_dims + ic/2;
  const int i2 = row/p_delta_rows;

- // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
- const float cur_rot = -float(col)/ncols;
+ float cur_rot = inv_ndims * ic - ib;

  const int p = has_pos ? pos[i2] : 0;
- const float theta_base = p*powf(freq_base, cur_rot);
+ const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

  float cos_theta, sin_theta;
  rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

  const float x0 = x[i + 0];
- const float x1 = x[i + ncols/2];
+ const float x1 = x[i + n_dims/2];

- dst[i + 0] = x0*cos_theta - x1*sin_theta;
- dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
  }

  static __global__ void rope_glm_f32(
@@ -4715,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
  }

- // the CUDA soft max implementation differs from the CPU implementation
- // instead of doubles floats are used
- static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.x*blockIdx.x + threadIdx.x;
- const int block_size = blockDim.y;
- const int tid = threadIdx.y;
+ static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+ const int tid = threadIdx.x;
+ const int rowx = blockIdx.x;
+ const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+ const int block_size = blockDim.x;
+
+ const int warp_id = threadIdx.x / WARP_SIZE;
+ const int lane_id = threadIdx.x % WARP_SIZE;
+
+ __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];

  float max_val = -INFINITY;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- max_val = max(max_val, x[i]);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
  }

  // find the max value in the block
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ max_val = warp_reduce_max(max_val);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = -INFINITY;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = max_val;
+ }
+ __syncthreads();
+
+ max_val = buf[lane_id];
+ max_val = warp_reduce_max(max_val);
  }

  float tmp = 0.f;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
- const float val = expf(x[i] - max_val);
+ const int ix = rowx*ncols + col;
+ const int iy = rowy*ncols + col;
+ const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
  tmp += val;
- dst[i] = val;
+ dst[ix] = val;
  }

- // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ // find the sum of exps in the block
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ if (warp_id == 0) {
+ buf[lane_id] = 0.f;
+ }
+ __syncthreads();
+
+ if (lane_id == 0) {
+ buf[warp_id] = tmp;
+ }
+ __syncthreads();
+
+ tmp = buf[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float inv_tmp = 1.f / tmp;

  for (int col = tid; col < ncols; col += block_size) {
- const int i = row*ncols + col;
+ const int i = rowx*ncols + col;
  dst[i] *= inv_tmp;
  }
  }
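Note: after this change the kernel no longer computes a plain row-wise softmax of x. For row r and column c, with the mask row broadcast as r mod n_y (n_y = nrows_y) and the mask term dropped when y is null, it computes in float precision:

$$
\mathrm{dst}[r,c] = \frac{\exp\bigl(\mathrm{scale}\cdot x[r,c] + y[r \bmod n_y,\, c] - M_r\bigr)}{\sum_{c'} \exp\bigl(\mathrm{scale}\cdot x[r,c'] + y[r \bmod n_y,\, c'] - M_r\bigr)},
\qquad
M_r = \max_{c'}\bigl(\mathrm{scale}\cdot x[r,c'] + y[r \bmod n_y,\, c']\bigr)
$$

This is the fused soft_max(a*scale + mask) that ggml_soft_max_ext exposes further down in this diff.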
@@ -5739,20 +5779,26 @@ static void rope_cuda(

  template<typename T>
  static void rope_neox_cuda(
- const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+ const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
  float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
  ) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+ const float inv_ndims = -1.0f / n_dims;
+
  if (pos == nullptr) {
  rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  } else {
  rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
- x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+ x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+ theta_scale, inv_ndims
  );
  }
  }
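Note: the launcher now precomputes the NeoX RoPE frequency ladder on the host. With b = freq_base, s = freq_scale and n_d = n_dims, the kernel's new theta_base term is

$$
\theta_{\text{scale}} = b^{-2/n_d}, \qquad
\theta_{\text{base}}(\text{col}) = p \cdot s \cdot \theta_{\text{scale}}^{\,\text{col}/2} = p \cdot s \cdot b^{-\text{col}/n_d},
$$

and inv_ndims = -1/n_d feeds the updated cur_rot expression, so the CUDA NeoX path no longer requires n_dims == ne00 (the corresponding GGML_ASSERT is removed in the ggml_cuda_op_rope hunk below).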
@@ -5784,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

- static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(1, WARP_SIZE, 1);
+ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+ int nth = WARP_SIZE;
+ while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+ const dim3 block_dims(nth, 1, 1);
  const dim3 block_nums(nrows_x, 1, 1);
- soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+ soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
  }

  static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -6707,15 +6755,14 @@ inline void ggml_cuda_op_rope(
  GGML_ASSERT(false);
  rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
  } else if (is_neox) {
- GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
  if (src0->type == GGML_TYPE_F32) {
  rope_neox_cuda(
- (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else if (src0->type == GGML_TYPE_F16) {
  rope_neox_cuda(
- (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+ (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
  attn_factor, corr_dims, main_stream
  );
  } else {
@@ -6839,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

+ GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
  const int64_t ne00 = src0->ne[0];
- const int64_t nrows = ggml_nrows(src0);
+ const int64_t nrows_x = ggml_nrows(src0);
+ const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;

- soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+ float scale = 1.0f;
+ memcpy(&scale, dst->op_params, sizeof(float));
+
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);

- (void) src1;
  (void) dst;
- (void) src1_dd;
  }

  inline void ggml_cuda_op_scale(
@@ -1028,20 +1028,27 @@ void ggml_metal_graph_compute(
  int nth = 32; // SIMD width

  if (ne00%4 == 0) {
+ while (nth < ne00/4 && nth < 256) {
+ nth *= 2;
+ }
  [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
  } else {
- do {
+ while (nth < ne00 && nth < 1024) {
  nth *= 2;
- } while (nth <= ne00 && nth <= 1024);
- nth /= 2;
+ }
  [encoder setComputePipelineState:ctx->pipeline_soft_max];
  }
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
- [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
- [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
- [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+
+ const float scale = ((float *) dst->op_params)[0];
+
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+ [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+ [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

  [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;
@@ -1351,15 +1358,19 @@ void ggml_metal_graph_compute(
  float eps;
  memcpy(&eps, dst->op_params, sizeof(float));

- const int nth = MIN(512, ne00);
+ int nth = 32; // SIMD width
+
+ while (nth < ne00/4 && nth < 1024) {
+ nth *= 2;
+ }

  [encoder setComputePipelineState:ctx->pipeline_rms_norm];
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
- [encoder setBytes:&eps length:sizeof( float) atIndex:4];
- [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+ [encoder setBytes:&eps length:sizeof( float) atIndex:4];
+ [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

  const int64_t nrows = ggml_nrows(src0);

@@ -1433,7 +1444,8 @@ void ggml_metal_graph_compute(
  const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
- const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+ // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
@@ -39,6 +39,8 @@ typedef struct {
  int8_t qs[QK8_0]; // quants
  } block_q8_0;

+ #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
  // general-purpose kernel for addition of two tensors
  // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
  // cons: not very efficient
@@ -180,10 +182,12 @@ kernel void kernel_gelu(

  kernel void kernel_soft_max(
  device const float * src0,
+ device const float * src1,
  device float * dst,
  constant int64_t & ne00,
  constant int64_t & ne01,
  constant int64_t & ne02,
+ constant float & scale,
  threadgroup float * buf [[threadgroup(0)]],
  uint tgpig[[threadgroup_position_in_grid]],
  uint tpitg[[thread_position_in_threadgroup]],
@@ -194,73 +198,77 @@ kernel void kernel_soft_max(
  const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
  const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);

- device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
- device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+ device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+ device const float * pmask = src1 ? src1 + i01*ne00 : nullptr;
+ device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

  // parallel max
- float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
+ float lmax = -INFINITY;

- for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
- lmax = MAX(lmax, psrc0[i00]);
+ for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
  }

- float max = simd_max(lmax);
- if (tiisg == 0) {
- buf[sgitg] = max;
- }
+ // find the max value in the block
+ float max_val = simd_max(lmax);
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = -INFINITY;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
- }
- }
+ if (tiisg == 0) {
+ buf[sgitg] = max_val;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- max = buf[0];
+ max_val = buf[tiisg];
+ max_val = simd_max(max_val);
+ }

  // parallel sum
  float lsum = 0.0f;
  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- const float exp_psrc0 = exp(psrc0[i00] - max);
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
  lsum += exp_psrc0;
- // Remember the result of exp here. exp is expensive, so we really do not
- // wish to compute it twice.
  pdst[i00] = exp_psrc0;
  }

  float sum = simd_sum(lsum);
- if (tiisg == 0) {
- buf[sgitg] = sum;
- }
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = 0.0f;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] += buf[tpitg + i];
- }
- }
+ if (tiisg == 0) {
+ buf[sgitg] = sum;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ sum = buf[tiisg];
+ sum = simd_sum(sum);
+ }

- sum = buf[0];
+ const float inv_sum = 1.0f/sum;

  for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- pdst[i00] /= sum;
+ pdst[i00] *= inv_sum;
  }
  }

  kernel void kernel_soft_max_4(
  device const float * src0,
+ device const float * src1,
  device float * dst,
  constant int64_t & ne00,
  constant int64_t & ne01,
  constant int64_t & ne02,
+ constant float & scale,
  threadgroup float * buf [[threadgroup(0)]],
  uint tgpig[[threadgroup_position_in_grid]],
  uint tpitg[[thread_position_in_threadgroup]],
@@ -271,64 +279,68 @@ kernel void kernel_soft_max_4(
  const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
  const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);

- device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
- device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+ device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+ device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+ device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

  // parallel max
- float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
+ float4 lmax4 = -INFINITY;

- for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
- lmax4 = fmax(lmax4, psrc4[i00]);
+ for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
  }

  const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
- float max = simd_max(lmax);
- if (tiisg == 0) {
- buf[sgitg] = max;
- }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ float max_val = simd_max(lmax);
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = -INFINITY;
+ }

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
- }
- }
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ if (tiisg == 0) {
+ buf[sgitg] = max_val;
+ }
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- max = buf[0];
+ max_val = buf[tiisg];
+ max_val = simd_max(max_val);
+ }

  // parallel sum
  float4 lsum4 = 0.0f;
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- const float4 exp_psrc4 = exp(psrc4[i00] - max);
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
  lsum4 += exp_psrc4;
  pdst4[i00] = exp_psrc4;
  }

  const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
  float sum = simd_sum(lsum);
- if (tiisg == 0) {
- buf[sgitg] = sum;
- }
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = 0.0f;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- buf[tpitg] += buf[tpitg + i];
- }
- }
+ if (tiisg == 0) {
+ buf[sgitg] = sum;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ sum = buf[tiisg];
+ sum = simd_sum(sum);
+ }

- sum = buf[0];
+ const float inv_sum = 1.0f/sum;

  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- pdst4[i00] /= sum;
+ pdst4[i00] *= inv_sum;
  }
  }

@@ -435,14 +447,13 @@ kernel void kernel_rms_norm(
  constant int64_t & ne00,
  constant uint64_t & nb01,
  constant float & eps,
- threadgroup float * sum [[threadgroup(0)]],
+ threadgroup float * buf [[threadgroup(0)]],
  uint tgpig[[threadgroup_position_in_grid]],
  uint tpitg[[thread_position_in_threadgroup]],
  uint sgitg[[simdgroup_index_in_threadgroup]],
  uint tiisg[[thread_index_in_simdgroup]],
  uint ntg[[threads_per_threadgroup]]) {
- device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
- device const float * x_scalar = (device const float *) x;
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);

  float4 sumf = 0;
  float all_sum = 0;
@@ -453,40 +464,30 @@ kernel void kernel_rms_norm(
  }
  all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
  all_sum = simd_sum(all_sum);
- if (tiisg == 0) {
- sum[sgitg] = all_sum;
- }
+ if (ntg > N_SIMDWIDTH) {
+ if (sgitg == 0) {
+ buf[tiisg] = 0.0f;
+ }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- // broadcast, simd group number is ntg / 32
- for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
- if (tpitg < i) {
- sum[tpitg] += sum[tpitg + i];
- }
- }
- if (tpitg == 0) {
- for (int i = 4 * (ne00 / 4); i < ne00; i++) {
- sum[0] += x_scalar[i];
+ if (tiisg == 0) {
+ buf[sgitg] = all_sum;
  }
- sum[0] /= ne00;
- }

- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_threadgroup);

- const float mean = sum[0];
+ all_sum = buf[tiisg];
+ all_sum = simd_sum(all_sum);
+ }
+
+ const float mean = all_sum/ne00;
  const float scale = 1.0f/sqrt(mean + eps);

  device float4 * y = (device float4 *) (dst + tgpig*ne00);
- device float * y_scalar = (device float *) y;
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
  y[i00] = x[i00] * scale;
  }
- if (tpitg == 0) {
- for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
- y_scalar[i00] = x_scalar[i00] * scale;
- }
- }
  }

  // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -576,7 +577,6 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
  // putting them in the kernel cause a significant performance penalty
  #define N_DST 4 // each SIMD group works on 4 rows
  #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
- #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
  //Note: This is a template, but strictly speaking it only applies to
  // quantizations where the block size is 32. It also does not
  // giard against the number of rows not being divisible by
@@ -1,20 +1,18 @@
+ #include "ggml.h"
  #include "ggml-opencl.h"

  #include <array>
  #include <atomic>
+ #include <cstdio>
+ #include <cstdlib>
+ #include <cstring>
+ #include <limits>
  #include <sstream>
  #include <vector>
- #include <limits>

  #define CL_TARGET_OPENCL_VERSION 110
  #include <clblast.h>

- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
-
- #include "ggml.h"
-
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
  static struct ggml_tensor * ggml_soft_max_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale,
  bool inplace) {
+ GGML_ASSERT(ggml_is_contiguous(a));
+ if (mask) {
+ GGML_ASSERT(ggml_is_contiguous(mask));
+ GGML_ASSERT(mask->ne[2] == 1);
+ GGML_ASSERT(mask->ne[3] == 1);
+ GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+ }
+
  bool is_node = false;

  if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+ float params[] = { scale };
+ ggml_set_op_params(result, params, sizeof(params));
+
  result->op = GGML_OP_SOFT_MAX;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src[0] = a;
+ result->src[1] = mask;

  return result;
  }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
  struct ggml_tensor * ggml_soft_max(
  struct ggml_context * ctx,
  struct ggml_tensor * a) {
- return ggml_soft_max_impl(ctx, a, false);
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
  }

  struct ggml_tensor * ggml_soft_max_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a) {
- return ggml_soft_max_impl(ctx, a, true);
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+ }
+
+ struct ggml_tensor * ggml_soft_max_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale) {
+ return ggml_soft_max_impl(ctx, a, mask, scale, false);
  }

  // ggml_soft_max_back
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
  // TODO: find the optimal values for these
  if (ggml_is_contiguous(src0) &&
  ggml_is_contiguous(src1) &&
- src0->type == GGML_TYPE_F32 &&
+ //src0->type == GGML_TYPE_F32 &&
  src1->type == GGML_TYPE_F32 &&
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {

@@ -10551,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
  static void ggml_compute_forward_soft_max_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
- struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_is_contiguous(src0));
- GGML_ASSERT(ggml_is_contiguous(dst));
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ assert(ggml_is_contiguous(dst));
+ assert(ggml_are_same_shape(src0, dst));

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }

+ float scale = 1.0f;
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
  // TODO: handle transposed/permuted matrices

  const int ith = params->ith;
  const int nth = params->nth;

+ const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
  const int nc = src0->ne[0];
  const int nr = ggml_nrows(src0);

@@ -10575,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
  const int ir0 = dr*ith;
  const int ir1 = MIN(ir0 + dr, nr);

+ float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
  for (int i1 = ir0; i1 < ir1; i1++) {
- float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
- float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+ float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+ float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
+
+ // broadcast the mask across rows
+ float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+ ggml_vec_cpy_f32 (nc, wp, sp);
+ ggml_vec_scale_f32(nc, wp, scale);
+ if (mp) {
+ ggml_vec_acc_f32(nc, wp, mp);
+ }

  #ifndef NDEBUG
  for (int i = 0; i < nc; ++i) {
  //printf("p[%d] = %f\n", i, p[i]);
- assert(!isnan(sp[i]));
+ assert(!isnan(wp[i]));
  }
  #endif

  float max = -INFINITY;
- ggml_vec_max_f32(nc, &max, sp);
+ ggml_vec_max_f32(nc, &max, wp);

  ggml_float sum = 0.0;

  uint16_t scvt;
  for (int i = 0; i < nc; i++) {
- if (sp[i] == -INFINITY) {
+ if (wp[i] == -INFINITY) {
  dp[i] = 0.0f;
  } else {
- // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
- ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+ // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+ ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
  memcpy(&scvt, &s, sizeof(scvt));
  const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
  sum += (ggml_float)val;
@@ -10622,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
  static void ggml_compute_forward_soft_max(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
- struct ggml_tensor * dst) {
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_soft_max_f32(params, src0, dst);
+ ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
  } break;
  default:
  {
@@ -13863,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  } break;
  case GGML_OP_SOFT_MAX:
  {
- ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+ ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
  } break;
  case GGML_OP_SOFT_MAX_BACK:
  {
@@ -15689,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  {
  n_tasks = 1;
  } break;
- case GGML_OP_COUNT:
- {
- GGML_ASSERT(false);
- } break;
  default:
  {
- printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
+ fprintf(stderr, "%s: op not implemented: ", __func__);
+ if (node->op < GGML_OP_COUNT) {
+ fprintf(stderr, "%s\n", ggml_op_name(node->op));
+ } else {
+ fprintf(stderr, "%d\n", node->op);
+ }
  GGML_ASSERT(false);
  } break;
  }
@@ -15898,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
  }
  } break;
+ case GGML_OP_SOFT_MAX:
+ {
+ n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+ } break;
  case GGML_OP_CONV_TRANSPOSE_1D:
  {
  GGML_ASSERT(node->src[0]->ne[3] == 1);
@@ -244,11 +244,10 @@
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- fflush(stderr); \
  fflush(stdout); \
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
  ggml_print_backtrace(); \
- exit(1); \
+ abort(); \
  } \
  } while (0)

@@ -1283,6 +1282,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // fused soft_max(a*scale + mask)
+ // mask is optional
+ GGML_API struct ggml_tensor * ggml_soft_max_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale);
+
  GGML_API struct ggml_tensor * ggml_soft_max_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
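Note: a hedged usage sketch of the new API (host-side code; the wrapper name build_masked_softmax and its parameters are hypothetical). It mirrors the llm_build_kqv change later in this diff, where the ggml_scale -> ggml_add(mask) -> ggml_soft_max chain is replaced by one fused call with scale = 1.0f/sqrtf(float(n_embd_head)); passing NULL for the mask reduces it to a scaled softmax.

#include <math.h>
#include "ggml.h"

// softmax(kq * scale + kq_mask), built as a single fused ggml op
static struct ggml_tensor * build_masked_softmax(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,        // attention scores
        struct ggml_tensor  * kq_mask,   // optional mask, may be NULL
        int                   n_embd_head) {
    // equivalent unfused chain (the previous code path in llm_build_kqv):
    //   kq = ggml_scale(ctx, kq, kq_scale);
    //   kq = ggml_add(ctx, kq, kq_mask);
    //   kq = ggml_soft_max(ctx, kq);
    return ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));
}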
@@ -46,7 +46,6 @@
  #endif
  #include <windows.h>
  #include <io.h>
- #include <stdio.h> // for _fseeki64
  #endif

  #include <algorithm>
@@ -1113,6 +1112,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  //
  //
  struct llama_state {
+ llama_state() {
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+ #endif
+ }
+
  // We save the log callback globally
  ggml_log_callback log_callback = llama_log_callback_default;
  void * log_callback_user_data = nullptr;
@@ -2634,15 +2639,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  }

  // general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  static void llm_load_tensors(
@@ -3464,7 +3469,7 @@ static void llm_build_k_shift(
  struct ggml_cgraph * graph,
  llm_rope_type type,
  int64_t n_ctx,
- int64_t n_rot,
+ int n_rot,
  float freq_base,
  float freq_scale,
  const llm_build_cb & cb) {
@@ -3496,7 +3501,7 @@ static void llm_build_k_shift(
  // we rotate only the first n_rot dimensions
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k,
- n_rot, n_head_kv, n_ctx,
+ n_embd_head, n_head_kv, n_ctx,
  ggml_element_size(kv.k)*n_embd_head,
  ggml_element_size(kv.k)*n_embd_gqa,
  ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -3694,22 +3699,28 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
  if (max_alibi_bias > 0.0f) {
- // TODO: n_head or n_head_kv
- // TODO: K-shift is likely not working
- // TODO: change to ggml_add
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
- }
+ // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);
+
+ if (max_alibi_bias > 0.0f) {
+ // TODO: n_head or n_head_kv
+ // TODO: K-shift is likely not working
+ // TODO: change to ggml_add
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);
+ }

- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);

- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+ cb(kq, "kq_soft_max_ext", il);
+ }

  // split cached v into n_head heads
  struct ggml_tensor * v =
@@ -5031,6 +5042,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
  { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
  { "kq_masked", OFFLOAD_FUNC_KQ },
  { "kq_soft_max", OFFLOAD_FUNC_V },
+ { "kq_soft_max_ext", OFFLOAD_FUNC_V },
  { "v", OFFLOAD_FUNC_V },
  { "kqv", OFFLOAD_FUNC_V },
  { "kqv_merged", OFFLOAD_FUNC_V },
@@ -5539,18 +5551,8 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }

- // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
- const bool full_offload_supported =
- model.arch == LLM_ARCH_LLAMA ||
- model.arch == LLM_ARCH_BAICHUAN ||
- model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT ||
- model.arch == LLM_ARCH_MPT ||
- model.arch == LLM_ARCH_STARCODER ||
- model.arch == LLM_ARCH_STABLELM;
-
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
- if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+ if (ggml_cpu_has_cublas() && fully_offloaded) {
  n_threads = 1;
  }

@@ -6409,10 +6411,13 @@ struct llama_grammar_candidate {
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
  static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  const char * src,
+ size_t n_src,
  llama_partial_utf8 partial_start) {
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
  const char * pos = src;
  std::vector<uint32_t> code_points;
+ // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+ code_points.reserve(n_src + 1);
  uint32_t value = partial_start.value;
  int n_remain = partial_start.n_remain;

@@ -6463,6 +6468,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
  }

+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+ std::string src,
+ llama_partial_utf8 partial_start
+ ) {
+ return decode_utf8(src.c_str(), src.size(), partial_start);
+ }
+
  // returns true iff pos points to the end of one of the definitions of a rule
  static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
  switch (pos->type) {
@@ -7016,6 +7028,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  // Replace the data in candidates with the new_candidates data
  std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
  candidates->size = new_candidates.size();
+ candidates->sorted = false;

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7112,7 +7125,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
  } else if (piece.empty() || piece[0] == 0) {
  candidates->data[i].logit = -INFINITY;
  } else {
- candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+ candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
  candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
  }
  }
@@ -7319,7 +7332,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  const std::string piece = llama_token_to_piece(ctx, token);

  // Note terminating 0 in decoded string
- const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+ const auto decoded = decode_utf8(piece, grammar->partial_utf8);
  const auto & code_points = decoded.first;
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
  grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -8564,8 +8577,6 @@ struct llama_context * llama_new_context_with_model(

  #ifdef GGML_USE_METAL
  if (model->n_gpu_layers > 0) {
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
  ctx->ctx_metal = ggml_metal_init(1);
  if (!ctx->ctx_metal) {
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -9701,6 +9712,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
+ #ifdef GGML_USE_METAL
+ ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+ #endif
  }

  static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -185,7 +185,7 @@ extern "C" {
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
- float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
+ float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
  float yarn_attn_factor; // YaRN magnitude scaling factor
  float yarn_beta_fast; // YaRN low correction dim
  float yarn_beta_slow; // YaRN high correction dim
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.9.4'
+ VERSION = '0.9.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1555'
+ LLAMA_CPP_VERSION = 'b1593'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.9.4
+ version: 0.9.5
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-11-25 00:00:00.000000000 Z
+ date: 2023-12-02 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -80,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.4.20
+ rubygems_version: 3.4.22
  signing_key:
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.