llama_cpp 0.4.0 → 0.5.1

@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
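
Note: the new #else branch above only matters when the HIP toolchain lacks __builtin_elementwise_sub_sat; it widens each byte lane to 16 bits and clamps the difference to the int8_t range. A minimal host-side sketch of that per-lane clamp (an illustration, not part of the diff):

```cpp
// Host-side illustration of the fallback's per-lane clamp, written for plain
// int8_t values so it can be checked without a GPU.
#include <cstdint>
#include <cstdio>
#include <limits>

static int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = int16_t(a) - int16_t(b);  // widen so the difference cannot overflow
    if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
    if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
    return int8_t(tmp);
}

int main() {
    printf("%d\n", sub_sat_i8(-100, 100));  // clamps to -128 instead of wrapping
    printf("%d\n", sub_sat_i8(100, -100));  // clamps to  127
    return 0;
}
```
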
@@ -306,11 +323,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half d[2]; // super-block scales/mins
+    half dm[2]; // super-block scales/mins
     uint8_t scales[2]; // 4-bit block scales/mins
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm; // super-block scale for quantized scales/mins
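
Note: renaming d[2] to dm[2] in the GGML_QKK_64 variant gives both block_q4_K layouts the same field name, which the later hunks rely on (x[i].dm[...], bq4_K->dm[...]), and the static_assert is now written against sizeof(half2), i.e. two 16-bit halves. A host-side size check in the same spirit, using a hypothetical 2-byte stand-in for half (illustration only):

```cpp
// Size check mirroring the updated static_assert. fp16_stub and half2_stub are
// hypothetical 2-byte / 4-byte stand-ins for ggml_fp16_t and half2; QK_K == 64
// as under GGML_QKK_64.
#include <cstdint>
#include <cstdio>

typedef uint16_t fp16_stub;             // stand-in for ggml_fp16_t / half
struct half2_stub { fp16_stub x, y; };  // stand-in for half2

constexpr int QK_K = 64;

struct block_q4_K_stub {
    fp16_stub dm[2];      // super-block scale and min
    uint8_t   scales[2];  // 4-bit block scales/mins
    uint8_t   qs[QK_K/2]; // 4-bit quants
};

static_assert(sizeof(half2_stub) == 2*sizeof(fp16_stub),
              "old and new forms of the assert are equivalent");
static_assert(sizeof(block_q4_K_stub) == sizeof(half2_stub) + QK_K/2 + 2,
              "wrong q4_K block size/padding");

int main() {
    printf("sizeof(block_q4_K_stub) = %zu\n", sizeof(block_q4_K_stub)); // 4 + 2 + 32 = 38
    return 0;
}
```
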
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-    float mean = 0.0f;
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-        mean += xi;
-        var += xi * xi;
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean /= ncols;
-    var = var / ncols - mean * mean;
-    const float inv_var = rsqrtf(var + eps);
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
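
Note: the reworked norm_f32 accumulates the row sum and sum of squares in a single float2, reduces it across the warp with __shfl_xor_sync, and, when block_size > WARP_SIZE, combines per-warp partials through shared memory before normalizing with (x - mean) * rsqrtf(var + eps), where var = E[x^2] - E[x]^2. A scalar host-side reference of the same per-row math (illustration only, not part of the diff):

```cpp
// Host-side reference for one row: same mean/variance formulation and eps as norm_f32.
#include <cmath>
#include <cstdio>
#include <vector>

static void norm_row(const float * x, float * dst, int ncols) {
    const float eps = 1e-5f;
    float sum = 0.0f, sumsq = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        sum   += x[i];
        sumsq += x[i] * x[i];
    }
    const float mean    = sum / ncols;
    const float var     = sumsq / ncols - mean * mean;   // E[x^2] - E[x]^2
    const float inv_std = 1.0f / std::sqrt(var + eps);
    for (int i = 0; i < ncols; ++i) {
        dst[i] = (x[i] - mean) * inv_std;
    }
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f}, y(4);
    norm_row(x.data(), y.data(), 4);
    for (float v : y) printf("%f\n", v);  // roughly -1.34, -0.45, 0.45, 1.34
    return 0;
}
```
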
@@ -737,8 +787,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].d[0];
-    const float m = (float)x[i].d[1];
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
@@ -1155,8 +1205,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].d[0];
-    const float m = (float)x[i].d[1];
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2895,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->d[0];
-    const float dmin = bq4_K->d[1];
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2979,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
@@ -3119,7 +3173,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4180,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
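
Note: the launchers above now pass the block width both as blockDim.x and as the block_size template argument, so rows narrower than 1024 columns keep the cheap single-warp path while wider rows get a 1024-thread block whose cross-warp reduction branch is compiled in. A plain C++ sketch of that compile-time dispatch pattern, with a hypothetical stub standing in for the kernel launch (illustration only):

```cpp
// Sketch of the dispatch pattern: the launch width is also a compile-time
// constant, so the kernel can drop the cross-warp reduction when one warp suffices.
#include <cstdio>

constexpr int WARP_SIZE = 32;

template <int block_size>
void norm_rows_stub(int ncols) {
    // stands in for norm_f32<block_size><<<nrows, block_size, 0, stream>>>(...)
    if (block_size > WARP_SIZE) {
        printf("ncols=%d: %d-thread block, cross-warp reduction path compiled in\n", ncols, block_size);
    } else {
        printf("ncols=%d: single warp (%d threads), warp shuffles only\n", ncols, block_size);
    }
}

void norm_rows(int ncols) {
    if (ncols < 1024) {
        norm_rows_stub<WARP_SIZE>(ncols);
    } else {
        norm_rows_stub<1024>(ncols);
    }
}

int main() {
    norm_rows(512);   // takes the WARP_SIZE instantiation
    norm_rows(4096);  // takes the 1024-thread instantiation
    return 0;
}
```
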
@@ -4709,6 +4775,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4808,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4968,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
-    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4977,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                                const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
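
Note: both RoPE launchers now assert ncols % 2 == 0 and launch CUDA_ROPE_BLOCK_SIZE threads in y, while each block still covers 2*CUDA_ROPE_BLOCK_SIZE columns, which implies two columns per thread. A small arithmetic sketch of the resulting launch geometry (illustration only; 256 is assumed here as the value of CUDA_ROPE_BLOCK_SIZE):

```cpp
// Launch-geometry arithmetic for the updated rope launchers.
#include <cstdio>

int main() {
    const int CUDA_ROPE_BLOCK_SIZE = 256; // assumed value for this example
    const int ncols = 4096;               // hypothetical row width
    // A block has CUDA_ROPE_BLOCK_SIZE threads in y but covers twice as many
    // columns, so each thread handles a pair of columns -- hence ncols % 2 == 0.
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    printf("threads per block (y): %d\n", CUDA_ROPE_BLOCK_SIZE);   // 256
    printf("columns per block:     %d\n", 2*CUDA_ROPE_BLOCK_SIZE); // 512
    printf("num_blocks_x:          %d\n", num_blocks_x);           // (4096+511)/512 = 8
    return 0;
}
```
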
@@ -6328,9 +6398,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
@@ -24,6 +24,7 @@
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
 struct ggml_tensor;
 struct ggml_cgraph;