llama_cpp 0.4.0 → 0.5.1

@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
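The `__has_builtin` guard above lets the same source build on HIP toolchains whose compiler does not expose Clang's feature-test macro: when the macro is missing it is defined to 0, so the manual clamping path is compiled instead of `__builtin_elementwise_sub_sat`. A minimal host-side sketch of that pattern, assuming a scalar helper named `sub_sat_i8` (the name is illustrative, not from ggml):

    #include <algorithm>
    #include <cstdint>

    // If the compiler does not provide __has_builtin, treat every builtin as unavailable.
    #ifndef __has_builtin
    #define __has_builtin(x) 0
    #endif

    // Saturating int8 subtraction: use the builtin when available, clamp manually otherwise.
    static inline int8_t sub_sat_i8(int8_t a, int8_t b) {
    #if __has_builtin(__builtin_elementwise_sub_sat)
        return __builtin_elementwise_sub_sat(a, b);
    #else
        const int16_t tmp = int16_t(a) - int16_t(b);  // widen so the difference cannot overflow
        return int8_t(std::min<int16_t>(127, std::max<int16_t>(-128, tmp)));
    #endif
    }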
@@ -306,11 +323,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half d[2];              // super-block scales/mins
+    half dm[2];             // super-block scales/mins
     uint8_t scales[2];      // 4-bit block scales/mins
     uint8_t qs[QK_K/2];     // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm;               // super-block scale for quantized scales/mins
@@ -447,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-    float mean = 0.0f;
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-        mean += xi;
-        var += xi * xi;
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean /= ncols;
-    var = var / ncols - mean * mean;
-    const float inv_var = rsqrtf(var + eps);
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
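The new `warp_reduce_sum` helpers and the `block_size > WARP_SIZE` branch implement a standard two-stage block reduction: a shuffle-based butterfly sum within each 32-thread warp, then a second pass over the per-warp partials staged in shared memory. A standalone sketch of that pattern (the `block_reduce_sum` name is hypothetical; like the kernels above, it assumes the block size is either at most one warp or exactly 1024, so every shared slot gets written before it is read):

    #define WARP_SIZE 32

    // Stage 1: butterfly reduction within a single warp via XOR shuffles.
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }

    // Stage 2: each warp's lane 0 writes its partial sum to shared memory,
    // then one more warp-level pass reduces the 32 per-warp partials.
    template <int block_size>
    static __device__ float block_reduce_sum(float x) {
        x = warp_reduce_sum(x);
        if (block_size > WARP_SIZE) {
            __shared__ float s_sum[32];               // one slot per warp (1024/32 = 32)
            const int warp_id = threadIdx.x / WARP_SIZE;
            const int lane_id = threadIdx.x % WARP_SIZE;
            if (lane_id == 0) {
                s_sum[warp_id] = x;
            }
            __syncthreads();
            x = warp_reduce_sum(s_sum[lane_id]);      // lane i picks up warp i's partial
        }
        return x;
    }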
@@ -737,8 +787,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].d[0];
-    const float m = (float)x[i].d[1];
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
@@ -1155,8 +1205,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].d[0];
-    const float m = (float)x[i].d[1];
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -2845,8 +2895,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
 
-    const float dall = bq4_K->d[0];
-    const float dmin = bq4_K->d[1];
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
 
     const float d8_1 = __low2float(bq8_1[0].ds);
     const float d8_2 = __low2float(bq8_1[1].ds);
@@ -2929,7 +2979,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }
 
 #pragma unroll
@@ -3119,7 +3173,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }
 
 #pragma unroll
@@ -4180,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
 
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
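These launchers keep the one-warp configuration for narrow rows and switch to a full 1024-thread block for wide ones, passing the choice as a template argument so `block_size` stays a compile-time constant inside the kernel loops. A minimal sketch of the same dispatch shape, with a hypothetical `scale2_f32` kernel standing in for `norm_f32`/`rms_norm_f32`:

    #include <cuda_runtime.h>

    #define WARP_SIZE 32

    template <int block_size>
    static __global__ void scale2_f32(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;
        // block-stride loop over the row's columns; block_size is known at compile time
        for (int col = threadIdx.x; col < ncols; col += block_size) {
            dst[row*ncols + col] = 2.0f * x[row*ncols + col];
        }
    }

    // Narrow rows: one warp per row. Wide rows: a full 1024-thread block per row.
    static void scale2_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
        if (ncols < 1024) {
            scale2_f32<WARP_SIZE><<<nrows, WARP_SIZE, 0, stream>>>(x, dst, ncols);
        } else {
            scale2_f32<1024><<<nrows, 1024, 0, stream>>>(x, dst, ncols);
        }
    }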
@@ -4709,6 +4775,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4740,6 +4808,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }
 
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4899,8 +4968,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
-    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
@@ -4908,7 +4977,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
 
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                                const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
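Both RoPE launchers now assert that `ncols` is even and launch `CUDA_ROPE_BLOCK_SIZE` threads per block: each thread handles one pair of adjacent values, so a block covers `2*CUDA_ROPE_BLOCK_SIZE` columns, matching the unchanged `num_blocks_x` computation. A sketch of just that indexing (hypothetical `rope_pairs_f32`; the position-dependent rotation is omitted and the pair is simply copied):

    #define CUDA_ROPE_BLOCK_SIZE 256   // block size used only for this sketch

    static __global__ void rope_pairs_f32(const float * x, float * dst, const int ncols) {
        // each thread owns columns (col, col+1); hence ncols must be even
        const int col = 2*(blockIdx.y*blockDim.y + threadIdx.y);
        if (col >= ncols) {
            return;
        }
        const int row = blockIdx.x;
        const int i   = row*ncols + col;
        // a real RoPE kernel would rotate (x[i], x[i+1]) by an angle derived from
        // the token position and theta_scale; here the pair is copied unchanged
        dst[i + 0] = x[i + 0];
        dst[i + 1] = x[i + 1];
    }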
@@ -6328,9 +6398,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
 
     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
@@ -24,6 +24,7 @@
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
 struct ggml_tensor;
 struct ggml_cgraph;