llama_cpp 0.3.4 → 0.3.6

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -67,6 +67,17 @@ kernel void kernel_add(
     dst[tpig] = src0[tpig] + src1[tpig];
 }
 
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        device const float * src0,
+        device const float * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+}
+
 kernel void kernel_mul(
         device const float * src0,
         device const float * src1,
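The new kernel_add_row adds a row-shaped src1 to every row of src0 by wrapping the src1 index with `% ne00`. As a rough illustration only (plain C++ rather than Metal, with made-up sizes that are not part of the package), the same indexing on the CPU looks like this:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // CPU sketch of the kernel_add_row broadcast: src1 holds a single row of ne00
    // values and is reused for every row of src0 (sizes below are illustrative only).
    int main() {
        const int64_t ne00  = 4;                      // row length
        const int64_t nrows = 3;                      // rows in src0
        std::vector<float> src0(ne00 * nrows, 1.0f);
        std::vector<float> src1 = {10, 20, 30, 40};   // the broadcast row
        std::vector<float> dst(src0.size());

        for (size_t tpig = 0; tpig < src0.size(); ++tpig) {
            dst[tpig] = src0[tpig] + src1[tpig % ne00];  // same expression as the kernel
        }
        std::printf("dst[5] = %g\n", dst[5]);            // 1 + 20 = 21
        return 0;
    }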
@@ -376,87 +387,90 @@ kernel void kernel_rms_norm(
     }
 }
 
-// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
-float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
+// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
-    float4 acc = 0.f;
-    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
-    for (int i = 0; i < 16; i+=2) {
-        acc[0] += yl[i] * (qs[i / 2] & 0x000F);
-        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
-        acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    float2 acc = 0.f;
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
     }
-    return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
+    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
 
-// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
-float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
+// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
     float d = qb_curr->d;
     float m = qb_curr->m;
-    float4 acc = 0.f;
-    device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
-    for (int i = 0; i < 16; i+=2) {
-        acc[0] += yl[i] * (qs[i / 2] & 0x000F);
-        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
-        acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+    float2 acc = 0.f;
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
     }
-    return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
+    return d * (acc[0] + acc[1]) + sumy * m;
 }
 
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-template<typename block_q_type>
+//Note: This is a template, but strictly speaking it only applies to
+//      quantizations where the block size is 32. It also does not
+//      giard against the number of rows not being divisible by
+//      N_DST, so this is another explicit assumption of the implementation.
+template<typename block_q_type, int nr, int nsg, int nw>
 void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
                      int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
                      uint2 tgpig, uint tiisg, uint sgitg) {
     const int nb = ne00/QK4_0;
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
-    device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+    const int first_row = (r0 * nsg + sgitg) * nr;
+    device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
     device const float * y = (device const float *) src1 + r1*ne10;
-    float4 y_curr[8]; // src1 vector cache
-    float sumf[N_DST]={0.f}, all_sum;
-    thread float * yl=(thread float *)y_curr;
+    float yl[16]; // src1 vector cache
+    float sumf[nr]={0.f};
 
-    // each thread in a SIMD group deals with 1 block.
-    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
-        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
-        }
+    const int ix = tiisg/2;
+    const int il = 8*(tiisg%2);
 
-        for (int row = 0; row < N_DST; row++) {
-            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
-        }
-    }
+    device const float * yb = y + ix * QK4_0 + il;
 
-    // from now loads two rows every time and 16 blocks per row
-    int ir = tiisg / (N_SIMDWIDTH / 2);
-    int ib = tiisg % (N_SIMDWIDTH / 2);
-    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
-        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
         float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
-            y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        for (int i = 0; i < 8; i += 2) {
+            sumy += yb[i] + yb[i+1];
+            yl[i+0] = yb[i+ 0];
+            yl[i+1] = yb[i+ 1]/256.f;
+            sumy += yb[i+16] + yb[i+17];
+            yl[i+8] = yb[i+16]/16.f;
+            yl[i+9] = yb[i+17]/4096.f;
         }
 
-        for (int row = 0; row < N_DST; row+=2) {
-            if (nb_start + ib < nb) {
-                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
-            }
+        for (int row = 0; row < nr; row++) {
+            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
         }
+
+        yb += QK4_0 * 16;
     }
 
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
-            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0 && first_row + row < ne01) {
+            dst[r1*ne0 + first_row + row] = tot;
         }
     }
 }
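The rewritten block_q_n_dot_y never shifts the 4-bit quants into place: each nibble is only masked, which leaves it 16x, 256x, or 4096x too large, and the matching y value is pre-divided by the same factor when it is cached into yl (the "missing bit shifts" mentioned in the new comment). The per-block -8 offset of q4_0 is then applied once through sumy. A standalone check of the masking-vs-shifting equivalence, in plain C++ with an arbitrary example word (not part of the package):

    #include <cstdint>
    #include <cstdio>

    // One 16-bit word of a q4_0 block packs four 4-bit quants. Extracting a nibble
    // with a shift and multiplying by y gives the same product as masking it in
    // place and multiplying by a pre-scaled y (y/16 here, y/256 or y/4096 for the
    // higher nibbles), which is the trick the updated kernel relies on.
    int main() {
        const uint16_t qs = 0xA3C7;   // arbitrary packed quants, illustration only
        const float    y  = 0.5f;

        const float shifted   = y * ((qs & 0x00F0) >> 4);     // conventional extraction
        const float prescaled = (y / 16.f) * (qs & 0x00F0);   // kernel's pre-scaled form
        std::printf("%f %f\n", shifted, prescaled);           // both print 6.000000
        return 0;
    }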
@@ -472,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
        uint2 tgpig[[threadgroup_position_in_grid]],
        uint tiisg[[thread_index_in_simdgroup]],
        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mat_q4_1_f32(
@@ -486,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
        uint2 tgpig[[threadgroup_position_in_grid]],
        uint tiisg[[thread_index_in_simdgroup]],
        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }
 
 kernel void kernel_mul_mat_f16_f32(
@@ -495,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
        device float * dst,
        constant int64_t & ne00,
        constant int64_t & ne01,
+       constant int64_t & ne02,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant int64_t & ne10,
        constant int64_t & ne11,
+       constant int64_t & ne12,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
@@ -515,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
 
-    device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
     sum[tpitg.x] = 0.0f;
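The change to the x pointer lets kernel_mul_mat_f16_f32 broadcast src0 along the third dimension: when src1 has ne12 planes but src0 only ne02, plane im of src1 is multiplied against plane im/(ne12/ne02) of src0 (assuming ne12 is a multiple of ne02). A small sketch of that index mapping in plain C++ with made-up sizes:

    #include <cstdint>
    #include <cstdio>

    // Index mapping used by the updated f16 kernel: ne02 planes of src0 are
    // broadcast over ne12 planes of src1 via im / (ne12 / ne02). Sizes are made up.
    int main() {
        const int64_t ne02 = 2;   // planes in src0
        const int64_t ne12 = 6;   // planes in src1 (a multiple of ne02)

        for (int64_t im = 0; im < ne12; ++im) {
            std::printf("src1 plane %lld -> src0 plane %lld\n",
                        (long long)im, (long long)(im / (ne12 / ne02)));
        }
        return 0;
    }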
@@ -538,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+
 kernel void kernel_alibi_f32(
        device const float * src0,
        device float * dst,