llama_cpp 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,9 +25,9 @@ typedef struct {
 } block_q8_0;
 
 kernel void kernel_add(
-        device const float * src0,
-        device const float * src1,
-        device float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] + src1[tpig];
 }
@@ -35,18 +35,18 @@ kernel void kernel_add(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_add_row(
-        device const float * src0,
-        device const float * src1,
-        device float * dst,
-        constant int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device float4 * dst,
+        constant int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 
 kernel void kernel_mul(
-        device const float * src0,
-        device const float * src1,
-        device float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * src1[tpig];
 }
@@ -54,12 +54,12 @@ kernel void kernel_mul(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_mul_row(
-        device const float * src0,
-        device const float * src1,
-        device float * dst,
-        constant int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device float4 * dst,
+        constant int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src1[tpig % ne00];
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
 }
 
 kernel void kernel_scale(
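
Note: the element-wise add/mul kernels above now operate on float4 instead of float, and the *_row variants take the row length as nb rather than ne00 (nb is presumably the row length counted in float4 elements, i.e. ne00/4). A rough CPU-side sketch of the 4-wide indexing, written as an illustration rather than code from the package, assuming the total length is a multiple of 4:

    // C++ sketch (not package code): one "thread" per group of four floats, mirroring
    // dst[tpig] = src0[tpig] + src1[tpig % nb] with float4 pointers and nb = ne00/4.
    #include <cstdio>
    #include <vector>

    int main() {
        const int n  = 16;                 // total elements in src0/dst (two rows of 8)
        const int nb = 2;                  // broadcast row length in float4 units (assumed ne00/4)
        std::vector<float> src0(n), row(4 * nb), dst(n);
        for (int i = 0; i < n; ++i)      src0[i] = (float) i;
        for (int i = 0; i < 4 * nb; ++i) row[i]  = 100.0f * (i + 1);

        for (int tpig = 0; tpig < n / 4; ++tpig) {   // thread_position_in_grid over float4s
            const int s1 = tpig % nb;                // wrap around the broadcast row
            for (int k = 0; k < 4; ++k) {
                dst[4 * tpig + k] = src0[4 * tpig + k] + row[4 * s1 + k];
            }
        }

        for (float v : dst) std::printf("%g ", v);
        std::printf("\n");
        return 0;
    }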
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // whish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce
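
Note: in the pass above, exp(psrc0[i00] - max) is now written into pdst as it is accumulated, so the normalization pass in the next hunk only divides by the sum instead of calling exp again; the single-thread "broadcast" of buf[0] is dropped because the barrier in the last reduction step already synchronizes it. A scalar C++ sketch of the two-pass structure (my own illustration, not package code):

    // C++ sketch (not package code): two-pass softmax that caches exp(x - max)
    // in the output, mirroring pdst[i00] = exp_psrc0; ...; pdst[i00] /= sum;
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> x = {1.0f, 2.0f, 3.0f};
        std::vector<float> y(x.size());

        const float max = *std::max_element(x.begin(), x.end());

        // pass 1: write exp(x - max) into y and accumulate the sum
        float sum = 0.0f;
        for (size_t i = 0; i < x.size(); ++i) {
            const float e = std::exp(x[i] - max);
            y[i] = e;                    // cached so exp() is not recomputed
            sum += e;
        }

        // pass 2: normalize in place
        for (float & v : y) v /= sum;

        for (float v : y) std::printf("%.4f ", v);   // ~0.0900 0.2447 0.6652
        std::printf("\n");
        return 0;
    }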
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }
 
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;
 
-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
 
     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
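
Note: in kernel_norm, every thread now derives mean = sum[0] / ne00 (and the variance likewise) directly from the already-synchronized partial sum instead of having thread 0 broadcast it, and the recenter loop is fused with the variance accumulation. A single-threaded C++ sketch of the same computation (illustration only; eps and the input values are made up):

    // C++ sketch (not package code): mean from the finished partial sum, then
    // recentering and variance accumulation fused into one loop, as in the kernel.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};   // made-up row
        const float eps  = 1e-5f;                                // made-up epsilon
        const int   ne00 = (int) x.size();

        float sum = 0.0f;
        for (float v : x) sum += v;
        const float mean = sum / ne00;          // was: thread 0 broadcasting sum[0] /= ne00

        std::vector<float> y(ne00);
        float var_sum = 0.0f;
        for (int i = 0; i < ne00; ++i) {        // recenter and accumulate variance together
            y[i] = x[i] - mean;
            var_sum += y[i] * y[i];
        }
        const float variance = var_sum / ne00;

        const float scale = 1.0f / std::sqrt(variance + eps);
        for (int i = 0; i < ne00; ++i) y[i] *= scale;

        for (float v : y) std::printf("%.4f ", v);
        std::printf("\n");
        return 0;
    }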
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }
 
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
            }
             sumf[row] += sumq*x[ib+row*nb].d;
         }
 
-        yb += QK8_0 * 16;
+        yb += NB_Q8_0 * nw;
     }
 
     for (int row = 0; row < nr; ++row) {
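
Note: in kernel_mul_mat_q8_0_f32 each SIMD lane now handles NB_Q8_0 = 8 quants per step instead of 16: with QK8_0 = 32 and nw = 32 lanes, ix = tiisg/4 selects the starting block, il = tiisg%4 selects the 8-quant slice, and yb advances by NB_Q8_0*nw = 256 floats per outer iteration. The small check below (my own sketch, not package code) confirms that this mapping touches every quant exactly once:

    // C++ sketch (not package code): check that the (ix, il) mapping visits every
    // quant of every Q8_0 block exactly once across a 32-lane SIMD group.
    #include <cstdio>
    #include <vector>

    int main() {
        const int QK8_0 = 32, NB_Q8_0 = 8, nw = 32;
        const int nb = 16;                            // made-up number of blocks per row
        std::vector<int> hits(nb * QK8_0, 0);

        for (int tiisg = 0; tiisg < nw; ++tiisg) {
            const int ix = tiisg / 4;                 // starting block for this lane
            const int il = tiisg % 4;                 // which 8-quant slice of the block
            for (int ib = ix; ib < nb; ib += nw / 4) {
                for (int i = 0; i < NB_Q8_0; ++i) {
                    hits[ib * QK8_0 + NB_Q8_0 * il + i]++;
                }
            }
        }

        for (int h : hits) {
            if (h != 1) { std::printf("coverage broken\n"); return 1; }
        }
        std::printf("every quant visited exactly once\n");
        return 0;
    }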
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
         constant int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
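
Note: the next hunk completes this rewrite. The threadgroup-memory reduction is replaced by simd_sum over the 32 lanes of a SIMD group, a half4/float4 path is used for rows of 128 or more elements with a scalar tail starting at 4*(ne00/4), and a new kernel_mul_mat_f16_f32 processes N_F16_F32 = 4 rows of src1 per threadgroup. A CPU-side sketch of that lane-strided dot product (simd_sum emulated by summing lane totals; illustration only, not package code):

    // C++ sketch (not package code): 32-lane strided dot product with a 4-wide body
    // and a scalar tail at 4*(ne00/4), simd_sum() emulated by summing the lane totals.
    #include <cstdio>
    #include <vector>

    int main() {
        const int ne00 = 130;                          // deliberately not a multiple of 4
        std::vector<float> x(ne00), y(ne00);
        for (int i = 0; i < ne00; ++i) { x[i] = 0.01f * i; y[i] = 1.0f; }

        float lane_sum[32] = {0};
        for (int tiisg = 0; tiisg < 32; ++tiisg) {
            for (int i = tiisg; i < ne00 / 4; i += 32) {          // each lane takes every 32nd float4
                for (int k = 0; k < 4; ++k) lane_sum[tiisg] += x[4 * i + k] * y[4 * i + k];
            }
        }

        float all_sum = 0.0f;                          // stand-in for simd_sum()
        for (float s : lane_sum) all_sum += s;

        for (int i = 4 * (ne00 / 4); i < ne00; ++i)    // scalar tail, done by lane 0 in the kernel
            all_sum += x[i] * y[i];

        float ref = 0.0f;
        for (int i = 0; i < ne00; ++i) ref += x[i] * y[i];
        std::printf("all_sum=%.3f ref=%.3f\n", all_sum, ref);
        return 0;
    }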
@@ -528,23 +519,100 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-    sum[tpitg.x] = 0.0f;
-
-    for (int i = tpitg.x; i < ne00; i += tptg.x) {
-        sum[tpitg.x] += (float) x[i] * (float) y[i];
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
     }
 
-    // accumulate the sum from all threads in the threadgroup
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = tptg.x/2; i > 0; i /= 2) {
-        if (tpitg.x < i) {
-            sum[tpitg.x] += sum[tpitg.x + i];
+}
+
+#define N_F16_F32 4
+
+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
         }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
 
-    if (tpitg.x == 0) {
-        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
     }
 }
 
@@ -614,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);
 
-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -644,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);
 
-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
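
Note: kernel_rope now computes the angle in closed form, theta = theta_0 * pow(freq_base, inv_ndims*i0), instead of carrying theta *= theta_scale between iterations; that is what allows the i0/ic loops to be split across the threads of a threadgroup (start at 2*tiitg, stride 2*tptg.x). Since theta_scale = pow(freq_base, -2/n_dims), the two forms agree, as the small check below illustrates (freq_base, freq_scale, p and n_dims are made-up example values, not from the package):

    // C++ sketch (not package code): the old theta *= theta_scale recurrence and the
    // new closed form theta_0 * pow(freq_base, -i0/n_dims) generate the same angles.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_base = 10000.0f, freq_scale = 1.0f;     // made-up example values
        const int   n_dims = 128, p = 7;

        const float theta_0     = freq_scale * (float) p;
        const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
        const float inv_ndims   = -1.0f / n_dims;

        float theta_iter = theta_0;
        for (int i0 = 0; i0 < 16; i0 += 2) {
            const float theta_direct = theta_0 * std::pow(freq_base, inv_ndims * i0);
            std::printf("i0=%2d iterative=%.6f direct=%.6f\n", i0, theta_iter, theta_direct);
            theta_iter *= theta_scale;                            // old per-iteration update
        }
        return 0;
    }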
@@ -1244,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }