llama_cpp 0.4.0 → 0.5.1

@@ -25,9 +25,9 @@ typedef struct {
 } block_q8_0;
 
 kernel void kernel_add(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] + src1[tpig];
 }
@@ -35,18 +35,18 @@ kernel void kernel_add(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_add_row(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 
 kernel void kernel_mul(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * src1[tpig];
 }
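
The elementwise kernels in this hunk (and kernel_mul_row in the next one) are reworked to load and store float4 vectors, so one GPU thread now handles four consecutive floats, and the broadcast index is taken modulo the row length counted in float4 blocks (hence the ne00 → nb rename). A scalar C++ reference of the broadcast-add semantics, purely illustrative and assuming the row length is a multiple of 4 as the vectorized kernel requires:

#include <cstddef>
#include <vector>

// Scalar reference for kernel_add_row: src1 holds a single row that is
// broadcast-added into src0. On the GPU each thread owns one float4, i.e.
// four consecutive elements, and nb = row_len / 4 float4 blocks.
// dst is assumed to be pre-sized to src0.size().
void add_row_reference(const std::vector<float>& src0,
                       const std::vector<float>& src1,   // one row, length row_len
                       std::vector<float>&       dst) {
    const std::size_t row_len = src1.size();              // multiple of 4
    for (std::size_t i = 0; i < src0.size(); ++i) {
        dst[i] = src0[i] + src1[i % row_len];
    }
}
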
@@ -54,12 +54,12 @@ kernel void kernel_mul(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_mul_row(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src1[tpig % ne00];
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
 }
 
 kernel void kernel_scale(
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // wish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }
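
Taken together, the two kernel_soft_max hunks drop the redundant self-assignment "broadcast" and its extra barrier, cache exp(x - max) into the output buffer during the first pass, and make the second pass a plain division by the accumulated sum. A scalar C++ reference of the same two-pass softmax, for orientation only (names are illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// Two-pass softmax matching the reworked Metal kernel: pass 1 stores
// exp(x[i] - max) in dst while summing it, pass 2 divides by the sum.
std::vector<float> softmax_reference(const std::vector<float>& x) {
    const float max = *std::max_element(x.begin(), x.end());
    std::vector<float> dst(x.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        dst[i] = std::exp(x[i] - max);  // cached so exp is computed only once
        sum += dst[i];
    }
    for (float& v : dst) {
        v /= sum;
    }
    return dst;
}
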
173
179
 
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;
 
-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
 
     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }
 
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device       float * dst,
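
In the three kernel_norm hunks above, the division by ne00 is folded directly into the mean and variance reads (removing the single-thread broadcast step and one barrier), and the recenter loop is fused into the variance accumulation. A scalar C++ sketch of the same layer-norm math, illustrative only:

#include <cmath>
#include <vector>

// Layer norm as computed by the reworked kernel_norm:
// y = (x - mean) / sqrt(variance + eps), with mean and variance over the row.
std::vector<float> norm_reference(const std::vector<float>& x, float eps) {
    const float n = static_cast<float>(x.size());

    float sum = 0.0f;
    for (float v : x) sum += v;
    const float mean = sum / n;

    std::vector<float> y(x.size());
    float sq_sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] - mean;          // recenter and ...
        sq_sum += y[i] * y[i];       // ... accumulate the variance in one pass
    }
    const float variance = sq_sum / n;

    const float scale = 1.0f / std::sqrt(variance + eps);
    for (float& v : y) v *= scale;
    return y;
}
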
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*x[ib+row*nb].d;
         }
 
-        yb += QK8_0 * 16;
+        yb += NB_Q8_0 * nw;
     }
 
     for (int row = 0; row < nr; ++row) {
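
In kernel_mul_mat_q8_0_f32 each SIMD lane now handles NB_Q8_0 = 8 quants per iteration instead of 16, so four lanes cooperate on one 32-quant block (QK8_0 = 32) and the yb pointer advances by NB_Q8_0 * nw floats per pass. A scalar C++ reference of the q8_0 dot product the kernel accumulates per row, using a simplified block layout (the real block_q8_0 stores its scale as a 16-bit half; a float is used here only to keep the sketch self-contained):

#include <cstdint>

constexpr int QK8_0 = 32;

// Simplified reference block: d is the per-block scale, qs the 32 int8 quants.
struct block_q8_0_ref {
    float  d;
    int8_t qs[QK8_0];
};

// Dot product of one q8_0 row (nb blocks) with a float vector y -- the value
// that sumf[row] accumulates in the Metal kernel before the SIMD reduction.
float dot_q8_0_f32(const block_q8_0_ref* x, const float* y, int nb) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        float sumq = 0.0f;
        for (int iq = 0; iq < QK8_0; ++iq) {
            sumq += x[ib].qs[iq] * y[ib*QK8_0 + iq];   // dequantize-and-accumulate
        }
        sumf += sumq * x[ib].d;                        // apply the block scale once
    }
    return sumf;
}
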
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device       float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant  int64_t & ne0,
         constant  int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint  tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,23 +519,100 @@ kernel void kernel_mul_mat_f16_f32(
     device const half  * x = (device const half  *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-    sum[tpitg.x] = 0.0f;
-
-    for (int i = tpitg.x; i < ne00; i += tptg.x) {
-        sum[tpitg.x] += (float) x[i] * (float) y[i];
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4  * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
     }
 
-    // accumulate the sum from all threads in the threadgroup
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = tptg.x/2; i > 0; i /= 2) {
-        if (tpitg.x < i) {
-            sum[tpitg.x] += sum[tpitg.x + i];
+}
+
+#define N_F16_F32 4
+
+kernel void kernel_mul_mat_f16_f32(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
         }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
 
-    if (tpitg.x == 0) {
-        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
     }
 }
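
The renamed 1-row kernel drops the threadgroup sum[] buffer and its barrier-based reduction in favour of simd_sum over the 32 lanes of a SIMD group, and the new kernel_mul_mat_f16_f32 additionally processes N_F16_F32 = 4 destination rows per threadgroup, switching to half4/float4 loads when ne00 >= 128 and letting lane 0 pick up the scalar tail. A plain C++ reference of the per-output dot product with the same 4-wide body plus tail split, illustrative only:

#include <cstdint>

// What one SIMD group computes for one (r0, r1) output element:
// dot(x, y) over ne00 elements, done 4-at-a-time with a scalar tail,
// mirroring the half4/float4 path of the new kernel.
float dot_f16_f32_reference(const float* x /* already converted from half */,
                            const float* y, int64_t ne00) {
    float sum = 0.0f;
    const int64_t n4 = ne00 / 4;
    for (int64_t i = 0; i < n4; ++i) {
        for (int k = 0; k < 4; ++k) {
            sum += x[4*i + k] * y[4*i + k];   // vectorized body
        }
    }
    for (int64_t i = 4*n4; i < ne00; ++i) {
        sum += x[i] * y[i];                   // tail, handled by lane 0 on the GPU
    }
    return sum;
}
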
 
@@ -614,25 +682,27 @@ kernel void kernel_rope(
         constant       int & mode,
         constant     float & freq_base,
         constant     float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);
 
-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -644,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);
 
-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
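
kernel_rope previously walked the row serially, rescaling theta by theta_scale at every step; it now derives the angle in closed form, theta(i0) = freq_scale * p * freq_base^(-i0 / n_dims), which is what allows the i0 (and ic) loops to be strided across the threads of a threadgroup. A small stand-alone C++ check of the equivalence between the iterative and closed-form angles, using made-up example values:

#include <cmath>
#include <cstdio>

// Compare the old iterative update (theta *= theta_scale) against the
// closed form used by the reworked kernel_rope. Values below are examples.
int main() {
    const float freq_base  = 10000.0f;
    const float freq_scale = 1.0f;
    const int   n_dims     = 128;
    const float p          = 7.0f;   // example token position

    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
    const float theta_0     = freq_scale * p;
    const float inv_ndims   = -1.0f / n_dims;

    float theta_iter = theta_0;
    for (int i0 = 0; i0 < n_dims; i0 += 2) {
        const float theta_closed = theta_0 * std::pow(freq_base, inv_ndims * i0);
        std::printf("i0=%3d  iterative=%.6f  closed=%.6f\n",
                    i0, theta_iter, theta_closed);
        theta_iter *= theta_scale;
    }
    return 0;
}
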
@@ -1244,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
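
The OpenCL hunks all make the same change: a GPU-resident tensor's cl_mem handle now lives in tensor->extra instead of being carried in tensor->data, and every consumer (ggml_cl_free_data, the mul and mat-mul paths, ggml_cl_transform_tensor) reads it from there. A minimal C++ sketch of that convention, using a stand-in struct since the real ggml_tensor is defined in ggml.h and is not part of this diff:

#include <CL/cl.h>

// Minimal stand-in for the relevant ggml_tensor fields (illustrative only):
// after this change, extra (not data) carries the backend's cl_mem handle
// for GPU-backed tensors.
struct tensor_like {
    void* data;    // host data; no longer overwritten for GPU tensors
    void* extra;   // backend-specific handle; here, a cl_mem
};

static void release_gpu_buffer(tensor_like* t) {
    cl_mem mem = (cl_mem) t->extra;   // was (cl_mem) t->data before this release
    clReleaseMemObject(mem);
}
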