llama_cpp 0.5.0 → 0.5.1

@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // wish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }
 
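
The two soft_max hunks above drop the redundant broadcast/barrier steps and cache each exp() result in the destination buffer, so the final pass only divides by the accumulated sum. A minimal CPU sketch of that structure (a hypothetical C++ helper, not the Metal kernel itself):

    #include <algorithm>
    #include <cmath>

    // Two-pass softmax over one row: find the max, store exp(x - max) straight
    // into dst while accumulating the sum, then only divide - the expensive
    // exp() is evaluated exactly once per element.
    static void soft_max_row(const float * src, float * dst, int n) {
        const float max = *std::max_element(src, src + n);
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            const float e = std::exp(src[i] - max);
            dst[i] = e;    // cache the exponential, as the kernel now does
            sum   += e;
        }
        for (int i = 0; i < n; ++i) {
            dst[i] /= sum; // reuse the cached exponentials
        }
    }
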
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;
 
-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
 
     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
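
The kernel_norm hunks replace the broadcast/divide/barrier sequence with a direct division and fuse the recenter pass into the variance accumulation. A scalar sketch of the fused computation (hedged, illustrative only):

    #include <cmath>

    // One loop both recenters and accumulates squared deviations; mean and
    // variance are divided out directly instead of via a broadcast step.
    static void norm_row(const float * x, float * y, int n, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) sum += x[i];
        const float mean = sum / n;

        float sum2 = 0.0f;
        for (int i = 0; i < n; ++i) {
            y[i]  = x[i] - mean;   // recenter
            sum2 += y[i] * y[i];   // ...and accumulate the variance in the same pass
        }
        const float variance = sum2 / n;

        const float scale = 1.0f / std::sqrt(variance + eps);
        for (int i = 0; i < n; ++i) y[i] *= scale;
    }
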
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }
 
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
            yl[i] = yb[i];
        }
 
        for (int row = 0; row < nr; row++) {
-           device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+           device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
            float sumq = 0.f;
-           for (int iq = 0; iq < 16; ++iq) {
+           for (int iq = 0; iq < NB_Q8_0; ++iq) {
                sumq += qs[iq] * yl[iq];
            }
            sumf[row] += sumq*x[ib+row*nb].d;
        }
 
-       yb += QK8_0 * 16;
+       yb += NB_Q8_0 * nw;
    }
 
    for (int row = 0; row < nr; ++row) {
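
The q8_0 kernel now has each SIMD lane cover NB_Q8_0 = 8 quantized values per step (tiisg/4 and tiisg%4 splits) instead of half a 32-value block. For orientation, the per-block math it distributes is the usual q8_0 dot product, sketched here on the CPU (the struct name and layout below are illustrative stand-ins; the real block_q8_0 stores an fp16 scale):

    #include <cstdint>

    #define QK8_0 32   // quants per q8_0 block

    // Illustrative stand-in for block_q8_0: one scale plus 32 int8 quants.
    struct block_q8_0_sketch {
        float  d;           // per-block scale (fp16 in the real struct)
        int8_t qs[QK8_0];   // quantized values
    };

    // Dot product of a row of q8_0 blocks with a float vector; the Metal kernel
    // splits the inner loop across the 32 lanes of a SIMD group, 8 quants each.
    static float dot_q8_0_f32(const block_q8_0_sketch * x, const float * y, int nblocks) {
        float sumf = 0.0f;
        for (int ib = 0; ib < nblocks; ++ib) {
            float sumq = 0.0f;
            for (int iq = 0; iq < QK8_0; ++iq) {
                sumq += x[ib].qs[iq] * y[ib*QK8_0 + iq];
            }
            sumf += sumq * x[ib].d;  // apply the per-block scale once
        }
        return sumf;
    }
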
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
        constant int64_t & ne1,
-       threadgroup float * sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
-       uint3 tpig[[thread_position_in_grid]],
-       uint3 tpitg[[thread_position_in_threadgroup]],
-       uint3 tptg[[threads_per_threadgroup]]) {
+       uint tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,42 +519,101 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-    uint ith = tpitg.x;
-    uint nth = tptg.x;
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
 
-    sum[ith] = 0.0f;
+}
 
-    for (int i = ith; i < ne00; i += nth) {
-        sum[ith] += (float) x[i] * (float) y[i];
-    }
+#define N_F16_F32 4
 
-    // accumulate the sum from all threads in the threadgroup
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
-    }
+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
-    // Original implementation. Left behind commented out for now
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = tptg.x/2; i > 0; i /= 2) {
-    //    if (tpitg.x < i) {
-    //        sum[tpitg.x] += sum[tpitg.x + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-    //
-    //if (tpitg.x == 0) {
-    //    dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
-    //}
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
 }
 
 kernel void kernel_alibi_f32(
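
The rewritten f16×f32 kernels drop the threadgroup shared-memory reduction in favor of simd_sum over the 32 lanes of a SIMD group, add a half4/float4 vectorized path for long rows, and let kernel_mul_mat_f16_f32 handle N_F16_F32 = 4 destination rows per threadgroup. Per output element the work is still a plain dot product with a vectorized body and a scalar tail; a hedged scalar model (x given as float here instead of half):

    // Scalar model of what one SIMD group computes per output element: a dot
    // product over ne00 values, done in chunks of 4 with a scalar tail for the
    // remainder (the kernel's 4*(ne00/4) handling).
    static float dot_f16_f32(const float * x, const float * y, int ne00) {
        float sum = 0.0f;
        int i = 0;
        for (; i + 4 <= ne00; i += 4) {
            for (int k = 0; k < 4; ++k) sum += x[i + k] * y[i + k];
        }
        for (; i < ne00; ++i) sum += x[i] * y[i];  // scalar tail
        return sum;
    }
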
@@ -632,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);
 
-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -662,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);
 
-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
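
The RoPE change removes the serial theta *= theta_scale recurrence, which forced one thread to walk the whole i0 loop, and replaces it with a closed form so the loop can be split across the threadgroup. Since theta_scale = freq_base^(-2/n_dims), after i0/2 iterations the running theta equals theta_0 * freq_base^(-i0/n_dims), which is exactly theta_0 * pow(freq_base, inv_ndims*i0). A small self-contained check of that equivalence (illustrative constants only):

    #include <cassert>
    #include <cmath>

    int main() {
        const float freq_base = 10000.0f, theta_0 = 3.0f;  // illustrative values
        const int   n_dims    = 128;

        const float theta_scale = std::pow(freq_base, -2.0f/n_dims);  // old per-step factor
        const float inv_ndims   = -1.0f/n_dims;                       // new closed form

        float theta_iter = theta_0;
        for (int i0 = 0; i0 < 64; i0 += 2) {
            const float theta_closed = theta_0 * std::pow(freq_base, inv_ndims*i0);
            assert(std::fabs(theta_iter - theta_closed) <= 1e-3f*std::fabs(theta_closed) + 1e-6f);
            theta_iter *= theta_scale;  // old recurrence
        }
        return 0;
    }
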
@@ -1262,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
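
The ggml_cl_* hunks above switch the OpenCL backend from stashing the device-side cl_mem in tensor->data to keeping it in tensor->extra, so data no longer doubles as an opaque device handle. A hedged accessor sketch of the new convention (the helper name is illustrative, not part of the library):

    #include "ggml.h"
    #include <CL/cl.h>   // cl_mem; header path may differ by platform

    // After this change a GPU-backed tensor keeps its device buffer in
    // tensor->extra; valid only when t->backend == GGML_BACKEND_GPU.
    static cl_mem tensor_device_buffer(const struct ggml_tensor * t) {
        return (cl_mem) t->extra;
    }
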
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +195,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +224,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif
 
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
-        (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
-        (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
-        (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
-        (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
-        (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
-        (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
-        (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
-        (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
    *ctx = (struct ggml_context) {
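
With this change ggml_init accepts mem_size == 0 and pads it up to GGML_MEM_ALIGN, which is convenient for contexts that only hold metadata. A minimal usage sketch, assuming the ggml.h shipped with this release:

    #include "ggml.h"

    // A context created with mem_size == 0 no longer fails; the size is padded
    // up to GGML_MEM_ALIGN internally.
    static struct ggml_context * make_empty_ctx(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 0,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        return ggml_init(params);   // release with ggml_free() when done
    }
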
@@ -4814,7 +4786,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t obj_alloc_size = 0;
 
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -20003,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     struct ggml_tensor * data = NULL;
 
-    if (params.no_alloc == false) {
+    if (!params.no_alloc) {
         data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
         ok = ok && data != NULL;
@@ -20044,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     }
 
     // point the data member to the appropriate location in the binary blob using the tensor infos
-    if (params.no_alloc == false) {
+    if (!params.no_alloc) {
         //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
         cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
     }