llama_cpp 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // wish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }
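
The two soft_max hunks above drop the redundant broadcast/barrier pairs and, more importantly, store each `exp(psrc0[i00] - max)` into `pdst` as it is accumulated, so the final pass only divides by the sum instead of re-evaluating `exp`. For orientation, a minimal CPU-side sketch of the same single-exp pattern (`softmax_ref` is a hypothetical helper, not part of this package):

```cpp
#include <algorithm>
#include <cmath>

// CPU-side outline of the pattern used by the kernel: subtract the max for
// numerical stability, evaluate exp() exactly once per element, stash the
// exponentials in dst (like pdst[i00] = exp_psrc0), then normalize in place.
static void softmax_ref(const float * src, float * dst, int n) {
    if (n <= 0) return;

    const float max = *std::max_element(src, src + n);

    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float e = std::exp(src[i] - max); // computed once
        dst[i] = e;                             // remembered for the final pass
        sum += e;
    }

    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;                          // pdst[i00] /= sum
    }
}
```

The trade-off is purely arithmetic: `exp` is evaluated once per element instead of twice, at the cost of writing the intermediate exponentials to `dst`, which the kernel has to do anyway.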
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;
 
-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
 
     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }
 
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
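
In kernel_norm, the division by `ne00` now happens in every thread (`const float mean = sum[0] / ne00`) instead of being done once by thread 0 and broadcast behind an extra barrier, and the recenter loop is fused with the variance accumulation. A rough scalar outline of the fused form (`norm_ref` is an illustrative helper, not code from the diff):

```cpp
#include <cmath>

// Scalar outline of the fused pass: the mean and variance are divided by n
// directly after each reduction (no broadcast step), and the recentering
// write y[i] = x[i] - mean is reused immediately for the variance sum.
static void norm_ref(const float * x, float * y, int n, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i];
    const float mean = sum / n;                  // const float mean = sum[0] / ne00;

    float sum2 = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;                      // recenter ...
        sum2 += y[i] * y[i];                     // ... and accumulate the variance
    }
    const float variance = sum2 / n;             // const float variance = sum[0] / ne00;

    const float scale = 1.0f / std::sqrt(variance + eps);
    for (int i = 0; i < n; ++i) y[i] *= scale;
}
```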
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
            yl[i] = yb[i];
        }
 
        for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
            float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                sumq += qs[iq] * yl[iq];
            }
            sumf[row] += sumq*x[ib+row*nb].d;
        }
 
-        yb += QK8_0 * 16;
+        yb += NB_Q8_0 * nw;
    }
 
    for (int row = 0; row < nr; ++row) {
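
With `NB_Q8_0` defined as 8, each of the `nw = 32` lanes of a SIMD group now covers 8 quants, so four lanes (`il = tiisg%4`) share one 32-wide q8_0 block and `ix = tiisg/4` strides over blocks. A hedged scalar reference for the per-block arithmetic, assuming the usual q8_0 layout of one scale `d` followed by `QK8_0 = 32` signed 8-bit quants (`block_q8_0_ref` and `dot_q8_0_ref` are illustrative names only):

```cpp
#include <cstdint>

#define QK8_0   32   // quants per q8_0 block
#define NB_Q8_0  8   // quants handled per lane per step, as in the kernel above

// Assumed q8_0 block layout: one scale followed by 32 signed 8-bit quants.
// (The real block stores the scale as fp16; float is used here for simplicity.)
struct block_q8_0_ref {
    float  d;
    int8_t qs[QK8_0];
};

// Scalar reference: dot product of nb q8_0 blocks against a dense float vector y,
// accumulated NB_Q8_0 quants at a time and scaled once per chunk by the block scale.
static float dot_q8_0_ref(const block_q8_0_ref * x, const float * y, int nb) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        for (int il = 0; il < QK8_0 / NB_Q8_0; ++il) {   // 4 chunks of 8 quants per block
            float sumq = 0.0f;
            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                sumq += x[ib].qs[NB_Q8_0*il + iq] * y[ib*QK8_0 + NB_Q8_0*il + iq];
            }
            sumf += sumq * x[ib].d;
        }
    }
    return sumf;
}
```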
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
         constant int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,42 +519,101 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-    uint ith = tpitg.x;
-    uint nth = tptg.x;
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
 
-    sum[ith] = 0.0f;
+}
 
-    for (int i = ith; i < ne00; i += nth) {
-        sum[ith] += (float) x[i] * (float) y[i];
-    }
+#define N_F16_F32 4
 
-    // accumulate the sum from all threads in the threadgroup
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
-    }
+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
-    // Original implementation. Left behind commented out for now
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = tptg.x/2; i > 0; i /= 2) {
-    //    if (tpitg.x < i) {
-    //        sum[tpitg.x] += sum[tpitg.x + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-    //
-    //if (tpitg.x == 0) {
-    //    dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
-    //}
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
 }
 
 kernel void kernel_alibi_f32(
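
The rewritten f16×f32 kernels drop the threadgroup shared-memory reduction in favour of `simd_sum`, read `half4`/`float4` vectors when `ne00 >= 128`, and let lane 0 add the scalar tail from `4*(ne00/4)` to `ne00`; `kernel_mul_mat_f16_f32` additionally processes up to `N_F16_F32 = 4` rows of `src1` per threadgroup. A small scalar sketch of the vectorize-by-4-plus-tail accumulation (`dot_f16_f32_ref` is a hypothetical helper; it ignores the SIMD-group distribution):

```cpp
#include <cstdint>

// Scalar stand-in for the vectorized accumulation in the kernels above:
// process 4 elements at a time (the half4/float4 loads) and finish with a
// scalar tail over the remaining elements, exactly the range
// 4*(ne00/4) .. ne00 that lane 0 handles on the GPU.
// x stands in for the (already converted) half data.
static float dot_f16_f32_ref(const float * x, const float * y, int64_t ne00) {
    float sum = 0.0f;
    const int64_t n4 = ne00/4;
    for (int64_t i = 0; i < n4; ++i) {
        for (int k = 0; k < 4; ++k) {
            sum += x[4*i + k] * y[4*i + k];   // the x4[i][k] * y4[i][k] part
        }
    }
    for (int64_t i = 4*n4; i < ne00; ++i) {
        sum += x[i] * y[i];                   // scalar tail
    }
    return sum;
}
```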
@@ -632,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);
 
-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -662,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);
 
-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
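
Because the inner loops are now strided across the threadgroup (`i0 = 2*tiitg`, step `2*tptg.x`), theta can no longer be carried between iterations with `theta *= theta_scale`; it is instead computed in closed form as `theta_0 * pow(freq_base, -i0/n_dims)`, which for even `i0` reproduces the old running product since `theta_scale = pow(freq_base, -2.0f/n_dims)`. A tiny standalone check of that equivalence (illustrative only):

```cpp
#include <cmath>
#include <cstdio>

// Checks that the closed-form theta used above, theta_0 * pow(freq_base, -i0/n_dims),
// matches the old running product theta *= theta_scale for even i0.
int main() {
    const float freq_base  = 10000.0f;
    const float freq_scale = 1.0f;
    const int   n_dims     = 128;
    const float p          = 7.0f;   // arbitrary position

    const float theta_scale = std::pow(freq_base, -2.0f/n_dims); // old per-step factor
    const float theta_0     = freq_scale * p;
    const float inv_ndims   = -1.0f/n_dims;

    float theta_iter = theta_0;      // old scheme: multiply by theta_scale each step
    for (int i0 = 0; i0 < 16; i0 += 2) {
        const float theta_closed = theta_0 * std::pow(freq_base, inv_ndims*i0); // new scheme
        std::printf("i0=%2d  iterative=%.6f  closed=%.6f\n", i0, theta_iter, theta_closed);
        theta_iter *= theta_scale;
    }
    return 0;
}
```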
@@ -1262,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         events.emplace_back();
         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
     } else if (src0->backend == GGML_BACKEND_GPU) {
-        d_Q = (cl_mem) src0->data;
+        d_Q = (cl_mem) src0->extra;
     } else {
         GGML_ASSERT(false);
     }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +195,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +224,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif
 
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
-        (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
-        (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
-        (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
-        (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
-        (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
-        (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
-        (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
-        (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
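
The ggml_init hunk above rounds a zero `mem_size` up to `GGML_MEM_ALIGN` instead of tripping over it later, so a context can be created without reserving a real buffer up front. A hedged usage sketch (field order follows `struct ggml_init_params` as it appears in this version; adjust if your header differs):

```cpp
#include "ggml.h"

int main() {
    // A mem_size of 0 is now accepted: ggml_init bumps it to GGML_MEM_ALIGN
    // internally, which is handy for contexts that mostly carry metadata.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);
    if (ctx == NULL) {
        return 1;
    }

    ggml_free(ctx);
    return 0;
}
```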
@@ -4814,7 +4786,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t obj_alloc_size = 0;
 
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -20003,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     struct ggml_tensor * data = NULL;
 
-    if (params.no_alloc == false) {
+    if (!params.no_alloc) {
         data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
         ok = ok && data != NULL;
@@ -20044,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     }
 
     // point the data member to the appropriate location in the binary blob using the tensor infos
-    if (params.no_alloc == false) {
+    if (!params.no_alloc) {
         //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
         cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
     }