llama_cpp 0.5.0 → 0.5.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.metal
CHANGED

```diff
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // whish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }
 
```
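The softmax kernel now evaluates `exp(psrc0[i00] - max)` once, parks the result in the destination buffer while accumulating the sum, and finishes with an in-place divide; the redundant "broadcast" writes and one threadgroup barrier are dropped. A scalar C sketch of the same exp-caching idea (illustrative only, not the Metal source):

```c
#include <math.h>
#include <stddef.h>

// Numerically stable softmax that evaluates expf() exactly once per
// element: the exp results are cached in dst during the sum pass, so the
// final pass only divides, mirroring the kernel_soft_max change.
static void softmax_cached_exp(const float *src, float *dst, size_t n) {
    float max = src[0];
    for (size_t i = 1; i < n; ++i) {
        if (src[i] > max) max = src[i];   // shift for numerical stability
    }
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        const float e = expf(src[i] - max);
        dst[i] = e;                       // remember exp(); it is expensive
        sum += e;
    }
    for (size_t i = 0; i < n; ++i) {
        dst[i] /= sum;                    // finish in place, no second expf()
    }
}
```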
```diff
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;
 
-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    // broadcast
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
 
     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }
 
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
```
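kernel_norm no longer funnels the `/ ne00` division through thread 0 (which required an extra barrier); every thread divides the already-synchronized `sum[0]` itself, and the recenter loop is fused into the variance accumulation. A scalar C sketch of the reorganized computation (illustrative, not the Metal source):

```c
#include <math.h>
#include <stddef.h>

// Normalization with the same structure as the patched kernel_norm:
// divisions happen where the values are consumed, and recentering is
// fused with the variance accumulation in a single pass.
static void norm_f32(const float *x, float *y, size_t ne00, float eps) {
    float sum = 0.0f;
    for (size_t i = 0; i < ne00; ++i) sum += x[i];
    const float mean = sum / ne00;        // no serialized "broadcast" step

    float sumsq = 0.0f;
    for (size_t i = 0; i < ne00; ++i) {   // recenter and VARIANCE in one loop
        y[i] = x[i] - mean;
        sumsq += y[i] * y[i];
    }
    const float variance = sumsq / ne00;

    const float scale = 1.0f / sqrtf(variance + eps);
    for (size_t i = 0; i < ne00; ++i) y[i] *= scale;
}
```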
```diff
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*x[ib+row*nb].d;
         }
 
-        yb += QK8_0 * 16;
+        yb += NB_Q8_0 * nw;
     }
 
     for (int row = 0; row < nr; ++row) {
```
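The q8_0 kernel's work split changes from 16 quants per SIMD lane (half a block, per the pre-change code reconstructed above) to four lanes of `NB_Q8_0 = 8` quants, which is what the new `/4`, `%4`, and `NB_Q8_0 * nw` strides encode. For reference, a hedged C sketch of the scalar product this kernel parallelizes (the real `block_q8_0` stores its scale as fp16; a plain float is used here to keep the sketch self-contained):

```c
#include <stdint.h>
#include <stddef.h>

#define QK8_0 32   // quants per Q8_0 block, as in ggml

// Simplified Q8_0 block; ggml's struct stores d as ggml_fp16_t.
typedef struct {
    float  d;            // per-block scale
    int8_t qs[QK8_0];    // 32 signed 8-bit quants
} block_q8_0;

// Reference dot product of nb quantized blocks against a float vector.
// The Metal kernel spreads each block over 4 SIMD lanes x NB_Q8_0 quants
// and reduces the per-lane partial sums across the SIMD group at the end.
static float vec_dot_q8_0(const block_q8_0 *x, const float *y, size_t nb) {
    float sumf = 0.0f;
    for (size_t ib = 0; ib < nb; ++ib) {
        float sumq = 0.0f;
        for (int i = 0; i < QK8_0; ++i) {
            sumq += (float) x[ib].qs[i] * y[ib*QK8_0 + i];
        }
        sumf += sumq * x[ib].d;   // apply the block scale once
    }
    return sumf;
}
```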
```diff
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
         constant int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,42 +519,101 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-
-
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
 
-
+}
 
-
-        sum[ith] += (float) x[i] * (float) y[i];
-    }
+#define N_F16_F32 4
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
-
-
-
-
-
-
-
-
-
-
-
-
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
 }
 
 kernel void kernel_alibi_f32(
```

(The bodies of the bare `-` lines above, the old threadgroup-memory implementation, were lost in the rendering of this diff and are not reconstructed here.)
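Both rewritten kernels replace the old threadgroup-memory tree reduction (note the removed `threadgroup float * sum` argument) with a per-lane register accumulator and a single `simd_sum` cross-lane add; the new `kernel_mul_mat_f16_f32` additionally covers `N_F16_F32 = 4` rows of `src1` per threadgroup and switches to `half4`/`float4` loads once a row has at least 128 elements. A serial C sketch of the lane-strided accumulate-then-reduce pattern (illustrative; a real SIMD group runs the lane loop in lockstep):

```c
#define SIMD_WIDTH 32   // Metal SIMD-group width assumed by the kernel

// Emulates the kernel's reduction shape: each lane accumulates a strided
// slice of the dot product, then one cross-lane sum (simd_sum in Metal)
// replaces the former barrier-heavy threadgroup tree reduction.
static float dot_simdgroup_style(const float *x, const float *y, int n) {
    float lane_sum[SIMD_WIDTH] = {0};
    for (int lane = 0; lane < SIMD_WIDTH; ++lane) {   // "tiisg" in the kernel
        for (int i = lane; i < n; i += SIMD_WIDTH) {
            lane_sum[lane] += x[i] * y[i];
        }
    }
    float all_sum = 0.0f;                             // simd_sum(sumf)
    for (int lane = 0; lane < SIMD_WIDTH; ++lane) {
        all_sum += lane_sum[lane];
    }
    return all_sum;                                   // lane 0 stores to dst
}
```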
```diff
@@ -632,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);
 
-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -662,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);
 
-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
```
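Parallelizing RoPE across the threadgroup required dropping the serial recurrence `theta *= theta_scale`; each thread now evaluates its angle in closed form, `theta(i0) = freq_scale * p * freq_base^(-i0/n_dims)`, so iterations are independent. A scalar C sketch of the rotation with the closed-form angle (illustrative, single row, non-neox path):

```c
#include <math.h>

// Rotates consecutive (x0, x1) pairs of one row by position-dependent
// angles. theta is computed in closed form per pair, which is what lets
// the Metal kernel hand different i0 values to different threads.
static void rope_row(float *row, int ne0, int n_dims, int p,
                     float freq_base, float freq_scale) {
    const float theta_0   = freq_scale * (float) p;
    const float inv_ndims = -1.0f / n_dims;

    for (int i0 = 0; i0 < ne0 && i0 < n_dims; i0 += 2) {
        const float theta     = theta_0 * powf(freq_base, inv_ndims * i0);
        const float cos_theta = cosf(theta);
        const float sin_theta = sinf(theta);

        const float x0 = row[i0 + 0];
        const float x1 = row[i0 + 1];
        row[i0 + 0] = x0*cos_theta - x1*sin_theta;
        row[i0 + 1] = x0*sin_theta + x1*cos_theta;
    }
}
```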
```diff
@@ -1262,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
```
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED

```diff
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
```
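Every OpenCL call site that previously smuggled the `cl_mem` handle through `tensor->data` now uses the dedicated `tensor->extra` backend field, so `data` keeps its meaning as a host pointer. A minimal sketch of the convention (illustrative; the struct is reduced to the two relevant fields and the accessor is hypothetical):

```c
// Reduced view of the pattern behind the data -> extra migration: backend
// handles live in the dedicated extra slot instead of aliasing the host
// pointer. Field roles match ggml; the helper below is hypothetical.
struct tensor_like {
    void *data;   // host memory when the tensor lives on the CPU
    void *extra;  // backend-specific handle, e.g. a cl_mem on the GPU
};

static void * device_handle(const struct tensor_like *t) {
    return t->extra;   // after the change, GPU paths read extra, not data
}
```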
data/ext/llama_cpp/src/ggml.c
CHANGED
```diff
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
```
```diff
@@ -192,9 +195,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +224,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif
 
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
```
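With `GGML_USE_CPU_HBM`, allocations go through memkind's `hbw_posix_memalign` and must be released with `hbw_free`; mixing plain `free()` with hbw-allocated pointers is invalid, which is why the `GGML_ALIGNED_FREE` override is paired with the allocator change. A minimal usage sketch, assuming the memkind library (`-lmemkind`) on a system with high-bandwidth memory nodes:

```c
#include <hbwmalloc.h>
#include <stdio.h>

int main(void) {
    void *buf = NULL;
    // same call the patched ggml_aligned_malloc makes under GGML_USE_CPU_HBM
    if (hbw_posix_memalign(&buf, 16, 1024) != 0) {
        fprintf(stderr, "high-bandwidth allocation failed\n");
        return 1;
    }
    /* ... use buf ... */
    hbw_free(buf);  // must pair with hbw_*; plain free() is not valid here
    return 0;
}
```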
```diff
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
-        (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
-        (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
-        (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
-        (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
-        (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
-        (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
-        (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
-        (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
```
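`ggml_init` now accepts `mem_size == 0` by silently bumping it to `GGML_MEM_ALIGN`, which is convenient for contexts that only carry `no_alloc` tensors whose data lives elsewhere. A small usage sketch against the public ggml API:

```c
#include "ggml.h"

// After this change a zero-sized context no longer fails: the size is
// bumped to GGML_MEM_ALIGN internally before the buffer is allocated.
int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,     // previously an error
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    if (ctx == NULL) {
        return 1;
    }
    ggml_free(ctx);
    return 0;
}
```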
```diff
@@ -4814,7 +4786,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t obj_alloc_size = 0;
 
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -20003,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         struct ggml_tensor * data = NULL;
 
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
             ok = ok && data != NULL;
@@ -20044,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
 
         // point the data member to the appropriate location in the binary blob using the tensor infos
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
            //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
            cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
         }
```