llama_cpp 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +106 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +83 -23
- data/ext/llama_cpp/src/ggml-metal.m +35 -11
- data/ext/llama_cpp/src/ggml-metal.metal +145 -92
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +25 -53
- data/ext/llama_cpp/src/k_quants.c +45 -12
- data/ext/llama_cpp/src/llama.cpp +146 -70
- data/ext/llama_cpp/src/llama.h +3 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
```diff
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }

-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}

-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);

     const float max = buf[0];

     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // whish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }

     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }

-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}

-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);

     const float sum = buf[0];

     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }

```
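In plain terms, the soft_max change caches `exp(psrc0[i00] - max)` in the destination buffer while the sum is being accumulated, so the expensive `exp` runs once per element, and it drops a broadcast-plus-barrier pair that the last iteration of the reduction loop already makes redundant. A minimal scalar C sketch of the same computation (an illustration, not the Metal kernel itself):

```c
#include <math.h>

// Numerically stable softmax with the exp() results cached in dst,
// mirroring the structure of the updated kernel_soft_max.
static void soft_max_ref(const float *src, float *dst, int n) {
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        if (src[i] > max) max = src[i];
    }

    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float e = expf(src[i] - max); // computed once ...
        dst[i] = e;                         // ... and remembered in dst
        sum += e;
    }

    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;                      // normalize in place
    }
}
```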
```diff
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;

-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;

     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }

-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
```
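The kernel_norm change folds the division by `ne00` into the mean and variance reads (removing two single-thread divisions and the barriers around them) and fuses the recentering loop with the variance accumulation. A single-threaded C sketch of the resulting structure (illustrative reference, not the kernel):

```c
#include <math.h>

// Layer-norm style normalization: mean, then recenter while accumulating
// the variance in the same pass, then scale by 1/sqrt(var + eps).
static void norm_ref(const float *x, float *y, int n, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i];
    const float mean = sum / n;

    float var = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;   // recenter ...
        var += y[i] * y[i];   // ... and accumulate variance in one pass
    }
    var /= n;

    const float scale = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < n; ++i) {
        y[i] *= scale;
    }
}
```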
```diff
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }

+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;

-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};

-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;

-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;

-    // each thread in a SIMD group deals with 16 quants at a time
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }

         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*x[ib+row*nb].d;
         }

-        yb += 16 * nw;
+        yb += NB_Q8_0 * nw;
     }

     for (int row = 0; row < nr; ++row) {
```
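With `NB_Q8_0` set to 8, each of the 32 SIMD lanes loads 8 activations and 8 quants per step, so four lanes (`il` = 0..3) cover one 32-quant q8_0 block. For reference, the math being accelerated is a plain scaled integer dot product; a C sketch follows (the scale `d` is a half-precision value in ggml's real `block_q8_0`, a float here to keep the sketch self-contained):

```c
#include <stdint.h>

#define QK8_0 32

// Simplified q8_0 block: 32 signed 8-bit quants plus one per-block scale.
typedef struct {
    float  d;
    int8_t qs[QK8_0];
} block_q8_0_ref;

// Dot product of a quantized row against a float vector: integer products are
// summed per block and the scale is applied once, exactly like
// `sumf[row] += sumq*x[ib+row*nb].d` in the kernel.
static float vec_dot_q8_0_ref(const block_q8_0_ref *x, const float *y, int nblocks) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        float sumq = 0.0f;
        for (int i = 0; i < QK8_0; ++i) {
            sumq += (float) x[ib].qs[i] * y[ib*QK8_0 + i];
        }
        sumf += sumq * x[ib].d;
    }
    return sumf;
}
```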
```diff
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }

-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
         constant int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]]) {

     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,42 +519,101 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }

+}

+#define N_F16_F32 4

+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {

+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
 }

 kernel void kernel_alibi_f32(
```
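The f16 x f32 matrix-vector path is reworked here: the old kernel accumulated partial products into a `threadgroup float * sum` array (`sum[ith] += (float) x[i] * (float) y[i];`) and reduced it with barriers, while the new `kernel_mul_mat_f16_f32_1row` and `kernel_mul_mat_f16_f32` keep a per-lane `sumf` and combine lanes with `simd_sum`, processing `N_F16_F32` (4) rows of src1 per threadgroup and switching to a `half4`/`float4` path once a row has at least 128 elements. A C sketch of the per-row dot-product strategy, including the scalar tail for lengths not divisible by 4 (illustration only):

```c
// Four-wide dot product with a scalar tail, mirroring the kernel's
// half4/float4 loop plus the `for (i = 4*(ne00/4); i < ne00; ++i)` cleanup.
// x stands in for the f16 row, promoted to float for the sketch.
static float dot_f16_f32_ref(const float *x, const float *y, int n) {
    float sum = 0.0f;
    const int n4 = n / 4;

    for (int i = 0; i < n4; ++i) {
        for (int k = 0; k < 4; ++k) {
            sum += x[4*i + k] * y[4*i + k];   // vectorized body (half4 * float4 on the GPU)
        }
    }
    for (int i = 4*n4; i < n; ++i) {
        sum += x[i] * y[i];                   // tail elements
    }
    return sum;
}
```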
```diff
@@ -632,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;

     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);

-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -662,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);

-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;

                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
```
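Because kernel_rope is now parallelized across the threads of a threadgroup, the rotation angle can no longer be carried from one iteration to the next; it is computed in closed form as `theta_0 * pow(freq_base, inv_ndims*i0)`, which produces the same angles as the old `theta *= theta_scale` recurrence. A small C program sketching the equivalence (the values of `p`, `freq_base`, `freq_scale`, `n_dims` below are illustrative):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const float p = 7.0f, freq_base = 10000.0f, freq_scale = 1.0f;
    const int   n_dims = 128;

    const float theta_scale = powf(freq_base, -2.0f/n_dims); // old per-step factor
    const float theta_0     = freq_scale * (float) p;
    const float inv_ndims   = -1.0f/n_dims;

    float theta_iter = theta_0;
    for (int i0 = 0; i0 < n_dims; i0 += 2) {
        // closed form used by the parallel kernel
        const float theta_direct = theta_0 * powf(freq_base, inv_ndims*i0);
        printf("i0=%3d  iterative=%g  direct=%g\n", i0, theta_iter, theta_direct);
        theta_iter *= theta_scale; // old sequential recurrence
    }
    return 0;
}
```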
```diff
@@ -1262,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
```
data/ext/llama_cpp/src/ggml-opencl.cpp CHANGED
```diff
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }

-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }

@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;

     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst


@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {

     CL_CHECK(clFinish(queue));

-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
```
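The ggml-opencl.cpp changes are mechanical: the backend now stores its `cl_mem` handle in the tensor's backend-specific `extra` field rather than overloading `data`. A hedged sketch of the resulting convention (the struct below is a stand-in, not ggml's `ggml_tensor`):

```c
#include <CL/cl.h>
#include <stdbool.h>

// Stand-in tensor: `data` stays a host pointer, `extra` carries the
// backend-specific handle (a cl_mem for GPU-backed tensors).
typedef struct {
    void *data;
    void *extra;
    bool  on_gpu;
} toy_tensor;

static cl_mem toy_tensor_cl_mem(const toy_tensor *t) {
    return t->on_gpu ? (cl_mem) t->extra : NULL;
}
```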
data/ext/llama_cpp/src/ggml.c CHANGED
```diff
@@ -103,6 +103,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>

+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif

 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +195,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +224,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif

 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
```
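The `GGML_USE_CPU_HBM` additions route ggml's aligned allocations through the memkind `hbwmalloc` API, so buffers come from high-bandwidth memory and must be released with `hbw_free` rather than `free()`, which is why a dedicated `GGML_ALIGNED_FREE` branch is added. A minimal usage sketch of that allocator pairing (assumes the memkind library is installed):

```c
#include <stdio.h>
#include <hbwmalloc.h>   // memkind's high-bandwidth-memory allocator

int main(void) {
    void *buf = NULL;
    // 16-byte alignment, matching the value used in the diff above
    int err = hbw_posix_memalign(&buf, 16, 1024);
    if (err != 0 || buf == NULL) {
        fprintf(stderr, "high-bandwidth memory not available (err=%d)\n", err);
        return 1;
    }
    /* ... use buf ... */
    hbw_free(buf);       // must pair with the hbw allocator, not free()
    return 0;
}
```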
```diff
@@ -817,46 +830,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

 #if !defined(__aarch64__)

-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
-        (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
-        (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
-        (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
-        (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
-        (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
-        (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
-        (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
-        (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +838,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }

-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4579,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }

+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

     *ctx = (struct ggml_context) {
```
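After this change, `ggml_init` accepts a zero `mem_size` and silently bumps it to `GGML_MEM_ALIGN` instead of leaving the caller to special-case it. A minimal usage sketch (illustrative only; real code would normally size the pool explicitly):

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,      // now valid: bumped to GGML_MEM_ALIGN internally
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);
    if (ctx != NULL) {
        ggml_free(ctx);
    }
    return 0;
}
```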
```diff
@@ -4814,7 +4786,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(

     size_t obj_alloc_size = 0;

-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5487,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5530,7 @@ static struct ggml_tensor * ggml_div_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -20003,7 +19975,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

         struct ggml_tensor * data = NULL;

-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

             ok = ok && data != NULL;
@@ -20044,7 +20016,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }

         // point the data member to the appropriate location in the binary blob using the tensor infos
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
             cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
         }
```
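The last two hunks are purely stylistic (`== false` becomes `!`), but the `no_alloc` flag they touch is what decides whether `gguf_init_from_file` also allocates the binary blob as a tensor or only sets up tensor metadata. A hedged usage sketch (the filename is a placeholder):

```c
#include "ggml.h"   // the gguf_* declarations live here in this version

int main(void) {
    struct ggml_context * meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc =*/ false,   // false: also allocate the data blob
        /*.ctx      =*/ &meta,
    };

    struct gguf_context * gctx = gguf_init_from_file("model.gguf", params);
    if (gctx != NULL) {
        gguf_free(gctx);
    }
    if (meta != NULL) {
        ggml_free(meta);
    }
    return 0;
}
```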