llama_cpp 0.4.0 → 0.5.1
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.metal

@@ -25,9 +25,9 @@ typedef struct {
 } block_q8_0;
 
 kernel void kernel_add(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] + src1[tpig];
 }
@@ -35,18 +35,18 @@ kernel void kernel_add(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_add_row(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant   int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 
 kernel void kernel_mul(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * src1[tpig];
 }
@@ -54,12 +54,12 @@ kernel void kernel_mul(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_mul_row(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant   int64_t & nb,
         uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src1[tpig % ne00];
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
 }
 
 kernel void kernel_scale(
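The element-wise kernels above (kernel_add, kernel_mul and their _row variants) now operate on float4 instead of float, so each GPU thread handles four values per index, and the broadcast row length is passed in float4 chunks (nb) rather than in floats (ne00). A CPU sketch of the row-broadcast indexing, not part of the diff and with hypothetical sizes:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00  = 8;        // row length in floats (hypothetical)
    const int64_t nrows = 3;
    const int64_t nb    = ne00 / 4; // row length in float4 chunks, as kernel_add_row now receives it
    float src0[24], src1[8], dst[24];
    for (int i = 0; i < 24; ++i) src0[i] = (float) i;
    for (int i = 0; i <  8; ++i) src1[i] = 100.0f;

    // One outer iteration models one GPU thread (tpig); the inner loop models
    // the four lanes of a float4. src1 is a single row and wraps via the modulo.
    for (int64_t tpig = 0; tpig < nrows * nb; ++tpig) {
        for (int k = 0; k < 4; ++k) {
            dst[4*tpig + k] = src0[4*tpig + k] + src1[4*(tpig % nb) + k];
        }
    }
    printf("dst[0] = %g, dst[8] = %g\n", dst[0], dst[8]); // 100 and 108
    return 0;
}
```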
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
+    // the loop, and when that is done, buf[0] has the correct (synchronized) value
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float max = buf[0];
 
     // parallel sum
     buf[tpitg[0]] = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] += exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp(psrc0[i00] - max);
+        buf[tpitg[0]] += exp_psrc0;
+        // Remember the result of exp here. exp is expensive, so we really do not
+        // whish to compute it twice.
+        pdst[i00] = exp_psrc0;
     }
 
     // reduce
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
 
-    // broadcast
-    if (tpitg[0] == 0) {
-        buf[0] = buf[0];
-    }
+    // broadcast - not needed, see above
+    //// broadcast
+    //if (tpitg[0] == 0) {
+    //    buf[0] = buf[0];
+    //}
 
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //threadgroup_barrier(mem_flags::mem_threadgroup);
 
     const float sum = buf[0];
 
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        pdst[i00] = exp(psrc0[i00] - max) / sum;
+        pdst[i00] /= sum;
     }
 }
 
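The soft_max rewrite does two things: it drops the redundant single-thread "broadcast" and its barriers (the reduction loop already leaves buf[0] synchronized), and it stores exp(psrc0[i00] - max) into pdst during the sum pass so the final pass only divides. A scalar sketch of the resulting data flow, not part of the diff:

```cpp
#include <cmath>
#include <cstdio>

// Scalar model of the reworked kernel_soft_max: exp() is evaluated once per
// element, remembered in the destination, then normalized in a second pass.
static void soft_max_row(const float * psrc0, float * pdst, int ne00) {
    float maxv = psrc0[0];
    for (int i = 1; i < ne00; ++i) maxv = fmaxf(maxv, psrc0[i]);

    float sum = 0.0f;
    for (int i = 0; i < ne00; ++i) {
        const float exp_psrc0 = expf(psrc0[i] - maxv); // computed exactly once
        pdst[i] = exp_psrc0;                           // remembered, as in the kernel
        sum += exp_psrc0;
    }
    for (int i = 0; i < ne00; ++i) pdst[i] /= sum;
}

int main() {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, y[4];
    soft_max_row(x, y, 4);
    printf("%.4f %.4f %.4f %.4f\n", y[0], y[1], y[2], y[3]); // sums to 1
    return 0;
}
```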
@@ -214,25 +220,17 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean = sum[0];
+    const float mean = sum[0] / ne00;
 
-    // recenter
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-    }
-
-    // VARIANCE
-    // parallel sum
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
         sum[tpitg] += y[i00] * y[i00];
     }
+
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -241,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-
-    if (tpitg == 0) {
-        sum[0] /= ne00;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
 
     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -254,7 +247,6 @@ kernel void kernel_norm(
     }
 }
 
-
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
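kernel_norm loses two serialize-on-thread-0 steps: the division by ne00 is folded into the reads of sum[0], and the recentering loop is fused into the variance accumulation. The math is unchanged; a scalar sketch of the simplified flow, not part of the diff:

```cpp
#include <cmath>
#include <cstdio>

// Scalar model of the simplified kernel_norm: mean and variance are divided
// by ne00 where they are read, and recenter + variance share one pass.
static void norm_row(const float * x, float * y, int ne00, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < ne00; ++i) sum += x[i];
    const float mean = sum / ne00;       // was a separate single-thread step

    sum = 0.0f;
    for (int i = 0; i < ne00; ++i) {     // recenter and VARIANCE in one pass
        y[i] = x[i] - mean;
        sum += y[i] * y[i];
    }
    const float variance = sum / ne00;

    const float scale = 1.0f / sqrtf(variance + eps);
    for (int i = 0; i < ne00; ++i) y[i] *= scale;
}

int main() {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, y[4];
    norm_row(x, y, 4, 1e-5f);
    printf("%f %f\n", y[0], y[3]); // symmetric around 0
    return 0;
}
```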
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
+#define NB_Q8_0 8
+
 kernel void kernel_mul_mat_q8_0_f32(
         device const void * src0,
         device const float * src1,
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
     device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[NB_Q8_0];
     float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
 
-    device const float * yb = y + ix * QK8_0 + 16*il;
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
 
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        for (int i = 0; i < 16; ++i) {
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
             yl[i] = yb[i];
         }
 
         for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + 16*il;
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
             float sumq = 0.f;
-            for (int iq = 0; iq < 16; ++iq) {
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                 sumq += qs[iq] * yl[iq];
             }
             sumf[row] += sumq*x[ib+row*nb].d;
         }
 
-        yb += 16 * nw;
+        yb += NB_Q8_0 * nw;
     }
 
     for (int row = 0; row < nr; ++row) {
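With NB_Q8_0 = 8, each of the 32 lanes in a simdgroup now covers 8 quants per step (four lanes per 32-quant q8_0 block) instead of half a block. The per-block arithmetic is unchanged; a scalar sketch of the q8_0 dot product, not part of the diff, using a toy block layout:

```cpp
#include <cstdint>
#include <cstdio>

#define QK8_0   32
#define NB_Q8_0  8  // quants handled per thread per step, as in the new kernel

// Minimal CPU model of one q8_0 block: 32 int8 quants and one scale.
struct block_q8_0 {
    float  d;
    int8_t qs[QK8_0];
};

// The kernel's inner loop in scalar form: sum(qs[i] * y[i]) * d, without the
// SIMD-group split across lanes.
static float dot_q8_0(const block_q8_0 * b, const float * y) {
    float sumq = 0.0f;
    for (int i = 0; i < QK8_0; ++i) sumq += b->qs[i] * y[i];
    return sumq * b->d;
}

int main() {
    block_q8_0 b; b.d = 0.5f;
    float y[QK8_0];
    for (int i = 0; i < QK8_0; ++i) { b.qs[i] = (int8_t)(i - 16); y[i] = 1.0f; }
    printf("dot = %f\n", dot_q8_0(&b, y)); // 0.5 * sum(-16..15) = -8
    return 0;
}
```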
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mat_f16_f32_1row(
         device const char * src0,
         device const char * src1,
         device float * dst,
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
         constant uint64_t & nb12,
         constant int64_t & ne0,
         constant int64_t & ne1,
-        threadgroup float * sum [[threadgroup(0)]],
         uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpig[[thread_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -528,23 +519,100 @@ kernel void kernel_mul_mat_f16_f32(
     device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
-    sum[tpitg.x] = 0.0f;
-
-    for (int i = tpitg.x; i < ne00; i += tptg.x) {
-        sum[tpitg.x] += (float) x[i] * (float) y[i];
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
     }
 
-    // accumulate the sum from all threads in the threadgroup
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = tptg.x/2; i > 0; i /= 2) {
-        if (tpitg.x < i) {
-            sum[tpitg.x] += sum[tpitg.x + i];
+}
+
+#define N_F16_F32 4
+
+kernel void kernel_mul_mat_f16_f32(
+        device const char * src0,
+        device const char * src1,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant int64_t & ne10,
+        constant int64_t & ne11,
+        constant int64_t & ne12,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
         }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
 
-    if (tpitg.x == 0) {
-        dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
     }
 }
 
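The old threadgroup-memory reduction is replaced by simd_sum() across a 32-lane simdgroup; kernel_mul_mat_f16_f32_1row keeps one dst row per threadgroup, while the new kernel_mul_mat_f16_f32 batches N_F16_F32 = 4 rows. Both vectorize over half4/float4 when ne00 >= 128 and pick up the non-multiple-of-4 tail separately. A scalar sketch of that tail handling, not part of the diff (float stands in for half):

```cpp
#include <cstdio>

// Process four elements at a time, then the scalar tail, so the row length
// ne00 need not be a multiple of 4 - mirroring the 4*(ne00/4) split above.
static float dot_vec4_with_tail(const float * x, const float * y, int ne00) {
    float sum = 0.0f;
    for (int i = 0; i < ne00/4; ++i) {        // "float4" body
        for (int k = 0; k < 4; ++k) sum += x[4*i + k] * y[4*i + k];
    }
    for (int i = 4*(ne00/4); i < ne00; ++i) { // scalar tail, as in the kernel
        sum += x[i] * y[i];
    }
    return sum;
}

int main() {
    float x[7], y[7];
    for (int i = 0; i < 7; ++i) { x[i] = 1.0f; y[i] = (float) i; }
    printf("dot = %f\n", dot_vec4_with_tail(x, y, 7)); // 0+1+...+6 = 21
    return 0;
}
```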
@@ -614,25 +682,27 @@ kernel void kernel_rope(
         constant int & mode,
         constant float & freq_base,
         constant float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
 
     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);
 
-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -644,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);
 
-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;
 
                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
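The RoPE kernel previously walked theta sequentially (theta *= theta_scale each iteration), which forces a serial loop per thread. Each thread now computes its angle directly from theta_0 = freq_scale * p and inv_ndims = -1/n_dims, so the iterations i0 = 2*tiitg, 2*tiitg + 2*tptg.x, ... can run in parallel across the threadgroup. The two forms agree because theta_scale^(i0/2) = freq_base^(-i0/n_dims); a quick numerical check, not part of the diff:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float freq_base = 10000.0f, freq_scale = 1.0f;
    const int   n_dims = 8, p = 3;

    const float theta_scale = powf(freq_base, -2.0f/n_dims); // old incremental factor
    const float theta_0     = freq_scale * (float) p;        // new starting angle
    const float inv_ndims   = -1.0f/n_dims;

    float theta_seq = theta_0;
    for (int i0 = 0; i0 < n_dims; i0 += 2) {
        // Direct form used by the new kernel; matches the sequential product.
        const float theta_direct = theta_0 * powf(freq_base, inv_ndims*i0);
        printf("i0=%d  sequential=%f  direct=%f\n", i0, theta_seq, theta_direct);
        theta_seq *= theta_scale;
    }
    return 0;
}
```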
@@ -1244,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int r2 = tgpig.z;
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
     const int ib_row = first_row * nb;
     const uint offset0 = r2/gqa*(nb*ne0);
     device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1,
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
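All the ggml-opencl.cpp changes are one rename: the backend's cl_mem handle moves from tensor->data to tensor->extra, leaving data to always mean host memory, consistent with the other GPU backends. A sketch of the convention, not part of the diff (toy_tensor is a hypothetical stand-in for ggml_tensor, and OpenCL headers are assumed to be installed):

```cpp
#include <CL/cl.h>

// Toy stand-in for ggml_tensor, showing the 0.5.x convention: host memory in
// data, backend-specific handles (here a cl_mem) in extra.
struct toy_tensor {
    void * data;   // host buffer, or NULL if the tensor lives on the device
    void * extra;  // backend handle; the OpenCL backend stores its cl_mem here
};

static void free_device_buffer(toy_tensor * t) {
    cl_mem mem = (cl_mem) t->extra;  // before this diff: (cl_mem) t->data
    clReleaseMemObject(mem);
    t->extra = nullptr;
}
```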