llama_cpp 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
@@ -25,9 +25,9 @@ typedef struct {
|
|
25
25
|
} block_q8_0;
|
26
26
|
|
27
27
|
kernel void kernel_add(
|
28
|
-
device const
|
29
|
-
device const
|
30
|
-
device
|
28
|
+
device const float4 * src0,
|
29
|
+
device const float4 * src1,
|
30
|
+
device float4 * dst,
|
31
31
|
uint tpig[[thread_position_in_grid]]) {
|
32
32
|
dst[tpig] = src0[tpig] + src1[tpig];
|
33
33
|
}
|
@@ -35,18 +35,18 @@ kernel void kernel_add(
|
|
35
35
|
// assumption: src1 is a row
|
36
36
|
// broadcast src1 into src0
|
37
37
|
kernel void kernel_add_row(
|
38
|
-
device const
|
39
|
-
device const
|
40
|
-
device
|
41
|
-
constant int64_t &
|
38
|
+
device const float4 * src0,
|
39
|
+
device const float4 * src1,
|
40
|
+
device float4 * dst,
|
41
|
+
constant int64_t & nb,
|
42
42
|
uint tpig[[thread_position_in_grid]]) {
|
43
|
-
dst[tpig] = src0[tpig] + src1[tpig %
|
43
|
+
dst[tpig] = src0[tpig] + src1[tpig % nb];
|
44
44
|
}
|
45
45
|
|
46
46
|
kernel void kernel_mul(
|
47
|
-
device const
|
48
|
-
device const
|
49
|
-
device
|
47
|
+
device const float4 * src0,
|
48
|
+
device const float4 * src1,
|
49
|
+
device float4 * dst,
|
50
50
|
uint tpig[[thread_position_in_grid]]) {
|
51
51
|
dst[tpig] = src0[tpig] * src1[tpig];
|
52
52
|
}
|
@@ -54,12 +54,12 @@ kernel void kernel_mul(
|
|
54
54
|
// assumption: src1 is a row
|
55
55
|
// broadcast src1 into src0
|
56
56
|
kernel void kernel_mul_row(
|
57
|
-
device const
|
58
|
-
device const
|
59
|
-
device
|
60
|
-
constant
|
57
|
+
device const float4 * src0,
|
58
|
+
device const float4 * src1,
|
59
|
+
device float4 * dst,
|
60
|
+
constant int64_t & nb,
|
61
61
|
uint tpig[[thread_position_in_grid]]) {
|
62
|
-
dst[tpig] = src0[tpig] * src1[tpig %
|
62
|
+
dst[tpig] = src0[tpig] * src1[tpig % nb];
|
63
63
|
}
|
64
64
|
|
65
65
|
kernel void kernel_scale(
|
@@ -133,19 +133,24 @@ kernel void kernel_soft_max(
|
|
133
133
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
134
134
|
}
|
135
135
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
136
|
+
//// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
|
137
|
+
// the loop, and when that is done, buf[0] has the correct (synchronized) value
|
138
|
+
//if (tpitg[0] == 0) {
|
139
|
+
// buf[0] = buf[0];
|
140
|
+
//}
|
140
141
|
|
141
|
-
threadgroup_barrier(mem_flags::mem_threadgroup);
|
142
|
+
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
142
143
|
|
143
144
|
const float max = buf[0];
|
144
145
|
|
145
146
|
// parallel sum
|
146
147
|
buf[tpitg[0]] = 0.0f;
|
147
148
|
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
148
|
-
|
149
|
+
const float exp_psrc0 = exp(psrc0[i00] - max);
|
150
|
+
buf[tpitg[0]] += exp_psrc0;
|
151
|
+
// Remember the result of exp here. exp is expensive, so we really do not
|
152
|
+
// wish to compute it twice.
|
153
|
+
pdst[i00] = exp_psrc0;
|
149
154
|
}
|
150
155
|
|
151
156
|
// reduce
|
@@ -157,17 +162,18 @@ kernel void kernel_soft_max(
|
|
157
162
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
158
163
|
}
|
159
164
|
|
160
|
-
// broadcast
|
161
|
-
|
162
|
-
|
163
|
-
|
165
|
+
// broadcast - not needed, see above
|
166
|
+
//// broadcast
|
167
|
+
//if (tpitg[0] == 0) {
|
168
|
+
// buf[0] = buf[0];
|
169
|
+
//}
|
164
170
|
|
165
|
-
threadgroup_barrier(mem_flags::mem_threadgroup);
|
171
|
+
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
166
172
|
|
167
173
|
const float sum = buf[0];
|
168
174
|
|
169
175
|
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
170
|
-
pdst[i00]
|
176
|
+
pdst[i00] /= sum;
|
171
177
|
}
|
172
178
|
}
|
173
179
|
|
@@ -214,25 +220,17 @@ kernel void kernel_norm(
|
|
214
220
|
}
|
215
221
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
216
222
|
}
|
217
|
-
|
218
|
-
if (tpitg == 0) {
|
219
|
-
sum[0] /= ne00;
|
220
|
-
}
|
221
|
-
threadgroup_barrier(mem_flags::mem_threadgroup);
|
222
|
-
const float mean = sum[0];
|
223
|
+
const float mean = sum[0] / ne00;
|
223
224
|
|
224
|
-
// recenter
|
225
|
+
// recenter and VARIANCE
|
226
|
+
threadgroup_barrier(mem_flags::mem_threadgroup);
|
225
227
|
device float * y = dst + tgpig*ne00;
|
226
|
-
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
227
|
-
y[i00] = x[i00] - mean;
|
228
|
-
}
|
229
|
-
|
230
|
-
// VARIANCE
|
231
|
-
// parallel sum
|
232
228
|
sum[tpitg] = 0.0f;
|
233
229
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
230
|
+
y[i00] = x[i00] - mean;
|
234
231
|
sum[tpitg] += y[i00] * y[i00];
|
235
232
|
}
|
233
|
+
|
236
234
|
// reduce
|
237
235
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
238
236
|
for (uint i = ntg/2; i > 0; i /= 2) {
|
@@ -241,12 +239,7 @@ kernel void kernel_norm(
|
|
241
239
|
}
|
242
240
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
243
241
|
}
|
244
|
-
|
245
|
-
if (tpitg == 0) {
|
246
|
-
sum[0] /= ne00;
|
247
|
-
}
|
248
|
-
threadgroup_barrier(mem_flags::mem_threadgroup);
|
249
|
-
const float variance = sum[0];
|
242
|
+
const float variance = sum[0] / ne00;
|
250
243
|
|
251
244
|
const float scale = 1.0f/sqrt(variance + eps);
|
252
245
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
@@ -254,7 +247,6 @@ kernel void kernel_norm(
|
|
254
247
|
}
|
255
248
|
}
|
256
249
|
|
257
|
-
|
258
250
|
kernel void kernel_rms_norm(
|
259
251
|
device const void * src0,
|
260
252
|
device float * dst,
|
@@ -435,6 +427,8 @@ kernel void kernel_mul_mat_q4_1_f32(
|
|
435
427
|
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
|
436
428
|
}
|
437
429
|
|
430
|
+
#define NB_Q8_0 8
|
431
|
+
|
438
432
|
kernel void kernel_mul_mat_q8_0_f32(
|
439
433
|
device const void * src0,
|
440
434
|
device const float * src1,
|
@@ -463,30 +457,30 @@ kernel void kernel_mul_mat_q8_0_f32(
|
|
463
457
|
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
|
464
458
|
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
465
459
|
|
466
|
-
float yl[
|
460
|
+
float yl[NB_Q8_0];
|
467
461
|
float sumf[nr]={0.f};
|
468
462
|
|
469
|
-
const int ix = tiisg/
|
470
|
-
const int il = tiisg%
|
463
|
+
const int ix = tiisg/4;
|
464
|
+
const int il = tiisg%4;
|
471
465
|
|
472
|
-
device const float * yb = y + ix * QK8_0 +
|
466
|
+
device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
|
473
467
|
|
474
|
-
// each thread in a SIMD group deals with
|
475
|
-
for (int ib = ix; ib < nb; ib += nw/
|
476
|
-
for (int i = 0; i <
|
468
|
+
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
|
469
|
+
for (int ib = ix; ib < nb; ib += nw/4) {
|
470
|
+
for (int i = 0; i < NB_Q8_0; ++i) {
|
477
471
|
yl[i] = yb[i];
|
478
472
|
}
|
479
473
|
|
480
474
|
for (int row = 0; row < nr; row++) {
|
481
|
-
device const int8_t * qs = x[ib+row*nb].qs +
|
475
|
+
device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
|
482
476
|
float sumq = 0.f;
|
483
|
-
for (int iq = 0; iq <
|
477
|
+
for (int iq = 0; iq < NB_Q8_0; ++iq) {
|
484
478
|
sumq += qs[iq] * yl[iq];
|
485
479
|
}
|
486
480
|
sumf[row] += sumq*x[ib+row*nb].d;
|
487
481
|
}
|
488
482
|
|
489
|
-
yb +=
|
483
|
+
yb += NB_Q8_0 * nw;
|
490
484
|
}
|
491
485
|
|
492
486
|
for (int row = 0; row < nr; ++row) {
|
@@ -497,7 +491,7 @@ kernel void kernel_mul_mat_q8_0_f32(
|
|
497
491
|
}
|
498
492
|
}
|
499
493
|
|
500
|
-
kernel void
|
494
|
+
kernel void kernel_mul_mat_f16_f32_1row(
|
501
495
|
device const char * src0,
|
502
496
|
device const char * src1,
|
503
497
|
device float * dst,
|
@@ -515,11 +509,8 @@ kernel void kernel_mul_mat_f16_f32(
|
|
515
509
|
constant uint64_t & nb12,
|
516
510
|
constant int64_t & ne0,
|
517
511
|
constant int64_t & ne1,
|
518
|
-
threadgroup float * sum [[threadgroup(0)]],
|
519
512
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
520
|
-
|
521
|
-
uint3 tpitg[[thread_position_in_threadgroup]],
|
522
|
-
uint3 tptg[[threads_per_threadgroup]]) {
|
513
|
+
uint tiisg[[thread_index_in_simdgroup]]) {
|
523
514
|
|
524
515
|
const int64_t r0 = tgpig.x;
|
525
516
|
const int64_t r1 = tgpig.y;
|
@@ -528,23 +519,100 @@ kernel void kernel_mul_mat_f16_f32(
|
|
528
519
|
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
529
520
|
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
530
521
|
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
522
|
+
float sumf = 0;
|
523
|
+
if (ne00 < 128) {
|
524
|
+
for (int i = tiisg; i < ne00; i += 32) {
|
525
|
+
sumf += (float) x[i] * (float) y[i];
|
526
|
+
}
|
527
|
+
float all_sum = simd_sum(sumf);
|
528
|
+
if (tiisg == 0) {
|
529
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
530
|
+
}
|
531
|
+
} else {
|
532
|
+
device const half4 * x4 = (device const half4 *) x;
|
533
|
+
device const float4 * y4 = (device const float4 *) y;
|
534
|
+
for (int i = tiisg; i < ne00/4; i += 32) {
|
535
|
+
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
|
536
|
+
}
|
537
|
+
float all_sum = simd_sum(sumf);
|
538
|
+
if (tiisg == 0) {
|
539
|
+
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
540
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
541
|
+
}
|
535
542
|
}
|
536
543
|
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
544
|
+
}
|
545
|
+
|
546
|
+
#define N_F16_F32 4
|
547
|
+
|
548
|
+
kernel void kernel_mul_mat_f16_f32(
|
549
|
+
device const char * src0,
|
550
|
+
device const char * src1,
|
551
|
+
device float * dst,
|
552
|
+
constant int64_t & ne00,
|
553
|
+
constant int64_t & ne01,
|
554
|
+
constant int64_t & ne02,
|
555
|
+
constant uint64_t & nb00,
|
556
|
+
constant uint64_t & nb01,
|
557
|
+
constant uint64_t & nb02,
|
558
|
+
constant int64_t & ne10,
|
559
|
+
constant int64_t & ne11,
|
560
|
+
constant int64_t & ne12,
|
561
|
+
constant uint64_t & nb10,
|
562
|
+
constant uint64_t & nb11,
|
563
|
+
constant uint64_t & nb12,
|
564
|
+
constant int64_t & ne0,
|
565
|
+
constant int64_t & ne1,
|
566
|
+
uint3 tgpig[[threadgroup_position_in_grid]],
|
567
|
+
uint tiisg[[thread_index_in_simdgroup]]) {
|
568
|
+
|
569
|
+
const int64_t r0 = tgpig.x;
|
570
|
+
const int64_t rb = tgpig.y*N_F16_F32;
|
571
|
+
const int64_t im = tgpig.z;
|
572
|
+
|
573
|
+
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
574
|
+
|
575
|
+
if (ne00 < 128) {
|
576
|
+
for (int row = 0; row < N_F16_F32; ++row) {
|
577
|
+
int r1 = rb + row;
|
578
|
+
if (r1 >= ne11) {
|
579
|
+
break;
|
580
|
+
}
|
581
|
+
|
582
|
+
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
583
|
+
|
584
|
+
float sumf = 0;
|
585
|
+
for (int i = tiisg; i < ne00; i += 32) {
|
586
|
+
sumf += (float) x[i] * (float) y[i];
|
587
|
+
}
|
588
|
+
|
589
|
+
float all_sum = simd_sum(sumf);
|
590
|
+
if (tiisg == 0) {
|
591
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
592
|
+
}
|
542
593
|
}
|
543
|
-
|
544
|
-
|
594
|
+
} else {
|
595
|
+
device const half4 * x4 = (device const half4 *)x;
|
596
|
+
for (int row = 0; row < N_F16_F32; ++row) {
|
597
|
+
int r1 = rb + row;
|
598
|
+
if (r1 >= ne11) {
|
599
|
+
break;
|
600
|
+
}
|
601
|
+
|
602
|
+
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
603
|
+
device const float4 * y4 = (device const float4 *) y;
|
604
|
+
|
605
|
+
float sumf = 0;
|
606
|
+
for (int i = tiisg; i < ne00/4; i += 32) {
|
607
|
+
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
608
|
+
}
|
545
609
|
|
546
|
-
|
547
|
-
|
610
|
+
float all_sum = simd_sum(sumf);
|
611
|
+
if (tiisg == 0) {
|
612
|
+
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
613
|
+
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
614
|
+
}
|
615
|
+
}
|
548
616
|
}
|
549
617
|
}
|
550
618
|
|
@@ -614,25 +682,27 @@ kernel void kernel_rope(
|
|
614
682
|
constant int & mode,
|
615
683
|
constant float & freq_base,
|
616
684
|
constant float & freq_scale,
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
const int64_t
|
685
|
+
uint tiitg[[thread_index_in_threadgroup]],
|
686
|
+
uint3 tptg[[threads_per_threadgroup]],
|
687
|
+
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
688
|
+
const int64_t i3 = tgpig[2];
|
689
|
+
const int64_t i2 = tgpig[1];
|
690
|
+
const int64_t i1 = tgpig[0];
|
621
691
|
|
622
692
|
const bool is_neox = mode & 2;
|
623
|
-
const float theta_scale = pow(freq_base, -2.0f/n_dims);
|
624
693
|
|
625
694
|
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
626
695
|
|
627
|
-
float
|
696
|
+
const float theta_0 = freq_scale * (float)p;
|
697
|
+
const float inv_ndims = -1.f/n_dims;
|
628
698
|
|
629
699
|
if (!is_neox) {
|
630
|
-
for (int64_t i0 =
|
700
|
+
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
|
701
|
+
|
702
|
+
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
|
631
703
|
const float cos_theta = cos(theta);
|
632
704
|
const float sin_theta = sin(theta);
|
633
705
|
|
634
|
-
theta *= theta_scale;
|
635
|
-
|
636
706
|
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
637
707
|
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
638
708
|
|
@@ -644,12 +714,12 @@ kernel void kernel_rope(
|
|
644
714
|
}
|
645
715
|
} else {
|
646
716
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
647
|
-
for (int64_t ic =
|
717
|
+
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
|
718
|
+
|
719
|
+
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
|
648
720
|
const float cos_theta = cos(theta);
|
649
721
|
const float sin_theta = sin(theta);
|
650
722
|
|
651
|
-
theta *= theta_scale;
|
652
|
-
|
653
723
|
const int64_t i0 = ib*n_dims + ic/2;
|
654
724
|
|
655
725
|
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
@@ -1244,7 +1314,8 @@ kernel void kernel_mul_mat_q4_K_f32(
|
|
1244
1314
|
const int r0 = tgpig.x;
|
1245
1315
|
const int r1 = tgpig.y;
|
1246
1316
|
const int r2 = tgpig.z;
|
1247
|
-
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
1317
|
+
//const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
1318
|
+
const int first_row = r0 * N_DST;
|
1248
1319
|
const int ib_row = first_row * nb;
|
1249
1320
|
const uint offset0 = r2/gqa*(nb*ne0);
|
1250
1321
|
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
|
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
|
|
1334
1334
|
return;
|
1335
1335
|
}
|
1336
1336
|
|
1337
|
-
cl_mem mem = (cl_mem)tensor->
|
1337
|
+
cl_mem mem = (cl_mem)tensor->extra;
|
1338
1338
|
clReleaseMemObject(mem);
|
1339
1339
|
}
|
1340
1340
|
|
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
1393
1393
|
size_t d_size;
|
1394
1394
|
|
1395
1395
|
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
|
1396
|
-
cl_mem d_Y = (cl_mem) src1->
|
1396
|
+
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
|
1397
1397
|
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
|
1398
1398
|
|
1399
1399
|
|
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1491
1491
|
size_t d_size;
|
1492
1492
|
cl_mem d_X;
|
1493
1493
|
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
1494
|
-
d_X = (cl_mem) src0->
|
1494
|
+
d_X = (cl_mem) src0->extra;
|
1495
1495
|
} else {
|
1496
|
-
d_X = ggml_cl_pool_malloc(sizeof(
|
1496
|
+
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
1497
1497
|
}
|
1498
1498
|
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
1499
1499
|
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1567
1567
|
size_t d_size;
|
1568
1568
|
cl_mem d_X;
|
1569
1569
|
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
1570
|
-
d_X = (cl_mem) src0->
|
1570
|
+
d_X = (cl_mem) src0->extra;
|
1571
1571
|
} else {
|
1572
1572
|
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
1573
1573
|
}
|
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1697
1697
|
events.emplace_back();
|
1698
1698
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
1699
1699
|
} else if (src0->backend == GGML_BACKEND_GPU) {
|
1700
|
-
d_Q = (cl_mem) src0->
|
1700
|
+
d_Q = (cl_mem) src0->extra;
|
1701
1701
|
} else {
|
1702
1702
|
GGML_ASSERT(false);
|
1703
1703
|
}
|
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
|
1860
1860
|
|
1861
1861
|
CL_CHECK(clFinish(queue));
|
1862
1862
|
|
1863
|
-
tensor->
|
1863
|
+
tensor->extra = dst;
|
1864
1864
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
1865
1865
|
}
|