llama_cpp 0.7.0 → 0.7.1

@@ -13,8 +13,8 @@ typedef struct {
 
 #define QK4_1 32
 typedef struct {
- half d; // delta
- half m; // min
+ half d; // delta
+ half m; // min
 uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 
@@ -132,6 +132,13 @@ kernel void kernel_relu(
 dst[tpig] = max(0.0f, src0[tpig]);
 }
 
+ kernel void kernel_sqr(
+ device const float * src0,
+ device float * dst,
+ uint tpig[[thread_position_in_grid]]) {
+ dst[tpig] = src0[tpig] * src0[tpig];
+ }
+
 constant float GELU_COEF_A = 0.044715f;
 constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
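Note: the kernel_sqr kernel added in the hunk above is a plain element-wise square. As a quick reference, a CPU equivalent in C++ might look like the sketch below (illustrative only; sqr_reference is a made-up name, not part of llama_cpp or ggml):

    #include <cstddef>

    // What kernel_sqr computes: dst[i] = src0[i] * src0[i] for every element;
    // on the GPU, each thread handles the single index given by tpig.
    // (sqr_reference is a hypothetical helper for illustration.)
    static void sqr_reference(const float * src0, float * dst, std::size_t n) {
        for (std::size_t i = 0; i < n; ++i) {
            dst[i] = src0[i] * src0[i];
        }
    }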
@@ -338,10 +345,11 @@ kernel void kernel_rms_norm(
 uint sgitg[[simdgroup_index_in_threadgroup]],
 uint tiisg[[thread_index_in_simdgroup]],
 uint ntg[[threads_per_threadgroup]]) {
- device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
- device const float * x_scalar = (device const float *) x;
- float4 sumf=0;
- float all_sum=0;
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+ device const float * x_scalar = (device const float *) x;
+
+ float4 sumf = 0;
+ float all_sum = 0;
 
 // parallel sum
 for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -354,6 +362,7 @@ kernel void kernel_rms_norm(
 }
 
 threadgroup_barrier(mem_flags::mem_threadgroup);
+
 // broadcast, simd group number is ntg / 32
 for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
 if (tpitg < i) {
@@ -361,7 +370,9 @@ kernel void kernel_rms_norm(
 }
 }
 if (tpitg == 0) {
- for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+ sum[0] += x_scalar[i];
+ }
 sum[0] /= ne00;
 }
 
@@ -376,7 +387,9 @@ kernel void kernel_rms_norm(
 y[i00] = x[i00] * scale;
 }
 if (tpitg == 0) {
- for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+ y_scalar[i00] = x_scalar[i00] * scale;
+ }
 }
 }
 
@@ -416,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
 }
 
 // putting them in the kernel cause a significant performance penalty
- #define N_DST 4 // each SIMD group works on 4 rows
- #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+ #define N_DST 4 // each SIMD group works on 4 rows
+ #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 // quantizations where the block size is 32. It also does not
@@ -428,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
 int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
 uint3 tgpig, uint tiisg, uint sgitg) {
 const int nb = ne00/QK4_0;
+
 const int r0 = tgpig.x;
 const int r1 = tgpig.y;
 const int im = tgpig.z;
+
 const int first_row = (r0 * nsg + sgitg) * nr;
+
 const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
+
 device const block_q_type * x = (device const block_q_type *) src0 + offset0;
 device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
- float yl[16]; // src1 vector cache
- float sumf[nr]={0.f};
 
- const int ix = tiisg/2;
- const int il = 8*(tiisg%2);
+ float yl[16]; // src1 vector cache
+ float sumf[nr] = {0.f};
+
+ const int ix = (tiisg/2);
+ const int il = (tiisg%2)*8;
 
 device const float * yb = y + ix * QK4_0 + il;
 
@@ -450,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
 sumy += yb[i] + yb[i+1];
 yl[i+0] = yb[i+ 0];
 yl[i+1] = yb[i+ 1]/256.f;
+
 sumy += yb[i+16] + yb[i+17];
 yl[i+8] = yb[i+16]/16.f;
 yl[i+9] = yb[i+17]/4096.f;
@@ -465,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
 for (int row = 0; row < nr; ++row) {
 const float tot = simd_sum(sumf[row]);
 if (tiisg == 0 && first_row + row < ne01) {
- dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+ dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
 }
 }
 }
 
- kernel void kernel_mul_mat_q4_0_f32(
+ kernel void kernel_mul_mv_q4_0_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -483,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
 constant int64_t & ne1[[buffer(16)]],
 constant uint & gqa[[buffer(17)]],
 uint3 tgpig[[threadgroup_position_in_grid]],
- uint tiisg[[thread_index_in_simdgroup]],
- uint sgitg[[simdgroup_index_in_threadgroup]]) {
+ uint tiisg[[thread_index_in_simdgroup]],
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
 mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
- kernel void kernel_mul_mat_q4_1_f32(
+ kernel void kernel_mul_mv_q4_1_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -508,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
 
 #define NB_Q8_0 8
 
- kernel void kernel_mul_mat_q8_0_f32(
+ kernel void kernel_mul_mv_q8_0_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -572,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
 
 #define N_F32_F32 4
 
- kernel void kernel_mul_mat_f32_f32(
+ kernel void kernel_mul_mv_f32_f32(
 device const char * src0,
 device const char * src1,
 device float * dst,
@@ -643,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
 }
 }
 
- kernel void kernel_mul_mat_f16_f32_1row(
+ kernel void kernel_mul_mv_f16_f32_1row(
 device const char * src0,
 device const char * src1,
 device float * dst,
@@ -662,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
 constant int64_t & ne0,
 constant int64_t & ne1,
 uint3 tgpig[[threadgroup_position_in_grid]],
- uint tiisg[[thread_index_in_simdgroup]]) {
+ uint tiisg[[thread_index_in_simdgroup]]) {
 
 const int64_t r0 = tgpig.x;
 const int64_t r1 = tgpig.y;
@@ -697,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
 
 #define N_F16_F32 4
 
- kernel void kernel_mul_mat_f16_f32(
+ kernel void kernel_mul_mv_f16_f32(
 device const char * src0,
 device const char * src1,
 device float * dst,
@@ -769,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 
 // Assumes row size (ne00) is a multiple of 4
- kernel void kernel_mul_mat_f16_f32_l4(
+ kernel void kernel_mul_mv_f16_f32_l4(
 device const char * src0,
 device const char * src1,
 device float * dst,
@@ -1098,6 +1117,62 @@ kernel void kernel_cpy_f32_f32(
 }
 }
 
+ kernel void kernel_concat(
+ device const char * src0,
+ device const char * src1,
+ device char * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant uint64_t & nb03,
+ constant int64_t & ne10,
+ constant int64_t & ne11,
+ constant int64_t & ne12,
+ constant int64_t & ne13,
+ constant uint64_t & nb10,
+ constant uint64_t & nb11,
+ constant uint64_t & nb12,
+ constant uint64_t & nb13,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant int64_t & ne2,
+ constant int64_t & ne3,
+ constant uint64_t & nb0,
+ constant uint64_t & nb1,
+ constant uint64_t & nb2,
+ constant uint64_t & nb3,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]],
+ uint3 ntg[[threads_per_threadgroup]]) {
+
+ const int64_t i03 = tgpig.z;
+ const int64_t i02 = tgpig.y;
+ const int64_t i01 = tgpig.x;
+
+ const int64_t i13 = i03 % ne13;
+ const int64_t i12 = i02 % ne12;
+ const int64_t i11 = i01 % ne11;
+
+ device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
+ device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+ device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
+
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+ if (i02 < ne02) {
+ ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
+ src0_ptr += ntg.x*nb00;
+ } else {
+ ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
+ src1_ptr += ntg.x*nb10;
+ }
+ dst_ptr += ntg.x*nb0;
+ }
+ }
+
 //============================================ k-quants ======================================================
 
 #ifndef QK_K
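Note: the kernel_concat kernel added in the hunk above fills each output element either from src0 (while the dim-2 index is still inside src0) or from src1 (otherwise), with src1's indices wrapped by the modulo expressions shown. A rough CPU mirror, assuming contiguous float tensors, src1 matching dst in dim 0, and dst matching src0 in dims 0, 1 and 3 (illustrative only; concat_reference is a made-up name, not part of llama_cpp or ggml):

    #include <cstdint>

    // Mirror of the index selection in kernel_concat for contiguous float data.
    // dst has shape (ne0, ne1, ne2, ne3); src0 supplies the first ne02 slices of
    // dim 2, src1 the rest, with src1 indices wrapped as in the Metal kernel.
    static void concat_reference(const float * src0, const float * src1, float * dst,
                                 int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, // dst shape
                                 int64_t ne02,                                       // src0 extent in dim 2
                                 int64_t ne11, int64_t ne12, int64_t ne13) {         // src1 dims 1..3 (dim 0 assumed == ne0)
        for (int64_t i3 = 0; i3 < ne3; ++i3) {
            for (int64_t i2 = 0; i2 < ne2; ++i2) {
                for (int64_t i1 = 0; i1 < ne1; ++i1) {
                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
                        const float v = (i2 < ne02)
                            ? src0[((i3*ne02 + i2)*ne1 + i1)*ne0 + i0]
                            : src1[(((i3 % ne13)*ne12 + i2 % ne12)*ne11 + i1 % ne11)*ne0 + i0];
                        dst[((i3*ne2 + i2)*ne1 + i1)*ne0 + i0] = v;
                    }
                }
            }
        }
    }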
@@ -1190,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
 
 //====================================== dot products =========================
 
- kernel void kernel_mul_mat_q2_K_f32(
+ kernel void kernel_mul_mv_q2_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -1334,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
 }
 
 #if QK_K == 256
- kernel void kernel_mul_mat_q3_K_f32(
+ kernel void kernel_mul_mv_q3_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -1486,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 }
 }
 #else
- kernel void kernel_mul_mat_q3_K_f32(
+ kernel void kernel_mul_mv_q3_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -1557,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 #endif
 
 #if QK_K == 256
- kernel void kernel_mul_mat_q4_K_f32(
+ kernel void kernel_mul_mv_q4_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -1663,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
 }
 }
 #else
- kernel void kernel_mul_mat_q4_K_f32(
+ kernel void kernel_mul_mv_q4_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -1752,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
 }
 #endif
 
- kernel void kernel_mul_mat_q5_K_f32(
+ kernel void kernel_mul_mv_q5_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -1925,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
 
 }
 
- kernel void kernel_mul_mat_q6_K_f32(
+ kernel void kernel_mul_mv_q6_K_f32(
 device const void * src0,
 device const float * src1,
 device float * dst,
@@ -2263,7 +2338,7 @@ kernel void kernel_get_rows(
 }
 
 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
- #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+ #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2300,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
 const uint r0 = tgpig.y;
 const uint r1 = tgpig.x;
 const uint im = tgpig.z;
+
 // if this block is of 64x32 shape or smaller
 short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
 short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
 // a thread shouldn't load data outside of the matrix
 short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
 short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@@ -2326,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0,
 + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
 for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
- //load data and store to threadgroup memory
+ // load data and store to threadgroup memory
 half4x4 temp_a;
 dequantize_func(x, il, temp_a);
 threadgroup_barrier(mem_flags::mem_threadgroup);
+
 #pragma unroll(16)
 for (int i = 0; i < 16; i++) {
 *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
- + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
- + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+ + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+ + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
 }
- *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
- = *((device float2x4 *)y);
+
+ *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
 il = (il + 2 < nl) ? il + 2 : il % 2;
 x = (il < 2) ? x + (2+nl-1)/nl : x;
 y += BLOCK_SIZE_K;
 
 threadgroup_barrier(mem_flags::mem_threadgroup);
- //load matrices from threadgroup memory and conduct outer products
+
+ // load matrices from threadgroup memory and conduct outer products
 threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
 threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
 #pragma unroll(4)
 for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
 #pragma unroll(4)
@@ -2360,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
 
 lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
 lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
 #pragma unroll(8)
 for (int i = 0; i < 8; i++){
 simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@@ -2368,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
 }
 
 if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
- device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
- + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
+ device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \
+ + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
 for (int i = 0; i < 8; i++) {
 simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
 }
 } else {
 // block is smaller than 64x32, we should avoid writing data outside of the matrix
 threadgroup_barrier(mem_flags::mem_threadgroup);
- threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
 + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
 for (int i = 0; i < 8; i++) {
 simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
 }
 
 threadgroup_barrier(mem_flags::mem_threadgroup);
- device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
- if (sgitg==0) {
+
+ device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+ if (sgitg == 0) {
 for (int i = 0; i < n_rows; i++) {
- for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
+ for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
 *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
 }
 }
@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
- #define CL_DMMV_BLOCK_SIZE 32
+ #define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
 const int row = get_group_id(0);
 
 const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
 __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
 const int row = get_group_id(0);
 
 const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
 __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
 const int row = get_group_id(0);
 const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
 const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
 const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
 const int row = get_group_id(0);
 const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
 const int tid = get_local_id(0)/2; // 0...15
 const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
 const int row = get_group_id(0);
 
 const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
 __global const struct block_q6_K * x = xx + ib0;
 
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
- const int block_size = get_local_size(0);
+ const int local_size = get_local_size(0);
 const int row = get_group_id(0);
 const int tid = get_local_id(0);
 
 const uint qk = QUANT_K;
 const uint qr = QUANT_R;
 
+ const int col_step = local_size * 2;
 const int y_offset = qr == 1 ? 1 : qk/2;
 
+ x += get_global_offset(0);
+
 tmp[tid] = 0;
 
- for (int i = 0; i < ncols/block_size; i += 2) {
- const int col = i*block_size + 2*tid;
+ for (int col = tid*2; col < ncols; col += col_step) {
 const int ib = (row*ncols + col)/qk; // block index
 const int iqs = (col%qk)/qr; // quant index
 const int iybs = col - col%qk; // y block start index
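Note: the hunk above replaces the fixed `ncols/block_size` iteration count with a stride loop over columns, so each work-item walks its columns directly against an explicit `col < ncols` bound; this pairs with the host-side `mul_mat_vec = ne11 == 1 && ne00%2 == 0` and global-work-offset changes further down in this diff. A small stand-alone C++ sketch of which starting columns each work-item now visits (local_size and ncols are arbitrary example values):

    #include <cstdio>

    // Column traversal of the updated dequant_mul_mat_vec template:
    // work-item `tid` in a work-group of `local_size` handles columns
    // tid*2, tid*2 + 2*local_size, tid*2 + 4*local_size, ... while < ncols.
    int main() {
        const int local_size = 32; // CL_DMMV_LOCAL_SIZE on the host side
        const int ncols      = 96; // example row width
        const int col_step   = local_size * 2;

        for (int tid = 0; tid < local_size; ++tid) {
            std::printf("tid %2d:", tid);
            for (int col = tid*2; col < ncols; col += col_step) {
                std::printf(" %d", col); // starting column handled in this step
            }
            std::printf("\n");
        }
        return 0;
    }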
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
 // sum up partial sums and write back result
 barrier(CLK_LOCAL_MEM_FENCE);
- for (int s=block_size/2; s>0; s>>=1) {
+ for (int s=local_size/2; s>0; s>>=1) {
 if (tid < s) {
 tmp[tid] += tmp[tid + s];
 }
@@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 const int nb2 = dst->nb[2];
 const int nb3 = dst->nb[3];
 const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
+ const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
 
 const int64_t r2 = ne12 / ne02;
 const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 GGML_ASSERT(to_fp32_cl != nullptr);
 
 const size_t global_denom = ggml_cl_global_denom(type);
- const size_t local = ggml_cl_local_size(type);
+ const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
 size_t ev_idx = 0;
 std::vector<cl_event> events;
@@ -1770,8 +1772,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
 // compute
- const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
- const size_t local = CL_DMMV_BLOCK_SIZE;
+ const size_t global = ne01 * local;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
 const cl_int ncols = ne00;
 events.emplace_back();
 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
 } else { // general dequantization kernel + CLBlast matrix matrix multiplication
 // convert src0 to fp32 on device
 const size_t global = x_ne / global_denom;