llama_cpp 0.15.2 → 0.15.3

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -3847,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
     }
 }
 
-static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+                item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
         return;
     }
     // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
 static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
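The rewritten kernel launches one work-item per destination element: a flat 1D index is decoded into four destination coordinates with division and modulo, each coordinate is mapped back to a source coordinate through its per-axis scale factor, and the source element is fetched via byte strides (nb00..nb03). A minimal standalone C++ sketch of the same index arithmetic, using plain loops instead of a SYCL launch and arbitrary example shapes:

    // Illustration only: mirrors the index math of the new upscale_f32.
    // ne1x are destination extents, sfx = dst_extent / src_extent,
    // nb0x are source byte strides. All values below are made up.
    #include <cstdio>
    #include <cstring>

    int main() {
        const int ne10 = 8, ne11 = 6, ne12 = 2, ne13 = 1;           // dst shape
        const float sf0 = 2.0f, sf1 = 2.0f, sf2 = 1.0f, sf3 = 1.0f; // dst/src ratios
        float src[4 * 3 * 2 * 1];                                   // 4x3x2x1 source
        for (int k = 0; k < 24; ++k) src[k] = (float)k;
        const int nb00 = sizeof(float), nb01 = 4 * nb00, nb02 = 3 * nb01, nb03 = 2 * nb02;

        for (int index = 0; index < ne10 * ne11 * ne12 * ne13; ++index) {
            // decode the 4D destination coordinates from the flat index
            const int i10 = index % ne10;
            const int i11 = (index / ne10) % ne11;
            const int i12 = (index / (ne10 * ne11)) % ne12;
            const int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
            // nearest-neighbour mapping back to the source (float division, truncated)
            const int i00 = (int)(i10 / sf0);
            const int i01 = (int)(i11 / sf1);
            const int i02 = (int)(i12 / sf2);
            const int i03 = (int)(i13 / sf3);
            float v;
            memcpy(&v, (const char *)src + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00, sizeof(v));
            printf("dst[%3d] <- src[%d][%d][%d][%d] = %g\n", index, i13, i12, i11, i10, v);
        }
        return 0;
    }

Because the kernel reads through byte strides rather than assuming a contiguous source, it also handles non-contiguous src0 layouts, and the float scale factors permit non-integer resize ratios.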
@@ -4191,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const block_q2_K * x = (const block_q2_K *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int n   = tid/32;
     const int l   = tid - 32*n;
     const int is  = 8*n + l/16;
@@ -4205,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }
 
 template<typename dst_t>
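The q2_K hunk above sets the pattern for most of the rest of this diff: upstream llama.cpp dropped support for the QK_K == 64 super-block size (the GGML_QKK_64 build option), so each kernel keeps only its QK_K == 256 body and the surrounding preprocessor scaffolding disappears. A schematic of the removed pattern (illustrative, not copied from any one kernel):

    // Before this release, many kernels carried two compile-time variants:
    #include <cstdio>

    #define QK_K 256   // after this release, 256 is the only supported value

    int main() {
    #if QK_K == 256
        printf("256-element super-block path (kept)\n");
    #else
        printf("64-element fallback path (deleted throughout this diff)\n");
    #endif
        return 0;
    }

The same deletion repeats below for q3_K through q6_K, the iq2/iq3/iq1/iq4 kernels, the dequantize_mul_mat_vec kernels, and the host-side launch wrappers.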
@@ -4226,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const int i = item_ct1.get_group(2);
     const block_q3_K * x = (const block_q3_K *) vx;
 
-#if QK_K == 256
     const int r = item_ct1.get_local_id(2) / 4;
     const int tid = r/2;
     const int is0 = r%2;
@@ -4250,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t * hm = x[i].hmask;
 
     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }
 
-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -4283,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
         m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif
 
 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4292,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 
     const int i = item_ct1.get_group(2);
 
-#if QK_K == 256
     // assume 32 threads
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/8;
@@ -4316,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }
 
 template<typename dst_t>
@@ -4334,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 
     const int i = item_ct1.get_group(2);
 
-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/16;   // il is in 0...3
@@ -4361,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;   // 0...3
-    const int in = tid%8;   // 0...7
-    const int is = tid/16;  // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }
 
 template<typename dst_t>
@@ -4381,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = item_ct1.get_group(2);
-#if QK_K == 256
 
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
@@ -4401,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/16;          // 0 or 1
-    const int il  = tid - 16*ip;     // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t ql = x[i].ql[16*ip + il];
-    const uint8_t qh = x[i].qh[il] >> (2*ip);
-    const int8_t * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql >>  4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }
 
 template<typename dst_t>
@@ -4432,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4443,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }
 
 template<typename dst_t>
@@ -4460,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4469,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4484,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq2_s * x = (const block_iq2_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4492,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 #pragma unroll
-    for (int j = 0; j < 8; ++j)
+    for (int j = 0; j < 8; ++j) {
         y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-
-#endif
-
+    }
 }
 
 template<typename dst_t>
@@ -4512,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4527,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4543,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq3_s * x = (const block_iq3_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4557,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4573,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_s * x = (const block_iq1_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4587,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4603,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_m * x = (const block_iq1_m *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4621,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4698,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
     const int ix =
@@ -4749,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;
 
     }
-#else
-    const int tid = item_ct1.get_local_id(2) /
-                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2) %
-                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float    * y = yy + i * QK_K + offset;
-        const uint8_t  * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const sycl::float2 dall =
-            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x() * sum1 - dall.y() * sum2;
-    }
-
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -4822,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;
 
@@ -4876,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
         tmp += d * sum;
 
     }
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -4938,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 
     const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5027,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 #endif
 
     }
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t  * q = x[i].qs + step;
-        const float    * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -5091,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5168,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
                dmin * smin;
     }
 
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5218,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
-#if QK_K == 256
-
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
     const int ix =
@@ -5276,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     }
 
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
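Each dequantize_mul_mat_vec kernel above ends at the same context lines: a butterfly reduction that folds the per-thread partial sums (tmp) across a 32-wide sub-group before one lane writes the result. The reduction body itself lies outside this diff; the following is a plain C++ simulation of the XOR-shuffle data movement such a loop relies on (the real kernels use SYCL sub-group shuffles, which this only models):

    #include <cstdio>

    int main() {
        // one partial sum per lane of a 32-wide sub-group (example data)
        float tmp[32];
        for (int lane = 0; lane < 32; ++lane) tmp[lane] = (float)(lane + 1);

        // butterfly reduction: after log2(32) = 5 steps, every lane holds the
        // full sum; tmp[lane ^ mask] models a shuffle-xor across lanes
        for (int mask = 16; mask > 0; mask >>= 1) {
            float shuffled[32];
            for (int lane = 0; lane < 32; ++lane) shuffled[lane] = tmp[lane ^ mask];
            for (int lane = 0; lane < 32; ++lane) tmp[lane] += shuffled[lane];
        }
        printf("sum = %g\n", tmp[0]); // 1 + 2 + ... + 32 = 528
        return 0;
    }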
@@ -6851,7 +6586,6 @@ static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
     int v[2];
@@ -6893,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
-    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }
 
 template <int mmq_y>
@@ -6997,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }
 
 #pragma unroll
@@ -7044,7 +6728,6 @@ static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
     int vl[2];
@@ -7086,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im   = step/8;      // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in   = step%8;      // 0, 4, 0, 4
-    const int vh   = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }
 
 template <int mmq_y>
@@ -7199,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }
 
 #pragma unroll
@@ -7381,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                      const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
 #if QR2_XXS == 8
@@ -7422,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
@@ -7434,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                     const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 
     const int ib32 = iqs;
@@ -7472,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
     const int ib32 = iqs;
@@ -7525,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
@@ -7536,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                      const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 
     const int ib32 = iqs;
@@ -7564,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq3s_grid) {
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 
     const int ib32 = iqs;
@@ -7603,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
         (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
         bq8_1[ib32].ds[0];
     return d * sumi;
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
     const int ib32 = iqs;
@@ -7631,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
     const float d = d1q * bq8_1[ib32].ds[0];
     const float m = d1q * bq8_1[ib32].ds[1];
     return d * sumi + m * delta;
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
 
     const int ib32 = iqs;
@@ -7664,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7714,7 +7322,6 @@ static __dpct_inline__ float
 vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#if QK_K == 256
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 
@@ -7732,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
         sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
     }
     return d * (sumi1 + sumi2);
-#else
-    assert(false);
-#endif
 }
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -10085,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
         });
 }
 
-static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, dpct::queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
        });
 }
 
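The host wrapper changes to match: instead of a 3D grid shaped (ne02, ne01 * scale_factor, blocks), it launches a flat 1D range sized to cover every destination element, padded up to a whole number of work-groups; the kernel's bounds check discards the padding work-items. The launch arithmetic, assuming the SYCL_UPSCALE_BLOCK_SIZE macro is 256 (its value is not shown in this diff):

    #include <cstdio>

    int main() {
        const int SYCL_UPSCALE_BLOCK_SIZE = 256;             // assumed value
        const int ne10 = 64, ne11 = 64, ne12 = 4, ne13 = 1;  // example dst shape

        const int dst_size   = ne10 * ne11 * ne12 * ne13;    // 16384
        const int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1)
                               / SYCL_UPSCALE_BLOCK_SIZE;    // 64

        printf("global range = %d, work-group size = %d\n",
               num_blocks * SYCL_UPSCALE_BLOCK_SIZE, SYCL_UPSCALE_BLOCK_SIZE);
        return 0;
    }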
@@ -10198,7 +9801,6 @@ template <typename dst_t>
 static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10210,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q2_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
 static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10242,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q3_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#endif
 }
 
 template <typename dst_t>
@@ -10315,7 +9889,6 @@ template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10327,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q5_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10359,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q6_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
@@ -10524,9 +10068,6 @@ template <typename dst_t>
 static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10541,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                 });
         });
     }
-#endif
 }
 
 
@@ -12046,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                         const int nrows_y, const int nrows_dst,
                                         dpct::queue_ptr stream) try {
 
-#if QK_K == 256
-
     int id;
     SYCL_CHECK(
        CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12162,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
             });
         }
     }
-#endif
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13985,15 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-#pragma message("TODO: generalize upscale operator")
-#pragma message(" https://github.com/ggerganov/ggml/pull/814")
-    GGML_ASSERT(false && "TODO: generalize upscale operator");
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
 
-    upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                     main_stream);
 
     (void) src1;
     (void) dst;
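With the TODO assertion gone, the operator derives one float scale factor per axis from the ratio of destination to source extents instead of reading a single integer from op_params, which also lifts the old 3D-only restriction. A worked example with hypothetical shapes:

    // sf[a] = (float)dst->ne[a] / src0->ne[a], per the code above
    #include <cstdio>

    int main() {
        const int src_ne[4] = {32, 32, 3, 1};   // hypothetical source shape
        const int dst_ne[4] = {64, 96, 3, 1};   // hypothetical destination shape
        for (int a = 0; a < 4; ++a) {
            printf("sf%d = %g\n", a, (float)dst_ne[a] / src_ne[a]);  // 2, 3, 1, 1
        }
        return 0;
    }

Non-integer ratios work as well, since the kernel divides each destination coordinate by a float per axis.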
@@ -14449,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);