llama_cpp 0.15.2 → 0.15.3

@@ -3847,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
     }
 }
 
-static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+                item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
         return;
     }
     // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }
 
 static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
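The rewritten kernel above generalizes upscaling from one integer factor on 3D tensors to independent float factors on all four dimensions: each work-item unflattens its destination index into (i10, i11, i12, i13), recovers the source coordinate by dividing by the per-dimension factor, and reads through the byte strides nb00..nb03, so non-contiguous sources work too. A standalone CPU sketch of the same address math (the helper name is ours, not from the patch):

    // CPU sketch of upscale_f32's nearest-neighbor address math (illustrative only).
    static float upscale_fetch(const float *x, int i10, int i11, int i12, int i13,
                               float sf0, float sf1, float sf2, float sf3,
                               int nb00, int nb01, int nb02, int nb03) {
        const int i00 = (int)(i10 / sf0);  // truncate back to the source coordinate
        const int i01 = (int)(i11 / sf1);
        const int i02 = (int)(i12 / sf2);
        const int i03 = (int)(i13 / sf3);
        return *(const float *)((const char *)x + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
    }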
@@ -4191,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const block_q2_K * x = (const block_q2_K *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int n = tid/32;
     const int l = tid - 32*n;
     const int is = 8*n + l/16;
@@ -4205,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16; // 0 or 1
-    const int il = tid%16; // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }
 
 template<typename dst_t>
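This hunk and most of the ones below make the same cut: the #if QK_K == 256 / #else fallbacks for 64-weight super-blocks are deleted, leaving the 256-weight path as the only one compiled. The surviving q2_K math is, per 2-bit weight, dall * scale * q - dmin * min, with the 4-bit scale and 4-bit minimum packed into one byte of x[i].scales. A scalar sketch of one slice (helper name ours, assuming <cstdint>):

    // One 32-value slice of q2_K dequantization in a 256-weight super-block
    // (illustrative sketch). sc packs scale (low nibble) and min (high nibble).
    static void dequant_q2k_slice(float dall, float dmin, const uint8_t *qs,
                                  uint8_t sc, int shift, float *y) {
        for (int l = 0; l < 32; ++l) {
            const int q = (qs[l] >> shift) & 3;  // 2-bit quantized weight
            y[l] = dall * (sc & 0xF) * q - dmin * (sc >> 4);
        }
    }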
@@ -4226,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const int i = item_ct1.get_group(2);
     const block_q3_K * x = (const block_q3_K *) vx;
 
-#if QK_K == 256
     const int r = item_ct1.get_local_id(2) / 4;
     const int tid = r/2;
     const int is0 = r%2;
@@ -4250,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t * hm = x[i].hmask;
 
     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is = tid/16; // 0 or 1
-    const int il = tid%16; // 0...15
-    const int im = il/8;   // 0...1
-    const int in = il%8;   // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }
 
-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -4283,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
         m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif
 
 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4292,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 
     const int i = item_ct1.get_group(2);
 
-#if QK_K == 256
     // assume 32 threads
     const int tid = item_ct1.get_local_id(2);
     const int il = tid/8;
@@ -4316,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >> 4) - m2;
     }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
-#endif
 }
 
 template<typename dst_t>
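get_scale_min_k4, now compiled unconditionally, unpacks the 12-byte scale block shared by q4_K and q5_K: eight 6-bit scales and eight 6-bit mins. Entries 0-3 sit in the low six bits of bytes 0-3 (scales) and 4-7 (mins); entries 4-7 combine a nibble from bytes 8-11 with the two spare top bits of the earlier bytes. Spelled out for j = 4 (our illustration; the scale line for j >= 4 falls outside the hunk shown):

    // get_scale_min_k4 at j = 4 (illustrative expansion):
    uint8_t d4 = (q[8] & 0xF) | ((q[0] >> 6) << 4);  // low nibble of byte 8 + top 2 bits of byte 0
    uint8_t m4 = (q[8] >>  4) | ((q[4] >> 6) << 4);  // high nibble of byte 8 + top 2 bits of byte 4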
@@ -4334,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 
     const int i = item_ct1.get_group(2);
 
-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
     const int il = tid/16; // il is in 0...3
@@ -4361,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }
 
 template<typename dst_t>
@@ -4381,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     const block_q6_K * x = (const block_q6_K *) vx;
 
     const int i = item_ct1.get_group(2);
-#if QK_K == 256
 
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
@@ -4401,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int ip = tid/16;        // 0 or 1
-    const int il = tid - 16*ip;   // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t ql = x[i].ql[16*ip + il];
-    const uint8_t qh = x[i].qh[il] >> (2*ip);
-    const int8_t * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }
 
 template<typename dst_t>
@@ -4432,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4443,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }
 
 template<typename dst_t>
@@ -4460,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4469,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4484,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq2_s * x = (const block_iq2_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4492,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 #pragma unroll
-    for (int j = 0; j < 8; ++j)
+    for (int j = 0; j < 8; ++j) {
         y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-
-#endif
-
+    }
 }
 
 template<typename dst_t>
@@ -4512,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4527,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4543,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq3_s * x = (const block_iq3_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4557,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4573,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_s * x = (const block_iq1_s *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4587,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4603,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_m * x = (const block_iq1_m *) vx;
 
     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4621,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }
 
 template <typename dst_t>
@@ -4698,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
     const int ix =
@@ -4749,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;
 
     }
-#else
-    const int tid = item_ct1.get_local_id(2) /
-                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2) %
-                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const sycl::float2 dall =
-            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x() * sum1 - dall.y() * sum2;
-    }
-
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
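Every dequantize_mul_mat_vec_* kernel now has a single unconditional body that ends the same way: each of the 32 sub-group lanes holds a partial dot product in tmp, and the loop following this #pragma unroll (truncated in the hunk) folds the lanes together with XOR shuffles of stride 16, 8, 4, 2, 1. In plain SYCL 2020 the same reduction can be expressed in one call; a sketch, not the dpct helper the file actually uses:

    // Sub-group sum equivalent to the butterfly-shuffle loop (sketch only):
    float row_sum = sycl::reduce_over_group(item_ct1.get_sub_group(), tmp, sycl::plus<float>());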
@@ -4822,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;
 
@@ -4876,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
         tmp += d * sum;
 
     }
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
-    const int in = offset/8; // 0 or 1
-    const int im = offset%8; // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -4938,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 
     const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5027,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 #endif
 
     }
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
 
     // sum up partial sums and write back result
 #pragma unroll
@@ -5091,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
 
     float tmp = 0; // partial sum for thread in warp
 
-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5168,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
                dmin * smin;
     }
 
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t * s = x[i].scales;
-        const float * y = yy + i*QK_K + step;
-        const float d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5218,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
-#if QK_K == 256
-
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
     const int ix =
@@ -5276,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     }
 
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
-    const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float * y = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t * s = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -6851,7 +6586,6 @@ static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
     int v[2];
@@ -6893,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
-    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }
 
 template <int mmq_y>
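Both the deleted GGML_QKK_64 fallback and the surviving vmmq path build on dpct::dp4a, which treats its two 32-bit operands as four packed 8-bit lanes, multiplies them pairwise, and adds the four products into the accumulator. A scalar reference, assuming signed lanes (sketch only):

    // Scalar model of dpct::dp4a(a, b, acc) (illustrative):
    static int dp4a_ref(int a, int b, int acc) {
        for (int k = 0; k < 4; ++k) {
            acc += (int)(int8_t)(a >> (8 * k)) * (int)(int8_t)(b >> (8 * k));
        }
        return acc;
    }

Under that model, the deleted branch's dot3/dot4 idiom, dp4a(0x01010101, u, 0), is simply the sum of u's four bytes, used to fold the per-block minimum into the result.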
@@ -6997,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }
 
 #pragma unroll
@@ -7044,7 +6728,6 @@ static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
     int vl[2];
@@ -7086,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }
 
 template <int mmq_y>
@@ -7199,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }
 
 #pragma unroll
@@ -7381,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                      const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
 #if QR2_XXS == 8
@@ -7422,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
@@ -7434,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                    const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 
     const int ib32 = iqs;
@@ -7472,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
     const int ib32 = iqs;
@@ -7525,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
@@ -7536,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                     const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 
     const int ib32 = iqs;
@@ -7564,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq3s_grid) {
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 
     const int ib32 = iqs;
@@ -7603,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                     (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
                     bq8_1[ib32].ds[0];
     return d * sumi;
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
     const int ib32 = iqs;
@@ -7631,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
     const float d = d1q * bq8_1[ib32].ds[0];
     const float m = d1q * bq8_1[ib32].ds[1];
     return d * sumi + m * delta;
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ float
 vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
 
     const int ib32 = iqs;
@@ -7664,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    assert(false);
-#endif
 }
 
 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7714,7 +7322,6 @@ static __dpct_inline__ float
 vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
-#if QK_K == 256
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 
@@ -7732,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
         sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
     }
     return d * (sumi1 + sumi2);
-#else
-    assert(false);
-#endif
 }
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -10085,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
         });
 }
 
-static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, dpct::queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
         });
 }
 
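The launcher now flattens the whole destination tensor into a 1D range: one work-item per output element, with the block count rounded up by ceil division and the kernel's bounds check (index >= ne10 * ne11 * ne12 * ne13) discarding the padding items in the last block. The sizing idiom, isolated (the 256 is a stand-in for SYCL_UPSCALE_BLOCK_SIZE):

    // Ceil-division sizing used by upscale_f32_sycl (sketch):
    const int block_size = 256;
    const int dst_size   = ne10 * ne11 * ne12 * ne13;                // one work-item per element
    const int num_blocks = (dst_size + block_size - 1) / block_size; // round up so the tail fits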
@@ -10198,7 +9801,6 @@ template <typename dst_t>
 static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10210,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q2_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
 static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10242,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q3_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#endif
 }
 
 template <typename dst_t>
@@ -10315,7 +9889,6 @@ template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10327,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q5_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10359,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                  dequantize_block_q6_K(vx, y, item_ct1);
                              });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }
 
 template <typename dst_t>
@@ -10524,9 +10068,6 @@ template <typename dst_t>
 static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10541,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
             });
         });
     }
-#endif
 }
 
 
@@ -12046,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                         const int nrows_y, const int nrows_dst,
                                         dpct::queue_ptr stream) try {
 
-#if QK_K == 256
-
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12162,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
             });
         }
     }
-#endif
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13985,15 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-#pragma message("TODO: generalize upscale operator")
-#pragma message(" https://github.com/ggerganov/ggml/pull/814")
-    GGML_ASSERT(false && "TODO: generalize upscale operator");
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
 
-    upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                     main_stream);
 
     (void) src1;
     (void) dst;
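A quick example of the new factor computation (values illustrative): src0->ne = {16, 16, 3, 1} with dst->ne = {32, 32, 3, 1} yields sf0 = sf1 = 2.0f and sf2 = sf3 = 1.0f, a 2x nearest-neighbor upscale of each plane. Because the factors are float ratios of the actual shapes rather than a single integer op parameter, per-dimension and non-integer scaling fall out naturally, which is what lets the hard GGML_ASSERT(false) stub above be removed.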
@@ -14449,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);