llama_cpp 0.15.1 → 0.15.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
  #define SYCL_SCALE_BLOCK_SIZE 256
  #define SYCL_CLAMP_BLOCK_SIZE 256
  #define SYCL_ROPE_BLOCK_SIZE 256
- #define SYCL_ALIBI_BLOCK_SIZE 32
  #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
  #define SYCL_QUANTIZE_BLOCK_SIZE 256
  #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
  }
  }

- static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor,
- const sycl::nd_item<3> &item_ct1) {
- int ne0 = ne00 * scale_factor;
- int nidx = item_ct1.get_local_id(2) +
- item_ct1.get_group(2) * item_ct1.get_local_range(2);
- if (nidx >= ne0) {
+ static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+ const int nb02, const int nb03, const int ne10, const int ne11,
+ const int ne12, const int ne13, const float sf0, const float sf1,
+ const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+ int index = item_ct1.get_local_id(0) +
+ item_ct1.get_group(0) * item_ct1.get_local_range(0);
+ if (index >= ne10 * ne11 * ne12 * ne13) {
  return;
  }
  // operation
- int i00 = nidx / scale_factor;
- int i01 = item_ct1.get_group(1) / scale_factor;
- int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
- int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
- item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
- dst[offset_dst] = x[offset_src];
+ int i10 = index % ne10;
+ int i11 = (index / ne10) % ne11;
+ int i12 = (index / (ne10 * ne11)) % ne12;
+ int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+ int i00 = i10 / sf0;
+ int i01 = i11 / sf1;
+ int i02 = i12 / sf2;
+ int i03 = i13 / sf3;
+
+ dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
  }

  static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
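The rewritten upscale_f32 kernel above replaces the single integer scale factor with four independent, possibly fractional factors and flattens the destination tensor into one linear index; each output element is then mapped back to its nearest-neighbor source element through the source byte strides nb00..nb03. A minimal host-side C++ sketch of the same index math (upscale_f32_ref is a hypothetical helper for illustration, not part of the package):

    // Reference nearest-neighbor 4D upscale mirroring the kernel's indexing.
    // ne1*: destination extents, nb0*: source strides in bytes,
    // sf* = destination extent / source extent per dimension.
    static void upscale_f32_ref(const float *x, float *dst,
                                int nb00, int nb01, int nb02, int nb03,
                                int ne10, int ne11, int ne12, int ne13,
                                float sf0, float sf1, float sf2, float sf3) {
        for (int index = 0; index < ne10 * ne11 * ne12 * ne13; ++index) {
            const int i10 = index % ne10;
            const int i11 = (index / ne10) % ne11;
            const int i12 = (index / (ne10 * ne11)) % ne12;
            const int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
            // float division truncated back to int picks the nearest-neighbor source element
            const int i00 = (int)(i10 / sf0);
            const int i01 = (int)(i11 / sf1);
            const int i02 = (int)(i12 / sf2);
            const int i03 = (int)(i13 / sf3);
            dst[index] = *(const float *)((const char *)x +
                             i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
        }
    }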
@@ -4192,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
  const block_q2_K * x = (const block_q2_K *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int n = tid/32;
  const int l = tid - 32*n;
  const int is = 8*n + l/16;
@@ -4206,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
- #else
- const int is = tid/16; // 0 or 1
- const int il = tid%16; // 0...15
- const uint8_t q = x[i].qs[il] >> (2*is);
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- float dall = x[i].dm[0];
- float dmin = x[i].dm[1];
- y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
- y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
- #endif
-
  }

  template<typename dst_t>
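This hunk and the similar ones that follow (q3_K through q6_K and the iq* kernels) delete the `#if QK_K == 256 ... #else ... #endif` split, keeping only the 256-element super-block path; the alternative QK_K == 64 build variant was dropped. For orientation, the retained q2_K formula unpacks each scale byte into a 4-bit scale (low nibble) and a 4-bit minimum (high nibble); a standalone C++ sketch of the per-element math (dequant_q2_K_one is a hypothetical helper, not in the source):

    #include <cstdint>

    // One q2_K output element, as in the kept QK_K == 256 branch:
    // value = dall * scale * q2 - dmin * min, with scale and min packed in one byte.
    static inline float dequant_q2_K_one(float dall, float dmin,
                                         uint8_t scale_byte, uint8_t q, int shift) {
        const int q2 = (q >> shift) & 3;            // 2-bit quant, 0..3
        return dall * (scale_byte & 0xF) * q2       // scaled value
             - dmin * (scale_byte >> 4);            // minus scaled minimum
    }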
@@ -4227,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
  const int i = item_ct1.get_group(2);
  const block_q3_K * x = (const block_q3_K *) vx;

- #if QK_K == 256
  const int r = item_ct1.get_local_id(2) / 4;
  const int tid = r/2;
  const int is0 = r%2;
@@ -4251,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
  const uint8_t * hm = x[i].hmask;

  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
- #else
- const int tid = item_ct1.get_local_id(2);
- const int is = tid/16; // 0 or 1
- const int il = tid%16; // 0...15
- const int im = il/8; // 0...1
- const int in = il%8; // 0...7
-
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- const uint8_t q = x[i].qs[il] >> (2*is);
- const uint8_t h = x[i].hmask[in] >> (2*is + im);
- const float d = (float)x[i].d;
-
- if (is == 0) {
- y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- } else {
- y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
- y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- }
- #endif
-
  }

- #if QK_K == 256
  static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
  if (j < 4) {
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -4284,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
  m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
  }
  }
- #endif

  template<typename dst_t>
  static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4293,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri

  const int i = item_ct1.get_group(2);

- #if QK_K == 256
  // assume 32 threads
  const int tid = item_ct1.get_local_id(2);
  const int il = tid/8;
@@ -4317,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
  y[l + 0] = d1 * (q[l] & 0xF) - m1;
  y[l +32] = d2 * (q[l] >> 4) - m2;
  }
- #else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t * q = x[i].qs;
- dst_t * y = yy + i*QK_K;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
- y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
- #endif
  }

  template<typename dst_t>
@@ -4335,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri

  const int i = item_ct1.get_group(2);

- #if QK_K == 256
  // assume 64 threads - this is very slightly better than the one below
  const int tid = item_ct1.get_local_id(2);
  const int il = tid/16; // il is in 0...3
@@ -4362,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
  hm <<= 1;
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
- #else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t q = x[i].qs[tid];
- const int im = tid/8; // 0...3
- const int in = tid%8; // 0...7
- const int is = tid/16; // 0 or 1
- const uint8_t h = x[i].qh[in] >> im;
- const float d = x[i].d;
- dst_t * y = yy + i*QK_K + tid;
- y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
- y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
- #endif
  }

  template<typename dst_t>
@@ -4382,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
  const block_q6_K * x = (const block_q6_K *) vx;

  const int i = item_ct1.get_group(2);
- #if QK_K == 256

  // assume 64 threads - this is very slightly better than the one below
  const int tid = item_ct1.get_local_id(2);
@@ -4402,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
- #else
-
- // assume 32 threads
- const int tid = item_ct1.get_local_id(2);
- const int ip = tid/16; // 0 or 1
- const int il = tid - 16*ip; // 0...15
-
- dst_t * y = yy + i*QK_K + 16*ip + il;
-
- const float d = x[i].d;
-
- const uint8_t ql = x[i].ql[16*ip + il];
- const uint8_t qh = x[i].qh[il] >> (2*ip);
- const int8_t * sc = x[i].scales;
-
- y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
- y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
- #endif
  }

  template<typename dst_t>
@@ -4433,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
  const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4444,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
  const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
- #else
- assert(false);
- #endif
-
  }

  template<typename dst_t>
@@ -4461,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
  const block_iq2_xs * x = (const block_iq2_xs *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4470,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4485,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq2_s * x = (const block_iq2_s *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4493,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
  #pragma unroll
- for (int j = 0; j < 8; ++j)
+ for (int j = 0; j < 8; ++j) {
  y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- assert(false);
-
- #endif
-
+ }
  }

  template<typename dst_t>
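Note that besides deleting the dead `#else assert(false);` branch, the iq2_s hunk above also adds braces around the body of the `#pragma unroll` loop: with the preprocessor branch that used to sit between the loop and the following code gone, the explicit `{ }` keeps the single-statement body unambiguous.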
@@ -4513,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
  const block_iq3_xxs * x = (const block_iq3_xxs *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4528,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4544,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq3_s * x = (const block_iq3_s *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4558,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4574,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq1_s * x = (const block_iq1_s *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4588,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  for (int j = 0; j < 8; ++j) {
  y[j] = d * (q[j] + delta);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4604,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq1_m * x = (const block_iq1_m *) vx;

  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4622,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
  for (int j = 0; j < 8; ++j) {
  y[j] = d * (q[j] + delta);
  }
- #else
- assert(false);
- #endif
-
  }

  template <typename dst_t>
@@ -4699,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,

  float tmp = 0; // partial sum for thread in warp

- #if QK_K == 256
  const int tid =
  item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  const int ix =
@@ -4750,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
  tmp += dall * sum1 - dmin * sum2;

  }
- #else
- const int tid = item_ct1.get_local_id(2) /
- (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2) %
- (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION;
-
- uint32_t uaux[2];
- const uint8_t * d = (const uint8_t *)uaux;
-
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint32_t * s = (const uint32_t *)x[i].scales;
-
- uaux[0] = s[0] & 0x0f0f0f0f;
- uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
- const sycl::float2 dall =
- x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
- float sum1 = 0, sum2 = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t ql = q[l];
- sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
- + y[l+16] * d[1] * ((ql >> 2) & 3)
- + y[l+32] * d[2] * ((ql >> 4) & 3)
- + y[l+48] * d[3] * ((ql >> 6) & 3);
- sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
- }
- tmp += dall.x() * sum1 - dall.y() * sum2;
- }
-
- #endif

  // sum up partial sums and write back result
  #pragma unroll
@@ -4823,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,

  float tmp = 0; // partial sum for thread in warp

- #if QK_K == 256
-
  const uint16_t kmask1 = 0x0303;
  const uint16_t kmask2 = 0x0f0f;

@@ -4877,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
  tmp += d * sum;

  }
- #else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
- const int in = offset/8; // 0 or 1
- const int im = offset%8; // 0...7
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + offset;
- const uint8_t * q = x[i].qs + offset;
- const uint8_t * s = x[i].scales;
-
- const float dall = (float)x[i].d;
-
- float sum = 0;
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
- const uint8_t hl = x[i].hmask[im+l] >> in;
- const uint8_t ql = q[l];
- sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
- + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
- + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
- + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
- }
- tmp += sum;
- }
- #endif

  // sum up partial sums and write back result
  #pragma unroll
@@ -4939,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,

  const block_q4_K * x = (const block_q4_K *)vx + ib0;

- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -5028,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
  #endif

  }
- #else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- float tmp = 0;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const float * y = yy + i*QK_K + step;
- const uint16_t * a = (const uint16_t *)x[i].scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
- + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
- + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
- + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
- }
- tmp += sum;
- }
-
- #endif

  // sum up partial sums and write back result
  #pragma unroll
@@ -5092,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,

  float tmp = 0; // partial sum for thread in warp

- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -5169,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
  dmin * smin;
  }

- #else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
- const int step = tid * K_QUANTS_PER_ITERATION;
- const int im = step/8;
- const int in = step%8;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
- const uint8_t * q = x[i].qs + step;
- const int8_t * s = x[i].scales;
- const float * y = yy + i*QK_K + step;
- const float d = x[i].d;
- float sum = 0.f;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- const uint8_t h = x[i].qh[in+j] >> im;
- sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
- + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
- + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
- + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
- }
- tmp += sum;
- }
- #endif
-
  // sum up partial sums and write back result
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5219,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

  const block_q6_K * x = (const block_q6_K *)vx + ib0;

- #if QK_K == 256
-
  const int tid =
  item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
  const int ix =
@@ -5277,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

  }

- #else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- float tmp = 0; // partial sum for thread in warp
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
- const float * y = yy + i * QK_K + step;
- const uint8_t * ql = x[i].ql + step;
- const uint8_t * qh = x[i].qh + step;
- const int8_t * s = x[i].scales;
-
- const float d = x[i+0].d;
-
- float sum = 0;
- for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
- sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
- + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
- + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
- + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
- }
- tmp += sum;
-
- }
-
- #endif
-
  // sum up partial sums and write back result
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -6852,7 +6586,6 @@ static __dpct_inline__ float
  vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

- #ifndef GGML_QKK_64
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;

  int v[2];
@@ -6894,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
  }

  return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
- #else
-
- #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
- float sumf_d = 0.0f;
- float sumf_m = 0.0f;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
-
- const float dall = bq4_K->dm[0];
- const float dmin = bq4_K->dm[1];
-
- const float d8_1 = bq8_1[0].ds[0];
- const float d8_2 = bq8_1[1].ds[1];
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * q4 = (const int *)bq4_K->qs + (iqs/2);
- const int v1 = q4[0];
- const int v2 = q4[4];
-
- const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
- const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
- const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
- const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
- return dall * sumf_d - dmin * sumf_m;
-
- #else
- bad_arch();
- #endif // __SYCL_ARCH__ >= VER_4VEC
-
- #endif
  }

  template <int mmq_y>
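The fallback paths deleted here (and in vec_dot_q5_K_q8_1 below) were the GGML_QKK_64 variants built on dpct::dp4a, a packed byte dot product. As a reference for reading the surviving kernels, its semantics for signed 8-bit lanes can be sketched in plain C++ (dp4a_ref is illustrative only; the real intrinsic maps to hardware DP4A where available, and unsigned-lane variants also exist):

    #include <cstdint>
    #include <cstring>

    // dp4a(a, b, c): treat a and b as four packed int8 lanes, multiply
    // lane-wise, and accumulate the four products into c.
    static inline int dp4a_ref(int a, int b, int c) {
        int8_t la[4], lb[4];
        std::memcpy(la, &a, 4);
        std::memcpy(lb, &b, 4);
        for (int k = 0; k < 4; ++k) {
            c += (int)la[k] * (int)lb[k];
        }
        return c;
    }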
@@ -6998,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,

  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

- #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
- #else
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
- #endif
  }

  #pragma unroll
@@ -7045,7 +6728,6 @@ static __dpct_inline__ float
  vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

- #ifndef GGML_QKK_64
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;

  int vl[2];
@@ -7087,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
  }

  return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
- #else
-
- #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
- const int8_t * s = bq5_K->scales;
-
- const float d = bq5_K->d;
-
- const float d8_1 = bq8_1[0].ds[0];
- const float d8_2 = bq8_1[1].ds[1];
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * ql = (const int *)bq5_K->qs + (iqs/2);
- const int vl1 = ql[0];
- const int vl2 = ql[4];
-
- const int step = 4 * (iqs/2); // 0, 4, 8, 12
- const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
- const int in = step%8; // 0, 4, 0, 4
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
- const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
- + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
- return d * sumf_d;
-
- #else
- bad_arch();
- #endif // __SYCL_ARCH__ >= VER_4VEC
-
- #endif
  }

  template <int mmq_y>
@@ -7200,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,

  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

- #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
- #endif
  }

  #pragma unroll
@@ -7382,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
  const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
  const uint8_t *kmask_iq2xs) {
- #if QK_K == 256
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

  #if QR2_XXS == 8
@@ -7423,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
  }
  return d * (sumi1 + sumi2);
  #endif
- #else
- assert(false);
- return 0.f;
- #endif
  }

  static __dpct_inline__ float
@@ -7435,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
  const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
  #if DPCT_COMPATIBILITY_TEMP >= \
  MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

  const int ib32 = iqs;
@@ -7473,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
  assert(false);
  return 0.f;
  #endif
- #else
- assert(false);
- return 0.f;
- #endif
  }

  static __dpct_inline__ float
  vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
- #if QK_K == 256
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

  const int ib32 = iqs;
@@ -7526,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
  }
  const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
  return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
- #else
- assert(false);
- #endif
  }

  static __dpct_inline__ float
@@ -7537,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
  const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
  #if DPCT_COMPATIBILITY_TEMP >= \
  MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

  const int ib32 = iqs;
@@ -7565,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
  assert(false);
  return 0.f;
  #endif
- #else
- assert(false);
- return 0.f;
- #endif
  }

  static __dpct_inline__ float
  vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
  const uint32_t *iq3s_grid) {
- #if QK_K == 256
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

  const int ib32 = iqs;
@@ -7604,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
  (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
  bq8_1[ib32].ds[0];
  return d * sumi;
- #else
- assert(false);
- #endif
  }

  static __dpct_inline__ float
  vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
  const uint32_t *iq1s_grid_gpu) {
- #if QK_K == 256
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

  const int ib32 = iqs;
@@ -7632,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
  const float d = d1q * bq8_1[ib32].ds[0];
  const float m = d1q * bq8_1[ib32].ds[1];
  return d * sumi + m * delta;
- #else
- assert(false);
- #endif
  }

  static __dpct_inline__ float
  vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
- #if QK_K == 256
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

  const int ib32 = iqs;
@@ -7665,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
  const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
  return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
- #else
- assert(false);
- #endif
  }

  static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7715,7 +7322,6 @@ static __dpct_inline__ float
  vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

- #if QK_K == 256
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
  const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

@@ -7733,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
  sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
  }
  return d * (sumi1 + sumi2);
- #else
- assert(false);
- #endif
  }

  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -9316,32 +8919,6 @@ static void rope_glm_f32(
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
  }

- static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
- const int n_heads_log2_floor, const float m0, const float m1,
- const sycl::nd_item<3> &item_ct1) {
- const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- if (col >= ncols) {
- return;
- }
-
- const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
- item_ct1.get_local_id(1);
- const int i = row*ncols + col;
-
- const int k = row/k_rows;
-
- float m_k;
- if (k < n_heads_log2_floor) {
- m_k = dpct::pow(m0, k + 1);
- } else {
- m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
- }
-
- dst[i] = col * m_k + x[i];
- }
-
  static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
  const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(1);
@@ -9443,7 +9020,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con


  template <bool vals_smem, int ncols_template, int block_size_template>
- static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
  const int nrows_y, const float scale, const float max_bias, const float m0,
  const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
  const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9457,7 +9034,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
  const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
  const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

- float slope = 0.0f;
+ float slope = 1.0f;

  // ALiBi
  if (max_bias > 0.0f) {
@@ -9482,7 +9059,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
  const int ix = rowx*ncols + col;
  const int iy = rowy*ncols + col;

- const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+ const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

  vals[col] = val;
  max_val = sycl::max(max_val, val);
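Taken together with the removal of the standalone alibi_f32 kernel above, these soft_max changes fold ALiBi into the softmax itself: the separate positions tensor (pos) is gone, the default slope is now the identity value 1.0f, and when max_bias > 0 the per-head slope multiplies the mask, which carries the position information. A sketch of the per-head slope the kernel computes (assuming h is the head index; m0 and m1 are derived from max_bias exactly as in the deleted ggml_sycl_op_alibi code further down):

    #include <cmath>
    #include <cstdint>

    // ALiBi slope for head h, with n_head_log2 the largest power of two <= n_head,
    // m0 = 2^(-max_bias / n_head_log2) and m1 = 2^(-max_bias / (2 * n_head_log2)).
    static inline float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exph = h < n_head_log2 ? (int)h + 1 : 2 * (int)(h - n_head_log2) + 1;
        return (float)std::pow(base, exph);
    }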
@@ -10112,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
  });
  }

- static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
- const int ne01, const int ne02,
- const int scale_factor, dpct::queue_ptr stream) {
- int ne0 = (ne00 * scale_factor);
- int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
- sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+ static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+ const int nb02, const int nb03, const int ne10, const int ne11,
+ const int ne12, const int ne13, const float sf0, const float sf1,
+ const float sf2, const float sf3, dpct::queue_ptr stream) {
+ int dst_size = ne10 * ne11 * ne12 * ne13;
+ int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+ sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
  stream->parallel_for(
- sycl::nd_range<3>(gridDim *
- sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
- sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
- [=](sycl::nd_item<3> item_ct1) {
- upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+ sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+ [=](sycl::nd_item<1> item_ct1) {
+ upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
  });
  }

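The matching launcher change above switches from a 3D grid shaped after the tensor to a flat 1D nd_range: the destination element count is rounded up to a whole number of SYCL_UPSCALE_BLOCK_SIZE work-groups, and the `index >= ne10 * ne11 * ne12 * ne13` guard in the kernel discards the padding work-items.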
@@ -10225,7 +9801,6 @@ template <typename dst_t>
  static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
@@ -10237,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
  dequantize_block_q2_K(vx, y, item_ct1);
  });
  }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q2_K(vx, y, item_ct1);
- });
- }
-
- #endif
  }

  template <typename dst_t>
  static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
@@ -10269,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
  dequantize_block_q3_K(vx, y, item_ct1);
  });
  }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q3_K(vx, y, item_ct1);
- });
- }
- #endif
  }

  template <typename dst_t>
@@ -10342,7 +9889,6 @@ template <typename dst_t>
  static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
@@ -10354,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
  dequantize_block_q5_K(vx, y, item_ct1);
  });
  }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q5_K(vx, y, item_ct1);
- });
- }
-
- #endif
  }

  template <typename dst_t>
  static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
@@ -10386,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
  dequantize_block_q6_K(vx, y, item_ct1);
  });
  }
- #else
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
- sycl::range<3>(1, 1, 32),
- sycl::range<3>(1, 1, 32)),
- [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_q6_K(vx, y, item_ct1);
- });
- }
-
- #endif
  }

  template <typename dst_t>
@@ -10551,9 +10068,6 @@ template <typename dst_t>
  static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = (k + QK_K - 1) / QK_K;
- #if QK_K == 64
- dequantize_row_iq4_nl_sycl(vx, y, k, stream);
- #else
  {
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
@@ -10568,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
  });
  });
  }
- #endif
  }


@@ -12073,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
  const int nrows_y, const int nrows_dst,
  dpct::queue_ptr stream) try {

- #if QK_K == 256
-
  int id;
  SYCL_CHECK(
  CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12189,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
  });
  }
  }
- #endif
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -12964,20 +12474,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
  });
  }

- static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
- const int nrows, const int k_rows,
- const int n_heads_log2_floor, const float m0,
- const float m1, dpct::queue_ptr stream) {
- const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
- const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
- const sycl::range<3> block_nums(1, nrows, num_blocks_x);
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) {
- alibi_f32(x, dst, ncols, k_rows,
- n_heads_log2_floor, m0, m1, item_ct1);
- });
- }
-
  static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
  const int nrows, dpct::queue_ptr stream) {
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13058,7 +12554,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
  }

  template <bool vals_smem, int ncols_template, int block_size_template>
- static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
  const int nrows_y, const float scale, const float max_bias, const float m0,
  const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
  const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13068,7 +12564,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
- soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+ soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
  nrows_y, scale, max_bias, m0,
  m1, n_head_log2, item_ct1,
  local_buf_acc.get_pointer());
@@ -13076,7 +12572,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
  });
  }

- static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+ static void soft_max_f32_sycl(const float * x, const float * mask,
  float * dst, const int ncols_x, const int nrows_x,
  const int nrows_y, const float scale, const float max_bias,
  dpct::queue_ptr stream) {
@@ -13098,60 +12594,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
  const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
  if (n_local_scratch*sizeof(float) < local_mem_size) {
  if (ncols_x > max_block_size) {
- soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  return;
  }
  switch (ncols_x) {
  case 32:
- soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 64:
- soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 128:
- soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 256:
- soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 512:
- soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 1024:
- soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 2048:
- soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  case 4096:
- soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  default:
- soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, n_local_scratch, stream);
  break;
  }
  } else {
- soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+ soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
  max_bias, m0, m1, n_head_log2, block_nums,
  block_dims, WARP_SIZE, stream);
  }
@@ -14026,11 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,

  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
- GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

- const int scale_factor = dst->op_params[0];
+ const float sf0 = (float)dst->ne[0]/src0->ne[0];
+ const float sf1 = (float)dst->ne[1]/src0->ne[1];
+ const float sf2 = (float)dst->ne[2]/src0->ne[2];
+ const float sf3 = (float)dst->ne[3]/src0->ne[3];

- upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+ upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+ main_stream);

  (void) src1;
  (void) dst;
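With the `ne[3] == 1` assertion dropped, the op now handles full 4D tensors, and each scale factor is derived from the extent ratio rather than a single integer in op_params; for example, upscaling a 16x16x3x1 source to a 32x48x3x1 destination yields sf = {2.0f, 3.0f, 1.0f, 1.0f}, so non-uniform and fractional scaling fall out of the same code path.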
@@ -14486,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
  ggml_tensor *dst, const float *src0_dd,
  const float *src1_dd, float *dst_dd,
  const dpct::queue_ptr &main_stream) {
+ #pragma message("TODO: implement phi3 frequency factors support")
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+ GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");

  GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
  GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -14562,36 +14065,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
  (void) src1_dd;
  }

- inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
- ggml_tensor *dst, const float *src0_dd,
- const float *src1_dd, float *dst_dd,
- const dpct::queue_ptr &main_stream) {
-
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
- GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
- const int64_t nrows = ggml_nrows(src0);
-
- //const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_head = ((int32_t *) dst->op_params)[1];
- float max_bias;
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
- //GGML_ASSERT(ne01 + n_past == ne00);
- GGML_ASSERT(n_head == ne02);
-
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
- alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
- (void) src1;
- (void) src1_dd;
- }
-
  static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
  const ggml_tensor *src1, ggml_tensor *dst,
  const float *src0_dd, const float *src1_dd,
@@ -14746,12 +14219,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const ggml_tensor * src2 = dst->src[2];
-
- #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+ #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
  GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
- GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional

  const int64_t ne00 = src0->ne[0];
  const int64_t nrows_x = ggml_nrows(src0);
@@ -14763,25 +14233,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
  memcpy(&scale, dst->op_params + 0, sizeof(float));
  memcpy(&max_bias, dst->op_params + 1, sizeof(float));

- // positions tensor
- float * src2_dd = nullptr;
- sycl_pool_alloc<float> src2_f;
-
- const bool use_src2 = src2 != nullptr;
-
- if (use_src2) {
- const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
- if (src2_on_device) {
- ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
- src2_dd = (float *) src2_extra->data_device[g_main_device];
- } else {
- src2_dd = src2_f.alloc(ggml_nelements(src2));
- SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
- }
- }
-
- soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+ soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
  nrows_x, nrows_y, scale, max_bias, main_stream);
  }

@@ -15656,26 +15108,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  const int64_t r2 = ne12/ne02;
  const int64_t r3 = ne13/ne03;

- #if 0
- // use syclGemmEx
- {
- for (int i13 = 0; i13 < ne13; ++i13) {
- for (int i12 = 0; i12 < ne12; ++i12) {
- int i03 = i13 / r3;
- int i02 = i12 / r2;
-
- SYCL_CHECK(
- syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
- beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
- cu_compute_type,
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
- }
- }
- }
- #else
  if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15687,7 +15119,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  nb11 / nb10, nb12 / nb10, beta,
  (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
  ne12 * ne13, cu_compute_type)));
- g_sycl_handles[g_main_device]->wait();
  } else {
  const int ne23 = ne12*ne13;

@@ -15718,7 +15149,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  nb02, nb03, nb12_scaled, nb13_scaled,
  nbd2, nbd3, r2, r3, item_ct1);
  });
- }).wait();
+ });
  }
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
  *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15729,9 +15160,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  dpct::library_data_t::real_half, nb11 / nb10, beta,
  (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
  cu_compute_type)));
- g_sycl_handles[g_main_device]->wait();
  }
- #endif

  if (no_mixed_dtypes) {
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
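Besides deleting the long-dead `#if 0` syclGemmEx block, this hunk drops the two `g_sycl_handles[g_main_device]->wait()` calls and the `.wait()` on the pointer-setup submission, so the batched GEMMs no longer block the host after every submission; ordering between the pointer-setup kernel and the subsequent gemm_batch is presumably left to the queue's dependency handling.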
@@ -16232,10 +15661,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
  ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
  }

- static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
- }
-
  static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
  }
@@ -16612,9 +16037,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
  case GGML_OP_ROPE:
  func = ggml_sycl_rope;
  break;
- case GGML_OP_ALIBI:
- func = ggml_sycl_alibi;
- break;
  case GGML_OP_IM2COL:
  func = ggml_sycl_im2col;
  break;
@@ -17744,7 +17166,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_ROPE:
- case GGML_OP_ALIBI:
  case GGML_OP_IM2COL:
  case GGML_OP_POOL_2D:
  case GGML_OP_SUM_ROWS: