llama_cpp 0.15.1 → 0.15.3
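The diff below covers the vendored llama.cpp SYCL backend. The main themes: the standalone GGML_OP_ALIBI operator is removed (ALiBi is now applied inside soft_max through the mask and a per-head slope), the QK_K == 64 / GGML_QKK_64 build variants are dropped from the k-quant and i-quant kernels, upscale_f32 is generalized from one integer scale factor over 3-D tensors to per-dimension float factors over 4-D tensors, rope gains an assertion against the not-yet-supported phi3 frequency factors, and the batched matmul path drops some eager wait() calls.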

@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
  #define SYCL_SCALE_BLOCK_SIZE 256
  #define SYCL_CLAMP_BLOCK_SIZE 256
  #define SYCL_ROPE_BLOCK_SIZE 256
- #define SYCL_ALIBI_BLOCK_SIZE 32
  #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
  #define SYCL_QUANTIZE_BLOCK_SIZE 256
  #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
  }
  }
 
- static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                         const sycl::nd_item<3> &item_ct1) {
-     int ne0 = ne00 * scale_factor;
-     int nidx = item_ct1.get_local_id(2) +
-                item_ct1.get_group(2) * item_ct1.get_local_range(2);
-     if (nidx >= ne0) {
+ static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                         const int nb02, const int nb03, const int ne10, const int ne11,
+                         const int ne12, const int ne13, const float sf0, const float sf1,
+                         const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+     int index = item_ct1.get_local_id(0) +
+                 item_ct1.get_group(0) * item_ct1.get_local_range(0);
+     if (index >= ne10 * ne11 * ne12 * ne13) {
          return;
      }
      // operation
-     int i00 = nidx / scale_factor;
-     int i01 = item_ct1.get_group(1) / scale_factor;
-     int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-     dst[offset_dst] = x[offset_src];
+     int i10 = index % ne10;
+     int i11 = (index / ne10) % ne11;
+     int i12 = (index / (ne10 * ne11)) % ne12;
+     int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+     int i00 = i10 / sf0;
+     int i01 = i11 / sf1;
+     int i02 = i12 / sf2;
+     int i03 = i13 / sf3;
+
+     dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
  }
 
  static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
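Note on the rewritten `upscale_f32`: the old kernel assumed one integer scale factor and a 3-D launch; the new one flattens the whole 4-D destination into a single index and maps each element back to the source through per-dimension float factors and byte strides. A minimal host-side C++ reference of the same mapping (an illustrative sketch, not the shipped kernel; parameter names mirror the diff):

```cpp
#include <cstdint>

// Host-side reference for the new upscale mapping: decompose the flattened
// destination index into 4-D coordinates, divide by the per-dimension scale
// factors, and read the source element through ggml-style byte strides.
static void upscale_f32_ref(const float *x, float *dst,
                            int nb00, int nb01, int nb02, int nb03,      // source byte strides
                            int ne10, int ne11, int ne12, int ne13,      // destination extents
                            float sf0, float sf1, float sf2, float sf3) {
    const int n = ne10 * ne11 * ne12 * ne13;
    for (int index = 0; index < n; ++index) {
        const int i10 =  index                         % ne10;
        const int i11 = (index / ne10)                 % ne11;
        const int i12 = (index / (ne10 * ne11))        % ne12;
        const int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
        // nearest-neighbor: truncation snaps each coordinate to the source grid
        const int i00 = (int)(i10 / sf0);
        const int i01 = (int)(i11 / sf1);
        const int i02 = (int)(i12 / sf2);
        const int i03 = (int)(i13 / sf3);
        dst[index] = *(const float *)((const char *)x +
                     i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
    }
}
```

Because the division truncates, this is nearest-neighbor sampling; non-integer factors simply repeat some source elements more often than others.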
@@ -4192,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
  const block_q2_K * x = (const block_q2_K *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int n = tid/32;
  const int l = tid - 32*n;
  const int is = 8*n + l/16;
@@ -4206,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
- #else
- const int is = tid/16;  // 0 or 1
- const int il = tid%16;  // 0...15
- const uint8_t q = x[i].qs[il] >> (2*is);
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- float dall = x[i].dm[0];
- float dmin = x[i].dm[1];
- y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
- y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
- #endif
-
  }
 
  template<typename dst_t>
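Note on the q2_K path kept above: each weight is a 2-bit quant scaled by a 4-bit sub-block scale and shifted by a 4-bit min, both premultiplied by the per-block f16 factors `dall`/`dmin`. A scalar sketch of one reconstruction step (illustrative only):

```cpp
#include <cstdint>

// One q2_K reconstruction, matching y[...] = dall*(sc & 0xF)*q2 - dmin*(sc >> 4):
// scale_byte packs the sub-block scale (low nibble) and min (high nibble).
static float dequant_q2(float dall, float dmin, uint8_t scale_byte, uint8_t q, int shift) {
    const int   quant = (q >> shift) & 3;           // 2-bit value in [0, 3]
    const float d     = dall * (scale_byte & 0xF);  // effective scale
    const float m     = dmin * (scale_byte >> 4);   // effective minimum
    return d * quant - m;
}
```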
@@ -4227,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
  const int i = item_ct1.get_group(2);
  const block_q3_K * x = (const block_q3_K *) vx;
 
- #if QK_K == 256
  const int r = item_ct1.get_local_id(2) / 4;
  const int tid = r/2;
  const int is0 = r%2;
@@ -4251,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
  const uint8_t * hm = x[i].hmask;
 
  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
- #else
- const int tid = item_ct1.get_local_id(2);
- const int is = tid/16;  // 0 or 1
- const int il = tid%16;  // 0...15
- const int im = il/8;    // 0...1
- const int in = il%8;    // 0...7
-
- dst_t * y = yy + i*QK_K + 16*is + il;
-
- const uint8_t q = x[i].qs[il] >> (2*is);
- const uint8_t h = x[i].hmask[in] >> (2*is + im);
- const float d = (float)x[i].d;
-
- if (is == 0) {
-     y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-     y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- } else {
-     y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-     y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
- }
- #endif
-
  }
 
- #if QK_K == 256
  static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
      if (j < 4) {
          d = q[j] & 63; m = q[j + 4] & 63;
@@ -4284,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
          m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
      }
  }
- #endif
 
  template<typename dst_t>
  static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4293,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 
  const int i = item_ct1.get_group(2);
 
- #if QK_K == 256
  // assume 32 threads
  const int tid = item_ct1.get_local_id(2);
  const int il = tid/8;
@@ -4317,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
      y[l + 0] = d1 * (q[l] & 0xF) - m1;
      y[l +32] = d2 * (q[l] >> 4) - m2;
  }
- #else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t * q = x[i].qs;
- dst_t * y = yy + i*QK_K;
- const float d = (float)x[i].dm[0];
- const float m = (float)x[i].dm[1];
- y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
- y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
- #endif
  }
 
  template<typename dst_t>
@@ -4335,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 
  const int i = item_ct1.get_group(2);
 
- #if QK_K == 256
  // assume 64 threads - this is very slightly better than the one below
  const int tid = item_ct1.get_local_id(2);
  const int il = tid/16; // il is in 0...3
@@ -4362,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
  hm <<= 1;
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
- #else
- const int tid = item_ct1.get_local_id(2);
- const uint8_t q = x[i].qs[tid];
- const int im = tid/8;  // 0...3
- const int in = tid%8;  // 0...7
- const int is = tid/16; // 0 or 1
- const uint8_t h = x[i].qh[in] >> im;
- const float d = x[i].d;
- dst_t * y = yy + i*QK_K + tid;
- y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
- y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
- #endif
  }
 
  template<typename dst_t>
@@ -4382,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
  const block_q6_K * x = (const block_q6_K *) vx;
 
  const int i = item_ct1.get_group(2);
- #if QK_K == 256
 
  // assume 64 threads - this is very slightly better than the one below
  const int tid = item_ct1.get_local_id(2);
@@ -4402,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
- #else
-
- // assume 32 threads
- const int tid = item_ct1.get_local_id(2);
- const int ip = tid/16;      // 0 or 1
- const int il = tid - 16*ip; // 0...15
-
- dst_t * y = yy + i*QK_K + 16*ip + il;
-
- const float d = x[i].d;
-
- const uint8_t ql = x[i].ql[16*ip + il];
- const uint8_t qh = x[i].qh[il] >> (2*ip);
- const int8_t * sc = x[i].scales;
-
- y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
- y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
- #endif
  }
 
  template<typename dst_t>
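Note on the q6_K path kept above: each 6-bit value is assembled from four low bits in `ql` and two high bits in `qh`, recentred by 32, then scaled by a signed 8-bit sub-block scale and the per-block f16 `d`. A scalar sketch:

```cpp
#include <cstdint>

// One q6_K reconstruction, matching
// y[...] = d * sc * ((int8_t)((ql & 0xF) | ((qh & 3) << 4)) - 32).
static float dequant_q6(float d, int8_t sc, uint8_t ql_nibble, uint8_t qh_bits) {
    const int q = (int)(int8_t)((ql_nibble & 0xF) | ((qh_bits & 3) << 4)) - 32; // [-32, 31]
    return d * sc * q;
}
```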
@@ -4433,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
  const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4444,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
  const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
- #else
- assert(false);
- #endif
-
  }
 
  template<typename dst_t>
@@ -4461,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
  const block_iq2_xs * x = (const block_iq2_xs *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4470,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
  for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- assert(false);
- #endif
-
  }
 
  template <typename dst_t>
@@ -4485,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq2_s * x = (const block_iq2_s *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4493,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
  const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
  #pragma unroll
- for (int j = 0; j < 8; ++j)
+ for (int j = 0; j < 8; ++j) {
      y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
- #else
- assert(false);
-
- #endif
-
+ }
  }
 
  template<typename dst_t>
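Note on the iq2_s hunk above: besides dropping the dead `QK_K != 256` branch, it braces the body of the unrolled loop, which previously relied on a single-statement body running into the removed `#else`. The sign-application step itself, as a standalone scalar sketch (`1u << j` is assumed equivalent to the `kmask_iq2xs[j]` bit table):

```cpp
#include <cstdint>

// Negate each of 8 grid values whose bit is set in the packed sign byte.
static void apply_signs_ref(float d, const uint8_t grid[8], uint8_t signs, float y[8]) {
    for (int j = 0; j < 8; ++j) {
        const uint8_t bit = (uint8_t)(1u << j);  // stand-in for kmask_iq2xs[j]
        y[j] = d * grid[j] * ((signs & bit) ? -1.f : 1.f);
    }
}
```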
@@ -4513,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
  const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4528,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
      y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
      y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- assert(false);
- #endif
-
  }
 
  template <typename dst_t>
@@ -4544,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq3_s * x = (const block_iq3_s *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4558,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
      y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
      y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
  }
- #else
- assert(false);
- #endif
-
  }
 
  template <typename dst_t>
@@ -4574,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq1_s * x = (const block_iq1_s *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4588,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
  for (int j = 0; j < 8; ++j) {
      y[j] = d * (q[j] + delta);
  }
- #else
- assert(false);
- #endif
-
  }
 
  template <typename dst_t>
@@ -4604,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
  const block_iq1_m * x = (const block_iq1_m *) vx;
 
  const int tid = item_ct1.get_local_id(2);
- #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4622,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
  for (int j = 0; j < 8; ++j) {
      y[j] = d * (q[j] + delta);
  }
- #else
- assert(false);
- #endif
-
  }
 
  template <typename dst_t>
@@ -4699,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
 
  float tmp = 0; // partial sum for thread in warp
 
- #if QK_K == 256
  const int tid =
      item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
  const int ix =
@@ -4750,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
      tmp += dall * sum1 - dmin * sum2;
 
  }
- #else
- const int tid = item_ct1.get_local_id(2) /
-                 (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2) %
-                (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION;
-
- uint32_t uaux[2];
- const uint8_t * d = (const uint8_t *)uaux;
-
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-     const float * y = yy + i * QK_K + offset;
-     const uint8_t * q = x[i].qs + offset;
-     const uint32_t * s = (const uint32_t *)x[i].scales;
-
-     uaux[0] = s[0] & 0x0f0f0f0f;
-     uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-     const sycl::float2 dall =
-         x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-     float sum1 = 0, sum2 = 0;
-     for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-         const uint8_t ql = q[l];
-         sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-               + y[l+16] * d[1] * ((ql >> 2) & 3)
-               + y[l+32] * d[2] * ((ql >> 4) & 3)
-               + y[l+48] * d[3] * ((ql >> 6) & 3);
-         sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-     }
-     tmp += dall.x() * sum1 - dall.y() * sum2;
- }
-
- #endif
 
  // sum up partial sums and write back result
  #pragma unroll
@@ -4823,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
 
  float tmp = 0; // partial sum for thread in warp
 
- #if QK_K == 256
-
  const uint16_t kmask1 = 0x0303;
  const uint16_t kmask2 = 0x0f0f;
 
@@ -4877,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
      tmp += d * sum;
 
  }
- #else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
- const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
- const int in = offset/8; // 0 or 1
- const int im = offset%8; // 0...7
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-     const float * y = yy + i * QK_K + offset;
-     const uint8_t * q = x[i].qs + offset;
-     const uint8_t * s = x[i].scales;
-
-     const float dall = (float)x[i].d;
-
-     float sum = 0;
-     for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-         const uint8_t hl = x[i].hmask[im+l] >> in;
-         const uint8_t ql = q[l];
-         sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-              + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-              + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-              + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-     }
-     tmp += sum;
- }
- #endif
 
  // sum up partial sums and write back result
  #pragma unroll
@@ -4939,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 
  const block_q4_K * x = (const block_q4_K *)vx + ib0;
 
- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -5028,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
  #endif
 
  }
- #else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- float tmp = 0;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-     const uint8_t * q = x[i].qs + step;
-     const float * y = yy + i*QK_K + step;
-     const uint16_t * a = (const uint16_t *)x[i].scales;
-     aux16[0] = a[0] & 0x0f0f;
-     aux16[1] = (a[0] >> 4) & 0x0f0f;
-     const float d = (float)x[i].dm[0];
-     const float m = (float)x[i].dm[1];
-     float sum = 0.f;
-     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-              + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-              + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
-              + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
-     }
-     tmp += sum;
- }
-
- #endif
 
  // sum up partial sums and write back result
  #pragma unroll
@@ -5092,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
 
  float tmp = 0; // partial sum for thread in warp
 
- #if QK_K == 256
  const uint16_t kmask1 = 0x3f3f;
  const uint16_t kmask2 = 0x0f0f;
  const uint16_t kmask3 = 0xc0c0;
@@ -5169,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
             dmin * smin;
  }
 
- #else
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
- const int step = tid * K_QUANTS_PER_ITERATION;
- const int im = step/8;
- const int in = step%8;
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-     const uint8_t * q = x[i].qs + step;
-     const int8_t * s = x[i].scales;
-     const float * y = yy + i*QK_K + step;
-     const float d = x[i].d;
-     float sum = 0.f;
-     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-         const uint8_t h = x[i].qh[in+j] >> im;
-         sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-              + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-              + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
-              + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
-     }
-     tmp += sum;
- }
- #endif
-
  // sum up partial sums and write back result
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5219,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
  const block_q6_K * x = (const block_q6_K *)vx + ib0;
 
- #if QK_K == 256
-
  const int tid =
      item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
  const int ix =
@@ -5277,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
  }
 
- #else
-
- const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
- const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
-
- const int step = tid * K_QUANTS_PER_ITERATION;
-
- float tmp = 0; // partial sum for thread in warp
-
- for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-     const float * y = yy + i * QK_K + step;
-     const uint8_t * ql = x[i].ql + step;
-     const uint8_t * qh = x[i].qh + step;
-     const int8_t * s = x[i].scales;
-
-     const float d = x[i+0].d;
-
-     float sum = 0;
-     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-         sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-              + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-              + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
-              + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
-     }
-     tmp += sum;
-
- }
-
- #endif
-
  // sum up partial sums and write back result
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -6852,7 +6586,6 @@ static __dpct_inline__ float
  vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
- #ifndef GGML_QKK_64
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
  int v[2];
@@ -6894,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
  }
 
  return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
- #else
-
- #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
- float sumf_d = 0.0f;
- float sumf_m = 0.0f;
-
- uint16_t aux16[2];
- const uint8_t * s = (const uint8_t *)aux16;
-
- const uint16_t * a = (const uint16_t *)bq4_K->scales;
- aux16[0] = a[0] & 0x0f0f;
- aux16[1] = (a[0] >> 4) & 0x0f0f;
-
- const float dall = bq4_K->dm[0];
- const float dmin = bq4_K->dm[1];
-
- const float d8_1 = bq8_1[0].ds[0];
- const float d8_2 = bq8_1[1].ds[1];
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * q4 = (const int *)bq4_K->qs + (iqs/2);
- const int v1 = q4[0];
- const int v2 = q4[4];
-
- const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
- const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
- const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
- const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
- sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
- sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
- return dall * sumf_d - dmin * sumf_m;
-
- #else
- bad_arch();
- #endif // __SYCL_ARCH__ >= VER_4VEC
-
- #endif
  }
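Note: the vec_dot paths above lean on `dpct::dp4a` for 4-way byte dot products with accumulate. A scalar reference of that primitive, assuming signed 8-bit lanes as it is used here (a sketch, not the dpct implementation):

```cpp
#include <cstdint>

// dp4a(a, b, c): treat a and b as four signed 8-bit lanes, multiply pairwise,
// and add the four products to the accumulator c.
static int dp4a_ref(int a, int b, int c) {
    int sum = c;
    for (int k = 0; k < 4; ++k) {
        const int8_t ak = (int8_t)((a >> (8 * k)) & 0xff);
        const int8_t bk = (int8_t)((b >> (8 * k)) & 0xff);
        sum += (int)ak * (int)bk;
    }
    return sum;
}
```

For example, `dp4a_ref(0x01010101, ui1, 0)` from the removed branch simply sums the four signed bytes of `ui1`.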
 
  template <int mmq_y>
@@ -6998,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
- #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
- #else
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
- #endif
  }
 
  #pragma unroll
@@ -7045,7 +6728,6 @@ static __dpct_inline__ float
  vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
- #ifndef GGML_QKK_64
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
  int vl[2];
@@ -7087,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
  }
 
  return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
- #else
-
- #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
- const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
- const int8_t * s = bq5_K->scales;
-
- const float d = bq5_K->d;
-
- const float d8_1 = bq8_1[0].ds[0];
- const float d8_2 = bq8_1[1].ds[1];
-
- const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
- const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
- const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
- const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
- const int * ql = (const int *)bq5_K->qs + (iqs/2);
- const int vl1 = ql[0];
- const int vl2 = ql[4];
-
- const int step = 4 * (iqs/2); // 0, 4, 8, 12
- const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
- const int in = step%8; // 0, 4, 0, 4
- const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
- const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
- const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
- const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
- const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
- const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                    + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
- return d * sumf_d;
-
- #else
- bad_arch();
- #endif // __SYCL_ARCH__ >= VER_4VEC
-
- #endif
  }
 
  template <int mmq_y>
@@ -7200,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
 
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
- #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
- #endif
  }
 
  #pragma unroll
@@ -7382,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                      const uint8_t *kmask_iq2xs) {
- #if QK_K == 256
  const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
  #if QR2_XXS == 8
@@ -7423,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
  }
  return d * (sumi1 + sumi2);
  #endif
- #else
- assert(false);
- return 0.f;
- #endif
  }
 
  static __dpct_inline__ float
@@ -7435,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                     const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
  #if DPCT_COMPATIBILITY_TEMP >= \
      MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
 
  const int ib32 = iqs;
@@ -7473,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
  assert(false);
  return 0.f;
  #endif
- #else
- assert(false);
- return 0.f;
- #endif
  }
 
  static __dpct_inline__ float
  vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
- #if QK_K == 256
  const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
  const int ib32 = iqs;
@@ -7526,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
  }
  const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
  return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
- #else
- assert(false);
- #endif
  }
 
  static __dpct_inline__ float
@@ -7537,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                      const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
  #if DPCT_COMPATIBILITY_TEMP >= \
      MIN_CC_DP4A // lowest compute capability for integer intrinsics
- #if QK_K == 256
  const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
 
  const int ib32 = iqs;
@@ -7565,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
  assert(false);
  return 0.f;
  #endif
- #else
- assert(false);
- return 0.f;
- #endif
  }
 
  static __dpct_inline__ float
  vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                     const uint32_t *iq3s_grid) {
- #if QK_K == 256
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
 
  const int ib32 = iqs;
@@ -7604,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
      (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
      bq8_1[ib32].ds[0];
  return d * sumi;
- #else
- assert(false);
- #endif
  }
 
  static __dpct_inline__ float
  vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                     const uint32_t *iq1s_grid_gpu) {
- #if QK_K == 256
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
 
  const int ib32 = iqs;
@@ -7632,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
  const float d = d1q * bq8_1[ib32].ds[0];
  const float m = d1q * bq8_1[ib32].ds[1];
  return d * sumi + m * delta;
- #else
- assert(false);
- #endif
  }
 
  static __dpct_inline__ float
  vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
- #if QK_K == 256
  const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
 
  const int ib32 = iqs;
@@ -7665,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
  scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
  const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
  return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
- #else
- assert(false);
- #endif
  }
 
  static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7715,7 +7322,6 @@ static __dpct_inline__ float
  vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
- #if QK_K == 256
  const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
  const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
 
@@ -7733,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
      sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
  }
  return d * (sumi1 + sumi2);
- #else
- assert(false);
- #endif
  }
 
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -9316,32 +8919,6 @@ static void rope_glm_f32(
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
  }
 
- static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                       const int n_heads_log2_floor, const float m0, const float m1,
-                       const sycl::nd_item<3> &item_ct1) {
-     const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                     item_ct1.get_local_id(2);
-
-     if (col >= ncols) {
-         return;
-     }
-
-     const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                     item_ct1.get_local_id(1);
-     const int i = row*ncols + col;
-
-     const int k = row/k_rows;
-
-     float m_k;
-     if (k < n_heads_log2_floor) {
-         m_k = dpct::pow(m0, k + 1);
-     } else {
-         m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-     }
-
-     dst[i] = col * m_k + x[i];
- }
-
  static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                             const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(1);
@@ -9443,7 +9020,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
 
 
  template <bool vals_smem, int ncols_template, int block_size_template>
- static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                           const int nrows_y, const float scale, const float max_bias, const float m0,
                           const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
  const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9457,7 +9034,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
  const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
  const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
 
- float slope = 0.0f;
+ float slope = 1.0f;
 
  // ALiBi
  if (max_bias > 0.0f) {
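Note: with the standalone ALiBi kernel removed (see the `alibi_f32` deletion above), soft_max now applies the bias itself through this per-head slope; the new default of `1.0f` leaves a plain mask untouched when `max_bias == 0`. As a standalone sketch of the slope schedule, mirroring the removed `ggml_sycl_op_alibi` math (note that, despite the name, `n_head_log2` is the floored power-of-two head count, not a logarithm):

```cpp
#include <cmath>
#include <cstdint>

// Per-head ALiBi slope: heads below the nearest power of two use base m0,
// the remaining heads interleave with base m1.
static float alibi_slope(float max_bias, uint32_t n_head_log2, uint32_t h) {
    const float m0 = powf(2.0f, -max_bias          / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, (float)(h + 1))
                           : powf(m1, (float)(2 * (h - n_head_log2) + 1));
}
```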
@@ -9482,7 +9059,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
  const int ix = rowx*ncols + col;
  const int iy = rowy*ncols + col;
 
- const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+ const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
 
  vals[col] = val;
  max_val = sycl::max(max_val, val);
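Note: taken together with the slope change, each row is now transformed as `x*scale + slope*mask` before normalization; the separate `pos` tensor is gone. A scalar reference of the per-row computation (a sketch of the math, not the work-group implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Reference soft_max row: scale the logits, add the slope-weighted mask,
// then normalize with the usual max-subtraction for numerical stability.
static std::vector<float> soft_max_row_ref(const std::vector<float> &x,
                                           const float *mask, float scale, float slope) {
    std::vector<float> v(x.size());
    float max_val = -INFINITY;
    for (size_t i = 0; i < x.size(); ++i) {
        v[i] = x[i] * scale + (mask ? slope * mask[i] : 0.0f);
        max_val = std::max(max_val, v[i]);
    }
    float sum = 0.0f;
    for (float &f : v) { f = expf(f - max_val); sum += f; }
    for (float &f : v) { f /= sum; }
    return v;
}
```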
@@ -10112,18 +9689,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
      });
  }
 
- static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
-                              const int ne01, const int ne02,
-                              const int scale_factor, dpct::queue_ptr stream) {
-     int ne0 = (ne00 * scale_factor);
-     int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-     sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+ static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                              const int nb02, const int nb03, const int ne10, const int ne11,
+                              const int ne12, const int ne13, const float sf0, const float sf1,
+                              const float sf2, const float sf3, dpct::queue_ptr stream) {
+     int dst_size = ne10 * ne11 * ne12 * ne13;
+     int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+     sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
      stream->parallel_for(
-         sycl::nd_range<3>(gridDim *
-                           sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
-                           sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
-         [=](sycl::nd_item<3> item_ct1) {
-             upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+         sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+         [=](sycl::nd_item<1> item_ct1) {
+             upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
          });
  }
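Note: the launcher above replaces the old 3-D grid with a flat 1-D `nd_range` sized by ceil-division. The same pattern in isolation (an illustrative sketch; the 256 work-group size stands in for `SYCL_UPSCALE_BLOCK_SIZE`, and `data` is assumed to be USM device memory):

```cpp
#include <sycl/sycl.hpp>

static void launch_1d_sketch(sycl::queue &q, float *data, int n_elements) {
    constexpr int BLOCK = 256;                                 // stand-in for SYCL_UPSCALE_BLOCK_SIZE
    const int num_blocks = (n_elements + BLOCK - 1) / BLOCK;   // ceil(n / BLOCK)
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK), sycl::range<1>(BLOCK)),
        [=](sycl::nd_item<1> it) {
            const int i = (int)it.get_global_id(0);
            if (i >= n_elements) {
                return;  // tail guard, as in upscale_f32
            }
            data[i] = 0.0f;  // per-element work goes here
        });
}
```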
 
@@ -10225,7 +9801,6 @@ template <typename dst_t>
  static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
      dpct::has_capability_or_fail(stream->get_device(),
                                   {sycl::aspect::fp16});
@@ -10237,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                            dequantize_block_q2_K(vx, y, item_ct1);
                        });
  }
- #else
- {
-     dpct::has_capability_or_fail(stream->get_device(),
-                                  {sycl::aspect::fp16});
-
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                sycl::range<3>(1, 1, 32),
-                                            sycl::range<3>(1, 1, 32)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              dequantize_block_q2_K(vx, y, item_ct1);
-                          });
- }
-
- #endif
  }
 
  template <typename dst_t>
  static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
      dpct::has_capability_or_fail(stream->get_device(),
                                   {sycl::aspect::fp16});
@@ -10269,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                            dequantize_block_q3_K(vx, y, item_ct1);
                        });
  }
- #else
- {
-     dpct::has_capability_or_fail(stream->get_device(),
-                                  {sycl::aspect::fp16});
-
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                sycl::range<3>(1, 1, 32),
-                                            sycl::range<3>(1, 1, 32)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              dequantize_block_q3_K(vx, y, item_ct1);
-                          });
- }
- #endif
  }
 
  template <typename dst_t>
@@ -10342,7 +9889,6 @@ template <typename dst_t>
  static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
      dpct::has_capability_or_fail(stream->get_device(),
                                   {sycl::aspect::fp16});
@@ -10354,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                            dequantize_block_q5_K(vx, y, item_ct1);
                        });
  }
- #else
- {
-     dpct::has_capability_or_fail(stream->get_device(),
-                                  {sycl::aspect::fp16});
-
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                sycl::range<3>(1, 1, 32),
-                                            sycl::range<3>(1, 1, 32)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              dequantize_block_q5_K(vx, y, item_ct1);
-                          });
- }
-
- #endif
  }
 
  template <typename dst_t>
  static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
  const int nb = k / QK_K;
- #if QK_K == 256
  {
      dpct::has_capability_or_fail(stream->get_device(),
                                   {sycl::aspect::fp16});
@@ -10386,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                            dequantize_block_q6_K(vx, y, item_ct1);
                        });
  }
- #else
- {
-     dpct::has_capability_or_fail(stream->get_device(),
-                                  {sycl::aspect::fp16});
-
-     stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                sycl::range<3>(1, 1, 32),
-                                            sycl::range<3>(1, 1, 32)),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              dequantize_block_q6_K(vx, y, item_ct1);
-                          });
- }
-
- #endif
  }
 
  template <typename dst_t>
@@ -10551,9 +10068,6 @@ template <typename dst_t>
  static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                         dpct::queue_ptr stream) {
  const int nb = (k + QK_K - 1) / QK_K;
- #if QK_K == 64
- dequantize_row_iq4_nl_sycl(vx, y, k, stream);
- #else
  {
      dpct::has_capability_or_fail(stream->get_device(),
                                   {sycl::aspect::fp16});
@@ -10568,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
          });
      });
  }
- #endif
  }
 
 
@@ -12073,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                          const int nrows_y, const int nrows_dst,
                                          dpct::queue_ptr stream) try {
 
- #if QK_K == 256
-
  int id;
  SYCL_CHECK(
      CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12189,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
          });
      }
  }
- #endif
  }
  catch (sycl::exception const &exc) {
      std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -12964,20 +12474,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
      });
  }
 
- static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
-                            const int nrows, const int k_rows,
-                            const int n_heads_log2_floor, const float m0,
-                            const float m1, dpct::queue_ptr stream) {
-     const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
-     const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
-     const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              alibi_f32(x, dst, ncols, k_rows,
-                                        n_heads_log2_floor, m0, m1, item_ct1);
-                          });
- }
-
  static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                                const int nrows, dpct::queue_ptr stream) {
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13058,7 +12554,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
  }
 
  template <bool vals_smem, int ncols_template, int block_size_template>
- static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                     const int nrows_y, const float scale, const float max_bias, const float m0,
                                     const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                     const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13068,7 +12564,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
      cgh.parallel_for(
          sycl::nd_range<3>(block_nums * block_dims, block_dims),
          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-             soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+             soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                           nrows_y, scale, max_bias, m0,
                                                                           m1, n_head_log2, item_ct1,
                                                                           local_buf_acc.get_pointer());
@@ -13076,7 +12572,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
      });
  }
 
- static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+ static void soft_max_f32_sycl(const float * x, const float * mask,
                                float * dst, const int ncols_x, const int nrows_x,
                                const int nrows_y, const float scale, const float max_bias,
                                dpct::queue_ptr stream) {
@@ -13098,60 +12594,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
  const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
  if (n_local_scratch*sizeof(float) < local_mem_size) {
      if (ncols_x > max_block_size) {
-         soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+         soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                             max_bias, m0, m1, n_head_log2, block_nums,
                                             block_dims, n_local_scratch, stream);
          return;
      }
      switch (ncols_x) {
          case 32:
-             soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                   max_bias, m0, m1, n_head_log2, block_nums,
                                                   block_dims, n_local_scratch, stream);
              break;
          case 64:
-             soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                   max_bias, m0, m1, n_head_log2, block_nums,
                                                   block_dims, n_local_scratch, stream);
              break;
          case 128:
-             soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
              break;
          case 256:
-             soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
              break;
          case 512:
-             soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
              break;
          case 1024:
-             soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
              break;
          case 2048:
-             soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
              break;
          case 4096:
-             soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
              break;
          default:
-             soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                 max_bias, m0, m1, n_head_log2, block_nums,
                                                 block_dims, n_local_scratch, stream);
              break;
      }
  } else {
-     soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+     soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                          max_bias, m0, m1, n_head_log2, block_nums,
                                          block_dims, WARP_SIZE, stream);
  }
@@ -14026,11 +13522,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
 
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
- GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
- const int scale_factor = dst->op_params[0];
+ const float sf0 = (float)dst->ne[0]/src0->ne[0];
+ const float sf1 = (float)dst->ne[1]/src0->ne[1];
+ const float sf2 = (float)dst->ne[2]/src0->ne[2];
+ const float sf3 = (float)dst->ne[3]/src0->ne[3];
 
- upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+ upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                  dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                  main_stream);
 
  (void) src1;
  (void) dst;
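Note: the host wrapper above now derives one float factor per dimension instead of reading a single integer `scale_factor` from `op_params`, which is what lifts the old "just 3D tensors" restriction. A tiny worked example with hypothetical shapes:

```cpp
#include <cstdio>

int main() {
    // Hypothetical shapes: src0 = 64 x 64 x 3 x 1, dst = 128 x 128 x 3 x 1.
    const float sf0 = 128.0f / 64.0f;  // 2.0 along dim 0
    const float sf1 = 128.0f / 64.0f;  // 2.0 along dim 1
    const float sf2 =   3.0f /  3.0f;  // 1.0 along dim 2
    const float sf3 =   1.0f /  1.0f;  // 1.0 along dim 3
    printf("sf = {%g, %g, %g, %g}\n", sf0, sf1, sf2, sf3);
    return 0;
}
```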
@@ -14486,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
+ #pragma message("TODO: implement phi3 frequency factors support")
+ #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
+ GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
 
  GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
  GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -14562,36 +14065,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
  (void) src1_dd;
  }
 
- inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                                ggml_tensor *dst, const float *src0_dd,
-                                const float *src1_dd, float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-     GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
-     const int64_t nrows = ggml_nrows(src0);
-
-     //const int n_past = ((int32_t *) dst->op_params)[0];
-     const int n_head = ((int32_t *) dst->op_params)[1];
-     float max_bias;
-     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-     //GGML_ASSERT(ne01 + n_past == ne00);
-     GGML_ASSERT(n_head == ne02);
-
-     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-     alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-     (void) src1;
-     (void) src1_dd;
- }
-
  static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_dd, const float *src1_dd,
@@ -14746,12 +14219,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
- const ggml_tensor * src2 = dst->src[2];
-
- #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+ #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
  GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
- GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
 
  const int64_t ne00 = src0->ne[0];
  const int64_t nrows_x = ggml_nrows(src0);
@@ -14763,25 +14233,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
  memcpy(&scale, dst->op_params + 0, sizeof(float));
  memcpy(&max_bias, dst->op_params + 1, sizeof(float));
 
- // positions tensor
- float * src2_dd = nullptr;
- sycl_pool_alloc<float> src2_f;
-
- const bool use_src2 = src2 != nullptr;
-
- if (use_src2) {
-     const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-     if (src2_on_device) {
-         ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-         src2_dd = (float *) src2_extra->data_device[g_main_device];
-     } else {
-         src2_dd = src2_f.alloc(ggml_nelements(src2));
-         SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-     }
- }
-
- soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+ soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                    nrows_x, nrows_y, scale, max_bias, main_stream);
  }
 
@@ -15656,26 +15108,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  const int64_t r2 = ne12/ne02;
  const int64_t r3 = ne13/ne03;
 
- #if 0
- // use syclGemmEx
- {
-     for (int i13 = 0; i13 < ne13; ++i13) {
-         for (int i12 = 0; i12 < ne12; ++i12) {
-             int i03 = i13 / r3;
-             int i02 = i12 / r2;
-
-             SYCL_CHECK(
-                 syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
-                            (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
-                            beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-         }
-     }
- }
- #else
  if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
      // there is no broadcast and src0, src1 are contiguous across dims 2, 3
      SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15687,7 +15119,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
          nb11 / nb10, nb12 / nb10, beta,
          (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
          ne12 * ne13, cu_compute_type)));
-     g_sycl_handles[g_main_device]->wait();
  } else {
      const int ne23 = ne12*ne13;
 
@@ -15718,7 +15149,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                         nb02, nb03, nb12_scaled, nb13_scaled,
                         nbd2, nbd3, r2, r3, item_ct1);
          });
-     }).wait();
+     });
  }
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
      *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15729,9 +15160,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
      dpct::library_data_t::real_half, nb11 / nb10, beta,
      (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
      cu_compute_type)));
- g_sycl_handles[g_main_device]->wait();
  }
- #endif
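Note: dropping the eager `wait()` calls leaves the batched GEMM submissions asynchronous; ordering is preserved because the work goes to the same queue, and synchronization now happens only where a result is actually consumed. The pattern in isolation (a generic SYCL sketch with hypothetical names; assumes an in-order queue and USM memory):

```cpp
#include <sycl/sycl.hpp>

// Enqueue device work without a blocking wait. On an in-order queue, later
// submissions observe the result; the host synchronizes once, at the end.
static void scale_async(sycl::queue &q, float *data, int n) {
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        data[i] *= 2.0f;  // data assumed to be USM device/shared memory
    });
    // intentionally no q.wait() here
}
```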
 
  if (no_mixed_dtypes) {
      const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
@@ -16232,10 +15661,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
  ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
  }
 
- static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
- }
-
  static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
  }
@@ -16612,9 +16037,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
      case GGML_OP_ROPE:
          func = ggml_sycl_rope;
          break;
-     case GGML_OP_ALIBI:
-         func = ggml_sycl_alibi;
-         break;
      case GGML_OP_IM2COL:
          func = ggml_sycl_im2col;
          break;
@@ -17744,7 +17166,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
      case GGML_OP_DIAG_MASK_INF:
      case GGML_OP_SOFT_MAX:
      case GGML_OP_ROPE:
-     case GGML_OP_ALIBI:
      case GGML_OP_IM2COL:
      case GGML_OP_POOL_2D:
      case GGML_OP_SUM_ROWS: