@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
 #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -3848,21 +3847,27 @@ static void concat_f32(const float *x,const float *y, float *dst, const int ne
     }
 }

-static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                        const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int ne13, const float sf0, const float sf1,
+                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+    int index = item_ct1.get_local_id(0) +
+                item_ct1.get_group(0) * item_ct1.get_local_range(0);
+    if (index >= ne10 * ne11 * ne12 * ne13) {
         return;
     }
     // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }

 static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
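Note: the rewritten upscale kernel switches from a fixed integer scale factor over a 3D launch to a flat 1D launch. Each work-item decodes its linear index into four destination coordinates, divides each coordinate by a per-dimension (float) scale factor, and reads the nearest source element through byte strides. A CPU sketch of the same index math — illustration only, with a hypothetical helper name and a contiguous source layout assumed:

#include <cstddef>

// Sketch: CPU reference for the index math in upscale_f32 above.
// ne1x are destination extents; sfX = dst_extent / src_extent per dim;
// byte strides nb0x are derived here from a contiguous source layout.
void upscale_ref(const float *x, float *dst,
                 int ne10, int ne11, int ne12, int ne13,
                 float sf0, float sf1, float sf2, float sf3) {
    const size_t nb00 = sizeof(float);
    const size_t nb01 = nb00 * (size_t)(ne10 / sf0);   // source extent, dim 0
    const size_t nb02 = nb01 * (size_t)(ne11 / sf1);   // source extent, dim 1
    const size_t nb03 = nb02 * (size_t)(ne12 / sf2);   // source extent, dim 2
    const int total = ne10 * ne11 * ne12 * ne13;
    for (int index = 0; index < total; ++index) {
        const int i10 = index % ne10;
        const int i11 = (index / ne10) % ne11;
        const int i12 = (index / (ne10 * ne11)) % ne12;
        const int i13 = index / (ne10 * ne11 * ne12);
        // nearest neighbor: truncate the down-scaled coordinate
        const int i00 = (int)(i10 / sf0);
        const int i01 = (int)(i11 / sf1);
        const int i02 = (int)(i12 / sf2);
        const int i03 = (int)(i13 / sf3);
        const char *src = (const char *)x + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00;
        dst[index] = *(const float *)src;
    }
}

Dividing by a float factor and truncating is what makes this nearest-neighbor; byte strides let the kernel handle non-contiguous sources.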
@@ -4192,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const block_q2_K * x = (const block_q2_K *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int n   = tid/32;
     const int l   = tid - 32*n;
     const int is  = 8*n + l/16;
@@ -4206,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
 }

 template<typename dst_t>
@@ -4227,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const int i = item_ct1.get_group(2);
     const block_q3_K * x = (const block_q3_K *) vx;

-#if QK_K == 256
     const int r   = item_ct1.get_local_id(2) / 4;
     const int tid = r/2;
     const int is0 = r%2;
@@ -4251,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t * hm = x[i].hmask;

     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
 }

-#if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;
@@ -4284,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
         m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
     }
 }
-#endif

 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
@@ -4293,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri

     const int i = item_ct1.get_group(2);

-#if QK_K == 256
     // assume 32 threads
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/8;
@@ -4317,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
 }

 template<typename dst_t>
@@ -4335,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri

     const int i = item_ct1.get_group(2);

-#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
     const int il  = tid/16;   // il is in 0...3
@@ -4362,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = item_ct1.get_local_id(2);
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;   // 0...3
-    const int in = tid%8;   // 0...7
-    const int is = tid/16;  // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
 }

 template<typename dst_t>
@@ -4382,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = item_ct1.get_group(2);
-#if QK_K == 256

     // assume 64 threads - this is very slightly better than the one below
     const int tid = item_ct1.get_local_id(2);
@@ -4402,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
 }

 template<typename dst_t>
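Note: all the `#else` branches deleted above implemented the experimental 64-wide `QK_K` variant; after this change the SYCL dequantize kernels assume the default 256-element K-quant superblock only. For orientation, a sketch of the q2_K superblock these kernels decode — field sizes mirror the upstream layout, but this struct is illustrative, not the ggml definition:

#include <cstdint>

// Sketch of the 256-wide q2_K superblock assumed by the kernels above;
// ggml_half is stubbed as uint16_t here.
constexpr int QK_K = 256;

struct block_q2_K_sketch {
    uint8_t  scales[QK_K / 16]; // per-16-weight 4-bit scale | 4-bit min
    uint8_t  qs[QK_K / 4];      // 2-bit quants, four per byte
    uint16_t dm[2];             // fp16 super-scale d and super-min dmin
};

// 84 bytes per 256 weights ~= 2.625 bits per weight; dequant is
// y = d*scale*q - dmin*min, as in dequantize_block_q2_K above.
static_assert(sizeof(block_q2_K_sketch) == 84, "q2_K superblock should be 84 bytes");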
@@ -4433,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4444,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }

 template<typename dst_t>
@@ -4461,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const block_iq2_xs * x = (const block_iq2_xs *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4470,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4485,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq2_s * x = (const block_iq2_s *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4493,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
 #pragma unroll
-    for (int j = 0; j < 8; ++j)
+    for (int j = 0; j < 8; ++j) {
         y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-
-#endif
-
+    }
 }

 template<typename dst_t>
@@ -4513,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
     const block_iq3_xxs * x = (const block_iq3_xxs *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4528,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4544,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq3_s * x = (const block_iq3_s *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4558,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4574,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_s * x = (const block_iq1_s *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4588,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4604,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     const block_iq1_m * x = (const block_iq1_m *) vx;

     const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
     const int il = tid/8; // 0...3
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
@@ -4622,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
     for (int j = 0; j < 8; ++j) {
         y[j] = d * (q[j] + delta);
     }
-#else
-    assert(false);
-#endif
-
 }

 template <typename dst_t>
@@ -4699,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,

     float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
     const int ix =
@@ -4750,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
         tmp += dall * sum1 - dmin * sum2;

     }
-#else
-    const int tid = item_ct1.get_local_id(2) /
-                    (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
-    const int ix = item_ct1.get_local_id(2) %
-                   (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float    * y = yy + i * QK_K + offset;
-        const uint8_t  * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const sycl::float2 dall =
-            x[i].dm.convert<float, sycl::rounding_mode::automatic>();
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x() * sum1 - dall.y() * sum2;
-    }
-
-#endif

     // sum up partial sums and write back result
 #pragma unroll
@@ -4823,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,

     float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
-
     const uint16_t kmask1 = 0x0303;
     const uint16_t kmask2 = 0x0f0f;

@@ -4877,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
         tmp += d * sum;

     }
-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif

     // sum up partial sums and write back result
 #pragma unroll
@@ -4939,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,

     const block_q4_K * x = (const block_q4_K *)vx + ib0;

-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5028,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 #endif

     }
-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif

     // sum up partial sums and write back result
 #pragma unroll
@@ -5092,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,

     float tmp = 0; // partial sum for thread in warp

-#if QK_K == 256
     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;
@@ -5169,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
                dmin * smin;
     }

-#else
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -5219,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

     const block_q6_K * x = (const block_q6_K *)vx + ib0;

-#if QK_K == 256
-
     const int tid =
         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
     const int ix =
@@ -5277,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

     }

-#else
-
-    const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
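Note: each `dequantize_mul_mat_vec_*` kernel ends with the `for (int mask = 16; ...)` loop kept as context here — a 32-lane XOR-shuffle ("butterfly") reduction that leaves the row's full dot product in every lane after log2(32) = 5 steps. A plain C++ model of that reduction, as a sketch:

#include <cstdio>

// CPU sketch of the warp butterfly reduction used above: lane[i ^ mask]
// plays the role of the subgroup XOR shuffle.
int main() {
    float lane[32];
    for (int i = 0; i < 32; ++i) lane[i] = float(i);   // per-lane partial sums

    for (int mask = 16; mask > 0; mask >>= 1) {
        float shuffled[32];
        for (int i = 0; i < 32; ++i) shuffled[i] = lane[i ^ mask];
        for (int i = 0; i < 32; ++i) lane[i] += shuffled[i];
    }
    printf("every lane now holds %.0f (expected %d)\n", lane[0], 31 * 32 / 2);
    return 0;
}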
@@ -6852,7 +6586,6 @@ static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;

     int v[2];
@@ -6894,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
     }

     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
-    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }

 template <int mmq_y>
@@ -6998,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
     }

 #pragma unroll
@@ -7045,7 +6728,6 @@ static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;

     int vl[2];
@@ -7087,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
     }

     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = bq8_1[0].ds[0];
-    const float d8_2 = bq8_1[1].ds[1];
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __SYCL_ARCH__ >= VER_4VEC
-
-#endif
 }

 template <int mmq_y>
@@ -7200,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

-#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
     }

 #pragma unroll
@@ -7382,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                      const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                      const uint8_t *kmask_iq2xs) {
-#if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

 #if QR2_XXS == 8
@@ -7423,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     return d * (sumi1 + sumi2);
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
@@ -7435,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                     const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

     const int ib32 = iqs;
@@ -7473,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

     const int ib32 = iqs;
@@ -7526,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
@@ -7537,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                      const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
 #if DPCT_COMPATIBILITY_TEMP >= \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

     const int ib32 = iqs;
@@ -7565,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    assert(false);
-    return 0.f;
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq3s_grid) {
-#if QK_K == 256
     const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

     const int ib32 = iqs;
@@ -7604,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                     (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
                     bq8_1[ib32].ds[0];
     return d * sumi;
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint32_t *iq1s_grid_gpu) {
-#if QK_K == 256
     const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

     const int ib32 = iqs;
@@ -7632,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
     const float d = d1q * bq8_1[ib32].ds[0];
     const float m = d1q * bq8_1[ib32].ds[1];
     return d * sumi + m * delta;
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ float
 vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-#if QK_K == 256
     const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

     const int ib32 = iqs;
@@ -7665,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
     const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    assert(false);
-#endif
 }

 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
@@ -7715,7 +7322,6 @@ static __dpct_inline__ float
 vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

-#if QK_K == 256
     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
     const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

@@ -7733,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
         sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
     }
     return d * (sumi1 + sumi2);
-#else
-    assert(false);
-#endif
 }

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
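Note: the surviving 256-wide vec_dot paths accumulate with `dpct::dp4a`, a four-way int8 dot-product-accumulate over packed 32-bit operands. A scalar reference of what that intrinsic computes — sketch only, signed variant:

#include <cstdint>

// Scalar sketch of dp4a(a, b, c): treat each 32-bit operand as four
// int8 lanes, multiply lane-wise, and accumulate into c.
int dp4a_ref(int a, int b, int c) {
    for (int k = 0; k < 4; ++k) {
        const int8_t ai = (int8_t)(a >> (8 * k));
        const int8_t bi = (int8_t)(b >> (8 * k));
        c += int(ai) * int(bi);
    }
    return c;
}

In the q4_K path the nibbles masked with 0x0f0f0f0f are in 0..15, so they stay in the non-negative int8 range and the signed dot product is exact.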
@@ -9227,12 +8830,11 @@ static void rope(
         dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-template<typename T, bool has_pos>
+template<typename T, bool has_pos, bool has_freq_facs>
 static void rope_neox(
     const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-    ,
-    const sycl::nd_item<3> &item_ct1) {
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
+    const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
     const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
                          item_ct1.get_local_id(1));

@@ -9260,8 +8862,10 @@ static void rope_neox(
     float cur_rot = inv_ndims * ic - ib;

     const int p = has_pos ? pos[i2] : 0;
+    const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
+
     const float theta_base =
-        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
+        p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;

     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
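Note: the new `has_freq_facs` flag threads optional per-pair frequency factors into NeoX RoPE — when present, each rotary pair's base angle is divided by its factor, as used by long-context RoPE variants. A scalar sketch of the angle computation, assuming the conventional theta_scale = freq_base^(-2/n_dims):

#include <cmath>
#include <vector>

// Sketch: per-pair RoPE angle with optional frequency factors.
// theta[i] = p * freq_scale * theta_scale^i / freq_factor[i]
std::vector<float> rope_thetas(int p, int n_dims, float freq_base, float freq_scale,
                               const float *freq_factors /* may be null */) {
    std::vector<float> theta(n_dims / 2);
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
    for (int i = 0; i < n_dims / 2; ++i) {
        const float ff = freq_factors ? freq_factors[i] : 1.0f;
        theta[i] = p * freq_scale * std::pow(theta_scale, (float)i) / ff;
    }
    return theta;
}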
@@ -9316,32 +8920,6 @@ static void rope_glm_f32(
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }

-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                      const int n_heads_log2_floor, const float m0, const float m1,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
 static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                            const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(1);
@@ -9443,7 +9021,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con


 template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                          const int nrows_y, const float scale, const float max_bias, const float m0,
                          const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9457,7 +9035,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
     const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

-    float slope = 0.0f;
+    float slope = 1.0f;

     // ALiBi
     if (max_bias > 0.0f) {
@@ -9482,7 +9060,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;

-        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+        const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

         vals[col] = val;
         max_val = sycl::max(max_val, val);
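Note: `soft_max_f32` drops the separate `pos` array — ALiBi is now applied by scaling the mask with a per-head slope, which stays 1.0f when `max_bias == 0` so plain causal masks pass through unchanged. A CPU sketch of the slope schedule, matching the formula in the deleted `alibi_f32` kernel (the m0/m1 definitions are an assumption from the surrounding ggml convention):

#include <cmath>
#include <cstdint>

// Sketch of the per-head ALiBi slope used when max_bias > 0;
// n_head_log2 is the largest power of two <= n_head.
float alibi_slope(uint32_t head, uint32_t n_head, float max_bias) {
    const uint32_t n_head_log2 = 1u << (uint32_t)std::floor(std::log2((float)n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_head_log2);
    const float m1 = std::pow(2.0f, -max_bias / (2.0f * n_head_log2));
    return head < n_head_log2 ? std::pow(m0, (float)(head + 1))
                              : std::pow(m1, (float)(2 * (head - n_head_log2) + 1));
}

// Fused softmax input: val = x*scale + slope*mask (slope == 1 without ALiBi)
float softmax_input(float x, float scale, float mask, float slope) {
    return x * scale + slope * mask;
}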
@@ -10112,18 +9690,17 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
         });
 }

-static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                             const int nb02, const int nb03, const int ne10, const int ne11,
+                             const int ne12, const int ne13, const float sf0, const float sf1,
+                             const float sf2, const float sf3, dpct::queue_ptr stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<1> item_ct1) {
+            upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
         });
 }

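Note: the launcher rounds the flattened element count up to whole work-groups because a SYCL `nd_range` global size must be a multiple of the local size; the rounded-up tail is rejected by the bounds check inside `upscale_f32`. The same pattern in a minimal, hypothetical kernel (block size chosen arbitrarily):

#include <sycl/sycl.hpp>

// Sketch of the 1D launch pattern used by upscale_f32_sycl above.
constexpr int BLOCK = 256;

void fill_ones(sycl::queue &q, float *dst, int n) {
    const int num_blocks = (n + BLOCK - 1) / BLOCK;   // ceil-divide into blocks
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK), sycl::range<1>(BLOCK)),
        [=](sycl::nd_item<1> it) {
            const int i = (int)it.get_global_id(0);
            if (i < n) {          // guard the rounded-up tail
                dst[i] = 1.0f;
            }
        });
}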
@@ -10225,7 +9802,6 @@ template <typename dst_t>
 static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10237,27 +9813,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                 dequantize_block_q2_K(vx, y, item_ct1);
             });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
 static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
@@ -10269,19 +9830,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                 dequantize_block_q3_K(vx, y, item_ct1);
             });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#endif
 }

 template <typename dst_t>
@@ -10342,7 +9890,6 @@ template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10354,27 +9901,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                 dequantize_block_q5_K(vx, y, item_ct1);
             });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
 static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
-#if QK_K == 256
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10386,20 +9918,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                 dequantize_block_q6_K(vx, y, item_ct1);
             });
     }
-#else
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-
-#endif
 }

 template <typename dst_t>
@@ -10551,9 +10069,6 @@ template <typename dst_t>
 static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = (k + QK_K - 1) / QK_K;
-#if QK_K == 64
-    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
-#else
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -10568,7 +10083,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
             });
         });
     }
-#endif
 }


@@ -12073,8 +11587,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                         const int nrows_y, const int nrows_dst,
                                         dpct::queue_ptr stream) try {

-#if QK_K == 256
-
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
@@ -12189,7 +11701,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
             });
         }
     }
-#endif
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -12903,7 +12414,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
                            const int32_t *pos, float freq_scale,
                            int p_delta_rows, float freq_base, float ext_factor,
                            float attn_factor, rope_corr_dims corr_dims,
-                           dpct::queue_ptr stream) {
+                           const float * freq_factors, dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
@@ -12913,38 +12424,48 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
     const float inv_ndims = -1.0f / n_dims;

     if (pos == nullptr) {
-        /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
-                                    p_delta_rows, ext_factor, attn_factor,
-                                    corr_dims, theta_scale, inv_ndims,
-                                    item_ct1);
-            });
+        if (freq_factors == nullptr) {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                               p_delta_rows, ext_factor, attn_factor,
+                                               corr_dims, theta_scale, inv_ndims, freq_factors,
+                                               item_ct1);
+                });
+        } else {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                              p_delta_rows, ext_factor, attn_factor,
+                                              corr_dims, theta_scale, inv_ndims, freq_factors,
+                                              item_ct1);
+                });
+        }
     } else {
-        /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
-                                   p_delta_rows, ext_factor, attn_factor,
-                                   corr_dims, theta_scale, inv_ndims, item_ct1);
-            });
+        if (freq_factors == nullptr) {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                              p_delta_rows, ext_factor, attn_factor,
+                                              corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
+                });
+        } else {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                             p_delta_rows, ext_factor, attn_factor,
+                                             corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
+                });
+        }
     }
 }

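Note: as with `has_pos`, the `freq_factors` null check is resolved once on the host and baked into the kernel as a template boolean, so the device loop stays branch-free at the cost of four instantiations. The shape of that dispatch, reduced to a host-only sketch:

#include <cstdio>

// Sketch of the dispatch: runtime nullptr checks select a template
// instantiation; the ternaries below fold away at compile time.
template <bool has_pos, bool has_freq_facs>
void rope_kernel_sketch(const int *pos, const float *freq_factors, int i) {
    const int   p  = has_pos       ? pos[i]          : 0;
    const float ff = has_freq_facs ? freq_factors[i] : 1.0f;
    printf("p=%d ff=%f\n", p, ff);
}

void dispatch(const int *pos, const float *freq_factors, int i) {
    if (pos == nullptr) {
        if (freq_factors == nullptr) rope_kernel_sketch<false, false>(pos, freq_factors, i);
        else                         rope_kernel_sketch<false, true >(pos, freq_factors, i);
    } else {
        if (freq_factors == nullptr) rope_kernel_sketch<true, false>(pos, freq_factors, i);
        else                         rope_kernel_sketch<true, true >(pos, freq_factors, i);
    }
}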
@@ -12964,20 +12485,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
12964
12485
  });
12965
12486
  }
12966
12487
 
12967
- static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
12968
- const int nrows, const int k_rows,
12969
- const int n_heads_log2_floor, const float m0,
12970
- const float m1, dpct::queue_ptr stream) {
12971
- const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
12972
- const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
12973
- const sycl::range<3> block_nums(1, nrows, num_blocks_x);
12974
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
12975
- [=](sycl::nd_item<3> item_ct1) {
12976
- alibi_f32(x, dst, ncols, k_rows,
12977
- n_heads_log2_floor, m0, m1, item_ct1);
12978
- });
12979
- }
12980
-
12981
12488
  static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
12982
12489
  const int nrows, dpct::queue_ptr stream) {
12983
12490
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13058,7 +12565,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
13058
12565
  }
13059
12566
 
13060
12567
  template <bool vals_smem, int ncols_template, int block_size_template>
13061
- static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
12568
+ static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
13062
12569
  const int nrows_y, const float scale, const float max_bias, const float m0,
13063
12570
  const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
13064
12571
  const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13068,7 +12575,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                              nrows_y, scale, max_bias, m0,
                                                                              m1, n_head_log2, item_ct1,
                                                                              local_buf_acc.get_pointer());
@@ -13076,7 +12583,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
     });
 }
 
-static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+static void soft_max_f32_sycl(const float * x, const float * mask,
                               float * dst, const int ncols_x, const int nrows_x,
                               const int nrows_y, const float scale, const float max_bias,
                               dpct::queue_ptr stream) {
@@ -13098,60 +12605,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
     const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
     if (n_local_scratch*sizeof(float) < local_mem_size) {
         if (ncols_x > max_block_size) {
-            soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+            soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                max_bias, m0, m1, n_head_log2, block_nums,
                                                block_dims, n_local_scratch, stream);
             return;
         }
         switch (ncols_x) {
             case 32:
-                soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                      max_bias, m0, m1, n_head_log2, block_nums,
                                                      block_dims, n_local_scratch, stream);
                 break;
             case 64:
-                soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                      max_bias, m0, m1, n_head_log2, block_nums,
                                                      block_dims, n_local_scratch, stream);
                 break;
             case 128:
-                soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
             case 256:
-                soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
             case 512:
-                soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
             case 1024:
-                soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
             case 2048:
-                soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
             case 4096:
-                soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
             default:
-                soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                    max_bias, m0, m1, n_head_log2, block_nums,
                                                    block_dims, n_local_scratch, stream);
                 break;
         }
     } else {
-        soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+        soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                             max_bias, m0, m1, n_head_log2, block_nums,
                                             block_dims, WARP_SIZE, stream);
     }
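Note: with the pos argument gone, the ALiBi bias is derived inside the softmax from max_bias, m0, m1, and n_head_log2 alone. A small sketch of the per-head slope schedule these parameters encode, mirroring the ggml softmax kernels (illustrative, not the SYCL device code):

    #include <cmath>
    #include <cstdint>

    // Per-head ALiBi slope: the first n_head_log2 heads use base m0, the
    // remaining heads use base m1 with odd exponents.
    static float alibi_slope_sketch(float max_bias, uint32_t n_head_log2,
                                    float m0, float m1, uint32_t h) {
        if (max_bias <= 0.0f) {
            return 1.0f; // no ALiBi: plain scaled softmax
        }
        return h < n_head_log2 ? powf(m0, (float) (h + 1))
                               : powf(m1, (float) (2 * (h - n_head_log2) + 1));
    }
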
@@ -14005,6 +13512,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
                                 const float *src0_dd, const float *src1_dd,
                                 float *dst_dd,
                                 const dpct::queue_ptr &main_stream) {
+#pragma message("TODO: generalize concat kernel for dim != 2")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7563")
+    int dim = dst->op_params[0];
+    GGML_ASSERT(dim == 2);
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
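Note: the concat axis now arrives explicitly via op_params[0], but the SYCL kernel still only implements concatenation along dim 2, hence the assert. A hypothetical shape helper illustrating the contract being enforced:

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Hypothetical helper: result shape of concatenating two 4-D tensors along
    // dim; every other dimension must match. The SYCL backend requires dim == 2.
    static std::array<int64_t, 4> concat_shape_sketch(const std::array<int64_t, 4> &a,
                                                      const std::array<int64_t, 4> &b,
                                                      int dim) {
        std::array<int64_t, 4> out = a;
        for (int i = 0; i < 4; ++i) {
            if (i == dim) {
                out[i] = a[i] + b[i];
            } else {
                assert(a[i] == b[i] && "non-concat dimensions must match");
            }
        }
        return out;
    }
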
@@ -14026,11 +13537,15 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];
 
-    upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                     main_stream);
 
     (void) src1;
     (void) dst;
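Note: replacing the single integer scale_factor with four per-dimension float ratios lets GGML_OP_UPSCALE express arbitrary, even fractional, resizes. A minimal 1-D CPU sketch of the nearest-neighbor mapping implied by sf = ne_dst / ne_src (illustrative; the real kernel operates on strided 4-D data):

    #include <algorithm>
    #include <vector>

    // Each destination index i maps back to source index floor(i / sf),
    // clamped so fractional factors cannot step past the end.
    static std::vector<float> upscale_1d_sketch(const std::vector<float> &src, int ne_dst) {
        const float sf = (float) ne_dst / (float) src.size();
        std::vector<float> dst((size_t) ne_dst);
        for (int i = 0; i < ne_dst; ++i) {
            const int i_src = std::min((int) ((float) i / sf), (int) src.size() - 1);
            dst[(size_t) i] = src[(size_t) i_src];
        }
        return dst;
    }
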
@@ -14486,6 +14001,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                                  ggml_tensor *dst, const float *src0_dd,
                                  const float *src1_dd, float *dst_dd,
                                  const dpct::queue_ptr &main_stream) {
+    const ggml_tensor * src2 = dst->src[2];
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -14511,6 +14027,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
+    const float * freq_factors = nullptr;
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
@@ -14521,6 +14038,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
+    if (is_neox) {
+        pos = (const int32_t *) src1_dd;
+
+        if (src2 != nullptr) {
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     rope_corr_dims corr_dims;
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
 
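Note: freq_factors (src2 of the ROPE op, used by long-context models such as Phi-3) holds one divisor per rotary dimension pair. A sketch of how it adjusts the usual NeoX frequency schedule; names here are illustrative:

    #include <cmath>

    // Rotary angle for dimension pair i0/2 at position p; a null freq_factors
    // behaves like dividing by 1.
    static float rope_theta_sketch(int p, int i0, int n_dims, float freq_base,
                                   const float *freq_factors) {
        const float theta_base = (float) p * powf(freq_base, -(float) i0 / (float) n_dims);
        const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
        return theta_base / ff;
    }
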
@@ -14532,13 +14059,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_sycl(
                 (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
+                attn_factor, corr_dims, freq_factors, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
                            ne00, n_dims, nrows, pos, freq_scale, ne01,
                            freq_base, ext_factor, attn_factor, corr_dims,
-                           main_stream);
+                           freq_factors, main_stream);
         } else {
             GGML_ASSERT(false);
         }
@@ -14562,36 +14089,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
 static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@@ -14746,12 +14243,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const ggml_tensor * src2 = dst->src[2];
-
-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
 
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
@@ -14763,25 +14257,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     memcpy(&scale, dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, dst->op_params + 1, sizeof(float));
 
-    // positions tensor
-    float * src2_dd = nullptr;
-    sycl_pool_alloc<float> src2_f;
-
-    const bool use_src2 = src2 != nullptr;
-
-    if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
-    }
-
-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                       nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 
@@ -15656,26 +15132,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-#if 0
-    // use syclGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                SYCL_CHECK(
-                    syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                        ne01, ne11, ne10,
-                        alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3], SYCL_R_16F, nb01/sizeof(half),
-                               (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
-                        beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                        cu_compute_type,
-                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15687,7 +15143,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
             nb11 / nb10, nb12 / nb10, beta,
             (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
             ne12 * ne13, cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     } else {
         const int ne23 = ne12*ne13;
 
@@ -15718,7 +15173,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                               nb02, nb03, nb12_scaled, nb13_scaled,
                               nbd2, nbd3, r2, r3, item_ct1);
                 });
-        }).wait();
+        });
     }
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
         *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15729,9 +15184,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
         dpct::library_data_t::real_half, nb11 / nb10, beta,
         (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
         cu_compute_type)));
-    g_sycl_handles[g_main_device]->wait();
     }
-#endif
 
     if (no_mixed_dtypes) {
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
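Note: removing the intermediate wait() calls (together with the dead #if 0 cuBLAS-style branch) makes the batched GEMM path asynchronous: work submitted to the same in-order queue is already ordered, so one synchronization at the end suffices. A toy sketch of the pattern, assuming an in-order sycl::queue:

    #include <sycl/sycl.hpp>

    int main() {
        // In-order queue: the second kernel sees the first kernel's writes
        // without any host-side wait in between.
        sycl::queue q{sycl::property::queue::in_order{}};
        float *data = sycl::malloc_device<float>(1024, q);

        q.parallel_for(sycl::range<1>(1024), [=](sycl::id<1> i) { data[i] = 1.0f; });
        q.parallel_for(sycl::range<1>(1024), [=](sycl::id<1> i) { data[i] *= 2.0f; });

        q.wait(); // a single synchronization point at the end is enough
        sycl::free(data, q);
        return 0;
    }
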
@@ -15814,6 +15267,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     } else {
         bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
+        use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
 
         if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
             use_mul_mat_q = false;
@@ -16232,10 +15686,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
 }
 
-static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
-}
-
 static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
 }
@@ -16612,9 +16062,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ROPE:
             func = ggml_sycl_rope;
             break;
-        case GGML_OP_ALIBI:
-            func = ggml_sycl_alibi;
-            break;
         case GGML_OP_IM2COL:
             func = ggml_sycl_im2col;
             break;
@@ -17744,7 +17191,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS: