llama_cpp 0.15.0 → 0.15.2

@@ -0,0 +1,24 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include "ggml-backend.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #define GGML_RPC_MAX_SERVERS 16
+
+ // backend API
+ GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+ GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+
+ GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+
+ GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+ #ifdef __cplusplus
+ }
+ #endif
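
The header added above exposes the client-side API of the new RPC backend. The snippet below is only a rough usage sketch based on these declarations; the "host:port" endpoint format and byte units for the reported memory are assumptions, not something this diff specifies.

    #include <stdio.h>
    #include "ggml-rpc.h"   // the header introduced in this release

    int main(void) {
        // assumed endpoint of an already running RPC server ("host:port")
        const char * endpoint = "127.0.0.1:50052";

        // create a backend that forwards work to the remote server
        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (backend == NULL || !ggml_backend_is_rpc(backend)) {
            fprintf(stderr, "failed to initialize RPC backend\n");
            return 1;
        }

        // ask the remote device how much memory it reports (assumed to be bytes)
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("remote device memory: %zu free / %zu total\n", free_mem, total_mem);

        // buffers destined for the remote device would use the matching buffer type:
        // ggml_backend_buffer_type_t buft = ggml_backend_rpc_buffer_type(endpoint);

        ggml_backend_free(backend);   // ggml_backend_free() comes from ggml-backend.h
        return 0;
    }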
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
  #define SYCL_SCALE_BLOCK_SIZE 256
  #define SYCL_CLAMP_BLOCK_SIZE 256
  #define SYCL_ROPE_BLOCK_SIZE 256
- #define SYCL_ALIBI_BLOCK_SIZE 32
  #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
  #define SYCL_QUANTIZE_BLOCK_SIZE 256
  #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -8330,24 +8329,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
      const int blocks_per_row = ncols / qk;
      const int blocks_per_warp = vdr * WARP_SIZE / qi;

-     // partial sum for each thread
+     const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
+
+     // partial sum for each thread
      float tmp = 0.0f;

      const block_q_t * x = (const block_q_t *) vx;
      const block_q8_1 * y = (const block_q8_1 *) vy;

-     for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+     for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
           i += blocks_per_warp) {
-         const int ibx = row*blocks_per_row + i; // x block index
+         const int ibx = row * blocks_per_row + i; // x block index

-         const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+         const int iby = i * (qk / QK8_1); // y block index that aligns with ibx

-         const int iqs =
-             vdr *
-             (item_ct1.get_local_id(2) %
-              (qi / vdr)); // x block quant index when casting the quants to int
+         const int iqs =
+             vdr *
+             (item_ct1.get_local_id(2) -
+              i * qi_vdr); // x block quant index when casting the quants to int

-         tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+         tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
      }

      // sum up partial sums and write back result
@@ -9314,32 +9315,6 @@ static void rope_glm_f32(
          dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
      }

- static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                       const int n_heads_log2_floor, const float m0, const float m1,
-                       const sycl::nd_item<3> &item_ct1) {
-     const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                     item_ct1.get_local_id(2);
-
-     if (col >= ncols) {
-         return;
-     }
-
-     const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                     item_ct1.get_local_id(1);
-     const int i = row*ncols + col;
-
-     const int k = row/k_rows;
-
-     float m_k;
-     if (k < n_heads_log2_floor) {
-         m_k = dpct::pow(m0, k + 1);
-     } else {
-         m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-     }
-
-     dst[i] = col * m_k + x[i];
- }
-
  static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                             const sycl::nd_item<3> &item_ct1) {
      const int row = item_ct1.get_group(1);
@@ -9441,7 +9416,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con


  template <bool vals_smem, int ncols_template, int block_size_template>
- static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                           const int nrows_y, const float scale, const float max_bias, const float m0,
                           const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
      const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9455,7 +9430,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
      const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
      const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

-     float slope = 0.0f;
+     float slope = 1.0f;

      // ALiBi
      if (max_bias > 0.0f) {
@@ -9480,7 +9455,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
          const int ix = rowx*ncols + col;
          const int iy = rowy*ncols + col;

-         const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+         const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

          vals[col] = val;
          max_val = sycl::max(max_val, val);
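
With this change the ALiBi bias is folded into the mask: the kernel scales mask[iy] by a per-head slope instead of reading a separate pos tensor. For reference, the slope follows the same m0/m1 formula as the ggml_sycl_op_alibi code removed further down; the helper below is a hypothetical host-side sketch of that derivation, not part of the patch.

    #include <math.h>

    // hypothetical helper: ALiBi slope for 0-based head index h,
    // mirroring the m0/m1 formula used by the removed ggml_sycl_op_alibi
    static float alibi_slope(float max_bias, int n_head, int h) {
        const int   n_head_log2 = 1 << (int) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -max_bias          / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, (float)(h + 1))
                               : powf(m1, (float)(2 * (h - n_head_log2) + 1));
    }

When max_bias is zero the kernel skips this entirely and keeps slope = 1.0f, so the mask is applied unscaled.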
@@ -12962,20 +12937,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
                           });
  }

- static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
-                            const int nrows, const int k_rows,
-                            const int n_heads_log2_floor, const float m0,
-                            const float m1, dpct::queue_ptr stream) {
-     const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
-     const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
-     const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              alibi_f32(x, dst, ncols, k_rows,
-                                        n_heads_log2_floor, m0, m1, item_ct1);
-                          });
- }
-
  static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                                const int nrows, dpct::queue_ptr stream) {
      const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13056,7 +13017,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
  }

  template <bool vals_smem, int ncols_template, int block_size_template>
- static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+ static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                     const int nrows_y, const float scale, const float max_bias, const float m0,
                                     const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                     const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13066,7 +13027,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
          cgh.parallel_for(
              sycl::nd_range<3>(block_nums * block_dims, block_dims),
              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                 soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+                 soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                               nrows_y, scale, max_bias, m0,
                                                                               m1, n_head_log2, item_ct1,
                                                                               local_buf_acc.get_pointer());
@@ -13074,7 +13035,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
          });
  }

- static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+ static void soft_max_f32_sycl(const float * x, const float * mask,
                                float * dst, const int ncols_x, const int nrows_x,
                                const int nrows_y, const float scale, const float max_bias,
                                dpct::queue_ptr stream) {
@@ -13096,60 +13057,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
      const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
      if (n_local_scratch*sizeof(float) < local_mem_size) {
          if (ncols_x > max_block_size) {
-             soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+             soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                 max_bias, m0, m1, n_head_log2, block_nums,
                                                 block_dims, n_local_scratch, stream);
              return;
          }
          switch (ncols_x) {
              case 32:
-                 soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                  break;
              case 64:
-                 soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                  break;
              case 128:
-                 soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                  break;
              case 256:
-                 soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                  break;
              case 512:
-                 soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                  break;
              case 1024:
-                 soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                           max_bias, m0, m1, n_head_log2, block_nums,
                                                           block_dims, n_local_scratch, stream);
                  break;
              case 2048:
-                 soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                           max_bias, m0, m1, n_head_log2, block_nums,
                                                           block_dims, n_local_scratch, stream);
                  break;
              case 4096:
-                 soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                           max_bias, m0, m1, n_head_log2, block_nums,
                                                           block_dims, n_local_scratch, stream);
                  break;
              default:
-                 soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                 soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
                  break;
          }
      } else {
-         soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+         soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                              max_bias, m0, m1, n_head_log2, block_nums,
                                              block_dims, WARP_SIZE, stream);
      }
@@ -14026,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
      GGML_ASSERT(dst->type == GGML_TYPE_F32);
      GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

+ #pragma message("TODO: generalize upscale operator")
+ #pragma message(" https://github.com/ggerganov/ggml/pull/814")
+     GGML_ASSERT(false && "TODO: generalize upscale operator");
+
      const int scale_factor = dst->op_params[0];

      upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
@@ -14560,36 +14525,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
      (void) src1_dd;
  }

- inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                                ggml_tensor *dst, const float *src0_dd,
-                                const float *src1_dd, float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-     GGML_ASSERT(src0->type == GGML_TYPE_F32);
-     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-     GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
-     const int64_t nrows = ggml_nrows(src0);
-
-     //const int n_past = ((int32_t *) dst->op_params)[0];
-     const int n_head = ((int32_t *) dst->op_params)[1];
-     float max_bias;
-     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-     //GGML_ASSERT(ne01 + n_past == ne00);
-     GGML_ASSERT(n_head == ne02);
-
-     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-     alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-     (void) src1;
-     (void) src1_dd;
- }
-
  static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_dd, const float *src1_dd,
@@ -14744,12 +14679,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
      GGML_ASSERT(src0->type == GGML_TYPE_F32);
      GGML_ASSERT( dst->type == GGML_TYPE_F32);

-     const ggml_tensor * src2 = dst->src[2];
-
- #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+ #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
      GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-     GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional

      const int64_t ne00 = src0->ne[0];
      const int64_t nrows_x = ggml_nrows(src0);
@@ -14761,25 +14693,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
      memcpy(&scale, dst->op_params + 0, sizeof(float));
      memcpy(&max_bias, dst->op_params + 1, sizeof(float));

-     // positions tensor
-     float * src2_dd = nullptr;
-     sycl_pool_alloc<float> src2_f;
-
-     const bool use_src2 = src2 != nullptr;
-
-     if (use_src2) {
-         const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-         if (src2_on_device) {
-             ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-             src2_dd = (float *) src2_extra->data_device[g_main_device];
-         } else {
-             src2_dd = src2_f.alloc(ggml_nelements(src2));
-             SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-         }
-     }
-
-     soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+     soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                        nrows_x, nrows_y, scale, max_bias, main_stream);
  }

@@ -15654,26 +15568,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
      const int64_t r2 = ne12/ne02;
      const int64_t r3 = ne13/ne03;

- #if 0
-     // use syclGemmEx
-     {
-         for (int i13 = 0; i13 < ne13; ++i13) {
-             for (int i12 = 0; i12 < ne12; ++i12) {
-                 int i03 = i13 / r3;
-                 int i02 = i12 / r2;
-
-                 SYCL_CHECK(
-                     syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                                ne01, ne11, ne10,
-                                alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
-                                (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
-                                beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                                cu_compute_type,
-                                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-             }
-         }
-     }
- #else
      if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
          // there is no broadcast and src0, src1 are contiguous across dims 2, 3
          SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15685,7 +15579,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
              nb11 / nb10, nb12 / nb10, beta,
              (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
              ne12 * ne13, cu_compute_type)));
-         g_sycl_handles[g_main_device]->wait();
      } else {
          const int ne23 = ne12*ne13;

@@ -15716,7 +15609,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                         nb02, nb03, nb12_scaled, nb13_scaled,
                         nbd2, nbd3, r2, r3, item_ct1);
              });
-         }).wait();
+         });
      }
      SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
          *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15727,9 +15620,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
          dpct::library_data_t::real_half, nb11 / nb10, beta,
          (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
          cu_compute_type)));
-         g_sycl_handles[g_main_device]->wait();
      }
- #endif

      if (no_mixed_dtypes) {
          const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
@@ -16230,10 +16121,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
      ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
  }

- static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
- }
-
  static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
  }
@@ -16610,9 +16497,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
          case GGML_OP_ROPE:
              func = ggml_sycl_rope;
              break;
-         case GGML_OP_ALIBI:
-             func = ggml_sycl_alibi;
-             break;
          case GGML_OP_IM2COL:
              func = ggml_sycl_im2col;
              break;
@@ -17742,7 +17626,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
          case GGML_OP_DIAG_MASK_INF:
          case GGML_OP_SOFT_MAX:
          case GGML_OP_ROPE:
-         case GGML_OP_ALIBI:
          case GGML_OP_IM2COL:
          case GGML_OP_POOL_2D:
          case GGML_OP_SUM_ROWS: