llama_cpp 0.15.0 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-backend.h"
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ #define GGML_RPC_MAX_SERVERS 16
11
+
12
+ // backend API
13
+ GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
14
+ GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
15
+
16
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
17
+
18
+ GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
19
+
20
+ GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
21
+
22
+ #ifdef __cplusplus
23
+ }
24
+ #endif
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
3154
3154
  #define SYCL_SCALE_BLOCK_SIZE 256
3155
3155
  #define SYCL_CLAMP_BLOCK_SIZE 256
3156
3156
  #define SYCL_ROPE_BLOCK_SIZE 256
3157
- #define SYCL_ALIBI_BLOCK_SIZE 32
3158
3157
  #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
3159
3158
  #define SYCL_QUANTIZE_BLOCK_SIZE 256
3160
3159
  #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -8330,24 +8329,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
8330
8329
  const int blocks_per_row = ncols / qk;
8331
8330
  const int blocks_per_warp = vdr * WARP_SIZE / qi;
8332
8331
 
8333
- // partial sum for each thread
8332
+ const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
8333
+
8334
+ // partial sum for each thread
8334
8335
  float tmp = 0.0f;
8335
8336
 
8336
8337
  const block_q_t * x = (const block_q_t *) vx;
8337
8338
  const block_q8_1 * y = (const block_q8_1 *) vy;
8338
8339
 
8339
- for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
8340
+ for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
8340
8341
  i += blocks_per_warp) {
8341
- const int ibx = row*blocks_per_row + i; // x block index
8342
+ const int ibx = row * blocks_per_row + i; // x block index
8342
8343
 
8343
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
8344
+ const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
8344
8345
 
8345
- const int iqs =
8346
- vdr *
8347
- (item_ct1.get_local_id(2) %
8348
- (qi / vdr)); // x block quant index when casting the quants to int
8346
+ const int iqs =
8347
+ vdr *
8348
+ (item_ct1.get_local_id(2) -
8349
+ i * qi_vdr); // x block quant index when casting the quants to int
8349
8350
 
8350
- tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
8351
+ tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
8351
8352
  }
8352
8353
 
8353
8354
  // sum up partial sums and write back result
@@ -9314,32 +9315,6 @@ static void rope_glm_f32(
9314
9315
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
9315
9316
  }
9316
9317
 
9317
- static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
9318
- const int n_heads_log2_floor, const float m0, const float m1,
9319
- const sycl::nd_item<3> &item_ct1) {
9320
- const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
9321
- item_ct1.get_local_id(2);
9322
-
9323
- if (col >= ncols) {
9324
- return;
9325
- }
9326
-
9327
- const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
9328
- item_ct1.get_local_id(1);
9329
- const int i = row*ncols + col;
9330
-
9331
- const int k = row/k_rows;
9332
-
9333
- float m_k;
9334
- if (k < n_heads_log2_floor) {
9335
- m_k = dpct::pow(m0, k + 1);
9336
- } else {
9337
- m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
9338
- }
9339
-
9340
- dst[i] = col * m_k + x[i];
9341
- }
9342
-
9343
9318
  static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
9344
9319
  const sycl::nd_item<3> &item_ct1) {
9345
9320
  const int row = item_ct1.get_group(1);
@@ -9441,7 +9416,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
9441
9416
 
9442
9417
 
9443
9418
  template <bool vals_smem, int ncols_template, int block_size_template>
9444
- static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
9419
+ static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
9445
9420
  const int nrows_y, const float scale, const float max_bias, const float m0,
9446
9421
  const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
9447
9422
  const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9455,7 +9430,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
9455
9430
  const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
9456
9431
  const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
9457
9432
 
9458
- float slope = 0.0f;
9433
+ float slope = 1.0f;
9459
9434
 
9460
9435
  // ALiBi
9461
9436
  if (max_bias > 0.0f) {
@@ -9480,7 +9455,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
9480
9455
  const int ix = rowx*ncols + col;
9481
9456
  const int iy = rowy*ncols + col;
9482
9457
 
9483
- const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
9458
+ const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
9484
9459
 
9485
9460
  vals[col] = val;
9486
9461
  max_val = sycl::max(max_val, val);
@@ -12962,20 +12937,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
12962
12937
  });
12963
12938
  }
12964
12939
 
12965
- static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
12966
- const int nrows, const int k_rows,
12967
- const int n_heads_log2_floor, const float m0,
12968
- const float m1, dpct::queue_ptr stream) {
12969
- const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
12970
- const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
12971
- const sycl::range<3> block_nums(1, nrows, num_blocks_x);
12972
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
12973
- [=](sycl::nd_item<3> item_ct1) {
12974
- alibi_f32(x, dst, ncols, k_rows,
12975
- n_heads_log2_floor, m0, m1, item_ct1);
12976
- });
12977
- }
12978
-
12979
12940
  static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
12980
12941
  const int nrows, dpct::queue_ptr stream) {
12981
12942
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13056,7 +13017,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
13056
13017
  }
13057
13018
 
13058
13019
  template <bool vals_smem, int ncols_template, int block_size_template>
13059
- static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
13020
+ static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
13060
13021
  const int nrows_y, const float scale, const float max_bias, const float m0,
13061
13022
  const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
13062
13023
  const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13066,7 +13027,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
13066
13027
  cgh.parallel_for(
13067
13028
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
13068
13029
  [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
13069
- soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
13030
+ soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
13070
13031
  nrows_y, scale, max_bias, m0,
13071
13032
  m1, n_head_log2, item_ct1,
13072
13033
  local_buf_acc.get_pointer());
@@ -13074,7 +13035,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
13074
13035
  });
13075
13036
  }
13076
13037
 
13077
- static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
13038
+ static void soft_max_f32_sycl(const float * x, const float * mask,
13078
13039
  float * dst, const int ncols_x, const int nrows_x,
13079
13040
  const int nrows_y, const float scale, const float max_bias,
13080
13041
  dpct::queue_ptr stream) {
@@ -13096,60 +13057,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
13096
13057
  const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
13097
13058
  if (n_local_scratch*sizeof(float) < local_mem_size) {
13098
13059
  if (ncols_x > max_block_size) {
13099
- soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13060
+ soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
13100
13061
  max_bias, m0, m1, n_head_log2, block_nums,
13101
13062
  block_dims, n_local_scratch, stream);
13102
13063
  return;
13103
13064
  }
13104
13065
  switch (ncols_x) {
13105
13066
  case 32:
13106
- soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13067
+ soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
13107
13068
  max_bias, m0, m1, n_head_log2, block_nums,
13108
13069
  block_dims, n_local_scratch, stream);
13109
13070
  break;
13110
13071
  case 64:
13111
- soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13072
+ soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
13112
13073
  max_bias, m0, m1, n_head_log2, block_nums,
13113
13074
  block_dims, n_local_scratch, stream);
13114
13075
  break;
13115
13076
  case 128:
13116
- soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13077
+ soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
13117
13078
  max_bias, m0, m1, n_head_log2, block_nums,
13118
13079
  block_dims, n_local_scratch, stream);
13119
13080
  break;
13120
13081
  case 256:
13121
- soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13082
+ soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
13122
13083
  max_bias, m0, m1, n_head_log2, block_nums,
13123
13084
  block_dims, n_local_scratch, stream);
13124
13085
  break;
13125
13086
  case 512:
13126
- soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13087
+ soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
13127
13088
  max_bias, m0, m1, n_head_log2, block_nums,
13128
13089
  block_dims, n_local_scratch, stream);
13129
13090
  break;
13130
13091
  case 1024:
13131
- soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13092
+ soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
13132
13093
  max_bias, m0, m1, n_head_log2, block_nums,
13133
13094
  block_dims, n_local_scratch, stream);
13134
13095
  break;
13135
13096
  case 2048:
13136
- soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13097
+ soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
13137
13098
  max_bias, m0, m1, n_head_log2, block_nums,
13138
13099
  block_dims, n_local_scratch, stream);
13139
13100
  break;
13140
13101
  case 4096:
13141
- soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13102
+ soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
13142
13103
  max_bias, m0, m1, n_head_log2, block_nums,
13143
13104
  block_dims, n_local_scratch, stream);
13144
13105
  break;
13145
13106
  default:
13146
- soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13107
+ soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
13147
13108
  max_bias, m0, m1, n_head_log2, block_nums,
13148
13109
  block_dims, n_local_scratch, stream);
13149
13110
  break;
13150
13111
  }
13151
13112
  } else {
13152
- soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
13113
+ soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
13153
13114
  max_bias, m0, m1, n_head_log2, block_nums,
13154
13115
  block_dims, WARP_SIZE, stream);
13155
13116
  }
@@ -14026,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
14026
13987
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
14027
13988
  GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
14028
13989
 
13990
+ #pragma message("TODO: generalize upscale operator")
13991
+ #pragma message(" https://github.com/ggerganov/ggml/pull/814")
13992
+ GGML_ASSERT(false && "TODO: generalize upscale operator");
13993
+
14029
13994
  const int scale_factor = dst->op_params[0];
14030
13995
 
14031
13996
  upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
@@ -14560,36 +14525,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
14560
14525
  (void) src1_dd;
14561
14526
  }
14562
14527
 
14563
- inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
14564
- ggml_tensor *dst, const float *src0_dd,
14565
- const float *src1_dd, float *dst_dd,
14566
- const dpct::queue_ptr &main_stream) {
14567
-
14568
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
14569
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
14570
-
14571
- GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
14572
- const int64_t nrows = ggml_nrows(src0);
14573
-
14574
- //const int n_past = ((int32_t *) dst->op_params)[0];
14575
- const int n_head = ((int32_t *) dst->op_params)[1];
14576
- float max_bias;
14577
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
14578
-
14579
- //GGML_ASSERT(ne01 + n_past == ne00);
14580
- GGML_ASSERT(n_head == ne02);
14581
-
14582
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
14583
-
14584
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
14585
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
14586
-
14587
- alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
14588
-
14589
- (void) src1;
14590
- (void) src1_dd;
14591
- }
14592
-
14593
14528
  static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
14594
14529
  const ggml_tensor *src1, ggml_tensor *dst,
14595
14530
  const float *src0_dd, const float *src1_dd,
@@ -14744,12 +14679,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
14744
14679
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
14745
14680
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
14746
14681
 
14747
- const ggml_tensor * src2 = dst->src[2];
14748
-
14749
- #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
14682
+ #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
14750
14683
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
14751
14684
  GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
14752
- GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
14753
14685
 
14754
14686
  const int64_t ne00 = src0->ne[0];
14755
14687
  const int64_t nrows_x = ggml_nrows(src0);
@@ -14761,25 +14693,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
14761
14693
  memcpy(&scale, dst->op_params + 0, sizeof(float));
14762
14694
  memcpy(&max_bias, dst->op_params + 1, sizeof(float));
14763
14695
 
14764
- // positions tensor
14765
- float * src2_dd = nullptr;
14766
- sycl_pool_alloc<float> src2_f;
14767
-
14768
- const bool use_src2 = src2 != nullptr;
14769
-
14770
- if (use_src2) {
14771
- const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
14772
-
14773
- if (src2_on_device) {
14774
- ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
14775
- src2_dd = (float *) src2_extra->data_device[g_main_device];
14776
- } else {
14777
- src2_dd = src2_f.alloc(ggml_nelements(src2));
14778
- SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
14779
- }
14780
- }
14781
-
14782
- soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
14696
+ soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
14783
14697
  nrows_x, nrows_y, scale, max_bias, main_stream);
14784
14698
  }
14785
14699
 
@@ -15654,26 +15568,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15654
15568
  const int64_t r2 = ne12/ne02;
15655
15569
  const int64_t r3 = ne13/ne03;
15656
15570
 
15657
- #if 0
15658
- // use syclGemmEx
15659
- {
15660
- for (int i13 = 0; i13 < ne13; ++i13) {
15661
- for (int i12 = 0; i12 < ne12; ++i12) {
15662
- int i03 = i13 / r3;
15663
- int i02 = i12 / r2;
15664
-
15665
- SYCL_CHECK(
15666
- syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
15667
- ne01, ne11, ne10,
15668
- alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
15669
- (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
15670
- beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
15671
- cu_compute_type,
15672
- CUBLAS_GEMM_DEFAULT_TENSOR_OP));
15673
- }
15674
- }
15675
- }
15676
- #else
15677
15571
  if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
15678
15572
  // there is no broadcast and src0, src1 are contiguous across dims 2, 3
15679
15573
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15685,7 +15579,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15685
15579
  nb11 / nb10, nb12 / nb10, beta,
15686
15580
  (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
15687
15581
  ne12 * ne13, cu_compute_type)));
15688
- g_sycl_handles[g_main_device]->wait();
15689
15582
  } else {
15690
15583
  const int ne23 = ne12*ne13;
15691
15584
 
@@ -15716,7 +15609,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15716
15609
  nb02, nb03, nb12_scaled, nb13_scaled,
15717
15610
  nbd2, nbd3, r2, r3, item_ct1);
15718
15611
  });
15719
- }).wait();
15612
+ });
15720
15613
  }
15721
15614
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
15722
15615
  *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15727,9 +15620,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15727
15620
  dpct::library_data_t::real_half, nb11 / nb10, beta,
15728
15621
  (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
15729
15622
  cu_compute_type)));
15730
- g_sycl_handles[g_main_device]->wait();
15731
15623
  }
15732
- #endif
15733
15624
 
15734
15625
  if (no_mixed_dtypes) {
15735
15626
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
@@ -16230,10 +16121,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
16230
16121
  ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
16231
16122
  }
16232
16123
 
16233
- static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
16234
- ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
16235
- }
16236
-
16237
16124
  static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
16238
16125
  ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
16239
16126
  }
@@ -16610,9 +16497,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
16610
16497
  case GGML_OP_ROPE:
16611
16498
  func = ggml_sycl_rope;
16612
16499
  break;
16613
- case GGML_OP_ALIBI:
16614
- func = ggml_sycl_alibi;
16615
- break;
16616
16500
  case GGML_OP_IM2COL:
16617
16501
  func = ggml_sycl_im2col;
16618
16502
  break;
@@ -17742,7 +17626,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
17742
17626
  case GGML_OP_DIAG_MASK_INF:
17743
17627
  case GGML_OP_SOFT_MAX:
17744
17628
  case GGML_OP_ROPE:
17745
- case GGML_OP_ALIBI:
17746
17629
  case GGML_OP_IM2COL:
17747
17630
  case GGML_OP_POOL_2D:
17748
17631
  case GGML_OP_SUM_ROWS: