llama_cpp 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -740,11 +740,7 @@ namespace dpct
 
  sycl::queue &default_queue()
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return out_of_order_queue();
- #else
  return in_order_queue();
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  void queues_wait_and_throw()
@@ -763,11 +759,7 @@ namespace dpct
 
  sycl::queue *create_queue(bool enable_exception_handler = false)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return create_out_of_order_queue(enable_exception_handler);
- #else
  return create_in_order_queue(enable_exception_handler);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  sycl::queue *create_queue(sycl::context context, sycl::device device,
@@ -1075,11 +1067,6 @@ namespace dpct
  static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
  const void *ptr)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return mem_mgr::instance().is_device_ptr(ptr)
- ? pointer_access_attribute::device_only
- : pointer_access_attribute::host_only;
- #else
  switch (sycl::get_pointer_type(ptr, q.get_context()))
  {
  case sycl::usm::alloc::unknown:
@@ -1090,7 +1077,6 @@ namespace dpct
  case sycl::usm::alloc::host:
  return pointer_access_attribute::host_device;
  }
- #endif
  }
 
  template <typename ArgT>
@@ -1273,11 +1259,7 @@ namespace dpct
 
  static inline void *dpct_malloc(size_t size, sycl::queue &q)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
- #else
  return sycl::malloc_device(size, q.get_device(), q.get_context());
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
@@ -1301,25 +1283,7 @@ namespace dpct
  static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
  valueT value, size_t size)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- assert(mm.is_device_ptr(dev_ptr));
- auto alloc = mm.translate_ptr(dev_ptr);
- size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
- return q.submit([&](sycl::handler &cgh)
- {
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- auto new_buffer = alloc.buffer.reinterpret<valueT>(
- sycl::range<1>(alloc.size / sizeof(valueT)));
- sycl::accessor<valueT, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(new_buffer, cgh, r, o);
- cgh.fill(acc, value); });
- #else
  return q.fill(dev_ptr, value, size);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  /**
@@ -1413,72 +1377,8 @@ namespace dpct
  {
  if (!size)
  return sycl::event{};
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
- switch (real_direction)
- {
- case host_to_host:
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
- case host_to_device:
- {
- auto alloc = mm.translate_ptr(to_ptr);
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(from_ptr, acc); });
- }
- case device_to_host:
- {
- auto alloc = mm.translate_ptr(from_ptr);
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(acc, to_ptr); });
- }
- case device_to_device:
- {
- auto to_alloc = mm.translate_ptr(to_ptr);
- auto from_alloc = mm.translate_ptr(from_ptr);
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh, r, to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh, r, from_o);
- cgh.copy(from_acc, to_acc); });
- }
- default:
- throw std::runtime_error("dpct_memcpy: invalid direction value");
- }
- #else
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
  GGML_UNUSED(direction);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  // Get actual copy range and make sure it will not exceed range.
@@ -1618,45 +1518,15 @@ namespace dpct
  break;
  }
  case device_to_device:
- #ifdef DPCT_USM_LEVEL_NONE
- {
- auto &mm = mem_mgr::instance();
- auto to_alloc = mm.translate_ptr(to_surface);
- auto from_alloc = mm.translate_ptr(from_surface);
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh,
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh,
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
- size,
- [=](sycl::id<3> id) {
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
- from_acc[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- }
- #else
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.parallel_for<class dpct_memcpy_3d_detail>(
- size,
- [=](sycl::id<3> id) {
- to_surface[get_offset(id, to_slice, to_range.get(0))] =
- from_surface[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- #endif
- break;
+ event_list.push_back(q.submit([&](sycl::handler &cgh){
+ cgh.depends_on(dep_events);
+ cgh.parallel_for<class dpct_memcpy_3d_detail>(
+ size,
+ [=](sycl::id<3> id) {
+ to_surface[get_offset(id, to_slice, to_range.get(0))] =
+ from_surface[get_offset(id, from_slice, from_range.get(0))];
+ }); }));
+ break;
  default:
  throw std::runtime_error("dpct_memcpy: invalid direction value");
  }
@@ -1754,11 +1624,7 @@ namespace dpct
  {
  if (ptr)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- detail::mem_mgr::instance().mem_free(ptr);
- #else
  sycl::free(ptr, q.get_context());
- #endif // DPCT_USM_LEVEL_NONE
  }
  }
 
@@ -1766,11 +1632,7 @@ namespace dpct
  inline auto get_memory(const void *x)
  {
  T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
- #ifdef DPCT_USM_LEVEL_NONE
- return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
- #else
  return new_x;
- #endif
  }
 
  template <typename T>
@@ -2222,72 +2084,8 @@ namespace dpct
  {
  if (!size)
  return sycl::event{};
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
- switch (real_direction)
- {
- case host_to_host:
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
- case host_to_device:
- {
- auto alloc = mm.translate_ptr(to_ptr);
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(from_ptr, acc); });
- }
- case device_to_host:
- {
- auto alloc = mm.translate_ptr(from_ptr);
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(acc, to_ptr); });
- }
- case device_to_device:
- {
- auto to_alloc = mm.translate_ptr(to_ptr);
- auto from_alloc = mm.translate_ptr(from_ptr);
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh, r, to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh, r, from_o);
- cgh.copy(from_acc, to_acc); });
- }
- default:
- throw std::runtime_error("dpct_memcpy: invalid direction value");
- }
- #else
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
  GGML_UNUSED(direction);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  // Get actual copy range and make sure it will not exceed range.
@@ -2427,34 +2225,6 @@ namespace dpct
  break;
  }
  case device_to_device:
- #ifdef DPCT_USM_LEVEL_NONE
- {
- auto &mm = mem_mgr::instance();
- auto to_alloc = mm.translate_ptr(to_surface);
- auto from_alloc = mm.translate_ptr(from_surface);
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh,
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh,
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
- size,
- [=](sycl::id<3> id) {
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
- from_acc[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- }
- #else
  event_list.push_back(q.submit([&](sycl::handler &cgh)
  {
  cgh.depends_on(dep_events);
@@ -2464,7 +2234,6 @@ namespace dpct
  to_surface[get_offset(id, to_slice, to_range.get(0))] =
  from_surface[get_offset(id, from_slice, from_range.get(0))];
  }); }));
- #endif
  break;
  default:
  throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2655,9 +2424,6 @@ namespace dpct
  void *c[], library_data_t c_type, int ldc,
  int batch_size, library_data_t scaling_type)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- throw std::runtime_error("this API is unsupported when USM level is none");
- #else
  if (scaling_type == library_data_t::real_float &&
  c_type == library_data_t::complex_float)
  {
@@ -2792,7 +2558,6 @@ namespace dpct
  default:
  throw std::runtime_error("the combination of data type is unsupported");
  }
- #endif
  }
 
  /// Computes a batch of matrix-matrix product with general matrices.
@@ -3131,24 +2896,9 @@ namespace dpct
  template <size_t D = Dimension>
  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
  init();
- #ifdef DPCT_USM_LEVEL_NONE
- return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
- _device_ptr)
- .template get_access<sycl::access_mode::read_write>()[index];
- #else
  return _device_ptr[index];
- #endif // DPCT_USM_LEVEL_NONE
  }
 
- #ifdef DPCT_USM_LEVEL_NONE
- /// Get sycl::accessor for the device memory object when usm is not used.
- accessor_t get_access(sycl::handler &cgh) {
- return get_buffer(_device_ptr)
- .template reinterpret<T, Dimension>(_range)
- .template get_access<detail::memory_traits<Memory, T>::mode,
- detail::memory_traits<Memory, T>::target>(cgh);
- }
- #else
  /// Get dpct::accessor with dimension info for the device memory object
  /// when usm is used and dimension is greater than 1.
  template <size_t D = Dimension>
@@ -3156,7 +2906,6 @@ namespace dpct
  get_access(sycl::handler &cgh) {
  return dpct_accessor_t((T *)_device_ptr, _range);
  }
- #endif // DPCT_USM_LEVEL_NONE
 
  private:
  device_memory(value_t *memory_ptr, size_t size)
@@ -3201,15 +2950,6 @@ namespace dpct
 
  /// Default constructor
  device_memory() : base(1) {}
-
- #ifdef DPCT_USM_LEVEL_NONE
- /// Get sycl::accessor for the device memory object when usm is not used.
- accessor_t get_access(sycl::handler &cgh) {
- auto buf = get_buffer(base::get_ptr())
- .template reinterpret<T, 1>(sycl::range<1>(1));
- return accessor_t(buf, cgh);
- }
- #endif // DPCT_USM_LEVEL_NONE
  };
  } // namespace detail
 
@@ -3228,7 +2968,7 @@ namespace dpct
  #include "ggml-common.h"
 
  static int g_ggml_sycl_debug=0;
- #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+ #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
 
  #define CHECK_TRY_ERROR(expr) \
  [&]() { \
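Note on the GGML_SYCL_DEBUG macro above: wrapping the body in do { ... } while (0) makes the macro expand to exactly one statement, so it stays safe inside an unbraced if/else, and routing output to stderr keeps diagnostics from interleaving with model output on stdout. A minimal standalone sketch of the same pattern (hypothetical names, not part of the package):

    #include <cstdio>
    #include <cstdlib>

    static int g_debug = 0; // toggled from an env var, as GGML_SYCL_DEBUG is

    // do { ... } while (0) turns the multi-token body into a single statement
    #define DEBUG_LOG(...) do { if (g_debug) std::fprintf(stderr, __VA_ARGS__); } while (0)

    int main() {
        const char *env = std::getenv("GGML_SYCL_DEBUG");
        g_debug = env ? std::atoi(env) : 0;
        if (g_debug)
            DEBUG_LOG("debug logging enabled\n"); // safe without braces
        return 0;
    }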
@@ -8339,7 +8079,7 @@ template <bool need_check> static void
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
  const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+ const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);
 
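The nullptr defaults on the two trailing pointers let the mul_mat_vec_*_q8_1_sycl launchers further down drop arguments their vec-dot callbacks never read. A minimal standalone sketch of the idiom (hypothetical names, not the package's API):

    #include <cstdio>

    // trailing parameters default to nullptr, so call sites that never used
    // them can omit the arguments while the old call form still compiles
    template <int block_size>
    static void mat_vec_kernel(const float *x, int ncols,
                               const unsigned *aux_grid = nullptr) {
        (void)x;
        std::printf("ncols=%d aux=%p\n", ncols, (const void *)aux_grid);
    }

    int main() {
        float x[8] = {};
        mat_vec_kernel<8>(x, 8);          // new call style: aux omitted
        mat_vec_kernel<8>(x, 8, nullptr); // old call style remains valid
        return 0;
    }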
@@ -10216,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq2xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10245,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq2xs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10274,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq3xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10303,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq3s_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10332,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq1s_grid_gpu.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10675,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10688,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10704,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10717,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10733,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10746,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10762,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10775,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10791,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10804,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10820,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10833,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10849,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10862,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10878,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10891,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10907,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10920,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10936,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10949,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
  }
 
+
  static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
  float *dst, const int ncols,
  const int nrows,
@@ -10965,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq2xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10996,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq2xs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11024,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11052,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3s_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11080,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq1s_grid_gpu.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -13128,6 +12792,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
  }
 
  void ggml_backend_sycl_print_sycl_devices() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
  int device_count = dpct::dev_mgr::instance().device_count();
  std::map<std::string, size_t> DeviceNums;
  fprintf(stderr, "found %d SYCL devices:\n", device_count);
@@ -13181,11 +12846,13 @@ int get_work_group_size(int user_device_id) {
  return prop.get_max_work_group_size();
  }
 
- void ggml_init_sycl() try {
+ static void ggml_init_sycl() try {
  static bool initialized = false;
 
  if (!initialized) {
+ fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
  fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 
  #if defined(GGML_SYCL_F16)
@@ -15246,6 +14913,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
 
+ bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+ main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
  SYCL_CHECK(
  CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
 
@@ -15276,24 +14946,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
 
  dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
  dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+ if (no_mixed_dtypes) {
+ cu_compute_type = dpct::library_data_t::real_half;
+ cu_data_type = dpct::library_data_t::real_half;
+ }
 
  // dst strides
  size_t nbd2 = dst->nb[2];
  size_t nbd3 = dst->nb[3];
 
+ const float alpha_f32 = 1.0f;
+ const float beta_f32 = 0.0f;
+
  const sycl::half alpha_f16 = 1.0f;
  const sycl::half beta_f16 = 0.0f;
 
- const float alpha_f32 = 1.0f;
- const float beta_f32 = 0.0f;
-
  const void * alpha = &alpha_f32;
  const void * beta = &beta_f32;
+ if (no_mixed_dtypes) {
+ alpha = &alpha_f16;
+ beta = &beta_f16;
+ }
 
  // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
- // oneMKL open source supports half, half, float, float: datatypes
+ // when oneMKL open source supports half, half, float, float: datatypes
 
  dst_t = (char *) dst_ddf;
+ if (no_mixed_dtypes) {
+ dst_t = (char *) dst_f16.alloc(ne_dst);
+
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
+ nbd3 /= sizeof(float) / sizeof(sycl::half);
+ }
 
  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
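For the half-precision path introduced above: when no_mixed_dtypes holds (CUDA or HIP backend), the batched GEMM also produces an f16 dst, so byte strides computed for an f32 tensor are divided by sizeof(float)/sizeof(sycl::half) = 2, and the result is converted back to f32 afterwards. A standalone sketch of the stride rescaling, assuming a 4-byte float and a 2-byte half stand-in:

    #include <cstdio>
    #include <cstddef>
    #include <cstdint>

    int main() {
        using half_t = std::uint16_t;           // stand-in for sycl::half (2 bytes)
        std::size_t nbd2 = 64 * sizeof(float);  // dim-2 byte stride of an f32 dst
        // same element count, half-sized elements -> byte stride halves
        nbd2 /= sizeof(float) / sizeof(half_t);
        std::printf("f16 dst stride: %zu bytes\n", nbd2);
        return 0;
    }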
@@ -15379,6 +15063,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  }
  #endif
 
+ if (no_mixed_dtypes) {
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+ }
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -16278,6 +15966,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
 
  GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
  for(int i=0;i<max_len;i++) id_list[i] = -1;
 
  if (!g_sycl_gpu_mgr) {
@@ -16312,6 +16001,7 @@ catch (sycl::exception const &exc) {
 
  GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
  size_t description_size) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
  dpct::device_info prop;
  int device_id = g_sycl_gpu_mgr->gpus[device];
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16326,6 +16016,7 @@ catch (sycl::exception const &exc) {
 
  GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
  size_t *total) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
  ggml_sycl_set_device(device);
 
  /*
@@ -16677,6 +16368,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
  };
 
  ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
  if (device_index>=g_device_count or device_index<0) {
  printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
  device_index, g_device_count-1);
@@ -17046,6 +16739,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
  };
 
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+ ggml_init_sycl();
  // FIXME: this is not thread safe
  static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -17117,6 +16812,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
  }
 
  ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
  static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
  /* .iface = */ {
  /* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
@@ -17231,7 +16927,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
  params.ith = 0;
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
  #ifndef NDEBUG
@@ -17379,6 +17075,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
  UNUSED(backend);
  }
 
+ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+ const int min_batch_size = 32;
+ return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ GGML_UNUSED(backend);
+ }
+
+
  static ggml_backend_i ggml_backend_sycl_interface = {
  /* .get_name = */ ggml_backend_sycl_name,
  /* .free = */ ggml_backend_sycl_free,
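The offload_op hook added above encodes a simple cost model: copying host-resident weights to the GPU only pays off once a node processes a reasonably large batch, and never for GGML_OP_GET_ROWS. A standalone sketch of the same threshold check (hypothetical stand-in types, not ggml's):

    #include <cstdio>

    enum op_kind { OP_MUL_MAT, OP_GET_ROWS };   // stand-in for ggml ops
    struct tensor { long ne[4]; op_kind op; };  // stand-in for ggml_tensor

    static bool offload_worthwhile(const tensor &t) {
        const int min_batch_size = 32; // below this, transfer cost dominates
        return t.ne[1] >= min_batch_size && t.op != OP_GET_ROWS;
    }

    int main() {
        tensor decode = {{4096, 1, 1, 1}, OP_MUL_MAT};   // single-token decode
        tensor prompt = {{4096, 512, 1, 1}, OP_MUL_MAT}; // batched prompt
        std::printf("decode: %d, prompt: %d\n",
                    offload_worthwhile(decode), offload_worthwhile(prompt));
        return 0;
    }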
@@ -17392,7 +17095,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
  /* .supports_op = */ ggml_backend_sycl_supports_op,
- /* .offload_op = */ NULL,
+ /* .offload_op = */ ggml_backend_sycl_offload_op,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
  /* .event_record = */ NULL,
@@ -17406,7 +17109,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
  }
 
  GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
- ggml_init_sycl(); // TODO: remove from ggml.c
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+ ggml_init_sycl();
 
  check_allow_gpu_index(device);
 
@@ -17432,6 +17136,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
  }
 
  GGML_CALL int ggml_backend_sycl_get_device_count() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
  if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
  return g_sycl_gpu_mgr->get_gpu_count();
  }
@@ -17444,16 +17149,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
  }
 
  GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
  return g_sycl_gpu_mgr->get_index(device_id);
  }
 
  GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
  return g_sycl_gpu_mgr->gpus[device_index];
  }
 
  GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
- GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+ ggml_init_sycl();
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
  fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+ GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
  if (g_sycl_gpu_mgr) {
  delete g_sycl_gpu_mgr;
  }
@@ -17464,6 +17174,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
  }
 
  GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+ ggml_init_sycl();
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
  if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
  return;
  }