llama_cpp 0.14.3 → 0.14.4

@@ -740,11 +740,7 @@ namespace dpct
 
  sycl::queue &default_queue()
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return out_of_order_queue();
- #else
  return in_order_queue();
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  void queues_wait_and_throw()
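
Note: this release drops the buffer-backed DPCT_USM_LEVEL_NONE fallback throughout the vendored dpct helpers, so the backend now always hands out in-order USM queues. A minimal sketch of the retained behavior, using standard SYCL 2020 only (illustrative, not code from the gem):

    #include <sycl/sycl.hpp>

    // Equivalent of the surviving in_order_queue() path: work submitted to
    // an in-order queue executes in submission order, so no buffer-based
    // dependency tracking is needed.
    sycl::queue make_in_order_queue() {
        return sycl::queue{sycl::default_selector_v,
                           sycl::property::queue::in_order{}};
    }
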
@@ -763,11 +759,7 @@ namespace dpct
 
  sycl::queue *create_queue(bool enable_exception_handler = false)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return create_out_of_order_queue(enable_exception_handler);
- #else
  return create_in_order_queue(enable_exception_handler);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  sycl::queue *create_queue(sycl::context context, sycl::device device,
@@ -1075,11 +1067,6 @@ namespace dpct
  static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
  const void *ptr)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return mem_mgr::instance().is_device_ptr(ptr)
- ? pointer_access_attribute::device_only
- : pointer_access_attribute::host_only;
- #else
  switch (sycl::get_pointer_type(ptr, q.get_context()))
  {
  case sycl::usm::alloc::unknown:
@@ -1090,7 +1077,6 @@ namespace dpct
  case sycl::usm::alloc::host:
  return pointer_access_attribute::host_device;
  }
- #endif
  }
 
  template <typename ArgT>
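
With the buffer bookkeeping gone, pointer classification relies solely on the USM query kept above. A self-contained illustration of those semantics (hypothetical helper, assuming SYCL 2020):

    #include <sycl/sycl.hpp>

    // sycl::usm::alloc::unknown means the pointer was not allocated through
    // USM for this context, i.e. it is an ordinary host pointer.
    bool host_accessible(sycl::queue &q, const void *ptr) {
        switch (sycl::get_pointer_type(ptr, q.get_context())) {
        case sycl::usm::alloc::device:  return false;  // device-only memory
        case sycl::usm::alloc::host:
        case sycl::usm::alloc::shared:  return true;   // host can touch it
        case sycl::usm::alloc::unknown:
        default:                        return true;   // plain host memory
        }
    }
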
@@ -1273,11 +1259,7 @@ namespace dpct
 
  static inline void *dpct_malloc(size_t size, sycl::queue &q)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
- #else
  return sycl::malloc_device(size, q.get_device(), q.get_context());
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
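
dpct_malloc is now a plain sycl::malloc_device call. The PITCH_DEFAULT_ALIGN macro in the trailing context rounds a pitch up to the next multiple of 32, e.g. (33 + 31) & ~0x1F == 64. A hedged sketch of the allocate/free pairing this diff standardizes on (illustrative names; the matching sycl::free appears in a later hunk):

    #include <sycl/sycl.hpp>

    void demo_device_alloc(sycl::queue &q) {
        static_assert(((33 + 31) & ~0x1F) == 64, "pitch rounds up to 32");
        // USM device allocation, as in the surviving dpct_malloc path
        void *buf = sycl::malloc_device(1024, q.get_device(), q.get_context());
        sycl::free(buf, q.get_context());  // as in the surviving dpct_free path
    }
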
@@ -1301,25 +1283,7 @@ namespace dpct
  static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
  valueT value, size_t size)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- assert(mm.is_device_ptr(dev_ptr));
- auto alloc = mm.translate_ptr(dev_ptr);
- size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
- return q.submit([&](sycl::handler &cgh)
- {
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- auto new_buffer = alloc.buffer.reinterpret<valueT>(
- sycl::range<1>(alloc.size / sizeof(valueT)));
- sycl::accessor<valueT, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(new_buffer, cgh, r, o);
- cgh.fill(acc, value); });
- #else
  return q.fill(dev_ptr, value, size);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  /**
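
Note on the memset hunk above: the twenty-line buffer/accessor fill collapses to a single queue::fill call. For reference, the retained SYCL 2020 shortcut works like this (sketch, not from the gem):

    #include <sycl/sycl.hpp>

    // queue::fill writes `count` copies of `pattern` starting at `ptr`,
    // so the number of bytes written is count * sizeof(pattern type).
    sycl::event zero_ints(sycl::queue &q, int *ptr, size_t count) {
        return q.fill(ptr, 0, count);
    }
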
@@ -1413,72 +1377,8 @@ namespace dpct
  {
  if (!size)
  return sycl::event{};
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
- switch (real_direction)
- {
- case host_to_host:
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
- case host_to_device:
- {
- auto alloc = mm.translate_ptr(to_ptr);
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(from_ptr, acc); });
- }
- case device_to_host:
- {
- auto alloc = mm.translate_ptr(from_ptr);
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(acc, to_ptr); });
- }
- case device_to_device:
- {
- auto to_alloc = mm.translate_ptr(to_ptr);
- auto from_alloc = mm.translate_ptr(from_ptr);
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh, r, to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh, r, from_o);
- cgh.copy(from_acc, to_acc); });
- }
- default:
- throw std::runtime_error("dpct_memcpy: invalid direction value");
- }
- #else
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
  GGML_UNUSED(direction);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  // Get actual copy range and make sure it will not exceed range.
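
Likewise the four-way buffer copy is replaced by queue::memcpy, which takes the dependency list directly; the now-unused direction argument is kept only for the callers' sake. A standalone sketch of that overload (assumed SYCL 2020, illustrative names):

    #include <sycl/sycl.hpp>
    #include <vector>

    // The returned event lets callers chain further work, mirroring how
    // dpct_memcpy now simply forwards dep_events to the runtime.
    sycl::event copy_after(sycl::queue &q, void *dst, const void *src,
                           size_t bytes, const std::vector<sycl::event> &deps) {
        return q.memcpy(dst, src, bytes, deps);
    }
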
@@ -1618,45 +1518,15 @@ namespace dpct
  break;
  }
  case device_to_device:
- #ifdef DPCT_USM_LEVEL_NONE
- {
- auto &mm = mem_mgr::instance();
- auto to_alloc = mm.translate_ptr(to_surface);
- auto from_alloc = mm.translate_ptr(from_surface);
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh,
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh,
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
- size,
- [=](sycl::id<3> id) {
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
- from_acc[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- }
- #else
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.parallel_for<class dpct_memcpy_3d_detail>(
- size,
- [=](sycl::id<3> id) {
- to_surface[get_offset(id, to_slice, to_range.get(0))] =
- from_surface[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- #endif
- break;
+ event_list.push_back(q.submit([&](sycl::handler &cgh){
+ cgh.depends_on(dep_events);
+ cgh.parallel_for<class dpct_memcpy_3d_detail>(
+ size,
+ [=](sycl::id<3> id) {
+ to_surface[get_offset(id, to_slice, to_range.get(0))] =
+ from_surface[get_offset(id, from_slice, from_range.get(0))];
+ }); }));
+ break;
  default:
  throw std::runtime_error("dpct_memcpy: invalid direction value");
  }
@@ -1754,11 +1624,7 @@ namespace dpct
  {
  if (ptr)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- detail::mem_mgr::instance().mem_free(ptr);
- #else
  sycl::free(ptr, q.get_context());
- #endif // DPCT_USM_LEVEL_NONE
  }
  }
 
@@ -1766,11 +1632,7 @@ namespace dpct
  inline auto get_memory(const void *x)
  {
  T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
- #ifdef DPCT_USM_LEVEL_NONE
- return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
- #else
  return new_x;
- #endif
  }
 
  template <typename T>
@@ -2222,72 +2084,8 @@ namespace dpct
  {
  if (!size)
  return sycl::event{};
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
- switch (real_direction)
- {
- case host_to_host:
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
- case host_to_device:
- {
- auto alloc = mm.translate_ptr(to_ptr);
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(from_ptr, acc); });
- }
- case device_to_host:
- {
- auto alloc = mm.translate_ptr(from_ptr);
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(acc, to_ptr); });
- }
- case device_to_device:
- {
- auto to_alloc = mm.translate_ptr(to_ptr);
- auto from_alloc = mm.translate_ptr(from_ptr);
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh, r, to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh, r, from_o);
- cgh.copy(from_acc, to_acc); });
- }
- default:
- throw std::runtime_error("dpct_memcpy: invalid direction value");
- }
- #else
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
  GGML_UNUSED(direction);
- #endif // DPCT_USM_LEVEL_NONE
  }
 
  // Get actual copy range and make sure it will not exceed range.
@@ -2427,34 +2225,6 @@ namespace dpct
  break;
  }
  case device_to_device:
- #ifdef DPCT_USM_LEVEL_NONE
- {
- auto &mm = mem_mgr::instance();
- auto to_alloc = mm.translate_ptr(to_surface);
- auto from_alloc = mm.translate_ptr(from_surface);
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh,
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh,
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
- size,
- [=](sycl::id<3> id) {
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
- from_acc[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- }
- #else
  event_list.push_back(q.submit([&](sycl::handler &cgh)
  {
  cgh.depends_on(dep_events);
@@ -2464,7 +2234,6 @@ namespace dpct
  to_surface[get_offset(id, to_slice, to_range.get(0))] =
  from_surface[get_offset(id, from_slice, from_range.get(0))];
  }); }));
- #endif
  break;
  default:
  throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2655,9 +2424,6 @@ namespace dpct
  void *c[], library_data_t c_type, int ldc,
  int batch_size, library_data_t scaling_type)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- throw std::runtime_error("this API is unsupported when USM level is none");
- #else
  if (scaling_type == library_data_t::real_float &&
  c_type == library_data_t::complex_float)
  {
@@ -2792,7 +2558,6 @@ namespace dpct
  default:
  throw std::runtime_error("the combination of data type is unsupported");
  }
- #endif
  }
 
  /// Computes a batch of matrix-matrix product with general matrices.
@@ -3131,24 +2896,9 @@ namespace dpct
  template <size_t D = Dimension>
  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
  init();
- #ifdef DPCT_USM_LEVEL_NONE
- return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
- _device_ptr)
- .template get_access<sycl::access_mode::read_write>()[index];
- #else
  return _device_ptr[index];
- #endif // DPCT_USM_LEVEL_NONE
  }
 
- #ifdef DPCT_USM_LEVEL_NONE
- /// Get sycl::accessor for the device memory object when usm is not used.
- accessor_t get_access(sycl::handler &cgh) {
- return get_buffer(_device_ptr)
- .template reinterpret<T, Dimension>(_range)
- .template get_access<detail::memory_traits<Memory, T>::mode,
- detail::memory_traits<Memory, T>::target>(cgh);
- }
- #else
  /// Get dpct::accessor with dimension info for the device memory object
  /// when usm is used and dimension is greater than 1.
  template <size_t D = Dimension>
@@ -3156,7 +2906,6 @@ namespace dpct
  get_access(sycl::handler &cgh) {
  return dpct_accessor_t((T *)_device_ptr, _range);
  }
- #endif // DPCT_USM_LEVEL_NONE
 
  private:
  device_memory(value_t *memory_ptr, size_t size)
@@ -3201,15 +2950,6 @@ namespace dpct
 
  /// Default constructor
  device_memory() : base(1) {}
-
- #ifdef DPCT_USM_LEVEL_NONE
- /// Get sycl::accessor for the device memory object when usm is not used.
- accessor_t get_access(sycl::handler &cgh) {
- auto buf = get_buffer(base::get_ptr())
- .template reinterpret<T, 1>(sycl::range<1>(1));
- return accessor_t(buf, cgh);
- }
- #endif // DPCT_USM_LEVEL_NONE
  };
  } // namespace detail
 
@@ -3228,7 +2968,7 @@ namespace dpct
  #include "ggml-common.h"
 
  static int g_ggml_sycl_debug=0;
- #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+ #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
 
  #define CHECK_TRY_ERROR(expr) \
  [&]() { \
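
Routing the debug macro to stderr keeps traces out of stdout, where generated tokens are written. Usage is unchanged and stays gated by the GGML_SYCL_DEBUG environment variable read in ggml_init_sycl later in this diff, e.g.:

    // enable at runtime with: GGML_SYCL_DEBUG=1 ./main ...
    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
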
@@ -8339,7 +8079,7 @@ template <bool need_check> static void
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
  const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+ const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);
 
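
Defaulting the two lookup-table pointers to nullptr lets the non-iq call sites later in this diff drop those trailing arguments; only the iq-series kernels, whose vec_dot implementations actually read the tables, keep passing them. The pattern in miniature (hypothetical signature, for illustration):

    // Callers that never dereference the tables may simply omit them.
    static void kernel(const float *x, const float *y, float *dst, int n,
                       const uint32_t *grid = nullptr,
                       const uint64_t *signs = nullptr);

    // kernel(x, y, dst, n);        // q4_0 / q8_0-style call, tables unused
    // kernel(x, y, dst, n, g, s);  // iq2_xs / iq3_xxs-style call
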
@@ -10216,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq2xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10245,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq2xs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10274,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq3xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10303,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq3s_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10332,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq1s_grid_gpu.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
 
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
@@ -10675,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10688,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10704,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10717,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10733,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10746,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10762,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10775,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10791,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10804,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10820,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10833,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10849,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10862,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10878,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10891,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10907,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10920,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10936,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10949,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
  }
 
+
  static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
  float *dst, const int ncols,
  const int nrows,
@@ -10965,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq2xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10996,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq2xs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11024,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11052,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3s_grid.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11080,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq1s_grid_gpu.init(*stream);
- ksigns64.init(*stream);
 
  stream->submit([&](sycl::handler &cgh) {
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
 
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -13128,6 +12792,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
  }
 
  void ggml_backend_sycl_print_sycl_devices() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
  int device_count = dpct::dev_mgr::instance().device_count();
  std::map<std::string, size_t> DeviceNums;
  fprintf(stderr, "found %d SYCL devices:\n", device_count);
@@ -13181,11 +12846,13 @@ int get_work_group_size(int user_device_id) {
  return prop.get_max_work_group_size();
  }
 
- void ggml_init_sycl() try {
+ static void ggml_init_sycl() try {
  static bool initialized = false;
 
  if (!initialized) {
+ fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
  fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 
  #if defined(GGML_SYCL_F16)
@@ -15246,6 +14913,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
 
+ bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+ main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
  SYCL_CHECK(
  CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
 
@@ -15276,24 +14946,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
 
  dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
  dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+ if (no_mixed_dtypes) {
+ cu_compute_type = dpct::library_data_t::real_half;
+ cu_data_type = dpct::library_data_t::real_half;
+ }
 
  // dst strides
  size_t nbd2 = dst->nb[2];
  size_t nbd3 = dst->nb[3];
 
+ const float alpha_f32 = 1.0f;
+ const float beta_f32 = 0.0f;
+
  const sycl::half alpha_f16 = 1.0f;
  const sycl::half beta_f16 = 0.0f;
 
- const float alpha_f32 = 1.0f;
- const float beta_f32 = 0.0f;
-
  const void * alpha = &alpha_f32;
  const void * beta = &beta_f32;
+ if (no_mixed_dtypes) {
+ alpha = &alpha_f16;
+ beta = &beta_f16;
+ }
 
  // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
- // oneMKL open source supports half, half, float, float: datatypes
+ // when oneMKL open source supports half, half, float, float: datatypes
 
  dst_t = (char *) dst_ddf;
+ if (no_mixed_dtypes) {
+ dst_t = (char *) dst_f16.alloc(ne_dst);
+
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
+ nbd3 /= sizeof(float) / sizeof(sycl::half);
+ }
 
  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
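
On CUDA and HIP backends the oneMKL-based batched gemm cannot mix half inputs with float output, so under no_mixed_dtypes both the compute and data types switch to half, the result lands in a temporary dst_f16 buffer, and the byte strides are rescaled: sizeof(float) / sizeof(sycl::half) == 2, halving nbd2 and nbd3. A compact check of that arithmetic (illustrative values only):

    #include <sycl/sycl.hpp>
    #include <cassert>

    void stride_demo() {
        size_t nbd2 = 4096 * sizeof(float);         // fp32 plane stride in bytes
        nbd2 /= sizeof(float) / sizeof(sycl::half); // 4 / 2 == 2
        assert(nbd2 == 4096 * sizeof(sycl::half));  // matching fp16 stride
    }

The half-precision result is converted back to fp32 in the follow-up hunk below, after the gemm calls.
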
@@ -15379,6 +15063,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  }
  #endif
 
+ if (no_mixed_dtypes) {
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+ }
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -16278,6 +15966,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
 
  GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
  for(int i=0;i<max_len;i++) id_list[i] = -1;
 
  if (!g_sycl_gpu_mgr) {
@@ -16312,6 +16001,7 @@ catch (sycl::exception const &exc) {
 
  GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
  size_t description_size) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
  dpct::device_info prop;
  int device_id = g_sycl_gpu_mgr->gpus[device];
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16326,6 +16016,7 @@ catch (sycl::exception const &exc) {
 
  GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
  size_t *total) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
  ggml_sycl_set_device(device);
 
  /*
@@ -16677,6 +16368,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
  };
 
  ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
  if (device_index>=g_device_count or device_index<0) {
  printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
  device_index, g_device_count-1);
@@ -17046,6 +16739,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
  };
 
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+ ggml_init_sycl();
  // FIXME: this is not thread safe
  static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -17117,6 +16812,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
  }
 
  ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
  static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
  /* .iface = */ {
  /* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
@@ -17231,7 +16927,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
  params.ith = 0;
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
  #ifndef NDEBUG
@@ -17379,6 +17075,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
  UNUSED(backend);
  }
 
+ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+ const int min_batch_size = 32;
+ return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ GGML_UNUSED(backend);
+ }
+
+
  static ggml_backend_i ggml_backend_sycl_interface = {
  /* .get_name = */ ggml_backend_sycl_name,
  /* .free = */ ggml_backend_sycl_free,
@@ -17392,7 +17095,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
  /* .supports_op = */ ggml_backend_sycl_supports_op,
- /* .offload_op = */ NULL,
+ /* .offload_op = */ ggml_backend_sycl_offload_op,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
  /* .event_record = */ NULL,
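
The offload_op hook added and registered above tells the ggml graph scheduler that ops whose batch dimension ne[1] reaches 32 rows may be worth running on the SYCL device even when their tensors do not live in a SYCL buffer, with GGML_OP_GET_ROWS always excluded. A standalone restatement of the predicate (toy types, for illustration only):

    #include <cstdint>

    struct toy_tensor { int64_t ne[4]; int op; };
    enum { TOY_OP_GET_ROWS = 1 };  // stand-in for ggml's op enum value

    // Same policy as ggml_backend_sycl_offload_op: small batches stay put,
    // and row gathers are never offloaded.
    static bool toy_offload(const toy_tensor *t) {
        const int min_batch_size = 32;
        return t->ne[1] >= min_batch_size && t->op != TOY_OP_GET_ROWS;
    }
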
@@ -17406,7 +17109,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
  }
 
  GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
- ggml_init_sycl(); // TODO: remove from ggml.c
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+ ggml_init_sycl();
 
  check_allow_gpu_index(device);
 
@@ -17432,6 +17136,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
  }
 
  GGML_CALL int ggml_backend_sycl_get_device_count() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
  if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
  return g_sycl_gpu_mgr->get_gpu_count();
  }
@@ -17444,16 +17149,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
  }
 
  GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
  return g_sycl_gpu_mgr->get_index(device_id);
  }
 
  GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
  return g_sycl_gpu_mgr->gpus[device_index];
  }
 
  GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
- GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+ ggml_init_sycl();
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
  fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+ GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
  if (g_sycl_gpu_mgr) {
  delete g_sycl_gpu_mgr;
  }
@@ -17464,6 +17174,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(
  }
 
  GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+ ggml_init_sycl();
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
  if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
  return;
  }