llama_cpp 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -740,11 +740,7 @@ namespace dpct

  sycl::queue &default_queue()
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return out_of_order_queue();
- #else
  return in_order_queue();
- #endif // DPCT_USM_LEVEL_NONE
  }

  void queues_wait_and_throw()
@@ -763,11 +759,7 @@ namespace dpct

  sycl::queue *create_queue(bool enable_exception_handler = false)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return create_out_of_order_queue(enable_exception_handler);
- #else
  return create_in_order_queue(enable_exception_handler);
- #endif // DPCT_USM_LEVEL_NONE
  }

  sycl::queue *create_queue(sycl::context context, sycl::device device,
@@ -1075,11 +1067,6 @@ namespace dpct
  static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
  const void *ptr)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return mem_mgr::instance().is_device_ptr(ptr)
- ? pointer_access_attribute::device_only
- : pointer_access_attribute::host_only;
- #else
  switch (sycl::get_pointer_type(ptr, q.get_context()))
  {
  case sycl::usm::alloc::unknown:
@@ -1090,7 +1077,6 @@ namespace dpct
  case sycl::usm::alloc::host:
  return pointer_access_attribute::host_device;
  }
- #endif
  }

  template <typename ArgT>
@@ -1273,11 +1259,7 @@ namespace dpct

  static inline void *dpct_malloc(size_t size, sycl::queue &q)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
- #else
  return sycl::malloc_device(size, q.get_device(), q.get_context());
- #endif // DPCT_USM_LEVEL_NONE
  }

  #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
@@ -1301,25 +1283,7 @@ namespace dpct
  static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
  valueT value, size_t size)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- assert(mm.is_device_ptr(dev_ptr));
- auto alloc = mm.translate_ptr(dev_ptr);
- size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
- return q.submit([&](sycl::handler &cgh)
- {
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- auto new_buffer = alloc.buffer.reinterpret<valueT>(
- sycl::range<1>(alloc.size / sizeof(valueT)));
- sycl::accessor<valueT, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(new_buffer, cgh, r, o);
- cgh.fill(acc, value); });
- #else
  return q.fill(dev_ptr, value, size);
- #endif // DPCT_USM_LEVEL_NONE
  }

  /**
@@ -1413,72 +1377,8 @@ namespace dpct
  {
  if (!size)
  return sycl::event{};
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
- switch (real_direction)
- {
- case host_to_host:
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
- case host_to_device:
- {
- auto alloc = mm.translate_ptr(to_ptr);
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(from_ptr, acc); });
- }
- case device_to_host:
- {
- auto alloc = mm.translate_ptr(from_ptr);
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(acc, to_ptr); });
- }
- case device_to_device:
- {
- auto to_alloc = mm.translate_ptr(to_ptr);
- auto from_alloc = mm.translate_ptr(from_ptr);
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh, r, to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh, r, from_o);
- cgh.copy(from_acc, to_acc); });
- }
- default:
- throw std::runtime_error("dpct_memcpy: invalid direction value");
- }
- #else
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
  GGML_UNUSED(direction);
- #endif // DPCT_USM_LEVEL_NONE
  }

  // Get actual copy range and make sure it will not exceed range.
@@ -1618,45 +1518,15 @@ namespace dpct
  break;
  }
  case device_to_device:
- #ifdef DPCT_USM_LEVEL_NONE
- {
- auto &mm = mem_mgr::instance();
- auto to_alloc = mm.translate_ptr(to_surface);
- auto from_alloc = mm.translate_ptr(from_surface);
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh,
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh,
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
- size,
- [=](sycl::id<3> id) {
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
- from_acc[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- }
- #else
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.parallel_for<class dpct_memcpy_3d_detail>(
- size,
- [=](sycl::id<3> id) {
- to_surface[get_offset(id, to_slice, to_range.get(0))] =
- from_surface[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- #endif
- break;
+ event_list.push_back(q.submit([&](sycl::handler &cgh){
+ cgh.depends_on(dep_events);
+ cgh.parallel_for<class dpct_memcpy_3d_detail>(
+ size,
+ [=](sycl::id<3> id) {
+ to_surface[get_offset(id, to_slice, to_range.get(0))] =
+ from_surface[get_offset(id, from_slice, from_range.get(0))];
+ }); }));
+ break;
  default:
  throw std::runtime_error("dpct_memcpy: invalid direction value");
  }
@@ -1754,11 +1624,7 @@ namespace dpct
  {
  if (ptr)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- detail::mem_mgr::instance().mem_free(ptr);
- #else
  sycl::free(ptr, q.get_context());
- #endif // DPCT_USM_LEVEL_NONE
  }
  }

@@ -1766,11 +1632,7 @@ namespace dpct
  inline auto get_memory(const void *x)
  {
  T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
- #ifdef DPCT_USM_LEVEL_NONE
- return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
- #else
  return new_x;
- #endif
  }

  template <typename T>
@@ -1802,24 +1664,6 @@ namespace dpct
  const void *alpha, const void *a, int lda, const void *b,
  int ldb, const void *beta, void *c, int ldc)
  {
- #ifndef __INTEL_MKL__
- GGML_UNUSED(q);
- GGML_UNUSED(a_trans);
- GGML_UNUSED(b_trans);
- GGML_UNUSED(m);
- GGML_UNUSED(n);
- GGML_UNUSED(k);
- GGML_UNUSED(alpha);
- GGML_UNUSED(a);
- GGML_UNUSED(lda);
- GGML_UNUSED(b);
- GGML_UNUSED(ldb);
- GGML_UNUSED(beta);
- GGML_UNUSED(c);
- GGML_UNUSED(ldc);
- throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
- "Project does not support this API.");
- #else
  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
  auto data_a = get_memory<const Ta>(a);
@@ -1828,7 +1672,6 @@ namespace dpct
  oneapi::mkl::blas::column_major::gemm(
  q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
  data_b, ldb, beta_value, data_c, ldc);
- #endif
  }

  template <typename VecT, class BinaryOperation, class = void>
@@ -2222,72 +2065,8 @@ namespace dpct
  {
  if (!size)
  return sycl::event{};
- #ifdef DPCT_USM_LEVEL_NONE
- auto &mm = mem_mgr::instance();
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
- switch (real_direction)
- {
- case host_to_host:
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
- case host_to_device:
- {
- auto alloc = mm.translate_ptr(to_ptr);
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(from_ptr, acc); });
- }
- case device_to_host:
- {
- auto alloc = mm.translate_ptr(from_ptr);
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto o = sycl::id<1>(offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- acc(alloc.buffer, cgh, r, o);
- cgh.copy(acc, to_ptr); });
- }
- case device_to_device:
- {
- auto to_alloc = mm.translate_ptr(to_ptr);
- auto from_alloc = mm.translate_ptr(from_ptr);
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
- return q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto r = sycl::range<1>(size);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh, r, to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh, r, from_o);
- cgh.copy(from_acc, to_acc); });
- }
- default:
- throw std::runtime_error("dpct_memcpy: invalid direction value");
- }
- #else
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
  GGML_UNUSED(direction);
- #endif // DPCT_USM_LEVEL_NONE
  }

  // Get actual copy range and make sure it will not exceed range.
@@ -2427,34 +2206,6 @@ namespace dpct
  break;
  }
  case device_to_device:
- #ifdef DPCT_USM_LEVEL_NONE
- {
- auto &mm = mem_mgr::instance();
- auto to_alloc = mm.translate_ptr(to_surface);
- auto from_alloc = mm.translate_ptr(from_surface);
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
- event_list.push_back(q.submit([&](sycl::handler &cgh)
- {
- cgh.depends_on(dep_events);
- auto to_o = sycl::id<1>(to_offset);
- auto from_o = sycl::id<1>(from_offset);
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
- sycl::access::target::device>
- to_acc(to_alloc.buffer, cgh,
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
- sycl::access::target::device>
- from_acc(from_alloc.buffer, cgh,
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
- size,
- [=](sycl::id<3> id) {
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
- from_acc[get_offset(id, from_slice, from_range.get(0))];
- }); }));
- }
- #else
  event_list.push_back(q.submit([&](sycl::handler &cgh)
  {
  cgh.depends_on(dep_events);
@@ -2464,7 +2215,6 @@ namespace dpct
  to_surface[get_offset(id, to_slice, to_range.get(0))] =
  from_surface[get_offset(id, from_slice, from_range.get(0))];
  }); }));
- #endif
  break;
  default:
  throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2561,6 +2311,7 @@ namespace dpct
  lda, b, ldb, beta, c, ldc);
  break;
  }
+ #ifdef __INTEL_MKL__
  case detail::get_type_combination_id(
  library_data_t::real_bfloat16, library_data_t::real_bfloat16,
  library_data_t::real_float, library_data_t::real_float):
@@ -2622,6 +2373,7 @@ namespace dpct
  q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
  break;
  }
+ #endif // __INTEL_MKL__
  default:
  throw std::runtime_error("the combination of data type is unsupported");
  }
@@ -2655,9 +2407,6 @@ namespace dpct
  void *c[], library_data_t c_type, int ldc,
  int batch_size, library_data_t scaling_type)
  {
- #ifdef DPCT_USM_LEVEL_NONE
- throw std::runtime_error("this API is unsupported when USM level is none");
- #else
  if (scaling_type == library_data_t::real_float &&
  c_type == library_data_t::complex_float)
  {
@@ -2792,7 +2541,6 @@ namespace dpct
  default:
  throw std::runtime_error("the combination of data type is unsupported");
  }
- #endif
  }

  /// Computes a batch of matrix-matrix product with general matrices.
@@ -3131,24 +2879,9 @@ namespace dpct
  template <size_t D = Dimension>
  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
  init();
- #ifdef DPCT_USM_LEVEL_NONE
- return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
- _device_ptr)
- .template get_access<sycl::access_mode::read_write>()[index];
- #else
  return _device_ptr[index];
- #endif // DPCT_USM_LEVEL_NONE
  }

- #ifdef DPCT_USM_LEVEL_NONE
- /// Get sycl::accessor for the device memory object when usm is not used.
- accessor_t get_access(sycl::handler &cgh) {
- return get_buffer(_device_ptr)
- .template reinterpret<T, Dimension>(_range)
- .template get_access<detail::memory_traits<Memory, T>::mode,
- detail::memory_traits<Memory, T>::target>(cgh);
- }
- #else
  /// Get dpct::accessor with dimension info for the device memory object
  /// when usm is used and dimension is greater than 1.
  template <size_t D = Dimension>
@@ -3156,7 +2889,6 @@ namespace dpct
  get_access(sycl::handler &cgh) {
  return dpct_accessor_t((T *)_device_ptr, _range);
  }
- #endif // DPCT_USM_LEVEL_NONE

  private:
  device_memory(value_t *memory_ptr, size_t size)
@@ -3201,15 +2933,6 @@ namespace dpct

  /// Default constructor
  device_memory() : base(1) {}
-
- #ifdef DPCT_USM_LEVEL_NONE
- /// Get sycl::accessor for the device memory object when usm is not used.
- accessor_t get_access(sycl::handler &cgh) {
- auto buf = get_buffer(base::get_ptr())
- .template reinterpret<T, 1>(sycl::range<1>(1));
- return accessor_t(buf, cgh);
- }
- #endif // DPCT_USM_LEVEL_NONE
  };
  } // namespace detail

@@ -3228,7 +2951,7 @@ namespace dpct
  #include "ggml-common.h"

  static int g_ggml_sycl_debug=0;
- #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+ #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)

  #define CHECK_TRY_ERROR(expr) \
  [&]() { \
@@ -3315,6 +3038,10 @@ typedef float dfloat; // dequantize float
  typedef sycl::float2 dfloat2;
  #endif //GGML_SYCL_F16

+ #define MMVQ_MAX_BATCH_SIZE 8
+
+ static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
  bool ggml_sycl_loaded(void);
  void * ggml_sycl_host_malloc(size_t size);
  void ggml_sycl_host_free(void * ptr);
@@ -4750,6 +4477,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest

  }

+ template <typename dst_t>
+ __dpct_inline__ static void
+ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+ const sycl::nd_item<3> &item_ct1) {
+
+ const int i = item_ct1.get_group(2);
+ const block_iq2_s * x = (const block_iq2_s *) vx;
+
+ const int tid = item_ct1.get_local_id(2);
+ #if QK_K == 256
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
+ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
+ #pragma unroll
+ for (int j = 0; j < 8; ++j)
+ y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+ #else
+ assert(false);
+
+ #endif
+
+ }
+
  template<typename dst_t>
  static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
  const sycl::nd_item<3> &item_ct1,
@@ -4782,26 +4535,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res

  }

- template<typename dst_t>
- static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
- const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq3s_grid,
- const uint8_t *ksigns_iq2xs,
- const uint8_t *kmask_iq2xs) {
+ template <typename dst_t>
+ __dpct_inline__ static void
+ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+ const sycl::nd_item<3> &item_ct1,
+ const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {

  const int i = item_ct1.get_group(2);
- const block_iq3_s * x = (const block_iq3_s *) vx;
+ const block_iq3_s * x = (const block_iq3_s *) vx;

  const int tid = item_ct1.get_local_id(2);
  #if QK_K == 256
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
- const uint8_t * qs = x[i].qs + 8*ib;
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]);
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]);
+ const uint8_t * qs = x[i].qs + 8*ib;
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
  const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
  const uint8_t signs = x[i].signs[4*ib + il];
+ #pragma unroll
  for (int j = 0; j < 4; ++j) {
  y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
  y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
@@ -4812,12 +4565,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr

  }

- template<typename dst_t>
- static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
- const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq1s_grid,
- const uint8_t *ksigns_iq2xs,
- const uint8_t *kmask_iq2xs) {
+ template <typename dst_t>
+ __dpct_inline__ static void
+ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
+ const sycl::nd_item<3> &item_ct1,
+ const uint32_t *iq1s_grid_gpu) {
+
  const int i = item_ct1.get_group(2);
  const block_iq1_s * x = (const block_iq1_s *) vx;

@@ -4826,14 +4579,49 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
  const int il = tid/8; // 0...3
  const int ib = tid%8; // 0...7
  dst_t * y = yy + i*QK_K + 32*ib + 8*il;
- const uint8_t * qs = x[i].qs + 8*ib;
- const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
- const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
- const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
- const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
- for (int j = 0; j < 4; ++j) {
- y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
- y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+ const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+ const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+ grid32[0] &= 0x0f0f0f0f;
+ #pragma unroll
+ for (int j = 0; j < 8; ++j) {
+ y[j] = d * (q[j] + delta);
+ }
+ #else
+ assert(false);
+ #endif
+
+ }
+
+ template <typename dst_t>
+ __dpct_inline__ static void
+ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
+ const sycl::nd_item<3> &item_ct1,
+ const uint32_t *iq1s_grid_gpu) {
+
+ const int i = item_ct1.get_group(2);
+ const block_iq1_m * x = (const block_iq1_m *) vx;
+
+ const int tid = item_ct1.get_local_id(2);
+ #if QK_K == 256
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+ iq1m_scale_t scale;
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+ const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+ const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+ uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+ grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+ grid32[0] &= 0x0f0f0f0f;
+ #pragma unroll
+ for (int j = 0; j < 8; ++j) {
+ y[j] = d * (q[j] + delta);
  }
  #else
  assert(false);
@@ -4841,6 +4629,51 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr

  }

+ template <typename dst_t>
+ __dpct_inline__ static void
+ dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
+ const sycl::nd_item<3> &item_ct1) {
+
+ const int i = item_ct1.get_group(2);
+ const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+ const int tid = item_ct1.get_local_id(2);
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+ const uint8_t * q4 = x[ib].qs + 4*il;
+ const float d = (float)x[ib].d;
+ #pragma unroll
+ for (int j = 0; j < 4; ++j) {
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
+ }
+
+ }
+
+
+ template <typename dst_t>
+ __dpct_inline__ static void
+ dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
+ const sycl::nd_item<3> &item_ct1) {
+ const int i = item_ct1.get_group(2);
+ const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+ const int tid = item_ct1.get_local_id(2);
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+ const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
+ const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+ #pragma unroll
+ for (int j = 0; j < 4; ++j) {
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
+ }
+ }
+
+
+
  /*
  DPCT1110:4: The total declared local variable size in device function
  dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -7647,6 +7480,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
  #endif
  }

+ static __dpct_inline__ float
+ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+ #if QK_K == 256
+ const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+ const int ib32 = iqs;
+ const int8_t * q8 = bq8_1[ib32].qs;
+ const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+ const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+ const uint8_t ls2 = bq2->scales[ib32] >> 4;
+ int sumi1 = 0;
+ for (int l = 0; l < 2; ++l) {
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+ const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+ ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
+ std::equal_to<>());
+ const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+ ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
+ std::equal_to<>());
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+ grid[0] ^ signs0, signs0, std::minus<>());
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+ grid[1] ^ signs1, signs1, std::minus<>());
+ sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+ sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+ q8 += 8;
+ }
+ int sumi2 = 0;
+ for (int l = 2; l < 4; ++l) {
+ const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+ const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+ ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
+ std::equal_to<>());
+ const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+ ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
+ std::equal_to<>());
+ const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+ grid[0] ^ signs0, signs0, std::minus<>());
+ const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+ grid[1] ^ signs1, signs1, std::minus<>());
+ sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+ sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+ q8 += 8;
+ }
+ const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
+ return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+ #else
+ assert(false);
+ #endif
+ }
+
  static __dpct_inline__ float
  vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
@@ -7689,10 +7574,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,

  static __dpct_inline__ float
  vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
- const block_q8_1 *__restrict__ bq8_1, const int &iqs,
- const uint32_t *iq3s_grid, const uint64_t *ksigns64) {
- #if DPCT_COMPATIBILITY_TEMP >= \
- MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+ const uint32_t *iq3s_grid) {
  #if QK_K == 256
  const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

@@ -7704,9 +7587,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
  const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
  const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
  uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
- ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+ ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
+ 0x08040201, std::equal_to<>());
  uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
- ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+ ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
+ 0x08040201, std::equal_to<>());
  const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
  grid1[0] ^ signs0, signs0, std::minus<>());
  const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
@@ -7715,45 +7600,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
  sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
  q8 += 8;
  }
- const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0];
+ const float d =
+ (float)bq2->d *
+ (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
+ bq8_1[ib32].ds[0];
  return d * sumi;
  #else
  assert(false);
- return 0.f;
- #endif
- #else
- assert(false);
- return 0.f;
  #endif
  }

  static __dpct_inline__ float
  vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
- const block_q8_1 *__restrict__ bq8_1, const int &iqs,
- const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+ const uint32_t *iq1s_grid_gpu) {
  #if QK_K == 256
  const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

  const int ib32 = iqs;
- const uint8_t * qs = bq1->qs + 4*ib32;
- const int8_t * q8 = bq8_1[ib32].qs;
  int sumi = 0;
+ const int * q8 = (const int *)bq8_1[ib32].qs;
  for (int l = 0; l < 4; ++l) {
- const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
- const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
- const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
- grid[0] ^ signs[0], signs[0], std::minus<>());
- const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
- grid[1] ^ signs[1], signs[1], std::minus<>());
- sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
- sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
- q8 += 8;
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+ int grid0 = grid[0] & 0x0f0f0f0f;
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+ sumi = dpct::dp4a(q8[2 * l + 1], grid1,
+ dpct::dp4a(q8[2 * l + 0], grid0, sumi));
+ }
+
+ const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
+ const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
+ const float d = d1q * bq8_1[ib32].ds[0];
+ const float m = d1q * bq8_1[ib32].ds[1];
+ return d * sumi + m * delta;
+ #else
+ assert(false);
+ #endif
+ }
+
+ static __dpct_inline__ float
+ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+ #if QK_K == 256
+ const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
+
+ const int ib32 = iqs;
+ int sumi[2] = {0, 0};
+ float sumf[2] = {0.f, 0.f};
+
+ const int * q8 = (const int *)bq8_1[ib32].qs;
+ for (int l = 0; l < 4; ++l) {
+ const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
+ int grid0 = grid[0] & 0x0f0f0f0f;
+ int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+ sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
+ dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
+ const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
+ const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
+ dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
+ sumf[l/2] += delta*sumy;
+ }
+
+ iq1m_scale_t scale;
+ const uint16_t * sc = (const uint16_t *)bq1->scales;
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
+ return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
+ #else
+ assert(false);
+ #endif
+ }
+
+ static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
+ const uint8_t *values,
+ int &val1, int &val2) {
+
+ uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+ aux32 = q4 & 0x0f0f0f0f;
+ uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+ uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+ val1 = v1 | (v2 << 16);
+ aux32 = (q4 >> 4) & 0x0f0f0f0f;
+ v1 = values[q8[0]] | (values[q8[1]] << 8);
+ v2 = values[q8[2]] | (values[q8[3]] << 8);
+ val2 = v1 | (v2 << 16);
+ }
+
+
+ static __dpct_inline__ float
+ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+ const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+ const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+ const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
+
+ const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+ int v1, v2;
+ int sumi1 = 0, sumi2 = 0;
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+ const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+ get_int_from_table_16(aux, values, v1, v2);
+ sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
+ sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
  }
- const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
- return d * sumi;
+
+ const float d = (float)bq->d * bq8_1->ds[0];
+ return d * (sumi1 + sumi2);
+ }
+
+
+ static __dpct_inline__ float
+ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
+ const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+ #if QK_K == 256
+ const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
+ const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+ // iqs is 0...7
+ const int ib32 = iqs;
+ const int32_t * q8 = (const int *)bq8_1[ib32].qs;
+ const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+ const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+ const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
+ int v1, v2;
+ int sumi1 = 0, sumi2 = 0;
+ for (int j = 0; j < 4; ++j) {
+ get_int_from_table_16(q4[j], values, v1, v2);
+ sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
+ sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
+ }
+ return d * (sumi1 + sumi2);
  #else
  assert(false);
- return 0.f;
  #endif
  }

@@ -8338,8 +8320,7 @@ template <bool need_check> static void

  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
- const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+ const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);

@@ -8383,10 +8364,203 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
  }

  template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
- const sycl::nd_item<3> &item_ct1,
- const uint64_t *iq2xxs_grid_ptr, const uint8_t *ksigns_iq2xs_ptr,
- const uint8_t *kmask_iq2xs_ptr ) {
+ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+ item_ct1.get_local_id(1);
+
+ if (row >= nrows) {
+ return;
+ }
+
+ const int blocks_per_row = ncols / qk;
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+ // partial sum for each thread
+ float tmp = 0.0f;
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+ i += blocks_per_warp) {
+ const int ibx = row*blocks_per_row + i; // x block index
+
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+ const int iqs =
+ vdr *
+ (item_ct1.get_local_id(2) %
+ (qi / vdr)); // x block quant index when casting the quants to int
+
+ tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
+ }
+
+ // sum up partial sums and write back result
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp +=
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+ }
+
+ if (item_ct1.get_local_id(2) == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ template <int qk, int qi, typename block_q_t, int vdr>
+ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+ item_ct1.get_local_id(1);
+
+ if (row >= nrows) {
+ return;
+ }
+
+ const int blocks_per_row = ncols / qk;
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+ // partial sum for each thread
+ float tmp = 0.0f;
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+ i += blocks_per_warp) {
+ const int ibx = row*blocks_per_row + i; // x block index
+
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+ const int iqs =
+ vdr *
+ (item_ct1.get_local_id(2) %
+ (qi / vdr)); // x block quant index when casting the quants to int
+
+ tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
+ }
+
+ // sum up partial sums and write back result
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp +=
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+ }
+
+ if (item_ct1.get_local_id(2) == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ template <int qk, int qi, typename block_q_t, int vdr>
+ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+ item_ct1.get_local_id(1);
+
+ if (row >= nrows) {
+ return;
+ }
+
+ const int blocks_per_row = ncols / qk;
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+ // partial sum for each thread
+ float tmp = 0.0f;
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+ i += blocks_per_warp) {
+ const int ibx = row*blocks_per_row + i; // x block index
+
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+ const int iqs =
+ vdr *
+ (item_ct1.get_local_id(2) %
+ (qi / vdr)); // x block quant index when casting the quants to int
+
+ tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
+ }
+
+ // sum up partial sums and write back result
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp +=
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+ }
+
+ if (item_ct1.get_local_id(2) == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ template <int qk, int qi, typename block_q_t, int vdr>
+ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+ item_ct1.get_local_id(1);
+
+ if (row >= nrows) {
+ return;
+ }
+
+ const int blocks_per_row = ncols / qk;
+ const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+ // partial sum for each thread
+ float tmp = 0.0f;
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+ i += blocks_per_warp) {
+ const int ibx = row*blocks_per_row + i; // x block index
+
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+ const int iqs =
+ vdr *
+ (item_ct1.get_local_id(2) %
+ (qi / vdr)); // x block quant index when casting the quants to int
+
+ tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
+ }
+
+ // sum up partial sums and write back result
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp +=
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+ }
+
+ if (item_ct1.get_local_id(2) == 0) {
+ dst[row] = tmp;
+ }
+ }
+
+ template <int qk, int qi, typename block_q_t, int vdr>
+ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);

@@ -8414,7 +8588,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
  (item_ct1.get_local_id(2) %
  (qi / vdr)); // x block quant index when casting the quants to int

- tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid_ptr, ksigns_iq2xs_ptr, kmask_iq2xs_ptr);
+ tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
  }

  // sum up partial sums and write back result
@@ -8430,9 +8604,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
  }

  template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
- const sycl::nd_item<3> &item_ct1,
- const uint64_t *iq2xs_grid_ptr, const uint64_t *ksigns64_ptr ) {
+ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);

@@ -8460,7 +8636,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
  (item_ct1.get_local_id(2) %
  (qi / vdr)); // x block quant index when casting the quants to int

- tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid_ptr, ksigns64_ptr);
+ tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
  }

  // sum up partial sums and write back result
@@ -8476,9 +8652,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
  }

  template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
- const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr ) {
+ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);

@@ -8506,7 +8684,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
  (item_ct1.get_local_id(2) %
  (qi / vdr)); // x block quant index when casting the quants to int

- tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid_ptr, ksigns64_ptr);
+ tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
  }

  // sum up partial sums and write back result
@@ -8522,9 +8700,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
  }

  template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
- const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) {
+ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);

@@ -8552,7 +8732,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
  (item_ct1.get_local_id(2) %
  (qi / vdr)); // x block quant index when casting the quants to int

- tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr);
+ tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
  }

  // sum up partial sums and write back result
@@ -8567,10 +8747,13 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
  }
  }

+
  template <int qk, int qi, typename block_q_t, int vdr>
- static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
- const sycl::nd_item<3> &item_ct1,
- const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
+ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
+ const void *__restrict__ vy,
+ float *__restrict__ dst, const int ncols,
+ const int nrows,
+ const sycl::nd_item<3> &item_ct1) {
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
  item_ct1.get_local_id(1);

@@ -8598,7 +8781,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
  (item_ct1.get_local_id(2) %
  (qi / vdr)); // x block quant index when casting the quants to int

- tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr);
+ tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
  }

  // sum up partial sums and write back result
@@ -8613,6 +8796,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
  }
  }

+
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
  static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
  const sycl::nd_item<3> &item_ct1) {
@@ -9174,64 +9358,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
  }
  }

+
  template<typename T>
- static inline void swap(T & a, T & b) {
+ static inline void ggml_sycl_swap(T & a, T & b) {
  T tmp = a;
  a = b;
  b = tmp;
  }

- template<ggml_sort_order order>
- static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
- const sycl::nd_item<3> &item_ct1) {
+ template <ggml_sort_order order>
+ __dpct_inline__ static void
+ k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad,
+ const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) {
  // bitonic sort
  int col = item_ct1.get_local_id(2);
  int row = item_ct1.get_group(1);

- if (col >= ncols) return;
+ if (col >= ncols_pad) {
+ return;
+ }

  const float * x_row = x + row * ncols;
- int * dst_row = dst + row * ncols;
+ auto dst_row = (int *)dpct_local;

  // initialize indices
- if (col < ncols) {
- dst_row[col] = col;
- }
- /*
- DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
- performance if there is no access to global memory.
- */
- item_ct1.barrier();
+ dst_row[col] = col;
+
+ item_ct1.barrier(sycl::access::fence_space::local_space);

- for (int k = 2; k <= ncols; k *= 2) {
+ for (int k = 2; k <= ncols_pad; k *= 2) {
  for (int j = k / 2; j > 0; j /= 2) {
  int ixj = col ^ j;
  if (ixj > col) {
  if ((col & k) == 0) {
- if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
- swap(dst_row[col], dst_row[ixj]);
+ if (dst_row[col] >= ncols ||
+ (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ ) {
+ ggml_sycl_swap(dst_row[col], dst_row[ixj]);
  }
  } else {
- if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
- swap(dst_row[col], dst_row[ixj]);
+ if (dst_row[ixj] >= ncols ||
+ (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ ) {
+ ggml_sycl_swap(dst_row[col], dst_row[ixj]);
  }
  }
  }
  /*
- DPCT1118:11: SYCL group functions and algorithms must be encountered
+ DPCT1118:1: SYCL group functions and algorithms must be encountered
  in converged control flow. You may need to adjust the code.
  */
- /*
- DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
- sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
- better performance if there is no access to global memory.
- */
- item_ct1.barrier();
+ item_ct1.barrier(sycl::access::fence_space::local_space);
  }
  }
+
+ // copy the result to dst without the padding
+ if (col < ncols) {
+ dst[row * ncols + col] = dst_row[col];
+ }
  }

+
  static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
  const sycl::nd_item<3> &item_ct1) {
  const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
@@ -10210,31 +10401,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
10210
10401
  #endif
10211
10402
  }
10212
10403
 
10213
-
10214
10404
  template <typename dst_t>
10215
- static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
10405
+ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
10216
10406
  dpct::queue_ptr stream) {
10217
10407
  const int nb = k / QK_K;
10218
10408
  {
10219
- iq2xxs_grid.init(*stream);
10220
- ksigns_iq2xs.init(*stream);
10221
- kmask_iq2xs.init(*stream);
10409
+ dpct::has_capability_or_fail(stream->get_device(),
10410
+ {sycl::aspect::fp16});
10411
+
10412
+ stream->submit([&](sycl::handler &cgh) {
10413
+ cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10414
+ sycl::range<3>(1, 1, 32),
10415
+ sycl::range<3>(1, 1, 32)),
10416
+ [=](sycl::nd_item<3> item_ct1) {
10417
+ dequantize_block_iq1_s(
10418
+ vx, y, item_ct1, iq1s_grid_gpu
10419
+ );
10420
+ });
10421
+ });
10422
+ }
10423
+ }
10222
10424
 
10425
+ template <typename dst_t>
10426
+ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
10427
+ dpct::queue_ptr stream) {
10428
+ const int nb = k / QK_K;
10429
+ {
10223
10430
  dpct::has_capability_or_fail(stream->get_device(),
10224
10431
  {sycl::aspect::fp16});
10225
10432
 
10226
10433
  stream->submit([&](sycl::handler &cgh) {
10227
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
10228
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10229
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10434
+ cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10435
+ sycl::range<3>(1, 1, 32),
10436
+ sycl::range<3>(1, 1, 32)),
10437
+ [=](sycl::nd_item<3> item_ct1) {
10438
+ dequantize_block_iq1_m(
10439
+ vx, y, item_ct1, iq1s_grid_gpu
10440
+ );
10441
+ });
10442
+ });
10443
+ }
10444
+ }
10445
+
10446
+ template <typename dst_t>
10447
+ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
10448
+ dpct::queue_ptr stream) {
10449
+ const int nb = k / QK_K;
10450
+ {
10451
+ dpct::has_capability_or_fail(stream->get_device(),
10452
+ {sycl::aspect::fp16});
10230
10453
 
10454
+ stream->submit([&](sycl::handler &cgh) {
10231
10455
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10232
10456
  sycl::range<3>(1, 1, 32),
10233
10457
  sycl::range<3>(1, 1, 32)),
10234
10458
  [=](sycl::nd_item<3> item_ct1) {
10235
10459
  dequantize_block_iq2_xxs(
10236
- vx, y, item_ct1, iq2xxs_grid_ptr_ct1,
10237
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
10460
+ vx, y, item_ct1, iq2xxs_grid,
10461
+ ksigns_iq2xs, kmask_iq2xs);
10238
10462
  });
10239
10463
  });
10240
10464
  }
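Every dequantize_row_*_sycl wrapper in this diff uses the same launch geometry: one work-group of 32 work-items per QK_K quantization block, expressed on the innermost dimension of a 3-D nd_range. A minimal standalone SYCL sketch of just that geometry (assumed names, not library code):

#include <sycl/sycl.hpp>

void launch_one_group_per_block(sycl::queue &q, int nb) {
    q.submit([&](sycl::handler &cgh) {
        cgh.parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32),
                              sycl::range<3>(1, 1, 32)),
            [=](sycl::nd_item<3> it) {
                const int block = it.get_group(2);     // which quantized block
                const int lane  = it.get_local_id(2);  // 32 lanes per block
                (void) block; (void) lane;             // real kernels decode here
            });
    });
}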
@@ -10245,117 +10469,130 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq2xs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});

  stream->submit([&](sycl::handler &cgh) {
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
-
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
  sycl::range<3>(1, 1, 32)),
  [=](sycl::nd_item<3> item_ct1) {
  dequantize_block_iq2_xs(
- vx, y, item_ct1, iq2xs_grid_ptr_ct1,
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+ vx, y, item_ct1, iq2xs_grid,
+ ksigns_iq2xs, kmask_iq2xs);
  });
  });
  }
  }

  template <typename dst_t>
- static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
- dpct::queue_ptr stream) {
+ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
+ dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq3xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
-
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
  sycl::range<3>(1, 1, 32)),
  [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq3_xxs(
- vx, y, item_ct1, iq3xxs_grid_ptr_ct1,
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+ dequantize_block_iq2_s(vx, y, item_ct1);
  });
  });
  }
  }

+
  template <typename dst_t>
- static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
+ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq3s_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});

  stream->submit([&](sycl::handler &cgh) {
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
-
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
  sycl::range<3>(1, 1, 32)),
  [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq3_s(
- vx, y, item_ct1, iq3s_grid_ptr_ct1,
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+ dequantize_block_iq3_xxs(
+ vx, y, item_ct1, iq3xxs_grid,
+ ksigns_iq2xs, kmask_iq2xs);
  });
  });
  }
  }

  template <typename dst_t>
- static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
+ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
  dpct::queue_ptr stream) {
  const int nb = k / QK_K;
  {
- iq1s_grid_gpu.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
  dpct::has_capability_or_fail(stream->get_device(),
  {sycl::aspect::fp16});

  stream->submit([&](sycl::handler &cgh) {
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
-
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
  sycl::range<3>(1, 1, 32),
  sycl::range<3>(1, 1, 32)),
  [=](sycl::nd_item<3> item_ct1) {
- dequantize_block_iq1_s(
- vx, y, item_ct1, iq1s_grid_ptr_ct1,
- ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+ dequantize_block_iq3_s(
+ vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
  });
  });
  }
  }

+ template <typename dst_t>
+ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
+ dpct::queue_ptr stream) {
+ const int nb = (k + QK_K - 1) / QK_K;
+ #if QK_K == 64
+ dequantize_row_iq4_nl_sycl(vx, y, k, stream);
+ #else
+ {
+ dpct::has_capability_or_fail(stream->get_device(),
+ {sycl::aspect::fp16});
+
+ stream->submit([&](sycl::handler &cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+ sycl::range<3>(1, 1, 32),
+ sycl::range<3>(1, 1, 32)),
+ [=](sycl::nd_item<3> item_ct1) {
+ dequantize_block_iq4_xs(vx, y, item_ct1);
+ });
+ });
+ }
+ #endif
+ }
+
+
+ template <typename dst_t>
+ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
+ dpct::queue_ptr stream) {
+ const int nb = (k + QK_K - 1) / QK_K;
+ {
+ dpct::has_capability_or_fail(stream->get_device(),
+ {sycl::aspect::fp16});
+
+ stream->submit([&](sycl::handler &cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+ sycl::range<3>(1, 1, 32),
+ sycl::range<3>(1, 1, 32)),
+ [=](sycl::nd_item<3> item_ct1) {
+ dequantize_block_iq4_nl(vx, y, item_ct1);
+ });
+ });
+ }
+ }
+
+
+
  template <typename src_t, typename dst_t>
  static void convert_unary_sycl(const void *__restrict__ vx,
  dst_t *__restrict__ y, const int k,
@@ -10400,16 +10637,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
  return dequantize_row_q5_K_sycl;
  case GGML_TYPE_Q6_K:
  return dequantize_row_q6_K_sycl;
+ case GGML_TYPE_IQ1_S:
+ return dequantize_row_iq1_s_sycl;
+ case GGML_TYPE_IQ1_M:
+ return dequantize_row_iq1_m_sycl;
  case GGML_TYPE_IQ2_XXS:
  return dequantize_row_iq2_xxs_sycl;
  case GGML_TYPE_IQ2_XS:
  return dequantize_row_iq2_xs_sycl;
+ case GGML_TYPE_IQ2_S:
+ return dequantize_row_iq2_s_sycl;
  case GGML_TYPE_IQ3_XXS:
  return dequantize_row_iq3_xxs_sycl;
  case GGML_TYPE_IQ3_S:
  return dequantize_row_iq3_s_sycl;
- case GGML_TYPE_IQ1_S:
- return dequantize_row_iq1_s_sycl;
+ case GGML_TYPE_IQ4_XS:
+ return dequantize_row_iq4_xs_sycl;
+ case GGML_TYPE_IQ4_NL:
+ return dequantize_row_iq4_nl_sycl;
  case GGML_TYPE_F32:
  return convert_unary_sycl<float>;
  default:
@@ -10444,16 +10689,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
  return dequantize_row_q5_K_sycl;
  case GGML_TYPE_Q6_K:
  return dequantize_row_q6_K_sycl;
+ case GGML_TYPE_IQ1_S:
+ return dequantize_row_iq1_s_sycl;
+ case GGML_TYPE_IQ1_M:
+ return dequantize_row_iq1_m_sycl;
  case GGML_TYPE_IQ2_XXS:
  return dequantize_row_iq2_xxs_sycl;
  case GGML_TYPE_IQ2_XS:
  return dequantize_row_iq2_xs_sycl;
+ case GGML_TYPE_IQ2_S:
+ return dequantize_row_iq2_s_sycl;
  case GGML_TYPE_IQ3_XXS:
  return dequantize_row_iq3_xxs_sycl;
  case GGML_TYPE_IQ3_S:
  return dequantize_row_iq3_s_sycl;
- case GGML_TYPE_IQ1_S:
- return dequantize_row_iq1_s_sycl;
+ case GGML_TYPE_IQ4_XS:
+ return dequantize_row_iq4_xs_sycl;
+ case GGML_TYPE_IQ4_NL:
+ return dequantize_row_iq4_nl_sycl;
  case GGML_TYPE_F16:
  return convert_unary_sycl<sycl::half>;
  default:
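Both hunks above extend the same pattern: ggml_get_to_fp16_sycl and ggml_get_to_fp32_sycl are switch-based dispatch tables mapping a quantization enum to a converter function pointer. A hypothetical miniature of that pattern (names here are illustrative, not the library's):

typedef void (*convert_fn)(const void *src, float *dst, int k);

enum class QType { Q4_0, IQ4_NL, F32, OTHER };

convert_fn pick_converter(QType t,
                          convert_fn q4_0, convert_fn iq4_nl, convert_fn f32) {
    switch (t) {
        case QType::Q4_0:   return q4_0;
        case QType::IQ4_NL: return iq4_nl;  // newly wired types slot in here
        case QType::F32:    return f32;
        default:            return nullptr; // caller must handle this
    }
}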
@@ -10675,12 +10928,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10688,8 +10937,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10704,12 +10952,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10717,8 +10961,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10733,12 +10976,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10746,8 +10985,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10762,12 +11000,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10775,8 +11009,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10791,12 +11024,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10804,8 +11033,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10820,12 +11048,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10833,8 +11057,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10849,12 +11072,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10862,8 +11081,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10878,12 +11096,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10891,8 +11105,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10907,12 +11120,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10920,8 +11129,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10936,12 +11144,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10949,13 +11153,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
  }

+
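The whole run of hunks above repeats one refactor: lookup tables that used to be init()/get_ptr() wrapper objects threaded through every kernel as trailing pointer arguments are now ordinary device-visible arrays the kernels read directly, so those parameters simply disappear from every call site. A tiny standalone C++ analogue of the before/after shape (hypothetical names, for illustration only):

#include <cstdint>

static constexpr uint8_t kmask[4] = {1, 2, 4, 8};  // stand-in lookup table

// before: the table is passed explicitly at every call
int decode_old(int i, const uint8_t *mask) { return mask[i & 3]; }

// after: the table is referenced directly, one parameter fewer everywhere
int decode_new(int i) { return kmask[i & 3]; }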
  static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
  float *dst, const int ncols,
  const int nrows,
@@ -10965,23 +11169,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq2xxs_grid.init(*stream);
- ksigns_iq2xs.init(*stream);
- kmask_iq2xs.init(*stream);
-
-
  stream->submit([&](sycl::handler &cgh) {
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
-
  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1)
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -10996,20 +11190,42 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq2xs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1)
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq2xs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
+ });
+ });
+ }
+ }
+
+ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
+ float *dst, const int ncols,
+ const int nrows,
+ dpct::queue_ptr stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+ const sycl::range<3> block_nums(1, 1, block_num_y);
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+ {
+
+ stream->submit([&](sycl::handler &cgh) {
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
+
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1)
+ [[intel::reqd_sub_group_size(32)]] {
+ mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -11024,20 +11240,17 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3xxs_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1)
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -11052,20 +11265,16 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq3s_grid.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1)
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
@@ -11080,20 +11289,82 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
  const sycl::range<3> block_nums(1, 1, block_num_y);
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
  {
- iq1s_grid_gpu.init(*stream);
- ksigns64.init(*stream);

  stream->submit([&](sycl::handler &cgh) {
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+ auto ksigns64_ptr_ct1 = &ksigns64[0];

  cgh.parallel_for(
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
  [=](sycl::nd_item<3> item_ct1)
  [[intel::reqd_sub_group_size(32)]] {
  mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
- vx, vy, dst, ncols, nrows, item_ct1,
- iq1s_grid_ptr_ct1, ksigns64_ptr_ct1);
+ vx, vy, dst, ncols, nrows, item_ct1);
+ });
+ });
+ }
+ }
+
+ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
+ float *dst, const int ncols,
+ const int nrows,
+ dpct::queue_ptr stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+ const sycl::range<3> block_nums(1, 1, block_num_y);
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+ {
+ stream->submit([&](sycl::handler &cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1)
+ [[intel::reqd_sub_group_size(32)]] {
+ mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
+ vx, vy, dst, ncols, nrows, item_ct1);
+ });
+ });
+ }
+ }
+
+ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
+ float *dst, const int ncols,
+ const int nrows,
+ dpct::queue_ptr stream) {
+ GGML_ASSERT(ncols % QK4_NL == 0);
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+ const sycl::range<3> block_nums(1, 1, block_num_y);
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+ {
+
+ stream->submit([&](sycl::handler &cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1)
+ [[intel::reqd_sub_group_size(32)]] {
+ mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
+ vx, vy, dst, ncols, nrows, item_ct1);
+ });
+ });
+ }
+ }
+
+ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
+ float *dst, const int ncols,
+ const int nrows,
+ dpct::queue_ptr stream) {
+ GGML_ASSERT(ncols % QK_K == 0);
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+ const sycl::range<3> block_nums(1, 1, block_num_y);
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+ {
+
+ stream->submit([&](sycl::handler &cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1)
+ [[intel::reqd_sub_group_size(32)]] {
+ mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
+ vx, vy, dst, ncols, nrows, item_ct1);
  });
  });
  }
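All of these mul_mat_vec_*_q8_1_sycl wrappers, old and new, repeat the same launch arithmetic: each work-group covers GGML_SYCL_MMV_Y rows of the matrix, with WARP_SIZE lanes cooperating on each row. Factored into a hedged SYCL sketch (names mirror the diff; this is not an exported API):

#include <sycl/sycl.hpp>

sycl::nd_range<3> mmvq_range(int nrows, int mmv_y, int warp_size) {
    const int block_num_y = (nrows + mmv_y - 1) / mmv_y;  // ceil-divide rows
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, mmv_y, warp_size);
    return sycl::nd_range<3>(block_nums * block_dims, block_dims);
}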
@@ -12717,36 +12988,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
  });
  }

+ static int next_power_of_2(int x) {
+ int n = 1;
+ while (n < x) {
+ n *= 2;
+ }
+ return n;
+ }
+
  static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
  const int nrows, ggml_sort_order order,
  dpct::queue_ptr stream) {
  // bitonic sort requires ncols to be power of 2
- GGML_ASSERT((ncols & (ncols - 1)) == 0);
+ const int ncols_pad = next_power_of_2(ncols);

- const sycl::range<3> block_dims(1, 1, ncols);
+ const sycl::range<3> block_dims(1, 1, ncols_pad);
  const sycl::range<3> block_nums(1, nrows, 1);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+
+ // GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
  if (order == GGML_SORT_ORDER_ASC) {
- /*
- DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
- the limit. To get the device limit, query
- info::device::max_work_group_size. Adjust the work-group size if needed.
- */
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) {
- k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
- });
+ stream->submit([&](sycl::handler &cgh) {
+ sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+ sycl::range<1>(shared_mem), cgh);
+
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) {
+ k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
+ x, dst, ncols, ncols_pad, item_ct1,
+ dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
+ .get());
+ });
+ });
  } else if (order == GGML_SORT_ORDER_DESC) {
- /*
- DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
- the limit. To get the device limit, query
- info::device::max_work_group_size. Adjust the work-group size if needed.
- */
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) {
- k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
- });
+ stream->submit([&](sycl::handler &cgh) {
+ sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+ sycl::range<1>(shared_mem), cgh);
+
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) {
+ k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
+ x, dst, ncols, ncols_pad, item_ct1,
+ dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
+ .get());
+ });
+ });
  } else {
  GGML_ASSERT(false);
  }
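The host side above now sizes a local_accessor of ncols_pad * sizeof(int) bytes per work-group; the commented-out CUDA-style assert hints that this must fit the device's local memory. A hedged standalone SYCL sketch of how that limit could be checked (an assumption of how one might guard this, not code from the diff):

#include <sycl/sycl.hpp>
#include <cstddef>

bool argsort_fits(const sycl::queue &q, int ncols_pad) {
    const size_t shared_mem = (size_t) ncols_pad * sizeof(int);
    const size_t local_mem  =
        q.get_device().get_info<sycl::info::device::local_mem_size>();
    return shared_mem <= local_mem;  // local_accessor request must fit
}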
@@ -13128,6 +13417,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
  }

  void ggml_backend_sycl_print_sycl_devices() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
  int device_count = dpct::dev_mgr::instance().device_count();
  std::map<std::string, size_t> DeviceNums;
  fprintf(stderr, "found %d SYCL devices:\n", device_count);
@@ -13181,11 +13471,13 @@ int get_work_group_size(int user_device_id) {
  return prop.get_max_work_group_size();
  }

- void ggml_init_sycl() try {
+ static void ggml_init_sycl() try {
  static bool initialized = false;

  if (!initialized) {
+ fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
  fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);

  #if defined(GGML_SYCL_F16)
@@ -13871,8 +14163,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
  case GGML_TYPE_Q5_K:
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
  case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ4_XS:
+ case GGML_TYPE_IQ4_NL:
  return max_compute_capability >= VER_GEN9 ? 128 : 64;
  case GGML_TYPE_IQ3_S:
  return max_compute_capability >= VER_GEN9 ? 128 : 64;
@@ -13891,11 +14187,20 @@ inline void ggml_sycl_op_mul_mat_vec_q(
  const int64_t src1_ncols, const int64_t src1_padded_row_size,
  const dpct::queue_ptr &stream) {

- GGML_ASSERT(ggml_nrows(src1) == 1);
+ const int64_t ne10 = src1->ne[0];
+ GGML_ASSERT(ne10 % QK8_1 == 0);

  const int64_t ne00 = src0->ne[0];
  const int64_t row_diff = row_high - row_low;

+ int id;
+ SYCL_CHECK(
+ CHECK_TRY_ERROR(id = get_current_device_id()));
+
+ // the main device has a larger memory buffer to hold the results from all GPUs
+ // nrows_dst == nrows of the matrix that the kernel writes into
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne00 : row_diff;
+
  switch (src0->type) {
  case GGML_TYPE_Q4_0:
  mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
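The nrows_dst bookkeeping introduced above can be restated in one line of logic: in a multi-GPU split, only the main device owns the full result buffer (ne00 rows), while every other device writes just its row slice (row_diff rows). A hedged standalone sketch of that selection (names mirror the diff, not an exported API):

#include <cstdint>

int64_t pick_nrows_dst(bool dst_on_gpu, bool is_main_device,
                       int64_t ne00, int64_t row_diff) {
    // the main device gathers results from all GPUs into the full buffer
    return (dst_on_gpu && is_main_device) ? ne00 : row_diff;
}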
@@ -13927,20 +14232,32 @@ inline void ggml_sycl_op_mul_mat_vec_q(
  case GGML_TYPE_Q6_K:
  mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
  break;
+ case GGML_TYPE_IQ1_S:
+ mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
+ case GGML_TYPE_IQ1_M:
+ mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
  case GGML_TYPE_IQ2_XXS:
  mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
  break;
  case GGML_TYPE_IQ2_XS:
  mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
  break;
+ case GGML_TYPE_IQ2_S:
+ mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
  case GGML_TYPE_IQ3_XXS:
  mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
  break;
  case GGML_TYPE_IQ3_S:
  mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
  break;
- case GGML_TYPE_IQ1_S:
- mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ case GGML_TYPE_IQ4_NL:
+ mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+ break;
+ case GGML_TYPE_IQ4_XS:
+ mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
  break;
  default:
  GGML_ASSERT(false);
@@ -14022,6 +14339,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
  convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
  break;
  default:
+ printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
  GGML_ASSERT(false);
  break;
  }
@@ -14876,8 +15194,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
  src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
  }
  // do the computation
- op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
- dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream);
+ SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+ dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
  /*
  DPCT1010:93: SYCL uses exceptions to report errors and does not
  use the error codes. The call was replaced with 0. You need to
@@ -15246,6 +15564,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];

+ bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+ main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
  SYCL_CHECK(
  CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));

@@ -15276,24 +15597,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,

  dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
  dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+ if (no_mixed_dtypes) {
+ cu_compute_type = dpct::library_data_t::real_half;
+ cu_data_type = dpct::library_data_t::real_half;
+ }

  // dst strides
  size_t nbd2 = dst->nb[2];
  size_t nbd3 = dst->nb[3];

+ const float alpha_f32 = 1.0f;
+ const float beta_f32 = 0.0f;
+
  const sycl::half alpha_f16 = 1.0f;
  const sycl::half beta_f16 = 0.0f;

- const float alpha_f32 = 1.0f;
- const float beta_f32 = 0.0f;
-
  const void * alpha = &alpha_f32;
  const void * beta = &beta_f32;
+ if (no_mixed_dtypes) {
+ alpha = &alpha_f16;
+ beta = &beta_f16;
+ }

  // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
- // oneMKL open source supports half, half, float, float: datatypes
+ // when oneMKL open source supports half, half, float, float: datatypes

  dst_t = (char *) dst_ddf;
+ if (no_mixed_dtypes) {
+ dst_t = (char *) dst_f16.alloc(ne_dst);
+
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
+ nbd3 /= sizeof(float) / sizeof(sycl::half);
+ }

  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
@@ -15379,6 +15714,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
  }
  #endif

+ if (no_mixed_dtypes) {
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+ }
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
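Taken together, the no_mixed_dtypes hunks say: on backends whose batched GEMM rejects mixed dtypes (the CUDA and HIP SYCL backends here), compute type, output buffer, and alpha/beta scalars all switch to half, byte strides are halved, and the result is converted back to fp32 afterwards. A hedged standalone sketch of the scalar/stride part (illustrative names, not the library's):

#include <cstddef>

struct GemmScalars { const void *alpha; const void *beta; };

GemmScalars pick_scalars(bool no_mixed_dtypes,
                         const float *a32, const float *b32,
                         const void  *a16, const void  *b16) {
    // the pointee types must match the compute type handed to the GEMM call
    return no_mixed_dtypes ? GemmScalars{a16, b16} : GemmScalars{a32, b32};
}

size_t half_stride(size_t byte_stride_f32) {
    return byte_stride_f32 / 2;  // byte strides shrink 2x going fp32 -> fp16
}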
@@ -15437,11 +15776,17 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  #ifdef GGML_SYCL_FORCE_DMMV
  const bool use_mul_mat_vec_q = false;
  #else
- const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+ bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
+ use_mul_mat_vec_q = use_mul_mat_vec_q ||
+ (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
+ (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
+ (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
+ (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
+
+
  #endif // GGML_SYCL_FORCE_DMMV

  if (use_mul_mat_vec_q) {
- // NOTE: this kernel does not support ggml_nrows(src1) > 1
  // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
  ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
  } else {
@@ -16278,6 +16623,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
  }

  GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
  for(int i=0;i<max_len;i++) id_list[i] = -1;

  if (!g_sycl_gpu_mgr) {
@@ -16312,6 +16658,7 @@ catch (sycl::exception const &exc) {

  GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
  size_t description_size) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
  dpct::device_info prop;
  int device_id = g_sycl_gpu_mgr->gpus[device];
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16326,6 +16673,8 @@ catch (sycl::exception const &exc) {

  GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
  size_t *total) try {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
  ggml_sycl_set_device(device);

  /*
@@ -16677,6 +17025,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
  };

  ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
  if (device_index>=g_device_count or device_index<0) {
  printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
  device_index, g_device_count-1);
@@ -17046,6 +17396,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
  };

  GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+ ggml_init_sycl();
  // FIXME: this is not thread safe
  static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

@@ -17117,6 +17469,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
  }

  ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
  static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
  /* .iface = */ {
  /* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
@@ -17231,7 +17584,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
  params.ith = 0;
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
  #ifndef NDEBUG
@@ -17289,9 +17642,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
  return false;
  }
  ggml_type a_type = a->type;
- if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
- a_type == GGML_TYPE_IQ4_XS) {
- return false;
+ if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
+ a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
+ a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
+ a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
+ ) {
+ if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+ return false;
+ }
  }
  return true;
  } break;
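The supports_op change above, reduced to its predicate (names mirror ggml; logic restated for illustration): IQ-family matrices are now accepted in general, and rejected only when src1 looks like a single column (ne[1] == 1) that is nevertheless broadcast over several rows, a shape the vector kernels cannot index:

#include <cstdint>

bool iq_mul_mat_supported(int64_t b_ne1, int64_t b_nrows) {
    return !(b_ne1 == 1 && b_nrows > 1);
}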
@@ -17379,6 +17737,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
  UNUSED(backend);
  }

+ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+ const int min_batch_size = 32;
+ return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+ GGML_UNUSED(backend);
+ }
+
+
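The new offload hook fits in one function: offload an op to the GPU only when its batch dimension ne[1] is large enough to amortize the host/device copies, and never offload GGML_OP_GET_ROWS. A minimal standalone restatement of the same heuristic (a sketch, not the exported symbol):

#include <cstdint>

bool should_offload(int64_t batch_size, bool is_get_rows) {
    const int64_t min_batch_size = 32;  // matches the constant in the diff
    return batch_size >= min_batch_size && !is_get_rows;
}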
  static ggml_backend_i ggml_backend_sycl_interface = {
  /* .get_name = */ ggml_backend_sycl_name,
  /* .free = */ ggml_backend_sycl_free,
@@ -17392,7 +17757,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
  /* .graph_plan_compute = */ NULL,
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
  /* .supports_op = */ ggml_backend_sycl_supports_op,
- /* .offload_op = */ NULL,
+ /* .offload_op = */ ggml_backend_sycl_offload_op,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
  /* .event_record = */ NULL,
@@ -17406,7 +17771,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
  }

  GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
- ggml_init_sycl(); // TODO: remove from ggml.c
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+ ggml_init_sycl();

  check_allow_gpu_index(device);

@@ -17432,6 +17798,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
  }

  GGML_CALL int ggml_backend_sycl_get_device_count() {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
  if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
  return g_sycl_gpu_mgr->get_gpu_count();
  }
@@ -17444,16 +17811,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
  }

  GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
  return g_sycl_gpu_mgr->get_index(device_id);
  }

  GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
  return g_sycl_gpu_mgr->gpus[device_index];
  }

  GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
- GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+ ggml_init_sycl();
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
  fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+ GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
  if (g_sycl_gpu_mgr) {
  delete g_sycl_gpu_mgr;
  }
@@ -17464,6 +17836,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
  }

  GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+ ggml_init_sycl();
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
  if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
  return;
  }