llama_cpp 0.14.2 → 0.14.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -16,6 +16,7 @@
16
16
  #include <cinttypes>
17
17
  #include <cstddef>
18
18
  #include <cstdint>
19
+ #include <cstdlib>
19
20
  #include <float.h>
20
21
  #include <limits>
21
22
  #include <stdint.h>
@@ -24,10 +25,9 @@
24
25
  #include <cmath>
25
26
  #include <iostream>
26
27
  #include <fstream>
27
-
28
28
  #include <stdio.h>
29
29
  #include <stdlib.h>
30
-
30
+ #include <regex>
31
31
 
32
32
  #include <sycl/sycl.hpp>
33
33
  #include <sycl/half_type.hpp>
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
82
82
  #define __dpct_noinline__ __attribute__((noinline))
83
83
  #endif
84
84
 
85
+
86
+ std::string get_device_type_name(const sycl::device &Device) {
87
+ auto DeviceType = Device.get_info<sycl::info::device::device_type>();
88
+ switch (DeviceType) {
89
+ case sycl::info::device_type::cpu:
90
+ return "cpu";
91
+ case sycl::info::device_type::gpu:
92
+ return "gpu";
93
+ case sycl::info::device_type::host:
94
+ return "host";
95
+ case sycl::info::device_type::accelerator:
96
+ return "acc";
97
+ default:
98
+ return "unknown";
99
+ }
100
+ }
101
+
102
+ std::string get_device_backend_and_type(const sycl::device &device) {
103
+ std::stringstream device_type;
104
+ sycl::backend backend = device.get_backend();
105
+ device_type << backend << ":" << get_device_type_name(device);
106
+ return device_type.str();
107
+ }
108
+
85
109
  namespace dpct
86
110
  {
87
111
  typedef sycl::queue *queue_ptr;
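Note: the two helpers added in this hunk build a "backend:device-type" label (for example "ext_oneapi_level_zero:gpu") that later hunks use to group, sort and print devices. Below is a minimal sketch of listing every visible SYCL device with such a label; it assumes the helpers above are compiled in the same translation unit and is illustrative only, not part of the package.

    #include <sycl/sycl.hpp>
    #include <iostream>

    // Illustrative only: label every SYCL device the way the new helpers do.
    int main() {
        for (const auto &dev : sycl::device::get_devices()) {
            std::cout << get_device_backend_and_type(dev) << "  "
                      << dev.get_info<sycl::info::device::name>() << "\n";
        }
        return 0;
    }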
@@ -716,11 +740,7 @@ namespace dpct
716
740
 
717
741
  sycl::queue &default_queue()
718
742
  {
719
- #ifdef DPCT_USM_LEVEL_NONE
720
- return out_of_order_queue();
721
- #else
722
743
  return in_order_queue();
723
- #endif // DPCT_USM_LEVEL_NONE
724
744
  }
725
745
 
726
746
  void queues_wait_and_throw()
@@ -739,11 +759,7 @@ namespace dpct
739
759
 
740
760
  sycl::queue *create_queue(bool enable_exception_handler = false)
741
761
  {
742
- #ifdef DPCT_USM_LEVEL_NONE
743
- return create_out_of_order_queue(enable_exception_handler);
744
- #else
745
762
  return create_in_order_queue(enable_exception_handler);
746
- #endif // DPCT_USM_LEVEL_NONE
747
763
  }
748
764
 
749
765
  sycl::queue *create_queue(sycl::context context, sycl::device device,
@@ -942,17 +958,67 @@ namespace dpct
942
958
 
943
959
  private:
944
960
  mutable std::recursive_mutex m_mutex;
961
+ static bool compare_dev(sycl::device &device1, sycl::device &device2)
962
+ {
963
+ dpct::device_info prop1;
964
+ dpct::get_device_info(prop1, device1);
965
+ dpct::device_info prop2;
966
+ dpct::get_device_info(prop2, device2);
967
+ return prop1.get_max_compute_units() > prop2.get_max_compute_units();
968
+ }
969
+ static int convert_backend_index(std::string & backend) {
970
+ if (backend == "ext_oneapi_level_zero:gpu") return 0;
971
+ if (backend == "opencl:gpu") return 1;
972
+ if (backend == "ext_oneapi_cuda:gpu") return 2;
973
+ if (backend == "ext_oneapi_hip:gpu") return 3;
974
+ if (backend == "opencl:cpu") return 4;
975
+ if (backend == "opencl:acc") return 5;
976
+ printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
977
+ GGML_ASSERT(false);
978
+ }
979
+ static bool compare_backend(std::string &backend1, std::string &backend2) {
980
+ return convert_backend_index(backend1) < convert_backend_index(backend2);
981
+ }
945
982
  dev_mgr()
946
983
  {
947
984
  sycl::device default_device =
948
985
  sycl::device(sycl::default_selector_v);
949
986
  _devs.push_back(std::make_shared<device_ext>(default_device));
950
987
 
951
- std::vector<sycl::device> sycl_all_devs =
952
- sycl::device::get_devices(sycl::info::device_type::all);
988
+ std::vector<sycl::device> sycl_all_devs;
953
989
  // Collect other devices except for the default device.
954
990
  if (default_device.is_cpu())
955
991
  _cpu_device = 0;
992
+
993
+ auto Platforms = sycl::platform::get_platforms();
994
+ // Keep track of the number of devices per backend
995
+ std::map<sycl::backend, size_t> DeviceNums;
996
+ std::map<std::string, std::vector<sycl::device>> backend_devices;
997
+
998
+ while (!Platforms.empty()) {
999
+ auto Platform = Platforms.back();
1000
+ Platforms.pop_back();
1001
+ auto devices = Platform.get_devices();
1002
+ std::string backend_type = get_device_backend_and_type(devices[0]);
1003
+ for (const auto &device : devices) {
1004
+ backend_devices[backend_type].push_back(device);
1005
+ }
1006
+ }
1007
+
1008
+ std::vector<std::string> keys;
1009
+ for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
1010
+ keys.push_back(it->first);
1011
+ }
1012
+ std::sort(keys.begin(), keys.end(), compare_backend);
1013
+
1014
+ for (auto &key : keys) {
1015
+ std::vector<sycl::device> devs = backend_devices[key];
1016
+ std::sort(devs.begin(), devs.end(), compare_dev);
1017
+ for (const auto &dev : devs) {
1018
+ sycl_all_devs.push_back(dev);
1019
+ }
1020
+ }
1021
+
956
1022
  for (auto &dev : sycl_all_devs)
957
1023
  {
958
1024
  if (dev == default_device)
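Note: the reworked dev_mgr constructor above first groups devices by their backend:type label, orders the groups with convert_backend_index (Level Zero GPUs, then OpenCL GPUs, CUDA GPUs, HIP GPUs, OpenCL CPUs, OpenCL accelerators) and sorts the devices inside each group by max compute units, descending. A self-contained sketch of just the group ordering on plain strings, illustrative only:

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    // Illustrative only: mirror convert_backend_index() on plain strings.
    static int backend_rank(const std::string &b) {
        if (b == "ext_oneapi_level_zero:gpu") return 0;
        if (b == "opencl:gpu")                return 1;
        if (b == "ext_oneapi_cuda:gpu")       return 2;
        if (b == "ext_oneapi_hip:gpu")        return 3;
        if (b == "opencl:cpu")                return 4;
        if (b == "opencl:acc")                return 5;
        return 6; // the real code asserts on unknown backends; this sketch just sorts them last
    }

    int main() {
        std::vector<std::string> keys = {"opencl:cpu", "opencl:gpu", "ext_oneapi_level_zero:gpu"};
        std::sort(keys.begin(), keys.end(),
                  [](const std::string &a, const std::string &b) { return backend_rank(a) < backend_rank(b); });
        for (const auto &k : keys) std::cout << k << "\n"; // ext_oneapi_level_zero:gpu, opencl:gpu, opencl:cpu
        return 0;
    }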
@@ -1001,11 +1067,6 @@ namespace dpct
1001
1067
  static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
1002
1068
  const void *ptr)
1003
1069
  {
1004
- #ifdef DPCT_USM_LEVEL_NONE
1005
- return mem_mgr::instance().is_device_ptr(ptr)
1006
- ? pointer_access_attribute::device_only
1007
- : pointer_access_attribute::host_only;
1008
- #else
1009
1070
  switch (sycl::get_pointer_type(ptr, q.get_context()))
1010
1071
  {
1011
1072
  case sycl::usm::alloc::unknown:
@@ -1016,7 +1077,6 @@ namespace dpct
1016
1077
  case sycl::usm::alloc::host:
1017
1078
  return pointer_access_attribute::host_device;
1018
1079
  }
1019
- #endif
1020
1080
  }
1021
1081
 
1022
1082
  template <typename ArgT>
@@ -1199,11 +1259,7 @@ namespace dpct
1199
1259
 
1200
1260
  static inline void *dpct_malloc(size_t size, sycl::queue &q)
1201
1261
  {
1202
- #ifdef DPCT_USM_LEVEL_NONE
1203
- return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
1204
- #else
1205
1262
  return sycl::malloc_device(size, q.get_device(), q.get_context());
1206
- #endif // DPCT_USM_LEVEL_NONE
1207
1263
  }
1208
1264
 
1209
1265
  #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
@@ -1227,25 +1283,7 @@ namespace dpct
1227
1283
  static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
1228
1284
  valueT value, size_t size)
1229
1285
  {
1230
- #ifdef DPCT_USM_LEVEL_NONE
1231
- auto &mm = mem_mgr::instance();
1232
- assert(mm.is_device_ptr(dev_ptr));
1233
- auto alloc = mm.translate_ptr(dev_ptr);
1234
- size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
1235
-
1236
- return q.submit([&](sycl::handler &cgh)
1237
- {
1238
- auto r = sycl::range<1>(size);
1239
- auto o = sycl::id<1>(offset);
1240
- auto new_buffer = alloc.buffer.reinterpret<valueT>(
1241
- sycl::range<1>(alloc.size / sizeof(valueT)));
1242
- sycl::accessor<valueT, 1, sycl::access_mode::write,
1243
- sycl::access::target::device>
1244
- acc(new_buffer, cgh, r, o);
1245
- cgh.fill(acc, value); });
1246
- #else
1247
1286
  return q.fill(dev_ptr, value, size);
1248
- #endif // DPCT_USM_LEVEL_NONE
1249
1287
  }
1250
1288
 
1251
1289
  /**
@@ -1339,72 +1377,8 @@ namespace dpct
1339
1377
  {
1340
1378
  if (!size)
1341
1379
  return sycl::event{};
1342
- #ifdef DPCT_USM_LEVEL_NONE
1343
- auto &mm = mem_mgr::instance();
1344
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
1345
-
1346
- switch (real_direction)
1347
- {
1348
- case host_to_host:
1349
- return q.submit([&](sycl::handler &cgh)
1350
- {
1351
- cgh.depends_on(dep_events);
1352
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
1353
- case host_to_device:
1354
- {
1355
- auto alloc = mm.translate_ptr(to_ptr);
1356
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
1357
- return q.submit([&](sycl::handler &cgh)
1358
- {
1359
- cgh.depends_on(dep_events);
1360
- auto r = sycl::range<1>(size);
1361
- auto o = sycl::id<1>(offset);
1362
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
1363
- sycl::access::target::device>
1364
- acc(alloc.buffer, cgh, r, o);
1365
- cgh.copy(from_ptr, acc); });
1366
- }
1367
- case device_to_host:
1368
- {
1369
- auto alloc = mm.translate_ptr(from_ptr);
1370
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
1371
- return q.submit([&](sycl::handler &cgh)
1372
- {
1373
- cgh.depends_on(dep_events);
1374
- auto r = sycl::range<1>(size);
1375
- auto o = sycl::id<1>(offset);
1376
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
1377
- sycl::access::target::device>
1378
- acc(alloc.buffer, cgh, r, o);
1379
- cgh.copy(acc, to_ptr); });
1380
- }
1381
- case device_to_device:
1382
- {
1383
- auto to_alloc = mm.translate_ptr(to_ptr);
1384
- auto from_alloc = mm.translate_ptr(from_ptr);
1385
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
1386
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
1387
- return q.submit([&](sycl::handler &cgh)
1388
- {
1389
- cgh.depends_on(dep_events);
1390
- auto r = sycl::range<1>(size);
1391
- auto to_o = sycl::id<1>(to_offset);
1392
- auto from_o = sycl::id<1>(from_offset);
1393
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
1394
- sycl::access::target::device>
1395
- to_acc(to_alloc.buffer, cgh, r, to_o);
1396
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
1397
- sycl::access::target::device>
1398
- from_acc(from_alloc.buffer, cgh, r, from_o);
1399
- cgh.copy(from_acc, to_acc); });
1400
- }
1401
- default:
1402
- throw std::runtime_error("dpct_memcpy: invalid direction value");
1403
- }
1404
- #else
1405
1380
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
1406
1381
  GGML_UNUSED(direction);
1407
- #endif // DPCT_USM_LEVEL_NONE
1408
1382
  }
1409
1383
 
1410
1384
  // Get actual copy range and make sure it will not exceed range.
@@ -1544,45 +1518,15 @@ namespace dpct
1544
1518
  break;
1545
1519
  }
1546
1520
  case device_to_device:
1547
- #ifdef DPCT_USM_LEVEL_NONE
1548
- {
1549
- auto &mm = mem_mgr::instance();
1550
- auto to_alloc = mm.translate_ptr(to_surface);
1551
- auto from_alloc = mm.translate_ptr(from_surface);
1552
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
1553
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
1554
- event_list.push_back(q.submit([&](sycl::handler &cgh)
1555
- {
1556
- cgh.depends_on(dep_events);
1557
- auto to_o = sycl::id<1>(to_offset);
1558
- auto from_o = sycl::id<1>(from_offset);
1559
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
1560
- sycl::access::target::device>
1561
- to_acc(to_alloc.buffer, cgh,
1562
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
1563
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
1564
- sycl::access::target::device>
1565
- from_acc(from_alloc.buffer, cgh,
1566
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
1567
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
1568
- size,
1569
- [=](sycl::id<3> id) {
1570
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
1571
- from_acc[get_offset(id, from_slice, from_range.get(0))];
1572
- }); }));
1573
- }
1574
- #else
1575
- event_list.push_back(q.submit([&](sycl::handler &cgh)
1576
- {
1577
- cgh.depends_on(dep_events);
1578
- cgh.parallel_for<class dpct_memcpy_3d_detail>(
1579
- size,
1580
- [=](sycl::id<3> id) {
1581
- to_surface[get_offset(id, to_slice, to_range.get(0))] =
1582
- from_surface[get_offset(id, from_slice, from_range.get(0))];
1583
- }); }));
1584
- #endif
1585
- break;
1521
+ event_list.push_back(q.submit([&](sycl::handler &cgh){
1522
+ cgh.depends_on(dep_events);
1523
+ cgh.parallel_for<class dpct_memcpy_3d_detail>(
1524
+ size,
1525
+ [=](sycl::id<3> id) {
1526
+ to_surface[get_offset(id, to_slice, to_range.get(0))] =
1527
+ from_surface[get_offset(id, from_slice, from_range.get(0))];
1528
+ }); }));
1529
+ break;
1586
1530
  default:
1587
1531
  throw std::runtime_error("dpct_memcpy: invalid direction value");
1588
1532
  }
@@ -1680,11 +1624,7 @@ namespace dpct
1680
1624
  {
1681
1625
  if (ptr)
1682
1626
  {
1683
- #ifdef DPCT_USM_LEVEL_NONE
1684
- detail::mem_mgr::instance().mem_free(ptr);
1685
- #else
1686
1627
  sycl::free(ptr, q.get_context());
1687
- #endif // DPCT_USM_LEVEL_NONE
1688
1628
  }
1689
1629
  }
1690
1630
 
@@ -1692,11 +1632,7 @@ namespace dpct
1692
1632
  inline auto get_memory(const void *x)
1693
1633
  {
1694
1634
  T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
1695
- #ifdef DPCT_USM_LEVEL_NONE
1696
- return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
1697
- #else
1698
1635
  return new_x;
1699
- #endif
1700
1636
  }
1701
1637
 
1702
1638
  template <typename T>
@@ -2148,72 +2084,8 @@ namespace dpct
2148
2084
  {
2149
2085
  if (!size)
2150
2086
  return sycl::event{};
2151
- #ifdef DPCT_USM_LEVEL_NONE
2152
- auto &mm = mem_mgr::instance();
2153
- auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
2154
-
2155
- switch (real_direction)
2156
- {
2157
- case host_to_host:
2158
- return q.submit([&](sycl::handler &cgh)
2159
- {
2160
- cgh.depends_on(dep_events);
2161
- cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
2162
- case host_to_device:
2163
- {
2164
- auto alloc = mm.translate_ptr(to_ptr);
2165
- size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
2166
- return q.submit([&](sycl::handler &cgh)
2167
- {
2168
- cgh.depends_on(dep_events);
2169
- auto r = sycl::range<1>(size);
2170
- auto o = sycl::id<1>(offset);
2171
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
2172
- sycl::access::target::device>
2173
- acc(alloc.buffer, cgh, r, o);
2174
- cgh.copy(from_ptr, acc); });
2175
- }
2176
- case device_to_host:
2177
- {
2178
- auto alloc = mm.translate_ptr(from_ptr);
2179
- size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
2180
- return q.submit([&](sycl::handler &cgh)
2181
- {
2182
- cgh.depends_on(dep_events);
2183
- auto r = sycl::range<1>(size);
2184
- auto o = sycl::id<1>(offset);
2185
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
2186
- sycl::access::target::device>
2187
- acc(alloc.buffer, cgh, r, o);
2188
- cgh.copy(acc, to_ptr); });
2189
- }
2190
- case device_to_device:
2191
- {
2192
- auto to_alloc = mm.translate_ptr(to_ptr);
2193
- auto from_alloc = mm.translate_ptr(from_ptr);
2194
- size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
2195
- size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
2196
- return q.submit([&](sycl::handler &cgh)
2197
- {
2198
- cgh.depends_on(dep_events);
2199
- auto r = sycl::range<1>(size);
2200
- auto to_o = sycl::id<1>(to_offset);
2201
- auto from_o = sycl::id<1>(from_offset);
2202
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
2203
- sycl::access::target::device>
2204
- to_acc(to_alloc.buffer, cgh, r, to_o);
2205
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
2206
- sycl::access::target::device>
2207
- from_acc(from_alloc.buffer, cgh, r, from_o);
2208
- cgh.copy(from_acc, to_acc); });
2209
- }
2210
- default:
2211
- throw std::runtime_error("dpct_memcpy: invalid direction value");
2212
- }
2213
- #else
2214
2087
  return q.memcpy(to_ptr, from_ptr, size, dep_events);
2215
2088
  GGML_UNUSED(direction);
2216
- #endif // DPCT_USM_LEVEL_NONE
2217
2089
  }
2218
2090
 
2219
2091
  // Get actual copy range and make sure it will not exceed range.
@@ -2353,34 +2225,6 @@ namespace dpct
2353
2225
  break;
2354
2226
  }
2355
2227
  case device_to_device:
2356
- #ifdef DPCT_USM_LEVEL_NONE
2357
- {
2358
- auto &mm = mem_mgr::instance();
2359
- auto to_alloc = mm.translate_ptr(to_surface);
2360
- auto from_alloc = mm.translate_ptr(from_surface);
2361
- size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
2362
- size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
2363
- event_list.push_back(q.submit([&](sycl::handler &cgh)
2364
- {
2365
- cgh.depends_on(dep_events);
2366
- auto to_o = sycl::id<1>(to_offset);
2367
- auto from_o = sycl::id<1>(from_offset);
2368
- sycl::accessor<byte_t, 1, sycl::access_mode::write,
2369
- sycl::access::target::device>
2370
- to_acc(to_alloc.buffer, cgh,
2371
- get_copy_range(size, to_slice, to_range.get(0)), to_o);
2372
- sycl::accessor<byte_t, 1, sycl::access_mode::read,
2373
- sycl::access::target::device>
2374
- from_acc(from_alloc.buffer, cgh,
2375
- get_copy_range(size, from_slice, from_range.get(0)), from_o);
2376
- cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
2377
- size,
2378
- [=](sycl::id<3> id) {
2379
- to_acc[get_offset(id, to_slice, to_range.get(0))] =
2380
- from_acc[get_offset(id, from_slice, from_range.get(0))];
2381
- }); }));
2382
- }
2383
- #else
2384
2228
  event_list.push_back(q.submit([&](sycl::handler &cgh)
2385
2229
  {
2386
2230
  cgh.depends_on(dep_events);
@@ -2390,7 +2234,6 @@ namespace dpct
2390
2234
  to_surface[get_offset(id, to_slice, to_range.get(0))] =
2391
2235
  from_surface[get_offset(id, from_slice, from_range.get(0))];
2392
2236
  }); }));
2393
- #endif
2394
2237
  break;
2395
2238
  default:
2396
2239
  throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2581,9 +2424,6 @@ namespace dpct
2581
2424
  void *c[], library_data_t c_type, int ldc,
2582
2425
  int batch_size, library_data_t scaling_type)
2583
2426
  {
2584
- #ifdef DPCT_USM_LEVEL_NONE
2585
- throw std::runtime_error("this API is unsupported when USM level is none");
2586
- #else
2587
2427
  if (scaling_type == library_data_t::real_float &&
2588
2428
  c_type == library_data_t::complex_float)
2589
2429
  {
@@ -2718,7 +2558,6 @@ namespace dpct
2718
2558
  default:
2719
2559
  throw std::runtime_error("the combination of data type is unsupported");
2720
2560
  }
2721
- #endif
2722
2561
  }
2723
2562
 
2724
2563
  /// Computes a batch of matrix-matrix product with general matrices.
@@ -3057,24 +2896,9 @@ namespace dpct
3057
2896
  template <size_t D = Dimension>
3058
2897
  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
3059
2898
  init();
3060
- #ifdef DPCT_USM_LEVEL_NONE
3061
- return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
3062
- _device_ptr)
3063
- .template get_access<sycl::access_mode::read_write>()[index];
3064
- #else
3065
2899
  return _device_ptr[index];
3066
- #endif // DPCT_USM_LEVEL_NONE
3067
2900
  }
3068
2901
 
3069
- #ifdef DPCT_USM_LEVEL_NONE
3070
- /// Get sycl::accessor for the device memory object when usm is not used.
3071
- accessor_t get_access(sycl::handler &cgh) {
3072
- return get_buffer(_device_ptr)
3073
- .template reinterpret<T, Dimension>(_range)
3074
- .template get_access<detail::memory_traits<Memory, T>::mode,
3075
- detail::memory_traits<Memory, T>::target>(cgh);
3076
- }
3077
- #else
3078
2902
  /// Get dpct::accessor with dimension info for the device memory object
3079
2903
  /// when usm is used and dimension is greater than 1.
3080
2904
  template <size_t D = Dimension>
@@ -3082,7 +2906,6 @@ namespace dpct
3082
2906
  get_access(sycl::handler &cgh) {
3083
2907
  return dpct_accessor_t((T *)_device_ptr, _range);
3084
2908
  }
3085
- #endif // DPCT_USM_LEVEL_NONE
3086
2909
 
3087
2910
  private:
3088
2911
  device_memory(value_t *memory_ptr, size_t size)
@@ -3127,15 +2950,6 @@ namespace dpct
3127
2950
 
3128
2951
  /// Default constructor
3129
2952
  device_memory() : base(1) {}
3130
-
3131
- #ifdef DPCT_USM_LEVEL_NONE
3132
- /// Get sycl::accessor for the device memory object when usm is not used.
3133
- accessor_t get_access(sycl::handler &cgh) {
3134
- auto buf = get_buffer(base::get_ptr())
3135
- .template reinterpret<T, 1>(sycl::range<1>(1));
3136
- return accessor_t(buf, cgh);
3137
- }
3138
- #endif // DPCT_USM_LEVEL_NONE
3139
2953
  };
3140
2954
  } // namespace detail
3141
2955
 
@@ -3154,7 +2968,7 @@ namespace dpct
3154
2968
  #include "ggml-common.h"
3155
2969
 
3156
2970
  static int g_ggml_sycl_debug=0;
3157
- #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
2971
+ #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
3158
2972
 
3159
2973
  #define CHECK_TRY_ERROR(expr) \
3160
2974
  [&]() { \
@@ -3202,6 +3016,11 @@ static int g_work_group_size = 0;
3202
3016
  #define GGML_SYCL_MMV_Y 1
3203
3017
  #endif
3204
3018
 
3019
+ enum ggml_sycl_backend_gpu_mode {
3020
+ SYCL_UNSET_GPU_MODE = -1,
3021
+ SYCL_SINGLE_GPU_MODE = 0,
3022
+ SYCL_MUL_GPU_MODE
3023
+ };
3205
3024
 
3206
3025
  static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
3207
3026
 
@@ -3401,12 +3220,31 @@ class sycl_gpu_mgr {
3401
3220
  int work_group_size = 0;
3402
3221
  std::string gpus_list = "";
3403
3222
 
3223
+ /*
3224
+ Use all GPUs with same top max compute units
3225
+ */
3404
3226
  sycl_gpu_mgr() {
3405
3227
  detect_sycl_gpu_list_with_max_cu();
3406
3228
  get_allow_gpus();
3407
3229
  create_context_with_gpus();
3408
3230
  }
3409
3231
 
3232
+ /*
3233
+ Only use the assigned GPU
3234
+ */
3235
+ sycl_gpu_mgr(int main_gpu_id) {
3236
+ sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
3237
+ dpct::device_info prop;
3238
+ dpct::get_device_info(prop, device);
3239
+ gpus.push_back(main_gpu_id);
3240
+ devices.push_back(device);
3241
+ work_group_size = prop.get_max_work_group_size();
3242
+ max_compute_units = prop.get_max_compute_units();
3243
+
3244
+ get_allow_gpus();
3245
+ create_context_with_gpus();
3246
+ }
3247
+
3410
3248
  void create_context_with_gpus() {
3411
3249
  sycl::context ctx = sycl::context(devices);
3412
3250
  assert(gpus.size() > 0);
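Note: this hunk adds a second sycl_gpu_mgr constructor that pins the manager to a single device id, alongside the existing default constructor that collects every GPU sharing the top max-compute-units value; the SYCL_SINGLE_GPU_MODE / SYCL_MUL_GPU_MODE enum added earlier records which mode is active. The helper below is a hypothetical in-file sketch of how a caller could switch between the two constructors; the function itself is an assumption, not something added by this release.

    // Hypothetical sketch inside ggml-sycl.cpp: pick single- or multi-GPU mode.
    static void select_gpu_mode(bool single_gpu, int main_gpu_id) {
        delete g_sycl_gpu_mgr;                                   // drop any previous manager
        if (single_gpu) {
            g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);      // only the assigned GPU
            g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
        } else {
            g_sycl_gpu_mgr = new sycl_gpu_mgr();                 // all GPUs with the top max compute units
            g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
        }
    }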
@@ -3422,7 +3260,7 @@ class sycl_gpu_mgr {
3422
3260
  gpus_list += std::to_string(gpus[i]);
3423
3261
  gpus_list += ",";
3424
3262
  }
3425
- if (gpus_list.length() > 2) {
3263
+ if (gpus_list.length() > 1) {
3426
3264
  gpus_list.pop_back();
3427
3265
  }
3428
3266
  }
@@ -3471,8 +3309,8 @@ class sycl_gpu_mgr {
3471
3309
  if (gpus[i] == id)
3472
3310
  return i;
3473
3311
  }
3474
- assert(false);
3475
- return -1;
3312
+ printf("miss to get device index by id=%d\n", id);
3313
+ GGML_ASSERT(false);
3476
3314
  }
3477
3315
 
3478
3316
  int get_next_index(int id) {
@@ -3481,8 +3319,7 @@ class sycl_gpu_mgr {
3481
3319
  if (gpus[i] == id)
3482
3320
  return i;
3483
3321
  }
3484
- assert(false);
3485
- return -1;
3322
+ GGML_ASSERT(false);
3486
3323
  }
3487
3324
 
3488
3325
  bool is_ext_oneapi_device(const sycl::device &dev) {
@@ -3500,11 +3337,14 @@ static int g_device_count = -1;
3500
3337
  static int g_all_sycl_device_count = -1;
3501
3338
  static int g_main_device = -1;
3502
3339
  static int g_main_device_id = -1;
3340
+ static bool g_ggml_backend_sycl_buffer_type_initialized = false;
3503
3341
 
3504
3342
  static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
3505
3343
 
3506
3344
  static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
3507
3345
 
3346
+ static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
3347
+
3508
3348
  struct sycl_device_capabilities {
3509
3349
  int cc; // compute capability
3510
3350
  bool vmm; // virtual memory support
@@ -8239,7 +8079,7 @@ template <bool need_check> static void
8239
8079
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
8240
8080
  static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
8241
8081
  const sycl::nd_item<3> &item_ct1,
8242
- const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
8082
+ const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
8243
8083
  const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
8244
8084
  item_ct1.get_local_id(1);
8245
8085
 
@@ -10116,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
10116
9956
  dpct::queue_ptr stream) {
10117
9957
  const int nb = k / QK_K;
10118
9958
  {
10119
- iq2xxs_grid.init(*stream);
10120
- ksigns_iq2xs.init(*stream);
10121
- kmask_iq2xs.init(*stream);
10122
9959
 
10123
9960
  dpct::has_capability_or_fail(stream->get_device(),
10124
9961
  {sycl::aspect::fp16});
10125
9962
 
10126
9963
  stream->submit([&](sycl::handler &cgh) {
10127
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
10128
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10129
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
9964
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
9965
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
9966
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10130
9967
 
10131
9968
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10132
9969
  sycl::range<3>(1, 1, 32),
@@ -10145,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
10145
9982
  dpct::queue_ptr stream) {
10146
9983
  const int nb = k / QK_K;
10147
9984
  {
10148
- iq2xs_grid.init(*stream);
10149
- ksigns_iq2xs.init(*stream);
10150
- kmask_iq2xs.init(*stream);
10151
9985
 
10152
9986
  dpct::has_capability_or_fail(stream->get_device(),
10153
9987
  {sycl::aspect::fp16});
10154
9988
 
10155
9989
  stream->submit([&](sycl::handler &cgh) {
10156
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
10157
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10158
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
9990
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
9991
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
9992
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10159
9993
 
10160
9994
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10161
9995
  sycl::range<3>(1, 1, 32),
@@ -10174,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
10174
10008
  dpct::queue_ptr stream) {
10175
10009
  const int nb = k / QK_K;
10176
10010
  {
10177
- iq3xxs_grid.init(*stream);
10178
- ksigns_iq2xs.init(*stream);
10179
- kmask_iq2xs.init(*stream);
10180
10011
 
10181
10012
  dpct::has_capability_or_fail(stream->get_device(),
10182
10013
  {sycl::aspect::fp16});
10183
10014
 
10184
10015
  stream->submit([&](sycl::handler &cgh) {
10185
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10186
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10187
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10016
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
10017
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10018
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10188
10019
 
10189
10020
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10190
10021
  sycl::range<3>(1, 1, 32),
@@ -10203,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
10203
10034
  dpct::queue_ptr stream) {
10204
10035
  const int nb = k / QK_K;
10205
10036
  {
10206
- iq3s_grid.init(*stream);
10207
- ksigns_iq2xs.init(*stream);
10208
- kmask_iq2xs.init(*stream);
10209
10037
 
10210
10038
  dpct::has_capability_or_fail(stream->get_device(),
10211
10039
  {sycl::aspect::fp16});
10212
10040
 
10213
10041
  stream->submit([&](sycl::handler &cgh) {
10214
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10215
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10216
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10042
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
10043
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10044
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10217
10045
 
10218
10046
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10219
10047
  sycl::range<3>(1, 1, 32),
@@ -10232,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
10232
10060
  dpct::queue_ptr stream) {
10233
10061
  const int nb = k / QK_K;
10234
10062
  {
10235
- iq1s_grid_gpu.init(*stream);
10236
- ksigns_iq2xs.init(*stream);
10237
- kmask_iq2xs.init(*stream);
10238
10063
 
10239
10064
  dpct::has_capability_or_fail(stream->get_device(),
10240
10065
  {sycl::aspect::fp16});
10241
10066
 
10242
10067
  stream->submit([&](sycl::handler &cgh) {
10243
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10244
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10245
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10068
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
10069
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10070
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10246
10071
 
10247
10072
  cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
10248
10073
  sycl::range<3>(1, 1, 32),
@@ -10575,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
10575
10400
  const sycl::range<3> block_nums(1, 1, block_num_y);
10576
10401
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10577
10402
  {
10578
- iq3xxs_grid.init(*stream);
10579
- ksigns64.init(*stream);
10580
10403
 
10581
10404
  stream->submit([&](sycl::handler &cgh) {
10582
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10583
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10584
10405
 
10585
10406
  cgh.parallel_for(
10586
10407
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10588,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
10588
10409
  [[intel::reqd_sub_group_size(32)]] {
10589
10410
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
10590
10411
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
10591
- vx, vy, dst, ncols, nrows, item_ct1,
10592
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10412
+ vx, vy, dst, ncols, nrows, item_ct1);
10593
10413
  });
10594
10414
  });
10595
10415
  }
@@ -10604,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
10604
10424
  const sycl::range<3> block_nums(1, 1, block_num_y);
10605
10425
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10606
10426
  {
10607
- iq3xxs_grid.init(*stream);
10608
- ksigns64.init(*stream);
10609
10427
 
10610
10428
  stream->submit([&](sycl::handler &cgh) {
10611
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10612
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10613
10429
 
10614
10430
  cgh.parallel_for(
10615
10431
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10617,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
10617
10433
  [[intel::reqd_sub_group_size(32)]] {
10618
10434
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
10619
10435
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
10620
- vx, vy, dst, ncols, nrows, item_ct1,
10621
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10436
+ vx, vy, dst, ncols, nrows, item_ct1);
10622
10437
  });
10623
10438
  });
10624
10439
  }
@@ -10633,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
10633
10448
  const sycl::range<3> block_nums(1, 1, block_num_y);
10634
10449
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10635
10450
  {
10636
- iq3xxs_grid.init(*stream);
10637
- ksigns64.init(*stream);
10638
10451
 
10639
10452
  stream->submit([&](sycl::handler &cgh) {
10640
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10641
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10642
10453
 
10643
10454
  cgh.parallel_for(
10644
10455
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10646,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
10646
10457
  [[intel::reqd_sub_group_size(32)]] {
10647
10458
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
10648
10459
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
10649
- vx, vy, dst, ncols, nrows, item_ct1,
10650
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10460
+ vx, vy, dst, ncols, nrows, item_ct1);
10651
10461
  });
10652
10462
  });
10653
10463
  }
@@ -10662,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
10662
10472
  const sycl::range<3> block_nums(1, 1, block_num_y);
10663
10473
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10664
10474
  {
10665
- iq3xxs_grid.init(*stream);
10666
- ksigns64.init(*stream);
10667
10475
 
10668
10476
  stream->submit([&](sycl::handler &cgh) {
10669
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10670
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10671
10477
 
10672
10478
  cgh.parallel_for(
10673
10479
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10675,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
10675
10481
  [[intel::reqd_sub_group_size(32)]] {
10676
10482
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
10677
10483
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
10678
- vx, vy, dst, ncols, nrows, item_ct1,
10679
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10484
+ vx, vy, dst, ncols, nrows, item_ct1);
10680
10485
  });
10681
10486
  });
10682
10487
  }
@@ -10691,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
10691
10496
  const sycl::range<3> block_nums(1, 1, block_num_y);
10692
10497
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10693
10498
  {
10694
- iq3xxs_grid.init(*stream);
10695
- ksigns64.init(*stream);
10696
10499
 
10697
10500
  stream->submit([&](sycl::handler &cgh) {
10698
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10699
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10700
10501
 
10701
10502
  cgh.parallel_for(
10702
10503
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10704,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
10704
10505
  [[intel::reqd_sub_group_size(32)]] {
10705
10506
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
10706
10507
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
10707
- vx, vy, dst, ncols, nrows, item_ct1,
10708
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10508
+ vx, vy, dst, ncols, nrows, item_ct1);
10709
10509
  });
10710
10510
  });
10711
10511
  }
@@ -10720,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
10720
10520
  const sycl::range<3> block_nums(1, 1, block_num_y);
10721
10521
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10722
10522
  {
10723
- iq3xxs_grid.init(*stream);
10724
- ksigns64.init(*stream);
10725
10523
 
10726
10524
  stream->submit([&](sycl::handler &cgh) {
10727
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10728
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10729
10525
 
10730
10526
  cgh.parallel_for(
10731
10527
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10733,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
10733
10529
  [[intel::reqd_sub_group_size(32)]] {
10734
10530
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
10735
10531
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
10736
- vx, vy, dst, ncols, nrows, item_ct1,
10737
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10532
+ vx, vy, dst, ncols, nrows, item_ct1);
10738
10533
  });
10739
10534
  });
10740
10535
  }
@@ -10749,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
10749
10544
  const sycl::range<3> block_nums(1, 1, block_num_y);
10750
10545
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10751
10546
  {
10752
- iq3xxs_grid.init(*stream);
10753
- ksigns64.init(*stream);
10754
10547
 
10755
10548
  stream->submit([&](sycl::handler &cgh) {
10756
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10757
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10758
10549
 
10759
10550
  cgh.parallel_for(
10760
10551
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10762,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
10762
10553
  [[intel::reqd_sub_group_size(32)]] {
10763
10554
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
10764
10555
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
10765
- vx, vy, dst, ncols, nrows, item_ct1,
10766
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10556
+ vx, vy, dst, ncols, nrows, item_ct1);
10767
10557
  });
10768
10558
  });
10769
10559
  }
@@ -10778,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
10778
10568
  const sycl::range<3> block_nums(1, 1, block_num_y);
10779
10569
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10780
10570
  {
10781
- iq3xxs_grid.init(*stream);
10782
- ksigns64.init(*stream);
10783
10571
 
10784
10572
  stream->submit([&](sycl::handler &cgh) {
10785
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10786
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10787
10573
 
10788
10574
  cgh.parallel_for(
10789
10575
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10791,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
10791
10577
  [[intel::reqd_sub_group_size(32)]] {
10792
10578
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
10793
10579
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
10794
- vx, vy, dst, ncols, nrows, item_ct1,
10795
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10580
+ vx, vy, dst, ncols, nrows, item_ct1);
10796
10581
  });
10797
10582
  });
10798
10583
  }
@@ -10807,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
10807
10592
  const sycl::range<3> block_nums(1, 1, block_num_y);
10808
10593
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10809
10594
  {
10810
- iq3xxs_grid.init(*stream);
10811
- ksigns64.init(*stream);
10812
10595
 
10813
10596
  stream->submit([&](sycl::handler &cgh) {
10814
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10815
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10816
10597
 
10817
10598
  cgh.parallel_for(
10818
10599
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10820,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
10820
10601
  [[intel::reqd_sub_group_size(32)]] {
10821
10602
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
10822
10603
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
10823
- vx, vy, dst, ncols, nrows, item_ct1,
10824
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10604
+ vx, vy, dst, ncols, nrows, item_ct1);
10825
10605
  });
10826
10606
  });
10827
10607
  }
@@ -10836,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
10836
10616
  const sycl::range<3> block_nums(1, 1, block_num_y);
10837
10617
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10838
10618
  {
10839
- iq3xxs_grid.init(*stream);
10840
- ksigns64.init(*stream);
10841
10619
 
10842
10620
  stream->submit([&](sycl::handler &cgh) {
10843
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10844
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10845
10621
 
10846
10622
  cgh.parallel_for(
10847
10623
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10849,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
10849
10625
  [[intel::reqd_sub_group_size(32)]] {
10850
10626
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
10851
10627
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
10852
- vx, vy, dst, ncols, nrows, item_ct1,
10853
- iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
10628
+ vx, vy, dst, ncols, nrows, item_ct1);
10854
10629
  });
10855
10630
  });
10856
10631
  }
10857
10632
  }
10858
10633
 
10634
+
10859
10635
  static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
10860
10636
  float *dst, const int ncols,
10861
10637
  const int nrows,
@@ -10865,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
10865
10641
  const sycl::range<3> block_nums(1, 1, block_num_y);
10866
10642
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10867
10643
  {
10868
- iq2xxs_grid.init(*stream);
10869
- ksigns_iq2xs.init(*stream);
10870
- kmask_iq2xs.init(*stream);
10871
-
10872
10644
 
10873
10645
  stream->submit([&](sycl::handler &cgh) {
10874
- auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
10875
- auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
10876
- auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
10646
+ auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
10647
+ auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
10648
+ auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
10877
10649
 
10878
10650
  cgh.parallel_for(
10879
10651
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10896,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
10896
10668
  const sycl::range<3> block_nums(1, 1, block_num_y);
10897
10669
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10898
10670
  {
10899
- iq2xs_grid.init(*stream);
10900
- ksigns64.init(*stream);
10901
10671
 
10902
10672
  stream->submit([&](sycl::handler &cgh) {
10903
- auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
10904
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10673
+ auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
10674
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10905
10675
 
10906
10676
  cgh.parallel_for(
10907
10677
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10924,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
10924
10694
  const sycl::range<3> block_nums(1, 1, block_num_y);
10925
10695
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10926
10696
  {
10927
- iq3xxs_grid.init(*stream);
10928
- ksigns64.init(*stream);
10929
10697
 
10930
10698
  stream->submit([&](sycl::handler &cgh) {
10931
- auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
10932
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10699
+ auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
10700
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10933
10701
 
10934
10702
  cgh.parallel_for(
10935
10703
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10952,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
10952
10720
  const sycl::range<3> block_nums(1, 1, block_num_y);
10953
10721
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10954
10722
  {
10955
- iq3s_grid.init(*stream);
10956
- ksigns64.init(*stream);
10957
10723
 
10958
10724
  stream->submit([&](sycl::handler &cgh) {
10959
- auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
10960
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10725
+ auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
10726
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10961
10727
 
10962
10728
  cgh.parallel_for(
10963
10729
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10980,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
10980
10746
  const sycl::range<3> block_nums(1, 1, block_num_y);
10981
10747
  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
10982
10748
  {
10983
- iq1s_grid_gpu.init(*stream);
10984
- ksigns64.init(*stream);
10985
10749
 
10986
10750
  stream->submit([&](sycl::handler &cgh) {
10987
- auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
10988
- auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
10751
+ auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
10752
+ auto ksigns64_ptr_ct1 = &ksigns64[0];
10989
10753
 
10990
10754
  cgh.parallel_for(
10991
10755
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -13008,37 +12772,57 @@ bool ggml_sycl_loaded(void) {
13008
12772
  return g_sycl_loaded;
13009
12773
  }
13010
12774
 
13011
- void print_device_detail(int id) {
12775
+ void print_device_detail(int id, sycl::device &device, std::string device_type) {
12776
+
13012
12777
  dpct::device_info prop;
13013
12778
  SYCL_CHECK(CHECK_TRY_ERROR(
13014
- dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
13015
- sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
12779
+ dpct::get_device_info(prop, device)));
12780
+
13016
12781
  std::string version;
13017
12782
  version += std::to_string(prop.get_major_version());
13018
12783
  version += ".";
13019
12784
  version += std::to_string(prop.get_minor_version());
13020
12785
 
13021
- fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
12786
+ device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
12787
+
12788
+ fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
13022
12789
  prop.get_name(), version.c_str(), prop.get_max_compute_units(),
13023
12790
  prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
13024
12791
  prop.get_global_mem_size());
13025
12792
  }
13026
12793
 
13027
12794
  void ggml_backend_sycl_print_sycl_devices() {
12795
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
13028
12796
  int device_count = dpct::dev_mgr::instance().device_count();
12797
+ std::map<std::string, size_t> DeviceNums;
13029
12798
  fprintf(stderr, "found %d SYCL devices:\n", device_count);
13030
- fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
13031
- fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
12799
+ fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
12800
+ fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
12801
+ fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
13032
12802
  for (int id = 0; id < device_count; ++id) {
13033
- print_device_detail(id);
12803
+ sycl::device device = dpct::dev_mgr::instance().get_device(id);
12804
+ sycl::backend backend = device.get_backend();
12805
+ std::string backend_type = get_device_backend_and_type(device);
12806
+ int type_id=DeviceNums[backend_type]++;
12807
+ std::stringstream device_type;
12808
+ device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
12809
+ print_device_detail(id, device, device_type.str());
13034
12810
  }
13035
12811
  }
13036
12812
 
13037
12813
  void print_gpu_device_list() {
13038
- fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
13039
- g_sycl_gpu_mgr->get_gpu_count(),
13040
- g_sycl_gpu_mgr->gpus_list.c_str(),
13041
- g_sycl_gpu_mgr->max_compute_units);
12814
+ GGML_ASSERT(g_sycl_gpu_mgr);
12815
+
12816
+ char* hint=NULL;
12817
+ if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
12818
+ hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
12819
+ } else {
12820
+ hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
12821
+ }
12822
+ fprintf(stderr, hint,
12823
+ g_sycl_gpu_mgr->get_gpu_count(),
12824
+ g_sycl_gpu_mgr->gpus_list.c_str(),
12825
+ g_sycl_gpu_mgr->max_compute_units);
13042
12826
  }
13043
12827
 
13044
12828
  int get_sycl_env(const char *env_name, int default_val) {
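Note: the device listing now prints a per-backend ordinal next to each device: a std::map counts how many devices of each backend:type label have been seen, and that label (with the "ext_oneapi_" prefix stripped) plus the counter becomes the "Device Type" column, for example "[level_zero:gpu:0]", "[level_zero:gpu:1]", "[opencl:cpu:0]". A minimal sketch of that numbering scheme, illustrative only:

    #include <map>
    #include <sstream>
    #include <string>

    // Illustrative only: build per-backend ordinals the way the new listing code does.
    std::string device_tag(std::map<std::string, size_t> &nums, const std::string &backend_type) {
        std::ostringstream tag;
        tag << "[" << backend_type << ":" << nums[backend_type]++ << "]";
        return tag.str();
    }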
@@ -13062,11 +12846,13 @@ int get_work_group_size(int user_device_id) {
13062
12846
  return prop.get_max_work_group_size();
13063
12847
  }
13064
12848
 
13065
- void ggml_init_sycl() try {
12849
+ static void ggml_init_sycl() try {
13066
12850
  static bool initialized = false;
13067
12851
 
13068
12852
  if (!initialized) {
12853
+ fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
13069
12854
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
12855
+
13070
12856
  fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
13071
12857
 
13072
12858
  #if defined(GGML_SYCL_F16)
@@ -13074,6 +12860,15 @@ void ggml_init_sycl() try {
13074
12860
  #else
13075
12861
  fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
13076
12862
  #endif
12863
+
12864
+ /* NOT REMOVE, keep it for next optimize for XMX.
12865
+ #if defined(SYCL_USE_XMX)
12866
+ fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
12867
+ #else
12868
+ fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
12869
+ #endif
12870
+ */
12871
+
13077
12872
  if (CHECK_TRY_ERROR(g_all_sycl_device_count =
13078
12873
  dpct::dev_mgr::instance().device_count()) != 0) {
13079
12874
  initialized = true;
@@ -13082,68 +12877,65 @@ void ggml_init_sycl() try {
13082
12877
  }
13083
12878
  GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
13084
12879
  ggml_backend_sycl_print_sycl_devices();
12880
+ initialized = true;
12881
+ g_sycl_loaded = true;
12882
+ }
12883
+ }
12884
+ catch (sycl::exception const &exc) {
12885
+ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
12886
+ << ", line:" << __LINE__ << std::endl;
12887
+ std::exit(1);
12888
+ }
13085
12889
 
13086
- if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
13087
-
13088
- g_device_count = g_sycl_gpu_mgr->get_gpu_count();
13089
- g_work_group_size = g_sycl_gpu_mgr->work_group_size;
13090
-
13091
- print_gpu_device_list();
12890
+ void ggml_init_by_gpus(int device_count) try {
12891
+ g_device_count = device_count;
12892
+ g_work_group_size = g_sycl_gpu_mgr->work_group_size;
13092
12893
 
13093
- int64_t total_vram = 0;
12894
+ int64_t total_vram = 0;
13094
12895
 
13095
- /* NOT REMOVE, keep it for next optimize for XMX.
13096
- #if defined(SYCL_USE_XMX)
13097
- fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
13098
- #else
13099
- fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
13100
- #endif
13101
- */
13102
- for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
13103
- g_device_caps[id].vmm = 0;
13104
- g_device_caps[id].device_id = -1;
13105
- g_device_caps[id].cc = 0;
13106
- g_tensor_split[id] = 0;
13107
- g_default_tensor_split[id] = 0;
13108
- }
12896
+ print_gpu_device_list();
13109
12897
 
13110
- for (int i = 0; i < g_device_count; ++i) {
13111
- int device_id = g_sycl_gpu_mgr->gpus[i];
13112
- g_device_caps[i].vmm = 0;
12898
+ for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
12899
+ g_device_caps[id].vmm = 0;
12900
+ g_device_caps[id].device_id = -1;
12901
+ g_device_caps[id].cc = 0;
12902
+ g_tensor_split[id] = 0;
12903
+ g_default_tensor_split[id] = 0;
12904
+ }
13113
12905
 
13114
- dpct::device_info prop;
13115
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
13116
- prop, dpct::dev_mgr::instance().get_device(device_id))));
12906
+ for (int i = 0; i < g_device_count; ++i) {
12907
+ int device_id = g_sycl_gpu_mgr->gpus[i];
12908
+ g_device_caps[i].vmm = 0;
13117
12909
 
13118
- g_default_tensor_split[i] = total_vram;
13119
- total_vram += prop.get_global_mem_size();
12910
+ dpct::device_info prop;
12911
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
12912
+ prop, dpct::dev_mgr::instance().get_device(device_id))));
13120
12913
 
13121
- g_device_caps[i].cc =
13122
- 100 * prop.get_major_version() + 10 * prop.get_minor_version();
13123
- }
12914
+ g_default_tensor_split[i] = total_vram;
12915
+ total_vram += prop.get_global_mem_size();
13124
12916
 
13125
- for (int i = 0; i < g_device_count; ++i) {
13126
- g_default_tensor_split[i] /= total_vram;
13127
- }
12917
+ g_device_caps[i].cc =
12918
+ 100 * prop.get_major_version() + 10 * prop.get_minor_version();
12919
+ }
13128
12920
 
13129
- for (int i = 0; i < g_device_count; ++i) {
13130
- SYCL_CHECK(ggml_sycl_set_device(i));
12921
+ for (int i = 0; i < g_device_count; ++i) {
12922
+ g_default_tensor_split[i] /= total_vram;
12923
+ }
13131
12924
 
13132
- // create sycl streams
13133
- for (int is = 0; is < MAX_STREAMS; ++is) {
13134
- SYCL_CHECK(CHECK_TRY_ERROR(
13135
- g_syclStreams[i][is] =
13136
- dpct::get_current_device().create_queue(
13137
- g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
13138
- }
12925
+ for (int i = 0; i < g_device_count; ++i) {
12926
+ SYCL_CHECK(ggml_sycl_set_device(i));
13139
12927
 
13140
- const dpct::queue_ptr stream = g_syclStreams[i][0];
13141
- // create sycl handle
13142
- SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
12928
+ // create sycl streams
12929
+ for (int is = 0; is < MAX_STREAMS; ++is) {
12930
+ SYCL_CHECK(CHECK_TRY_ERROR(
12931
+ g_syclStreams[i][is] =
12932
+ dpct::get_current_device().create_queue(
12933
+ g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
13143
12934
  }
13144
12935
 
13145
- initialized = true;
13146
- g_sycl_loaded = true;
12936
+ const dpct::queue_ptr stream = g_syclStreams[i][0];
12937
+ // create sycl handle
12938
+ SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
13147
12939
  }
13148
12940
  }
13149
12941
  catch (sycl::exception const &exc) {
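Note: initialization is split in this release: ggml_init_sycl() (now static) only enumerates and prints the SYCL devices, while the new ggml_init_by_gpus() fills the per-device capabilities, tensor splits, streams and handles once a GPU manager exists. Below is a hypothetical in-file driver showing the resulting call order; the wrapper function itself is an assumption, not part of the diff.

    // Hypothetical sketch: call order of the two init stages after the split.
    static void init_backend_for_all_gpus() {
        ggml_init_sycl();                                       // enumerate + print devices (runs once)
        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
        ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());     // streams, handles, tensor splits
    }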
@@ -15121,6 +14913,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15121
14913
  SYCL_CHECK(ggml_sycl_set_device(g_main_device));
15122
14914
  dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
15123
14915
 
14916
+ bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
14917
+ main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
14918
+
15124
14919
  SYCL_CHECK(
15125
14920
  CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
15126
14921
 
@@ -15151,24 +14946,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15151
14946
 
15152
14947
  dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
15153
14948
  dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
14949
+ if (no_mixed_dtypes) {
14950
+ cu_compute_type = dpct::library_data_t::real_half;
14951
+ cu_data_type = dpct::library_data_t::real_half;
14952
+ }
15154
14953
 
15155
14954
  // dst strides
15156
14955
  size_t nbd2 = dst->nb[2];
15157
14956
  size_t nbd3 = dst->nb[3];
15158
14957
 
14958
+ const float alpha_f32 = 1.0f;
14959
+ const float beta_f32 = 0.0f;
14960
+
15159
14961
  const sycl::half alpha_f16 = 1.0f;
15160
14962
  const sycl::half beta_f16 = 0.0f;
15161
14963
 
15162
- const float alpha_f32 = 1.0f;
15163
- const float beta_f32 = 0.0f;
15164
-
15165
14964
  const void * alpha = &alpha_f32;
15166
14965
  const void * beta = &beta_f32;
14966
+ if (no_mixed_dtypes) {
14967
+ alpha = &alpha_f16;
14968
+ beta = &beta_f16;
14969
+ }
15167
14970
 
15168
14971
  // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
15169
- // oneMKL open source supports half, half, float, float: datatypes
14972
+ // when oneMKL open source supports half, half, float, float: datatypes
15170
14973
 
15171
14974
  dst_t = (char *) dst_ddf;
14975
+ if (no_mixed_dtypes) {
14976
+ dst_t = (char *) dst_f16.alloc(ne_dst);
14977
+
14978
+ nbd2 /= sizeof(float) / sizeof(sycl::half);
14979
+ nbd3 /= sizeof(float) / sizeof(sycl::half);
14980
+ }
15172
14981
 
15173
14982
  GGML_ASSERT(ne12 % ne02 == 0);
15174
14983
  GGML_ASSERT(ne13 % ne03 == 0);
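Note: ggml_sycl_mul_mat_batched_sycl now checks whether the queue runs on the CUDA or HIP SYCL backends; the no_mixed_dtypes flag indicates those backends cannot use the mixed fp16-input/fp32-output combination used elsewhere, so the compute/data types and alpha/beta switch to half and the result is staged in an fp16 buffer that the next hunk converts back to fp32. A self-contained sketch of the scaling-factor selection, illustrative only (the function name and signature are assumptions):

    #include <sycl/sycl.hpp>

    // Illustrative only: choose fp16 or fp32 GEMM scaling factors per backend.
    static void pick_scaling(const sycl::queue &q, const void *&alpha, const void *&beta) {
        static const float      alpha_f32 = 1.0f, beta_f32 = 0.0f;
        static const sycl::half alpha_f16 = 1.0f, beta_f16 = 0.0f;
        const bool no_mixed_dtypes = q.get_backend() == sycl::backend::ext_oneapi_cuda ||
                                     q.get_backend() == sycl::backend::ext_oneapi_hip;
        alpha = no_mixed_dtypes ? (const void *)&alpha_f16 : (const void *)&alpha_f32;
        beta  = no_mixed_dtypes ? (const void *)&beta_f16  : (const void *)&beta_f32;
    }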
@@ -15254,6 +15063,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
15254
15063
  }
15255
15064
  #endif
15256
15065
 
15066
+ if (no_mixed_dtypes) {
15067
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
15068
+ to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
15069
+ }
15257
15070
  }
15258
15071
  catch (sycl::exception const &exc) {
15259
15072
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -16153,6 +15966,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
16153
15966
  }
16154
15967
 
16155
15968
  GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
15969
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
16156
15970
  for(int i=0;i<max_len;i++) id_list[i] = -1;
16157
15971
 
16158
15972
  if (!g_sycl_gpu_mgr) {
@@ -16187,6 +16001,7 @@ catch (sycl::exception const &exc) {
16187
16001
 
16188
16002
  GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
16189
16003
  size_t description_size) try {
16004
+ GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
16190
16005
  dpct::device_info prop;
16191
16006
  int device_id = g_sycl_gpu_mgr->gpus[device];
16192
16007
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16201,6 +16016,7 @@ catch (sycl::exception const &exc) {
16201
16016
 
16202
16017
  GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
16203
16018
  size_t *total) try {
16019
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
16204
16020
  ggml_sycl_set_device(device);
16205
16021
 
16206
16022
  /*
@@ -16551,22 +16367,26 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
16551
16367
  /* .is_host = */ nullptr,
16552
16368
  };
16553
16369
 
16554
- ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
16555
- static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
16370
+ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
16371
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
16556
16372
 
16557
- static bool ggml_backend_sycl_buffer_type_initialized = false;
16373
+ if (device_index>=g_device_count or device_index<0) {
16374
+ printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
16375
+ device_index, g_device_count-1);
16376
+ GGML_ASSERT(device_index<g_device_count);
16377
+ }
16378
+ static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
16558
16379
 
16559
- if (!ggml_backend_sycl_buffer_type_initialized) {
16380
+ if (!g_ggml_backend_sycl_buffer_type_initialized) {
16560
16381
  for (int i = 0; i < g_device_count; i++) {
16561
16382
  ggml_backend_sycl_buffer_types[i] = {
16562
16383
  /* .iface = */ ggml_backend_sycl_buffer_type_interface,
16563
16384
  /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
16564
16385
  };
16565
16386
  }
16566
- ggml_backend_sycl_buffer_type_initialized = true;
16387
+ g_ggml_backend_sycl_buffer_type_initialized = true;
16567
16388
  }
16568
-
16569
- return &ggml_backend_sycl_buffer_types[device];
16389
+ return &ggml_backend_sycl_buffer_types[device_index];
16570
16390
  }
16571
16391
 
16572
16392
  // sycl split buffer type
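
The hunk above renames the parameter of ggml_backend_sycl_buffer_type to device_index, rejects out-of-range values against g_device_count, and promotes the initialization flag to a global (g_ggml_backend_sycl_buffer_type_initialized) so the buffer-type table can be rebuilt after the device mode changes (see the reset in the final hunk of this diff). A caller-side sketch, using the id-to-index helper that appears later in this diff; the concrete device id is hypothetical:

    int device_id    = 0;   // hypothetical raw SYCL GPU id, e.g. from ggml_sycl_get_gpu_list
    int device_index = ggml_backend_sycl_get_device_index(device_id);
    ggml_backend_buffer_type_t buft = ggml_backend_sycl_buffer_type(device_index);
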
@@ -16919,6 +16739,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
16919
16739
  };
16920
16740
 
16921
16741
  GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
16742
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
16743
+ ggml_init_sycl();
16922
16744
  // FIXME: this is not thread safe
16923
16745
  static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
16924
16746
 
@@ -16990,6 +16812,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
16990
16812
  }
16991
16813
 
16992
16814
  ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
16815
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
16993
16816
  static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
16994
16817
  /* .iface = */ {
16995
16818
  /* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
@@ -17104,7 +16927,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
17104
16927
  params.ith = 0;
17105
16928
  for (int i = 0; i < cgraph->n_nodes; i++) {
17106
16929
  ggml_tensor * node = cgraph->nodes[i];
17107
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
16930
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
17108
16931
  continue;
17109
16932
  }
17110
16933
  #ifndef NDEBUG
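
The hunk above makes graph execution skip empty tensors in addition to the no-op layout operations. ggml_is_empty is defined in ggml.c rather than in this file, so the following is only a sketch of the check it is expected to perform, inferred from its name and usage, not from this diff:

    // Illustrative only: an "empty" tensor is one with no elements,
    // i.e. at least one dimension of size 0, so there is nothing to compute.
    static bool is_empty_sketch(const struct ggml_tensor * t) {
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            if (t->ne[i] == 0) {
                return true;
            }
        }
        return false;
    }
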
@@ -17252,6 +17075,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
17252
17075
  UNUSED(backend);
17253
17076
  }
17254
17077
 
17078
+ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
17079
+ const int min_batch_size = 32;
17080
+ return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
17081
+ GGML_UNUSED(backend);
17082
+ }
17083
+
17084
+
17255
17085
  static ggml_backend_i ggml_backend_sycl_interface = {
17256
17086
  /* .get_name = */ ggml_backend_sycl_name,
17257
17087
  /* .free = */ ggml_backend_sycl_free,
@@ -17265,6 +17095,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
17265
17095
  /* .graph_plan_compute = */ NULL,
17266
17096
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17267
17097
  /* .supports_op = */ ggml_backend_sycl_supports_op,
17098
+ /* .offload_op = */ ggml_backend_sycl_offload_op,
17268
17099
  /* .event_new = */ NULL,
17269
17100
  /* .event_free = */ NULL,
17270
17101
  /* .event_record = */ NULL,
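
The two hunks above introduce an offload heuristic and register it in the backend vtable: when the scheduler asks whether an operation is worth offloading, the SYCL backend says yes only for batched work (second dimension of at least 32 rows) that is not a GGML_OP_GET_ROWS lookup, so single-token decode stays where its data already lives while prompt-sized batches are moved to the device. The same predicate, restated as a standalone helper with a hypothetical name:

    static bool worth_offloading(const struct ggml_tensor * node) {
        const int min_batch_size = 32;            // same threshold as ggml_backend_sycl_offload_op
        return node->ne[1] >= min_batch_size      // batched work only
            && node->op != GGML_OP_GET_ROWS;      // embedding lookups are never offloaded
    }

The trailing GGML_UNUSED(backend) in the hunk sits after the return and only silences the unused-parameter warning.
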
@@ -17278,7 +17109,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
17278
17109
  }
17279
17110
 
17280
17111
  GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
17281
- ggml_init_sycl(); // TODO: remove from ggml.c
17112
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
17113
+ ggml_init_sycl();
17282
17114
 
17283
17115
  check_allow_gpu_index(device);
17284
17116
 
@@ -17304,6 +17136,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
17304
17136
  }
17305
17137
 
17306
17138
  GGML_CALL int ggml_backend_sycl_get_device_count() {
17139
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
17307
17140
  if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
17308
17141
  return g_sycl_gpu_mgr->get_gpu_count();
17309
17142
  }
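
The final hunk of this diff (below) adds explicit single- and multi-GPU device modes: each mode rebuilds the GPU manager, re-runs per-GPU initialization, and clears g_ggml_backend_sycl_buffer_type_initialized so the buffer-type table is regenerated, and ggml_backend_sycl_reg_devices is reworked to go through the multi-device path. A hedged usage sketch, with the API names taken from the hunk below and the surrounding setup assumed on the caller's side:

    // Single-GPU run: pin the backend to one device; index 0 then refers to that device.
    ggml_backend_sycl_set_single_device_mode(main_gpu_id);   // main_gpu_id chosen by the caller
    ggml_backend_t backend = ggml_backend_sycl_init(0);

    // Multi-GPU run: rebuild the GPU list with all usable devices.
    ggml_backend_sycl_set_mul_device_mode();
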
@@ -17316,14 +17149,53 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
17316
17149
  }
17317
17150
 
17318
17151
  GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
17152
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
17319
17153
  return g_sycl_gpu_mgr->get_index(device_id);
17320
17154
  }
17321
17155
 
17156
+ GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
17157
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
17158
+ return g_sycl_gpu_mgr->gpus[device_index];
17159
+ }
17160
+
17161
+ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
17162
+ ggml_init_sycl();
17163
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
17164
+ fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
17165
+ GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
17166
+
17167
+ if (g_sycl_gpu_mgr) {
17168
+ delete g_sycl_gpu_mgr;
17169
+ }
17170
+ g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
17171
+ g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
17172
+ ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
17173
+ g_ggml_backend_sycl_buffer_type_initialized = false;
17174
+ }
17175
+
17176
+ GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
17177
+ ggml_init_sycl();
17178
+ GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
17179
+
17180
+ if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
17181
+ return;
17182
+ }
17183
+
17184
+ fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
17185
+
17186
+ if (g_sycl_gpu_mgr) {
17187
+ delete g_sycl_gpu_mgr;
17188
+ }
17189
+ g_sycl_gpu_mgr = new sycl_gpu_mgr();
17190
+ g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
17191
+ ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
17192
+ g_ggml_backend_sycl_buffer_type_initialized = false;
17193
+ }
17194
+
17322
17195
  extern "C" int ggml_backend_sycl_reg_devices();
17323
17196
 
17324
17197
  int ggml_backend_sycl_reg_devices() {
17325
- if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
17326
- g_device_count = g_sycl_gpu_mgr->get_gpu_count();
17198
+ ggml_backend_sycl_set_mul_device_mode();
17327
17199
  assert(g_device_count>0);
17328
17200
  for (int i = 0; i < g_device_count; i++) {
17329
17201
  int id = g_sycl_gpu_mgr->gpus[i];