llama_cpp 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +91 -21
- data/vendor/tmp/llama.cpp/ggml-alloc.c +14 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +155 -125
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1779 -10762
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +167 -124
- data/vendor/tmp/llama.cpp/ggml-metal.metal +603 -303
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +663 -56
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +341 -469
- data/vendor/tmp/llama.cpp/ggml-sycl.h +19 -4
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +335 -307
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +229 -107
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +2136 -464
- data/vendor/tmp/llama.cpp/llama.h +86 -23
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
--- a/data/vendor/tmp/llama.cpp/ggml-sycl.cpp
+++ b/data/vendor/tmp/llama.cpp/ggml-sycl.cpp
@@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
@@ -24,10 +25,9 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>
-
 #include <stdio.h>
 #include <stdlib.h>
-
+#include <regex>
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
 #define __dpct_noinline__ __attribute__((noinline))
 #endif
 
+
+std::string get_device_type_name(const sycl::device &Device) {
+    auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+    switch (DeviceType) {
+    case sycl::info::device_type::cpu:
+        return "cpu";
+    case sycl::info::device_type::gpu:
+        return "gpu";
+    case sycl::info::device_type::host:
+        return "host";
+    case sycl::info::device_type::accelerator:
+        return "acc";
+    default:
+        return "unknown";
+    }
+}
+
+std::string get_device_backend_and_type(const sycl::device &device) {
+    std::stringstream device_type;
+    sycl::backend backend = device.get_backend();
+    device_type << backend << ":" << get_device_type_name(device);
+    return device_type.str();
+}
+
 namespace dpct
 {
     typedef sycl::queue *queue_ptr;
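The two helpers added above compose a human-readable "backend:device_type" label for each SYCL device; the rest of the diff sorts and prints devices by that label. A minimal host-only sketch of the same labeling idea, using plain strings instead of SYCL enums (the backend name below is an illustrative value, not queried from a real device):

#include <iostream>
#include <sstream>
#include <string>

// Host-only analogue of get_device_backend_and_type(): compose a label from
// a backend name and a device-type name.
static std::string make_label(const std::string &backend, const std::string &type) {
    std::stringstream label;
    label << backend << ":" << type;
    return label.str();
}

int main() {
    std::cout << make_label("ext_oneapi_level_zero", "gpu") << "\n";
    // prints: ext_oneapi_level_zero:gpu
}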
@@ -716,11 +740,7 @@
 
         sycl::queue &default_queue()
         {
-#ifdef DPCT_USM_LEVEL_NONE
-            return out_of_order_queue();
-#else
             return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
         }
 
         void queues_wait_and_throw()
@@ -739,11 +759,7 @@
 
         sycl::queue *create_queue(bool enable_exception_handler = false)
        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return create_out_of_order_queue(enable_exception_handler);
-#else
             return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
         }
 
         sycl::queue *create_queue(sycl::context context, sycl::device device,
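With the DPCT_USM_LEVEL_NONE branches gone, default_queue() and create_queue() always hand back in-order queues. A minimal sketch of what an in-order SYCL queue buys (assumes a SYCL 2020 compiler such as DPC++; not part of the diff itself):

#include <sycl/sycl.hpp>

int main() {
    // Commands on an in-order queue execute in submission order, so no
    // explicit event chaining is needed between the fill and the copy-back.
    sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}};
    float *dev = sycl::malloc_device<float>(8, q);
    float host[8] = {};
    q.fill(dev, 3.0f, 8);              // runs first
    q.memcpy(host, dev, sizeof(host)); // runs after the fill completes
    q.wait();
    sycl::free(dev, q);
}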
@@ -942,17 +958,67 @@
 
     private:
         mutable std::recursive_mutex m_mutex;
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        {
+            dpct::device_info prop1;
+            dpct::get_device_info(prop1, device1);
+            dpct::device_info prop2;
+            dpct::get_device_info(prop2, device2);
+            return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+        }
+        static int convert_backend_index(std::string & backend) {
+            if (backend == "ext_oneapi_level_zero:gpu") return 0;
+            if (backend == "opencl:gpu") return 1;
+            if (backend == "ext_oneapi_cuda:gpu") return 2;
+            if (backend == "ext_oneapi_hip:gpu") return 3;
+            if (backend == "opencl:cpu") return 4;
+            if (backend == "opencl:acc") return 5;
+            printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+            GGML_ASSERT(false);
+        }
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
+            return convert_backend_index(backend1) < convert_backend_index(backend2);
+        }
         dev_mgr()
         {
             sycl::device default_device =
                 sycl::device(sycl::default_selector_v);
             _devs.push_back(std::make_shared<device_ext>(default_device));
 
-            std::vector<sycl::device> sycl_all_devs =
-                sycl::device::get_devices(sycl::info::device_type::all);
+            std::vector<sycl::device> sycl_all_devs;
             // Collect other devices except for the default device.
             if (default_device.is_cpu())
                 _cpu_device = 0;
+
+            auto Platforms = sycl::platform::get_platforms();
+            // Keep track of the number of devices per backend
+            std::map<sycl::backend, size_t> DeviceNums;
+            std::map<std::string, std::vector<sycl::device>> backend_devices;
+
+            while (!Platforms.empty()) {
+                auto Platform = Platforms.back();
+                Platforms.pop_back();
+                auto devices = Platform.get_devices();
+                std::string backend_type = get_device_backend_and_type(devices[0]);
+                for (const auto &device : devices) {
+                    backend_devices[backend_type].push_back(device);
+                }
+            }
+
+            std::vector<std::string> keys;
+            for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+                keys.push_back(it->first);
+            }
+            std::sort(keys.begin(), keys.end(), compare_backend);
+
+            for (auto &key : keys) {
+                std::vector<sycl::device> devs = backend_devices[key];
+                std::sort(devs.begin(), devs.end(), compare_dev);
+                for (const auto &dev : devs) {
+                    sycl_all_devs.push_back(dev);
+                }
+            }
+
             for (auto &dev : sycl_all_devs)
             {
                 if (dev == default_device)
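The new dev_mgr() groups devices by backend, orders the backends with compare_backend's fixed priority index, and sorts within each group by descending compute units. A plain-C++ sketch of that two-level ordering with stand-in data (the struct and values are illustrative, not SYCL objects):

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for a sycl::device plus its queried properties.
struct fake_dev { std::string backend; int max_compute_units; };

// Mirrors convert_backend_index(): lower index = higher priority.
static int backend_index(const std::string &b) {
    if (b == "ext_oneapi_level_zero:gpu") return 0;
    if (b == "opencl:gpu") return 1;
    if (b == "opencl:cpu") return 4;
    return 5;
}

int main() {
    std::vector<fake_dev> devs = {
        {"opencl:cpu", 16}, {"ext_oneapi_level_zero:gpu", 512},
        {"opencl:gpu", 448}, {"ext_oneapi_level_zero:gpu", 448}};
    // Backend priority first, then descending compute units within a backend.
    std::sort(devs.begin(), devs.end(), [](const fake_dev &a, const fake_dev &b) {
        if (backend_index(a.backend) != backend_index(b.backend))
            return backend_index(a.backend) < backend_index(b.backend);
        return a.max_compute_units > b.max_compute_units;
    });
    for (const auto &d : devs)
        std::printf("%s cu=%d\n", d.backend.c_str(), d.max_compute_units);
}

This makes device ids stable across runs even when the runtime enumerates platforms in a different order.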
@@ -1001,11 +1067,6 @@
 
         static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
                                                               const void *ptr)
         {
-#ifdef DPCT_USM_LEVEL_NONE
-            return mem_mgr::instance().is_device_ptr(ptr)
-                       ? pointer_access_attribute::device_only
-                       : pointer_access_attribute::host_only;
-#else
             switch (sycl::get_pointer_type(ptr, q.get_context()))
             {
             case sycl::usm::alloc::unknown:
@@ -1016,7 +1077,6 @@
             case sycl::usm::alloc::host:
                 return pointer_access_attribute::host_device;
             }
-#endif
         }
 
         template <typename ArgT>
@@ -1199,11 +1259,7 @@
 
         static inline void *dpct_malloc(size_t size, sycl::queue &q)
         {
-#ifdef DPCT_USM_LEVEL_NONE
-            return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
             return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
         }
 
 #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
@@ -1227,25 +1283,7 @@
 
         static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
                                               valueT value, size_t size)
         {
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            assert(mm.is_device_ptr(dev_ptr));
-            auto alloc = mm.translate_ptr(dev_ptr);
-            size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                auto r = sycl::range<1>(size);
-                auto o = sycl::id<1>(offset);
-                auto new_buffer = alloc.buffer.reinterpret<valueT>(
-                    sycl::range<1>(alloc.size / sizeof(valueT)));
-                sycl::accessor<valueT, 1, sycl::access_mode::write,
-                               sycl::access::target::device>
-                    acc(new_buffer, cgh, r, o);
-                cgh.fill(acc, value); });
-#else
             return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
         }
 
         /**
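dpct_memset now reduces to queue::fill on a USM pointer, dropping the whole buffer/accessor path. A minimal usage sketch (assumes a SYCL compiler; not part of the diff):

#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    // queue::fill writes `count` copies of the pattern to a USM allocation.
    int *dev = sycl::malloc_device<int>(256, q);
    q.fill(dev, 0, 256).wait();
    sycl::free(dev, q);
}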
@@ -1339,72 +1377,8 @@
 
         {
             if (!size)
                 return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-            switch (real_direction)
-            {
-            case host_to_host:
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-            case host_to_device:
-            {
-                auto alloc = mm.translate_ptr(to_ptr);
-                size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    auto r = sycl::range<1>(size);
-                    auto o = sycl::id<1>(offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        acc(alloc.buffer, cgh, r, o);
-                    cgh.copy(from_ptr, acc); });
-            }
-            case device_to_host:
-            {
-                auto alloc = mm.translate_ptr(from_ptr);
-                size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    auto r = sycl::range<1>(size);
-                    auto o = sycl::id<1>(offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        acc(alloc.buffer, cgh, r, o);
-                    cgh.copy(acc, to_ptr); });
-            }
-            case device_to_device:
-            {
-                auto to_alloc = mm.translate_ptr(to_ptr);
-                auto from_alloc = mm.translate_ptr(from_ptr);
-                size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    auto r = sycl::range<1>(size);
-                    auto to_o = sycl::id<1>(to_offset);
-                    auto from_o = sycl::id<1>(from_offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        to_acc(to_alloc.buffer, cgh, r, to_o);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        from_acc(from_alloc.buffer, cgh, r, from_o);
-                    cgh.copy(from_acc, to_acc); });
-            }
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-#else
             return q.memcpy(to_ptr, from_ptr, size, dep_events);
             GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
         }
 
         // Get actual copy range and make sure it will not exceed range.
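Likewise, dpct_memcpy collapses to a single queue::memcpy; the direction argument becomes irrelevant for USM pointers and the dependency events are forwarded directly. Sketch (assumes a SYCL compiler):

#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    float *src = sycl::malloc_device<float>(64, q);
    float *dst = sycl::malloc_device<float>(64, q);
    sycl::event init = q.fill(src, 1.0f, 64);
    // queue::memcpy accepts a list of events to wait on, which is how the
    // dep_events argument above is forwarded in the USM-only path.
    q.memcpy(dst, src, 64 * sizeof(float), {init}).wait();
    sycl::free(src, q);
    sycl::free(dst, q);
}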
@@ -1544,45 +1518,15 @@
 
                 break;
             }
             case device_to_device:
-
-
-
-
-
-
-
-
-
-                    cgh.depends_on(dep_events);
-                    auto to_o = sycl::id<1>(to_offset);
-                    auto from_o = sycl::id<1>(from_offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        to_acc(to_alloc.buffer, cgh,
-                               get_copy_range(size, to_slice, to_range.get(0)), to_o);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        from_acc(from_alloc.buffer, cgh,
-                                 get_copy_range(size, from_slice, from_range.get(0)), from_o);
-                    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-                        size,
-                        [=](sycl::id<3> id) {
-                            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                                from_acc[get_offset(id, from_slice, from_range.get(0))];
-                        }); }));
-            }
-#else
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-                cgh.depends_on(dep_events);
-                cgh.parallel_for<class dpct_memcpy_3d_detail>(
-                    size,
-                    [=](sycl::id<3> id) {
-                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                            from_surface[get_offset(id, from_slice, from_range.get(0))];
-                    }); }));
-#endif
-            break;
+            event_list.push_back(q.submit([&](sycl::handler &cgh){
+                cgh.depends_on(dep_events);
+                cgh.parallel_for<class dpct_memcpy_3d_detail>(
+                    size,
+                    [=](sycl::id<3> id) {
+                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                            from_surface[get_offset(id, from_slice, from_range.get(0))];
+                    }); }));
+            break;
         default:
             throw std::runtime_error("dpct_memcpy: invalid direction value");
         }
@@ -1680,11 +1624,7 @@
 
         {
             if (ptr)
             {
-#ifdef DPCT_USM_LEVEL_NONE
-                detail::mem_mgr::instance().mem_free(ptr);
-#else
                 sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
             }
         }
 
@@ -1692,11 +1632,7 @@
 
         inline auto get_memory(const void *x)
         {
             T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-            return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
             return new_x;
-#endif
         }
 
         template <typename T>
@@ -2148,72 +2084,8 @@
 
         {
             if (!size)
                 return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-            switch (real_direction)
-            {
-            case host_to_host:
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-            case host_to_device:
-            {
-                auto alloc = mm.translate_ptr(to_ptr);
-                size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    auto r = sycl::range<1>(size);
-                    auto o = sycl::id<1>(offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        acc(alloc.buffer, cgh, r, o);
-                    cgh.copy(from_ptr, acc); });
-            }
-            case device_to_host:
-            {
-                auto alloc = mm.translate_ptr(from_ptr);
-                size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    auto r = sycl::range<1>(size);
-                    auto o = sycl::id<1>(offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        acc(alloc.buffer, cgh, r, o);
-                    cgh.copy(acc, to_ptr); });
-            }
-            case device_to_device:
-            {
-                auto to_alloc = mm.translate_ptr(to_ptr);
-                auto from_alloc = mm.translate_ptr(from_ptr);
-                size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-                    cgh.depends_on(dep_events);
-                    auto r = sycl::range<1>(size);
-                    auto to_o = sycl::id<1>(to_offset);
-                    auto from_o = sycl::id<1>(from_offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        to_acc(to_alloc.buffer, cgh, r, to_o);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        from_acc(from_alloc.buffer, cgh, r, from_o);
-                    cgh.copy(from_acc, to_acc); });
-            }
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-#else
             return q.memcpy(to_ptr, from_ptr, size, dep_events);
             GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
         }
 
         // Get actual copy range and make sure it will not exceed range.
@@ -2353,34 +2225,6 @@
 
                 break;
             }
             case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-            {
-                auto &mm = mem_mgr::instance();
-                auto to_alloc = mm.translate_ptr(to_surface);
-                auto from_alloc = mm.translate_ptr(from_surface);
-                size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-                event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                              {
-                    cgh.depends_on(dep_events);
-                    auto to_o = sycl::id<1>(to_offset);
-                    auto from_o = sycl::id<1>(from_offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        to_acc(to_alloc.buffer, cgh,
-                               get_copy_range(size, to_slice, to_range.get(0)), to_o);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        from_acc(from_alloc.buffer, cgh,
-                                 get_copy_range(size, from_slice, from_range.get(0)), from_o);
-                    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-                        size,
-                        [=](sycl::id<3> id) {
-                            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                                from_acc[get_offset(id, from_slice, from_range.get(0))];
-                        }); }));
-            }
-#else
             event_list.push_back(q.submit([&](sycl::handler &cgh)
                                           {
                 cgh.depends_on(dep_events);
@@ -2390,7 +2234,6 @@
 
                     to_surface[get_offset(id, to_slice, to_range.get(0))] =
                         from_surface[get_offset(id, from_slice, from_range.get(0))];
                 }); }));
-#endif
             break;
         default:
             throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2581,9 +2424,6 @@
 
                               void *c[], library_data_t c_type, int ldc,
                               int batch_size, library_data_t scaling_type)
         {
-#ifdef DPCT_USM_LEVEL_NONE
-            throw std::runtime_error("this API is unsupported when USM level is none");
-#else
             if (scaling_type == library_data_t::real_float &&
                 c_type == library_data_t::complex_float)
             {
@@ -2718,7 +2558,6 @@
 
             default:
                 throw std::runtime_error("the combination of data type is unsupported");
             }
-#endif
         }
 
         /// Computes a batch of matrix-matrix product with general matrices.
@@ -3057,24 +2896,9 @@
 
         template <size_t D = Dimension>
         typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
             init();
-#ifdef DPCT_USM_LEVEL_NONE
-            return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
-                       _device_ptr)
-                .template get_access<sycl::access_mode::read_write>()[index];
-#else
             return _device_ptr[index];
-#endif // DPCT_USM_LEVEL_NONE
         }
 
-#ifdef DPCT_USM_LEVEL_NONE
-        /// Get sycl::accessor for the device memory object when usm is not used.
-        accessor_t get_access(sycl::handler &cgh) {
-            return get_buffer(_device_ptr)
-                .template reinterpret<T, Dimension>(_range)
-                .template get_access<detail::memory_traits<Memory, T>::mode,
-                                     detail::memory_traits<Memory, T>::target>(cgh);
-        }
-#else
         /// Get dpct::accessor with dimension info for the device memory object
         /// when usm is used and dimension is greater than 1.
         template <size_t D = Dimension>
@@ -3082,7 +2906,6 @@
 
         get_access(sycl::handler &cgh) {
             return dpct_accessor_t((T *)_device_ptr, _range);
         }
-#endif // DPCT_USM_LEVEL_NONE
 
     private:
         device_memory(value_t *memory_ptr, size_t size)
@@ -3127,15 +2950,6 @@
 
         /// Default constructor
         device_memory() : base(1) {}
-
-#ifdef DPCT_USM_LEVEL_NONE
-        /// Get sycl::accessor for the device memory object when usm is not used.
-        accessor_t get_access(sycl::handler &cgh) {
-            auto buf = get_buffer(base::get_ptr())
-                           .template reinterpret<T, 1>(sycl::range<1>(1));
-            return accessor_t(buf, cgh);
-        }
-#endif // DPCT_USM_LEVEL_NONE
     };
 } // namespace detail
 
@@ -3154,7 +2968,7 @@
 
 #include "ggml-common.h"
 
 static int g_ggml_sycl_debug=0;
-#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug)
+#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
 
 #define CHECK_TRY_ERROR(expr) \
   [&]() { \
@@ -3202,6 +3016,11 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
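The updated GGML_SYCL_DEBUG macro keeps the do{...}while(0) wrapper, which makes the multi-statement expansion behave like a single statement at every call site. A self-contained illustration of why that matters (this program is an analogue, not code from the diff):

#include <cstdio>

static int g_debug = 1;
// Without the do/while(0), the inner if would pair with a following `else`
// when the macro is used as the body of an un-braced if/else.
#define LOG_DEBUG(...) do { if (g_debug) fprintf(stderr, __VA_ARGS__); } while (0)

int main() {
    if (g_debug)
        LOG_DEBUG("debug on: %d\n", g_debug); // expands safely as one statement
    else
        fprintf(stderr, "debug off\n");
}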
@@ -3401,12 +3220,31 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
+    /*
+    Use all GPUs with same top max compute units
+    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
+    /*
+    Only use the assigned GPU
+    */
+    sycl_gpu_mgr(int main_gpu_id) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        gpus.push_back(main_gpu_id);
+        devices.push_back(device);
+        work_group_size = prop.get_max_work_group_size();
+        max_compute_units = prop.get_max_compute_units();
+
+        get_allow_gpus();
+        create_context_with_gpus();
+    }
+
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
@@ -3422,7 +3260,7 @@ class sycl_gpu_mgr {
 
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() >
+        if (gpus_list.length() > 1) {
             gpus_list.pop_back();
         }
     }
@@ -3471,8 +3309,8 @@ class sycl_gpu_mgr {
 
             if (gpus[i] == id)
                 return i;
         }
-
-
+        printf("miss to get device index by id=%d\n", id);
+        GGML_ASSERT(false);
     }
 
     int get_next_index(int id) {
@@ -3481,8 +3319,7 @@ class sycl_gpu_mgr {
 
             if (gpus[i] == id)
                 return i;
         }
-
-        return -1;
+        GGML_ASSERT(false);
     }
 
     bool is_ext_oneapi_device(const sycl::device &dev) {
@@ -3500,11 +3337,14 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
 static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
+
 struct sycl_device_capabilities {
     int cc;  // compute capability
     bool vmm; // virtual memory support
@@ -8239,7 +8079,7 @@ template <bool need_check> static void
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
                           const sycl::nd_item<3> &item_ct1,
-                          const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+                          const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                     item_ct1.get_local_id(1);
 
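Defaulting iq3xxs_grid_ptr and ksigns64_ptr to nullptr lets the many mul_mat_vec_*_sycl call sites below drop the two table arguments without adding an overload. The pattern in isolation (names here are illustrative):

#include <cstdint>
#include <cstdio>

// Trailing parameters with defaults: old call sites that pass the tables
// still compile, new ones may omit them.
static void mul_mat_vec_demo(const float *x, int n,
                             const uint32_t *grid = nullptr,
                             const uint64_t *signs = nullptr) {
    std::printf("n=%d grid=%p signs=%p\n", n, (const void *)grid, (const void *)signs);
}

int main() {
    float x[4] = {};
    mul_mat_vec_demo(x, 4);       // new, table-free call
    uint32_t grid[1] = {};
    mul_mat_vec_demo(x, 4, grid); // older style still works
}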
@@ -10116,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
                                         dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10145,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq2xs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10174,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
                                         dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq3xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10203,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq3s_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10232,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq1s_grid_gpu.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
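These hunks drop the per-stream init() calls and the .get_ptr() accessors: the quantization lookup tables become ordinary constant arrays, so taking the address of the first element is enough. The host-side shape of that change, with a stand-in table (values below are illustrative, not the real ksigns data):

#include <cstdint>
#include <cstdio>

// Stand-in for a lookup table such as ksigns_iq2xs.
static const uint8_t ksigns_demo[4] = {0x00, 0x81, 0x42, 0xc3};

static uint8_t read_entry(const uint8_t *table, int i) { return table[i]; }

int main() {
    const uint8_t *ptr = &ksigns_demo[0]; // no init(*stream)/get_ptr() step
    std::printf("0x%02x\n", read_entry(ptr, 3));
}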
@@ -10575,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10588,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                   VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10604,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10617,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                   VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10633,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10646,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                   VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10662,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10675,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                   VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10691,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10704,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                   VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10720,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10733,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                   VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10749,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10762,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                   VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10778,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10791,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                   VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10807,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10820,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                   VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10836,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10849,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                   VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
 }
 
+
 static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
                                           float *dst, const int ncols,
                                           const int nrows,
@@ -10865,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
-
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10896,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq2xs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10924,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10952,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3s_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10980,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq1s_grid_gpu.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -13008,37 +12772,57 @@ bool ggml_sycl_loaded(void) {
     return g_sycl_loaded;
 }
 
-void print_device_detail(int id) {
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_device_info(prop,
-
+        dpct::get_device_info(prop, device)));
+
     std::string version;
     version += std::to_string(prop.get_major_version());
     version += ".";
     version += std::to_string(prop.get_minor_version());
 
-
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+
+    fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
             prop.get_name(), version.c_str(), prop.get_max_compute_units(),
             prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
             prop.get_global_mem_size());
 }
 
 void ggml_backend_sycl_print_sycl_devices() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
     int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|
-    fprintf(stderr, "
+    fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
+    fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
+    fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
     for (int id = 0; id < device_count; ++id) {
-
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        sycl::backend backend = device.get_backend();
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id=DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
+        print_device_detail(id, device, device_type.str());
     }
 }
 
 void print_gpu_device_list() {
-
-
-
-
+    GGML_ASSERT(g_sycl_gpu_mgr);
+
+    char* hint=NULL;
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
+        hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
+    } else {
+        hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+    }
+    fprintf(stderr, hint,
+            g_sycl_gpu_mgr->get_gpu_count(),
+            g_sycl_gpu_mgr->gpus_list.c_str(),
+            g_sycl_gpu_mgr->max_compute_units);
 }
 
 int get_sycl_env(const char *env_name, int default_val) {
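print_device_detail now renders one row of a fixed-width table whose columns line up under the header strings printed just before the loop. A standalone sketch of the same fprintf formatting, with sample values rather than real device output:

#include <cstdio>

int main() {
    // Same field widths as the |%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu| row
    // format above; every value below is made up for illustration.
    std::fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n",
                 0, "[level_zero:gpu:0]", "Example GPU", "1.3",
                 512, 1024, 32, (unsigned long)(16UL << 30));
}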
@@ -13062,11 +12846,13 @@ int get_work_group_size(int user_device_id) {
     return prop.get_max_work_group_size();
 }
 
-void ggml_init_sycl() try {
+static void ggml_init_sycl() try {
     static bool initialized = false;
 
     if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
         fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 
 #if defined(GGML_SYCL_F16)
@@ -13074,6 +12860,15 @@
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
+
+        /* NOT REMOVE, keep it for next optimize for XMX.
+        #if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+        #else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+        #endif
+        */
+
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                 dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
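ggml_init_sycl() keeps its function-local initialized flag so repeat calls are cheap no-ops, and making the function static hides it from the public symbol table. The guard idiom by itself:

#include <cstdio>

// Function-local flag: the body runs once, later calls fall through.
static void init_once() {
    static bool initialized = false;
    if (!initialized) {
        std::puts("one-time setup");
        initialized = true;
    }
}

int main() {
    init_once(); // prints
    init_once(); // silent
}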
@@ -13082,68 +12877,65 @@
 
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
+        initialized = true;
+        g_sycl_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
 
-
-
-
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
-
-        print_gpu_device_list();
+void ggml_init_by_gpus(int device_count) try {
+    g_device_count = device_count;
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
 
-
+    int64_t total_vram = 0;
 
-
-#if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-        */
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
-        }
+    print_gpu_device_list();
 
-
-
-
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
 
-
-
-
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
 
-
-
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
 
-
-
-        }
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
 
-
-
-
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
 
-
-
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
 
-
-
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                g_syclStreams[i][is] =
-                    dpct::get_current_device().create_queue(
-                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-        }
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
 
-
-
-            SYCL_CHECK(CHECK_TRY_ERROR(
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
        }
 
-
-
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
     }
 }
 catch (sycl::exception const &exc) {
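ggml_init_by_gpus() accumulates each device's VRAM into a running total, stores the prefix sum in g_default_tensor_split, then normalizes by the final total, so entry i is the fraction of all memory that precedes device i. A numeric sketch with made-up sizes:

#include <cstdint>
#include <cstdio>

int main() {
    // Two hypothetical GPUs with 8 GiB and 16 GiB of global memory.
    int64_t vram[2] = {8LL << 30, 16LL << 30};
    double split[2];
    int64_t total = 0;
    for (int i = 0; i < 2; ++i) { split[i] = (double)total; total += vram[i]; }
    for (int i = 0; i < 2; ++i) { split[i] /= (double)total; }
    std::printf("split = {%.3f, %.3f}\n", split[0], split[1]); // {0.000, 0.333}
}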
@@ -15121,6 +14913,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
 
+    bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+                           main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
     SYCL_CHECK(
         CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
 
@@ -15151,24 +14946,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
 
     dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
     dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+    if (no_mixed_dtypes) {
+        cu_compute_type = dpct::library_data_t::real_half;
+        cu_data_type = dpct::library_data_t::real_half;
+    }
 
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
 
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
     const sycl::half alpha_f16 = 1.0f;
     const sycl::half beta_f16 = 0.0f;
 
-    const float alpha_f32 = 1.0f;
-    const float beta_f32 = 0.0f;
-
     const void * alpha = &alpha_f32;
     const void * beta = &beta_f32;
+    if (no_mixed_dtypes) {
+        alpha = &alpha_f16;
+        beta = &beta_f16;
+    }
 
     // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
-    // oneMKL open source supports half, half, float, float: datatypes
+    // when oneMKL open source supports half, half, float, float: datatypes
 
     dst_t = (char *) dst_ddf;
+    if (no_mixed_dtypes) {
+        dst_t = (char *) dst_f16.alloc(ne_dst);
+
+        nbd2 /= sizeof(float) / sizeof(sycl::half);
+        nbd3 /= sizeof(float) / sizeof(sycl::half);
+    }
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -15254,6 +15063,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     }
 #endif
 
+    if (no_mixed_dtypes) {
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+    }
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
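
Taken together, the three hunks above route the batched GEMM through an all-half path on CUDA/HIP: compute type, alpha/beta scalars, and the destination buffer switch to sycl::half (halving the nbd2/nbd3 byte strides, since half is half the width of float), and the result is widened back to float afterwards. A host-only sketch of that compute-in-half, widen-to-float pattern (hypothetical data; the real code converts on-device via to_fp32_sycl):

    #include <sycl/sycl.hpp>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n = 4;

        // Stand-in for the half-precision GEMM output written into dst_f16.
        std::vector<sycl::half> dst_f16(n);
        for (int i = 0; i < n; ++i) {
            dst_f16[i] = sycl::half(0.5f * (float) i);
        }

        // Stand-in for the to_fp32_sycl call: widen half -> float.
        std::vector<float> dst_f32(n);
        for (int i = 0; i < n; ++i) {
            dst_f32[i] = (float) dst_f16[i];
        }

        for (int i = 0; i < n; ++i) {
            printf("dst[%d] = %f\n", i, dst_f32[i]);
        }
        return 0;
    }
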
@@ -16153,6 +15966,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 }
 
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
     for(int i=0;i<max_len;i++) id_list[i] = -1;
 
     if (!g_sycl_gpu_mgr) {
@@ -16187,6 +16001,7 @@ catch (sycl::exception const &exc) {
 
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                                          size_t description_size) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
     dpct::device_info prop;
     int device_id = g_sycl_gpu_mgr->gpus[device];
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16201,6 +16016,7 @@ catch (sycl::exception const &exc) {
 
 GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
                                                    size_t *total) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
    ggml_sycl_set_device(device);
 
    /*
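
The last three hunks only add GGML_SYCL_DEBUG entry traces to public API functions. The macro itself is defined earlier in ggml-sycl.cpp and is not part of this diff; a hedged, illustrative reconstruction of that kind of env-gated trace macro might look like:

    #include <cstdio>
    #include <cstdlib>

    // Illustrative reconstruction only: a trace macro gated by an env var
    // read once at startup, so builds stay quiet unless GGML_SYCL_DEBUG=1.
    static const int g_sycl_debug = [] {
        const char * v = std::getenv("GGML_SYCL_DEBUG");
        return v ? std::atoi(v) : 0;
    }();

    #define SYCL_TRACE(...)                   \
        do {                                  \
            if (g_sycl_debug) {               \
                fprintf(stderr, __VA_ARGS__); \
            }                                 \
        } while (0)

    static int get_device_count() {
        SYCL_TRACE("[SYCL] call %s\n", __func__);
        return 1; // stand-in body
    }

    int main() {
        return get_device_count() == 1 ? 0 : 1;
    }
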
@@ -16551,22 +16367,26 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host          = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int
-
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
-
+    if (device_index>=g_device_count or device_index<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+               device_index, g_device_count-1);
+        GGML_ASSERT(device_index<g_device_count);
+    }
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    if (!
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-
+        g_ggml_backend_sycl_buffer_type_initialized = true;
     }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }
 
 // sycl split buffer type
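
ggml_backend_sycl_buffer_type now validates the index, keeps its table in a function-local static, and rebuilds it whenever the global g_ggml_backend_sycl_buffer_type_initialized flag is cleared (the mode setters later in this diff clear it after swapping the GPU set). A minimal sketch of that flag-guarded lazy-initialization pattern, with hypothetical names:

    #include <cstdio>
    #include <string>

    constexpr int MAX_DEVICES = 16;     // stand-in for GGML_SYCL_MAX_DEVICES
    static int  g_device_count = 2;     // stand-in for the active GPU count
    static bool g_initialized  = false; // cleared when the device set changes

    struct buffer_type { std::string name; };

    static buffer_type * get_buffer_type(int device_index) {
        static buffer_type types[MAX_DEVICES]; // function-local, built lazily
        if (device_index < 0 || device_index >= g_device_count) {
            fprintf(stderr, "device_index %d out of range [0, %d]\n",
                    device_index, g_device_count - 1);
            return nullptr;
        }
        if (!g_initialized) {
            for (int i = 0; i < g_device_count; ++i) {
                types[i] = {"SYCL" + std::to_string(i)};
            }
            g_initialized = true; // stays cached until the flag is cleared
        }
        return &types[device_index];
    }

    int main() {
        printf("%s\n", get_buffer_type(1)->name.c_str()); // SYCL1
        g_initialized  = false; // e.g. after switching single/multi GPU mode
        g_device_count = 1;
        printf("%s\n", get_buffer_type(0)->name.c_str()); // SYCL0 (rebuilt)
        return 0;
    }
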
@@ -16919,6 +16739,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+    ggml_init_sycl();
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -16990,6 +16812,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
 }
 
 ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
         /* .iface    = */ {
             /* .get_name        = */ ggml_backend_sycl_host_buffer_type_name,
@@ -17104,7 +16927,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }
 #ifndef NDEBUG
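
ggml_is_empty(node), also added in this release (see the ggml.c/ggml.h entries in the file list), reports tensors with a zero-sized dimension, which contribute no work; the compute loop now skips them along with the reshape/view/permute no-ops. A reduced sketch of such an emptiness check, assuming it mirrors the upstream helper:

    #include <cstdint>
    #include <cstdio>

    #define MAX_DIMS 4 // stand-in for GGML_MAX_DIMS

    struct tensor_lite { int64_t ne[MAX_DIMS]; };

    // A tensor is empty when any dimension has zero extent: the element
    // count is zero, so the node has nothing to compute.
    static bool is_empty(const tensor_lite * t) {
        for (int i = 0; i < MAX_DIMS; ++i) {
            if (t->ne[i] == 0) {
                return true;
            }
        }
        return false;
    }

    int main() {
        tensor_lite a = {{4, 4, 1, 1}};
        tensor_lite b = {{4, 0, 1, 1}};
        printf("%d %d\n", is_empty(&a), is_empty(&b)); // prints: 0 1
        return 0;
    }
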
@@ -17252,6 +17075,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    GGML_UNUSED(backend);
+}
+
+
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_name,
     /* .free                    = */ ggml_backend_sycl_free,
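
The new ggml_backend_sycl_offload_op callback gives the scheduler a cheap policy for when copying weights to the GPU pays off: only for batches of at least 32 rows, and never for GGML_OP_GET_ROWS. A standalone sketch of the same policy, with ggml's types reduced to the fields the check reads:

    #include <cstdio>

    // Reduced stand-ins for ggml types: only the fields the policy reads.
    enum op_kind { OP_MUL_MAT, OP_GET_ROWS };
    struct tensor { op_kind op; long ne1; }; // ne1 ~ op->ne[1] (batch size)

    static bool sycl_offload_op(const tensor & t) {
        const int min_batch_size = 32;
        // Offload only when the batch is large enough to amortize the
        // weight transfer, and never for GET_ROWS.
        return t.ne1 >= min_batch_size && t.op != OP_GET_ROWS;
    }

    int main() {
        printf("%d\n", sycl_offload_op({OP_MUL_MAT, 512}));  // 1: offload
        printf("%d\n", sycl_offload_op({OP_MUL_MAT, 1}));    // 0: too small
        printf("%d\n", sycl_offload_op({OP_GET_ROWS, 512})); // 0: excluded op
        return 0;
    }
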
@@ -17265,6 +17095,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
+    /* .offload_op              = */ ggml_backend_sycl_offload_op,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
@@ -17278,7 +17109,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
 }
 
 GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
-
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+    ggml_init_sycl();
 
     check_allow_gpu_index(device);
 
@@ -17304,6 +17136,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
 }
 
 GGML_CALL int ggml_backend_sycl_get_device_count() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
     if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
     return g_sycl_gpu_mgr->get_gpu_count();
 }
@@ -17316,14 +17149,53 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
 }
 
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
+    return g_sycl_gpu_mgr->gpus[device_index];
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
+        return;
+    }
+
+    fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
+
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr();
+    g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
-
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    ggml_backend_sycl_set_mul_device_mode();
     assert(g_device_count>0);
     for (int i = 0; i < g_device_count; i++) {
         int id = g_sycl_gpu_mgr->gpus[i];
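
The two new mode setters tear down and rebuild the GPU manager, re-run per-device initialization via ggml_init_by_gpus, and clear the buffer-type cache so it is rebuilt for the new device set; ggml_backend_sycl_reg_devices now defaults to multi-device mode. An illustrative caller (link against the SYCL backend; the declarations are assumed to match the ones added to ggml-sycl.h in this release):

    // Illustrative caller only; error handling is omitted.
    extern "C" {
        void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
        void ggml_backend_sycl_set_mul_device_mode();
        int  ggml_backend_sycl_get_device_count();
    }

    int main(int argc, char ** argv) {
        if (argc > 1) {
            // e.g. "--main-gpu 0" style usage: pin all work to one GPU;
            // this rebuilds the GPU manager and invalidates the cached
            // buffer types.
            ggml_backend_sycl_set_single_device_mode(0);
        } else {
            // Default path, as ggml_backend_sycl_reg_devices now does:
            // enumerate every GPU and enable split execution.
            ggml_backend_sycl_set_mul_device_mode();
        }
        return ggml_backend_sycl_get_device_count() > 0 ? 0 : 1;
    }
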