llama_cpp 0.14.3 → 0.14.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -740,11 +740,7 @@ namespace dpct

    sycl::queue &default_queue()
    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return out_of_order_queue();
-#else
      return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
    }

    void queues_wait_and_throw()
@@ -763,11 +759,7 @@ namespace dpct

    sycl::queue *create_queue(bool enable_exception_handler = false)
    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return create_out_of_order_queue(enable_exception_handler);
-#else
      return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
    }

    sycl::queue *create_queue(sycl::context context, sycl::device device,
@@ -1075,11 +1067,6 @@ namespace dpct
    static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
                                                          const void *ptr)
    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return mem_mgr::instance().is_device_ptr(ptr)
-                 ? pointer_access_attribute::device_only
-                 : pointer_access_attribute::host_only;
-#else
      switch (sycl::get_pointer_type(ptr, q.get_context()))
      {
      case sycl::usm::alloc::unknown:
@@ -1090,7 +1077,6 @@ namespace dpct
      case sycl::usm::alloc::host:
        return pointer_access_attribute::host_device;
      }
-#endif
    }

    template <typename ArgT>
@@ -1273,11 +1259,7 @@ namespace dpct

    static inline void *dpct_malloc(size_t size, sycl::queue &q)
    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
      return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
    }

#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
@@ -1301,25 +1283,7 @@ namespace dpct
    static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
                                          valueT value, size_t size)
    {
-#ifdef DPCT_USM_LEVEL_NONE
-      auto &mm = mem_mgr::instance();
-      assert(mm.is_device_ptr(dev_ptr));
-      auto alloc = mm.translate_ptr(dev_ptr);
-      size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-      return q.submit([&](sycl::handler &cgh)
-                      {
-        auto r = sycl::range<1>(size);
-        auto o = sycl::id<1>(offset);
-        auto new_buffer = alloc.buffer.reinterpret<valueT>(
-            sycl::range<1>(alloc.size / sizeof(valueT)));
-        sycl::accessor<valueT, 1, sycl::access_mode::write,
-                       sycl::access::target::device>
-            acc(new_buffer, cgh, r, o);
-        cgh.fill(acc, value); });
-#else
      return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
    }

    /**
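With the `DPCT_USM_LEVEL_NONE` branches gone, the dpct helpers above reduce to plain SYCL 2020 USM calls. A minimal, self-contained sketch of the surviving pattern (illustrative only, not the gem's code):

```cpp
#include <sycl/sycl.hpp>

int main() {
    sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}};

    // Allocate device memory (what dpct_malloc now boils down to).
    float *dev = sycl::malloc_device<float>(1024, q);

    // Fill it (what dpct_memset now boils down to) and wait for completion.
    q.fill(dev, 0.0f, 1024).wait();

    sycl::free(dev, q.get_context());
    return 0;
}
```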
@@ -1413,72 +1377,8 @@ namespace dpct
    {
      if (!size)
        return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-      auto &mm = mem_mgr::instance();
-      auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-      switch (real_direction)
-      {
-      case host_to_host:
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-      case host_to_device:
-      {
-        auto alloc = mm.translate_ptr(to_ptr);
-        size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto o = sycl::id<1>(offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-              acc(alloc.buffer, cgh, r, o);
-          cgh.copy(from_ptr, acc); });
-      }
-      case device_to_host:
-      {
-        auto alloc = mm.translate_ptr(from_ptr);
-        size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto o = sycl::id<1>(offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-              acc(alloc.buffer, cgh, r, o);
-          cgh.copy(acc, to_ptr); });
-      }
-      case device_to_device:
-      {
-        auto to_alloc = mm.translate_ptr(to_ptr);
-        auto from_alloc = mm.translate_ptr(from_ptr);
-        size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-        size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto to_o = sycl::id<1>(to_offset);
-          auto from_o = sycl::id<1>(from_offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-              to_acc(to_alloc.buffer, cgh, r, to_o);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-              from_acc(from_alloc.buffer, cgh, r, from_o);
-          cgh.copy(from_acc, to_acc); });
-      }
-      default:
-        throw std::runtime_error("dpct_memcpy: invalid direction value");
-      }
-#else
      return q.memcpy(to_ptr, from_ptr, size, dep_events);
      GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
    }

    // Get actual copy range and make sure it will not exceed range.
@@ -1618,45 +1518,15 @@ namespace dpct
        break;
      }
      case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-      {
-        auto &mm = mem_mgr::instance();
-        auto to_alloc = mm.translate_ptr(to_surface);
-        auto from_alloc = mm.translate_ptr(from_surface);
-        size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-        size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-        event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                      {
-          cgh.depends_on(dep_events);
-          auto to_o = sycl::id<1>(to_offset);
-          auto from_o = sycl::id<1>(from_offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-              to_acc(to_alloc.buffer, cgh,
-                     get_copy_range(size, to_slice, to_range.get(0)), to_o);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-              from_acc(from_alloc.buffer, cgh,
-                       get_copy_range(size, from_slice, from_range.get(0)), from_o);
-          cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-              size,
-              [=](sycl::id<3> id) {
-                to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                    from_acc[get_offset(id, from_slice, from_range.get(0))];
-              }); }));
-      }
-#else
-      event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                    {
-        cgh.depends_on(dep_events);
-        cgh.parallel_for<class dpct_memcpy_3d_detail>(
-            size,
-            [=](sycl::id<3> id) {
-              to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                  from_surface[get_offset(id, from_slice, from_range.get(0))];
-            }); }));
-#endif
-      break;
+      event_list.push_back(q.submit([&](sycl::handler &cgh){
+        cgh.depends_on(dep_events);
+        cgh.parallel_for<class dpct_memcpy_3d_detail>(
+            size,
+            [=](sycl::id<3> id) {
+              to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                  from_surface[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+      break;
    default:
      throw std::runtime_error("dpct_memcpy: invalid direction value");
    }
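On the 1-D path, all four copy directions collapse to a single USM memcpy with event dependencies. A self-contained sketch of that call shape (illustrative helper, not part of the gem):

```cpp
#include <sycl/sycl.hpp>
#include <vector>

// Chain a copy after earlier work using the SYCL 2020 overload
// event queue::memcpy(dst, src, bytes, dep_events).
sycl::event chained_copy(sycl::queue &q, void *dst, const void *src,
                         size_t bytes, const std::vector<sycl::event> &deps) {
    if (bytes == 0) return sycl::event{};   // mirror the early-out above
    return q.memcpy(dst, src, bytes, deps);
}
```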
@@ -1754,11 +1624,7 @@ namespace dpct
    {
      if (ptr)
      {
-#ifdef DPCT_USM_LEVEL_NONE
-        detail::mem_mgr::instance().mem_free(ptr);
-#else
        sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
      }
    }

@@ -1766,11 +1632,7 @@ namespace dpct
    inline auto get_memory(const void *x)
    {
      T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-      return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
      return new_x;
-#endif
    }

    template <typename T>
@@ -2222,72 +2084,8 @@ namespace dpct
    {
      if (!size)
        return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-      auto &mm = mem_mgr::instance();
-      auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-      switch (real_direction)
-      {
-      case host_to_host:
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-      case host_to_device:
-      {
-        auto alloc = mm.translate_ptr(to_ptr);
-        size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto o = sycl::id<1>(offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-              acc(alloc.buffer, cgh, r, o);
-          cgh.copy(from_ptr, acc); });
-      }
-      case device_to_host:
-      {
-        auto alloc = mm.translate_ptr(from_ptr);
-        size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto o = sycl::id<1>(offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-              acc(alloc.buffer, cgh, r, o);
-          cgh.copy(acc, to_ptr); });
-      }
-      case device_to_device:
-      {
-        auto to_alloc = mm.translate_ptr(to_ptr);
-        auto from_alloc = mm.translate_ptr(from_ptr);
-        size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-        size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto to_o = sycl::id<1>(to_offset);
-          auto from_o = sycl::id<1>(from_offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-              to_acc(to_alloc.buffer, cgh, r, to_o);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-              from_acc(from_alloc.buffer, cgh, r, from_o);
-          cgh.copy(from_acc, to_acc); });
-      }
-      default:
-        throw std::runtime_error("dpct_memcpy: invalid direction value");
-      }
-#else
      return q.memcpy(to_ptr, from_ptr, size, dep_events);
      GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
    }

    // Get actual copy range and make sure it will not exceed range.
@@ -2427,34 +2225,6 @@ namespace dpct
        break;
      }
      case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-      {
-        auto &mm = mem_mgr::instance();
-        auto to_alloc = mm.translate_ptr(to_surface);
-        auto from_alloc = mm.translate_ptr(from_surface);
-        size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-        size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-        event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                      {
-          cgh.depends_on(dep_events);
-          auto to_o = sycl::id<1>(to_offset);
-          auto from_o = sycl::id<1>(from_offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-              to_acc(to_alloc.buffer, cgh,
-                     get_copy_range(size, to_slice, to_range.get(0)), to_o);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-              from_acc(from_alloc.buffer, cgh,
-                       get_copy_range(size, from_slice, from_range.get(0)), from_o);
-          cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-              size,
-              [=](sycl::id<3> id) {
-                to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                    from_acc[get_offset(id, from_slice, from_range.get(0))];
-              }); }));
-      }
-#else
      event_list.push_back(q.submit([&](sycl::handler &cgh)
                                    {
        cgh.depends_on(dep_events);
@@ -2464,7 +2234,6 @@ namespace dpct
              to_surface[get_offset(id, to_slice, to_range.get(0))] =
                  from_surface[get_offset(id, from_slice, from_range.get(0))];
            }); }));
-#endif
      break;
    default:
      throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2655,9 +2424,6 @@ namespace dpct
                    void *c[], library_data_t c_type, int ldc,
                    int batch_size, library_data_t scaling_type)
    {
-#ifdef DPCT_USM_LEVEL_NONE
-      throw std::runtime_error("this API is unsupported when USM level is none");
-#else
      if (scaling_type == library_data_t::real_float &&
          c_type == library_data_t::complex_float)
      {
@@ -2792,7 +2558,6 @@ namespace dpct
      default:
        throw std::runtime_error("the combination of data type is unsupported");
      }
-#endif
    }

    /// Computes a batch of matrix-matrix product with general matrices.
@@ -3131,24 +2896,9 @@ namespace dpct
    template <size_t D = Dimension>
    typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
      init();
-#ifdef DPCT_USM_LEVEL_NONE
-      return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
-                 _device_ptr)
-          .template get_access<sycl::access_mode::read_write>()[index];
-#else
      return _device_ptr[index];
-#endif // DPCT_USM_LEVEL_NONE
    }

-#ifdef DPCT_USM_LEVEL_NONE
-    /// Get sycl::accessor for the device memory object when usm is not used.
-    accessor_t get_access(sycl::handler &cgh) {
-      return get_buffer(_device_ptr)
-          .template reinterpret<T, Dimension>(_range)
-          .template get_access<detail::memory_traits<Memory, T>::mode,
-                               detail::memory_traits<Memory, T>::target>(cgh);
-    }
-#else
    /// Get dpct::accessor with dimension info for the device memory object
    /// when usm is used and dimension is greater than 1.
    template <size_t D = Dimension>
@@ -3156,7 +2906,6 @@ namespace dpct
    get_access(sycl::handler &cgh) {
      return dpct_accessor_t((T *)_device_ptr, _range);
    }
-#endif // DPCT_USM_LEVEL_NONE

  private:
    device_memory(value_t *memory_ptr, size_t size)
@@ -3201,15 +2950,6 @@ namespace dpct

    /// Default constructor
    device_memory() : base(1) {}
-
-#ifdef DPCT_USM_LEVEL_NONE
-    /// Get sycl::accessor for the device memory object when usm is not used.
-    accessor_t get_access(sycl::handler &cgh) {
-      auto buf = get_buffer(base::get_ptr())
-                     .template reinterpret<T, 1>(sycl::range<1>(1));
-      return accessor_t(buf, cgh);
-    }
-#endif // DPCT_USM_LEVEL_NONE
  };
  } // namespace detail

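Once `get_memory` returns a raw USM pointer instead of a `sycl::buffer`, kernels no longer need accessors at all; the pointer is captured straight into the lambda. A minimal sketch of that style (hypothetical kernel, for illustration):

```cpp
#include <sycl/sycl.hpp>

// USM style: capture the raw device pointer directly; no handler accessor.
void scale_usm(sycl::queue &q, float *data, size_t n, float factor) {
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        data[i] *= factor;
    }).wait();
}
```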
@@ -3228,7 +2968,7 @@ namespace dpct
#include "ggml-common.h"

static int g_ggml_sycl_debug=0;
-#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug)
+#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)

#define CHECK_TRY_ERROR(expr) \
  [&]() { \
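The rewritten macro uses the standard `do { ... } while (0)` wrapper so a conditional log expands to a single statement and stays safe under `if`/`else`. A standalone sketch of why the wrapper matters:

```cpp
#include <cstdio>

static int g_debug = 1;

// Without the do/while(0) wrapper, using the macro as
// `if (x) LOG(...); else ...` would break, because the expansion
// would not behave as one statement followed by a semicolon.
#define LOG(...) do { if (g_debug) fprintf(stderr, __VA_ARGS__); } while (0)

int main(int argc, char **) {
    if (argc > 1) LOG("got %d args\n", argc); else LOG("no extra args\n");
    return 0;
}
```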
@@ -8339,7 +8079,7 @@ template <bool need_check> static void
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
                          const sycl::nd_item<3> &item_ct1,
-                          const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+                          const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);

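Defaulting the two table pointers to `nullptr` lets the many quantization-specific wrappers further down drop those trailing arguments entirely. A small sketch of the same idiom (hypothetical function, not the gem's):

```cpp
#include <cstdint>

// Trailing parameters defaulted to nullptr: callers that do not need the
// lookup tables simply omit them.
static void vec_kernel(const float *x, float *y, int n,
                       const uint32_t *grid = nullptr,
                       const uint64_t *signs = nullptr) {
    (void)grid; (void)signs; (void)x; (void)y; (void)n;
}

// Both call forms compile:
//   vec_kernel(x, y, n);
//   vec_kernel(x, y, n, grid_tab, sign_tab);
```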
@@ -10216,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@@ -10245,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq2xs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@@ -10274,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq3xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@@ -10303,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq3s_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@@ -10332,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq1s_grid_gpu.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
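The deleted `init(*stream)` calls and `get_ptr()` lookups suggest the quantization lookup tables moved from `dpct::global_memory`-style wrappers to plain constant arrays that device code references directly as `&table[0]`. A generic sketch of that pattern under those assumptions (hypothetical table, not the real ggml data; `constexpr` keeps it portable SYCL):

```cpp
#include <cstdint>
#include <sycl/sycl.hpp>

// A small constant lookup table; constexpr globals are usable from
// SYCL device code without a host-side init step.
static constexpr uint64_t ksigns_demo[4] = {0x00, 0x7f, 0xbf, 0xc0};

void apply_signs(sycl::queue &q, uint64_t *out, size_t n) {
    q.submit([&](sycl::handler &cgh) {
        auto signs = &ksigns_demo[0];   // same shape as &ksigns64[0] above
        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
            out[i] = signs[i % 4];
        });
    }).wait();
}
```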
@@ -10675,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10688,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10704,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10717,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10733,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10746,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10762,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10775,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10791,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10804,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10820,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10833,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10849,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10862,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10878,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10891,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10907,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10920,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
@@ -10936,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10949,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                [[intel::reqd_sub_group_size(32)]] {
                    mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                });
            });
    }
}

+
static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
                                          float *dst, const int ncols,
                                          const int nrows,
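Each of these wrappers launches with `sycl::nd_range<3>(block_nums * block_dims, block_dims)`, i.e. a CUDA-style grid: the global size is blocks times block shape, the local size is the block shape. A small sketch of the arithmetic (illustrative stand-in values):

```cpp
#include <sycl/sycl.hpp>

void launch_demo(sycl::queue &q) {
    const int block_num_y = 64;                 // e.g. nrows / rows-per-block
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, 2, 32);  // stand-ins for (1, GGML_SYCL_MMV_Y, WARP_SIZE)

    q.parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> it) {
            // it.get_group(2) plays the role of blockIdx.x;
            // it.get_local_id(1) / get_local_range(1) play threadIdx.y / blockDim.y.
        }).wait();
}
```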
@@ -10965,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
-

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10996,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq2xs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11024,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11052,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3s_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11080,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq1s_grid_gpu.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -13128,6 +12792,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
}

void ggml_backend_sycl_print_sycl_devices() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
    int device_count = dpct::dev_mgr::instance().device_count();
    std::map<std::string, size_t> DeviceNums;
    fprintf(stderr, "found %d SYCL devices:\n", device_count);
@@ -13181,11 +12846,13 @@ int get_work_group_size(int user_device_id) {
    return prop.get_max_work_group_size();
}

-void ggml_init_sycl() try {
+static void ggml_init_sycl() try {
    static bool initialized = false;

    if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);

#if defined(GGML_SYCL_F16)
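`ggml_init_sycl` becomes `static` and is invoked from the public entry points further down, so initialization is lazy and idempotent. The diff's guard is a plain `static bool`; a thread-safe variant of the same initialize-once pattern, as a generic sketch:

```cpp
#include <cstdio>
#include <mutex>

static void backend_init_once() {
    static std::once_flag flag;
    std::call_once(flag, [] {
        std::fprintf(stderr, "initializing backend\n");
        // ... device discovery, env parsing, etc. ...
    });
}
```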
@@ -15246,6 +14913,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
    dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];

+    bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+                           main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
    SYCL_CHECK(
        CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));

@@ -15276,24 +14946,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,

    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
    dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+    if (no_mixed_dtypes) {
+        cu_compute_type = dpct::library_data_t::real_half;
+        cu_data_type = dpct::library_data_t::real_half;
+    }

    // dst strides
    size_t nbd2 = dst->nb[2];
    size_t nbd3 = dst->nb[3];

+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
    const sycl::half alpha_f16 = 1.0f;
    const sycl::half beta_f16 = 0.0f;

-    const float alpha_f32 = 1.0f;
-    const float beta_f32 = 0.0f;
-
    const void * alpha = &alpha_f32;
    const void * beta = &beta_f32;
+    if (no_mixed_dtypes) {
+        alpha = &alpha_f16;
+        beta = &beta_f16;
+    }

    // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
-    // oneMKL open source supports half, half, float, float: datatypes
+    // when oneMKL open source supports half, half, float, float: datatypes

    dst_t = (char *) dst_ddf;
+    if (no_mixed_dtypes) {
+        dst_t = (char *) dst_f16.alloc(ne_dst);
+
+        nbd2 /= sizeof(float) / sizeof(sycl::half);
+        nbd3 /= sizeof(float) / sizeof(sycl::half);
+    }

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@@ -15379,6 +15063,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    }
#endif

+    if (no_mixed_dtypes) {
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+    }
}
catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
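The `no_mixed_dtypes` flag reads as: when the SYCL queue is backed by CUDA or HIP, the batched GEMM runs entirely in `half` (data type, compute type, and scaling factors) and the result is converted back to f32 afterwards, presumably because mixed f16-in/f32-out GEMM is unavailable on those backends. A condensed sketch of the selection logic, assuming the DPC++ `ext_oneapi_cuda`/`ext_oneapi_hip` backend enumerators (names simplified):

```cpp
#include <sycl/sycl.hpp>

// Sketch: half-precision-only GEMM is forced on CUDA/HIP-backed SYCL queues.
bool gemm_half_only(sycl::queue &q) {
    return q.get_backend() == sycl::backend::ext_oneapi_cuda ||
           q.get_backend() == sycl::backend::ext_oneapi_hip;
}

// Pick scaling factors whose type matches the chosen compute type.
void pick_scaling(sycl::queue &q, const void *&alpha, const void *&beta) {
    static const float      alpha_f32 = 1.0f, beta_f32 = 0.0f;
    static const sycl::half alpha_f16 = 1.0f, beta_f16 = 0.0f;
    alpha = &alpha_f32; beta = &beta_f32;
    if (gemm_half_only(q)) { alpha = &alpha_f16; beta = &beta_f16; }
}
```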
@@ -16278,6 +15966,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
}

GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
    for(int i=0;i<max_len;i++) id_list[i] = -1;

    if (!g_sycl_gpu_mgr) {
@@ -16312,6 +16001,7 @@ catch (sycl::exception const &exc) {

GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                                         size_t description_size) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
    dpct::device_info prop;
    int device_id = g_sycl_gpu_mgr->gpus[device];
    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16326,6 +16016,7 @@ catch (sycl::exception const &exc) {

GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
                                                   size_t *total) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
    ggml_sycl_set_device(device);

    /*
@@ -16677,6 +16368,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
};

ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
    if (device_index>=g_device_count or device_index<0) {
        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
               device_index, g_device_count-1);
@@ -17046,6 +16739,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
};

GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+    ggml_init_sycl();
    // FIXME: this is not thread safe
    static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

@@ -17117,6 +16812,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
}

ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
        /* .iface = */ {
            /* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
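These traces are compiled in unconditionally but only print when the `GGML_SYCL_DEBUG` environment variable is non-zero, read once during `ggml_init_sycl` via the file's `get_sycl_env` helper. A generic sketch of that read-once, env-gated logging pattern:

```cpp
#include <cstdio>
#include <cstdlib>

// Read an integer environment variable once; fall back to 'def' if unset.
static int env_int(const char *name, int def) {
    const char *v = std::getenv(name);
    return v ? std::atoi(v) : def;
}

static int g_debug = 0;   // set once at init: g_debug = env_int("GGML_SYCL_DEBUG", 0);
#define TRACE(...) do { if (g_debug) fprintf(stderr, __VA_ARGS__); } while (0)
```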
@@ -17231,7 +16927,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
    params.ith = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }
#ifndef NDEBUG
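`ggml_is_empty` short-circuits nodes with zero elements before the layout-only ops. To the best of my knowledge it checks whether any dimension is zero; a sketch of the idea with a simplified tensor struct (not ggml's real one):

```cpp
#include <cstdint>

struct tensor_demo { int64_t ne[4]; };   // per-dimension element counts

// A tensor is "empty" when any dimension has zero elements, so there is
// no data to compute on and the graph node can be skipped outright.
static bool is_empty(const tensor_demo *t) {
    for (int i = 0; i < 4; ++i)
        if (t->ne[i] == 0) return true;
    return false;
}
```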
@@ -17379,6 +17075,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
    UNUSED(backend);
}

+GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    GGML_UNUSED(backend);
+}
+
+
static ggml_backend_i ggml_backend_sycl_interface = {
    /* .get_name                = */ ggml_backend_sycl_name,
    /* .free                    = */ ggml_backend_sycl_free,
@@ -17392,7 +17095,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
    /* .supports_op             = */ ggml_backend_sycl_supports_op,
-    /* .offload_op              = */ NULL,
+    /* .offload_op              = */ ggml_backend_sycl_offload_op,
    /* .event_new               = */ NULL,
    /* .event_free              = */ NULL,
    /* .event_record            = */ NULL,
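`offload_op` is the backend hook ggml consults to decide whether an op is worth shipping to the device; here the answer is yes only for batches of at least 32 rows, and never for `GGML_OP_GET_ROWS`. A stripped-down sketch of wiring such a hook into a vtable-style interface (hypothetical types, mirroring the designated-comment style above):

```cpp
#include <cstdint>

struct op_demo { int64_t ne[4]; int op; };
enum { OP_GET_ROWS = 1 };

// Heuristic: small batches cost more in transfer than they save in compute.
static bool offload_op_demo(const op_demo *op) {
    const int min_batch_size = 32;
    return op->ne[1] >= min_batch_size && op->op != OP_GET_ROWS;
}

struct backend_iface_demo {
    bool (*offload_op)(const op_demo *);
};

static backend_iface_demo iface = {
    /* .offload_op = */ offload_op_demo,
};
```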
@@ -17406,7 +17109,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
}

GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
-
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+    ggml_init_sycl();

    check_allow_gpu_index(device);

@@ -17432,6 +17136,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
}

GGML_CALL int ggml_backend_sycl_get_device_count() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
    return g_sycl_gpu_mgr->get_gpu_count();
}
@@ -17444,16 +17149,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
}

GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
    return g_sycl_gpu_mgr->get_index(device_id);
}

GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
    return g_sycl_gpu_mgr->gpus[device_index];
}

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
-
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
    if (g_sycl_gpu_mgr) {
        delete g_sycl_gpu_mgr;
    }
@@ -17464,6 +17174,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
}

GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
        return;
    }