llama_cpp 0.14.3 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +71 -18
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +300 -9333
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +638 -43
- data/vendor/tmp/llama.cpp/ggml-quants.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +106 -393
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +133 -93
- data/vendor/tmp/llama.cpp/ggml.h +11 -5
- data/vendor/tmp/llama.cpp/llama.cpp +1763 -431
- data/vendor/tmp/llama.cpp/llama.h +67 -19
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
data/vendor/tmp/llama.cpp/ggml-sycl.cpp (excerpt):

```diff
@@ -740,11 +740,7 @@ namespace dpct
 
     sycl::queue &default_queue()
     {
-#ifdef DPCT_USM_LEVEL_NONE
-        return out_of_order_queue();
-#else
         return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
     }
 
     void queues_wait_and_throw()
@@ -763,11 +759,7 @@ namespace dpct
 
     sycl::queue *create_queue(bool enable_exception_handler = false)
     {
-#ifdef DPCT_USM_LEVEL_NONE
-        return create_out_of_order_queue(enable_exception_handler);
-#else
         return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
     }
 
     sycl::queue *create_queue(sycl::context context, sycl::device device,
@@ -1075,11 +1067,6 @@ namespace dpct
     static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
                                                           const void *ptr)
     {
-#ifdef DPCT_USM_LEVEL_NONE
-        return mem_mgr::instance().is_device_ptr(ptr)
-                   ? pointer_access_attribute::device_only
-                   : pointer_access_attribute::host_only;
-#else
         switch (sycl::get_pointer_type(ptr, q.get_context()))
         {
         case sycl::usm::alloc::unknown:
@@ -1090,7 +1077,6 @@ namespace dpct
         case sycl::usm::alloc::host:
             return pointer_access_attribute::host_device;
         }
-#endif
     }
 
     template <typename ArgT>
@@ -1273,11 +1259,7 @@ namespace dpct
 
     static inline void *dpct_malloc(size_t size, sycl::queue &q)
     {
-#ifdef DPCT_USM_LEVEL_NONE
-        return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
         return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
     }
 
 #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
```
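Every hunk above (and the related ones below) deletes the `DPCT_USM_LEVEL_NONE` fallback, so the vendored dpct helpers now assume unified shared memory (USM) throughout instead of the buffer/accessor path. As a rough illustration, here is a minimal, self-contained sketch of the USM pattern the surviving branches reduce to; the program itself is ours, not part of the package, and only the SYCL 2020 APIs are real:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
    // In-order queue, matching the helpers' switch to in_order_queue().
    sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}};

    const size_t n = 1024;
    std::vector<float> host(n, 1.0f);

    // USM device allocation -- what dpct_malloc() now reduces to.
    float *dev = sycl::malloc_device<float>(n, q);

    // Plain queue::memcpy -- what dpct_memcpy() now reduces to.
    q.memcpy(dev, host.data(), n * sizeof(float)).wait();

    // Typed fill -- what dpct_memset() now reduces to.
    q.fill(dev, 2.0f, n).wait();

    // Plain sycl::free -- what the dpct free helper now reduces to.
    sycl::free(dev, q);
    return 0;
}
```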
```diff
@@ -1301,25 +1283,7 @@ namespace dpct
     static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
                                           valueT value, size_t size)
     {
-#ifdef DPCT_USM_LEVEL_NONE
-        auto &mm = mem_mgr::instance();
-        assert(mm.is_device_ptr(dev_ptr));
-        auto alloc = mm.translate_ptr(dev_ptr);
-        size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-        return q.submit([&](sycl::handler &cgh)
-                        {
-            auto r = sycl::range<1>(size);
-            auto o = sycl::id<1>(offset);
-            auto new_buffer = alloc.buffer.reinterpret<valueT>(
-                sycl::range<1>(alloc.size / sizeof(valueT)));
-            sycl::accessor<valueT, 1, sycl::access_mode::write,
-                           sycl::access::target::device>
-                acc(new_buffer, cgh, r, o);
-            cgh.fill(acc, value); });
-#else
         return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
     }
 
     /**
@@ -1413,72 +1377,8 @@ namespace dpct
     {
         if (!size)
             return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-        auto &mm = mem_mgr::instance();
-        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-        switch (real_direction)
-        {
-        case host_to_host:
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-        case host_to_device:
-        {
-            auto alloc = mm.translate_ptr(to_ptr);
-            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                auto r = sycl::range<1>(size);
-                auto o = sycl::id<1>(offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                               sycl::access::target::device>
-                    acc(alloc.buffer, cgh, r, o);
-                cgh.copy(from_ptr, acc); });
-        }
-        case device_to_host:
-        {
-            auto alloc = mm.translate_ptr(from_ptr);
-            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                auto r = sycl::range<1>(size);
-                auto o = sycl::id<1>(offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                               sycl::access::target::device>
-                    acc(alloc.buffer, cgh, r, o);
-                cgh.copy(acc, to_ptr); });
-        }
-        case device_to_device:
-        {
-            auto to_alloc = mm.translate_ptr(to_ptr);
-            auto from_alloc = mm.translate_ptr(from_ptr);
-            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                auto r = sycl::range<1>(size);
-                auto to_o = sycl::id<1>(to_offset);
-                auto from_o = sycl::id<1>(from_offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                               sycl::access::target::device>
-                    to_acc(to_alloc.buffer, cgh, r, to_o);
-                sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                               sycl::access::target::device>
-                    from_acc(from_alloc.buffer, cgh, r, from_o);
-                cgh.copy(from_acc, to_acc); });
-        }
-        default:
-            throw std::runtime_error("dpct_memcpy: invalid direction value");
-        }
-#else
         return q.memcpy(to_ptr, from_ptr, size, dep_events);
         GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
     }
 
     // Get actual copy range and make sure it will not exceed range.
```
```diff
@@ -1618,45 +1518,15 @@ namespace dpct
                 break;
             }
             case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-            {
-                auto &mm = mem_mgr::instance();
-                auto to_alloc = mm.translate_ptr(to_surface);
-                auto from_alloc = mm.translate_ptr(from_surface);
-                size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-                event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                              {
-                    cgh.depends_on(dep_events);
-                    auto to_o = sycl::id<1>(to_offset);
-                    auto from_o = sycl::id<1>(from_offset);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                                   sycl::access::target::device>
-                        to_acc(to_alloc.buffer, cgh,
-                               get_copy_range(size, to_slice, to_range.get(0)), to_o);
-                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                                   sycl::access::target::device>
-                        from_acc(from_alloc.buffer, cgh,
-                                 get_copy_range(size, from_slice, from_range.get(0)), from_o);
-                    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-                        size,
-                        [=](sycl::id<3> id) {
-                            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                                from_acc[get_offset(id, from_slice, from_range.get(0))];
-                        }); }));
-            }
-#else
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-                cgh.depends_on(dep_events);
-                cgh.parallel_for<class dpct_memcpy_3d_detail>(
-                    size,
-                    [=](sycl::id<3> id) {
-                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                            from_surface[get_offset(id, from_slice, from_range.get(0))];
-                    }); }));
-#endif
-            break;
+            event_list.push_back(q.submit([&](sycl::handler &cgh){
+                cgh.depends_on(dep_events);
+                cgh.parallel_for<class dpct_memcpy_3d_detail>(
+                    size,
+                    [=](sycl::id<3> id) {
+                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                            from_surface[get_offset(id, from_slice, from_range.get(0))];
+                    }); }));
+            break;
         default:
             throw std::runtime_error("dpct_memcpy: invalid direction value");
         }
@@ -1754,11 +1624,7 @@ namespace dpct
     {
         if (ptr)
         {
-#ifdef DPCT_USM_LEVEL_NONE
-            detail::mem_mgr::instance().mem_free(ptr);
-#else
             sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
         }
     }
 
@@ -1766,11 +1632,7 @@ namespace dpct
     inline auto get_memory(const void *x)
     {
         T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-        return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
         return new_x;
-#endif
     }
 
     template <typename T>
@@ -2222,72 +2084,8 @@ namespace dpct
     {
         if (!size)
             return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-        auto &mm = mem_mgr::instance();
-        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-        switch (real_direction)
-        {
-        case host_to_host:
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-        case host_to_device:
-        {
-            auto alloc = mm.translate_ptr(to_ptr);
-            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                auto r = sycl::range<1>(size);
-                auto o = sycl::id<1>(offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                               sycl::access::target::device>
-                    acc(alloc.buffer, cgh, r, o);
-                cgh.copy(from_ptr, acc); });
-        }
-        case device_to_host:
-        {
-            auto alloc = mm.translate_ptr(from_ptr);
-            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                auto r = sycl::range<1>(size);
-                auto o = sycl::id<1>(offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                               sycl::access::target::device>
-                    acc(alloc.buffer, cgh, r, o);
-                cgh.copy(acc, to_ptr); });
-        }
-        case device_to_device:
-        {
-            auto to_alloc = mm.translate_ptr(to_ptr);
-            auto from_alloc = mm.translate_ptr(from_ptr);
-            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-                cgh.depends_on(dep_events);
-                auto r = sycl::range<1>(size);
-                auto to_o = sycl::id<1>(to_offset);
-                auto from_o = sycl::id<1>(from_offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                               sycl::access::target::device>
-                    to_acc(to_alloc.buffer, cgh, r, to_o);
-                sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                               sycl::access::target::device>
-                    from_acc(from_alloc.buffer, cgh, r, from_o);
-                cgh.copy(from_acc, to_acc); });
-        }
-        default:
-            throw std::runtime_error("dpct_memcpy: invalid direction value");
-        }
-#else
         return q.memcpy(to_ptr, from_ptr, size, dep_events);
         GGML_UNUSED(direction);
-#endif // DPCT_USM_LEVEL_NONE
     }
 
     // Get actual copy range and make sure it will not exceed range.
@@ -2427,34 +2225,6 @@ namespace dpct
             break;
         }
         case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-        {
-            auto &mm = mem_mgr::instance();
-            auto to_alloc = mm.translate_ptr(to_surface);
-            auto from_alloc = mm.translate_ptr(from_surface);
-            size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-                cgh.depends_on(dep_events);
-                auto to_o = sycl::id<1>(to_offset);
-                auto from_o = sycl::id<1>(from_offset);
-                sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                               sycl::access::target::device>
-                    to_acc(to_alloc.buffer, cgh,
-                           get_copy_range(size, to_slice, to_range.get(0)), to_o);
-                sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                               sycl::access::target::device>
-                    from_acc(from_alloc.buffer, cgh,
-                             get_copy_range(size, from_slice, from_range.get(0)), from_o);
-                cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-                    size,
-                    [=](sycl::id<3> id) {
-                        to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                            from_acc[get_offset(id, from_slice, from_range.get(0))];
-                    }); }));
-        }
-#else
         event_list.push_back(q.submit([&](sycl::handler &cgh)
                                       {
             cgh.depends_on(dep_events);
@@ -2464,7 +2234,6 @@ namespace dpct
                 to_surface[get_offset(id, to_slice, to_range.get(0))] =
                     from_surface[get_offset(id, from_slice, from_range.get(0))];
             }); }));
-#endif
         break;
     default:
         throw std::runtime_error("dpct_memcpy: invalid direction value");
@@ -2655,9 +2424,6 @@ namespace dpct
                    void *c[], library_data_t c_type, int ldc,
                    int batch_size, library_data_t scaling_type)
     {
-#ifdef DPCT_USM_LEVEL_NONE
-        throw std::runtime_error("this API is unsupported when USM level is none");
-#else
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
         {
@@ -2792,7 +2558,6 @@ namespace dpct
         default:
             throw std::runtime_error("the combination of data type is unsupported");
         }
-#endif
     }
 
     /// Computes a batch of matrix-matrix product with general matrices.
@@ -3131,24 +2896,9 @@ namespace dpct
         template <size_t D = Dimension>
         typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
             init();
-#ifdef DPCT_USM_LEVEL_NONE
-            return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
-                       _device_ptr)
-                .template get_access<sycl::access_mode::read_write>()[index];
-#else
             return _device_ptr[index];
-#endif // DPCT_USM_LEVEL_NONE
         }
 
-#ifdef DPCT_USM_LEVEL_NONE
-        /// Get sycl::accessor for the device memory object when usm is not used.
-        accessor_t get_access(sycl::handler &cgh) {
-            return get_buffer(_device_ptr)
-                .template reinterpret<T, Dimension>(_range)
-                .template get_access<detail::memory_traits<Memory, T>::mode,
-                                     detail::memory_traits<Memory, T>::target>(cgh);
-        }
-#else
         /// Get dpct::accessor with dimension info for the device memory object
         /// when usm is used and dimension is greater than 1.
         template <size_t D = Dimension>
@@ -3156,7 +2906,6 @@ namespace dpct
         get_access(sycl::handler &cgh) {
             return dpct_accessor_t((T *)_device_ptr, _range);
         }
-#endif // DPCT_USM_LEVEL_NONE
 
     private:
         device_memory(value_t *memory_ptr, size_t size)
@@ -3201,15 +2950,6 @@ namespace dpct
 
         /// Default constructor
        device_memory() : base(1) {}
-
-#ifdef DPCT_USM_LEVEL_NONE
-        /// Get sycl::accessor for the device memory object when usm is not used.
-        accessor_t get_access(sycl::handler &cgh) {
-            auto buf = get_buffer(base::get_ptr())
-                           .template reinterpret<T, 1>(sycl::range<1>(1));
-            return accessor_t(buf, cgh);
-        }
-#endif // DPCT_USM_LEVEL_NONE
     };
 } // namespace detail
 
@@ -3228,7 +2968,7 @@ namespace dpct
 #include "ggml-common.h"
 
 static int g_ggml_sycl_debug=0;
-#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
 
 #define CHECK_TRY_ERROR(expr)                                                  \
   [&]() {                                                                      \
```
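With the buffer branch gone, `dpct_memcpy` is a single `q.memcpy(to_ptr, from_ptr, size, dep_events)` regardless of direction, because USM pointers carry their own location; the `deduce_memcpy_direction` switch was only needed to pick the right accessor types. A hedged sketch of the event-dependency overload this relies on (function and variable names here are ours):

```cpp
#include <sycl/sycl.hpp>

// Chain two copies through an explicit dependency event, the same way the
// simplified dpct_memcpy() forwards dep_events to queue::memcpy.
sycl::event staged_copy(sycl::queue &q, float *dev_a, float *dev_b,
                        const float *host_src, size_t n) {
    // First copy: host -> dev_a.
    sycl::event e1 = q.memcpy(dev_a, host_src, n * sizeof(float));
    // Second copy: dev_a -> dev_b, ordered after the first via its event.
    return q.memcpy(dev_b, dev_a, n * sizeof(float), {e1});
}
```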
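The updated `GGML_SYCL_DEBUG` writes to `stderr` and keeps the classic `do { ... } while (0)` wrapper, which makes a multi-statement macro behave as a single statement. A standalone sketch of why that wrapper matters (the names here are ours):

```cpp
#include <cstdio>

static int g_debug = 1;
// The do/while(0) form expands to exactly one statement, so it nests
// safely under if/else without braces and requires a trailing semicolon.
#define DEBUG_LOG(...) do { if (g_debug) fprintf(stderr, __VA_ARGS__); } while (0)

int main() {
    int err = 0;
    if (err)
        DEBUG_LOG("failed: %d\n", err);  // no dangling-else surprise
    else
        DEBUG_LOG("ok\n");
    return 0;
}
```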
```diff
@@ -8339,7 +8079,7 @@ template <bool need_check> static void
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
                           const sycl::nd_item<3> &item_ct1,
-                          const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+                          const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                     item_ct1.get_local_id(1);
 
```
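Defaulting the two trailing table pointers to `nullptr` is what lets the q4_0…q6_K launchers further down drop them from the call entirely, while quant types that need the lookup tables can still pass them. A simplified sketch of the pattern (the demo function and call sites are ours):

```cpp
#include <cstdint>

// Trailing parameters default to nullptr, mirroring the mul_mat_vec_q change.
template <int qk>
static void mul_mat_vec_demo(const void *vx, const void *vy, float *dst,
                             int ncols, int nrows,
                             const uint32_t *grid_ptr = nullptr,
                             const uint64_t *ksigns_ptr = nullptr) {
    (void)vx; (void)vy; (void)dst; (void)ncols; (void)nrows;
    (void)grid_ptr; (void)ksigns_ptr;  // only some quant types use these
}

void demo(const void *vx, const void *vy, float *dst) {
    // Common path: call sites no longer mention the tables at all.
    mul_mat_vec_demo<32>(vx, vy, dst, 4096, 4096);

    // Table-driven path: pass them explicitly.
    static const uint32_t grid[1] = {0};
    static const uint64_t ks[1]   = {0};
    mul_mat_vec_demo<32>(vx, vy, dst, 4096, 4096, grid, ks);
}
```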
```diff
@@ -10216,17 +9956,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
                                         dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10245,17 +9982,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq2xs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10274,17 +10008,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
                                         dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq3xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10303,17 +10034,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq3s_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
@@ -10332,17 +10060,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
-        iq1s_grid_gpu.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
 
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
```
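All five dequantize launchers switch from dpct-generated constant-memory wrappers, which needed a per-stream `.init(*stream)` plus `.get_ptr()`, to taking the address of plain `const` tables directly. A conceptual sketch, assuming the SYCL implementation materializes constant-initialized `const` globals in the device image (which is what makes the plain-pointer form legal; the demo table and function are ours):

```cpp
#include <sycl/sycl.hpp>
#include <cstdint>

// Stand-in for the quantization sign table; constant-initialized and const,
// so device code may reference it without a per-stream init step.
static const uint64_t ksigns64_demo[2] = {0x0000000000000000ull,
                                          0xff000000000000ffull};

void launch(sycl::queue &q, uint64_t *out) {
    auto ksigns_ptr = &ksigns64_demo[0];  // replaces wrapper.get_ptr()
    q.parallel_for(sycl::range<1>(2), [=](sycl::id<1> i) {
        out[i] = ksigns_ptr[i];
    }).wait();
}
```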
```diff
@@ -10675,12 +10400,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10688,8 +10409,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                   VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10704,12 +10424,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10717,8 +10433,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                   VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10733,12 +10448,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10746,8 +10457,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                   VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10762,12 +10472,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10775,8 +10481,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                   VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10791,12 +10496,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10804,8 +10505,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                   VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10820,12 +10520,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10833,8 +10529,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                   VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10849,12 +10544,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10862,8 +10553,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                   VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10878,12 +10568,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10891,8 +10577,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                   VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10907,12 +10592,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10920,8 +10601,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                   VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
@@ -10936,12 +10616,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10949,13 +10625,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                 [[intel::reqd_sub_group_size(32)]] {
                     mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                   VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-                        vx, vy, dst, ncols, nrows, item_ct1,
-                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                        vx, vy, dst, ncols, nrows, item_ct1);
                 });
         });
     }
 }
 
+
 static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
                                           float *dst, const int ncols,
                                           const int nrows,
```
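All of these wrappers share the same launch geometry: a 3-D `nd_range` of `block_nums * block_dims` work-items with the sub-group size pinned to the warp width. A minimal sketch of that shape (the constants and kernel body here are ours):

```cpp
#include <sycl/sycl.hpp>

constexpr int WARP_SIZE_DEMO = 32;  // stand-in for WARP_SIZE
constexpr int MMV_Y_DEMO     = 1;   // stand-in for GGML_SYCL_MMV_Y

void launch_rows(sycl::queue &q, float *dst, int nrows) {
    const int block_num_y = (nrows + MMV_Y_DEMO - 1) / MMV_Y_DEMO;
    // Group count and group shape, multiplied to get the global range.
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, MMV_Y_DEMO, WARP_SIZE_DEMO);

    q.parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item) [[intel::reqd_sub_group_size(WARP_SIZE_DEMO)]] {
            // Same row indexing scheme as mul_mat_vec_q.
            const int row = item.get_group(2) * item.get_local_range(1) +
                            item.get_local_id(1);
            if (row < nrows) dst[row] = 0.0f;
        });
}
```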
```diff
@@ -10965,15 +10641,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
-
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -10996,12 +10668,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq2xs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11024,12 +10694,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11052,12 +10720,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq3s_grid.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -11080,12 +10746,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
-        iq1s_grid_gpu.init(*stream);
-        ksigns64.init(*stream);
 
         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];
 
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -13128,6 +12792,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 }
 
 void ggml_backend_sycl_print_sycl_devices() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
     int device_count = dpct::dev_mgr::instance().device_count();
     std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
@@ -13181,11 +12846,13 @@ int get_work_group_size(int user_device_id) {
     return prop.get_max_work_group_size();
 }
 
-void ggml_init_sycl() try {
+static void ggml_init_sycl() try {
     static bool initialized = false;
 
     if (!initialized) {
+        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
+
         fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 
 #if defined(GGML_SYCL_F16)
```
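Making `ggml_init_sycl` `static` pairs with the hunks further down that call it from every public entry point: initialization moves to first use instead of requiring an explicit call from the host program. A sketch of the guard pattern (names are ours; as in the vendored code, a plain `bool` is not thread safe, which the diff's own "FIXME: this is not thread safe" comment acknowledges):

```cpp
static void demo_init() {
    static bool initialized = false;  // std::call_once would be the
    if (!initialized) {               // thread-safe variant
        // ... read env vars, enumerate devices ...
        initialized = true;
    }
}

void demo_public_api() {
    demo_init();  // safe to call repeatedly; real work happens once
    // ... actual API work ...
}
```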
```diff
@@ -15246,6 +14913,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
 
+    bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
+                           main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
+
     SYCL_CHECK(
         CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
 
@@ -15276,24 +14946,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
 
     dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
     dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
+    if (no_mixed_dtypes) {
+        cu_compute_type = dpct::library_data_t::real_half;
+        cu_data_type = dpct::library_data_t::real_half;
+    }
 
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
 
+    const float alpha_f32 = 1.0f;
+    const float beta_f32 = 0.0f;
+
     const sycl::half alpha_f16 = 1.0f;
     const sycl::half beta_f16 = 0.0f;
 
-    const float alpha_f32 = 1.0f;
-    const float beta_f32 = 0.0f;
-
     const void * alpha = &alpha_f32;
     const void * beta = &beta_f32;
+    if (no_mixed_dtypes) {
+        alpha = &alpha_f16;
+        beta = &beta_f16;
+    }
 
     // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
-    // oneMKL open source supports half, half, float, float: datatypes
+    // when oneMKL open source supports half, half, float, float: datatypes
 
     dst_t = (char *) dst_ddf;
+    if (no_mixed_dtypes) {
+        dst_t = (char *) dst_f16.alloc(ne_dst);
+
+        nbd2 /= sizeof(float) / sizeof(sycl::half);
+        nbd3 /= sizeof(float) / sizeof(sycl::half);
+    }
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -15379,6 +15063,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     }
 #endif
 
+    if (no_mixed_dtypes) {
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+    }
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
```
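On the CUDA and HIP backends the batched GEMM cannot mix half inputs with a float output, so when `no_mixed_dtypes` is set the result is computed entirely in half into `dst_f16` and widened back to fp32 afterwards via `to_fp32_sycl`. A hedged sketch of that widening epilogue (the function below is ours, not the package's `to_fp32_sycl`):

```cpp
#include <sycl/sycl.hpp>

// Widen a half-precision GEMM result to float, mirroring the epilogue the
// hunk above adds for backends without mixed-dtype GEMM support.
void widen_epilogue_demo(sycl::queue &q, const sycl::half *dst_f16,
                         float *dst_f32, size_t n, bool no_mixed_dtypes) {
    if (no_mixed_dtypes) {
        q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
            dst_f32[i] = static_cast<float>(dst_f16[i]);
        }).wait();
    }
}
```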
```diff
@@ -16278,6 +15966,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 }
 
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
     for(int i=0;i<max_len;i++) id_list[i] = -1;
 
     if (!g_sycl_gpu_mgr) {
@@ -16312,6 +16001,7 @@ catch (sycl::exception const &exc) {
 
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                                          size_t description_size) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
     dpct::device_info prop;
     int device_id = g_sycl_gpu_mgr->gpus[device];
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -16326,6 +16016,7 @@ catch (sycl::exception const &exc) {
 
 GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
                                                    size_t *total) try {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
     ggml_sycl_set_device(device);
 
     /*
@@ -16677,6 +16368,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
 };
 
 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
+
     if (device_index>=g_device_count or device_index<0) {
         printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
                device_index, g_device_count-1);
@@ -17046,6 +16739,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
+    ggml_init_sycl();
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -17117,6 +16812,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
 }
 
 ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
         /* .iface    = */ {
             /* .get_name        = */ ggml_backend_sycl_host_buffer_type_name,
@@ -17231,7 +16927,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
            continue;
        }
 #ifndef NDEBUG
```
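The graph-compute loop now also skips empty tensors, not just layout-only ops. Restated as a predicate for clarity (the helper name is ours; `ggml_is_empty` is the real API added in this release's updated `ggml.h`):

```cpp
#include "ggml.h"

// True for nodes that produce no work: zero-element tensors and pure
// view/layout operations.
static bool demo_skip_node(const struct ggml_tensor *node) {
    return ggml_is_empty(node)            // the newly added guard
        || node->op == GGML_OP_RESHAPE
        || node->op == GGML_OP_TRANSPOSE
        || node->op == GGML_OP_VIEW
        || node->op == GGML_OP_PERMUTE
        || node->op == GGML_OP_NONE;
}
```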
```diff
@@ -17379,6 +17075,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    const int min_batch_size = 32;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    GGML_UNUSED(backend);
+}
+
+
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_name,
     /* .free                    = */ ggml_backend_sycl_free,
```
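The new `offload_op` hook tells the scheduler when shipping an op to the GPU pays off: `op->ne[1]` is the batch (column) dimension, so only ops seeing a batch of at least 32 are offloaded, and `GGML_OP_GET_ROWS` is excluded. Restated outside the backend vtable (the wrapper name is ours):

```cpp
#include "ggml.h"

static bool demo_should_offload(const struct ggml_tensor *op) {
    const int min_batch_size = 32;
    // ne[1] is the second dimension -- the batch/column count for mat-mul
    // style ops; small batches are not worth the transfer cost.
    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
}
```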
```diff
@@ -17392,7 +17095,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
-    /* .offload_op              = */ NULL,
+    /* .offload_op              = */ ggml_backend_sycl_offload_op,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
@@ -17406,7 +17109,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
 }
 
 GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
-
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
+    ggml_init_sycl();
 
     check_allow_gpu_index(device);
 
@@ -17432,6 +17136,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
 }
 
 GGML_CALL int ggml_backend_sycl_get_device_count() {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
     if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
     return g_sycl_gpu_mgr->get_gpu_count();
 }
@@ -17444,16 +17149,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
 }
 
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
     return g_sycl_gpu_mgr->gpus[device_index];
 }
 
 GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
-
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
     fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+
     if (g_sycl_gpu_mgr) {
         delete g_sycl_gpu_mgr;
     }
@@ -17464,6 +17174,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
 }
 
 GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
+
     if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
         return;
     }
```