llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +27 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +14 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +81 -20
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
- data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
- data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
- data/vendor/tmp/llama.cpp/ggml.c +141 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -12
- data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
- data/vendor/tmp/llama.cpp/llama.h +145 -29
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
- data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
- data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
- data/vendor/tmp/llama.cpp/unicode.h +2 -0
- metadata +5 -3
@@ -740,11 +740,7 @@ namespace dpct
|
|
740
740
|
|
741
741
|
sycl::queue &default_queue()
|
742
742
|
{
|
743
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
744
|
-
return out_of_order_queue();
|
745
|
-
#else
|
746
743
|
return in_order_queue();
|
747
|
-
#endif // DPCT_USM_LEVEL_NONE
|
748
744
|
}
|
749
745
|
|
750
746
|
void queues_wait_and_throw()
|
@@ -763,11 +759,7 @@ namespace dpct
|
|
763
759
|
|
764
760
|
sycl::queue *create_queue(bool enable_exception_handler = false)
|
765
761
|
{
|
766
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
767
|
-
return create_out_of_order_queue(enable_exception_handler);
|
768
|
-
#else
|
769
762
|
return create_in_order_queue(enable_exception_handler);
|
770
|
-
#endif // DPCT_USM_LEVEL_NONE
|
771
763
|
}
|
772
764
|
|
773
765
|
sycl::queue *create_queue(sycl::context context, sycl::device device,
|
@@ -1075,11 +1067,6 @@ namespace dpct
|
|
1075
1067
|
static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
|
1076
1068
|
const void *ptr)
|
1077
1069
|
{
|
1078
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
1079
|
-
return mem_mgr::instance().is_device_ptr(ptr)
|
1080
|
-
? pointer_access_attribute::device_only
|
1081
|
-
: pointer_access_attribute::host_only;
|
1082
|
-
#else
|
1083
1070
|
switch (sycl::get_pointer_type(ptr, q.get_context()))
|
1084
1071
|
{
|
1085
1072
|
case sycl::usm::alloc::unknown:
|
@@ -1090,7 +1077,6 @@ namespace dpct
|
|
1090
1077
|
case sycl::usm::alloc::host:
|
1091
1078
|
return pointer_access_attribute::host_device;
|
1092
1079
|
}
|
1093
|
-
#endif
|
1094
1080
|
}
|
1095
1081
|
|
1096
1082
|
template <typename ArgT>
|
@@ -1273,11 +1259,7 @@ namespace dpct
|
|
1273
1259
|
|
1274
1260
|
static inline void *dpct_malloc(size_t size, sycl::queue &q)
|
1275
1261
|
{
|
1276
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
1277
|
-
return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
|
1278
|
-
#else
|
1279
1262
|
return sycl::malloc_device(size, q.get_device(), q.get_context());
|
1280
|
-
#endif // DPCT_USM_LEVEL_NONE
|
1281
1263
|
}
|
1282
1264
|
|
1283
1265
|
#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
|
@@ -1301,25 +1283,7 @@ namespace dpct
|
|
1301
1283
|
static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
|
1302
1284
|
valueT value, size_t size)
|
1303
1285
|
{
|
1304
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
1305
|
-
auto &mm = mem_mgr::instance();
|
1306
|
-
assert(mm.is_device_ptr(dev_ptr));
|
1307
|
-
auto alloc = mm.translate_ptr(dev_ptr);
|
1308
|
-
size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
|
1309
|
-
|
1310
|
-
return q.submit([&](sycl::handler &cgh)
|
1311
|
-
{
|
1312
|
-
auto r = sycl::range<1>(size);
|
1313
|
-
auto o = sycl::id<1>(offset);
|
1314
|
-
auto new_buffer = alloc.buffer.reinterpret<valueT>(
|
1315
|
-
sycl::range<1>(alloc.size / sizeof(valueT)));
|
1316
|
-
sycl::accessor<valueT, 1, sycl::access_mode::write,
|
1317
|
-
sycl::access::target::device>
|
1318
|
-
acc(new_buffer, cgh, r, o);
|
1319
|
-
cgh.fill(acc, value); });
|
1320
|
-
#else
|
1321
1286
|
return q.fill(dev_ptr, value, size);
|
1322
|
-
#endif // DPCT_USM_LEVEL_NONE
|
1323
1287
|
}
|
1324
1288
|
|
1325
1289
|
/**
|
@@ -1413,72 +1377,8 @@ namespace dpct
|
|
1413
1377
|
{
|
1414
1378
|
if (!size)
|
1415
1379
|
return sycl::event{};
|
1416
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
1417
|
-
auto &mm = mem_mgr::instance();
|
1418
|
-
auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
|
1419
|
-
|
1420
|
-
switch (real_direction)
|
1421
|
-
{
|
1422
|
-
case host_to_host:
|
1423
|
-
return q.submit([&](sycl::handler &cgh)
|
1424
|
-
{
|
1425
|
-
cgh.depends_on(dep_events);
|
1426
|
-
cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
|
1427
|
-
case host_to_device:
|
1428
|
-
{
|
1429
|
-
auto alloc = mm.translate_ptr(to_ptr);
|
1430
|
-
size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
|
1431
|
-
return q.submit([&](sycl::handler &cgh)
|
1432
|
-
{
|
1433
|
-
cgh.depends_on(dep_events);
|
1434
|
-
auto r = sycl::range<1>(size);
|
1435
|
-
auto o = sycl::id<1>(offset);
|
1436
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::write,
|
1437
|
-
sycl::access::target::device>
|
1438
|
-
acc(alloc.buffer, cgh, r, o);
|
1439
|
-
cgh.copy(from_ptr, acc); });
|
1440
|
-
}
|
1441
|
-
case device_to_host:
|
1442
|
-
{
|
1443
|
-
auto alloc = mm.translate_ptr(from_ptr);
|
1444
|
-
size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
|
1445
|
-
return q.submit([&](sycl::handler &cgh)
|
1446
|
-
{
|
1447
|
-
cgh.depends_on(dep_events);
|
1448
|
-
auto r = sycl::range<1>(size);
|
1449
|
-
auto o = sycl::id<1>(offset);
|
1450
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::read,
|
1451
|
-
sycl::access::target::device>
|
1452
|
-
acc(alloc.buffer, cgh, r, o);
|
1453
|
-
cgh.copy(acc, to_ptr); });
|
1454
|
-
}
|
1455
|
-
case device_to_device:
|
1456
|
-
{
|
1457
|
-
auto to_alloc = mm.translate_ptr(to_ptr);
|
1458
|
-
auto from_alloc = mm.translate_ptr(from_ptr);
|
1459
|
-
size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
|
1460
|
-
size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
|
1461
|
-
return q.submit([&](sycl::handler &cgh)
|
1462
|
-
{
|
1463
|
-
cgh.depends_on(dep_events);
|
1464
|
-
auto r = sycl::range<1>(size);
|
1465
|
-
auto to_o = sycl::id<1>(to_offset);
|
1466
|
-
auto from_o = sycl::id<1>(from_offset);
|
1467
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::write,
|
1468
|
-
sycl::access::target::device>
|
1469
|
-
to_acc(to_alloc.buffer, cgh, r, to_o);
|
1470
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::read,
|
1471
|
-
sycl::access::target::device>
|
1472
|
-
from_acc(from_alloc.buffer, cgh, r, from_o);
|
1473
|
-
cgh.copy(from_acc, to_acc); });
|
1474
|
-
}
|
1475
|
-
default:
|
1476
|
-
throw std::runtime_error("dpct_memcpy: invalid direction value");
|
1477
|
-
}
|
1478
|
-
#else
|
1479
1380
|
return q.memcpy(to_ptr, from_ptr, size, dep_events);
|
1480
1381
|
GGML_UNUSED(direction);
|
1481
|
-
#endif // DPCT_USM_LEVEL_NONE
|
1482
1382
|
}
|
1483
1383
|
|
1484
1384
|
// Get actual copy range and make sure it will not exceed range.
|
@@ -1618,45 +1518,15 @@ namespace dpct
|
|
1618
1518
|
break;
|
1619
1519
|
}
|
1620
1520
|
case device_to_device:
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
cgh.depends_on(dep_events);
|
1631
|
-
auto to_o = sycl::id<1>(to_offset);
|
1632
|
-
auto from_o = sycl::id<1>(from_offset);
|
1633
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::write,
|
1634
|
-
sycl::access::target::device>
|
1635
|
-
to_acc(to_alloc.buffer, cgh,
|
1636
|
-
get_copy_range(size, to_slice, to_range.get(0)), to_o);
|
1637
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::read,
|
1638
|
-
sycl::access::target::device>
|
1639
|
-
from_acc(from_alloc.buffer, cgh,
|
1640
|
-
get_copy_range(size, from_slice, from_range.get(0)), from_o);
|
1641
|
-
cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
|
1642
|
-
size,
|
1643
|
-
[=](sycl::id<3> id) {
|
1644
|
-
to_acc[get_offset(id, to_slice, to_range.get(0))] =
|
1645
|
-
from_acc[get_offset(id, from_slice, from_range.get(0))];
|
1646
|
-
}); }));
|
1647
|
-
}
|
1648
|
-
#else
|
1649
|
-
event_list.push_back(q.submit([&](sycl::handler &cgh)
|
1650
|
-
{
|
1651
|
-
cgh.depends_on(dep_events);
|
1652
|
-
cgh.parallel_for<class dpct_memcpy_3d_detail>(
|
1653
|
-
size,
|
1654
|
-
[=](sycl::id<3> id) {
|
1655
|
-
to_surface[get_offset(id, to_slice, to_range.get(0))] =
|
1656
|
-
from_surface[get_offset(id, from_slice, from_range.get(0))];
|
1657
|
-
}); }));
|
1658
|
-
#endif
|
1659
|
-
break;
|
1521
|
+
event_list.push_back(q.submit([&](sycl::handler &cgh){
|
1522
|
+
cgh.depends_on(dep_events);
|
1523
|
+
cgh.parallel_for<class dpct_memcpy_3d_detail>(
|
1524
|
+
size,
|
1525
|
+
[=](sycl::id<3> id) {
|
1526
|
+
to_surface[get_offset(id, to_slice, to_range.get(0))] =
|
1527
|
+
from_surface[get_offset(id, from_slice, from_range.get(0))];
|
1528
|
+
}); }));
|
1529
|
+
break;
|
1660
1530
|
default:
|
1661
1531
|
throw std::runtime_error("dpct_memcpy: invalid direction value");
|
1662
1532
|
}
|
@@ -1754,11 +1624,7 @@ namespace dpct
|
|
1754
1624
|
{
|
1755
1625
|
if (ptr)
|
1756
1626
|
{
|
1757
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
1758
|
-
detail::mem_mgr::instance().mem_free(ptr);
|
1759
|
-
#else
|
1760
1627
|
sycl::free(ptr, q.get_context());
|
1761
|
-
#endif // DPCT_USM_LEVEL_NONE
|
1762
1628
|
}
|
1763
1629
|
}
|
1764
1630
|
|
@@ -1766,11 +1632,7 @@ namespace dpct
|
|
1766
1632
|
inline auto get_memory(const void *x)
|
1767
1633
|
{
|
1768
1634
|
T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
|
1769
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
1770
|
-
return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
|
1771
|
-
#else
|
1772
1635
|
return new_x;
|
1773
|
-
#endif
|
1774
1636
|
}
|
1775
1637
|
|
1776
1638
|
template <typename T>
|
@@ -1802,24 +1664,6 @@ namespace dpct
|
|
1802
1664
|
const void *alpha, const void *a, int lda, const void *b,
|
1803
1665
|
int ldb, const void *beta, void *c, int ldc)
|
1804
1666
|
{
|
1805
|
-
#ifndef __INTEL_MKL__
|
1806
|
-
GGML_UNUSED(q);
|
1807
|
-
GGML_UNUSED(a_trans);
|
1808
|
-
GGML_UNUSED(b_trans);
|
1809
|
-
GGML_UNUSED(m);
|
1810
|
-
GGML_UNUSED(n);
|
1811
|
-
GGML_UNUSED(k);
|
1812
|
-
GGML_UNUSED(alpha);
|
1813
|
-
GGML_UNUSED(a);
|
1814
|
-
GGML_UNUSED(lda);
|
1815
|
-
GGML_UNUSED(b);
|
1816
|
-
GGML_UNUSED(ldb);
|
1817
|
-
GGML_UNUSED(beta);
|
1818
|
-
GGML_UNUSED(c);
|
1819
|
-
GGML_UNUSED(ldc);
|
1820
|
-
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
|
1821
|
-
"Project does not support this API.");
|
1822
|
-
#else
|
1823
1667
|
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
|
1824
1668
|
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
|
1825
1669
|
auto data_a = get_memory<const Ta>(a);
|
@@ -1828,7 +1672,6 @@ namespace dpct
|
|
1828
1672
|
oneapi::mkl::blas::column_major::gemm(
|
1829
1673
|
q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
|
1830
1674
|
data_b, ldb, beta_value, data_c, ldc);
|
1831
|
-
#endif
|
1832
1675
|
}
|
1833
1676
|
|
1834
1677
|
template <typename VecT, class BinaryOperation, class = void>
|
@@ -2222,72 +2065,8 @@ namespace dpct
|
|
2222
2065
|
{
|
2223
2066
|
if (!size)
|
2224
2067
|
return sycl::event{};
|
2225
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
2226
|
-
auto &mm = mem_mgr::instance();
|
2227
|
-
auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
|
2228
|
-
|
2229
|
-
switch (real_direction)
|
2230
|
-
{
|
2231
|
-
case host_to_host:
|
2232
|
-
return q.submit([&](sycl::handler &cgh)
|
2233
|
-
{
|
2234
|
-
cgh.depends_on(dep_events);
|
2235
|
-
cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
|
2236
|
-
case host_to_device:
|
2237
|
-
{
|
2238
|
-
auto alloc = mm.translate_ptr(to_ptr);
|
2239
|
-
size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
|
2240
|
-
return q.submit([&](sycl::handler &cgh)
|
2241
|
-
{
|
2242
|
-
cgh.depends_on(dep_events);
|
2243
|
-
auto r = sycl::range<1>(size);
|
2244
|
-
auto o = sycl::id<1>(offset);
|
2245
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::write,
|
2246
|
-
sycl::access::target::device>
|
2247
|
-
acc(alloc.buffer, cgh, r, o);
|
2248
|
-
cgh.copy(from_ptr, acc); });
|
2249
|
-
}
|
2250
|
-
case device_to_host:
|
2251
|
-
{
|
2252
|
-
auto alloc = mm.translate_ptr(from_ptr);
|
2253
|
-
size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
|
2254
|
-
return q.submit([&](sycl::handler &cgh)
|
2255
|
-
{
|
2256
|
-
cgh.depends_on(dep_events);
|
2257
|
-
auto r = sycl::range<1>(size);
|
2258
|
-
auto o = sycl::id<1>(offset);
|
2259
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::read,
|
2260
|
-
sycl::access::target::device>
|
2261
|
-
acc(alloc.buffer, cgh, r, o);
|
2262
|
-
cgh.copy(acc, to_ptr); });
|
2263
|
-
}
|
2264
|
-
case device_to_device:
|
2265
|
-
{
|
2266
|
-
auto to_alloc = mm.translate_ptr(to_ptr);
|
2267
|
-
auto from_alloc = mm.translate_ptr(from_ptr);
|
2268
|
-
size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
|
2269
|
-
size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
|
2270
|
-
return q.submit([&](sycl::handler &cgh)
|
2271
|
-
{
|
2272
|
-
cgh.depends_on(dep_events);
|
2273
|
-
auto r = sycl::range<1>(size);
|
2274
|
-
auto to_o = sycl::id<1>(to_offset);
|
2275
|
-
auto from_o = sycl::id<1>(from_offset);
|
2276
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::write,
|
2277
|
-
sycl::access::target::device>
|
2278
|
-
to_acc(to_alloc.buffer, cgh, r, to_o);
|
2279
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::read,
|
2280
|
-
sycl::access::target::device>
|
2281
|
-
from_acc(from_alloc.buffer, cgh, r, from_o);
|
2282
|
-
cgh.copy(from_acc, to_acc); });
|
2283
|
-
}
|
2284
|
-
default:
|
2285
|
-
throw std::runtime_error("dpct_memcpy: invalid direction value");
|
2286
|
-
}
|
2287
|
-
#else
|
2288
2068
|
return q.memcpy(to_ptr, from_ptr, size, dep_events);
|
2289
2069
|
GGML_UNUSED(direction);
|
2290
|
-
#endif // DPCT_USM_LEVEL_NONE
|
2291
2070
|
}
|
2292
2071
|
|
2293
2072
|
// Get actual copy range and make sure it will not exceed range.
|
@@ -2427,34 +2206,6 @@ namespace dpct
|
|
2427
2206
|
break;
|
2428
2207
|
}
|
2429
2208
|
case device_to_device:
|
2430
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
2431
|
-
{
|
2432
|
-
auto &mm = mem_mgr::instance();
|
2433
|
-
auto to_alloc = mm.translate_ptr(to_surface);
|
2434
|
-
auto from_alloc = mm.translate_ptr(from_surface);
|
2435
|
-
size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
|
2436
|
-
size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
|
2437
|
-
event_list.push_back(q.submit([&](sycl::handler &cgh)
|
2438
|
-
{
|
2439
|
-
cgh.depends_on(dep_events);
|
2440
|
-
auto to_o = sycl::id<1>(to_offset);
|
2441
|
-
auto from_o = sycl::id<1>(from_offset);
|
2442
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::write,
|
2443
|
-
sycl::access::target::device>
|
2444
|
-
to_acc(to_alloc.buffer, cgh,
|
2445
|
-
get_copy_range(size, to_slice, to_range.get(0)), to_o);
|
2446
|
-
sycl::accessor<byte_t, 1, sycl::access_mode::read,
|
2447
|
-
sycl::access::target::device>
|
2448
|
-
from_acc(from_alloc.buffer, cgh,
|
2449
|
-
get_copy_range(size, from_slice, from_range.get(0)), from_o);
|
2450
|
-
cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
|
2451
|
-
size,
|
2452
|
-
[=](sycl::id<3> id) {
|
2453
|
-
to_acc[get_offset(id, to_slice, to_range.get(0))] =
|
2454
|
-
from_acc[get_offset(id, from_slice, from_range.get(0))];
|
2455
|
-
}); }));
|
2456
|
-
}
|
2457
|
-
#else
|
2458
2209
|
event_list.push_back(q.submit([&](sycl::handler &cgh)
|
2459
2210
|
{
|
2460
2211
|
cgh.depends_on(dep_events);
|
@@ -2464,7 +2215,6 @@ namespace dpct
|
|
2464
2215
|
to_surface[get_offset(id, to_slice, to_range.get(0))] =
|
2465
2216
|
from_surface[get_offset(id, from_slice, from_range.get(0))];
|
2466
2217
|
}); }));
|
2467
|
-
#endif
|
2468
2218
|
break;
|
2469
2219
|
default:
|
2470
2220
|
throw std::runtime_error("dpct_memcpy: invalid direction value");
|
@@ -2561,6 +2311,7 @@ namespace dpct
|
|
2561
2311
|
lda, b, ldb, beta, c, ldc);
|
2562
2312
|
break;
|
2563
2313
|
}
|
2314
|
+
#ifdef __INTEL_MKL__
|
2564
2315
|
case detail::get_type_combination_id(
|
2565
2316
|
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
2566
2317
|
library_data_t::real_float, library_data_t::real_float):
|
@@ -2622,6 +2373,7 @@ namespace dpct
|
|
2622
2373
|
q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
|
2623
2374
|
break;
|
2624
2375
|
}
|
2376
|
+
#endif // __INTEL_MKL__
|
2625
2377
|
default:
|
2626
2378
|
throw std::runtime_error("the combination of data type is unsupported");
|
2627
2379
|
}
|
@@ -2655,9 +2407,6 @@ namespace dpct
|
|
2655
2407
|
void *c[], library_data_t c_type, int ldc,
|
2656
2408
|
int batch_size, library_data_t scaling_type)
|
2657
2409
|
{
|
2658
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
2659
|
-
throw std::runtime_error("this API is unsupported when USM level is none");
|
2660
|
-
#else
|
2661
2410
|
if (scaling_type == library_data_t::real_float &&
|
2662
2411
|
c_type == library_data_t::complex_float)
|
2663
2412
|
{
|
@@ -2792,7 +2541,6 @@ namespace dpct
|
|
2792
2541
|
default:
|
2793
2542
|
throw std::runtime_error("the combination of data type is unsupported");
|
2794
2543
|
}
|
2795
|
-
#endif
|
2796
2544
|
}
|
2797
2545
|
|
2798
2546
|
/// Computes a batch of matrix-matrix product with general matrices.
|
@@ -3131,24 +2879,9 @@ namespace dpct
|
|
3131
2879
|
template <size_t D = Dimension>
|
3132
2880
|
typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
|
3133
2881
|
init();
|
3134
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
3135
|
-
return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
|
3136
|
-
_device_ptr)
|
3137
|
-
.template get_access<sycl::access_mode::read_write>()[index];
|
3138
|
-
#else
|
3139
2882
|
return _device_ptr[index];
|
3140
|
-
#endif // DPCT_USM_LEVEL_NONE
|
3141
2883
|
}
|
3142
2884
|
|
3143
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
3144
|
-
/// Get sycl::accessor for the device memory object when usm is not used.
|
3145
|
-
accessor_t get_access(sycl::handler &cgh) {
|
3146
|
-
return get_buffer(_device_ptr)
|
3147
|
-
.template reinterpret<T, Dimension>(_range)
|
3148
|
-
.template get_access<detail::memory_traits<Memory, T>::mode,
|
3149
|
-
detail::memory_traits<Memory, T>::target>(cgh);
|
3150
|
-
}
|
3151
|
-
#else
|
3152
2885
|
/// Get dpct::accessor with dimension info for the device memory object
|
3153
2886
|
/// when usm is used and dimension is greater than 1.
|
3154
2887
|
template <size_t D = Dimension>
|
@@ -3156,7 +2889,6 @@ namespace dpct
|
|
3156
2889
|
get_access(sycl::handler &cgh) {
|
3157
2890
|
return dpct_accessor_t((T *)_device_ptr, _range);
|
3158
2891
|
}
|
3159
|
-
#endif // DPCT_USM_LEVEL_NONE
|
3160
2892
|
|
3161
2893
|
private:
|
3162
2894
|
device_memory(value_t *memory_ptr, size_t size)
|
@@ -3201,15 +2933,6 @@ namespace dpct
|
|
3201
2933
|
|
3202
2934
|
/// Default constructor
|
3203
2935
|
device_memory() : base(1) {}
|
3204
|
-
|
3205
|
-
#ifdef DPCT_USM_LEVEL_NONE
|
3206
|
-
/// Get sycl::accessor for the device memory object when usm is not used.
|
3207
|
-
accessor_t get_access(sycl::handler &cgh) {
|
3208
|
-
auto buf = get_buffer(base::get_ptr())
|
3209
|
-
.template reinterpret<T, 1>(sycl::range<1>(1));
|
3210
|
-
return accessor_t(buf, cgh);
|
3211
|
-
}
|
3212
|
-
#endif // DPCT_USM_LEVEL_NONE
|
3213
2936
|
};
|
3214
2937
|
} // namespace detail
|
3215
2938
|
|
@@ -3228,7 +2951,7 @@ namespace dpct
|
|
3228
2951
|
#include "ggml-common.h"
|
3229
2952
|
|
3230
2953
|
static int g_ggml_sycl_debug=0;
|
3231
|
-
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug)
|
2954
|
+
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
|
3232
2955
|
|
3233
2956
|
#define CHECK_TRY_ERROR(expr) \
|
3234
2957
|
[&]() { \
|
@@ -3315,6 +3038,10 @@ typedef float dfloat; // dequantize float
|
|
3315
3038
|
typedef sycl::float2 dfloat2;
|
3316
3039
|
#endif //GGML_SYCL_F16
|
3317
3040
|
|
3041
|
+
#define MMVQ_MAX_BATCH_SIZE 8
|
3042
|
+
|
3043
|
+
static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
3044
|
+
|
3318
3045
|
bool ggml_sycl_loaded(void);
|
3319
3046
|
void * ggml_sycl_host_malloc(size_t size);
|
3320
3047
|
void ggml_sycl_host_free(void * ptr);
|
@@ -4750,6 +4477,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
|
4750
4477
|
|
4751
4478
|
}
|
4752
4479
|
|
4480
|
+
template <typename dst_t>
|
4481
|
+
__dpct_inline__ static void
|
4482
|
+
dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
4483
|
+
const sycl::nd_item<3> &item_ct1) {
|
4484
|
+
|
4485
|
+
const int i = item_ct1.get_group(2);
|
4486
|
+
const block_iq2_s * x = (const block_iq2_s *) vx;
|
4487
|
+
|
4488
|
+
const int tid = item_ct1.get_local_id(2);
|
4489
|
+
#if QK_K == 256
|
4490
|
+
const int il = tid/8; // 0...3
|
4491
|
+
const int ib = tid%8; // 0...7
|
4492
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4493
|
+
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
4494
|
+
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
4495
|
+
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
4496
|
+
#pragma unroll
|
4497
|
+
for (int j = 0; j < 8; ++j)
|
4498
|
+
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
4499
|
+
#else
|
4500
|
+
assert(false);
|
4501
|
+
|
4502
|
+
#endif
|
4503
|
+
|
4504
|
+
}
|
4505
|
+
|
4753
4506
|
template<typename dst_t>
|
4754
4507
|
static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
4755
4508
|
const sycl::nd_item<3> &item_ct1,
|
@@ -4782,26 +4535,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
|
4782
4535
|
|
4783
4536
|
}
|
4784
4537
|
|
4785
|
-
template<typename dst_t>
|
4786
|
-
static void
|
4787
|
-
|
4788
|
-
|
4789
|
-
|
4790
|
-
const uint8_t *kmask_iq2xs) {
|
4538
|
+
template <typename dst_t>
|
4539
|
+
__dpct_inline__ static void
|
4540
|
+
dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
4541
|
+
const sycl::nd_item<3> &item_ct1,
|
4542
|
+
const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
|
4791
4543
|
|
4792
4544
|
const int i = item_ct1.get_group(2);
|
4793
|
-
const block_iq3_s * x = (const block_iq3_s
|
4545
|
+
const block_iq3_s * x = (const block_iq3_s *) vx;
|
4794
4546
|
|
4795
4547
|
const int tid = item_ct1.get_local_id(2);
|
4796
4548
|
#if QK_K == 256
|
4797
4549
|
const int il = tid/8; // 0...3
|
4798
4550
|
const int ib = tid%8; // 0...7
|
4799
4551
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4800
|
-
const uint8_t
|
4801
|
-
const uint8_t
|
4802
|
-
const uint8_t
|
4552
|
+
const uint8_t * qs = x[i].qs + 8*ib;
|
4553
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
|
4554
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
|
4803
4555
|
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
|
4804
4556
|
const uint8_t signs = x[i].signs[4*ib + il];
|
4557
|
+
#pragma unroll
|
4805
4558
|
for (int j = 0; j < 4; ++j) {
|
4806
4559
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
4807
4560
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
@@ -4812,12 +4565,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr
|
|
4812
4565
|
|
4813
4566
|
}
|
4814
4567
|
|
4815
|
-
template<typename dst_t>
|
4816
|
-
static void
|
4817
|
-
|
4818
|
-
|
4819
|
-
|
4820
|
-
|
4568
|
+
template <typename dst_t>
|
4569
|
+
__dpct_inline__ static void
|
4570
|
+
dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
4571
|
+
const sycl::nd_item<3> &item_ct1,
|
4572
|
+
const uint32_t *iq1s_grid_gpu) {
|
4573
|
+
|
4821
4574
|
const int i = item_ct1.get_group(2);
|
4822
4575
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
4823
4576
|
|
@@ -4826,14 +4579,49 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
|
|
4826
4579
|
const int il = tid/8; // 0...3
|
4827
4580
|
const int ib = tid%8; // 0...7
|
4828
4581
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4829
|
-
const
|
4830
|
-
const
|
4831
|
-
const
|
4832
|
-
|
4833
|
-
|
4834
|
-
|
4835
|
-
|
4836
|
-
|
4582
|
+
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
|
4583
|
+
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
|
4584
|
+
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
4585
|
+
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
|
4586
|
+
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
4587
|
+
grid32[0] &= 0x0f0f0f0f;
|
4588
|
+
#pragma unroll
|
4589
|
+
for (int j = 0; j < 8; ++j) {
|
4590
|
+
y[j] = d * (q[j] + delta);
|
4591
|
+
}
|
4592
|
+
#else
|
4593
|
+
assert(false);
|
4594
|
+
#endif
|
4595
|
+
|
4596
|
+
}
|
4597
|
+
|
4598
|
+
template <typename dst_t>
|
4599
|
+
__dpct_inline__ static void
|
4600
|
+
dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
4601
|
+
const sycl::nd_item<3> &item_ct1,
|
4602
|
+
const uint32_t *iq1s_grid_gpu) {
|
4603
|
+
|
4604
|
+
const int i = item_ct1.get_group(2);
|
4605
|
+
const block_iq1_m * x = (const block_iq1_m *) vx;
|
4606
|
+
|
4607
|
+
const int tid = item_ct1.get_local_id(2);
|
4608
|
+
#if QK_K == 256
|
4609
|
+
const int il = tid/8; // 0...3
|
4610
|
+
const int ib = tid%8; // 0...7
|
4611
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
4612
|
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
4613
|
+
iq1m_scale_t scale;
|
4614
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
4615
|
+
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
|
4616
|
+
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
|
4617
|
+
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
|
4618
|
+
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
4619
|
+
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
|
4620
|
+
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
4621
|
+
grid32[0] &= 0x0f0f0f0f;
|
4622
|
+
#pragma unroll
|
4623
|
+
for (int j = 0; j < 8; ++j) {
|
4624
|
+
y[j] = d * (q[j] + delta);
|
4837
4625
|
}
|
4838
4626
|
#else
|
4839
4627
|
assert(false);
|
@@ -4841,6 +4629,51 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
|
|
4841
4629
|
|
4842
4630
|
}
|
4843
4631
|
|
4632
|
+
template <typename dst_t>
|
4633
|
+
__dpct_inline__ static void
|
4634
|
+
dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
4635
|
+
const sycl::nd_item<3> &item_ct1) {
|
4636
|
+
|
4637
|
+
const int i = item_ct1.get_group(2);
|
4638
|
+
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
4639
|
+
|
4640
|
+
const int tid = item_ct1.get_local_id(2);
|
4641
|
+
const int il = tid/8; // 0...3
|
4642
|
+
const int ib = tid%8; // 0...7
|
4643
|
+
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
4644
|
+
const uint8_t * q4 = x[ib].qs + 4*il;
|
4645
|
+
const float d = (float)x[ib].d;
|
4646
|
+
#pragma unroll
|
4647
|
+
for (int j = 0; j < 4; ++j) {
|
4648
|
+
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
4649
|
+
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
4650
|
+
}
|
4651
|
+
|
4652
|
+
}
|
4653
|
+
|
4654
|
+
|
4655
|
+
template <typename dst_t>
|
4656
|
+
__dpct_inline__ static void
|
4657
|
+
dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
4658
|
+
const sycl::nd_item<3> &item_ct1) {
|
4659
|
+
const int i = item_ct1.get_group(2);
|
4660
|
+
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
4661
|
+
|
4662
|
+
const int tid = item_ct1.get_local_id(2);
|
4663
|
+
const int il = tid/8; // 0...3
|
4664
|
+
const int ib = tid%8; // 0...7
|
4665
|
+
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
4666
|
+
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
4667
|
+
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
4668
|
+
#pragma unroll
|
4669
|
+
for (int j = 0; j < 4; ++j) {
|
4670
|
+
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
4671
|
+
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
4672
|
+
}
|
4673
|
+
}
|
4674
|
+
|
4675
|
+
|
4676
|
+
|
4844
4677
|
/*
|
4845
4678
|
DPCT1110:4: The total declared local variable size in device function
|
4846
4679
|
dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
|
@@ -7647,6 +7480,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
|
7647
7480
|
#endif
|
7648
7481
|
}
|
7649
7482
|
|
7483
|
+
static __dpct_inline__ float
|
7484
|
+
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
7485
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7486
|
+
#if QK_K == 256
|
7487
|
+
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
7488
|
+
|
7489
|
+
const int ib32 = iqs;
|
7490
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
7491
|
+
const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
|
7492
|
+
const uint8_t ls1 = bq2->scales[ib32] & 0xf;
|
7493
|
+
const uint8_t ls2 = bq2->scales[ib32] >> 4;
|
7494
|
+
int sumi1 = 0;
|
7495
|
+
for (int l = 0; l < 2; ++l) {
|
7496
|
+
const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
|
7497
|
+
const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
7498
|
+
((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
|
7499
|
+
std::equal_to<>());
|
7500
|
+
const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
7501
|
+
((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
|
7502
|
+
std::equal_to<>());
|
7503
|
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
7504
|
+
grid[0] ^ signs0, signs0, std::minus<>());
|
7505
|
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
7506
|
+
grid[1] ^ signs1, signs1, std::minus<>());
|
7507
|
+
sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
|
7508
|
+
sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
|
7509
|
+
q8 += 8;
|
7510
|
+
}
|
7511
|
+
int sumi2 = 0;
|
7512
|
+
for (int l = 2; l < 4; ++l) {
|
7513
|
+
const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
|
7514
|
+
const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
7515
|
+
((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
|
7516
|
+
std::equal_to<>());
|
7517
|
+
const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
7518
|
+
((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
|
7519
|
+
std::equal_to<>());
|
7520
|
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
7521
|
+
grid[0] ^ signs0, signs0, std::minus<>());
|
7522
|
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
7523
|
+
grid[1] ^ signs1, signs1, std::minus<>());
|
7524
|
+
sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
|
7525
|
+
sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
|
7526
|
+
q8 += 8;
|
7527
|
+
}
|
7528
|
+
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
7529
|
+
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
7530
|
+
#else
|
7531
|
+
assert(false);
|
7532
|
+
#endif
|
7533
|
+
}
|
7534
|
+
|
7650
7535
|
static __dpct_inline__ float
|
7651
7536
|
vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
7652
7537
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
@@ -7689,10 +7574,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
|
7689
7574
|
|
7690
7575
|
static __dpct_inline__ float
|
7691
7576
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
7692
|
-
|
7693
|
-
|
7694
|
-
#if DPCT_COMPATIBILITY_TEMP >= \
|
7695
|
-
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
7577
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7578
|
+
const uint32_t *iq3s_grid) {
|
7696
7579
|
#if QK_K == 256
|
7697
7580
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
7698
7581
|
|
@@ -7704,9 +7587,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7704
7587
|
const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
|
7705
7588
|
const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
|
7706
7589
|
uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
|
7707
|
-
((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201,
|
7590
|
+
((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
|
7591
|
+
0x08040201, std::equal_to<>());
|
7708
7592
|
uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
|
7709
|
-
((bq2->signs[4*ib32+l] >>
|
7593
|
+
((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
|
7594
|
+
0x08040201, std::equal_to<>());
|
7710
7595
|
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
7711
7596
|
grid1[0] ^ signs0, signs0, std::minus<>());
|
7712
7597
|
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
@@ -7715,45 +7600,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
7715
7600
|
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
7716
7601
|
q8 += 8;
|
7717
7602
|
}
|
7718
|
-
const float d =
|
7603
|
+
const float d =
|
7604
|
+
(float)bq2->d *
|
7605
|
+
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
7606
|
+
bq8_1[ib32].ds[0];
|
7719
7607
|
return d * sumi;
|
7720
7608
|
#else
|
7721
7609
|
assert(false);
|
7722
|
-
return 0.f;
|
7723
|
-
#endif
|
7724
|
-
#else
|
7725
|
-
assert(false);
|
7726
|
-
return 0.f;
|
7727
7610
|
#endif
|
7728
7611
|
}
|
7729
7612
|
|
7730
7613
|
static __dpct_inline__ float
|
7731
7614
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
7732
|
-
|
7733
|
-
|
7615
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
7616
|
+
const uint32_t *iq1s_grid_gpu) {
|
7734
7617
|
#if QK_K == 256
|
7735
7618
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
7736
7619
|
|
7737
7620
|
const int ib32 = iqs;
|
7738
|
-
const uint8_t * qs = bq1->qs + 4*ib32;
|
7739
|
-
const int8_t * q8 = bq8_1[ib32].qs;
|
7740
7621
|
int sumi = 0;
|
7622
|
+
const int * q8 = (const int *)bq8_1[ib32].qs;
|
7741
7623
|
for (int l = 0; l < 4; ++l) {
|
7742
|
-
const
|
7743
|
-
|
7744
|
-
|
7745
|
-
|
7746
|
-
|
7747
|
-
|
7748
|
-
|
7749
|
-
|
7750
|
-
|
7624
|
+
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
|
7625
|
+
int grid0 = grid[0] & 0x0f0f0f0f;
|
7626
|
+
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
|
7627
|
+
sumi = dpct::dp4a(q8[2 * l + 1], grid1,
|
7628
|
+
dpct::dp4a(q8[2 * l + 0], grid0, sumi));
|
7629
|
+
}
|
7630
|
+
|
7631
|
+
const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
|
7632
|
+
const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
|
7633
|
+
const float d = d1q * bq8_1[ib32].ds[0];
|
7634
|
+
const float m = d1q * bq8_1[ib32].ds[1];
|
7635
|
+
return d * sumi + m * delta;
|
7636
|
+
#else
|
7637
|
+
assert(false);
|
7638
|
+
#endif
|
7639
|
+
}
|
7640
|
+
|
7641
|
+
static __dpct_inline__ float
|
7642
|
+
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
7643
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7644
|
+
#if QK_K == 256
|
7645
|
+
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
7646
|
+
|
7647
|
+
const int ib32 = iqs;
|
7648
|
+
int sumi[2] = {0, 0};
|
7649
|
+
float sumf[2] = {0.f, 0.f};
|
7650
|
+
|
7651
|
+
const int * q8 = (const int *)bq8_1[ib32].qs;
|
7652
|
+
for (int l = 0; l < 4; ++l) {
|
7653
|
+
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
|
7654
|
+
int grid0 = grid[0] & 0x0f0f0f0f;
|
7655
|
+
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
|
7656
|
+
sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
|
7657
|
+
dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
|
7658
|
+
const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
|
7659
|
+
const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
|
7660
|
+
dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
|
7661
|
+
sumf[l/2] += delta*sumy;
|
7662
|
+
}
|
7663
|
+
|
7664
|
+
iq1m_scale_t scale;
|
7665
|
+
const uint16_t * sc = (const uint16_t *)bq1->scales;
|
7666
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
7667
|
+
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
7668
|
+
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
7669
|
+
#else
|
7670
|
+
assert(false);
|
7671
|
+
#endif
|
7672
|
+
}
|
7673
|
+
|
7674
|
+
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
7675
|
+
const uint8_t *values,
|
7676
|
+
int &val1, int &val2) {
|
7677
|
+
|
7678
|
+
uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
|
7679
|
+
aux32 = q4 & 0x0f0f0f0f;
|
7680
|
+
uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
|
7681
|
+
uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
|
7682
|
+
val1 = v1 | (v2 << 16);
|
7683
|
+
aux32 = (q4 >> 4) & 0x0f0f0f0f;
|
7684
|
+
v1 = values[q8[0]] | (values[q8[1]] << 8);
|
7685
|
+
v2 = values[q8[2]] | (values[q8[3]] << 8);
|
7686
|
+
val2 = v1 | (v2 << 16);
|
7687
|
+
}
|
7688
|
+
|
7689
|
+
|
7690
|
+
static __dpct_inline__ float
|
7691
|
+
vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
|
7692
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7693
|
+
|
7694
|
+
const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
|
7695
|
+
|
7696
|
+
const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
|
7697
|
+
const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
|
7698
|
+
|
7699
|
+
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
7700
|
+
|
7701
|
+
int v1, v2;
|
7702
|
+
int sumi1 = 0, sumi2 = 0;
|
7703
|
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
7704
|
+
const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
|
7705
|
+
get_int_from_table_16(aux, values, v1, v2);
|
7706
|
+
sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
|
7707
|
+
sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
|
7751
7708
|
}
|
7752
|
-
|
7753
|
-
|
7709
|
+
|
7710
|
+
const float d = (float)bq->d * bq8_1->ds[0];
|
7711
|
+
return d * (sumi1 + sumi2);
|
7712
|
+
}
|
7713
|
+
|
7714
|
+
|
7715
|
+
static __dpct_inline__ float
|
7716
|
+
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
7717
|
+
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
7718
|
+
|
7719
|
+
#if QK_K == 256
|
7720
|
+
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
7721
|
+
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
7722
|
+
|
7723
|
+
// iqs is 0...7
|
7724
|
+
const int ib32 = iqs;
|
7725
|
+
const int32_t * q8 = (const int *)bq8_1[ib32].qs;
|
7726
|
+
const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
|
7727
|
+
const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
|
7728
|
+
const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
|
7729
|
+
int v1, v2;
|
7730
|
+
int sumi1 = 0, sumi2 = 0;
|
7731
|
+
for (int j = 0; j < 4; ++j) {
|
7732
|
+
get_int_from_table_16(q4[j], values, v1, v2);
|
7733
|
+
sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
|
7734
|
+
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
7735
|
+
}
|
7736
|
+
return d * (sumi1 + sumi2);
|
7754
7737
|
#else
|
7755
7738
|
assert(false);
|
7756
|
-
return 0.f;
|
7757
7739
|
#endif
|
7758
7740
|
}
|
7759
7741
|
|
@@ -8338,8 +8320,7 @@ template <bool need_check> static void
|
|
8338
8320
|
|
8339
8321
|
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
|
8340
8322
|
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
8341
|
-
const sycl::nd_item<3> &item_ct1
|
8342
|
-
const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
|
8323
|
+
const sycl::nd_item<3> &item_ct1) {
|
8343
8324
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8344
8325
|
item_ct1.get_local_id(1);
|
8345
8326
|
|
@@ -8383,10 +8364,203 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
|
8383
8364
|
}
|
8384
8365
|
|
8385
8366
|
template <int qk, int qi, typename block_q_t, int vdr>
|
8386
|
-
static void mul_mat_vec_q_iq2_xxs_q8_1(const void *
|
8387
|
-
|
8388
|
-
|
8389
|
-
|
8367
|
+
static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
|
8368
|
+
const void *__restrict__ vy,
|
8369
|
+
float *__restrict__ dst, const int ncols,
|
8370
|
+
const int nrows,
|
8371
|
+
const sycl::nd_item<3> &item_ct1) {
|
8372
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8373
|
+
item_ct1.get_local_id(1);
|
8374
|
+
|
8375
|
+
if (row >= nrows) {
|
8376
|
+
return;
|
8377
|
+
}
|
8378
|
+
|
8379
|
+
const int blocks_per_row = ncols / qk;
|
8380
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8381
|
+
|
8382
|
+
// partial sum for each thread
|
8383
|
+
float tmp = 0.0f;
|
8384
|
+
|
8385
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8386
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8387
|
+
|
8388
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8389
|
+
i += blocks_per_warp) {
|
8390
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8391
|
+
|
8392
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8393
|
+
|
8394
|
+
const int iqs =
|
8395
|
+
vdr *
|
8396
|
+
(item_ct1.get_local_id(2) %
|
8397
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8398
|
+
|
8399
|
+
tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
|
8400
|
+
}
|
8401
|
+
|
8402
|
+
// sum up partial sums and write back result
|
8403
|
+
#pragma unroll
|
8404
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8405
|
+
tmp +=
|
8406
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8407
|
+
}
|
8408
|
+
|
8409
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8410
|
+
dst[row] = tmp;
|
8411
|
+
}
|
8412
|
+
}
|
8413
|
+
|
8414
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8415
|
+
static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
|
8416
|
+
const void *__restrict__ vy,
|
8417
|
+
float *__restrict__ dst, const int ncols,
|
8418
|
+
const int nrows,
|
8419
|
+
const sycl::nd_item<3> &item_ct1) {
|
8420
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8421
|
+
item_ct1.get_local_id(1);
|
8422
|
+
|
8423
|
+
if (row >= nrows) {
|
8424
|
+
return;
|
8425
|
+
}
|
8426
|
+
|
8427
|
+
const int blocks_per_row = ncols / qk;
|
8428
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8429
|
+
|
8430
|
+
// partial sum for each thread
|
8431
|
+
float tmp = 0.0f;
|
8432
|
+
|
8433
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8434
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8435
|
+
|
8436
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8437
|
+
i += blocks_per_warp) {
|
8438
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8439
|
+
|
8440
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8441
|
+
|
8442
|
+
const int iqs =
|
8443
|
+
vdr *
|
8444
|
+
(item_ct1.get_local_id(2) %
|
8445
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8446
|
+
|
8447
|
+
tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
|
8448
|
+
}
|
8449
|
+
|
8450
|
+
// sum up partial sums and write back result
|
8451
|
+
#pragma unroll
|
8452
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8453
|
+
tmp +=
|
8454
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8455
|
+
}
|
8456
|
+
|
8457
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8458
|
+
dst[row] = tmp;
|
8459
|
+
}
|
8460
|
+
}
|
8461
|
+
|
8462
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8463
|
+
static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
|
8464
|
+
const void *__restrict__ vy,
|
8465
|
+
float *__restrict__ dst, const int ncols,
|
8466
|
+
const int nrows,
|
8467
|
+
const sycl::nd_item<3> &item_ct1) {
|
8468
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8469
|
+
item_ct1.get_local_id(1);
|
8470
|
+
|
8471
|
+
if (row >= nrows) {
|
8472
|
+
return;
|
8473
|
+
}
|
8474
|
+
|
8475
|
+
const int blocks_per_row = ncols / qk;
|
8476
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8477
|
+
|
8478
|
+
// partial sum for each thread
|
8479
|
+
float tmp = 0.0f;
|
8480
|
+
|
8481
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8482
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8483
|
+
|
8484
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8485
|
+
i += blocks_per_warp) {
|
8486
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8487
|
+
|
8488
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8489
|
+
|
8490
|
+
const int iqs =
|
8491
|
+
vdr *
|
8492
|
+
(item_ct1.get_local_id(2) %
|
8493
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8494
|
+
|
8495
|
+
tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
|
8496
|
+
}
|
8497
|
+
|
8498
|
+
// sum up partial sums and write back result
|
8499
|
+
#pragma unroll
|
8500
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8501
|
+
tmp +=
|
8502
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8503
|
+
}
|
8504
|
+
|
8505
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8506
|
+
dst[row] = tmp;
|
8507
|
+
}
|
8508
|
+
}
|
8509
|
+
|
8510
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8511
|
+
static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
|
8512
|
+
const void *__restrict__ vy,
|
8513
|
+
float *__restrict__ dst, const int ncols,
|
8514
|
+
const int nrows,
|
8515
|
+
const sycl::nd_item<3> &item_ct1) {
|
8516
|
+
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8517
|
+
item_ct1.get_local_id(1);
|
8518
|
+
|
8519
|
+
if (row >= nrows) {
|
8520
|
+
return;
|
8521
|
+
}
|
8522
|
+
|
8523
|
+
const int blocks_per_row = ncols / qk;
|
8524
|
+
const int blocks_per_warp = vdr * WARP_SIZE / qi;
|
8525
|
+
|
8526
|
+
// partial sum for each thread
|
8527
|
+
float tmp = 0.0f;
|
8528
|
+
|
8529
|
+
const block_q_t * x = (const block_q_t *) vx;
|
8530
|
+
const block_q8_1 * y = (const block_q8_1 *) vy;
|
8531
|
+
|
8532
|
+
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
|
8533
|
+
i += blocks_per_warp) {
|
8534
|
+
const int ibx = row*blocks_per_row + i; // x block index
|
8535
|
+
|
8536
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
8537
|
+
|
8538
|
+
const int iqs =
|
8539
|
+
vdr *
|
8540
|
+
(item_ct1.get_local_id(2) %
|
8541
|
+
(qi / vdr)); // x block quant index when casting the quants to int
|
8542
|
+
|
8543
|
+
tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
|
8544
|
+
}
|
8545
|
+
|
8546
|
+
// sum up partial sums and write back result
|
8547
|
+
#pragma unroll
|
8548
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
8549
|
+
tmp +=
|
8550
|
+
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
8551
|
+
}
|
8552
|
+
|
8553
|
+
if (item_ct1.get_local_id(2) == 0) {
|
8554
|
+
dst[row] = tmp;
|
8555
|
+
}
|
8556
|
+
}
|
8557
|
+
|
8558
|
+
template <int qk, int qi, typename block_q_t, int vdr>
|
8559
|
+
static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
|
8560
|
+
const void *__restrict__ vy,
|
8561
|
+
float *__restrict__ dst, const int ncols,
|
8562
|
+
const int nrows,
|
8563
|
+
const sycl::nd_item<3> &item_ct1) {
|
8390
8564
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8391
8565
|
item_ct1.get_local_id(1);
|
8392
8566
|
|
@@ -8414,7 +8588,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
|
|
8414
8588
|
(item_ct1.get_local_id(2) %
|
8415
8589
|
(qi / vdr)); // x block quant index when casting the quants to int
|
8416
8590
|
|
8417
|
-
tmp +=
|
8591
|
+
tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
|
8418
8592
|
}
|
8419
8593
|
|
8420
8594
|
// sum up partial sums and write back result
|
@@ -8430,9 +8604,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void
|
|
8430
8604
|
}
|
8431
8605
|
|
8432
8606
|
template <int qk, int qi, typename block_q_t, int vdr>
|
8433
|
-
static void
|
8434
|
-
|
8435
|
-
|
8607
|
+
static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
|
8608
|
+
const void *__restrict__ vy,
|
8609
|
+
float *__restrict__ dst, const int ncols,
|
8610
|
+
const int nrows,
|
8611
|
+
const sycl::nd_item<3> &item_ct1) {
|
8436
8612
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8437
8613
|
item_ct1.get_local_id(1);
|
8438
8614
|
|
@@ -8460,7 +8636,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
|
|
8460
8636
|
(item_ct1.get_local_id(2) %
|
8461
8637
|
(qi / vdr)); // x block quant index when casting the quants to int
|
8462
8638
|
|
8463
|
-
tmp +=
|
8639
|
+
tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
|
8464
8640
|
}
|
8465
8641
|
|
8466
8642
|
// sum up partial sums and write back result
|
@@ -8476,9 +8652,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void *
|
|
8476
8652
|
}
|
8477
8653
|
|
8478
8654
|
template <int qk, int qi, typename block_q_t, int vdr>
|
8479
|
-
static void
|
8480
|
-
|
8481
|
-
|
8655
|
+
static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
|
8656
|
+
const void *__restrict__ vy,
|
8657
|
+
float *__restrict__ dst, const int ncols,
|
8658
|
+
const int nrows,
|
8659
|
+
const sycl::nd_item<3> &item_ct1) {
|
8482
8660
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8483
8661
|
item_ct1.get_local_id(1);
|
8484
8662
|
|
@@ -8506,7 +8684,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
|
|
8506
8684
|
(item_ct1.get_local_id(2) %
|
8507
8685
|
(qi / vdr)); // x block quant index when casting the quants to int
|
8508
8686
|
|
8509
|
-
tmp +=
|
8687
|
+
tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
|
8510
8688
|
}
|
8511
8689
|
|
8512
8690
|
// sum up partial sums and write back result
|
@@ -8522,9 +8700,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void
|
|
8522
8700
|
}
|
8523
8701
|
|
8524
8702
|
template <int qk, int qi, typename block_q_t, int vdr>
|
8525
|
-
static void
|
8526
|
-
|
8527
|
-
|
8703
|
+
static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
|
8704
|
+
const void *__restrict__ vy,
|
8705
|
+
float *__restrict__ dst, const int ncols,
|
8706
|
+
const int nrows,
|
8707
|
+
const sycl::nd_item<3> &item_ct1) {
|
8528
8708
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8529
8709
|
item_ct1.get_local_id(1);
|
8530
8710
|
|
@@ -8552,7 +8732,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
|
|
8552
8732
|
(item_ct1.get_local_id(2) %
|
8553
8733
|
(qi / vdr)); // x block quant index when casting the quants to int
|
8554
8734
|
|
8555
|
-
tmp +=
|
8735
|
+
tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
|
8556
8736
|
}
|
8557
8737
|
|
8558
8738
|
// sum up partial sums and write back result
|
@@ -8567,10 +8747,13 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
|
|
8567
8747
|
}
|
8568
8748
|
}
|
8569
8749
|
|
8750
|
+
|
8570
8751
|
template <int qk, int qi, typename block_q_t, int vdr>
|
8571
|
-
static void
|
8572
|
-
|
8573
|
-
|
8752
|
+
static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
|
8753
|
+
const void *__restrict__ vy,
|
8754
|
+
float *__restrict__ dst, const int ncols,
|
8755
|
+
const int nrows,
|
8756
|
+
const sycl::nd_item<3> &item_ct1) {
|
8574
8757
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
8575
8758
|
item_ct1.get_local_id(1);
|
8576
8759
|
|
@@ -8598,7 +8781,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
|
|
8598
8781
|
(item_ct1.get_local_id(2) %
|
8599
8782
|
(qi / vdr)); // x block quant index when casting the quants to int
|
8600
8783
|
|
8601
|
-
tmp +=
|
8784
|
+
tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
|
8602
8785
|
}
|
8603
8786
|
|
8604
8787
|
// sum up partial sums and write back result
|
@@ -8613,6 +8796,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
|
|
8613
8796
|
}
|
8614
8797
|
}
|
8615
8798
|
|
8799
|
+
|
8616
8800
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
8617
8801
|
static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
|
8618
8802
|
const sycl::nd_item<3> &item_ct1) {
|
@@ -9174,64 +9358,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
|
|
9174
9358
|
}
|
9175
9359
|
}
|
9176
9360
|
|
9361
|
+
|
9177
9362
|
template<typename T>
|
9178
|
-
static inline void
|
9363
|
+
static inline void ggml_sycl_swap(T & a, T & b) {
|
9179
9364
|
T tmp = a;
|
9180
9365
|
a = b;
|
9181
9366
|
b = tmp;
|
9182
9367
|
}
|
9183
9368
|
|
9184
|
-
template<ggml_sort_order order>
|
9185
|
-
static void
|
9186
|
-
|
9369
|
+
template <ggml_sort_order order>
|
9370
|
+
__dpct_inline__ static void
|
9371
|
+
k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad,
|
9372
|
+
const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) {
|
9187
9373
|
// bitonic sort
|
9188
9374
|
int col = item_ct1.get_local_id(2);
|
9189
9375
|
int row = item_ct1.get_group(1);
|
9190
9376
|
|
9191
|
-
if (col >=
|
9377
|
+
if (col >= ncols_pad) {
|
9378
|
+
return;
|
9379
|
+
}
|
9192
9380
|
|
9193
9381
|
const float * x_row = x + row * ncols;
|
9194
|
-
|
9382
|
+
auto dst_row = (int *)dpct_local;
|
9195
9383
|
|
9196
9384
|
// initialize indices
|
9197
|
-
|
9198
|
-
|
9199
|
-
|
9200
|
-
/*
|
9201
|
-
DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
|
9202
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
|
9203
|
-
performance if there is no access to global memory.
|
9204
|
-
*/
|
9205
|
-
item_ct1.barrier();
|
9385
|
+
dst_row[col] = col;
|
9386
|
+
|
9387
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
9206
9388
|
|
9207
|
-
for (int k = 2; k <=
|
9389
|
+
for (int k = 2; k <= ncols_pad; k *= 2) {
|
9208
9390
|
for (int j = k / 2; j > 0; j /= 2) {
|
9209
9391
|
int ixj = col ^ j;
|
9210
9392
|
if (ixj > col) {
|
9211
9393
|
if ((col & k) == 0) {
|
9212
|
-
if (
|
9213
|
-
|
9394
|
+
if (dst_row[col] >= ncols ||
|
9395
|
+
(dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
|
9396
|
+
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
9397
|
+
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
9398
|
+
) {
|
9399
|
+
ggml_sycl_swap(dst_row[col], dst_row[ixj]);
|
9214
9400
|
}
|
9215
9401
|
} else {
|
9216
|
-
if (
|
9217
|
-
|
9402
|
+
if (dst_row[ixj] >= ncols ||
|
9403
|
+
(dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
|
9404
|
+
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
9405
|
+
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
9406
|
+
) {
|
9407
|
+
ggml_sycl_swap(dst_row[col], dst_row[ixj]);
|
9218
9408
|
}
|
9219
9409
|
}
|
9220
9410
|
}
|
9221
9411
|
/*
|
9222
|
-
DPCT1118:
|
9412
|
+
DPCT1118:1: SYCL group functions and algorithms must be encountered
|
9223
9413
|
in converged control flow. You may need to adjust the code.
|
9224
9414
|
*/
|
9225
|
-
|
9226
|
-
DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
|
9227
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
9228
|
-
better performance if there is no access to global memory.
|
9229
|
-
*/
|
9230
|
-
item_ct1.barrier();
|
9415
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
9231
9416
|
}
|
9232
9417
|
}
|
9418
|
+
|
9419
|
+
// copy the result to dst without the padding
|
9420
|
+
if (col < ncols) {
|
9421
|
+
dst[row * ncols + col] = dst_row[col];
|
9422
|
+
}
|
9233
9423
|
}
|
9234
9424
|
|
9425
|
+
|
9235
9426
|
static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
|
9236
9427
|
const sycl::nd_item<3> &item_ct1) {
|
9237
9428
|
const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
@@ -10210,31 +10401,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
|
10210
10401
|
#endif
|
10211
10402
|
}
|
10212
10403
|
|
10213
|
-
|
10214
10404
|
template <typename dst_t>
|
10215
|
-
static void
|
10405
|
+
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
10216
10406
|
dpct::queue_ptr stream) {
|
10217
10407
|
const int nb = k / QK_K;
|
10218
10408
|
{
|
10219
|
-
|
10220
|
-
|
10221
|
-
|
10409
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10410
|
+
{sycl::aspect::fp16});
|
10411
|
+
|
10412
|
+
stream->submit([&](sycl::handler &cgh) {
|
10413
|
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10414
|
+
sycl::range<3>(1, 1, 32),
|
10415
|
+
sycl::range<3>(1, 1, 32)),
|
10416
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10417
|
+
dequantize_block_iq1_s(
|
10418
|
+
vx, y, item_ct1, iq1s_grid_gpu
|
10419
|
+
);
|
10420
|
+
});
|
10421
|
+
});
|
10422
|
+
}
|
10423
|
+
}
|
10222
10424
|
|
10425
|
+
template <typename dst_t>
|
10426
|
+
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
|
10427
|
+
dpct::queue_ptr stream) {
|
10428
|
+
const int nb = k / QK_K;
|
10429
|
+
{
|
10223
10430
|
dpct::has_capability_or_fail(stream->get_device(),
|
10224
10431
|
{sycl::aspect::fp16});
|
10225
10432
|
|
10226
10433
|
stream->submit([&](sycl::handler &cgh) {
|
10227
|
-
|
10228
|
-
|
10229
|
-
|
10434
|
+
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10435
|
+
sycl::range<3>(1, 1, 32),
|
10436
|
+
sycl::range<3>(1, 1, 32)),
|
10437
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10438
|
+
dequantize_block_iq1_m(
|
10439
|
+
vx, y, item_ct1, iq1s_grid_gpu
|
10440
|
+
);
|
10441
|
+
});
|
10442
|
+
});
|
10443
|
+
}
|
10444
|
+
}
|
10445
|
+
|
10446
|
+
template <typename dst_t>
|
10447
|
+
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
|
10448
|
+
dpct::queue_ptr stream) {
|
10449
|
+
const int nb = k / QK_K;
|
10450
|
+
{
|
10451
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10452
|
+
{sycl::aspect::fp16});
|
10230
10453
|
|
10454
|
+
stream->submit([&](sycl::handler &cgh) {
|
10231
10455
|
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10232
10456
|
sycl::range<3>(1, 1, 32),
|
10233
10457
|
sycl::range<3>(1, 1, 32)),
|
10234
10458
|
[=](sycl::nd_item<3> item_ct1) {
|
10235
10459
|
dequantize_block_iq2_xxs(
|
10236
|
-
vx, y, item_ct1,
|
10237
|
-
|
10460
|
+
vx, y, item_ct1, iq2xxs_grid,
|
10461
|
+
ksigns_iq2xs, kmask_iq2xs);
|
10238
10462
|
});
|
10239
10463
|
});
|
10240
10464
|
}
|
@@ -10245,117 +10469,130 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
|
|
10245
10469
|
dpct::queue_ptr stream) {
|
10246
10470
|
const int nb = k / QK_K;
|
10247
10471
|
{
|
10248
|
-
iq2xs_grid.init(*stream);
|
10249
|
-
ksigns_iq2xs.init(*stream);
|
10250
|
-
kmask_iq2xs.init(*stream);
|
10251
|
-
|
10252
10472
|
dpct::has_capability_or_fail(stream->get_device(),
|
10253
10473
|
{sycl::aspect::fp16});
|
10254
10474
|
|
10255
10475
|
stream->submit([&](sycl::handler &cgh) {
|
10256
|
-
auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
|
10257
|
-
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10258
|
-
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10259
|
-
|
10260
10476
|
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10261
10477
|
sycl::range<3>(1, 1, 32),
|
10262
10478
|
sycl::range<3>(1, 1, 32)),
|
10263
10479
|
[=](sycl::nd_item<3> item_ct1) {
|
10264
10480
|
dequantize_block_iq2_xs(
|
10265
|
-
vx, y, item_ct1,
|
10266
|
-
|
10481
|
+
vx, y, item_ct1, iq2xs_grid,
|
10482
|
+
ksigns_iq2xs, kmask_iq2xs);
|
10267
10483
|
});
|
10268
10484
|
});
|
10269
10485
|
}
|
10270
10486
|
}
|
10271
10487
|
|
10272
10488
|
template <typename dst_t>
|
10273
|
-
static void
|
10274
|
-
|
10489
|
+
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
|
10490
|
+
dpct::queue_ptr stream) {
|
10275
10491
|
const int nb = k / QK_K;
|
10276
10492
|
{
|
10277
|
-
iq3xxs_grid.init(*stream);
|
10278
|
-
ksigns_iq2xs.init(*stream);
|
10279
|
-
kmask_iq2xs.init(*stream);
|
10280
|
-
|
10281
10493
|
dpct::has_capability_or_fail(stream->get_device(),
|
10282
10494
|
{sycl::aspect::fp16});
|
10283
10495
|
|
10284
10496
|
stream->submit([&](sycl::handler &cgh) {
|
10285
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10286
|
-
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10287
|
-
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10288
|
-
|
10289
10497
|
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10290
10498
|
sycl::range<3>(1, 1, 32),
|
10291
10499
|
sycl::range<3>(1, 1, 32)),
|
10292
10500
|
[=](sycl::nd_item<3> item_ct1) {
|
10293
|
-
|
10294
|
-
vx, y, item_ct1, iq3xxs_grid_ptr_ct1,
|
10295
|
-
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
10501
|
+
dequantize_block_iq2_s(vx, y, item_ct1);
|
10296
10502
|
});
|
10297
10503
|
});
|
10298
10504
|
}
|
10299
10505
|
}
|
10300
10506
|
|
10507
|
+
|
10301
10508
|
template <typename dst_t>
|
10302
|
-
static void
|
10509
|
+
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
|
10303
10510
|
dpct::queue_ptr stream) {
|
10304
10511
|
const int nb = k / QK_K;
|
10305
10512
|
{
|
10306
|
-
iq3s_grid.init(*stream);
|
10307
|
-
ksigns_iq2xs.init(*stream);
|
10308
|
-
kmask_iq2xs.init(*stream);
|
10309
|
-
|
10310
10513
|
dpct::has_capability_or_fail(stream->get_device(),
|
10311
10514
|
{sycl::aspect::fp16});
|
10312
10515
|
|
10313
10516
|
stream->submit([&](sycl::handler &cgh) {
|
10314
|
-
auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
|
10315
|
-
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10316
|
-
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10317
|
-
|
10318
10517
|
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10319
10518
|
sycl::range<3>(1, 1, 32),
|
10320
10519
|
sycl::range<3>(1, 1, 32)),
|
10321
10520
|
[=](sycl::nd_item<3> item_ct1) {
|
10322
|
-
|
10323
|
-
vx, y, item_ct1,
|
10324
|
-
|
10521
|
+
dequantize_block_iq3_xxs(
|
10522
|
+
vx, y, item_ct1, iq3xxs_grid,
|
10523
|
+
ksigns_iq2xs, kmask_iq2xs);
|
10325
10524
|
});
|
10326
10525
|
});
|
10327
10526
|
}
|
10328
10527
|
}
|
10329
10528
|
|
10330
10529
|
template <typename dst_t>
|
10331
|
-
static void
|
10530
|
+
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
|
10332
10531
|
dpct::queue_ptr stream) {
|
10333
10532
|
const int nb = k / QK_K;
|
10334
10533
|
{
|
10335
|
-
iq1s_grid_gpu.init(*stream);
|
10336
|
-
ksigns_iq2xs.init(*stream);
|
10337
|
-
kmask_iq2xs.init(*stream);
|
10338
|
-
|
10339
10534
|
dpct::has_capability_or_fail(stream->get_device(),
|
10340
10535
|
{sycl::aspect::fp16});
|
10341
10536
|
|
10342
10537
|
stream->submit([&](sycl::handler &cgh) {
|
10343
|
-
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
10344
|
-
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10345
|
-
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10346
|
-
|
10347
10538
|
cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10348
10539
|
sycl::range<3>(1, 1, 32),
|
10349
10540
|
sycl::range<3>(1, 1, 32)),
|
10350
10541
|
[=](sycl::nd_item<3> item_ct1) {
|
10351
|
-
|
10352
|
-
vx, y, item_ct1,
|
10353
|
-
ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
10542
|
+
dequantize_block_iq3_s(
|
10543
|
+
vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
|
10354
10544
|
});
|
10355
10545
|
});
|
10356
10546
|
}
|
10357
10547
|
}
|
10358
10548
|
|
10549
|
+
template <typename dst_t>
|
10550
|
+
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
10551
|
+
dpct::queue_ptr stream) {
|
10552
|
+
const int nb = (k + QK_K - 1) / QK_K;
|
10553
|
+
#if QK_K == 64
|
10554
|
+
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
10555
|
+
#else
|
10556
|
+
{
|
10557
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10558
|
+
{sycl::aspect::fp16});
|
10559
|
+
|
10560
|
+
stream->submit([&](sycl::handler &cgh) {
|
10561
|
+
cgh.parallel_for(
|
10562
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10563
|
+
sycl::range<3>(1, 1, 32),
|
10564
|
+
sycl::range<3>(1, 1, 32)),
|
10565
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10566
|
+
dequantize_block_iq4_xs(vx, y, item_ct1);
|
10567
|
+
});
|
10568
|
+
});
|
10569
|
+
}
|
10570
|
+
#endif
|
10571
|
+
}
|
10572
|
+
|
10573
|
+
|
10574
|
+
template <typename dst_t>
|
10575
|
+
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
|
10576
|
+
dpct::queue_ptr stream) {
|
10577
|
+
const int nb = (k + QK_K - 1) / QK_K;
|
10578
|
+
{
|
10579
|
+
dpct::has_capability_or_fail(stream->get_device(),
|
10580
|
+
{sycl::aspect::fp16});
|
10581
|
+
|
10582
|
+
stream->submit([&](sycl::handler &cgh) {
|
10583
|
+
cgh.parallel_for(
|
10584
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
10585
|
+
sycl::range<3>(1, 1, 32),
|
10586
|
+
sycl::range<3>(1, 1, 32)),
|
10587
|
+
[=](sycl::nd_item<3> item_ct1) {
|
10588
|
+
dequantize_block_iq4_nl(vx, y, item_ct1);
|
10589
|
+
});
|
10590
|
+
});
|
10591
|
+
}
|
10592
|
+
}
|
10593
|
+
|
10594
|
+
|
10595
|
+
|
10359
10596
|
template <typename src_t, typename dst_t>
|
10360
10597
|
static void convert_unary_sycl(const void *__restrict__ vx,
|
10361
10598
|
dst_t *__restrict__ y, const int k,
|
@@ -10400,16 +10637,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
|
|
10400
10637
|
return dequantize_row_q5_K_sycl;
|
10401
10638
|
case GGML_TYPE_Q6_K:
|
10402
10639
|
return dequantize_row_q6_K_sycl;
|
10640
|
+
case GGML_TYPE_IQ1_S:
|
10641
|
+
return dequantize_row_iq1_s_sycl;
|
10642
|
+
case GGML_TYPE_IQ1_M:
|
10643
|
+
return dequantize_row_iq1_m_sycl;
|
10403
10644
|
case GGML_TYPE_IQ2_XXS:
|
10404
10645
|
return dequantize_row_iq2_xxs_sycl;
|
10405
10646
|
case GGML_TYPE_IQ2_XS:
|
10406
10647
|
return dequantize_row_iq2_xs_sycl;
|
10648
|
+
case GGML_TYPE_IQ2_S:
|
10649
|
+
return dequantize_row_iq2_s_sycl;
|
10407
10650
|
case GGML_TYPE_IQ3_XXS:
|
10408
10651
|
return dequantize_row_iq3_xxs_sycl;
|
10409
10652
|
case GGML_TYPE_IQ3_S:
|
10410
10653
|
return dequantize_row_iq3_s_sycl;
|
10411
|
-
case
|
10412
|
-
return
|
10654
|
+
case GGML_TYPE_IQ4_XS:
|
10655
|
+
return dequantize_row_iq4_xs_sycl;
|
10656
|
+
case GGML_TYPE_IQ4_NL:
|
10657
|
+
return dequantize_row_iq4_nl_sycl;
|
10413
10658
|
case GGML_TYPE_F32:
|
10414
10659
|
return convert_unary_sycl<float>;
|
10415
10660
|
default:
|
@@ -10444,16 +10689,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
|
|
10444
10689
|
return dequantize_row_q5_K_sycl;
|
10445
10690
|
case GGML_TYPE_Q6_K:
|
10446
10691
|
return dequantize_row_q6_K_sycl;
|
10692
|
+
case GGML_TYPE_IQ1_S:
|
10693
|
+
return dequantize_row_iq1_s_sycl;
|
10694
|
+
case GGML_TYPE_IQ1_M:
|
10695
|
+
return dequantize_row_iq1_m_sycl;
|
10447
10696
|
case GGML_TYPE_IQ2_XXS:
|
10448
10697
|
return dequantize_row_iq2_xxs_sycl;
|
10449
10698
|
case GGML_TYPE_IQ2_XS:
|
10450
10699
|
return dequantize_row_iq2_xs_sycl;
|
10700
|
+
case GGML_TYPE_IQ2_S:
|
10701
|
+
return dequantize_row_iq2_s_sycl;
|
10451
10702
|
case GGML_TYPE_IQ3_XXS:
|
10452
10703
|
return dequantize_row_iq3_xxs_sycl;
|
10453
10704
|
case GGML_TYPE_IQ3_S:
|
10454
10705
|
return dequantize_row_iq3_s_sycl;
|
10455
|
-
case
|
10456
|
-
return
|
10706
|
+
case GGML_TYPE_IQ4_XS:
|
10707
|
+
return dequantize_row_iq4_xs_sycl;
|
10708
|
+
case GGML_TYPE_IQ4_NL:
|
10709
|
+
return dequantize_row_iq4_nl_sycl;
|
10457
10710
|
case GGML_TYPE_F16:
|
10458
10711
|
return convert_unary_sycl<sycl::half>;
|
10459
10712
|
default:
|
@@ -10675,12 +10928,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|
10675
10928
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10676
10929
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10677
10930
|
{
|
10678
|
-
iq3xxs_grid.init(*stream);
|
10679
|
-
ksigns64.init(*stream);
|
10680
10931
|
|
10681
10932
|
stream->submit([&](sycl::handler &cgh) {
|
10682
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10683
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10684
10933
|
|
10685
10934
|
cgh.parallel_for(
|
10686
10935
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10688,8 +10937,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|
10688
10937
|
[[intel::reqd_sub_group_size(32)]] {
|
10689
10938
|
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
|
10690
10939
|
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
10691
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10692
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10940
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10693
10941
|
});
|
10694
10942
|
});
|
10695
10943
|
}
|
@@ -10704,12 +10952,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|
10704
10952
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10705
10953
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10706
10954
|
{
|
10707
|
-
iq3xxs_grid.init(*stream);
|
10708
|
-
ksigns64.init(*stream);
|
10709
10955
|
|
10710
10956
|
stream->submit([&](sycl::handler &cgh) {
|
10711
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10712
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10713
10957
|
|
10714
10958
|
cgh.parallel_for(
|
10715
10959
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10717,8 +10961,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|
10717
10961
|
[[intel::reqd_sub_group_size(32)]] {
|
10718
10962
|
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
|
10719
10963
|
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
10720
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10721
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10964
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10722
10965
|
});
|
10723
10966
|
});
|
10724
10967
|
}
|
@@ -10733,12 +10976,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|
10733
10976
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10734
10977
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10735
10978
|
{
|
10736
|
-
iq3xxs_grid.init(*stream);
|
10737
|
-
ksigns64.init(*stream);
|
10738
10979
|
|
10739
10980
|
stream->submit([&](sycl::handler &cgh) {
|
10740
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10741
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10742
10981
|
|
10743
10982
|
cgh.parallel_for(
|
10744
10983
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10746,8 +10985,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|
10746
10985
|
[[intel::reqd_sub_group_size(32)]] {
|
10747
10986
|
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
|
10748
10987
|
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
10749
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10750
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
10988
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10751
10989
|
});
|
10752
10990
|
});
|
10753
10991
|
}
|
@@ -10762,12 +11000,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|
10762
11000
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10763
11001
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10764
11002
|
{
|
10765
|
-
iq3xxs_grid.init(*stream);
|
10766
|
-
ksigns64.init(*stream);
|
10767
11003
|
|
10768
11004
|
stream->submit([&](sycl::handler &cgh) {
|
10769
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10770
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10771
11005
|
|
10772
11006
|
cgh.parallel_for(
|
10773
11007
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10775,8 +11009,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|
10775
11009
|
[[intel::reqd_sub_group_size(32)]] {
|
10776
11010
|
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
|
10777
11011
|
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
10778
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10779
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11012
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10780
11013
|
});
|
10781
11014
|
});
|
10782
11015
|
}
|
@@ -10791,12 +11024,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|
10791
11024
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10792
11025
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10793
11026
|
{
|
10794
|
-
iq3xxs_grid.init(*stream);
|
10795
|
-
ksigns64.init(*stream);
|
10796
11027
|
|
10797
11028
|
stream->submit([&](sycl::handler &cgh) {
|
10798
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10799
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10800
11029
|
|
10801
11030
|
cgh.parallel_for(
|
10802
11031
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10804,8 +11033,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|
10804
11033
|
[[intel::reqd_sub_group_size(32)]] {
|
10805
11034
|
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
|
10806
11035
|
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
10807
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10808
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11036
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10809
11037
|
});
|
10810
11038
|
});
|
10811
11039
|
}
|
@@ -10820,12 +11048,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10820
11048
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10821
11049
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10822
11050
|
{
|
10823
|
-
iq3xxs_grid.init(*stream);
|
10824
|
-
ksigns64.init(*stream);
|
10825
11051
|
|
10826
11052
|
stream->submit([&](sycl::handler &cgh) {
|
10827
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10828
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10829
11053
|
|
10830
11054
|
cgh.parallel_for(
|
10831
11055
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10833,8 +11057,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10833
11057
|
[[intel::reqd_sub_group_size(32)]] {
|
10834
11058
|
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
10835
11059
|
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
10836
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10837
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11060
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10838
11061
|
});
|
10839
11062
|
});
|
10840
11063
|
}
|
@@ -10849,12 +11072,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10849
11072
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10850
11073
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10851
11074
|
{
|
10852
|
-
iq3xxs_grid.init(*stream);
|
10853
|
-
ksigns64.init(*stream);
|
10854
11075
|
|
10855
11076
|
stream->submit([&](sycl::handler &cgh) {
|
10856
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10857
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10858
11077
|
|
10859
11078
|
cgh.parallel_for(
|
10860
11079
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10862,8 +11081,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10862
11081
|
[[intel::reqd_sub_group_size(32)]] {
|
10863
11082
|
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
10864
11083
|
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
10865
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10866
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11084
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10867
11085
|
});
|
10868
11086
|
});
|
10869
11087
|
}
|
@@ -10878,12 +11096,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10878
11096
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10879
11097
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10880
11098
|
{
|
10881
|
-
iq3xxs_grid.init(*stream);
|
10882
|
-
ksigns64.init(*stream);
|
10883
11099
|
|
10884
11100
|
stream->submit([&](sycl::handler &cgh) {
|
10885
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10886
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10887
11101
|
|
10888
11102
|
cgh.parallel_for(
|
10889
11103
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10891,8 +11105,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10891
11105
|
[[intel::reqd_sub_group_size(32)]] {
|
10892
11106
|
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
10893
11107
|
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
10894
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10895
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11108
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10896
11109
|
});
|
10897
11110
|
});
|
10898
11111
|
}
|
@@ -10907,12 +11120,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10907
11120
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10908
11121
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10909
11122
|
{
|
10910
|
-
iq3xxs_grid.init(*stream);
|
10911
|
-
ksigns64.init(*stream);
|
10912
11123
|
|
10913
11124
|
stream->submit([&](sycl::handler &cgh) {
|
10914
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10915
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10916
11125
|
|
10917
11126
|
cgh.parallel_for(
|
10918
11127
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10920,8 +11129,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10920
11129
|
[[intel::reqd_sub_group_size(32)]] {
|
10921
11130
|
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
10922
11131
|
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
10923
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10924
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11132
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10925
11133
|
});
|
10926
11134
|
});
|
10927
11135
|
}
|
@@ -10936,12 +11144,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10936
11144
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10937
11145
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10938
11146
|
{
|
10939
|
-
iq3xxs_grid.init(*stream);
|
10940
|
-
ksigns64.init(*stream);
|
10941
11147
|
|
10942
11148
|
stream->submit([&](sycl::handler &cgh) {
|
10943
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
|
10944
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
10945
11149
|
|
10946
11150
|
cgh.parallel_for(
|
10947
11151
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
@@ -10949,13 +11153,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
10949
11153
|
[[intel::reqd_sub_group_size(32)]] {
|
10950
11154
|
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
10951
11155
|
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
10952
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10953
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11156
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10954
11157
|
});
|
10955
11158
|
});
|
10956
11159
|
}
|
10957
11160
|
}
|
10958
11161
|
|
11162
|
+
|
10959
11163
|
static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
10960
11164
|
float *dst, const int ncols,
|
10961
11165
|
const int nrows,
|
@@ -10965,23 +11169,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
10965
11169
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10966
11170
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10967
11171
|
{
|
10968
|
-
iq2xxs_grid.init(*stream);
|
10969
|
-
ksigns_iq2xs.init(*stream);
|
10970
|
-
kmask_iq2xs.init(*stream);
|
10971
|
-
|
10972
|
-
|
10973
11172
|
stream->submit([&](sycl::handler &cgh) {
|
10974
|
-
auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
|
10975
|
-
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
10976
|
-
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
10977
|
-
|
10978
11173
|
cgh.parallel_for(
|
10979
11174
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
10980
11175
|
[=](sycl::nd_item<3> item_ct1)
|
10981
11176
|
[[intel::reqd_sub_group_size(32)]] {
|
10982
11177
|
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
|
10983
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
10984
|
-
iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
|
11178
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
10985
11179
|
});
|
10986
11180
|
});
|
10987
11181
|
}
|
@@ -10996,20 +11190,42 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
|
10996
11190
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
10997
11191
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
10998
11192
|
{
|
10999
|
-
iq2xs_grid.init(*stream);
|
11000
|
-
ksigns64.init(*stream);
|
11001
11193
|
|
11002
11194
|
stream->submit([&](sycl::handler &cgh) {
|
11003
|
-
auto iq2xs_grid_ptr_ct1 = iq2xs_grid
|
11004
|
-
auto ksigns64_ptr_ct1 = ksigns64
|
11195
|
+
auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
|
11196
|
+
auto ksigns64_ptr_ct1 = &ksigns64[0];
|
11005
11197
|
|
11006
11198
|
cgh.parallel_for(
|
11007
11199
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11008
11200
|
[=](sycl::nd_item<3> item_ct1)
|
11009
11201
|
[[intel::reqd_sub_group_size(32)]] {
|
11010
11202
|
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
|
11011
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
11012
|
-
|
11203
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11204
|
+
});
|
11205
|
+
});
|
11206
|
+
}
|
11207
|
+
}
|
11208
|
+
|
11209
|
+
static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
11210
|
+
float *dst, const int ncols,
|
11211
|
+
const int nrows,
|
11212
|
+
dpct::queue_ptr stream) {
|
11213
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
11214
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
11215
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11216
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11217
|
+
{
|
11218
|
+
|
11219
|
+
stream->submit([&](sycl::handler &cgh) {
|
11220
|
+
auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
|
11221
|
+
auto ksigns64_ptr_ct1 = &ksigns64[0];
|
11222
|
+
|
11223
|
+
cgh.parallel_for(
|
11224
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11225
|
+
[=](sycl::nd_item<3> item_ct1)
|
11226
|
+
[[intel::reqd_sub_group_size(32)]] {
|
11227
|
+
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
|
11228
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11013
11229
|
});
|
11014
11230
|
});
|
11015
11231
|
}
|
@@ -11024,20 +11240,17 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
11024
11240
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11025
11241
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11026
11242
|
{
|
11027
|
-
iq3xxs_grid.init(*stream);
|
11028
|
-
ksigns64.init(*stream);
|
11029
11243
|
|
11030
11244
|
stream->submit([&](sycl::handler &cgh) {
|
11031
|
-
auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid
|
11032
|
-
auto ksigns64_ptr_ct1 = ksigns64
|
11245
|
+
auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
|
11246
|
+
auto ksigns64_ptr_ct1 = &ksigns64[0];
|
11033
11247
|
|
11034
11248
|
cgh.parallel_for(
|
11035
11249
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11036
11250
|
[=](sycl::nd_item<3> item_ct1)
|
11037
11251
|
[[intel::reqd_sub_group_size(32)]] {
|
11038
11252
|
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
|
11039
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
11040
|
-
iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11253
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11041
11254
|
});
|
11042
11255
|
});
|
11043
11256
|
}
|
@@ -11052,20 +11265,16 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
|
11052
11265
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11053
11266
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11054
11267
|
{
|
11055
|
-
iq3s_grid.init(*stream);
|
11056
|
-
ksigns64.init(*stream);
|
11057
11268
|
|
11058
11269
|
stream->submit([&](sycl::handler &cgh) {
|
11059
|
-
auto iq3s_grid_ptr_ct1 = iq3s_grid
|
11060
|
-
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
11270
|
+
auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
|
11061
11271
|
|
11062
11272
|
cgh.parallel_for(
|
11063
11273
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11064
11274
|
[=](sycl::nd_item<3> item_ct1)
|
11065
11275
|
[[intel::reqd_sub_group_size(32)]] {
|
11066
11276
|
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
11067
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
11068
|
-
iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
|
11277
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11069
11278
|
});
|
11070
11279
|
});
|
11071
11280
|
}
|
@@ -11080,20 +11289,82 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
|
11080
11289
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11081
11290
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11082
11291
|
{
|
11083
|
-
iq1s_grid_gpu.init(*stream);
|
11084
|
-
ksigns64.init(*stream);
|
11085
11292
|
|
11086
11293
|
stream->submit([&](sycl::handler &cgh) {
|
11087
|
-
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu
|
11088
|
-
auto ksigns64_ptr_ct1 = ksigns64
|
11294
|
+
auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
|
11295
|
+
auto ksigns64_ptr_ct1 = &ksigns64[0];
|
11089
11296
|
|
11090
11297
|
cgh.parallel_for(
|
11091
11298
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11092
11299
|
[=](sycl::nd_item<3> item_ct1)
|
11093
11300
|
[[intel::reqd_sub_group_size(32)]] {
|
11094
11301
|
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
11095
|
-
vx, vy, dst, ncols, nrows, item_ct1
|
11096
|
-
|
11302
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11303
|
+
});
|
11304
|
+
});
|
11305
|
+
}
|
11306
|
+
}
|
11307
|
+
|
11308
|
+
static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
11309
|
+
float *dst, const int ncols,
|
11310
|
+
const int nrows,
|
11311
|
+
dpct::queue_ptr stream) {
|
11312
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
11313
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
11314
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11315
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11316
|
+
{
|
11317
|
+
stream->submit([&](sycl::handler &cgh) {
|
11318
|
+
cgh.parallel_for(
|
11319
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11320
|
+
[=](sycl::nd_item<3> item_ct1)
|
11321
|
+
[[intel::reqd_sub_group_size(32)]] {
|
11322
|
+
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
11323
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11324
|
+
});
|
11325
|
+
});
|
11326
|
+
}
|
11327
|
+
}
|
11328
|
+
|
11329
|
+
static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
11330
|
+
float *dst, const int ncols,
|
11331
|
+
const int nrows,
|
11332
|
+
dpct::queue_ptr stream) {
|
11333
|
+
GGML_ASSERT(ncols % QK4_NL == 0);
|
11334
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
11335
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11336
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11337
|
+
{
|
11338
|
+
|
11339
|
+
stream->submit([&](sycl::handler &cgh) {
|
11340
|
+
cgh.parallel_for(
|
11341
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11342
|
+
[=](sycl::nd_item<3> item_ct1)
|
11343
|
+
[[intel::reqd_sub_group_size(32)]] {
|
11344
|
+
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
|
11345
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11346
|
+
});
|
11347
|
+
});
|
11348
|
+
}
|
11349
|
+
}
|
11350
|
+
|
11351
|
+
static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
11352
|
+
float *dst, const int ncols,
|
11353
|
+
const int nrows,
|
11354
|
+
dpct::queue_ptr stream) {
|
11355
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
11356
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
11357
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
11358
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
11359
|
+
{
|
11360
|
+
|
11361
|
+
stream->submit([&](sycl::handler &cgh) {
|
11362
|
+
cgh.parallel_for(
|
11363
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
11364
|
+
[=](sycl::nd_item<3> item_ct1)
|
11365
|
+
[[intel::reqd_sub_group_size(32)]] {
|
11366
|
+
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
|
11367
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
11097
11368
|
});
|
11098
11369
|
});
|
11099
11370
|
}
|
@@ -12717,36 +12988,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
12717
12988
|
});
|
12718
12989
|
}
|
12719
12990
|
|
12991
|
+
static int next_power_of_2(int x) {
|
12992
|
+
int n = 1;
|
12993
|
+
while (n < x) {
|
12994
|
+
n *= 2;
|
12995
|
+
}
|
12996
|
+
return n;
|
12997
|
+
}
|
12998
|
+
|
12720
12999
|
static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
12721
13000
|
const int nrows, ggml_sort_order order,
|
12722
13001
|
dpct::queue_ptr stream) {
|
12723
13002
|
// bitonic sort requires ncols to be power of 2
|
12724
|
-
|
13003
|
+
const int ncols_pad = next_power_of_2(ncols);
|
12725
13004
|
|
12726
|
-
const sycl::range<3> block_dims(1, 1,
|
13005
|
+
const sycl::range<3> block_dims(1, 1, ncols_pad);
|
12727
13006
|
const sycl::range<3> block_nums(1, nrows, 1);
|
13007
|
+
const size_t shared_mem = ncols_pad * sizeof(int);
|
13008
|
+
|
13009
|
+
// GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
|
13010
|
+
|
12728
13011
|
if (order == GGML_SORT_ORDER_ASC) {
|
12729
|
-
|
12730
|
-
|
12731
|
-
|
12732
|
-
|
12733
|
-
|
12734
|
-
|
12735
|
-
|
12736
|
-
|
12737
|
-
|
12738
|
-
|
13012
|
+
stream->submit([&](sycl::handler &cgh) {
|
13013
|
+
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
13014
|
+
sycl::range<1>(shared_mem), cgh);
|
13015
|
+
|
13016
|
+
cgh.parallel_for(
|
13017
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
13018
|
+
[=](sycl::nd_item<3> item_ct1) {
|
13019
|
+
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
|
13020
|
+
x, dst, ncols, ncols_pad, item_ct1,
|
13021
|
+
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
13022
|
+
.get());
|
13023
|
+
});
|
13024
|
+
});
|
12739
13025
|
} else if (order == GGML_SORT_ORDER_DESC) {
|
12740
|
-
|
12741
|
-
|
12742
|
-
|
12743
|
-
|
12744
|
-
|
12745
|
-
|
12746
|
-
|
12747
|
-
|
12748
|
-
|
12749
|
-
|
13026
|
+
stream->submit([&](sycl::handler &cgh) {
|
13027
|
+
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
13028
|
+
sycl::range<1>(shared_mem), cgh);
|
13029
|
+
|
13030
|
+
cgh.parallel_for(
|
13031
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
13032
|
+
[=](sycl::nd_item<3> item_ct1) {
|
13033
|
+
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
|
13034
|
+
x, dst, ncols, ncols_pad, item_ct1,
|
13035
|
+
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
13036
|
+
.get());
|
13037
|
+
});
|
13038
|
+
});
|
12750
13039
|
} else {
|
12751
13040
|
GGML_ASSERT(false);
|
12752
13041
|
}
|
@@ -13128,6 +13417,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
|
13128
13417
|
}
|
13129
13418
|
|
13130
13419
|
void ggml_backend_sycl_print_sycl_devices() {
|
13420
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
|
13131
13421
|
int device_count = dpct::dev_mgr::instance().device_count();
|
13132
13422
|
std::map<std::string, size_t> DeviceNums;
|
13133
13423
|
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
@@ -13181,11 +13471,13 @@ int get_work_group_size(int user_device_id) {
|
|
13181
13471
|
return prop.get_max_work_group_size();
|
13182
13472
|
}
|
13183
13473
|
|
13184
|
-
void ggml_init_sycl() try {
|
13474
|
+
static void ggml_init_sycl() try {
|
13185
13475
|
static bool initialized = false;
|
13186
13476
|
|
13187
13477
|
if (!initialized) {
|
13478
|
+
fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
|
13188
13479
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
13480
|
+
|
13189
13481
|
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
|
13190
13482
|
|
13191
13483
|
#if defined(GGML_SYCL_F16)
|
@@ -13871,8 +14163,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
|
13871
14163
|
case GGML_TYPE_Q5_K:
|
13872
14164
|
case GGML_TYPE_IQ2_XXS:
|
13873
14165
|
case GGML_TYPE_IQ2_XS:
|
14166
|
+
case GGML_TYPE_IQ2_S:
|
13874
14167
|
case GGML_TYPE_IQ1_S:
|
14168
|
+
case GGML_TYPE_IQ1_M:
|
13875
14169
|
case GGML_TYPE_IQ3_XXS:
|
14170
|
+
case GGML_TYPE_IQ4_XS:
|
14171
|
+
case GGML_TYPE_IQ4_NL:
|
13876
14172
|
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
13877
14173
|
case GGML_TYPE_IQ3_S:
|
13878
14174
|
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
@@ -13891,11 +14187,20 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
13891
14187
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
13892
14188
|
const dpct::queue_ptr &stream) {
|
13893
14189
|
|
13894
|
-
|
14190
|
+
const int64_t ne10 = src1->ne[0];
|
14191
|
+
GGML_ASSERT(ne10 % QK8_1 == 0);
|
13895
14192
|
|
13896
14193
|
const int64_t ne00 = src0->ne[0];
|
13897
14194
|
const int64_t row_diff = row_high - row_low;
|
13898
14195
|
|
14196
|
+
int id;
|
14197
|
+
SYCL_CHECK(
|
14198
|
+
CHECK_TRY_ERROR(id = get_current_device_id()));
|
14199
|
+
|
14200
|
+
// the main device has a larger memory buffer to hold the results from all GPUs
|
14201
|
+
// nrows_dst == nrows of the matrix that the kernel writes into
|
14202
|
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne00 : row_diff;
|
14203
|
+
|
13899
14204
|
switch (src0->type) {
|
13900
14205
|
case GGML_TYPE_Q4_0:
|
13901
14206
|
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
@@ -13927,20 +14232,32 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
13927
14232
|
case GGML_TYPE_Q6_K:
|
13928
14233
|
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13929
14234
|
break;
|
14235
|
+
case GGML_TYPE_IQ1_S:
|
14236
|
+
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
14237
|
+
break;
|
14238
|
+
case GGML_TYPE_IQ1_M:
|
14239
|
+
mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
14240
|
+
break;
|
13930
14241
|
case GGML_TYPE_IQ2_XXS:
|
13931
14242
|
mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13932
14243
|
break;
|
13933
14244
|
case GGML_TYPE_IQ2_XS:
|
13934
14245
|
mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13935
14246
|
break;
|
14247
|
+
case GGML_TYPE_IQ2_S:
|
14248
|
+
mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
14249
|
+
break;
|
13936
14250
|
case GGML_TYPE_IQ3_XXS:
|
13937
14251
|
mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13938
14252
|
break;
|
13939
14253
|
case GGML_TYPE_IQ3_S:
|
13940
14254
|
mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13941
14255
|
break;
|
13942
|
-
case
|
13943
|
-
|
14256
|
+
case GGML_TYPE_IQ4_NL:
|
14257
|
+
mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
14258
|
+
break;
|
14259
|
+
case GGML_TYPE_IQ4_XS:
|
14260
|
+
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
13944
14261
|
break;
|
13945
14262
|
default:
|
13946
14263
|
GGML_ASSERT(false);
|
@@ -14022,6 +14339,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
14022
14339
|
convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
14023
14340
|
break;
|
14024
14341
|
default:
|
14342
|
+
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
|
14025
14343
|
GGML_ASSERT(false);
|
14026
14344
|
break;
|
14027
14345
|
}
|
@@ -14876,8 +15194,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
14876
15194
|
src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
|
14877
15195
|
}
|
14878
15196
|
// do the computation
|
14879
|
-
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
14880
|
-
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream);
|
15197
|
+
SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
15198
|
+
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
|
14881
15199
|
/*
|
14882
15200
|
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
14883
15201
|
use the error codes. The call was replaced with 0. You need to
|
@@ -15246,6 +15564,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15246
15564
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
15247
15565
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
|
15248
15566
|
|
15567
|
+
bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
|
15568
|
+
main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
|
15569
|
+
|
15249
15570
|
SYCL_CHECK(
|
15250
15571
|
CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
|
15251
15572
|
|
@@ -15276,24 +15597,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15276
15597
|
|
15277
15598
|
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
|
15278
15599
|
dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
|
15600
|
+
if (no_mixed_dtypes) {
|
15601
|
+
cu_compute_type = dpct::library_data_t::real_half;
|
15602
|
+
cu_data_type = dpct::library_data_t::real_half;
|
15603
|
+
}
|
15279
15604
|
|
15280
15605
|
// dst strides
|
15281
15606
|
size_t nbd2 = dst->nb[2];
|
15282
15607
|
size_t nbd3 = dst->nb[3];
|
15283
15608
|
|
15609
|
+
const float alpha_f32 = 1.0f;
|
15610
|
+
const float beta_f32 = 0.0f;
|
15611
|
+
|
15284
15612
|
const sycl::half alpha_f16 = 1.0f;
|
15285
15613
|
const sycl::half beta_f16 = 0.0f;
|
15286
15614
|
|
15287
|
-
const float alpha_f32 = 1.0f;
|
15288
|
-
const float beta_f32 = 0.0f;
|
15289
|
-
|
15290
15615
|
const void * alpha = &alpha_f32;
|
15291
15616
|
const void * beta = &beta_f32;
|
15617
|
+
if (no_mixed_dtypes) {
|
15618
|
+
alpha = &alpha_f16;
|
15619
|
+
beta = &beta_f16;
|
15620
|
+
}
|
15292
15621
|
|
15293
15622
|
// TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
|
15294
|
-
// oneMKL open source supports half, half, float, float: datatypes
|
15623
|
+
// when oneMKL open source supports half, half, float, float: datatypes
|
15295
15624
|
|
15296
15625
|
dst_t = (char *) dst_ddf;
|
15626
|
+
if (no_mixed_dtypes) {
|
15627
|
+
dst_t = (char *) dst_f16.alloc(ne_dst);
|
15628
|
+
|
15629
|
+
nbd2 /= sizeof(float) / sizeof(sycl::half);
|
15630
|
+
nbd3 /= sizeof(float) / sizeof(sycl::half);
|
15631
|
+
}
|
15297
15632
|
|
15298
15633
|
GGML_ASSERT(ne12 % ne02 == 0);
|
15299
15634
|
GGML_ASSERT(ne13 % ne03 == 0);
|
@@ -15379,6 +15714,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
15379
15714
|
}
|
15380
15715
|
#endif
|
15381
15716
|
|
15717
|
+
if (no_mixed_dtypes) {
|
15718
|
+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
15719
|
+
to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
15720
|
+
}
|
15382
15721
|
}
|
15383
15722
|
catch (sycl::exception const &exc) {
|
15384
15723
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
@@ -15437,11 +15776,17 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
15437
15776
|
#ifdef GGML_SYCL_FORCE_DMMV
|
15438
15777
|
const bool use_mul_mat_vec_q = false;
|
15439
15778
|
#else
|
15440
|
-
|
15779
|
+
bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
|
15780
|
+
use_mul_mat_vec_q = use_mul_mat_vec_q ||
|
15781
|
+
(src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
|
15782
|
+
(src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
|
15783
|
+
(src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
|
15784
|
+
(src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
|
15785
|
+
|
15786
|
+
|
15441
15787
|
#endif // GGML_SYCL_FORCE_DMMV
|
15442
15788
|
|
15443
15789
|
if (use_mul_mat_vec_q) {
|
15444
|
-
// NOTE: this kernel does not support ggml_nrows(src1) > 1
|
15445
15790
|
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
|
15446
15791
|
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
|
15447
15792
|
} else {
|
@@ -16278,6 +16623,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
16278
16623
|
}
|
16279
16624
|
|
16280
16625
|
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
16626
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
|
16281
16627
|
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
16282
16628
|
|
16283
16629
|
if (!g_sycl_gpu_mgr) {
|
@@ -16312,6 +16658,7 @@ catch (sycl::exception const &exc) {
|
|
16312
16658
|
|
16313
16659
|
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
16314
16660
|
size_t description_size) try {
|
16661
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
|
16315
16662
|
dpct::device_info prop;
|
16316
16663
|
int device_id = g_sycl_gpu_mgr->gpus[device];
|
16317
16664
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
@@ -16326,6 +16673,7 @@ catch (sycl::exception const &exc) {
|
|
16326
16673
|
|
16327
16674
|
GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
16328
16675
|
size_t *total) try {
|
16676
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
16329
16677
|
ggml_sycl_set_device(device);
|
16330
16678
|
|
16331
16679
|
/*
|
@@ -16677,6 +17025,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
|
16677
17025
|
};
|
16678
17026
|
|
16679
17027
|
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
|
17028
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
17029
|
+
|
16680
17030
|
if (device_index>=g_device_count or device_index<0) {
|
16681
17031
|
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
16682
17032
|
device_index, g_device_count-1);
|
@@ -17046,6 +17396,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
|
|
17046
17396
|
};
|
17047
17397
|
|
17048
17398
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
|
17399
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
|
17400
|
+
ggml_init_sycl();
|
17049
17401
|
// FIXME: this is not thread safe
|
17050
17402
|
static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
17051
17403
|
|
@@ -17117,6 +17469,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
|
|
17117
17469
|
}
|
17118
17470
|
|
17119
17471
|
ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
17472
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
|
17120
17473
|
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
|
17121
17474
|
/* .iface = */ {
|
17122
17475
|
/* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
|
@@ -17231,7 +17584,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
|
|
17231
17584
|
params.ith = 0;
|
17232
17585
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
17233
17586
|
ggml_tensor * node = cgraph->nodes[i];
|
17234
|
-
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
17587
|
+
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
17235
17588
|
continue;
|
17236
17589
|
}
|
17237
17590
|
#ifndef NDEBUG
|
@@ -17289,9 +17642,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
17289
17642
|
return false;
|
17290
17643
|
}
|
17291
17644
|
ggml_type a_type = a->type;
|
17292
|
-
if (a_type == GGML_TYPE_IQ4_NL
|
17293
|
-
a_type ==
|
17294
|
-
|
17645
|
+
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
|
17646
|
+
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
|
17647
|
+
a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
|
17648
|
+
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
|
17649
|
+
) {
|
17650
|
+
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
17651
|
+
return false;
|
17652
|
+
}
|
17295
17653
|
}
|
17296
17654
|
return true;
|
17297
17655
|
} break;
|
@@ -17379,6 +17737,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|
17379
17737
|
UNUSED(backend);
|
17380
17738
|
}
|
17381
17739
|
|
17740
|
+
GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
17741
|
+
const int min_batch_size = 32;
|
17742
|
+
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
|
17743
|
+
GGML_UNUSED(backend);
|
17744
|
+
}
|
17745
|
+
|
17746
|
+
|
17382
17747
|
static ggml_backend_i ggml_backend_sycl_interface = {
|
17383
17748
|
/* .get_name = */ ggml_backend_sycl_name,
|
17384
17749
|
/* .free = */ ggml_backend_sycl_free,
|
@@ -17392,7 +17757,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
17392
17757
|
/* .graph_plan_compute = */ NULL,
|
17393
17758
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
17394
17759
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
17395
|
-
/* .offload_op = */
|
17760
|
+
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
17396
17761
|
/* .event_new = */ NULL,
|
17397
17762
|
/* .event_free = */ NULL,
|
17398
17763
|
/* .event_record = */ NULL,
|
@@ -17406,7 +17771,8 @@ static ggml_guid_t ggml_backend_sycl_guid() {
|
|
17406
17771
|
}
|
17407
17772
|
|
17408
17773
|
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
17409
|
-
|
17774
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
|
17775
|
+
ggml_init_sycl();
|
17410
17776
|
|
17411
17777
|
check_allow_gpu_index(device);
|
17412
17778
|
|
@@ -17432,6 +17798,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
|
17432
17798
|
}
|
17433
17799
|
|
17434
17800
|
GGML_CALL int ggml_backend_sycl_get_device_count() {
|
17801
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
17435
17802
|
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
17436
17803
|
return g_sycl_gpu_mgr->get_gpu_count();
|
17437
17804
|
}
|
@@ -17444,16 +17811,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
|
|
17444
17811
|
}
|
17445
17812
|
|
17446
17813
|
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
17814
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
|
17447
17815
|
return g_sycl_gpu_mgr->get_index(device_id);
|
17448
17816
|
}
|
17449
17817
|
|
17450
17818
|
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
|
17819
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
|
17451
17820
|
return g_sycl_gpu_mgr->gpus[device_index];
|
17452
17821
|
}
|
17453
17822
|
|
17454
17823
|
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
|
17455
|
-
|
17824
|
+
ggml_init_sycl();
|
17825
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
|
17456
17826
|
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
|
17827
|
+
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
17828
|
+
|
17457
17829
|
if (g_sycl_gpu_mgr) {
|
17458
17830
|
delete g_sycl_gpu_mgr;
|
17459
17831
|
}
|
@@ -17464,6 +17836,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
|
|
17464
17836
|
}
|
17465
17837
|
|
17466
17838
|
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
|
17839
|
+
ggml_init_sycl();
|
17840
|
+
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
|
17841
|
+
|
17467
17842
|
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
|
17468
17843
|
return;
|
17469
17844
|
}
|