llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
| 
         @@ -740,11 +740,7 @@ namespace dpct 
     | 
|
| 
       740 
740 
     | 
    
         | 
| 
       741 
741 
     | 
    
         
             
                    sycl::queue &default_queue()
         
     | 
| 
       742 
742 
     | 
    
         
             
                    {
         
     | 
| 
       743 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       744 
     | 
    
         
            -
                        return out_of_order_queue();
         
     | 
| 
       745 
     | 
    
         
            -
            #else
         
     | 
| 
       746 
743 
     | 
    
         
             
                        return in_order_queue();
         
     | 
| 
       747 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       748 
744 
     | 
    
         
             
                    }
         
     | 
| 
       749 
745 
     | 
    
         | 
| 
       750 
746 
     | 
    
         
             
                    void queues_wait_and_throw()
         
     | 
| 
         @@ -763,11 +759,7 @@ namespace dpct 
     | 
|
| 
       763 
759 
     | 
    
         | 
| 
       764 
760 
     | 
    
         
             
                    sycl::queue *create_queue(bool enable_exception_handler = false)
         
     | 
| 
       765 
761 
     | 
    
         
             
                    {
         
     | 
| 
       766 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       767 
     | 
    
         
            -
                        return create_out_of_order_queue(enable_exception_handler);
         
     | 
| 
       768 
     | 
    
         
            -
            #else
         
     | 
| 
       769 
762 
     | 
    
         
             
                        return create_in_order_queue(enable_exception_handler);
         
     | 
| 
       770 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       771 
763 
     | 
    
         
             
                    }
         
     | 
| 
       772 
764 
     | 
    
         | 
| 
       773 
765 
     | 
    
         
             
                    sycl::queue *create_queue(sycl::context context, sycl::device device,
         
     | 
| 
         @@ -1075,11 +1067,6 @@ namespace dpct 
     | 
|
| 
       1075 
1067 
     | 
    
         
             
                    static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
         
     | 
| 
       1076 
1068 
     | 
    
         
             
                                                                          const void *ptr)
         
     | 
| 
       1077 
1069 
     | 
    
         
             
                    {
         
     | 
| 
       1078 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       1079 
     | 
    
         
            -
                        return mem_mgr::instance().is_device_ptr(ptr)
         
     | 
| 
       1080 
     | 
    
         
            -
                                   ? pointer_access_attribute::device_only
         
     | 
| 
       1081 
     | 
    
         
            -
                                   : pointer_access_attribute::host_only;
         
     | 
| 
       1082 
     | 
    
         
            -
            #else
         
     | 
| 
       1083 
1070 
     | 
    
         
             
                        switch (sycl::get_pointer_type(ptr, q.get_context()))
         
     | 
| 
       1084 
1071 
     | 
    
         
             
                        {
         
     | 
| 
       1085 
1072 
     | 
    
         
             
                        case sycl::usm::alloc::unknown:
         
     | 
| 
         @@ -1090,7 +1077,6 @@ namespace dpct 
     | 
|
| 
       1090 
1077 
     | 
    
         
             
                        case sycl::usm::alloc::host:
         
     | 
| 
       1091 
1078 
     | 
    
         
             
                            return pointer_access_attribute::host_device;
         
     | 
| 
       1092 
1079 
     | 
    
         
             
                        }
         
     | 
| 
       1093 
     | 
    
         
            -
            #endif
         
     | 
| 
       1094 
1080 
     | 
    
         
             
                    }
         
     | 
| 
       1095 
1081 
     | 
    
         | 
| 
       1096 
1082 
     | 
    
         
             
                    template <typename ArgT>
         
     | 
| 
         @@ -1273,11 +1259,7 @@ namespace dpct 
     | 
|
| 
       1273 
1259 
     | 
    
         | 
| 
       1274 
1260 
     | 
    
         
             
                    static inline void *dpct_malloc(size_t size, sycl::queue &q)
         
     | 
| 
       1275 
1261 
     | 
    
         
             
                    {
         
     | 
| 
       1276 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       1277 
     | 
    
         
            -
                        return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
         
     | 
| 
       1278 
     | 
    
         
            -
            #else
         
     | 
| 
       1279 
1262 
     | 
    
         
             
                        return sycl::malloc_device(size, q.get_device(), q.get_context());
         
     | 
| 
       1280 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       1281 
1263 
     | 
    
         
             
                    }
         
     | 
| 
       1282 
1264 
     | 
    
         | 
| 
       1283 
1265 
     | 
    
         
             
            #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
         
     | 
| 
         @@ -1301,25 +1283,7 @@ namespace dpct 
     | 
|
| 
       1301 
1283 
     | 
    
         
             
                    static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
         
     | 
| 
       1302 
1284 
     | 
    
         
             
                                                          valueT value, size_t size)
         
     | 
| 
       1303 
1285 
     | 
    
         
             
                    {
         
     | 
| 
       1304 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       1305 
     | 
    
         
            -
                        auto &mm = mem_mgr::instance();
         
     | 
| 
       1306 
     | 
    
         
            -
                        assert(mm.is_device_ptr(dev_ptr));
         
     | 
| 
       1307 
     | 
    
         
            -
                        auto alloc = mm.translate_ptr(dev_ptr);
         
     | 
| 
       1308 
     | 
    
         
            -
                        size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
         
     | 
| 
       1309 
     | 
    
         
            -
             
     | 
| 
       1310 
     | 
    
         
            -
                        return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       1311 
     | 
    
         
            -
                                        {
         
     | 
| 
       1312 
     | 
    
         
            -
                auto r = sycl::range<1>(size);
         
     | 
| 
       1313 
     | 
    
         
            -
                auto o = sycl::id<1>(offset);
         
     | 
| 
       1314 
     | 
    
         
            -
                auto new_buffer = alloc.buffer.reinterpret<valueT>(
         
     | 
| 
       1315 
     | 
    
         
            -
                    sycl::range<1>(alloc.size / sizeof(valueT)));
         
     | 
| 
       1316 
     | 
    
         
            -
                sycl::accessor<valueT, 1, sycl::access_mode::write,
         
     | 
| 
       1317 
     | 
    
         
            -
                            sycl::access::target::device>
         
     | 
| 
       1318 
     | 
    
         
            -
                    acc(new_buffer, cgh, r, o);
         
     | 
| 
       1319 
     | 
    
         
            -
                cgh.fill(acc, value); });
         
     | 
| 
       1320 
     | 
    
         
            -
            #else
         
     | 
| 
       1321 
1286 
     | 
    
         
             
                        return q.fill(dev_ptr, value, size);
         
     | 
| 
       1322 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       1323 
1287 
     | 
    
         
             
                    }
         
     | 
| 
       1324 
1288 
     | 
    
         | 
| 
       1325 
1289 
     | 
    
         
             
                    /**
         
     | 
| 
         @@ -1413,72 +1377,8 @@ namespace dpct 
     | 
|
| 
       1413 
1377 
     | 
    
         
             
                    {
         
     | 
| 
       1414 
1378 
     | 
    
         
             
                        if (!size)
         
     | 
| 
       1415 
1379 
     | 
    
         
             
                            return sycl::event{};
         
     | 
| 
       1416 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       1417 
     | 
    
         
            -
                        auto &mm = mem_mgr::instance();
         
     | 
| 
       1418 
     | 
    
         
            -
                        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
         
     | 
| 
       1419 
     | 
    
         
            -
             
     | 
| 
       1420 
     | 
    
         
            -
                        switch (real_direction)
         
     | 
| 
       1421 
     | 
    
         
            -
                        {
         
     | 
| 
       1422 
     | 
    
         
            -
                        case host_to_host:
         
     | 
| 
       1423 
     | 
    
         
            -
                            return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       1424 
     | 
    
         
            -
                                            {
         
     | 
| 
       1425 
     | 
    
         
            -
                cgh.depends_on(dep_events);
         
     | 
| 
       1426 
     | 
    
         
            -
                cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
         
     | 
| 
       1427 
     | 
    
         
            -
                        case host_to_device:
         
     | 
| 
       1428 
     | 
    
         
            -
                        {
         
     | 
| 
       1429 
     | 
    
         
            -
                            auto alloc = mm.translate_ptr(to_ptr);
         
     | 
| 
       1430 
     | 
    
         
            -
                            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
         
     | 
| 
       1431 
     | 
    
         
            -
                            return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       1432 
     | 
    
         
            -
                                            {
         
     | 
| 
       1433 
     | 
    
         
            -
                cgh.depends_on(dep_events);
         
     | 
| 
       1434 
     | 
    
         
            -
                auto r = sycl::range<1>(size);
         
     | 
| 
       1435 
     | 
    
         
            -
                auto o = sycl::id<1>(offset);
         
     | 
| 
       1436 
     | 
    
         
            -
                sycl::accessor<byte_t, 1, sycl::access_mode::write,
         
     | 
| 
       1437 
     | 
    
         
            -
                                    sycl::access::target::device>
         
     | 
| 
       1438 
     | 
    
         
            -
                    acc(alloc.buffer, cgh, r, o);
         
     | 
| 
       1439 
     | 
    
         
            -
                cgh.copy(from_ptr, acc); });
         
     | 
| 
       1440 
     | 
    
         
            -
                        }
         
     | 
| 
       1441 
     | 
    
         
            -
                        case device_to_host:
         
     | 
| 
       1442 
     | 
    
         
            -
                        {
         
     | 
| 
       1443 
     | 
    
         
            -
                            auto alloc = mm.translate_ptr(from_ptr);
         
     | 
| 
       1444 
     | 
    
         
            -
                            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
         
     | 
| 
       1445 
     | 
    
         
            -
                            return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       1446 
     | 
    
         
            -
                                            {
         
     | 
| 
       1447 
     | 
    
         
            -
                cgh.depends_on(dep_events);
         
     | 
| 
       1448 
     | 
    
         
            -
                auto r = sycl::range<1>(size);
         
     | 
| 
       1449 
     | 
    
         
            -
                auto o = sycl::id<1>(offset);
         
     | 
| 
       1450 
     | 
    
         
            -
                sycl::accessor<byte_t, 1, sycl::access_mode::read,
         
     | 
| 
       1451 
     | 
    
         
            -
                                    sycl::access::target::device>
         
     | 
| 
       1452 
     | 
    
         
            -
                    acc(alloc.buffer, cgh, r, o);
         
     | 
| 
       1453 
     | 
    
         
            -
                cgh.copy(acc, to_ptr); });
         
     | 
| 
       1454 
     | 
    
         
            -
                        }
         
     | 
| 
       1455 
     | 
    
         
            -
                        case device_to_device:
         
     | 
| 
       1456 
     | 
    
         
            -
                        {
         
     | 
| 
       1457 
     | 
    
         
            -
                            auto to_alloc = mm.translate_ptr(to_ptr);
         
     | 
| 
       1458 
     | 
    
         
            -
                            auto from_alloc = mm.translate_ptr(from_ptr);
         
     | 
| 
       1459 
     | 
    
         
            -
                            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
         
     | 
| 
       1460 
     | 
    
         
            -
                            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
         
     | 
| 
       1461 
     | 
    
         
            -
                            return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       1462 
     | 
    
         
            -
                                            {
         
     | 
| 
       1463 
     | 
    
         
            -
                cgh.depends_on(dep_events);
         
     | 
| 
       1464 
     | 
    
         
            -
                auto r = sycl::range<1>(size);
         
     | 
| 
       1465 
     | 
    
         
            -
                auto to_o = sycl::id<1>(to_offset);
         
     | 
| 
       1466 
     | 
    
         
            -
                auto from_o = sycl::id<1>(from_offset);
         
     | 
| 
       1467 
     | 
    
         
            -
                sycl::accessor<byte_t, 1, sycl::access_mode::write,
         
     | 
| 
       1468 
     | 
    
         
            -
                                    sycl::access::target::device>
         
     | 
| 
       1469 
     | 
    
         
            -
                    to_acc(to_alloc.buffer, cgh, r, to_o);
         
     | 
| 
       1470 
     | 
    
         
            -
                sycl::accessor<byte_t, 1, sycl::access_mode::read,
         
     | 
| 
       1471 
     | 
    
         
            -
                                    sycl::access::target::device>
         
     | 
| 
       1472 
     | 
    
         
            -
                    from_acc(from_alloc.buffer, cgh, r, from_o);
         
     | 
| 
       1473 
     | 
    
         
            -
                cgh.copy(from_acc, to_acc); });
         
     | 
| 
       1474 
     | 
    
         
            -
                        }
         
     | 
| 
       1475 
     | 
    
         
            -
                        default:
         
     | 
| 
       1476 
     | 
    
         
            -
                            throw std::runtime_error("dpct_memcpy: invalid direction value");
         
     | 
| 
       1477 
     | 
    
         
            -
                        }
         
     | 
| 
       1478 
     | 
    
         
            -
            #else
         
     | 
| 
       1479 
1380 
     | 
    
         
             
                        return q.memcpy(to_ptr, from_ptr, size, dep_events);
         
     | 
| 
       1480 
1381 
     | 
    
         
             
                        GGML_UNUSED(direction);
         
     | 
| 
       1481 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       1482 
1382 
     | 
    
         
             
                    }
         
     | 
| 
       1483 
1383 
     | 
    
         | 
| 
       1484 
1384 
     | 
    
         
             
                    // Get actual copy range and make sure it will not exceed range.
         
     | 
| 
         @@ -1618,45 +1518,15 @@ namespace dpct 
     | 
|
| 
       1618 
1518 
     | 
    
         
             
                            break;
         
     | 
| 
       1619 
1519 
     | 
    
         
             
                        }
         
     | 
| 
       1620 
1520 
     | 
    
         
             
                        case device_to_device:
         
     | 
| 
       1621 
     | 
    
         
            -
             
     | 
| 
       1622 
     | 
    
         
            -
             
     | 
| 
       1623 
     | 
    
         
            -
                             
     | 
| 
       1624 
     | 
    
         
            -
             
     | 
| 
       1625 
     | 
    
         
            -
             
     | 
| 
       1626 
     | 
    
         
            -
             
     | 
| 
       1627 
     | 
    
         
            -
             
     | 
| 
       1628 
     | 
    
         
            -
             
     | 
| 
       1629 
     | 
    
         
            -
             
     | 
| 
       1630 
     | 
    
         
            -
                cgh.depends_on(dep_events);
         
     | 
| 
       1631 
     | 
    
         
            -
                auto to_o = sycl::id<1>(to_offset);
         
     | 
| 
       1632 
     | 
    
         
            -
                auto from_o = sycl::id<1>(from_offset);
         
     | 
| 
       1633 
     | 
    
         
            -
                sycl::accessor<byte_t, 1, sycl::access_mode::write,
         
     | 
| 
       1634 
     | 
    
         
            -
                                    sycl::access::target::device>
         
     | 
| 
       1635 
     | 
    
         
            -
                    to_acc(to_alloc.buffer, cgh,
         
     | 
| 
       1636 
     | 
    
         
            -
                            get_copy_range(size, to_slice, to_range.get(0)), to_o);
         
     | 
| 
       1637 
     | 
    
         
            -
                sycl::accessor<byte_t, 1, sycl::access_mode::read,
         
     | 
| 
       1638 
     | 
    
         
            -
                                    sycl::access::target::device>
         
     | 
| 
       1639 
     | 
    
         
            -
                    from_acc(from_alloc.buffer, cgh,
         
     | 
| 
       1640 
     | 
    
         
            -
                            get_copy_range(size, from_slice, from_range.get(0)), from_o);
         
     | 
| 
       1641 
     | 
    
         
            -
                cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
         
     | 
| 
       1642 
     | 
    
         
            -
                    size,
         
     | 
| 
       1643 
     | 
    
         
            -
                    [=](sycl::id<3> id) {
         
     | 
| 
       1644 
     | 
    
         
            -
                        to_acc[get_offset(id, to_slice, to_range.get(0))] =
         
     | 
| 
       1645 
     | 
    
         
            -
                            from_acc[get_offset(id, from_slice, from_range.get(0))];
         
     | 
| 
       1646 
     | 
    
         
            -
                    }); }));
         
     | 
| 
       1647 
     | 
    
         
            -
                        }
         
     | 
| 
       1648 
     | 
    
         
            -
            #else
         
     | 
| 
       1649 
     | 
    
         
            -
                            event_list.push_back(q.submit([&](sycl::handler &cgh)
         
     | 
| 
       1650 
     | 
    
         
            -
                                                          {
         
     | 
| 
       1651 
     | 
    
         
            -
                cgh.depends_on(dep_events);
         
     | 
| 
       1652 
     | 
    
         
            -
                cgh.parallel_for<class dpct_memcpy_3d_detail>(
         
     | 
| 
       1653 
     | 
    
         
            -
                    size,
         
     | 
| 
       1654 
     | 
    
         
            -
                    [=](sycl::id<3> id) {
         
     | 
| 
       1655 
     | 
    
         
            -
                        to_surface[get_offset(id, to_slice, to_range.get(0))] =
         
     | 
| 
       1656 
     | 
    
         
            -
                            from_surface[get_offset(id, from_slice, from_range.get(0))];
         
     | 
| 
       1657 
     | 
    
         
            -
                    }); }));
         
     | 
| 
       1658 
     | 
    
         
            -
            #endif
         
     | 
| 
       1659 
     | 
    
         
            -
                        break;
         
     | 
| 
      
 1521 
     | 
    
         
            +
                            event_list.push_back(q.submit([&](sycl::handler &cgh){
         
     | 
| 
      
 1522 
     | 
    
         
            +
                            cgh.depends_on(dep_events);
         
     | 
| 
      
 1523 
     | 
    
         
            +
                            cgh.parallel_for<class dpct_memcpy_3d_detail>(
         
     | 
| 
      
 1524 
     | 
    
         
            +
                                size,
         
     | 
| 
      
 1525 
     | 
    
         
            +
                                [=](sycl::id<3> id) {
         
     | 
| 
      
 1526 
     | 
    
         
            +
                                    to_surface[get_offset(id, to_slice, to_range.get(0))] =
         
     | 
| 
      
 1527 
     | 
    
         
            +
                                        from_surface[get_offset(id, from_slice, from_range.get(0))];
         
     | 
| 
      
 1528 
     | 
    
         
            +
                                }); }));
         
     | 
| 
      
 1529 
     | 
    
         
            +
                            break;
         
     | 
| 
       1660 
1530 
     | 
    
         
             
                        default:
         
     | 
| 
       1661 
1531 
     | 
    
         
             
                            throw std::runtime_error("dpct_memcpy: invalid direction value");
         
     | 
| 
       1662 
1532 
     | 
    
         
             
                        }
         
     | 
| 
         @@ -1754,11 +1624,7 @@ namespace dpct 
     | 
|
| 
       1754 
1624 
     | 
    
         
             
                    {
         
     | 
| 
       1755 
1625 
     | 
    
         
             
                        if (ptr)
         
     | 
| 
       1756 
1626 
     | 
    
         
             
                        {
         
     | 
| 
       1757 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       1758 
     | 
    
         
            -
                            detail::mem_mgr::instance().mem_free(ptr);
         
     | 
| 
       1759 
     | 
    
         
            -
            #else
         
     | 
| 
       1760 
1627 
     | 
    
         
             
                            sycl::free(ptr, q.get_context());
         
     | 
| 
       1761 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       1762 
1628 
     | 
    
         
             
                        }
         
     | 
| 
       1763 
1629 
     | 
    
         
             
                    }
         
     | 
| 
       1764 
1630 
     | 
    
         | 
| 
         @@ -1766,11 +1632,7 @@ namespace dpct 
     | 
|
| 
       1766 
1632 
     | 
    
         
             
                    inline auto get_memory(const void *x)
         
     | 
| 
       1767 
1633 
     | 
    
         
             
                    {
         
     | 
| 
       1768 
1634 
     | 
    
         
             
                        T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
         
     | 
| 
       1769 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       1770 
     | 
    
         
            -
                        return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
         
     | 
| 
       1771 
     | 
    
         
            -
            #else
         
     | 
| 
       1772 
1635 
     | 
    
         
             
                        return new_x;
         
     | 
| 
       1773 
     | 
    
         
            -
            #endif
         
     | 
| 
       1774 
1636 
     | 
    
         
             
                    }
         
     | 
| 
       1775 
1637 
     | 
    
         | 
| 
       1776 
1638 
     | 
    
         
             
                    template <typename T>
         
     | 
| 
         @@ -1802,24 +1664,6 @@ namespace dpct 
     | 
|
| 
       1802 
1664 
     | 
    
         
             
                                          const void *alpha, const void *a, int lda, const void *b,
         
     | 
| 
       1803 
1665 
     | 
    
         
             
                                          int ldb, const void *beta, void *c, int ldc)
         
     | 
| 
       1804 
1666 
     | 
    
         
             
                    {
         
     | 
| 
       1805 
     | 
    
         
            -
            #ifndef __INTEL_MKL__
         
     | 
| 
       1806 
     | 
    
         
            -
                        GGML_UNUSED(q);
         
     | 
| 
       1807 
     | 
    
         
            -
                        GGML_UNUSED(a_trans);
         
     | 
| 
       1808 
     | 
    
         
            -
                        GGML_UNUSED(b_trans);
         
     | 
| 
       1809 
     | 
    
         
            -
                        GGML_UNUSED(m);
         
     | 
| 
       1810 
     | 
    
         
            -
                        GGML_UNUSED(n);
         
     | 
| 
       1811 
     | 
    
         
            -
                        GGML_UNUSED(k);
         
     | 
| 
       1812 
     | 
    
         
            -
                        GGML_UNUSED(alpha);
         
     | 
| 
       1813 
     | 
    
         
            -
                        GGML_UNUSED(a);
         
     | 
| 
       1814 
     | 
    
         
            -
                        GGML_UNUSED(lda);
         
     | 
| 
       1815 
     | 
    
         
            -
                        GGML_UNUSED(b);
         
     | 
| 
       1816 
     | 
    
         
            -
                        GGML_UNUSED(ldb);
         
     | 
| 
       1817 
     | 
    
         
            -
                        GGML_UNUSED(beta);
         
     | 
| 
       1818 
     | 
    
         
            -
                        GGML_UNUSED(c);
         
     | 
| 
       1819 
     | 
    
         
            -
                        GGML_UNUSED(ldc);
         
     | 
| 
       1820 
     | 
    
         
            -
                        throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
         
     | 
| 
       1821 
     | 
    
         
            -
                                                 "Project does not support this API.");
         
     | 
| 
       1822 
     | 
    
         
            -
            #else
         
     | 
| 
       1823 
1667 
     | 
    
         
             
                        Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
         
     | 
| 
       1824 
1668 
     | 
    
         
             
                        Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
         
     | 
| 
       1825 
1669 
     | 
    
         
             
                        auto data_a = get_memory<const Ta>(a);
         
     | 
| 
         @@ -1828,7 +1672,6 @@ namespace dpct 
     | 
|
| 
       1828 
1672 
     | 
    
         
             
                        oneapi::mkl::blas::column_major::gemm(
         
     | 
| 
       1829 
1673 
     | 
    
         
             
                            q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
         
     | 
| 
       1830 
1674 
     | 
    
         
             
                            data_b, ldb, beta_value, data_c, ldc);
         
     | 
| 
       1831 
     | 
    
         
            -
            #endif
         
     | 
| 
       1832 
1675 
     | 
    
         
             
                    }
         
     | 
| 
       1833 
1676 
     | 
    
         | 
| 
       1834 
1677 
     | 
    
         
             
                    template <typename VecT, class BinaryOperation, class = void>
         
     | 
| 
         @@ -2222,72 +2065,8 @@ namespace dpct 
     | 
|
| 
       2222 
2065 
     | 
    
         
             
                {
         
     | 
| 
       2223 
2066 
     | 
    
         
             
                    if (!size)
         
     | 
| 
       2224 
2067 
     | 
    
         
             
                        return sycl::event{};
         
     | 
| 
       2225 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       2226 
     | 
    
         
            -
                    auto &mm = mem_mgr::instance();
         
     | 
| 
       2227 
     | 
    
         
            -
                    auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
         
     | 
| 
       2228 
     | 
    
         
            -
             
     | 
| 
       2229 
     | 
    
         
            -
                    switch (real_direction)
         
     | 
| 
       2230 
     | 
    
         
            -
                    {
         
     | 
| 
       2231 
     | 
    
         
            -
                    case host_to_host:
         
     | 
| 
       2232 
     | 
    
         
            -
                        return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       2233 
     | 
    
         
            -
                                        {
         
     | 
| 
       2234 
     | 
    
         
            -
                    cgh.depends_on(dep_events);
         
     | 
| 
       2235 
     | 
    
         
            -
                    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
         
     | 
| 
       2236 
     | 
    
         
            -
                    case host_to_device:
         
     | 
| 
       2237 
     | 
    
         
            -
                    {
         
     | 
| 
       2238 
     | 
    
         
            -
                        auto alloc = mm.translate_ptr(to_ptr);
         
     | 
| 
       2239 
     | 
    
         
            -
                        size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
         
     | 
| 
       2240 
     | 
    
         
            -
                        return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       2241 
     | 
    
         
            -
                                        {
         
     | 
| 
       2242 
     | 
    
         
            -
                    cgh.depends_on(dep_events);
         
     | 
| 
       2243 
     | 
    
         
            -
                    auto r = sycl::range<1>(size);
         
     | 
| 
       2244 
     | 
    
         
            -
                    auto o = sycl::id<1>(offset);
         
     | 
| 
       2245 
     | 
    
         
            -
                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
         
     | 
| 
       2246 
     | 
    
         
            -
                                        sycl::access::target::device>
         
     | 
| 
       2247 
     | 
    
         
            -
                        acc(alloc.buffer, cgh, r, o);
         
     | 
| 
       2248 
     | 
    
         
            -
                    cgh.copy(from_ptr, acc); });
         
     | 
| 
       2249 
     | 
    
         
            -
                    }
         
     | 
| 
       2250 
     | 
    
         
            -
                    case device_to_host:
         
     | 
| 
       2251 
     | 
    
         
            -
                    {
         
     | 
| 
       2252 
     | 
    
         
            -
                        auto alloc = mm.translate_ptr(from_ptr);
         
     | 
| 
       2253 
     | 
    
         
            -
                        size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
         
     | 
| 
       2254 
     | 
    
         
            -
                        return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       2255 
     | 
    
         
            -
                                        {
         
     | 
| 
       2256 
     | 
    
         
            -
                    cgh.depends_on(dep_events);
         
     | 
| 
       2257 
     | 
    
         
            -
                    auto r = sycl::range<1>(size);
         
     | 
| 
       2258 
     | 
    
         
            -
                    auto o = sycl::id<1>(offset);
         
     | 
| 
       2259 
     | 
    
         
            -
                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
         
     | 
| 
       2260 
     | 
    
         
            -
                                        sycl::access::target::device>
         
     | 
| 
       2261 
     | 
    
         
            -
                        acc(alloc.buffer, cgh, r, o);
         
     | 
| 
       2262 
     | 
    
         
            -
                    cgh.copy(acc, to_ptr); });
         
     | 
| 
       2263 
     | 
    
         
            -
                    }
         
     | 
| 
       2264 
     | 
    
         
            -
                    case device_to_device:
         
     | 
| 
       2265 
     | 
    
         
            -
                    {
         
     | 
| 
       2266 
     | 
    
         
            -
                        auto to_alloc = mm.translate_ptr(to_ptr);
         
     | 
| 
       2267 
     | 
    
         
            -
                        auto from_alloc = mm.translate_ptr(from_ptr);
         
     | 
| 
       2268 
     | 
    
         
            -
                        size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
         
     | 
| 
       2269 
     | 
    
         
            -
                        size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
         
     | 
| 
       2270 
     | 
    
         
            -
                        return q.submit([&](sycl::handler &cgh)
         
     | 
| 
       2271 
     | 
    
         
            -
                                        {
         
     | 
| 
       2272 
     | 
    
         
            -
                    cgh.depends_on(dep_events);
         
     | 
| 
       2273 
     | 
    
         
            -
                    auto r = sycl::range<1>(size);
         
     | 
| 
       2274 
     | 
    
         
            -
                    auto to_o = sycl::id<1>(to_offset);
         
     | 
| 
       2275 
     | 
    
         
            -
                    auto from_o = sycl::id<1>(from_offset);
         
     | 
| 
       2276 
     | 
    
         
            -
                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
         
     | 
| 
       2277 
     | 
    
         
            -
                                        sycl::access::target::device>
         
     | 
| 
       2278 
     | 
    
         
            -
                        to_acc(to_alloc.buffer, cgh, r, to_o);
         
     | 
| 
       2279 
     | 
    
         
            -
                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
         
     | 
| 
       2280 
     | 
    
         
            -
                                        sycl::access::target::device>
         
     | 
| 
       2281 
     | 
    
         
            -
                        from_acc(from_alloc.buffer, cgh, r, from_o);
         
     | 
| 
       2282 
     | 
    
         
            -
                    cgh.copy(from_acc, to_acc); });
         
     | 
| 
       2283 
     | 
    
         
            -
                    }
         
     | 
| 
       2284 
     | 
    
         
            -
                    default:
         
     | 
| 
       2285 
     | 
    
         
            -
                        throw std::runtime_error("dpct_memcpy: invalid direction value");
         
     | 
| 
       2286 
     | 
    
         
            -
                    }
         
     | 
| 
       2287 
     | 
    
         
            -
            #else
         
     | 
| 
       2288 
2068 
     | 
    
         
             
                    return q.memcpy(to_ptr, from_ptr, size, dep_events);
         
     | 
| 
       2289 
2069 
     | 
    
         
             
                    GGML_UNUSED(direction);
         
     | 
| 
       2290 
     | 
    
         
            -
            #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       2291 
2070 
     | 
    
         
             
                }
         
     | 
| 
       2292 
2071 
     | 
    
         | 
| 
       2293 
2072 
     | 
    
         
             
                // Get actual copy range and make sure it will not exceed range.
         
     | 
| 
         @@ -2427,34 +2206,6 @@ namespace dpct 
     | 
|
| 
       2427 
2206 
     | 
    
         
             
                        break;
         
     | 
| 
       2428 
2207 
     | 
    
         
             
                    }
         
     | 
| 
       2429 
2208 
     | 
    
         
             
                    case device_to_device:
         
     | 
| 
       2430 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       2431 
     | 
    
         
            -
                    {
         
     | 
| 
       2432 
     | 
    
         
            -
                        auto &mm = mem_mgr::instance();
         
     | 
| 
       2433 
     | 
    
         
            -
                        auto to_alloc = mm.translate_ptr(to_surface);
         
     | 
| 
       2434 
     | 
    
         
            -
                        auto from_alloc = mm.translate_ptr(from_surface);
         
     | 
| 
       2435 
     | 
    
         
            -
                        size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
         
     | 
| 
       2436 
     | 
    
         
            -
                        size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
         
     | 
| 
       2437 
     | 
    
         
            -
                        event_list.push_back(q.submit([&](sycl::handler &cgh)
         
     | 
| 
       2438 
     | 
    
         
            -
                                                      {
         
     | 
| 
       2439 
     | 
    
         
            -
                    cgh.depends_on(dep_events);
         
     | 
| 
       2440 
     | 
    
         
            -
                    auto to_o = sycl::id<1>(to_offset);
         
     | 
| 
       2441 
     | 
    
         
            -
                    auto from_o = sycl::id<1>(from_offset);
         
     | 
| 
       2442 
     | 
    
         
            -
                    sycl::accessor<byte_t, 1, sycl::access_mode::write,
         
     | 
| 
       2443 
     | 
    
         
            -
                                        sycl::access::target::device>
         
     | 
| 
       2444 
     | 
    
         
            -
                        to_acc(to_alloc.buffer, cgh,
         
     | 
| 
       2445 
     | 
    
         
            -
                                get_copy_range(size, to_slice, to_range.get(0)), to_o);
         
     | 
| 
       2446 
     | 
    
         
            -
                    sycl::accessor<byte_t, 1, sycl::access_mode::read,
         
     | 
| 
       2447 
     | 
    
         
            -
                                        sycl::access::target::device>
         
     | 
| 
       2448 
     | 
    
         
            -
                        from_acc(from_alloc.buffer, cgh,
         
     | 
| 
       2449 
     | 
    
         
            -
                                get_copy_range(size, from_slice, from_range.get(0)), from_o);
         
     | 
| 
       2450 
     | 
    
         
            -
                    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
         
     | 
| 
       2451 
     | 
    
         
            -
                        size,
         
     | 
| 
       2452 
     | 
    
         
            -
                        [=](sycl::id<3> id) {
         
     | 
| 
       2453 
     | 
    
         
            -
                            to_acc[get_offset(id, to_slice, to_range.get(0))] =
         
     | 
| 
       2454 
     | 
    
         
            -
                                from_acc[get_offset(id, from_slice, from_range.get(0))];
         
     | 
| 
       2455 
     | 
    
         
            -
                        }); }));
         
     | 
| 
       2456 
     | 
    
         
            -
                    }
         
     | 
| 
       2457 
     | 
    
         
            -
            #else
         
     | 
| 
       2458 
2209 
     | 
    
         
             
                        event_list.push_back(q.submit([&](sycl::handler &cgh)
         
     | 
| 
       2459 
2210 
     | 
    
         
             
                                                      {
         
     | 
| 
       2460 
2211 
     | 
    
         
             
                    cgh.depends_on(dep_events);
         
     | 
| 
         @@ -2464,7 +2215,6 @@ namespace dpct 
     | 
|
| 
       2464 
2215 
     | 
    
         
             
                            to_surface[get_offset(id, to_slice, to_range.get(0))] =
         
     | 
| 
       2465 
2216 
     | 
    
         
             
                                from_surface[get_offset(id, from_slice, from_range.get(0))];
         
     | 
| 
       2466 
2217 
     | 
    
         
             
                        }); }));
         
     | 
| 
       2467 
     | 
    
         
            -
            #endif
         
     | 
| 
       2468 
2218 
     | 
    
         
             
                    break;
         
     | 
| 
       2469 
2219 
     | 
    
         
             
                    default:
         
     | 
| 
       2470 
2220 
     | 
    
         
             
                        throw std::runtime_error("dpct_memcpy: invalid direction value");
         
     | 
| 
         @@ -2561,6 +2311,7 @@ namespace dpct 
     | 
|
| 
       2561 
2311 
     | 
    
         
             
                                                      lda, b, ldb, beta, c, ldc);
         
     | 
| 
       2562 
2312 
     | 
    
         
             
                        break;
         
     | 
| 
       2563 
2313 
     | 
    
         
             
                    }
         
     | 
| 
      
 2314 
     | 
    
         
            +
            #ifdef __INTEL_MKL__
         
     | 
| 
       2564 
2315 
     | 
    
         
             
                    case detail::get_type_combination_id(
         
     | 
| 
       2565 
2316 
     | 
    
         
             
                        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
         
     | 
| 
       2566 
2317 
     | 
    
         
             
                        library_data_t::real_float, library_data_t::real_float):
         
     | 
| 
         @@ -2622,6 +2373,7 @@ namespace dpct 
     | 
|
| 
       2622 
2373 
     | 
    
         
             
                            q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
         
     | 
| 
       2623 
2374 
     | 
    
         
             
                        break;
         
     | 
| 
       2624 
2375 
     | 
    
         
             
                    }
         
     | 
| 
      
 2376 
     | 
    
         
            +
            #endif // __INTEL_MKL__
         
     | 
| 
       2625 
2377 
     | 
    
         
             
                    default:
         
     | 
| 
       2626 
2378 
     | 
    
         
             
                        throw std::runtime_error("the combination of data type is unsupported");
         
     | 
| 
       2627 
2379 
     | 
    
         
             
                    }
         
     | 
| 
         @@ -2655,9 +2407,6 @@ namespace dpct 
     | 
|
| 
       2655 
2407 
     | 
    
         
             
                                       void *c[], library_data_t c_type, int ldc,
         
     | 
| 
       2656 
2408 
     | 
    
         
             
                                       int batch_size, library_data_t scaling_type)
         
     | 
| 
       2657 
2409 
     | 
    
         
             
                {
         
     | 
| 
       2658 
     | 
    
         
            -
            #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       2659 
     | 
    
         
            -
                    throw std::runtime_error("this API is unsupported when USM level is none");
         
     | 
| 
       2660 
     | 
    
         
            -
            #else
         
     | 
| 
       2661 
2410 
     | 
    
         
             
                    if (scaling_type == library_data_t::real_float &&
         
     | 
| 
       2662 
2411 
     | 
    
         
             
                        c_type == library_data_t::complex_float)
         
     | 
| 
       2663 
2412 
     | 
    
         
             
                    {
         
     | 
| 
         @@ -2792,7 +2541,6 @@ namespace dpct 
     | 
|
| 
       2792 
2541 
     | 
    
         
             
                    default:
         
     | 
| 
       2793 
2542 
     | 
    
         
             
                        throw std::runtime_error("the combination of data type is unsupported");
         
     | 
| 
       2794 
2543 
     | 
    
         
             
                    }
         
     | 
| 
       2795 
     | 
    
         
            -
            #endif
         
     | 
| 
       2796 
2544 
     | 
    
         
             
                }
         
     | 
| 
       2797 
2545 
     | 
    
         | 
| 
       2798 
2546 
     | 
    
         
             
                /// Computes a batch of matrix-matrix product with general matrices.
         
     | 
| 
         @@ -3131,24 +2879,9 @@ namespace dpct 
     | 
|
| 
       3131 
2879 
     | 
    
         
             
                        template <size_t D = Dimension>
         
     | 
| 
       3132 
2880 
     | 
    
         
             
                        typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
         
     | 
| 
       3133 
2881 
     | 
    
         
             
                            init();
         
     | 
| 
       3134 
     | 
    
         
            -
                    #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       3135 
     | 
    
         
            -
                            return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
         
     | 
| 
       3136 
     | 
    
         
            -
                                    _device_ptr)
         
     | 
| 
       3137 
     | 
    
         
            -
                                .template get_access<sycl::access_mode::read_write>()[index];
         
     | 
| 
       3138 
     | 
    
         
            -
                    #else
         
     | 
| 
       3139 
2882 
     | 
    
         
             
                            return _device_ptr[index];
         
     | 
| 
       3140 
     | 
    
         
            -
                    #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       3141 
2883 
     | 
    
         
             
                        }
         
     | 
| 
       3142 
2884 
     | 
    
         | 
| 
       3143 
     | 
    
         
            -
                    #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       3144 
     | 
    
         
            -
                        /// Get sycl::accessor for the device memory object when usm is not used.
         
     | 
| 
       3145 
     | 
    
         
            -
                        accessor_t get_access(sycl::handler &cgh) {
         
     | 
| 
       3146 
     | 
    
         
            -
                            return get_buffer(_device_ptr)
         
     | 
| 
       3147 
     | 
    
         
            -
                                .template reinterpret<T, Dimension>(_range)
         
     | 
| 
       3148 
     | 
    
         
            -
                                .template get_access<detail::memory_traits<Memory, T>::mode,
         
     | 
| 
       3149 
     | 
    
         
            -
                                                    detail::memory_traits<Memory, T>::target>(cgh);
         
     | 
| 
       3150 
     | 
    
         
            -
                        }
         
     | 
| 
       3151 
     | 
    
         
            -
                    #else
         
     | 
| 
       3152 
2885 
     | 
    
         
             
                        /// Get dpct::accessor with dimension info for the device memory object
         
     | 
| 
       3153 
2886 
     | 
    
         
             
                        /// when usm is used and dimension is greater than 1.
         
     | 
| 
       3154 
2887 
     | 
    
         
             
                        template <size_t D = Dimension>
         
     | 
| 
         @@ -3156,7 +2889,6 @@ namespace dpct 
     | 
|
| 
       3156 
2889 
     | 
    
         
             
                        get_access(sycl::handler &cgh) {
         
     | 
| 
       3157 
2890 
     | 
    
         
             
                            return dpct_accessor_t((T *)_device_ptr, _range);
         
     | 
| 
       3158 
2891 
     | 
    
         
             
                        }
         
     | 
| 
       3159 
     | 
    
         
            -
                    #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       3160 
2892 
     | 
    
         | 
| 
       3161 
2893 
     | 
    
         
             
                    private:
         
     | 
| 
       3162 
2894 
     | 
    
         
             
                        device_memory(value_t *memory_ptr, size_t size)
         
     | 
| 
         @@ -3201,15 +2933,6 @@ namespace dpct 
     | 
|
| 
       3201 
2933 
     | 
    
         | 
| 
       3202 
2934 
     | 
    
         
             
                        /// Default constructor
         
     | 
| 
       3203 
2935 
     | 
    
         
             
                        device_memory() : base(1) {}
         
     | 
| 
       3204 
     | 
    
         
            -
             
     | 
| 
       3205 
     | 
    
         
            -
                    #ifdef DPCT_USM_LEVEL_NONE
         
     | 
| 
       3206 
     | 
    
         
            -
                        /// Get sycl::accessor for the device memory object when usm is not used.
         
     | 
| 
       3207 
     | 
    
         
            -
                        accessor_t get_access(sycl::handler &cgh) {
         
     | 
| 
       3208 
     | 
    
         
            -
                            auto buf = get_buffer(base::get_ptr())
         
     | 
| 
       3209 
     | 
    
         
            -
                                        .template reinterpret<T, 1>(sycl::range<1>(1));
         
     | 
| 
       3210 
     | 
    
         
            -
                            return accessor_t(buf, cgh);
         
     | 
| 
       3211 
     | 
    
         
            -
                        }
         
     | 
| 
       3212 
     | 
    
         
            -
                    #endif // DPCT_USM_LEVEL_NONE
         
     | 
| 
       3213 
2936 
     | 
    
         
             
                    };
         
     | 
| 
       3214 
2937 
     | 
    
         
             
                    } // namespace detail
         
     | 
| 
       3215 
2938 
     | 
    
         | 
| 
         @@ -3228,7 +2951,7 @@ namespace dpct 
     | 
|
| 
       3228 
2951 
     | 
    
         
             
            #include "ggml-common.h"
         
     | 
| 
       3229 
2952 
     | 
    
         | 
| 
       3230 
2953 
     | 
    
         
             
            static int g_ggml_sycl_debug=0;
         
     | 
| 
       3231 
     | 
    
         
            -
            #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug)  
     | 
| 
      
 2954 
     | 
    
         
            +
            #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
         
     | 
| 
       3232 
2955 
     | 
    
         | 
| 
       3233 
2956 
     | 
    
         
             
            #define CHECK_TRY_ERROR(expr)                                                  \
         
     | 
| 
       3234 
2957 
     | 
    
         
             
              [&]() {                                                                      \
         
     | 
| 
         @@ -3315,6 +3038,10 @@ typedef float dfloat; // dequantize float 
     | 
|
| 
       3315 
3038 
     | 
    
         
             
            typedef sycl::float2 dfloat2;
         
     | 
| 
       3316 
3039 
     | 
    
         
             
            #endif //GGML_SYCL_F16
         
     | 
| 
       3317 
3040 
     | 
    
         | 
| 
      
 3041 
     | 
    
         
            +
            #define MMVQ_MAX_BATCH_SIZE  8
         
     | 
| 
      
 3042 
     | 
    
         
            +
             
     | 
| 
      
 3043 
     | 
    
         
            +
            static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
         
     | 
| 
      
 3044 
     | 
    
         
            +
             
     | 
| 
       3318 
3045 
     | 
    
         
             
            bool   ggml_sycl_loaded(void);
         
     | 
| 
       3319 
3046 
     | 
    
         
             
            void * ggml_sycl_host_malloc(size_t size);
         
     | 
| 
       3320 
3047 
     | 
    
         
             
            void   ggml_sycl_host_free(void * ptr);
         
     | 
| 
         @@ -4750,6 +4477,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest 
     | 
|
| 
       4750 
4477 
     | 
    
         | 
| 
       4751 
4478 
     | 
    
         
             
            }
         
     | 
| 
       4752 
4479 
     | 
    
         | 
| 
      
 4480 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 4481 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 4482 
     | 
    
         
            +
            dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         
     | 
| 
      
 4483 
     | 
    
         
            +
                                   const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 4484 
     | 
    
         
            +
             
     | 
| 
      
 4485 
     | 
    
         
            +
                const int i = item_ct1.get_group(2);
         
     | 
| 
      
 4486 
     | 
    
         
            +
                const block_iq2_s * x = (const block_iq2_s *) vx;
         
     | 
| 
      
 4487 
     | 
    
         
            +
             
     | 
| 
      
 4488 
     | 
    
         
            +
                const int tid = item_ct1.get_local_id(2);
         
     | 
| 
      
 4489 
     | 
    
         
            +
            #if QK_K == 256
         
     | 
| 
      
 4490 
     | 
    
         
            +
                const int il = tid/8; // 0...3
         
     | 
| 
      
 4491 
     | 
    
         
            +
                const int ib = tid%8; // 0...7
         
     | 
| 
      
 4492 
     | 
    
         
            +
                dst_t * y = yy + i*QK_K + 32*ib + 8*il;
         
     | 
| 
      
 4493 
     | 
    
         
            +
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
         
     | 
| 
      
 4494 
     | 
    
         
            +
                const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
         
     | 
| 
      
 4495 
     | 
    
         
            +
                const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
         
     | 
| 
      
 4496 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 4497 
     | 
    
         
            +
                for (int j = 0; j < 8; ++j)
         
     | 
| 
      
 4498 
     | 
    
         
            +
                    y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
         
     | 
| 
      
 4499 
     | 
    
         
            +
            #else
         
     | 
| 
      
 4500 
     | 
    
         
            +
                assert(false);
         
     | 
| 
      
 4501 
     | 
    
         
            +
             
     | 
| 
      
 4502 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 4503 
     | 
    
         
            +
             
     | 
| 
      
 4504 
     | 
    
         
            +
            }
         
     | 
| 
      
 4505 
     | 
    
         
            +
             
     | 
| 
       4753 
4506 
     | 
    
         
             
            template<typename dst_t>
         
     | 
| 
       4754 
4507 
     | 
    
         
             
            static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
         
     | 
| 
       4755 
4508 
     | 
    
         
             
                                                 const sycl::nd_item<3> &item_ct1,
         
     | 
| 
         @@ -4782,26 +4535,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res 
     | 
|
| 
       4782 
4535 
     | 
    
         | 
| 
       4783 
4536 
     | 
    
         
             
            }
         
     | 
| 
       4784 
4537 
     | 
    
         | 
| 
       4785 
     | 
    
         
            -
            template<typename dst_t>
         
     | 
| 
       4786 
     | 
    
         
            -
            static void 
     | 
| 
       4787 
     | 
    
         
            -
             
     | 
| 
       4788 
     | 
    
         
            -
             
     | 
| 
       4789 
     | 
    
         
            -
             
     | 
| 
       4790 
     | 
    
         
            -
                                                 const uint8_t *kmask_iq2xs) {
         
     | 
| 
      
 4538 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 4539 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 4540 
     | 
    
         
            +
            dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         
     | 
| 
      
 4541 
     | 
    
         
            +
                                   const sycl::nd_item<3> &item_ct1,
         
     | 
| 
      
 4542 
     | 
    
         
            +
                                   const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
         
     | 
| 
       4791 
4543 
     | 
    
         | 
| 
       4792 
4544 
     | 
    
         
             
                const int i = item_ct1.get_group(2);
         
     | 
| 
       4793 
     | 
    
         
            -
                const block_iq3_s * x = (const block_iq3_s 
     | 
| 
      
 4545 
     | 
    
         
            +
                const block_iq3_s * x = (const block_iq3_s *) vx;
         
     | 
| 
       4794 
4546 
     | 
    
         | 
| 
       4795 
4547 
     | 
    
         
             
                const int tid = item_ct1.get_local_id(2);
         
     | 
| 
       4796 
4548 
     | 
    
         
             
            #if QK_K == 256
         
     | 
| 
       4797 
4549 
     | 
    
         
             
                const int il = tid/8; // 0...3
         
     | 
| 
       4798 
4550 
     | 
    
         
             
                const int ib = tid%8; // 0...7
         
     | 
| 
       4799 
4551 
     | 
    
         
             
                dst_t * y = yy + i*QK_K + 32*ib + 8*il;
         
     | 
| 
       4800 
     | 
    
         
            -
                const uint8_t 
     | 
| 
       4801 
     | 
    
         
            -
                const uint8_t 
     | 
| 
       4802 
     | 
    
         
            -
                const uint8_t 
     | 
| 
      
 4552 
     | 
    
         
            +
                const uint8_t * qs = x[i].qs + 8*ib;
         
     | 
| 
      
 4553 
     | 
    
         
            +
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
         
     | 
| 
      
 4554 
     | 
    
         
            +
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
         
     | 
| 
       4803 
4555 
     | 
    
         
             
                const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
         
     | 
| 
       4804 
4556 
     | 
    
         
             
                const uint8_t signs = x[i].signs[4*ib + il];
         
     | 
| 
      
 4557 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
       4805 
4558 
     | 
    
         
             
                for (int j = 0; j < 4; ++j) {
         
     | 
| 
       4806 
4559 
     | 
    
         
             
                    y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
         
     | 
| 
       4807 
4560 
     | 
    
         
             
                    y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
         
     | 
| 
         @@ -4812,12 +4565,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr 
     | 
|
| 
       4812 
4565 
     | 
    
         | 
| 
       4813 
4566 
     | 
    
         
             
            }
         
     | 
| 
       4814 
4567 
     | 
    
         | 
| 
       4815 
     | 
    
         
            -
            template<typename dst_t>
         
     | 
| 
       4816 
     | 
    
         
            -
            static void 
     | 
| 
       4817 
     | 
    
         
            -
             
     | 
| 
       4818 
     | 
    
         
            -
             
     | 
| 
       4819 
     | 
    
         
            -
             
     | 
| 
       4820 
     | 
    
         
            -
             
     | 
| 
      
 4568 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 4569 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 4570 
     | 
    
         
            +
            dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
         
     | 
| 
      
 4571 
     | 
    
         
            +
                                   const sycl::nd_item<3> &item_ct1,
         
     | 
| 
      
 4572 
     | 
    
         
            +
                                   const uint32_t *iq1s_grid_gpu) {
         
     | 
| 
      
 4573 
     | 
    
         
            +
             
     | 
| 
       4821 
4574 
     | 
    
         
             
                const int i = item_ct1.get_group(2);
         
     | 
| 
       4822 
4575 
     | 
    
         
             
                const block_iq1_s * x = (const block_iq1_s  *) vx;
         
     | 
| 
       4823 
4576 
     | 
    
         | 
| 
         @@ -4826,14 +4579,49 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr 
     | 
|
| 
       4826 
4579 
     | 
    
         
             
                const int il = tid/8; // 0...3
         
     | 
| 
       4827 
4580 
     | 
    
         
             
                const int ib = tid%8; // 0...7
         
     | 
| 
       4828 
4581 
     | 
    
         
             
                dst_t * y = yy + i*QK_K + 32*ib + 8*il;
         
     | 
| 
       4829 
     | 
    
         
            -
                const  
     | 
| 
       4830 
     | 
    
         
            -
                const  
     | 
| 
       4831 
     | 
    
         
            -
                const  
     | 
| 
       4832 
     | 
    
         
            -
                 
     | 
| 
       4833 
     | 
    
         
            -
                 
     | 
| 
       4834 
     | 
    
         
            -
                 
     | 
| 
       4835 
     | 
    
         
            -
             
     | 
| 
       4836 
     | 
    
         
            -
             
     | 
| 
      
 4582 
     | 
    
         
            +
                const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
         
     | 
| 
      
 4583 
     | 
    
         
            +
                const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
         
     | 
| 
      
 4584 
     | 
    
         
            +
                uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
         
     | 
| 
      
 4585 
     | 
    
         
            +
                grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
         
     | 
| 
      
 4586 
     | 
    
         
            +
                grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
         
     | 
| 
      
 4587 
     | 
    
         
            +
                grid32[0] &= 0x0f0f0f0f;
         
     | 
| 
      
 4588 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 4589 
     | 
    
         
            +
                for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 4590 
     | 
    
         
            +
                    y[j] = d * (q[j] + delta);
         
     | 
| 
      
 4591 
     | 
    
         
            +
                }
         
     | 
| 
      
 4592 
     | 
    
         
            +
            #else
         
     | 
| 
      
 4593 
     | 
    
         
            +
                assert(false);
         
     | 
| 
      
 4594 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 4595 
     | 
    
         
            +
             
     | 
| 
      
 4596 
     | 
    
         
            +
            }
         
     | 
| 
      
 4597 
     | 
    
         
            +
             
     | 
| 
      
 4598 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 4599 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 4600 
     | 
    
         
            +
            dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
         
     | 
| 
      
 4601 
     | 
    
         
            +
                                   const sycl::nd_item<3> &item_ct1,
         
     | 
| 
      
 4602 
     | 
    
         
            +
                                   const uint32_t *iq1s_grid_gpu) {
         
     | 
| 
      
 4603 
     | 
    
         
            +
             
     | 
| 
      
 4604 
     | 
    
         
            +
                const int i = item_ct1.get_group(2);
         
     | 
| 
      
 4605 
     | 
    
         
            +
                const block_iq1_m * x = (const block_iq1_m  *) vx;
         
     | 
| 
      
 4606 
     | 
    
         
            +
             
     | 
| 
      
 4607 
     | 
    
         
            +
                const int tid = item_ct1.get_local_id(2);
         
     | 
| 
      
 4608 
     | 
    
         
            +
            #if QK_K == 256
         
     | 
| 
      
 4609 
     | 
    
         
            +
                const int il = tid/8; // 0...3
         
     | 
| 
      
 4610 
     | 
    
         
            +
                const int ib = tid%8; // 0...7
         
     | 
| 
      
 4611 
     | 
    
         
            +
                dst_t * y = yy + i*QK_K + 32*ib + 8*il;
         
     | 
| 
      
 4612 
     | 
    
         
            +
                const uint16_t * sc = (const uint16_t *)x[i].scales;
         
     | 
| 
      
 4613 
     | 
    
         
            +
                iq1m_scale_t scale;
         
     | 
| 
      
 4614 
     | 
    
         
            +
                scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         
     | 
| 
      
 4615 
     | 
    
         
            +
                const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
         
     | 
| 
      
 4616 
     | 
    
         
            +
                const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
         
     | 
| 
      
 4617 
     | 
    
         
            +
                const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
         
     | 
| 
      
 4618 
     | 
    
         
            +
                uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
         
     | 
| 
      
 4619 
     | 
    
         
            +
                grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
         
     | 
| 
      
 4620 
     | 
    
         
            +
                grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
         
     | 
| 
      
 4621 
     | 
    
         
            +
                grid32[0] &= 0x0f0f0f0f;
         
     | 
| 
      
 4622 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 4623 
     | 
    
         
            +
                for (int j = 0; j < 8; ++j) {
         
     | 
| 
      
 4624 
     | 
    
         
            +
                    y[j] = d * (q[j] + delta);
         
     | 
| 
       4837 
4625 
     | 
    
         
             
                }
         
     | 
| 
       4838 
4626 
     | 
    
         
             
            #else
         
     | 
| 
       4839 
4627 
     | 
    
         
             
                assert(false);
         
     | 
| 
         @@ -4841,6 +4629,51 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr 
     | 
|
| 
       4841 
4629 
     | 
    
         | 
| 
       4842 
4630 
     | 
    
         
             
            }
         
     | 
| 
       4843 
4631 
     | 
    
         | 
| 
      
 4632 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 4633 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 4634 
     | 
    
         
            +
            dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
         
     | 
| 
      
 4635 
     | 
    
         
            +
                                    const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 4636 
     | 
    
         
            +
             
     | 
| 
      
 4637 
     | 
    
         
            +
                const int i = item_ct1.get_group(2);
         
     | 
| 
      
 4638 
     | 
    
         
            +
                const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
         
     | 
| 
      
 4639 
     | 
    
         
            +
             
     | 
| 
      
 4640 
     | 
    
         
            +
                const int tid = item_ct1.get_local_id(2);
         
     | 
| 
      
 4641 
     | 
    
         
            +
                const int il = tid/8; // 0...3
         
     | 
| 
      
 4642 
     | 
    
         
            +
                const int ib = tid%8; // 0...7
         
     | 
| 
      
 4643 
     | 
    
         
            +
                dst_t * y = yy + i*QK_K + 32*ib + 4*il;
         
     | 
| 
      
 4644 
     | 
    
         
            +
                const uint8_t  * q4 = x[ib].qs + 4*il;
         
     | 
| 
      
 4645 
     | 
    
         
            +
                const float d = (float)x[ib].d;
         
     | 
| 
      
 4646 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 4647 
     | 
    
         
            +
                for (int j = 0; j < 4; ++j) {
         
     | 
| 
      
 4648 
     | 
    
         
            +
                    y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
         
     | 
| 
      
 4649 
     | 
    
         
            +
                    y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
         
     | 
| 
      
 4650 
     | 
    
         
            +
                }
         
     | 
| 
      
 4651 
     | 
    
         
            +
             
     | 
| 
      
 4652 
     | 
    
         
            +
            }
         
     | 
| 
      
 4653 
     | 
    
         
            +
             
     | 
| 
      
 4654 
     | 
    
         
            +
             
     | 
| 
      
 4655 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 4656 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 4657 
     | 
    
         
            +
            dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
         
     | 
| 
      
 4658 
     | 
    
         
            +
                                    const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 4659 
     | 
    
         
            +
                const int i = item_ct1.get_group(2);
         
     | 
| 
      
 4660 
     | 
    
         
            +
                const block_iq4_xs * x = (const block_iq4_xs *)vx;
         
     | 
| 
      
 4661 
     | 
    
         
            +
             
     | 
| 
      
 4662 
     | 
    
         
            +
                const int tid = item_ct1.get_local_id(2);
         
     | 
| 
      
 4663 
     | 
    
         
            +
                const int il = tid/8; // 0...3
         
     | 
| 
      
 4664 
     | 
    
         
            +
                const int ib = tid%8; // 0...7
         
     | 
| 
      
 4665 
     | 
    
         
            +
                dst_t * y = yy + i*QK_K + 32*ib + 4*il;
         
     | 
| 
      
 4666 
     | 
    
         
            +
                const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
         
     | 
| 
      
 4667 
     | 
    
         
            +
                const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
         
     | 
| 
      
 4668 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 4669 
     | 
    
         
            +
                for (int j = 0; j < 4; ++j) {
         
     | 
| 
      
 4670 
     | 
    
         
            +
                    y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
         
     | 
| 
      
 4671 
     | 
    
         
            +
                    y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
         
     | 
| 
      
 4672 
     | 
    
         
            +
                }
         
     | 
| 
      
 4673 
     | 
    
         
            +
            }
         
     | 
| 
      
 4674 
     | 
    
         
            +
             
     | 
| 
      
 4675 
     | 
    
         
            +
             
     | 
| 
      
 4676 
     | 
    
         
            +
             
     | 
| 
       4844 
4677 
     | 
    
         
             
            /*
         
     | 
| 
       4845 
4678 
     | 
    
         
             
            DPCT1110:4: The total declared local variable size in device function
         
     | 
| 
       4846 
4679 
     | 
    
         
             
            dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
         
     | 
| 
         @@ -7647,6 +7480,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, 
     | 
|
| 
       7647 
7480 
     | 
    
         
             
            #endif
         
     | 
| 
       7648 
7481 
     | 
    
         
             
            }
         
     | 
| 
       7649 
7482 
     | 
    
         | 
| 
      
 7483 
     | 
    
         
            +
            static __dpct_inline__ float
         
     | 
| 
      
 7484 
     | 
    
         
            +
            vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
         
     | 
| 
      
 7485 
     | 
    
         
            +
                               const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
         
     | 
| 
      
 7486 
     | 
    
         
            +
            #if QK_K == 256
         
     | 
| 
      
 7487 
     | 
    
         
            +
                const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
         
     | 
| 
      
 7488 
     | 
    
         
            +
             
     | 
| 
      
 7489 
     | 
    
         
            +
                const int ib32 = iqs;
         
     | 
| 
      
 7490 
     | 
    
         
            +
                const int8_t  * q8 = bq8_1[ib32].qs;
         
     | 
| 
      
 7491 
     | 
    
         
            +
                const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
         
     | 
| 
      
 7492 
     | 
    
         
            +
                const uint8_t ls1 = bq2->scales[ib32] & 0xf;
         
     | 
| 
      
 7493 
     | 
    
         
            +
                const uint8_t ls2 = bq2->scales[ib32] >>  4;
         
     | 
| 
      
 7494 
     | 
    
         
            +
                int sumi1 = 0;
         
     | 
| 
      
 7495 
     | 
    
         
            +
                for (int l = 0; l < 2; ++l) {
         
     | 
| 
      
 7496 
     | 
    
         
            +
                    const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
         
     | 
| 
      
 7497 
     | 
    
         
            +
                    const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7498 
     | 
    
         
            +
                        ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
         
     | 
| 
      
 7499 
     | 
    
         
            +
                        std::equal_to<>());
         
     | 
| 
      
 7500 
     | 
    
         
            +
                    const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7501 
     | 
    
         
            +
                        ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
         
     | 
| 
      
 7502 
     | 
    
         
            +
                        std::equal_to<>());
         
     | 
| 
      
 7503 
     | 
    
         
            +
                    const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7504 
     | 
    
         
            +
                        grid[0] ^ signs0, signs0, std::minus<>());
         
     | 
| 
      
 7505 
     | 
    
         
            +
                    const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7506 
     | 
    
         
            +
                        grid[1] ^ signs1, signs1, std::minus<>());
         
     | 
| 
      
 7507 
     | 
    
         
            +
                    sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
         
     | 
| 
      
 7508 
     | 
    
         
            +
                    sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
         
     | 
| 
      
 7509 
     | 
    
         
            +
                    q8 += 8;
         
     | 
| 
      
 7510 
     | 
    
         
            +
                }
         
     | 
| 
      
 7511 
     | 
    
         
            +
                int sumi2 = 0;
         
     | 
| 
      
 7512 
     | 
    
         
            +
                for (int l = 2; l < 4; ++l) {
         
     | 
| 
      
 7513 
     | 
    
         
            +
                    const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
         
     | 
| 
      
 7514 
     | 
    
         
            +
                    const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7515 
     | 
    
         
            +
                        ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
         
     | 
| 
      
 7516 
     | 
    
         
            +
                        std::equal_to<>());
         
     | 
| 
      
 7517 
     | 
    
         
            +
                    const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7518 
     | 
    
         
            +
                        ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
         
     | 
| 
      
 7519 
     | 
    
         
            +
                        std::equal_to<>());
         
     | 
| 
      
 7520 
     | 
    
         
            +
                    const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7521 
     | 
    
         
            +
                        grid[0] ^ signs0, signs0, std::minus<>());
         
     | 
| 
      
 7522 
     | 
    
         
            +
                    const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
      
 7523 
     | 
    
         
            +
                        grid[1] ^ signs1, signs1, std::minus<>());
         
     | 
| 
      
 7524 
     | 
    
         
            +
                    sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
         
     | 
| 
      
 7525 
     | 
    
         
            +
                    sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
         
     | 
| 
      
 7526 
     | 
    
         
            +
                    q8 += 8;
         
     | 
| 
      
 7527 
     | 
    
         
            +
                }
         
     | 
| 
      
 7528 
     | 
    
         
            +
                const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
         
     | 
| 
      
 7529 
     | 
    
         
            +
                return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
         
     | 
| 
      
 7530 
     | 
    
         
            +
            #else
         
     | 
| 
      
 7531 
     | 
    
         
            +
                assert(false);
         
     | 
| 
      
 7532 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 7533 
     | 
    
         
            +
            }
         
     | 
| 
      
 7534 
     | 
    
         
            +
             
     | 
| 
       7650 
7535 
     | 
    
         
             
            static __dpct_inline__ float
         
     | 
| 
       7651 
7536 
     | 
    
         
             
            vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
         
     | 
| 
       7652 
7537 
     | 
    
         
             
                                 const block_q8_1 *__restrict__ bq8_1, const int &iqs,
         
     | 
| 
         @@ -7689,10 +7574,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, 
     | 
|
| 
       7689 
7574 
     | 
    
         | 
| 
       7690 
7575 
     | 
    
         
             
            static __dpct_inline__ float
         
     | 
| 
       7691 
7576 
     | 
    
         
             
            vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
         
     | 
| 
       7692 
     | 
    
         
            -
             
     | 
| 
       7693 
     | 
    
         
            -
             
     | 
| 
       7694 
     | 
    
         
            -
            #if DPCT_COMPATIBILITY_TEMP >=                                                 \
         
     | 
| 
       7695 
     | 
    
         
            -
                MIN_CC_DP4A // lowest compute capability for integer intrinsics
         
     | 
| 
      
 7577 
     | 
    
         
            +
                               const block_q8_1 *__restrict__ bq8_1, const int &iqs,
         
     | 
| 
      
 7578 
     | 
    
         
            +
                               const uint32_t *iq3s_grid) {
         
     | 
| 
       7696 
7579 
     | 
    
         
             
            #if QK_K == 256
         
     | 
| 
       7697 
7580 
     | 
    
         
             
                const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
         
     | 
| 
       7698 
7581 
     | 
    
         | 
| 
         @@ -7704,9 +7587,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, 
     | 
|
| 
       7704 
7587 
     | 
    
         
             
                    const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
         
     | 
| 
       7705 
7588 
     | 
    
         
             
                    const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
         
     | 
| 
       7706 
7589 
     | 
    
         
             
                    uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
       7707 
     | 
    
         
            -
                        ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 
     | 
| 
      
 7590 
     | 
    
         
            +
                        ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
         
     | 
| 
      
 7591 
     | 
    
         
            +
                        0x08040201, std::equal_to<>());
         
     | 
| 
       7708 
7592 
     | 
    
         
             
                    uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
       7709 
     | 
    
         
            -
                        ((bq2->signs[4*ib32+l] >> 
     | 
| 
      
 7593 
     | 
    
         
            +
                        ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
         
     | 
| 
      
 7594 
     | 
    
         
            +
                        0x08040201, std::equal_to<>());
         
     | 
| 
       7710 
7595 
     | 
    
         
             
                    const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
       7711 
7596 
     | 
    
         
             
                        grid1[0] ^ signs0, signs0, std::minus<>());
         
     | 
| 
       7712 
7597 
     | 
    
         
             
                    const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
         
     | 
| 
         @@ -7715,45 +7600,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, 
     | 
|
| 
       7715 
7600 
     | 
    
         
             
                    sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
         
     | 
| 
       7716 
7601 
     | 
    
         
             
                    q8 += 8;
         
     | 
| 
       7717 
7602 
     | 
    
         
             
                }
         
     | 
| 
       7718 
     | 
    
         
            -
                const float d = 
     | 
| 
      
 7603 
     | 
    
         
            +
                const float d =
         
     | 
| 
      
 7604 
     | 
    
         
            +
                    (float)bq2->d *
         
     | 
| 
      
 7605 
     | 
    
         
            +
                    (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
         
     | 
| 
      
 7606 
     | 
    
         
            +
                    bq8_1[ib32].ds[0];
         
     | 
| 
       7719 
7607 
     | 
    
         
             
                return d * sumi;
         
     | 
| 
       7720 
7608 
     | 
    
         
             
            #else
         
     | 
| 
       7721 
7609 
     | 
    
         
             
                assert(false);
         
     | 
| 
       7722 
     | 
    
         
            -
                return 0.f;
         
     | 
| 
       7723 
     | 
    
         
            -
            #endif
         
     | 
| 
       7724 
     | 
    
         
            -
            #else
         
     | 
| 
       7725 
     | 
    
         
            -
                assert(false);
         
     | 
| 
       7726 
     | 
    
         
            -
                return 0.f;
         
     | 
| 
       7727 
7610 
     | 
    
         
             
            #endif
         
     | 
| 
       7728 
7611 
     | 
    
         
             
            }
         
     | 
| 
       7729 
7612 
     | 
    
         | 
| 
       7730 
7613 
     | 
    
         
             
            static __dpct_inline__ float
         
     | 
| 
       7731 
7614 
     | 
    
         
             
            vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
         
     | 
| 
       7732 
     | 
    
         
            -
             
     | 
| 
       7733 
     | 
    
         
            -
             
     | 
| 
      
 7615 
     | 
    
         
            +
                               const block_q8_1 *__restrict__ bq8_1, const int &iqs,
         
     | 
| 
      
 7616 
     | 
    
         
            +
                               const uint32_t *iq1s_grid_gpu) {
         
     | 
| 
       7734 
7617 
     | 
    
         
             
            #if QK_K == 256
         
     | 
| 
       7735 
7618 
     | 
    
         
             
                const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
         
     | 
| 
       7736 
7619 
     | 
    
         | 
| 
       7737 
7620 
     | 
    
         
             
                const int ib32 = iqs;
         
     | 
| 
       7738 
     | 
    
         
            -
                const uint8_t  * qs = bq1->qs + 4*ib32;
         
     | 
| 
       7739 
     | 
    
         
            -
                const int8_t   * q8 = bq8_1[ib32].qs;
         
     | 
| 
       7740 
7621 
     | 
    
         
             
                int sumi = 0;
         
     | 
| 
      
 7622 
     | 
    
         
            +
                const int * q8 = (const int *)bq8_1[ib32].qs;
         
     | 
| 
       7741 
7623 
     | 
    
         
             
                for (int l = 0; l < 4; ++l) {
         
     | 
| 
       7742 
     | 
    
         
            -
                    const  
     | 
| 
       7743 
     | 
    
         
            -
                     
     | 
| 
       7744 
     | 
    
         
            -
                     
     | 
| 
       7745 
     | 
    
         
            -
             
     | 
| 
       7746 
     | 
    
         
            -
             
     | 
| 
       7747 
     | 
    
         
            -
             
     | 
| 
       7748 
     | 
    
         
            -
             
     | 
| 
       7749 
     | 
    
         
            -
             
     | 
| 
       7750 
     | 
    
         
            -
             
     | 
| 
      
 7624 
     | 
    
         
            +
                    const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
         
     | 
| 
      
 7625 
     | 
    
         
            +
                    int grid0 = grid[0] & 0x0f0f0f0f;
         
     | 
| 
      
 7626 
     | 
    
         
            +
                    int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
         
     | 
| 
      
 7627 
     | 
    
         
            +
                    sumi = dpct::dp4a(q8[2 * l + 1], grid1,
         
     | 
| 
      
 7628 
     | 
    
         
            +
                                      dpct::dp4a(q8[2 * l + 0], grid0, sumi));
         
     | 
| 
      
 7629 
     | 
    
         
            +
                }
         
     | 
| 
      
 7630 
     | 
    
         
            +
             
     | 
| 
      
 7631 
     | 
    
         
            +
                const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
         
     | 
| 
      
 7632 
     | 
    
         
            +
                const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
         
     | 
| 
      
 7633 
     | 
    
         
            +
                const float d = d1q * bq8_1[ib32].ds[0];
         
     | 
| 
      
 7634 
     | 
    
         
            +
                const float m = d1q * bq8_1[ib32].ds[1];
         
     | 
| 
      
 7635 
     | 
    
         
            +
                return d * sumi + m * delta;
         
     | 
| 
      
 7636 
     | 
    
         
            +
            #else
         
     | 
| 
      
 7637 
     | 
    
         
            +
                assert(false);
         
     | 
| 
      
 7638 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 7639 
     | 
    
         
            +
            }
         
     | 
| 
      
 7640 
     | 
    
         
            +
             
     | 
| 
      
 7641 
     | 
    
         
            +
            static __dpct_inline__ float
         
     | 
| 
      
 7642 
     | 
    
         
            +
            vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
         
     | 
| 
      
 7643 
     | 
    
         
            +
                               const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
         
     | 
| 
      
 7644 
     | 
    
         
            +
            #if QK_K == 256
         
     | 
| 
      
 7645 
     | 
    
         
            +
                const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
         
     | 
| 
      
 7646 
     | 
    
         
            +
             
     | 
| 
      
 7647 
     | 
    
         
            +
                const int ib32 = iqs;
         
     | 
| 
      
 7648 
     | 
    
         
            +
                int   sumi[2] = {0, 0};
         
     | 
| 
      
 7649 
     | 
    
         
            +
                float sumf[2] = {0.f, 0.f};
         
     | 
| 
      
 7650 
     | 
    
         
            +
             
     | 
| 
      
 7651 
     | 
    
         
            +
                const int * q8 = (const int *)bq8_1[ib32].qs;
         
     | 
| 
      
 7652 
     | 
    
         
            +
                for (int l = 0; l < 4; ++l) {
         
     | 
| 
      
 7653 
     | 
    
         
            +
                    const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
         
     | 
| 
      
 7654 
     | 
    
         
            +
                    int grid0 = grid[0] & 0x0f0f0f0f;
         
     | 
| 
      
 7655 
     | 
    
         
            +
                    int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
         
     | 
| 
      
 7656 
     | 
    
         
            +
                    sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
         
     | 
| 
      
 7657 
     | 
    
         
            +
                                             dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
         
     | 
| 
      
 7658 
     | 
    
         
            +
                    const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
         
     | 
| 
      
 7659 
     | 
    
         
            +
                    const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
         
     | 
| 
      
 7660 
     | 
    
         
            +
                                                dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
         
     | 
| 
      
 7661 
     | 
    
         
            +
                    sumf[l/2] += delta*sumy;
         
     | 
| 
      
 7662 
     | 
    
         
            +
                }
         
     | 
| 
      
 7663 
     | 
    
         
            +
             
     | 
| 
      
 7664 
     | 
    
         
            +
                iq1m_scale_t scale;
         
     | 
| 
      
 7665 
     | 
    
         
            +
                const uint16_t * sc = (const uint16_t *)bq1->scales;
         
     | 
| 
      
 7666 
     | 
    
         
            +
                scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
         
     | 
| 
      
 7667 
     | 
    
         
            +
                const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
         
     | 
| 
      
 7668 
     | 
    
         
            +
                return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
         
     | 
| 
      
 7669 
     | 
    
         
            +
            #else
         
     | 
| 
      
 7670 
     | 
    
         
            +
                assert(false);
         
     | 
| 
      
 7671 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 7672 
     | 
    
         
            +
            }
         
     | 
| 
      
 7673 
     | 
    
         
            +
             
     | 
| 
      
 7674 
     | 
    
         
            +
            static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
         
     | 
| 
      
 7675 
     | 
    
         
            +
                                                              const uint8_t *values,
         
     | 
| 
      
 7676 
     | 
    
         
            +
                                                              int &val1, int &val2) {
         
     | 
| 
      
 7677 
     | 
    
         
            +
             
     | 
| 
      
 7678 
     | 
    
         
            +
                uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
         
     | 
| 
      
 7679 
     | 
    
         
            +
                aux32 = q4 & 0x0f0f0f0f;
         
     | 
| 
      
 7680 
     | 
    
         
            +
                uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
         
     | 
| 
      
 7681 
     | 
    
         
            +
                uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
         
     | 
| 
      
 7682 
     | 
    
         
            +
                val1 = v1 | (v2 << 16);
         
     | 
| 
      
 7683 
     | 
    
         
            +
                aux32 = (q4 >> 4) & 0x0f0f0f0f;
         
     | 
| 
      
 7684 
     | 
    
         
            +
                v1 = values[q8[0]] | (values[q8[1]] << 8);
         
     | 
| 
      
 7685 
     | 
    
         
            +
                v2 = values[q8[2]] | (values[q8[3]] << 8);
         
     | 
| 
      
 7686 
     | 
    
         
            +
                val2 = v1 | (v2 << 16);
         
     | 
| 
      
 7687 
     | 
    
         
            +
            }
         
     | 
| 
      
 7688 
     | 
    
         
            +
             
     | 
| 
      
 7689 
     | 
    
         
            +
             
     | 
| 
      
 7690 
     | 
    
         
            +
            static __dpct_inline__ float
         
     | 
| 
      
 7691 
     | 
    
         
            +
            vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
         
     | 
| 
      
 7692 
     | 
    
         
            +
                                const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
         
     | 
| 
      
 7693 
     | 
    
         
            +
             
     | 
| 
      
 7694 
     | 
    
         
            +
                const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
         
     | 
| 
      
 7695 
     | 
    
         
            +
             
     | 
| 
      
 7696 
     | 
    
         
            +
                const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
         
     | 
| 
      
 7697 
     | 
    
         
            +
                const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
         
     | 
| 
      
 7698 
     | 
    
         
            +
             
     | 
| 
      
 7699 
     | 
    
         
            +
                const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
         
     | 
| 
      
 7700 
     | 
    
         
            +
             
     | 
| 
      
 7701 
     | 
    
         
            +
                int v1, v2;
         
     | 
| 
      
 7702 
     | 
    
         
            +
                int sumi1 = 0, sumi2 = 0;
         
     | 
| 
      
 7703 
     | 
    
         
            +
                for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
         
     | 
| 
      
 7704 
     | 
    
         
            +
                    const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
         
     | 
| 
      
 7705 
     | 
    
         
            +
                    get_int_from_table_16(aux, values, v1, v2);
         
     | 
| 
      
 7706 
     | 
    
         
            +
                    sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
         
     | 
| 
      
 7707 
     | 
    
         
            +
                    sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
         
     | 
| 
       7751 
7708 
     | 
    
         
             
                }
         
     | 
| 
       7752 
     | 
    
         
            -
             
     | 
| 
       7753 
     | 
    
         
            -
                 
     | 
| 
      
 7709 
     | 
    
         
            +
             
     | 
| 
      
 7710 
     | 
    
         
            +
                const float d = (float)bq->d * bq8_1->ds[0];
         
     | 
| 
      
 7711 
     | 
    
         
            +
                return d * (sumi1 + sumi2);
         
     | 
| 
      
 7712 
     | 
    
         
            +
            }
         
     | 
| 
      
 7713 
     | 
    
         
            +
             
     | 
| 
      
 7714 
     | 
    
         
            +
             
     | 
| 
      
 7715 
     | 
    
         
            +
            static __dpct_inline__ float
         
     | 
| 
      
 7716 
     | 
    
         
            +
            vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
         
     | 
| 
      
 7717 
     | 
    
         
            +
                                const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
         
     | 
| 
      
 7718 
     | 
    
         
            +
             
     | 
| 
      
 7719 
     | 
    
         
            +
            #if QK_K == 256
         
     | 
| 
      
 7720 
     | 
    
         
            +
                const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
         
     | 
| 
      
 7721 
     | 
    
         
            +
                const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
         
     | 
| 
      
 7722 
     | 
    
         
            +
             
     | 
| 
      
 7723 
     | 
    
         
            +
                // iqs is 0...7
         
     | 
| 
      
 7724 
     | 
    
         
            +
                const int ib32 = iqs;
         
     | 
| 
      
 7725 
     | 
    
         
            +
                const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
         
     | 
| 
      
 7726 
     | 
    
         
            +
                const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
         
     | 
| 
      
 7727 
     | 
    
         
            +
                const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
         
     | 
| 
      
 7728 
     | 
    
         
            +
                const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
         
     | 
| 
      
 7729 
     | 
    
         
            +
                int v1, v2;
         
     | 
| 
      
 7730 
     | 
    
         
            +
                int sumi1 = 0, sumi2 = 0;
         
     | 
| 
      
 7731 
     | 
    
         
            +
                for (int j = 0; j < 4; ++j) {
         
     | 
| 
      
 7732 
     | 
    
         
            +
                    get_int_from_table_16(q4[j], values, v1, v2);
         
     | 
| 
      
 7733 
     | 
    
         
            +
                    sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
         
     | 
| 
      
 7734 
     | 
    
         
            +
                    sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
         
     | 
| 
      
 7735 
     | 
    
         
            +
                }
         
     | 
| 
      
 7736 
     | 
    
         
            +
                return d * (sumi1 + sumi2);
         
     | 
| 
       7754 
7737 
     | 
    
         
             
            #else
         
     | 
| 
       7755 
7738 
     | 
    
         
             
                assert(false);
         
     | 
| 
       7756 
     | 
    
         
            -
                return 0.f;
         
     | 
| 
       7757 
7739 
     | 
    
         
             
            #endif
         
     | 
| 
       7758 
7740 
     | 
    
         
             
            }
         
     | 
| 
       7759 
7741 
     | 
    
         | 
| 
         @@ -8338,8 +8320,7 @@ template <bool need_check> static void 
     | 
|
| 
       8338 
8320 
     | 
    
         | 
| 
       8339 
8321 
     | 
    
         
             
            template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
         
     | 
| 
       8340 
8322 
     | 
    
         
             
            static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
         
     | 
| 
       8341 
     | 
    
         
            -
                                      const sycl::nd_item<3> &item_ct1 
     | 
| 
       8342 
     | 
    
         
            -
                                      const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
         
     | 
| 
      
 8323 
     | 
    
         
            +
                                      const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       8343 
8324 
     | 
    
         
             
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
       8344 
8325 
     | 
    
         
             
                                item_ct1.get_local_id(1);
         
     | 
| 
       8345 
8326 
     | 
    
         | 
| 
         @@ -8383,10 +8364,203 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ 
     | 
|
| 
       8383 
8364 
     | 
    
         
             
            }
         
     | 
| 
       8384 
8365 
     | 
    
         | 
| 
       8385 
8366 
     | 
    
         
             
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
       8386 
     | 
    
         
            -
            static void mul_mat_vec_q_iq2_xxs_q8_1(const void * 
     | 
| 
       8387 
     | 
    
         
            -
             
     | 
| 
       8388 
     | 
    
         
            -
             
     | 
| 
       8389 
     | 
    
         
            -
             
     | 
| 
      
 8367 
     | 
    
         
            +
            static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8368 
     | 
    
         
            +
                                                   const void *__restrict__ vy,
         
     | 
| 
      
 8369 
     | 
    
         
            +
                                                   float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8370 
     | 
    
         
            +
                                                   const int nrows,
         
     | 
| 
      
 8371 
     | 
    
         
            +
                                                   const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 8372 
     | 
    
         
            +
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
      
 8373 
     | 
    
         
            +
                                item_ct1.get_local_id(1);
         
     | 
| 
      
 8374 
     | 
    
         
            +
             
     | 
| 
      
 8375 
     | 
    
         
            +
                if (row >= nrows) {
         
     | 
| 
      
 8376 
     | 
    
         
            +
                    return;
         
     | 
| 
      
 8377 
     | 
    
         
            +
                }
         
     | 
| 
      
 8378 
     | 
    
         
            +
             
     | 
| 
      
 8379 
     | 
    
         
            +
                const int blocks_per_row = ncols / qk;
         
     | 
| 
      
 8380 
     | 
    
         
            +
                const int blocks_per_warp = vdr * WARP_SIZE / qi;
         
     | 
| 
      
 8381 
     | 
    
         
            +
             
     | 
| 
      
 8382 
     | 
    
         
            +
            // partial sum for each thread
         
     | 
| 
      
 8383 
     | 
    
         
            +
                float tmp = 0.0f;
         
     | 
| 
      
 8384 
     | 
    
         
            +
             
     | 
| 
      
 8385 
     | 
    
         
            +
                const block_q_t  * x = (const block_q_t  *) vx;
         
     | 
| 
      
 8386 
     | 
    
         
            +
                const block_q8_1 * y = (const block_q8_1 *) vy;
         
     | 
| 
      
 8387 
     | 
    
         
            +
             
     | 
| 
      
 8388 
     | 
    
         
            +
                for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         
     | 
| 
      
 8389 
     | 
    
         
            +
                     i += blocks_per_warp) {
         
     | 
| 
      
 8390 
     | 
    
         
            +
                    const int ibx = row*blocks_per_row + i; // x block index
         
     | 
| 
      
 8391 
     | 
    
         
            +
             
     | 
| 
      
 8392 
     | 
    
         
            +
                    const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
         
     | 
| 
      
 8393 
     | 
    
         
            +
             
     | 
| 
      
 8394 
     | 
    
         
            +
                    const int iqs =
         
     | 
| 
      
 8395 
     | 
    
         
            +
                        vdr *
         
     | 
| 
      
 8396 
     | 
    
         
            +
                        (item_ct1.get_local_id(2) %
         
     | 
| 
      
 8397 
     | 
    
         
            +
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
      
 8398 
     | 
    
         
            +
             
     | 
| 
      
 8399 
     | 
    
         
            +
                    tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
         
     | 
| 
      
 8400 
     | 
    
         
            +
                }
         
     | 
| 
      
 8401 
     | 
    
         
            +
             
     | 
| 
      
 8402 
     | 
    
         
            +
                // sum up partial sums and write back result
         
     | 
| 
      
 8403 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 8404 
     | 
    
         
            +
                for (int mask = 16; mask > 0; mask >>= 1) {
         
     | 
| 
      
 8405 
     | 
    
         
            +
                    tmp +=
         
     | 
| 
      
 8406 
     | 
    
         
            +
                        dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
         
     | 
| 
      
 8407 
     | 
    
         
            +
                }
         
     | 
| 
      
 8408 
     | 
    
         
            +
             
     | 
| 
      
 8409 
     | 
    
         
            +
                if (item_ct1.get_local_id(2) == 0) {
         
     | 
| 
      
 8410 
     | 
    
         
            +
                    dst[row] = tmp;
         
     | 
| 
      
 8411 
     | 
    
         
            +
                }
         
     | 
| 
      
 8412 
     | 
    
         
            +
            }
         
     | 
| 
      
 8413 
     | 
    
         
            +
             
     | 
| 
      
 8414 
     | 
    
         
            +
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
      
 8415 
     | 
    
         
            +
            static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8416 
     | 
    
         
            +
                                                  const void *__restrict__ vy,
         
     | 
| 
      
 8417 
     | 
    
         
            +
                                                  float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8418 
     | 
    
         
            +
                                                  const int nrows,
         
     | 
| 
      
 8419 
     | 
    
         
            +
                                                  const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 8420 
     | 
    
         
            +
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
      
 8421 
     | 
    
         
            +
                                item_ct1.get_local_id(1);
         
     | 
| 
      
 8422 
     | 
    
         
            +
             
     | 
| 
      
 8423 
     | 
    
         
            +
                if (row >= nrows) {
         
     | 
| 
      
 8424 
     | 
    
         
            +
                    return;
         
     | 
| 
      
 8425 
     | 
    
         
            +
                }
         
     | 
| 
      
 8426 
     | 
    
         
            +
             
     | 
| 
      
 8427 
     | 
    
         
            +
                const int blocks_per_row = ncols / qk;
         
     | 
| 
      
 8428 
     | 
    
         
            +
                const int blocks_per_warp = vdr * WARP_SIZE / qi;
         
     | 
| 
      
 8429 
     | 
    
         
            +
             
     | 
| 
      
 8430 
     | 
    
         
            +
            // partial sum for each thread
         
     | 
| 
      
 8431 
     | 
    
         
            +
                float tmp = 0.0f;
         
     | 
| 
      
 8432 
     | 
    
         
            +
             
     | 
| 
      
 8433 
     | 
    
         
            +
                const block_q_t  * x = (const block_q_t  *) vx;
         
     | 
| 
      
 8434 
     | 
    
         
            +
                const block_q8_1 * y = (const block_q8_1 *) vy;
         
     | 
| 
      
 8435 
     | 
    
         
            +
             
     | 
| 
      
 8436 
     | 
    
         
            +
                for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         
     | 
| 
      
 8437 
     | 
    
         
            +
                     i += blocks_per_warp) {
         
     | 
| 
      
 8438 
     | 
    
         
            +
                    const int ibx = row*blocks_per_row + i; // x block index
         
     | 
| 
      
 8439 
     | 
    
         
            +
             
     | 
| 
      
 8440 
     | 
    
         
            +
                    const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
         
     | 
| 
      
 8441 
     | 
    
         
            +
             
     | 
| 
      
 8442 
     | 
    
         
            +
                    const int iqs =
         
     | 
| 
      
 8443 
     | 
    
         
            +
                        vdr *
         
     | 
| 
      
 8444 
     | 
    
         
            +
                        (item_ct1.get_local_id(2) %
         
     | 
| 
      
 8445 
     | 
    
         
            +
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
      
 8446 
     | 
    
         
            +
             
     | 
| 
      
 8447 
     | 
    
         
            +
                    tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64);
         
     | 
| 
      
 8448 
     | 
    
         
            +
                }
         
     | 
| 
      
 8449 
     | 
    
         
            +
             
     | 
| 
      
 8450 
     | 
    
         
            +
                // sum up partial sums and write back result
         
     | 
| 
      
 8451 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 8452 
     | 
    
         
            +
                for (int mask = 16; mask > 0; mask >>= 1) {
         
     | 
| 
      
 8453 
     | 
    
         
            +
                    tmp +=
         
     | 
| 
      
 8454 
     | 
    
         
            +
                        dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
         
     | 
| 
      
 8455 
     | 
    
         
            +
                }
         
     | 
| 
      
 8456 
     | 
    
         
            +
             
     | 
| 
      
 8457 
     | 
    
         
            +
                if (item_ct1.get_local_id(2) == 0) {
         
     | 
| 
      
 8458 
     | 
    
         
            +
                    dst[row] = tmp;
         
     | 
| 
      
 8459 
     | 
    
         
            +
                }
         
     | 
| 
      
 8460 
     | 
    
         
            +
            }
         
     | 
| 
      
 8461 
     | 
    
         
            +
             
     | 
| 
      
 8462 
     | 
    
         
            +
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
      
 8463 
     | 
    
         
            +
            static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8464 
     | 
    
         
            +
                                                 const void *__restrict__ vy,
         
     | 
| 
      
 8465 
     | 
    
         
            +
                                                 float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8466 
     | 
    
         
            +
                                                 const int nrows,
         
     | 
| 
      
 8467 
     | 
    
         
            +
                                                 const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 8468 
     | 
    
         
            +
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
      
 8469 
     | 
    
         
            +
                                item_ct1.get_local_id(1);
         
     | 
| 
      
 8470 
     | 
    
         
            +
             
     | 
| 
      
 8471 
     | 
    
         
            +
                if (row >= nrows) {
         
     | 
| 
      
 8472 
     | 
    
         
            +
                    return;
         
     | 
| 
      
 8473 
     | 
    
         
            +
                }
         
     | 
| 
      
 8474 
     | 
    
         
            +
             
     | 
| 
      
 8475 
     | 
    
         
            +
                const int blocks_per_row = ncols / qk;
         
     | 
| 
      
 8476 
     | 
    
         
            +
                const int blocks_per_warp = vdr * WARP_SIZE / qi;
         
     | 
| 
      
 8477 
     | 
    
         
            +
             
     | 
| 
      
 8478 
     | 
    
         
            +
            // partial sum for each thread
         
     | 
| 
      
 8479 
     | 
    
         
            +
                float tmp = 0.0f;
         
     | 
| 
      
 8480 
     | 
    
         
            +
             
     | 
| 
      
 8481 
     | 
    
         
            +
                const block_q_t  * x = (const block_q_t  *) vx;
         
     | 
| 
      
 8482 
     | 
    
         
            +
                const block_q8_1 * y = (const block_q8_1 *) vy;
         
     | 
| 
      
 8483 
     | 
    
         
            +
             
     | 
| 
      
 8484 
     | 
    
         
            +
                for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         
     | 
| 
      
 8485 
     | 
    
         
            +
                     i += blocks_per_warp) {
         
     | 
| 
      
 8486 
     | 
    
         
            +
                    const int ibx = row*blocks_per_row + i; // x block index
         
     | 
| 
      
 8487 
     | 
    
         
            +
             
     | 
| 
      
 8488 
     | 
    
         
            +
                    const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
         
     | 
| 
      
 8489 
     | 
    
         
            +
             
     | 
| 
      
 8490 
     | 
    
         
            +
                    const int iqs =
         
     | 
| 
      
 8491 
     | 
    
         
            +
                        vdr *
         
     | 
| 
      
 8492 
     | 
    
         
            +
                        (item_ct1.get_local_id(2) %
         
     | 
| 
      
 8493 
     | 
    
         
            +
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
      
 8494 
     | 
    
         
            +
             
     | 
| 
      
 8495 
     | 
    
         
            +
                    tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs);
         
     | 
| 
      
 8496 
     | 
    
         
            +
                }
         
     | 
| 
      
 8497 
     | 
    
         
            +
             
     | 
| 
      
 8498 
     | 
    
         
            +
                // sum up partial sums and write back result
         
     | 
| 
      
 8499 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 8500 
     | 
    
         
            +
                for (int mask = 16; mask > 0; mask >>= 1) {
         
     | 
| 
      
 8501 
     | 
    
         
            +
                    tmp +=
         
     | 
| 
      
 8502 
     | 
    
         
            +
                        dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
         
     | 
| 
      
 8503 
     | 
    
         
            +
                }
         
     | 
| 
      
 8504 
     | 
    
         
            +
             
     | 
| 
      
 8505 
     | 
    
         
            +
                if (item_ct1.get_local_id(2) == 0) {
         
     | 
| 
      
 8506 
     | 
    
         
            +
                    dst[row] = tmp;
         
     | 
| 
      
 8507 
     | 
    
         
            +
                }
         
     | 
| 
      
 8508 
     | 
    
         
            +
            }
         
     | 
| 
      
 8509 
     | 
    
         
            +
             
     | 
| 
      
 8510 
     | 
    
         
            +
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
      
 8511 
     | 
    
         
            +
            static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8512 
     | 
    
         
            +
                                                   const void *__restrict__ vy,
         
     | 
| 
      
 8513 
     | 
    
         
            +
                                                   float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8514 
     | 
    
         
            +
                                                   const int nrows,
         
     | 
| 
      
 8515 
     | 
    
         
            +
                                                   const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
      
 8516 
     | 
    
         
            +
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
      
 8517 
     | 
    
         
            +
                                item_ct1.get_local_id(1);
         
     | 
| 
      
 8518 
     | 
    
         
            +
             
     | 
| 
      
 8519 
     | 
    
         
            +
                if (row >= nrows) {
         
     | 
| 
      
 8520 
     | 
    
         
            +
                    return;
         
     | 
| 
      
 8521 
     | 
    
         
            +
                }
         
     | 
| 
      
 8522 
     | 
    
         
            +
             
     | 
| 
      
 8523 
     | 
    
         
            +
                const int blocks_per_row = ncols / qk;
         
     | 
| 
      
 8524 
     | 
    
         
            +
                const int blocks_per_warp = vdr * WARP_SIZE / qi;
         
     | 
| 
      
 8525 
     | 
    
         
            +
             
     | 
| 
      
 8526 
     | 
    
         
            +
            // partial sum for each thread
         
     | 
| 
      
 8527 
     | 
    
         
            +
                float tmp = 0.0f;
         
     | 
| 
      
 8528 
     | 
    
         
            +
             
     | 
| 
      
 8529 
     | 
    
         
            +
                const block_q_t  * x = (const block_q_t  *) vx;
         
     | 
| 
      
 8530 
     | 
    
         
            +
                const block_q8_1 * y = (const block_q8_1 *) vy;
         
     | 
| 
      
 8531 
     | 
    
         
            +
             
     | 
| 
      
 8532 
     | 
    
         
            +
                for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         
     | 
| 
      
 8533 
     | 
    
         
            +
                     i += blocks_per_warp) {
         
     | 
| 
      
 8534 
     | 
    
         
            +
                    const int ibx = row*blocks_per_row + i; // x block index
         
     | 
| 
      
 8535 
     | 
    
         
            +
             
     | 
| 
      
 8536 
     | 
    
         
            +
                    const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
         
     | 
| 
      
 8537 
     | 
    
         
            +
             
     | 
| 
      
 8538 
     | 
    
         
            +
                    const int iqs =
         
     | 
| 
      
 8539 
     | 
    
         
            +
                        vdr *
         
     | 
| 
      
 8540 
     | 
    
         
            +
                        (item_ct1.get_local_id(2) %
         
     | 
| 
      
 8541 
     | 
    
         
            +
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
      
 8542 
     | 
    
         
            +
             
     | 
| 
      
 8543 
     | 
    
         
            +
                    tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64);
         
     | 
| 
      
 8544 
     | 
    
         
            +
                }
         
     | 
| 
      
 8545 
     | 
    
         
            +
             
     | 
| 
      
 8546 
     | 
    
         
            +
                // sum up partial sums and write back result
         
     | 
| 
      
 8547 
     | 
    
         
            +
            #pragma unroll
         
     | 
| 
      
 8548 
     | 
    
         
            +
                for (int mask = 16; mask > 0; mask >>= 1) {
         
     | 
| 
      
 8549 
     | 
    
         
            +
                    tmp +=
         
     | 
| 
      
 8550 
     | 
    
         
            +
                        dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
         
     | 
| 
      
 8551 
     | 
    
         
            +
                }
         
     | 
| 
      
 8552 
     | 
    
         
            +
             
     | 
| 
      
 8553 
     | 
    
         
            +
                if (item_ct1.get_local_id(2) == 0) {
         
     | 
| 
      
 8554 
     | 
    
         
            +
                    dst[row] = tmp;
         
     | 
| 
      
 8555 
     | 
    
         
            +
                }
         
     | 
| 
      
 8556 
     | 
    
         
            +
            }
         
     | 
| 
      
 8557 
     | 
    
         
            +
             
     | 
| 
      
 8558 
     | 
    
         
            +
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
      
 8559 
     | 
    
         
            +
            static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8560 
     | 
    
         
            +
                                                 const void *__restrict__ vy,
         
     | 
| 
      
 8561 
     | 
    
         
            +
                                                 float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8562 
     | 
    
         
            +
                                                 const int nrows,
         
     | 
| 
      
 8563 
     | 
    
         
            +
                                                 const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       8390 
8564 
     | 
    
         
             
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
       8391 
8565 
     | 
    
         
             
                                item_ct1.get_local_id(1);
         
     | 
| 
       8392 
8566 
     | 
    
         | 
| 
         @@ -8414,7 +8588,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void 
     | 
|
| 
       8414 
8588 
     | 
    
         
             
                        (item_ct1.get_local_id(2) %
         
     | 
| 
       8415 
8589 
     | 
    
         
             
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
       8416 
8590 
     | 
    
         | 
| 
       8417 
     | 
    
         
            -
                    tmp +=  
     | 
| 
      
 8591 
     | 
    
         
            +
                    tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid);
         
     | 
| 
       8418 
8592 
     | 
    
         
             
                }
         
     | 
| 
       8419 
8593 
     | 
    
         | 
| 
       8420 
8594 
     | 
    
         
             
                // sum up partial sums and write back result
         
     | 
| 
         @@ -8430,9 +8604,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void 
     | 
|
| 
       8430 
8604 
     | 
    
         
             
            }
         
     | 
| 
       8431 
8605 
     | 
    
         | 
| 
       8432 
8606 
     | 
    
         
             
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
       8433 
     | 
    
         
            -
            static void  
     | 
| 
       8434 
     | 
    
         
            -
             
     | 
| 
       8435 
     | 
    
         
            -
             
     | 
| 
      
 8607 
     | 
    
         
            +
            static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8608 
     | 
    
         
            +
                                                 const void *__restrict__ vy,
         
     | 
| 
      
 8609 
     | 
    
         
            +
                                                 float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8610 
     | 
    
         
            +
                                                 const int nrows,
         
     | 
| 
      
 8611 
     | 
    
         
            +
                                                 const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       8436 
8612 
     | 
    
         
             
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
       8437 
8613 
     | 
    
         
             
                                item_ct1.get_local_id(1);
         
     | 
| 
       8438 
8614 
     | 
    
         | 
| 
         @@ -8460,7 +8636,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * 
     | 
|
| 
       8460 
8636 
     | 
    
         
             
                        (item_ct1.get_local_id(2) %
         
     | 
| 
       8461 
8637 
     | 
    
         
             
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
       8462 
8638 
     | 
    
         | 
| 
       8463 
     | 
    
         
            -
                    tmp +=  
     | 
| 
      
 8639 
     | 
    
         
            +
                    tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu);
         
     | 
| 
       8464 
8640 
     | 
    
         
             
                }
         
     | 
| 
       8465 
8641 
     | 
    
         | 
| 
       8466 
8642 
     | 
    
         
             
                // sum up partial sums and write back result
         
     | 
| 
         @@ -8476,9 +8652,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * 
     | 
|
| 
       8476 
8652 
     | 
    
         
             
            }
         
     | 
| 
       8477 
8653 
     | 
    
         | 
| 
       8478 
8654 
     | 
    
         
             
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
       8479 
     | 
    
         
            -
            static void  
     | 
| 
       8480 
     | 
    
         
            -
             
     | 
| 
       8481 
     | 
    
         
            -
             
     | 
| 
      
 8655 
     | 
    
         
            +
            static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8656 
     | 
    
         
            +
                                                 const void *__restrict__ vy,
         
     | 
| 
      
 8657 
     | 
    
         
            +
                                                 float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8658 
     | 
    
         
            +
                                                 const int nrows,
         
     | 
| 
      
 8659 
     | 
    
         
            +
                                                 const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       8482 
8660 
     | 
    
         
             
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
       8483 
8661 
     | 
    
         
             
                                item_ct1.get_local_id(1);
         
     | 
| 
       8484 
8662 
     | 
    
         | 
| 
         @@ -8506,7 +8684,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void 
     | 
|
| 
       8506 
8684 
     | 
    
         
             
                        (item_ct1.get_local_id(2) %
         
     | 
| 
       8507 
8685 
     | 
    
         
             
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
       8508 
8686 
     | 
    
         | 
| 
       8509 
     | 
    
         
            -
                    tmp +=  
     | 
| 
      
 8687 
     | 
    
         
            +
                    tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs);
         
     | 
| 
       8510 
8688 
     | 
    
         
             
                }
         
     | 
| 
       8511 
8689 
     | 
    
         | 
| 
       8512 
8690 
     | 
    
         
             
                // sum up partial sums and write back result
         
     | 
| 
         @@ -8522,9 +8700,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void 
     | 
|
| 
       8522 
8700 
     | 
    
         
             
            }
         
     | 
| 
       8523 
8701 
     | 
    
         | 
| 
       8524 
8702 
     | 
    
         
             
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
       8525 
     | 
    
         
            -
            static void  
     | 
| 
       8526 
     | 
    
         
            -
             
     | 
| 
       8527 
     | 
    
         
            -
             
     | 
| 
      
 8703 
     | 
    
         
            +
            static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8704 
     | 
    
         
            +
                                                  const void *__restrict__ vy,
         
     | 
| 
      
 8705 
     | 
    
         
            +
                                                  float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8706 
     | 
    
         
            +
                                                  const int nrows,
         
     | 
| 
      
 8707 
     | 
    
         
            +
                                                  const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       8528 
8708 
     | 
    
         
             
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
       8529 
8709 
     | 
    
         
             
                                item_ct1.get_local_id(1);
         
     | 
| 
       8530 
8710 
     | 
    
         | 
| 
         @@ -8552,7 +8732,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * 
     | 
|
| 
       8552 
8732 
     | 
    
         
             
                        (item_ct1.get_local_id(2) %
         
     | 
| 
       8553 
8733 
     | 
    
         
             
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
       8554 
8734 
     | 
    
         | 
| 
       8555 
     | 
    
         
            -
                    tmp +=  
     | 
| 
      
 8735 
     | 
    
         
            +
                    tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs);
         
     | 
| 
       8556 
8736 
     | 
    
         
             
                }
         
     | 
| 
       8557 
8737 
     | 
    
         | 
| 
       8558 
8738 
     | 
    
         
             
                // sum up partial sums and write back result
         
     | 
| 
         @@ -8567,10 +8747,13 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * 
     | 
|
| 
       8567 
8747 
     | 
    
         
             
                }
         
     | 
| 
       8568 
8748 
     | 
    
         
             
            }
         
     | 
| 
       8569 
8749 
     | 
    
         | 
| 
      
 8750 
     | 
    
         
            +
             
     | 
| 
       8570 
8751 
     | 
    
         
             
            template <int qk, int qi, typename block_q_t, int vdr>
         
     | 
| 
       8571 
     | 
    
         
            -
            static void  
     | 
| 
       8572 
     | 
    
         
            -
             
     | 
| 
       8573 
     | 
    
         
            -
             
     | 
| 
      
 8752 
     | 
    
         
            +
            static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
         
     | 
| 
      
 8753 
     | 
    
         
            +
                                                  const void *__restrict__ vy,
         
     | 
| 
      
 8754 
     | 
    
         
            +
                                                  float *__restrict__ dst, const int ncols,
         
     | 
| 
      
 8755 
     | 
    
         
            +
                                                  const int nrows,
         
     | 
| 
      
 8756 
     | 
    
         
            +
                                                  const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       8574 
8757 
     | 
    
         
             
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
         
     | 
| 
       8575 
8758 
     | 
    
         
             
                                item_ct1.get_local_id(1);
         
     | 
| 
       8576 
8759 
     | 
    
         | 
| 
         @@ -8598,7 +8781,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * 
     | 
|
| 
       8598 
8781 
     | 
    
         
             
                        (item_ct1.get_local_id(2) %
         
     | 
| 
       8599 
8782 
     | 
    
         
             
                         (qi / vdr)); // x block quant index when casting the quants to int
         
     | 
| 
       8600 
8783 
     | 
    
         | 
| 
       8601 
     | 
    
         
            -
                    tmp +=  
     | 
| 
      
 8784 
     | 
    
         
            +
                    tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs);
         
     | 
| 
       8602 
8785 
     | 
    
         
             
                }
         
     | 
| 
       8603 
8786 
     | 
    
         | 
| 
       8604 
8787 
     | 
    
         
             
                // sum up partial sums and write back result
         
     | 
| 
         @@ -8613,6 +8796,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * 
     | 
|
| 
       8613 
8796 
     | 
    
         
             
                }
         
     | 
| 
       8614 
8797 
     | 
    
         
             
            }
         
     | 
| 
       8615 
8798 
     | 
    
         | 
| 
      
 8799 
     | 
    
         
            +
             
     | 
| 
       8616 
8800 
     | 
    
         
             
            template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
         
     | 
| 
       8617 
8801 
     | 
    
         
             
            static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
         
     | 
| 
       8618 
8802 
     | 
    
         
             
                                               const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
         @@ -9174,64 +9358,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, 
     | 
|
| 
       9174 
9358 
     | 
    
         
             
                }
         
     | 
| 
       9175 
9359 
     | 
    
         
             
            }
         
     | 
| 
       9176 
9360 
     | 
    
         | 
| 
      
 9361 
     | 
    
         
            +
             
     | 
| 
       9177 
9362 
     | 
    
         
             
            template<typename T>
         
     | 
| 
       9178 
     | 
    
         
            -
            static inline void  
     | 
| 
      
 9363 
     | 
    
         
            +
            static inline void ggml_sycl_swap(T & a, T & b) {
         
     | 
| 
       9179 
9364 
     | 
    
         
             
                T tmp = a;
         
     | 
| 
       9180 
9365 
     | 
    
         
             
                a = b;
         
     | 
| 
       9181 
9366 
     | 
    
         
             
                b = tmp;
         
     | 
| 
       9182 
9367 
     | 
    
         
             
            }
         
     | 
| 
       9183 
9368 
     | 
    
         | 
| 
       9184 
     | 
    
         
            -
            template<ggml_sort_order order>
         
     | 
| 
       9185 
     | 
    
         
            -
            static void 
     | 
| 
       9186 
     | 
    
         
            -
             
     | 
| 
      
 9369 
     | 
    
         
            +
            template <ggml_sort_order order>
         
     | 
| 
      
 9370 
     | 
    
         
            +
            __dpct_inline__ static void
         
     | 
| 
      
 9371 
     | 
    
         
            +
            k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad,
         
     | 
| 
      
 9372 
     | 
    
         
            +
                              const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) {
         
     | 
| 
       9187 
9373 
     | 
    
         
             
                // bitonic sort
         
     | 
| 
       9188 
9374 
     | 
    
         
             
                int col = item_ct1.get_local_id(2);
         
     | 
| 
       9189 
9375 
     | 
    
         
             
                int row = item_ct1.get_group(1);
         
     | 
| 
       9190 
9376 
     | 
    
         | 
| 
       9191 
     | 
    
         
            -
                if (col >=  
     | 
| 
      
 9377 
     | 
    
         
            +
                if (col >= ncols_pad) {
         
     | 
| 
      
 9378 
     | 
    
         
            +
                    return;
         
     | 
| 
      
 9379 
     | 
    
         
            +
                }
         
     | 
| 
       9192 
9380 
     | 
    
         | 
| 
       9193 
9381 
     | 
    
         
             
                const float * x_row = x + row * ncols;
         
     | 
| 
       9194 
     | 
    
         
            -
                 
     | 
| 
      
 9382 
     | 
    
         
            +
                auto dst_row = (int *)dpct_local;
         
     | 
| 
       9195 
9383 
     | 
    
         | 
| 
       9196 
9384 
     | 
    
         
             
                // initialize indices
         
     | 
| 
       9197 
     | 
    
         
            -
                 
     | 
| 
       9198 
     | 
    
         
            -
             
     | 
| 
       9199 
     | 
    
         
            -
                 
     | 
| 
       9200 
     | 
    
         
            -
                /*
         
     | 
| 
       9201 
     | 
    
         
            -
                DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
         
     | 
| 
       9202 
     | 
    
         
            -
                sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
         
     | 
| 
       9203 
     | 
    
         
            -
                performance if there is no access to global memory.
         
     | 
| 
       9204 
     | 
    
         
            -
                */
         
     | 
| 
       9205 
     | 
    
         
            -
                item_ct1.barrier();
         
     | 
| 
      
 9385 
     | 
    
         
            +
                dst_row[col] = col;
         
     | 
| 
      
 9386 
     | 
    
         
            +
             
     | 
| 
      
 9387 
     | 
    
         
            +
                item_ct1.barrier(sycl::access::fence_space::local_space);
         
     | 
| 
       9206 
9388 
     | 
    
         | 
| 
       9207 
     | 
    
         
            -
                for (int k = 2; k <=  
     | 
| 
      
 9389 
     | 
    
         
            +
                for (int k = 2; k <= ncols_pad; k *= 2) {
         
     | 
| 
       9208 
9390 
     | 
    
         
             
                    for (int j = k / 2; j > 0; j /= 2) {
         
     | 
| 
       9209 
9391 
     | 
    
         
             
                        int ixj = col ^ j;
         
     | 
| 
       9210 
9392 
     | 
    
         
             
                        if (ixj > col) {
         
     | 
| 
       9211 
9393 
     | 
    
         
             
                            if ((col & k) == 0) {
         
     | 
| 
       9212 
     | 
    
         
            -
                                if ( 
     | 
| 
       9213 
     | 
    
         
            -
                                     
     | 
| 
      
 9394 
     | 
    
         
            +
                                if (dst_row[col] >= ncols ||
         
     | 
| 
      
 9395 
     | 
    
         
            +
                                    (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
         
     | 
| 
      
 9396 
     | 
    
         
            +
                                        x_row[dst_row[col]] > x_row[dst_row[ixj]] :
         
     | 
| 
      
 9397 
     | 
    
         
            +
                                        x_row[dst_row[col]] < x_row[dst_row[ixj]]))
         
     | 
| 
      
 9398 
     | 
    
         
            +
                                ) {
         
     | 
| 
      
 9399 
     | 
    
         
            +
                                    ggml_sycl_swap(dst_row[col], dst_row[ixj]);
         
     | 
| 
       9214 
9400 
     | 
    
         
             
                                }
         
     | 
| 
       9215 
9401 
     | 
    
         
             
                            } else {
         
     | 
| 
       9216 
     | 
    
         
            -
                                if ( 
     | 
| 
       9217 
     | 
    
         
            -
                                     
     | 
| 
      
 9402 
     | 
    
         
            +
                                if (dst_row[ixj] >= ncols ||
         
     | 
| 
      
 9403 
     | 
    
         
            +
                                    (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
         
     | 
| 
      
 9404 
     | 
    
         
            +
                                        x_row[dst_row[col]] < x_row[dst_row[ixj]] :
         
     | 
| 
      
 9405 
     | 
    
         
            +
                                        x_row[dst_row[col]] > x_row[dst_row[ixj]]))
         
     | 
| 
      
 9406 
     | 
    
         
            +
                                ) {
         
     | 
| 
      
 9407 
     | 
    
         
            +
                                    ggml_sycl_swap(dst_row[col], dst_row[ixj]);
         
     | 
| 
       9218 
9408 
     | 
    
         
             
                                }
         
     | 
| 
       9219 
9409 
     | 
    
         
             
                            }
         
     | 
| 
       9220 
9410 
     | 
    
         
             
                        }
         
     | 
| 
       9221 
9411 
     | 
    
         
             
                        /*
         
     | 
| 
       9222 
     | 
    
         
            -
                        DPCT1118: 
     | 
| 
      
 9412 
     | 
    
         
            +
                        DPCT1118:1: SYCL group functions and algorithms must be encountered
         
     | 
| 
       9223 
9413 
     | 
    
         
             
                        in converged control flow. You may need to adjust the code.
         
     | 
| 
       9224 
9414 
     | 
    
         
             
                        */
         
     | 
| 
       9225 
     | 
    
         
            -
                         
     | 
| 
       9226 
     | 
    
         
            -
                        DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
         
     | 
| 
       9227 
     | 
    
         
            -
                        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         
     | 
| 
       9228 
     | 
    
         
            -
                        better performance if there is no access to global memory.
         
     | 
| 
       9229 
     | 
    
         
            -
                        */
         
     | 
| 
       9230 
     | 
    
         
            -
                        item_ct1.barrier();
         
     | 
| 
      
 9415 
     | 
    
         
            +
                        item_ct1.barrier(sycl::access::fence_space::local_space);
         
     | 
| 
       9231 
9416 
     | 
    
         
             
                    }
         
     | 
| 
       9232 
9417 
     | 
    
         
             
                }
         
     | 
| 
      
 9418 
     | 
    
         
            +
             
     | 
| 
      
 9419 
     | 
    
         
            +
                // copy the result to dst without the padding
         
     | 
| 
      
 9420 
     | 
    
         
            +
                if (col < ncols) {
         
     | 
| 
      
 9421 
     | 
    
         
            +
                    dst[row * ncols + col] = dst_row[col];
         
     | 
| 
      
 9422 
     | 
    
         
            +
                }
         
     | 
| 
       9233 
9423 
     | 
    
         
             
            }
         
     | 
| 
       9234 
9424 
     | 
    
         | 
| 
      
 9425 
     | 
    
         
            +
             
     | 
| 
       9235 
9426 
     | 
    
         
             
            static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
         
     | 
| 
       9236 
9427 
     | 
    
         
             
                                          const sycl::nd_item<3> &item_ct1) {
         
     | 
| 
       9237 
9428 
     | 
    
         
             
                const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
         
     | 
| 
         @@ -10210,31 +10401,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, 
     | 
|
| 
       10210 
10401 
     | 
    
         
             
            #endif
         
     | 
| 
       10211 
10402 
     | 
    
         
             
            }
         
     | 
| 
       10212 
10403 
     | 
    
         | 
| 
       10213 
     | 
    
         
            -
             
     | 
| 
       10214 
10404 
     | 
    
         
             
            template <typename dst_t>
         
     | 
| 
       10215 
     | 
    
         
            -
            static void  
     | 
| 
      
 10405 
     | 
    
         
            +
            static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
       10216 
10406 
     | 
    
         
             
                                                    dpct::queue_ptr stream) {
         
     | 
| 
       10217 
10407 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       10218 
10408 
     | 
    
         
             
                {
         
     | 
| 
       10219 
     | 
    
         
            -
                     
     | 
| 
       10220 
     | 
    
         
            -
             
     | 
| 
       10221 
     | 
    
         
            -
             
     | 
| 
      
 10409 
     | 
    
         
            +
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
      
 10410 
     | 
    
         
            +
                                                 {sycl::aspect::fp16});
         
     | 
| 
      
 10411 
     | 
    
         
            +
             
     | 
| 
      
 10412 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 10413 
     | 
    
         
            +
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
      
 10414 
     | 
    
         
            +
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
      
 10415 
     | 
    
         
            +
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
      
 10416 
     | 
    
         
            +
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
      
 10417 
     | 
    
         
            +
                                             dequantize_block_iq1_s(
         
     | 
| 
      
 10418 
     | 
    
         
            +
                                                 vx, y, item_ct1, iq1s_grid_gpu
         
     | 
| 
      
 10419 
     | 
    
         
            +
                                                 );
         
     | 
| 
      
 10420 
     | 
    
         
            +
                                         });
         
     | 
| 
      
 10421 
     | 
    
         
            +
                    });
         
     | 
| 
      
 10422 
     | 
    
         
            +
                }
         
     | 
| 
      
 10423 
     | 
    
         
            +
            }
         
     | 
| 
       10222 
10424 
     | 
    
         | 
| 
      
 10425 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 10426 
     | 
    
         
            +
            static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
      
 10427 
     | 
    
         
            +
                                                    dpct::queue_ptr stream) {
         
     | 
| 
      
 10428 
     | 
    
         
            +
                const int nb = k / QK_K;
         
     | 
| 
      
 10429 
     | 
    
         
            +
                {
         
     | 
| 
       10223 
10430 
     | 
    
         
             
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
       10224 
10431 
     | 
    
         
             
                                                 {sycl::aspect::fp16});
         
     | 
| 
       10225 
10432 
     | 
    
         | 
| 
       10226 
10433 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10227 
     | 
    
         
            -
                         
     | 
| 
       10228 
     | 
    
         
            -
             
     | 
| 
       10229 
     | 
    
         
            -
             
     | 
| 
      
 10434 
     | 
    
         
            +
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
      
 10435 
     | 
    
         
            +
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
      
 10436 
     | 
    
         
            +
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
      
 10437 
     | 
    
         
            +
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
      
 10438 
     | 
    
         
            +
                                             dequantize_block_iq1_m(
         
     | 
| 
      
 10439 
     | 
    
         
            +
                                                 vx, y, item_ct1, iq1s_grid_gpu
         
     | 
| 
      
 10440 
     | 
    
         
            +
                                                 );
         
     | 
| 
      
 10441 
     | 
    
         
            +
                                         });
         
     | 
| 
      
 10442 
     | 
    
         
            +
                    });
         
     | 
| 
      
 10443 
     | 
    
         
            +
                }
         
     | 
| 
      
 10444 
     | 
    
         
            +
            }
         
     | 
| 
      
 10445 
     | 
    
         
            +
             
     | 
| 
      
 10446 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 10447 
     | 
    
         
            +
            static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
      
 10448 
     | 
    
         
            +
                                                    dpct::queue_ptr stream) {
         
     | 
| 
      
 10449 
     | 
    
         
            +
                const int nb = k / QK_K;
         
     | 
| 
      
 10450 
     | 
    
         
            +
                {
         
     | 
| 
      
 10451 
     | 
    
         
            +
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
      
 10452 
     | 
    
         
            +
                                                 {sycl::aspect::fp16});
         
     | 
| 
       10230 
10453 
     | 
    
         | 
| 
      
 10454 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10231 
10455 
     | 
    
         
             
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
       10232 
10456 
     | 
    
         
             
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
       10233 
10457 
     | 
    
         
             
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
       10234 
10458 
     | 
    
         
             
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
       10235 
10459 
     | 
    
         
             
                                             dequantize_block_iq2_xxs(
         
     | 
| 
       10236 
     | 
    
         
            -
                                                 vx, y, item_ct1,  
     | 
| 
       10237 
     | 
    
         
            -
                                                  
     | 
| 
      
 10460 
     | 
    
         
            +
                                                 vx, y, item_ct1, iq2xxs_grid,
         
     | 
| 
      
 10461 
     | 
    
         
            +
                                                 ksigns_iq2xs, kmask_iq2xs);
         
     | 
| 
       10238 
10462 
     | 
    
         
             
                                         });
         
     | 
| 
       10239 
10463 
     | 
    
         
             
                    });
         
     | 
| 
       10240 
10464 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10245,117 +10469,130 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, 
     | 
|
| 
       10245 
10469 
     | 
    
         
             
                                                   dpct::queue_ptr stream) {
         
     | 
| 
       10246 
10470 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       10247 
10471 
     | 
    
         
             
                {
         
     | 
| 
       10248 
     | 
    
         
            -
                    iq2xs_grid.init(*stream);
         
     | 
| 
       10249 
     | 
    
         
            -
                    ksigns_iq2xs.init(*stream);
         
     | 
| 
       10250 
     | 
    
         
            -
                    kmask_iq2xs.init(*stream);
         
     | 
| 
       10251 
     | 
    
         
            -
             
     | 
| 
       10252 
10472 
     | 
    
         
             
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
       10253 
10473 
     | 
    
         
             
                                                 {sycl::aspect::fp16});
         
     | 
| 
       10254 
10474 
     | 
    
         | 
| 
       10255 
10475 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10256 
     | 
    
         
            -
                        auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
         
     | 
| 
       10257 
     | 
    
         
            -
                        auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
         
     | 
| 
       10258 
     | 
    
         
            -
                        auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
         
     | 
| 
       10259 
     | 
    
         
            -
             
     | 
| 
       10260 
10476 
     | 
    
         
             
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
       10261 
10477 
     | 
    
         
             
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
       10262 
10478 
     | 
    
         
             
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
       10263 
10479 
     | 
    
         
             
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
       10264 
10480 
     | 
    
         
             
                                             dequantize_block_iq2_xs(
         
     | 
| 
       10265 
     | 
    
         
            -
                                                 vx, y, item_ct1,  
     | 
| 
       10266 
     | 
    
         
            -
                                                  
     | 
| 
      
 10481 
     | 
    
         
            +
                                                 vx, y, item_ct1, iq2xs_grid,
         
     | 
| 
      
 10482 
     | 
    
         
            +
                                                 ksigns_iq2xs, kmask_iq2xs);
         
     | 
| 
       10267 
10483 
     | 
    
         
             
                                         });
         
     | 
| 
       10268 
10484 
     | 
    
         
             
                    });
         
     | 
| 
       10269 
10485 
     | 
    
         
             
                }
         
     | 
| 
       10270 
10486 
     | 
    
         
             
            }
         
     | 
| 
       10271 
10487 
     | 
    
         | 
| 
       10272 
10488 
     | 
    
         
             
            template <typename dst_t>
         
     | 
| 
       10273 
     | 
    
         
            -
            static void  
     | 
| 
       10274 
     | 
    
         
            -
             
     | 
| 
      
 10489 
     | 
    
         
            +
            static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
      
 10490 
     | 
    
         
            +
                                                  dpct::queue_ptr stream) {
         
     | 
| 
       10275 
10491 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       10276 
10492 
     | 
    
         
             
                {
         
     | 
| 
       10277 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10278 
     | 
    
         
            -
                    ksigns_iq2xs.init(*stream);
         
     | 
| 
       10279 
     | 
    
         
            -
                    kmask_iq2xs.init(*stream);
         
     | 
| 
       10280 
     | 
    
         
            -
             
     | 
| 
       10281 
10493 
     | 
    
         
             
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
       10282 
10494 
     | 
    
         
             
                                                 {sycl::aspect::fp16});
         
     | 
| 
       10283 
10495 
     | 
    
         | 
| 
       10284 
10496 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10285 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10286 
     | 
    
         
            -
                        auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
         
     | 
| 
       10287 
     | 
    
         
            -
                        auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
         
     | 
| 
       10288 
     | 
    
         
            -
             
     | 
| 
       10289 
10497 
     | 
    
         
             
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
       10290 
10498 
     | 
    
         
             
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
       10291 
10499 
     | 
    
         
             
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
       10292 
10500 
     | 
    
         
             
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
       10293 
     | 
    
         
            -
                                              
     | 
| 
       10294 
     | 
    
         
            -
                                                 vx, y, item_ct1, iq3xxs_grid_ptr_ct1,
         
     | 
| 
       10295 
     | 
    
         
            -
                                                 ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
         
     | 
| 
      
 10501 
     | 
    
         
            +
                                             dequantize_block_iq2_s(vx, y, item_ct1);
         
     | 
| 
       10296 
10502 
     | 
    
         
             
                                         });
         
     | 
| 
       10297 
10503 
     | 
    
         
             
                    });
         
     | 
| 
       10298 
10504 
     | 
    
         
             
                }
         
     | 
| 
       10299 
10505 
     | 
    
         
             
            }
         
     | 
| 
       10300 
10506 
     | 
    
         | 
| 
      
 10507 
     | 
    
         
            +
             
     | 
| 
       10301 
10508 
     | 
    
         
             
            template <typename dst_t>
         
     | 
| 
       10302 
     | 
    
         
            -
            static void  
     | 
| 
      
 10509 
     | 
    
         
            +
            static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
       10303 
10510 
     | 
    
         
             
                                                    dpct::queue_ptr stream) {
         
     | 
| 
       10304 
10511 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       10305 
10512 
     | 
    
         
             
                {
         
     | 
| 
       10306 
     | 
    
         
            -
                    iq3s_grid.init(*stream);
         
     | 
| 
       10307 
     | 
    
         
            -
                    ksigns_iq2xs.init(*stream);
         
     | 
| 
       10308 
     | 
    
         
            -
                    kmask_iq2xs.init(*stream);
         
     | 
| 
       10309 
     | 
    
         
            -
             
     | 
| 
       10310 
10513 
     | 
    
         
             
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
       10311 
10514 
     | 
    
         
             
                                                 {sycl::aspect::fp16});
         
     | 
| 
       10312 
10515 
     | 
    
         | 
| 
       10313 
10516 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10314 
     | 
    
         
            -
                        auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
         
     | 
| 
       10315 
     | 
    
         
            -
                        auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
         
     | 
| 
       10316 
     | 
    
         
            -
                        auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
         
     | 
| 
       10317 
     | 
    
         
            -
             
     | 
| 
       10318 
10517 
     | 
    
         
             
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
       10319 
10518 
     | 
    
         
             
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
       10320 
10519 
     | 
    
         
             
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
       10321 
10520 
     | 
    
         
             
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
       10322 
     | 
    
         
            -
                                              
     | 
| 
       10323 
     | 
    
         
            -
                                                 vx, y, item_ct1,  
     | 
| 
       10324 
     | 
    
         
            -
                                                  
     | 
| 
      
 10521 
     | 
    
         
            +
                                             dequantize_block_iq3_xxs(
         
     | 
| 
      
 10522 
     | 
    
         
            +
                                                 vx, y, item_ct1, iq3xxs_grid,
         
     | 
| 
      
 10523 
     | 
    
         
            +
                                                 ksigns_iq2xs, kmask_iq2xs);
         
     | 
| 
       10325 
10524 
     | 
    
         
             
                                         });
         
     | 
| 
       10326 
10525 
     | 
    
         
             
                    });
         
     | 
| 
       10327 
10526 
     | 
    
         
             
                }
         
     | 
| 
       10328 
10527 
     | 
    
         
             
            }
         
     | 
| 
       10329 
10528 
     | 
    
         | 
| 
       10330 
10529 
     | 
    
         
             
            template <typename dst_t>
         
     | 
| 
       10331 
     | 
    
         
            -
            static void  
     | 
| 
      
 10530 
     | 
    
         
            +
            static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
       10332 
10531 
     | 
    
         
             
                                                    dpct::queue_ptr stream) {
         
     | 
| 
       10333 
10532 
     | 
    
         
             
                const int nb = k / QK_K;
         
     | 
| 
       10334 
10533 
     | 
    
         
             
                {
         
     | 
| 
       10335 
     | 
    
         
            -
                    iq1s_grid_gpu.init(*stream);
         
     | 
| 
       10336 
     | 
    
         
            -
                    ksigns_iq2xs.init(*stream);
         
     | 
| 
       10337 
     | 
    
         
            -
                    kmask_iq2xs.init(*stream);
         
     | 
| 
       10338 
     | 
    
         
            -
             
     | 
| 
       10339 
10534 
     | 
    
         
             
                    dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
       10340 
10535 
     | 
    
         
             
                                                 {sycl::aspect::fp16});
         
     | 
| 
       10341 
10536 
     | 
    
         | 
| 
       10342 
10537 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10343 
     | 
    
         
            -
                        auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
         
     | 
| 
       10344 
     | 
    
         
            -
                        auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
         
     | 
| 
       10345 
     | 
    
         
            -
                        auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
         
     | 
| 
       10346 
     | 
    
         
            -
             
     | 
| 
       10347 
10538 
     | 
    
         
             
                        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
       10348 
10539 
     | 
    
         
             
                                                               sycl::range<3>(1, 1, 32),
         
     | 
| 
       10349 
10540 
     | 
    
         
             
                                                           sycl::range<3>(1, 1, 32)),
         
     | 
| 
       10350 
10541 
     | 
    
         
             
                                         [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
       10351 
     | 
    
         
            -
                                              
     | 
| 
       10352 
     | 
    
         
            -
                                                 vx, y, item_ct1,  
     | 
| 
       10353 
     | 
    
         
            -
                                                 ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
         
     | 
| 
      
 10542 
     | 
    
         
            +
                                             dequantize_block_iq3_s(
         
     | 
| 
      
 10543 
     | 
    
         
            +
                                                 vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
         
     | 
| 
       10354 
10544 
     | 
    
         
             
                                         });
         
     | 
| 
       10355 
10545 
     | 
    
         
             
                    });
         
     | 
| 
       10356 
10546 
     | 
    
         
             
                }
         
     | 
| 
       10357 
10547 
     | 
    
         
             
            }
         
     | 
| 
       10358 
10548 
     | 
    
         | 
| 
      
 10549 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 10550 
     | 
    
         
            +
            static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
      
 10551 
     | 
    
         
            +
                                                   dpct::queue_ptr stream) {
         
     | 
| 
      
 10552 
     | 
    
         
            +
                const int nb = (k + QK_K - 1) / QK_K;
         
     | 
| 
      
 10553 
     | 
    
         
            +
            #if QK_K == 64
         
     | 
| 
      
 10554 
     | 
    
         
            +
                dequantize_row_iq4_nl_sycl(vx, y, k, stream);
         
     | 
| 
      
 10555 
     | 
    
         
            +
            #else
         
     | 
| 
      
 10556 
     | 
    
         
            +
                  {
         
     | 
| 
      
 10557 
     | 
    
         
            +
                        dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
      
 10558 
     | 
    
         
            +
                                                     {sycl::aspect::fp16});
         
     | 
| 
      
 10559 
     | 
    
         
            +
             
     | 
| 
      
 10560 
     | 
    
         
            +
                        stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 10561 
     | 
    
         
            +
                              cgh.parallel_for(
         
     | 
| 
      
 10562 
     | 
    
         
            +
                                  sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
      
 10563 
     | 
    
         
            +
                                                        sycl::range<3>(1, 1, 32),
         
     | 
| 
      
 10564 
     | 
    
         
            +
                                                    sycl::range<3>(1, 1, 32)),
         
     | 
| 
      
 10565 
     | 
    
         
            +
                                  [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
      
 10566 
     | 
    
         
            +
                                        dequantize_block_iq4_xs(vx, y, item_ct1);
         
     | 
| 
      
 10567 
     | 
    
         
            +
                                  });
         
     | 
| 
      
 10568 
     | 
    
         
            +
                        });
         
     | 
| 
      
 10569 
     | 
    
         
            +
                  }
         
     | 
| 
      
 10570 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 10571 
     | 
    
         
            +
            }
         
     | 
| 
      
 10572 
     | 
    
         
            +
             
     | 
| 
      
 10573 
     | 
    
         
            +
             
     | 
| 
      
 10574 
     | 
    
         
            +
            template <typename dst_t>
         
     | 
| 
      
 10575 
     | 
    
         
            +
            static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
         
     | 
| 
      
 10576 
     | 
    
         
            +
                                                   dpct::queue_ptr stream) {
         
     | 
| 
      
 10577 
     | 
    
         
            +
                const int nb = (k + QK_K - 1) / QK_K;
         
     | 
| 
      
 10578 
     | 
    
         
            +
                  {
         
     | 
| 
      
 10579 
     | 
    
         
            +
                        dpct::has_capability_or_fail(stream->get_device(),
         
     | 
| 
      
 10580 
     | 
    
         
            +
                                                     {sycl::aspect::fp16});
         
     | 
| 
      
 10581 
     | 
    
         
            +
             
     | 
| 
      
 10582 
     | 
    
         
            +
                        stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 10583 
     | 
    
         
            +
                              cgh.parallel_for(
         
     | 
| 
      
 10584 
     | 
    
         
            +
                                  sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
         
     | 
| 
      
 10585 
     | 
    
         
            +
                                                        sycl::range<3>(1, 1, 32),
         
     | 
| 
      
 10586 
     | 
    
         
            +
                                                    sycl::range<3>(1, 1, 32)),
         
     | 
| 
      
 10587 
     | 
    
         
            +
                                  [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
      
 10588 
     | 
    
         
            +
                                        dequantize_block_iq4_nl(vx, y, item_ct1);
         
     | 
| 
      
 10589 
     | 
    
         
            +
                                  });
         
     | 
| 
      
 10590 
     | 
    
         
            +
                        });
         
     | 
| 
      
 10591 
     | 
    
         
            +
                  }
         
     | 
| 
      
 10592 
     | 
    
         
            +
            }
         
     | 
| 
      
 10593 
     | 
    
         
            +
             
     | 
| 
      
 10594 
     | 
    
         
            +
             
     | 
| 
      
 10595 
     | 
    
         
            +
             
     | 
| 
       10359 
10596 
     | 
    
         
             
            template <typename src_t, typename dst_t>
         
     | 
| 
       10360 
10597 
     | 
    
         
             
            static void convert_unary_sycl(const void *__restrict__ vx,
         
     | 
| 
       10361 
10598 
     | 
    
         
             
                                           dst_t *__restrict__ y, const int k,
         
     | 
| 
         @@ -10400,16 +10637,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { 
     | 
|
| 
       10400 
10637 
     | 
    
         
             
                        return dequantize_row_q5_K_sycl;
         
     | 
| 
       10401 
10638 
     | 
    
         
             
                    case GGML_TYPE_Q6_K:
         
     | 
| 
       10402 
10639 
     | 
    
         
             
                        return dequantize_row_q6_K_sycl;
         
     | 
| 
      
 10640 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 10641 
     | 
    
         
            +
                        return dequantize_row_iq1_s_sycl;
         
     | 
| 
      
 10642 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
      
 10643 
     | 
    
         
            +
                        return dequantize_row_iq1_m_sycl;
         
     | 
| 
       10403 
10644 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XXS:
         
     | 
| 
       10404 
10645 
     | 
    
         
             
                        return dequantize_row_iq2_xxs_sycl;
         
     | 
| 
       10405 
10646 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       10406 
10647 
     | 
    
         
             
                        return dequantize_row_iq2_xs_sycl;
         
     | 
| 
      
 10648 
     | 
    
         
            +
                    case GGML_TYPE_IQ2_S:
         
     | 
| 
      
 10649 
     | 
    
         
            +
                        return dequantize_row_iq2_s_sycl;
         
     | 
| 
       10407 
10650 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       10408 
10651 
     | 
    
         
             
                        return dequantize_row_iq3_xxs_sycl;
         
     | 
| 
       10409 
10652 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
       10410 
10653 
     | 
    
         
             
                        return dequantize_row_iq3_s_sycl;
         
     | 
| 
       10411 
     | 
    
         
            -
                    case  
     | 
| 
       10412 
     | 
    
         
            -
                        return  
     | 
| 
      
 10654 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
      
 10655 
     | 
    
         
            +
                        return dequantize_row_iq4_xs_sycl;
         
     | 
| 
      
 10656 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
      
 10657 
     | 
    
         
            +
                        return dequantize_row_iq4_nl_sycl;
         
     | 
| 
       10413 
10658 
     | 
    
         
             
                    case GGML_TYPE_F32:
         
     | 
| 
       10414 
10659 
     | 
    
         
             
                        return convert_unary_sycl<float>;
         
     | 
| 
       10415 
10660 
     | 
    
         
             
                    default:
         
     | 
| 
         @@ -10444,16 +10689,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { 
     | 
|
| 
       10444 
10689 
     | 
    
         
             
                        return dequantize_row_q5_K_sycl;
         
     | 
| 
       10445 
10690 
     | 
    
         
             
                    case GGML_TYPE_Q6_K:
         
     | 
| 
       10446 
10691 
     | 
    
         
             
                        return dequantize_row_q6_K_sycl;
         
     | 
| 
      
 10692 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 10693 
     | 
    
         
            +
                        return dequantize_row_iq1_s_sycl;
         
     | 
| 
      
 10694 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
      
 10695 
     | 
    
         
            +
                        return dequantize_row_iq1_m_sycl;
         
     | 
| 
       10447 
10696 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XXS:
         
     | 
| 
       10448 
10697 
     | 
    
         
             
                        return dequantize_row_iq2_xxs_sycl;
         
     | 
| 
       10449 
10698 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       10450 
10699 
     | 
    
         
             
                        return dequantize_row_iq2_xs_sycl;
         
     | 
| 
      
 10700 
     | 
    
         
            +
                    case GGML_TYPE_IQ2_S:
         
     | 
| 
      
 10701 
     | 
    
         
            +
                        return dequantize_row_iq2_s_sycl;
         
     | 
| 
       10451 
10702 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       10452 
10703 
     | 
    
         
             
                        return dequantize_row_iq3_xxs_sycl;
         
     | 
| 
       10453 
10704 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
       10454 
10705 
     | 
    
         
             
                        return dequantize_row_iq3_s_sycl;
         
     | 
| 
       10455 
     | 
    
         
            -
                    case  
     | 
| 
       10456 
     | 
    
         
            -
                        return  
     | 
| 
      
 10706 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
      
 10707 
     | 
    
         
            +
                        return dequantize_row_iq4_xs_sycl;
         
     | 
| 
      
 10708 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
      
 10709 
     | 
    
         
            +
                        return dequantize_row_iq4_nl_sycl;
         
     | 
| 
       10457 
10710 
     | 
    
         
             
                    case GGML_TYPE_F16:
         
     | 
| 
       10458 
10711 
     | 
    
         
             
                        return convert_unary_sycl<sycl::half>;
         
     | 
| 
       10459 
10712 
     | 
    
         
             
                    default:
         
     | 
| 
         @@ -10675,12 +10928,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10675 
10928 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10676 
10929 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10677 
10930 
     | 
    
         
             
                {
         
     | 
| 
       10678 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10679 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10680 
10931 
     | 
    
         | 
| 
       10681 
10932 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10682 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10683 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10684 
10933 
     | 
    
         | 
| 
       10685 
10934 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10686 
10935 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10688,8 +10937,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10688 
10937 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10689 
10938 
     | 
    
         
             
                                    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
         
     | 
| 
       10690 
10939 
     | 
    
         
             
                                                  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
         
     | 
| 
       10691 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10692 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 10940 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10693 
10941 
     | 
    
         
             
                                });
         
     | 
| 
       10694 
10942 
     | 
    
         
             
                    });
         
     | 
| 
       10695 
10943 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10704,12 +10952,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10704 
10952 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10705 
10953 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10706 
10954 
     | 
    
         
             
                {
         
     | 
| 
       10707 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10708 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10709 
10955 
     | 
    
         | 
| 
       10710 
10956 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10711 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10712 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10713 
10957 
     | 
    
         | 
| 
       10714 
10958 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10715 
10959 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10717,8 +10961,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10717 
10961 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10718 
10962 
     | 
    
         
             
                                    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
         
     | 
| 
       10719 
10963 
     | 
    
         
             
                                                  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
         
     | 
| 
       10720 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10721 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 10964 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10722 
10965 
     | 
    
         
             
                                });
         
     | 
| 
       10723 
10966 
     | 
    
         
             
                    });
         
     | 
| 
       10724 
10967 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10733,12 +10976,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10733 
10976 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10734 
10977 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10735 
10978 
     | 
    
         
             
                {
         
     | 
| 
       10736 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10737 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10738 
10979 
     | 
    
         | 
| 
       10739 
10980 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10740 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10741 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10742 
10981 
     | 
    
         | 
| 
       10743 
10982 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10744 
10983 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10746,8 +10985,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10746 
10985 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10747 
10986 
     | 
    
         
             
                                    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
         
     | 
| 
       10748 
10987 
     | 
    
         
             
                                                  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
         
     | 
| 
       10749 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10750 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 10988 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10751 
10989 
     | 
    
         
             
                                });
         
     | 
| 
       10752 
10990 
     | 
    
         
             
                    });
         
     | 
| 
       10753 
10991 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10762,12 +11000,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10762 
11000 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10763 
11001 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10764 
11002 
     | 
    
         
             
                {
         
     | 
| 
       10765 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10766 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10767 
11003 
     | 
    
         | 
| 
       10768 
11004 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10769 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10770 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10771 
11005 
     | 
    
         | 
| 
       10772 
11006 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10773 
11007 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10775,8 +11009,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10775 
11009 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10776 
11010 
     | 
    
         
             
                                    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
         
     | 
| 
       10777 
11011 
     | 
    
         
             
                                                  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
         
     | 
| 
       10778 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10779 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11012 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10780 
11013 
     | 
    
         
             
                                });
         
     | 
| 
       10781 
11014 
     | 
    
         
             
                    });
         
     | 
| 
       10782 
11015 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10791,12 +11024,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10791 
11024 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10792 
11025 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10793 
11026 
     | 
    
         
             
                {
         
     | 
| 
       10794 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10795 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10796 
11027 
     | 
    
         | 
| 
       10797 
11028 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10798 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10799 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10800 
11029 
     | 
    
         | 
| 
       10801 
11030 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10802 
11031 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10804,8 +11033,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10804 
11033 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10805 
11034 
     | 
    
         
             
                                    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
         
     | 
| 
       10806 
11035 
     | 
    
         
             
                                                  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
         
     | 
| 
       10807 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10808 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11036 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10809 
11037 
     | 
    
         
             
                                });
         
     | 
| 
       10810 
11038 
     | 
    
         
             
                    });
         
     | 
| 
       10811 
11039 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10820,12 +11048,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10820 
11048 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10821 
11049 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10822 
11050 
     | 
    
         
             
                {
         
     | 
| 
       10823 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10824 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10825 
11051 
     | 
    
         | 
| 
       10826 
11052 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10827 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10828 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10829 
11053 
     | 
    
         | 
| 
       10830 
11054 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10831 
11055 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10833,8 +11057,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10833 
11057 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10834 
11058 
     | 
    
         
             
                                    mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
         
     | 
| 
       10835 
11059 
     | 
    
         
             
                                                  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
         
     | 
| 
       10836 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10837 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11060 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10838 
11061 
     | 
    
         
             
                                });
         
     | 
| 
       10839 
11062 
     | 
    
         
             
                    });
         
     | 
| 
       10840 
11063 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10849,12 +11072,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10849 
11072 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10850 
11073 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10851 
11074 
     | 
    
         
             
                {
         
     | 
| 
       10852 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10853 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10854 
11075 
     | 
    
         | 
| 
       10855 
11076 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10856 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10857 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10858 
11077 
     | 
    
         | 
| 
       10859 
11078 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10860 
11079 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10862,8 +11081,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10862 
11081 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10863 
11082 
     | 
    
         
             
                                    mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
         
     | 
| 
       10864 
11083 
     | 
    
         
             
                                                  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
         
     | 
| 
       10865 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10866 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11084 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10867 
11085 
     | 
    
         
             
                                });
         
     | 
| 
       10868 
11086 
     | 
    
         
             
                    });
         
     | 
| 
       10869 
11087 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10878,12 +11096,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10878 
11096 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10879 
11097 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10880 
11098 
     | 
    
         
             
                {
         
     | 
| 
       10881 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10882 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10883 
11099 
     | 
    
         | 
| 
       10884 
11100 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10885 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10886 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10887 
11101 
     | 
    
         | 
| 
       10888 
11102 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10889 
11103 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10891,8 +11105,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10891 
11105 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10892 
11106 
     | 
    
         
             
                                    mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
         
     | 
| 
       10893 
11107 
     | 
    
         
             
                                                  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
         
     | 
| 
       10894 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10895 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11108 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10896 
11109 
     | 
    
         
             
                                });
         
     | 
| 
       10897 
11110 
     | 
    
         
             
                    });
         
     | 
| 
       10898 
11111 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10907,12 +11120,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10907 
11120 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10908 
11121 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10909 
11122 
     | 
    
         
             
                {
         
     | 
| 
       10910 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10911 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10912 
11123 
     | 
    
         | 
| 
       10913 
11124 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10914 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10915 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10916 
11125 
     | 
    
         | 
| 
       10917 
11126 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10918 
11127 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10920,8 +11129,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10920 
11129 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10921 
11130 
     | 
    
         
             
                                    mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
         
     | 
| 
       10922 
11131 
     | 
    
         
             
                                                  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
         
     | 
| 
       10923 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10924 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11132 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10925 
11133 
     | 
    
         
             
                                });
         
     | 
| 
       10926 
11134 
     | 
    
         
             
                    });
         
     | 
| 
       10927 
11135 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10936,12 +11144,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10936 
11144 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10937 
11145 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10938 
11146 
     | 
    
         
             
                {
         
     | 
| 
       10939 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       10940 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       10941 
11147 
     | 
    
         | 
| 
       10942 
11148 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10943 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
         
     | 
| 
       10944 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
       10945 
11149 
     | 
    
         | 
| 
       10946 
11150 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10947 
11151 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
         @@ -10949,13 +11153,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10949 
11153 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10950 
11154 
     | 
    
         
             
                                    mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
         
     | 
| 
       10951 
11155 
     | 
    
         
             
                                                  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
         
     | 
| 
       10952 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10953 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11156 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10954 
11157 
     | 
    
         
             
                                });
         
     | 
| 
       10955 
11158 
     | 
    
         
             
                    });
         
     | 
| 
       10956 
11159 
     | 
    
         
             
                }
         
     | 
| 
       10957 
11160 
     | 
    
         
             
            }
         
     | 
| 
       10958 
11161 
     | 
    
         | 
| 
      
 11162 
     | 
    
         
            +
             
     | 
| 
       10959 
11163 
     | 
    
         
             
            static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
         
     | 
| 
       10960 
11164 
     | 
    
         
             
                                                      float *dst, const int ncols,
         
     | 
| 
       10961 
11165 
     | 
    
         
             
                                                      const int nrows,
         
     | 
| 
         @@ -10965,23 +11169,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10965 
11169 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10966 
11170 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10967 
11171 
     | 
    
         
             
                {
         
     | 
| 
       10968 
     | 
    
         
            -
                    iq2xxs_grid.init(*stream);
         
     | 
| 
       10969 
     | 
    
         
            -
                    ksigns_iq2xs.init(*stream);
         
     | 
| 
       10970 
     | 
    
         
            -
                    kmask_iq2xs.init(*stream);
         
     | 
| 
       10971 
     | 
    
         
            -
             
     | 
| 
       10972 
     | 
    
         
            -
             
     | 
| 
       10973 
11172 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       10974 
     | 
    
         
            -
                        auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
         
     | 
| 
       10975 
     | 
    
         
            -
                        auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
         
     | 
| 
       10976 
     | 
    
         
            -
                        auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
         
     | 
| 
       10977 
     | 
    
         
            -
             
     | 
| 
       10978 
11173 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       10979 
11174 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
       10980 
11175 
     | 
    
         
             
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
       10981 
11176 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       10982 
11177 
     | 
    
         
             
                                    mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
         
     | 
| 
       10983 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       10984 
     | 
    
         
            -
                                        iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
         
     | 
| 
      
 11178 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       10985 
11179 
     | 
    
         
             
                                });
         
     | 
| 
       10986 
11180 
     | 
    
         
             
                    });
         
     | 
| 
       10987 
11181 
     | 
    
         
             
                }
         
     | 
| 
         @@ -10996,20 +11190,42 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       10996 
11190 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       10997 
11191 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       10998 
11192 
     | 
    
         
             
                {
         
     | 
| 
       10999 
     | 
    
         
            -
                    iq2xs_grid.init(*stream);
         
     | 
| 
       11000 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       11001 
11193 
     | 
    
         | 
| 
       11002 
11194 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       11003 
     | 
    
         
            -
                        auto iq2xs_grid_ptr_ct1 = iq2xs_grid 
     | 
| 
       11004 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64 
     | 
| 
      
 11195 
     | 
    
         
            +
                        auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
         
     | 
| 
      
 11196 
     | 
    
         
            +
                        auto ksigns64_ptr_ct1 = &ksigns64[0];
         
     | 
| 
       11005 
11197 
     | 
    
         | 
| 
       11006 
11198 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       11007 
11199 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
       11008 
11200 
     | 
    
         
             
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
       11009 
11201 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       11010 
11202 
     | 
    
         
             
                                    mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
         
     | 
| 
       11011 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       11012 
     | 
    
         
            -
             
     | 
| 
      
 11203 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
      
 11204 
     | 
    
         
            +
                                });
         
     | 
| 
      
 11205 
     | 
    
         
            +
                    });
         
     | 
| 
      
 11206 
     | 
    
         
            +
                }
         
     | 
| 
      
 11207 
     | 
    
         
            +
            }
         
     | 
| 
      
 11208 
     | 
    
         
            +
             
     | 
| 
      
 11209 
     | 
    
         
            +
            static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
         
     | 
| 
      
 11210 
     | 
    
         
            +
                                                     float *dst, const int ncols,
         
     | 
| 
      
 11211 
     | 
    
         
            +
                                                     const int nrows,
         
     | 
| 
      
 11212 
     | 
    
         
            +
                                                     dpct::queue_ptr stream) {
         
     | 
| 
      
 11213 
     | 
    
         
            +
                GGML_ASSERT(ncols % QK_K == 0);
         
     | 
| 
      
 11214 
     | 
    
         
            +
                const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
         
     | 
| 
      
 11215 
     | 
    
         
            +
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
      
 11216 
     | 
    
         
            +
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
      
 11217 
     | 
    
         
            +
                {
         
     | 
| 
      
 11218 
     | 
    
         
            +
             
     | 
| 
      
 11219 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 11220 
     | 
    
         
            +
                        auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
         
     | 
| 
      
 11221 
     | 
    
         
            +
                        auto ksigns64_ptr_ct1 = &ksigns64[0];
         
     | 
| 
      
 11222 
     | 
    
         
            +
             
     | 
| 
      
 11223 
     | 
    
         
            +
                        cgh.parallel_for(
         
     | 
| 
      
 11224 
     | 
    
         
            +
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
      
 11225 
     | 
    
         
            +
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
      
 11226 
     | 
    
         
            +
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
      
 11227 
     | 
    
         
            +
                                    mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
         
     | 
| 
      
 11228 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       11013 
11229 
     | 
    
         
             
                                });
         
     | 
| 
       11014 
11230 
     | 
    
         
             
                    });
         
     | 
| 
       11015 
11231 
     | 
    
         
             
                }
         
     | 
| 
         @@ -11024,20 +11240,17 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       11024 
11240 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       11025 
11241 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       11026 
11242 
     | 
    
         
             
                {
         
     | 
| 
       11027 
     | 
    
         
            -
                    iq3xxs_grid.init(*stream);
         
     | 
| 
       11028 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       11029 
11243 
     | 
    
         | 
| 
       11030 
11244 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       11031 
     | 
    
         
            -
                        auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid 
     | 
| 
       11032 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64 
     | 
| 
      
 11245 
     | 
    
         
            +
                        auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
         
     | 
| 
      
 11246 
     | 
    
         
            +
                        auto ksigns64_ptr_ct1 = &ksigns64[0];
         
     | 
| 
       11033 
11247 
     | 
    
         | 
| 
       11034 
11248 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       11035 
11249 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
       11036 
11250 
     | 
    
         
             
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
       11037 
11251 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       11038 
11252 
     | 
    
         
             
                                    mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
         
     | 
| 
       11039 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       11040 
     | 
    
         
            -
                                        iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11253 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       11041 
11254 
     | 
    
         
             
                                });
         
     | 
| 
       11042 
11255 
     | 
    
         
             
                    });
         
     | 
| 
       11043 
11256 
     | 
    
         
             
                }
         
     | 
| 
         @@ -11052,20 +11265,16 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       11052 
11265 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       11053 
11266 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       11054 
11267 
     | 
    
         
             
                {
         
     | 
| 
       11055 
     | 
    
         
            -
                    iq3s_grid.init(*stream);
         
     | 
| 
       11056 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       11057 
11268 
     | 
    
         | 
| 
       11058 
11269 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       11059 
     | 
    
         
            -
                        auto iq3s_grid_ptr_ct1 = iq3s_grid 
     | 
| 
       11060 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
         
     | 
| 
      
 11270 
     | 
    
         
            +
                        auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
         
     | 
| 
       11061 
11271 
     | 
    
         | 
| 
       11062 
11272 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       11063 
11273 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
       11064 
11274 
     | 
    
         
             
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
       11065 
11275 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       11066 
11276 
     | 
    
         
             
                                    mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
         
     | 
| 
       11067 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       11068 
     | 
    
         
            -
                                        iq3s_grid_ptr_ct1, ksigns64_ptr_ct1);
         
     | 
| 
      
 11277 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       11069 
11278 
     | 
    
         
             
                                });
         
     | 
| 
       11070 
11279 
     | 
    
         
             
                    });
         
     | 
| 
       11071 
11280 
     | 
    
         
             
                }
         
     | 
| 
         @@ -11080,20 +11289,82 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, 
     | 
|
| 
       11080 
11289 
     | 
    
         
             
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
       11081 
11290 
     | 
    
         
             
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
       11082 
11291 
     | 
    
         
             
                {
         
     | 
| 
       11083 
     | 
    
         
            -
                    iq1s_grid_gpu.init(*stream);
         
     | 
| 
       11084 
     | 
    
         
            -
                    ksigns64.init(*stream);
         
     | 
| 
       11085 
11292 
     | 
    
         | 
| 
       11086 
11293 
     | 
    
         
             
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
       11087 
     | 
    
         
            -
                        auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu 
     | 
| 
       11088 
     | 
    
         
            -
                        auto ksigns64_ptr_ct1 = ksigns64 
     | 
| 
      
 11294 
     | 
    
         
            +
                        auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
         
     | 
| 
      
 11295 
     | 
    
         
            +
                        auto ksigns64_ptr_ct1 = &ksigns64[0];
         
     | 
| 
       11089 
11296 
     | 
    
         | 
| 
       11090 
11297 
     | 
    
         
             
                        cgh.parallel_for(
         
     | 
| 
       11091 
11298 
     | 
    
         
             
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
       11092 
11299 
     | 
    
         
             
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
       11093 
11300 
     | 
    
         
             
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
       11094 
11301 
     | 
    
         
             
                                    mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
         
     | 
| 
       11095 
     | 
    
         
            -
                                        vx, vy, dst, ncols, nrows, item_ct1 
     | 
| 
       11096 
     | 
    
         
            -
             
     | 
| 
      
 11302 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
      
 11303 
     | 
    
         
            +
                                });
         
     | 
| 
      
 11304 
     | 
    
         
            +
                    });
         
     | 
| 
      
 11305 
     | 
    
         
            +
                }
         
     | 
| 
      
 11306 
     | 
    
         
            +
            }
         
     | 
| 
      
 11307 
     | 
    
         
            +
             
     | 
| 
      
 11308 
     | 
    
         
            +
            static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
         
     | 
| 
      
 11309 
     | 
    
         
            +
                                                      float *dst, const int ncols,
         
     | 
| 
      
 11310 
     | 
    
         
            +
                                                      const int nrows,
         
     | 
| 
      
 11311 
     | 
    
         
            +
                                                      dpct::queue_ptr stream) {
         
     | 
| 
      
 11312 
     | 
    
         
            +
                GGML_ASSERT(ncols % QK_K == 0);
         
     | 
| 
      
 11313 
     | 
    
         
            +
                const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
         
     | 
| 
      
 11314 
     | 
    
         
            +
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
      
 11315 
     | 
    
         
            +
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
      
 11316 
     | 
    
         
            +
                {
         
     | 
| 
      
 11317 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 11318 
     | 
    
         
            +
                        cgh.parallel_for(
         
     | 
| 
      
 11319 
     | 
    
         
            +
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
      
 11320 
     | 
    
         
            +
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
      
 11321 
     | 
    
         
            +
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
      
 11322 
     | 
    
         
            +
                                    mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
         
     | 
| 
      
 11323 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
      
 11324 
     | 
    
         
            +
                                });
         
     | 
| 
      
 11325 
     | 
    
         
            +
                    });
         
     | 
| 
      
 11326 
     | 
    
         
            +
                }
         
     | 
| 
      
 11327 
     | 
    
         
            +
            }
         
     | 
| 
      
 11328 
     | 
    
         
            +
             
     | 
| 
      
 11329 
     | 
    
         
            +
            static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
         
     | 
| 
      
 11330 
     | 
    
         
            +
                                                      float *dst, const int ncols,
         
     | 
| 
      
 11331 
     | 
    
         
            +
                                                      const int nrows,
         
     | 
| 
      
 11332 
     | 
    
         
            +
                                                      dpct::queue_ptr stream) {
         
     | 
| 
      
 11333 
     | 
    
         
            +
                GGML_ASSERT(ncols % QK4_NL == 0);
         
     | 
| 
      
 11334 
     | 
    
         
            +
                const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
         
     | 
| 
      
 11335 
     | 
    
         
            +
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
      
 11336 
     | 
    
         
            +
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
      
 11337 
     | 
    
         
            +
                {
         
     | 
| 
      
 11338 
     | 
    
         
            +
             
     | 
| 
      
 11339 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 11340 
     | 
    
         
            +
                        cgh.parallel_for(
         
     | 
| 
      
 11341 
     | 
    
         
            +
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
      
 11342 
     | 
    
         
            +
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
      
 11343 
     | 
    
         
            +
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
      
 11344 
     | 
    
         
            +
                                    mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
         
     | 
| 
      
 11345 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
      
 11346 
     | 
    
         
            +
                                });
         
     | 
| 
      
 11347 
     | 
    
         
            +
                    });
         
     | 
| 
      
 11348 
     | 
    
         
            +
                }
         
     | 
| 
      
 11349 
     | 
    
         
            +
            }
         
     | 
| 
      
 11350 
     | 
    
         
            +
             
     | 
| 
      
 11351 
     | 
    
         
            +
            static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
         
     | 
| 
      
 11352 
     | 
    
         
            +
                                                      float *dst, const int ncols,
         
     | 
| 
      
 11353 
     | 
    
         
            +
                                                      const int nrows,
         
     | 
| 
      
 11354 
     | 
    
         
            +
                                                      dpct::queue_ptr stream) {
         
     | 
| 
      
 11355 
     | 
    
         
            +
                GGML_ASSERT(ncols % QK_K == 0);
         
     | 
| 
      
 11356 
     | 
    
         
            +
                const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
         
     | 
| 
      
 11357 
     | 
    
         
            +
                const sycl::range<3> block_nums(1, 1, block_num_y);
         
     | 
| 
      
 11358 
     | 
    
         
            +
                const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
         
     | 
| 
      
 11359 
     | 
    
         
            +
                {
         
     | 
| 
      
 11360 
     | 
    
         
            +
             
     | 
| 
      
 11361 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 11362 
     | 
    
         
            +
                        cgh.parallel_for(
         
     | 
| 
      
 11363 
     | 
    
         
            +
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
      
 11364 
     | 
    
         
            +
                            [=](sycl::nd_item<3> item_ct1)
         
     | 
| 
      
 11365 
     | 
    
         
            +
                                [[intel::reqd_sub_group_size(32)]] {
         
     | 
| 
      
 11366 
     | 
    
         
            +
                                    mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
         
     | 
| 
      
 11367 
     | 
    
         
            +
                                        vx, vy, dst, ncols, nrows, item_ct1);
         
     | 
| 
       11097 
11368 
     | 
    
         
             
                                });
         
     | 
| 
       11098 
11369 
     | 
    
         
             
                    });
         
     | 
| 
       11099 
11370 
     | 
    
         
             
                }
         
     | 
| 
         @@ -12717,36 +12988,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, 
     | 
|
| 
       12717 
12988 
     | 
    
         
             
                                         });
         
     | 
| 
       12718 
12989 
     | 
    
         
             
            }
         
     | 
| 
       12719 
12990 
     | 
    
         | 
| 
      
 12991 
     | 
    
         
            +
            static int next_power_of_2(int x) {
         
     | 
| 
      
 12992 
     | 
    
         
            +
                int n = 1;
         
     | 
| 
      
 12993 
     | 
    
         
            +
                while (n < x) {
         
     | 
| 
      
 12994 
     | 
    
         
            +
                    n *= 2;
         
     | 
| 
      
 12995 
     | 
    
         
            +
                }
         
     | 
| 
      
 12996 
     | 
    
         
            +
                return n;
         
     | 
| 
      
 12997 
     | 
    
         
            +
            }
         
     | 
| 
      
 12998 
     | 
    
         
            +
             
     | 
| 
       12720 
12999 
     | 
    
         
             
            static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
         
     | 
| 
       12721 
13000 
     | 
    
         
             
                                             const int nrows, ggml_sort_order order,
         
     | 
| 
       12722 
13001 
     | 
    
         
             
                                             dpct::queue_ptr stream) {
         
     | 
| 
       12723 
13002 
     | 
    
         
             
                // bitonic sort requires ncols to be power of 2
         
     | 
| 
       12724 
     | 
    
         
            -
                 
     | 
| 
      
 13003 
     | 
    
         
            +
                const int ncols_pad = next_power_of_2(ncols);
         
     | 
| 
       12725 
13004 
     | 
    
         | 
| 
       12726 
     | 
    
         
            -
                const sycl::range<3> block_dims(1, 1,  
     | 
| 
      
 13005 
     | 
    
         
            +
                const sycl::range<3> block_dims(1, 1, ncols_pad);
         
     | 
| 
       12727 
13006 
     | 
    
         
             
                const sycl::range<3> block_nums(1, nrows, 1);
         
     | 
| 
      
 13007 
     | 
    
         
            +
                const size_t shared_mem = ncols_pad * sizeof(int);
         
     | 
| 
      
 13008 
     | 
    
         
            +
             
     | 
| 
      
 13009 
     | 
    
         
            +
                // GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
         
     | 
| 
      
 13010 
     | 
    
         
            +
             
     | 
| 
       12728 
13011 
     | 
    
         
             
                if (order == GGML_SORT_ORDER_ASC) {
         
     | 
| 
       12729 
     | 
    
         
            -
                     
     | 
| 
       12730 
     | 
    
         
            -
             
     | 
| 
       12731 
     | 
    
         
            -
             
     | 
| 
       12732 
     | 
    
         
            -
             
     | 
| 
       12733 
     | 
    
         
            -
             
     | 
| 
       12734 
     | 
    
         
            -
             
     | 
| 
       12735 
     | 
    
         
            -
             
     | 
| 
       12736 
     | 
    
         
            -
             
     | 
| 
       12737 
     | 
    
         
            -
             
     | 
| 
       12738 
     | 
    
         
            -
             
     | 
| 
      
 13012 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 13013 
     | 
    
         
            +
                        sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
         
     | 
| 
      
 13014 
     | 
    
         
            +
                            sycl::range<1>(shared_mem), cgh);
         
     | 
| 
      
 13015 
     | 
    
         
            +
             
     | 
| 
      
 13016 
     | 
    
         
            +
                        cgh.parallel_for(
         
     | 
| 
      
 13017 
     | 
    
         
            +
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
      
 13018 
     | 
    
         
            +
                            [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
      
 13019 
     | 
    
         
            +
                                k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
         
     | 
| 
      
 13020 
     | 
    
         
            +
                                    x, dst, ncols, ncols_pad, item_ct1,
         
     | 
| 
      
 13021 
     | 
    
         
            +
                                    dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
         
     | 
| 
      
 13022 
     | 
    
         
            +
                                        .get());
         
     | 
| 
      
 13023 
     | 
    
         
            +
                            });
         
     | 
| 
      
 13024 
     | 
    
         
            +
                    });
         
     | 
| 
       12739 
13025 
     | 
    
         
             
                } else if (order == GGML_SORT_ORDER_DESC) {
         
     | 
| 
       12740 
     | 
    
         
            -
                     
     | 
| 
       12741 
     | 
    
         
            -
             
     | 
| 
       12742 
     | 
    
         
            -
             
     | 
| 
       12743 
     | 
    
         
            -
             
     | 
| 
       12744 
     | 
    
         
            -
             
     | 
| 
       12745 
     | 
    
         
            -
             
     | 
| 
       12746 
     | 
    
         
            -
             
     | 
| 
       12747 
     | 
    
         
            -
             
     | 
| 
       12748 
     | 
    
         
            -
             
     | 
| 
       12749 
     | 
    
         
            -
             
     | 
| 
      
 13026 
     | 
    
         
            +
                    stream->submit([&](sycl::handler &cgh) {
         
     | 
| 
      
 13027 
     | 
    
         
            +
                        sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
         
     | 
| 
      
 13028 
     | 
    
         
            +
                            sycl::range<1>(shared_mem), cgh);
         
     | 
| 
      
 13029 
     | 
    
         
            +
             
     | 
| 
      
 13030 
     | 
    
         
            +
                        cgh.parallel_for(
         
     | 
| 
      
 13031 
     | 
    
         
            +
                            sycl::nd_range<3>(block_nums * block_dims, block_dims),
         
     | 
| 
      
 13032 
     | 
    
         
            +
                            [=](sycl::nd_item<3> item_ct1) {
         
     | 
| 
      
 13033 
     | 
    
         
            +
                                k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
         
     | 
| 
      
 13034 
     | 
    
         
            +
                                    x, dst, ncols, ncols_pad, item_ct1,
         
     | 
| 
      
 13035 
     | 
    
         
            +
                                    dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
         
     | 
| 
      
 13036 
     | 
    
         
            +
                                        .get());
         
     | 
| 
      
 13037 
     | 
    
         
            +
                            });
         
     | 
| 
      
 13038 
     | 
    
         
            +
                    });
         
     | 
| 
       12750 
13039 
     | 
    
         
             
                } else {
         
     | 
| 
       12751 
13040 
     | 
    
         
             
                    GGML_ASSERT(false);
         
     | 
| 
       12752 
13041 
     | 
    
         
             
                }
         
     | 
| 
         @@ -13128,6 +13417,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) 
     | 
|
| 
       13128 
13417 
     | 
    
         
             
            }
         
     | 
| 
       13129 
13418 
     | 
    
         | 
| 
       13130 
13419 
     | 
    
         
             
            void ggml_backend_sycl_print_sycl_devices() {
         
     | 
| 
      
 13420 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
         
     | 
| 
       13131 
13421 
     | 
    
         
             
                int device_count = dpct::dev_mgr::instance().device_count();
         
     | 
| 
       13132 
13422 
     | 
    
         
             
                std::map<std::string, size_t> DeviceNums;
         
     | 
| 
       13133 
13423 
     | 
    
         
             
                fprintf(stderr, "found %d SYCL devices:\n", device_count);
         
     | 
| 
         @@ -13181,11 +13471,13 @@ int get_work_group_size(int user_device_id) { 
     | 
|
| 
       13181 
13471 
     | 
    
         
             
                return prop.get_max_work_group_size();
         
     | 
| 
       13182 
13472 
     | 
    
         
             
            }
         
     | 
| 
       13183 
13473 
     | 
    
         | 
| 
       13184 
     | 
    
         
            -
            void ggml_init_sycl() try {
         
     | 
| 
      
 13474 
     | 
    
         
            +
            static void ggml_init_sycl() try {
         
     | 
| 
       13185 
13475 
     | 
    
         
             
                static bool initialized = false;
         
     | 
| 
       13186 
13476 
     | 
    
         | 
| 
       13187 
13477 
     | 
    
         
             
                if (!initialized) {
         
     | 
| 
      
 13478 
     | 
    
         
            +
                    fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
         
     | 
| 
       13188 
13479 
     | 
    
         
             
                    g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
         
     | 
| 
      
 13480 
     | 
    
         
            +
             
     | 
| 
       13189 
13481 
     | 
    
         
             
                    fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
         
     | 
| 
       13190 
13482 
     | 
    
         | 
| 
       13191 
13483 
     | 
    
         
             
            #if defined(GGML_SYCL_F16)
         
     | 
| 
         @@ -13871,8 +14163,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC 
     | 
|
| 
       13871 
14163 
     | 
    
         
             
                    case GGML_TYPE_Q5_K:
         
     | 
| 
       13872 
14164 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XXS:
         
     | 
| 
       13873 
14165 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
      
 14166 
     | 
    
         
            +
                    case GGML_TYPE_IQ2_S:
         
     | 
| 
       13874 
14167 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 14168 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       13875 
14169 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
      
 14170 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
      
 14171 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       13876 
14172 
     | 
    
         
             
                        return max_compute_capability >= VER_GEN9 ? 128 : 64;
         
     | 
| 
       13877 
14173 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
       13878 
14174 
     | 
    
         
             
                        return max_compute_capability >= VER_GEN9 ? 128 : 64;
         
     | 
| 
         @@ -13891,11 +14187,20 @@ inline void ggml_sycl_op_mul_mat_vec_q( 
     | 
|
| 
       13891 
14187 
     | 
    
         
             
                const int64_t src1_ncols, const int64_t src1_padded_row_size,
         
     | 
| 
       13892 
14188 
     | 
    
         
             
                const dpct::queue_ptr &stream) {
         
     | 
| 
       13893 
14189 
     | 
    
         | 
| 
       13894 
     | 
    
         
            -
                 
     | 
| 
      
 14190 
     | 
    
         
            +
                const int64_t ne10 = src1->ne[0];
         
     | 
| 
      
 14191 
     | 
    
         
            +
                GGML_ASSERT(ne10 % QK8_1 == 0);
         
     | 
| 
       13895 
14192 
     | 
    
         | 
| 
       13896 
14193 
     | 
    
         
             
                const int64_t ne00 = src0->ne[0];
         
     | 
| 
       13897 
14194 
     | 
    
         
             
                const int64_t row_diff = row_high - row_low;
         
     | 
| 
       13898 
14195 
     | 
    
         | 
| 
      
 14196 
     | 
    
         
            +
                int id;
         
     | 
| 
      
 14197 
     | 
    
         
            +
                SYCL_CHECK(
         
     | 
| 
      
 14198 
     | 
    
         
            +
                    CHECK_TRY_ERROR(id = get_current_device_id()));
         
     | 
| 
      
 14199 
     | 
    
         
            +
             
     | 
| 
      
 14200 
     | 
    
         
            +
                // the main device has a larger memory buffer to hold the results from all GPUs
         
     | 
| 
      
 14201 
     | 
    
         
            +
                // nrows_dst == nrows of the matrix that the kernel writes into
         
     | 
| 
      
 14202 
     | 
    
         
            +
                const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne00 : row_diff;
         
     | 
| 
      
 14203 
     | 
    
         
            +
             
     | 
| 
       13899 
14204 
     | 
    
         
             
                switch (src0->type) {
         
     | 
| 
       13900 
14205 
     | 
    
         
             
                    case GGML_TYPE_Q4_0:
         
     | 
| 
       13901 
14206 
     | 
    
         
             
                        mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
         @@ -13927,20 +14232,32 @@ inline void ggml_sycl_op_mul_mat_vec_q( 
     | 
|
| 
       13927 
14232 
     | 
    
         
             
                    case GGML_TYPE_Q6_K:
         
     | 
| 
       13928 
14233 
     | 
    
         
             
                        mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       13929 
14234 
     | 
    
         
             
                        break;
         
     | 
| 
      
 14235 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 14236 
     | 
    
         
            +
                        mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
      
 14237 
     | 
    
         
            +
                        break;
         
     | 
| 
      
 14238 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
      
 14239 
     | 
    
         
            +
                        mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
      
 14240 
     | 
    
         
            +
                        break;
         
     | 
| 
       13930 
14241 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XXS:
         
     | 
| 
       13931 
14242 
     | 
    
         
             
                        mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       13932 
14243 
     | 
    
         
             
                        break;
         
     | 
| 
       13933 
14244 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       13934 
14245 
     | 
    
         
             
                        mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       13935 
14246 
     | 
    
         
             
                        break;
         
     | 
| 
      
 14247 
     | 
    
         
            +
                    case GGML_TYPE_IQ2_S:
         
     | 
| 
      
 14248 
     | 
    
         
            +
                        mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
      
 14249 
     | 
    
         
            +
                        break;
         
     | 
| 
       13936 
14250 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       13937 
14251 
     | 
    
         
             
                        mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       13938 
14252 
     | 
    
         
             
                        break;
         
     | 
| 
       13939 
14253 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
       13940 
14254 
     | 
    
         
             
                        mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       13941 
14255 
     | 
    
         
             
                        break;
         
     | 
| 
       13942 
     | 
    
         
            -
                    case  
     | 
| 
       13943 
     | 
    
         
            -
                         
     | 
| 
      
 14256 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
      
 14257 
     | 
    
         
            +
                        mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
      
 14258 
     | 
    
         
            +
                        break;
         
     | 
| 
      
 14259 
     | 
    
         
            +
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
      
 14260 
     | 
    
         
            +
                        mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       13944 
14261 
     | 
    
         
             
                        break;
         
     | 
| 
       13945 
14262 
     | 
    
         
             
                    default:
         
     | 
| 
       13946 
14263 
     | 
    
         
             
                        GGML_ASSERT(false);
         
     | 
| 
         @@ -14022,6 +14339,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( 
     | 
|
| 
       14022 
14339 
     | 
    
         
             
                        convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
         
     | 
| 
       14023 
14340 
     | 
    
         
             
                        break;
         
     | 
| 
       14024 
14341 
     | 
    
         
             
                    default:
         
     | 
| 
      
 14342 
     | 
    
         
            +
                        printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
         
     | 
| 
       14025 
14343 
     | 
    
         
             
                        GGML_ASSERT(false);
         
     | 
| 
       14026 
14344 
     | 
    
         
             
                        break;
         
     | 
| 
       14027 
14345 
     | 
    
         
             
                }
         
     | 
| 
         @@ -14876,8 +15194,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, 
     | 
|
| 
       14876 
15194 
     | 
    
         
             
                                src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
         
     | 
| 
       14877 
15195 
     | 
    
         
             
                            }
         
     | 
| 
       14878 
15196 
     | 
    
         
             
                            // do the computation
         
     | 
| 
       14879 
     | 
    
         
            -
                            op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
         
     | 
| 
       14880 
     | 
    
         
            -
                                dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream);
         
     | 
| 
      
 15197 
     | 
    
         
            +
                            SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
         
     | 
| 
      
 15198 
     | 
    
         
            +
                                dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
         
     | 
| 
       14881 
15199 
     | 
    
         
             
                            /*
         
     | 
| 
       14882 
15200 
     | 
    
         
             
                            DPCT1010:93: SYCL uses exceptions to report errors and does not
         
     | 
| 
       14883 
15201 
     | 
    
         
             
                            use the error codes. The call was replaced with 0. You need to
         
     | 
| 
         @@ -15246,6 +15564,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, 
     | 
|
| 
       15246 
15564 
     | 
    
         
             
                SYCL_CHECK(ggml_sycl_set_device(g_main_device));
         
     | 
| 
       15247 
15565 
     | 
    
         
             
                dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
         
     | 
| 
       15248 
15566 
     | 
    
         | 
| 
      
 15567 
     | 
    
         
            +
                bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda ||
         
     | 
| 
      
 15568 
     | 
    
         
            +
                                       main_stream->get_backend() == sycl::backend::ext_oneapi_hip;
         
     | 
| 
      
 15569 
     | 
    
         
            +
             
     | 
| 
       15249 
15570 
     | 
    
         
             
                SYCL_CHECK(
         
     | 
| 
       15250 
15571 
     | 
    
         
             
                    CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream));
         
     | 
| 
       15251 
15572 
     | 
    
         | 
| 
         @@ -15276,24 +15597,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, 
     | 
|
| 
       15276 
15597 
     | 
    
         | 
| 
       15277 
15598 
     | 
    
         
             
                dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
         
     | 
| 
       15278 
15599 
     | 
    
         
             
                dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
         
     | 
| 
      
 15600 
     | 
    
         
            +
                if (no_mixed_dtypes) {
         
     | 
| 
      
 15601 
     | 
    
         
            +
                    cu_compute_type = dpct::library_data_t::real_half;
         
     | 
| 
      
 15602 
     | 
    
         
            +
                    cu_data_type = dpct::library_data_t::real_half;
         
     | 
| 
      
 15603 
     | 
    
         
            +
                }
         
     | 
| 
       15279 
15604 
     | 
    
         | 
| 
       15280 
15605 
     | 
    
         
             
                // dst strides
         
     | 
| 
       15281 
15606 
     | 
    
         
             
                size_t nbd2 = dst->nb[2];
         
     | 
| 
       15282 
15607 
     | 
    
         
             
                size_t nbd3 = dst->nb[3];
         
     | 
| 
       15283 
15608 
     | 
    
         | 
| 
      
 15609 
     | 
    
         
            +
                const float alpha_f32 = 1.0f;
         
     | 
| 
      
 15610 
     | 
    
         
            +
                const float beta_f32 = 0.0f;
         
     | 
| 
      
 15611 
     | 
    
         
            +
             
     | 
| 
       15284 
15612 
     | 
    
         
             
                const sycl::half alpha_f16 = 1.0f;
         
     | 
| 
       15285 
15613 
     | 
    
         
             
                const sycl::half beta_f16 = 0.0f;
         
     | 
| 
       15286 
15614 
     | 
    
         | 
| 
       15287 
     | 
    
         
            -
                const float alpha_f32 = 1.0f;
         
     | 
| 
       15288 
     | 
    
         
            -
                const float beta_f32  = 0.0f;
         
     | 
| 
       15289 
     | 
    
         
            -
             
     | 
| 
       15290 
15615 
     | 
    
         
             
                const void * alpha = &alpha_f32;
         
     | 
| 
       15291 
15616 
     | 
    
         
             
                const void * beta  = &beta_f32;
         
     | 
| 
      
 15617 
     | 
    
         
            +
                if (no_mixed_dtypes) {
         
     | 
| 
      
 15618 
     | 
    
         
            +
                    alpha = &alpha_f16;
         
     | 
| 
      
 15619 
     | 
    
         
            +
                    beta  = &beta_f16;
         
     | 
| 
      
 15620 
     | 
    
         
            +
                }
         
     | 
| 
       15292 
15621 
     | 
    
         | 
| 
       15293 
15622 
     | 
    
         
             
                // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
         
     | 
| 
       15294 
     | 
    
         
            -
                // oneMKL open source supports half, half, float, float: datatypes
         
     | 
| 
      
 15623 
     | 
    
         
            +
                // when oneMKL open source supports half, half, float, float: datatypes
         
     | 
| 
       15295 
15624 
     | 
    
         | 
| 
       15296 
15625 
     | 
    
         
             
                dst_t = (char *) dst_ddf;
         
     | 
| 
      
 15626 
     | 
    
         
            +
                if (no_mixed_dtypes) {
         
     | 
| 
      
 15627 
     | 
    
         
            +
                    dst_t = (char *) dst_f16.alloc(ne_dst);
         
     | 
| 
      
 15628 
     | 
    
         
            +
             
     | 
| 
      
 15629 
     | 
    
         
            +
                    nbd2 /= sizeof(float) / sizeof(sycl::half);
         
     | 
| 
      
 15630 
     | 
    
         
            +
                    nbd3 /= sizeof(float) / sizeof(sycl::half);
         
     | 
| 
      
 15631 
     | 
    
         
            +
                }
         
     | 
| 
       15297 
15632 
     | 
    
         | 
| 
       15298 
15633 
     | 
    
         
             
                GGML_ASSERT(ne12 % ne02 == 0);
         
     | 
| 
       15299 
15634 
     | 
    
         
             
                GGML_ASSERT(ne13 % ne03 == 0);
         
     | 
| 
         @@ -15379,6 +15714,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, 
     | 
|
| 
       15379 
15714 
     | 
    
         
             
                }
         
     | 
| 
       15380 
15715 
     | 
    
         
             
            #endif
         
     | 
| 
       15381 
15716 
     | 
    
         | 
| 
      
 15717 
     | 
    
         
            +
                if (no_mixed_dtypes) {
         
     | 
| 
      
 15718 
     | 
    
         
            +
                    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
         
     | 
| 
      
 15719 
     | 
    
         
            +
                    to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
         
     | 
| 
      
 15720 
     | 
    
         
            +
                }
         
     | 
| 
       15382 
15721 
     | 
    
         
             
            }
         
     | 
| 
       15383 
15722 
     | 
    
         
             
            catch (sycl::exception const &exc) {
         
     | 
| 
       15384 
15723 
     | 
    
         
             
              std::cerr << exc.what() << "Exception caught at file:" << __FILE__
         
     | 
| 
         @@ -15437,11 +15776,17 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 
     | 
|
| 
       15437 
15776 
     | 
    
         
             
            #ifdef GGML_SYCL_FORCE_DMMV
         
     | 
| 
       15438 
15777 
     | 
    
         
             
                        const bool use_mul_mat_vec_q = false;
         
     | 
| 
       15439 
15778 
     | 
    
         
             
            #else
         
     | 
| 
       15440 
     | 
    
         
            -
                         
     | 
| 
      
 15779 
     | 
    
         
            +
                        bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
         
     | 
| 
      
 15780 
     | 
    
         
            +
                        use_mul_mat_vec_q = use_mul_mat_vec_q ||
         
     | 
| 
      
 15781 
     | 
    
         
            +
                            (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
         
     | 
| 
      
 15782 
     | 
    
         
            +
                            (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
         
     | 
| 
      
 15783 
     | 
    
         
            +
                            (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
         
     | 
| 
      
 15784 
     | 
    
         
            +
                            (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
         
     | 
| 
      
 15785 
     | 
    
         
            +
             
     | 
| 
      
 15786 
     | 
    
         
            +
             
     | 
| 
       15441 
15787 
     | 
    
         
             
            #endif // GGML_SYCL_FORCE_DMMV
         
     | 
| 
       15442 
15788 
     | 
    
         | 
| 
       15443 
15789 
     | 
    
         
             
                        if (use_mul_mat_vec_q) {
         
     | 
| 
       15444 
     | 
    
         
            -
                            // NOTE: this kernel does not support ggml_nrows(src1) > 1
         
     | 
| 
       15445 
15790 
     | 
    
         
             
                            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
         
     | 
| 
       15446 
15791 
     | 
    
         
             
                            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
         
     | 
| 
       15447 
15792 
     | 
    
         
             
                        } else {
         
     | 
| 
         @@ -16278,6 +16623,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ 
     | 
|
| 
       16278 
16623 
     | 
    
         
             
            }
         
     | 
| 
       16279 
16624 
     | 
    
         | 
| 
       16280 
16625 
     | 
    
         
             
            GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
         
     | 
| 
      
 16626 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
         
     | 
| 
       16281 
16627 
     | 
    
         
             
                for(int i=0;i<max_len;i++) id_list[i] = -1;
         
     | 
| 
       16282 
16628 
     | 
    
         | 
| 
       16283 
16629 
     | 
    
         
             
                if (!g_sycl_gpu_mgr) {
         
     | 
| 
         @@ -16312,6 +16658,7 @@ catch (sycl::exception const &exc) { 
     | 
|
| 
       16312 
16658 
     | 
    
         | 
| 
       16313 
16659 
     | 
    
         
             
            GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
         
     | 
| 
       16314 
16660 
     | 
    
         
             
                                                  size_t description_size) try {
         
     | 
| 
      
 16661 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
         
     | 
| 
       16315 
16662 
     | 
    
         
             
                dpct::device_info prop;
         
     | 
| 
       16316 
16663 
     | 
    
         
             
                int device_id = g_sycl_gpu_mgr->gpus[device];
         
     | 
| 
       16317 
16664 
     | 
    
         
             
                SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
         
     | 
| 
         @@ -16326,6 +16673,7 @@ catch (sycl::exception const &exc) { 
     | 
|
| 
       16326 
16673 
     | 
    
         | 
| 
       16327 
16674 
     | 
    
         
             
            GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
         
     | 
| 
       16328 
16675 
     | 
    
         
             
                                                               size_t *total) try {
         
     | 
| 
      
 16676 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
         
     | 
| 
       16329 
16677 
     | 
    
         
             
                ggml_sycl_set_device(device);
         
     | 
| 
       16330 
16678 
     | 
    
         | 
| 
       16331 
16679 
     | 
    
         
             
                /*
         
     | 
| 
         @@ -16677,6 +17025,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { 
     | 
|
| 
       16677 
17025 
     | 
    
         
             
            };
         
     | 
| 
       16678 
17026 
     | 
    
         | 
| 
       16679 
17027 
     | 
    
         
             
            ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
         
     | 
| 
      
 17028 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
         
     | 
| 
      
 17029 
     | 
    
         
            +
             
     | 
| 
       16680 
17030 
     | 
    
         
             
                if (device_index>=g_device_count or device_index<0) {
         
     | 
| 
       16681 
17031 
     | 
    
         
             
                    printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
         
     | 
| 
       16682 
17032 
     | 
    
         
             
                        device_index, g_device_count-1);
         
     | 
| 
         @@ -17046,6 +17396,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface 
     | 
|
| 
       17046 
17396 
     | 
    
         
             
            };
         
     | 
| 
       17047 
17397 
     | 
    
         | 
| 
       17048 
17398 
     | 
    
         
             
            GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
         
     | 
| 
      
 17399 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
         
     | 
| 
      
 17400 
     | 
    
         
            +
                ggml_init_sycl();
         
     | 
| 
       17049 
17401 
     | 
    
         
             
                // FIXME: this is not thread safe
         
     | 
| 
       17050 
17402 
     | 
    
         
             
                static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
         
     | 
| 
       17051 
17403 
     | 
    
         | 
| 
         @@ -17117,6 +17469,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm 
     | 
|
| 
       17117 
17469 
     | 
    
         
             
            }
         
     | 
| 
       17118 
17470 
     | 
    
         | 
| 
       17119 
17471 
     | 
    
         
             
            ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
         
     | 
| 
      
 17472 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
         
     | 
| 
       17120 
17473 
     | 
    
         
             
                static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
         
     | 
| 
       17121 
17474 
     | 
    
         
             
                    /* .iface    = */ {
         
     | 
| 
       17122 
17475 
     | 
    
         
             
                        /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
         
     | 
| 
         @@ -17231,7 +17584,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back 
     | 
|
| 
       17231 
17584 
     | 
    
         
             
                params.ith = 0;
         
     | 
| 
       17232 
17585 
     | 
    
         
             
                for (int i = 0; i < cgraph->n_nodes; i++) {
         
     | 
| 
       17233 
17586 
     | 
    
         
             
                    ggml_tensor * node = cgraph->nodes[i];
         
     | 
| 
       17234 
     | 
    
         
            -
                    if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
         
     | 
| 
      
 17587 
     | 
    
         
            +
                    if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
         
     | 
| 
       17235 
17588 
     | 
    
         
             
                        continue;
         
     | 
| 
       17236 
17589 
     | 
    
         
             
                    }
         
     | 
| 
       17237 
17590 
     | 
    
         
             
            #ifndef NDEBUG
         
     | 
| 
         @@ -17289,9 +17642,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons 
     | 
|
| 
       17289 
17642 
     | 
    
         
             
                                return false;
         
     | 
| 
       17290 
17643 
     | 
    
         
             
                            }
         
     | 
| 
       17291 
17644 
     | 
    
         
             
                            ggml_type a_type = a->type;
         
     | 
| 
       17292 
     | 
    
         
            -
                            if (a_type == GGML_TYPE_IQ4_NL 
     | 
| 
       17293 
     | 
    
         
            -
                                a_type ==  
     | 
| 
       17294 
     | 
    
         
            -
                                 
     | 
| 
      
 17645 
     | 
    
         
            +
                            if (a_type == GGML_TYPE_IQ4_NL  || a_type == GGML_TYPE_IQ4_XS ||
         
     | 
| 
      
 17646 
     | 
    
         
            +
                                a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S  ||
         
     | 
| 
      
 17647 
     | 
    
         
            +
                                a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S ||
         
     | 
| 
      
 17648 
     | 
    
         
            +
                                a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M
         
     | 
| 
      
 17649 
     | 
    
         
            +
                                ) {
         
     | 
| 
      
 17650 
     | 
    
         
            +
                                if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
         
     | 
| 
      
 17651 
     | 
    
         
            +
                                    return false;
         
     | 
| 
      
 17652 
     | 
    
         
            +
                                }
         
     | 
| 
       17295 
17653 
     | 
    
         
             
                            }
         
     | 
| 
       17296 
17654 
     | 
    
         
             
                            return true;
         
     | 
| 
       17297 
17655 
     | 
    
         
             
                        } break;
         
     | 
| 
         @@ -17379,6 +17737,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons 
     | 
|
| 
       17379 
17737 
     | 
    
         
             
                UNUSED(backend);
         
     | 
| 
       17380 
17738 
     | 
    
         
             
            }
         
     | 
| 
       17381 
17739 
     | 
    
         | 
| 
      
 17740 
     | 
    
         
            +
            GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
         
     | 
| 
      
 17741 
     | 
    
         
            +
                const int min_batch_size = 32;
         
     | 
| 
      
 17742 
     | 
    
         
            +
                return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
         
     | 
| 
      
 17743 
     | 
    
         
            +
                GGML_UNUSED(backend);
         
     | 
| 
      
 17744 
     | 
    
         
            +
            }
         
     | 
| 
      
 17745 
     | 
    
         
            +
             
     | 
| 
      
 17746 
     | 
    
         
            +
             
     | 
| 
       17382 
17747 
     | 
    
         
             
            static ggml_backend_i ggml_backend_sycl_interface = {
         
     | 
| 
       17383 
17748 
     | 
    
         
             
                /* .get_name                = */ ggml_backend_sycl_name,
         
     | 
| 
       17384 
17749 
     | 
    
         
             
                /* .free                    = */ ggml_backend_sycl_free,
         
     | 
| 
         @@ -17392,7 +17757,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { 
     | 
|
| 
       17392 
17757 
     | 
    
         
             
                /* .graph_plan_compute      = */ NULL,
         
     | 
| 
       17393 
17758 
     | 
    
         
             
                /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
         
     | 
| 
       17394 
17759 
     | 
    
         
             
                /* .supports_op             = */ ggml_backend_sycl_supports_op,
         
     | 
| 
       17395 
     | 
    
         
            -
                /* .offload_op              = */  
     | 
| 
      
 17760 
     | 
    
         
            +
                /* .offload_op              = */ ggml_backend_sycl_offload_op,
         
     | 
| 
       17396 
17761 
     | 
    
         
             
                /* .event_new               = */ NULL,
         
     | 
| 
       17397 
17762 
     | 
    
         
             
                /* .event_free              = */ NULL,
         
     | 
| 
       17398 
17763 
     | 
    
         
             
                /* .event_record            = */ NULL,
         
     | 
| 
         @@ -17406,7 +17771,8 @@ static ggml_guid_t ggml_backend_sycl_guid() { 
     | 
|
| 
       17406 
17771 
     | 
    
         
             
            }
         
     | 
| 
       17407 
17772 
     | 
    
         | 
| 
       17408 
17773 
     | 
    
         
             
            GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
         
     | 
| 
       17409 
     | 
    
         
            -
                 
     | 
| 
      
 17774 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
         
     | 
| 
      
 17775 
     | 
    
         
            +
                ggml_init_sycl();
         
     | 
| 
       17410 
17776 
     | 
    
         | 
| 
       17411 
17777 
     | 
    
         
             
                check_allow_gpu_index(device);
         
     | 
| 
       17412 
17778 
     | 
    
         | 
| 
         @@ -17432,6 +17798,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { 
     | 
|
| 
       17432 
17798 
     | 
    
         
             
            }
         
     | 
| 
       17433 
17799 
     | 
    
         | 
| 
       17434 
17800 
     | 
    
         
             
            GGML_CALL int ggml_backend_sycl_get_device_count() {
         
     | 
| 
      
 17801 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
         
     | 
| 
       17435 
17802 
     | 
    
         
             
                if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
         
     | 
| 
       17436 
17803 
     | 
    
         
             
                return g_sycl_gpu_mgr->get_gpu_count();
         
     | 
| 
       17437 
17804 
     | 
    
         
             
            }
         
     | 
| 
         @@ -17444,16 +17811,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, 
     | 
|
| 
       17444 
17811 
     | 
    
         
             
            }
         
     | 
| 
       17445 
17812 
     | 
    
         | 
| 
       17446 
17813 
     | 
    
         
             
            GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
         
     | 
| 
      
 17814 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
         
     | 
| 
       17447 
17815 
     | 
    
         
             
                return g_sycl_gpu_mgr->get_index(device_id);
         
     | 
| 
       17448 
17816 
     | 
    
         
             
            }
         
     | 
| 
       17449 
17817 
     | 
    
         | 
| 
       17450 
17818 
     | 
    
         
             
            GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
         
     | 
| 
      
 17819 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
         
     | 
| 
       17451 
17820 
     | 
    
         
             
                return g_sycl_gpu_mgr->gpus[device_index];
         
     | 
| 
       17452 
17821 
     | 
    
         
             
            }
         
     | 
| 
       17453 
17822 
     | 
    
         | 
| 
       17454 
17823 
     | 
    
         
             
            GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
         
     | 
| 
       17455 
     | 
    
         
            -
                 
     | 
| 
      
 17824 
     | 
    
         
            +
                ggml_init_sycl();
         
     | 
| 
      
 17825 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
         
     | 
| 
       17456 
17826 
     | 
    
         
             
                fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
         
     | 
| 
      
 17827 
     | 
    
         
            +
                GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
         
     | 
| 
      
 17828 
     | 
    
         
            +
             
     | 
| 
       17457 
17829 
     | 
    
         
             
                if (g_sycl_gpu_mgr) {
         
     | 
| 
       17458 
17830 
     | 
    
         
             
                    delete g_sycl_gpu_mgr;
         
     | 
| 
       17459 
17831 
     | 
    
         
             
                }
         
     | 
| 
         @@ -17464,6 +17836,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id 
     | 
|
| 
       17464 
17836 
     | 
    
         
             
            }
         
     | 
| 
       17465 
17837 
     | 
    
         | 
| 
       17466 
17838 
     | 
    
         
             
            GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
         
     | 
| 
      
 17839 
     | 
    
         
            +
                ggml_init_sycl();
         
     | 
| 
      
 17840 
     | 
    
         
            +
                GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
         
     | 
| 
      
 17841 
     | 
    
         
            +
             
     | 
| 
       17467 
17842 
     | 
    
         
             
                if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
         
     | 
| 
       17468 
17843 
     | 
    
         
             
                    return;
         
     | 
| 
       17469 
17844 
     | 
    
         
             
                }
         
     |