llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
| 
         @@ -11,17 +11,6 @@ extern "C" { 
     | 
|
| 
       11 
11 
     | 
    
         
             
            #define GGML_VK_MAX_DEVICES 16
         
     | 
| 
       12 
12 
     | 
    
         | 
| 
       13 
13 
     | 
    
         
             
            GGML_API void ggml_vk_instance_init(void);
         
     | 
| 
       14 
     | 
    
         
            -
            GGML_API void ggml_vk_init_cpu_assist(void);
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
            GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
         
     | 
| 
       17 
     | 
    
         
            -
            GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
         
     | 
| 
       18 
     | 
    
         
            -
            GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
         
     | 
| 
       19 
     | 
    
         
            -
            GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
         
     | 
| 
       20 
     | 
    
         
            -
            #ifdef GGML_VULKAN_CHECK_RESULTS
         
     | 
| 
       21 
     | 
    
         
            -
            void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
         
     | 
| 
       22 
     | 
    
         
            -
            #endif
         
     | 
| 
       23 
     | 
    
         
            -
            GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
         
     | 
| 
       24 
     | 
    
         
            -
            GGML_API void ggml_vk_free_cpu_assist(void);
         
     | 
| 
       25 
14 
     | 
    
         | 
| 
       26 
15 
     | 
    
         
             
            // backend API
         
     | 
| 
       27 
16 
     | 
    
         
             
            GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
         
     | 
    
        data/vendor/tmp/llama.cpp/ggml.c
    CHANGED
    
    | 
         @@ -3,6 +3,7 @@ 
     | 
|
| 
       3 
3 
     | 
    
         | 
| 
       4 
4 
     | 
    
         
             
            #include "ggml-impl.h"
         
     | 
| 
       5 
5 
     | 
    
         
             
            #include "ggml-quants.h"
         
     | 
| 
      
 6 
     | 
    
         
            +
            #include "ggml.h"
         
     | 
| 
       6 
7 
     | 
    
         | 
| 
       7 
8 
     | 
    
         
             
            #if defined(_MSC_VER) || defined(__MINGW32__)
         
     | 
| 
       8 
9 
     | 
    
         
             
            #include <malloc.h> // using malloc.h with MSC/MINGW
         
     | 
| 
         @@ -43,6 +44,10 @@ 
     | 
|
| 
       43 
44 
     | 
    
         | 
| 
       44 
45 
     | 
    
         
             
            #if defined(_WIN32)
         
     | 
| 
       45 
46 
     | 
    
         | 
| 
      
 47 
     | 
    
         
            +
            #define WIN32_LEAN_AND_MEAN
         
     | 
| 
      
 48 
     | 
    
         
            +
            #ifndef NOMINMAX
         
     | 
| 
      
 49 
     | 
    
         
            +
                #define NOMINMAX
         
     | 
| 
      
 50 
     | 
    
         
            +
            #endif
         
     | 
| 
       46 
51 
     | 
    
         
             
            #include <windows.h>
         
     | 
| 
       47 
52 
     | 
    
         | 
| 
       48 
53 
     | 
    
         
             
            typedef volatile LONG atomic_int;
         
     | 
| 
         @@ -273,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { 
     | 
|
| 
       273 
278 
     | 
    
         
             
            #include <Accelerate/Accelerate.h>
         
     | 
| 
       274 
279 
     | 
    
         
             
            #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
         
     | 
| 
       275 
280 
     | 
    
         
             
            #include "ggml-opencl.h"
         
     | 
| 
       276 
     | 
    
         
            -
            #elif defined(GGML_USE_VULKAN)
         
     | 
| 
       277 
     | 
    
         
            -
            #include "ggml-vulkan.h"
         
     | 
| 
       278 
281 
     | 
    
         
             
            #endif
         
     | 
| 
       279 
282 
     | 
    
         
             
            #elif defined(GGML_USE_OPENBLAS)
         
     | 
| 
       280 
283 
     | 
    
         
             
            #if defined(GGML_BLAS_USE_MKL)
         
     | 
| 
         @@ -284,10 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { 
     | 
|
| 
       284 
287 
     | 
    
         
             
            #endif
         
     | 
| 
       285 
288 
     | 
    
         
             
            #elif defined(GGML_USE_CLBLAST)
         
     | 
| 
       286 
289 
     | 
    
         
             
            #include "ggml-opencl.h"
         
     | 
| 
       287 
     | 
    
         
            -
            #elif defined(GGML_USE_VULKAN)
         
     | 
| 
       288 
     | 
    
         
            -
            #include "ggml-vulkan.h"
         
     | 
| 
       289 
     | 
    
         
            -
            #elif defined(GGML_USE_SYCL)
         
     | 
| 
       290 
     | 
    
         
            -
            #include "ggml-sycl.h"
         
     | 
| 
       291 
290 
     | 
    
         
             
            #endif
         
     | 
| 
       292 
291 
     | 
    
         | 
| 
       293 
292 
     | 
    
         
             
            // floating point type used to accumulate sums
         
     | 
| 
         @@ -339,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { 
     | 
|
| 
       339 
338 
     | 
    
         
             
                return GGML_FP32_TO_FP16(x);
         
     | 
| 
       340 
339 
     | 
    
         
             
            }
         
     | 
| 
       341 
340 
     | 
    
         | 
| 
       342 
     | 
    
         
            -
            void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
     | 
| 
       343 
     | 
    
         
            -
                for (int i = 0; i < n; i++) {
     | 
| 
      
 341 
     | 
    
         
            +
            void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
         
     | 
| 
      
 342 
     | 
    
         
            +
                for (int64_t i = 0; i < n; i++) {
         
     | 
| 
       344 
343 
     | 
    
         
             
                    y[i] = GGML_FP16_TO_FP32(x[i]);
         
     | 
| 
       345 
344 
     | 
    
         
             
                }
         
     | 
| 
       346 
345 
     | 
    
         
             
            }
         
     | 
| 
       347 
346 
     | 
    
         | 
| 
       348 
     | 
    
         
            -
            void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
     | 
| 
       349 
     | 
    
         
            -
                int i = 0;
     | 
| 
      
 347 
     | 
    
         
            +
            void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
         
     | 
| 
      
 348 
     | 
    
         
            +
                int64_t i = 0;
         
     | 
| 
       350 
349 
     | 
    
         
             
            #if defined(__F16C__)
         
     | 
| 
       351 
350 
     | 
    
         
             
                for (; i + 7 < n; i += 8) {
         
     | 
| 
       352 
351 
     | 
    
         
             
                    __m256 x_vec = _mm256_loadu_ps(x + i);
         
     | 
| 
         @@ -430,6 +429,57 @@ int64_t ggml_cycles_per_ms(void) { 
     | 
|
| 
       430 
429 
     | 
    
         
             
            #define ggml_perf_cycles_per_ms() 0
         
     | 
| 
       431 
430 
     | 
    
         
             
            #endif
         
     | 
| 
       432 
431 
     | 
    
         | 
| 
      
 432 
     | 
    
         
            +
            //
         
     | 
| 
      
 433 
     | 
    
         
            +
            // cross-platform UTF-8 file paths
         
     | 
| 
      
 434 
     | 
    
         
            +
            //
         
     | 
| 
      
 435 
     | 
    
         
            +
             
     | 
| 
      
 436 
     | 
    
         
            +
            #ifdef _WIN32
         
     | 
| 
      
 437 
     | 
    
         
            +
            static wchar_t * ggml_mbstowcs(const char * mbs) {
         
     | 
| 
      
 438 
     | 
    
         
            +
                int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
         
     | 
| 
      
 439 
     | 
    
         
            +
                if (!wlen) {
         
     | 
| 
      
 440 
     | 
    
         
            +
                    errno = EINVAL;
         
     | 
| 
      
 441 
     | 
    
         
            +
                    return NULL;
         
     | 
| 
      
 442 
     | 
    
         
            +
                }
         
     | 
| 
      
 443 
     | 
    
         
            +
             
     | 
| 
      
 444 
     | 
    
         
            +
                wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
         
     | 
| 
      
 445 
     | 
    
         
            +
                wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
         
     | 
| 
      
 446 
     | 
    
         
            +
                if (!wlen) {
         
     | 
| 
      
 447 
     | 
    
         
            +
                    GGML_FREE(wbuf);
         
     | 
| 
      
 448 
     | 
    
         
            +
                    errno = EINVAL;
         
     | 
| 
      
 449 
     | 
    
         
            +
                    return NULL;
         
     | 
| 
      
 450 
     | 
    
         
            +
                }
         
     | 
| 
      
 451 
     | 
    
         
            +
             
     | 
| 
      
 452 
     | 
    
         
            +
                return wbuf;
         
     | 
| 
      
 453 
     | 
    
         
            +
            }
         
     | 
| 
      
 454 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 455 
     | 
    
         
            +
             
     | 
| 
      
 456 
     | 
    
         
            +
            FILE * ggml_fopen(const char * fname, const char * mode) {
         
     | 
| 
      
 457 
     | 
    
         
            +
            #ifdef _WIN32
         
     | 
| 
      
 458 
     | 
    
         
            +
                FILE * file = NULL;
         
     | 
| 
      
 459 
     | 
    
         
            +
             
     | 
| 
      
 460 
     | 
    
         
            +
                // convert fname (UTF-8)
         
     | 
| 
      
 461 
     | 
    
         
            +
                wchar_t * wfname = ggml_mbstowcs(fname);
         
     | 
| 
      
 462 
     | 
    
         
            +
                if (wfname) {
         
     | 
| 
      
 463 
     | 
    
         
            +
                    // convert mode (ANSI)
         
     | 
| 
      
 464 
     | 
    
         
            +
                    wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
         
     | 
| 
      
 465 
     | 
    
         
            +
                    wchar_t * wmode_p = wmode;
         
     | 
| 
      
 466 
     | 
    
         
            +
                    do {
         
     | 
| 
      
 467 
     | 
    
         
            +
                        *wmode_p++ = (wchar_t)*mode;
         
     | 
| 
      
 468 
     | 
    
         
            +
                    } while (*mode++);
         
     | 
| 
      
 469 
     | 
    
         
            +
             
     | 
| 
      
 470 
     | 
    
         
            +
                    // open file
         
     | 
| 
      
 471 
     | 
    
         
            +
                    file = _wfopen(wfname, wmode);
         
     | 
| 
      
 472 
     | 
    
         
            +
             
     | 
| 
      
 473 
     | 
    
         
            +
                    GGML_FREE(wfname);
         
     | 
| 
      
 474 
     | 
    
         
            +
                    GGML_FREE(wmode);
         
     | 
| 
      
 475 
     | 
    
         
            +
                }
         
     | 
| 
      
 476 
     | 
    
         
            +
             
     | 
| 
      
 477 
     | 
    
         
            +
                return file;
         
     | 
| 
      
 478 
     | 
    
         
            +
            #else
         
     | 
| 
      
 479 
     | 
    
         
            +
                return fopen(fname, mode);
         
     | 
| 
      
 480 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 481 
     | 
    
         
            +
            }
         
     | 
| 
      
 482 
     | 
    
         
            +
             
     | 
| 
       433 
483 
     | 
    
         
             
            //
         
     | 
| 
       434 
484 
     | 
    
         
             
            // cache line
         
     | 
| 
       435 
485 
     | 
    
         
             
            //
         
     | 
| 
         @@ -740,6 +790,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { 
     | 
|
| 
       740 
790 
     | 
    
         
             
                    .vec_dot_type             = GGML_TYPE_Q8_K,
         
     | 
| 
       741 
791 
     | 
    
         
             
                    .nrows                    = 1,
         
     | 
| 
       742 
792 
     | 
    
         
             
                },
         
     | 
| 
      
 793 
     | 
    
         
            +
                [GGML_TYPE_IQ1_M] = {
         
     | 
| 
      
 794 
     | 
    
         
            +
                    .type_name                = "iq1_m",
         
     | 
| 
      
 795 
     | 
    
         
            +
                    .blck_size                = QK_K,
         
     | 
| 
      
 796 
     | 
    
         
            +
                    .type_size                = sizeof(block_iq1_m),
         
     | 
| 
      
 797 
     | 
    
         
            +
                    .is_quantized             = true,
         
     | 
| 
      
 798 
     | 
    
         
            +
                    .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
         
     | 
| 
      
 799 
     | 
    
         
            +
                    .from_float               = NULL,
         
     | 
| 
      
 800 
     | 
    
         
            +
                    .from_float_reference     = NULL,
         
     | 
| 
      
 801 
     | 
    
         
            +
                    .vec_dot                  = ggml_vec_dot_iq1_m_q8_K,
         
     | 
| 
      
 802 
     | 
    
         
            +
                    .vec_dot_type             = GGML_TYPE_Q8_K,
         
     | 
| 
      
 803 
     | 
    
         
            +
                    .nrows                    = 1,
         
     | 
| 
      
 804 
     | 
    
         
            +
                },
         
     | 
| 
       743 
805 
     | 
    
         
             
                [GGML_TYPE_IQ4_NL] = {
         
     | 
| 
       744 
806 
     | 
    
         
             
                    .type_name                = "iq4_nl",
         
     | 
| 
       745 
807 
     | 
    
         
             
                    .blck_size                = QK4_NL,
         
     | 
| 
         @@ -2485,6 +2547,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { 
     | 
|
| 
       2485 
2547 
     | 
    
         
             
                    case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
         
     | 
| 
       2486 
2548 
     | 
    
         
             
                    case GGML_FTYPE_MOSTLY_IQ3_XXS:       wtype = GGML_TYPE_IQ3_XXS;  break;
         
     | 
| 
       2487 
2549 
     | 
    
         
             
                    case GGML_FTYPE_MOSTLY_IQ1_S:         wtype = GGML_TYPE_IQ1_S;    break;
         
     | 
| 
      
 2550 
     | 
    
         
            +
                    case GGML_FTYPE_MOSTLY_IQ1_M:         wtype = GGML_TYPE_IQ1_M;    break;
         
     | 
| 
       2488 
2551 
     | 
    
         
             
                    case GGML_FTYPE_MOSTLY_IQ4_NL:        wtype = GGML_TYPE_IQ4_NL;   break;
         
     | 
| 
       2489 
2552 
     | 
    
         
             
                    case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
         
     | 
| 
       2490 
2553 
     | 
    
         
             
                    case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
         
     | 
| 
         @@ -2540,6 +2603,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { 
     | 
|
| 
       2540 
2603 
     | 
    
         
             
                    tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
         
     | 
| 
       2541 
2604 
     | 
    
         
             
            }
         
     | 
| 
       2542 
2605 
     | 
    
         | 
| 
      
 2606 
     | 
    
         
            +
            GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
         
     | 
| 
      
 2607 
     | 
    
         
            +
                for (int i = 0; i < GGML_MAX_DIMS; ++i) {
         
     | 
| 
      
 2608 
     | 
    
         
            +
                    if (tensor->ne[i] == 0) {
         
     | 
| 
      
 2609 
     | 
    
         
            +
                        // empty if any dimension has no elements
         
     | 
| 
      
 2610 
     | 
    
         
            +
                        return true;
         
     | 
| 
      
 2611 
     | 
    
         
            +
                    }
         
     | 
| 
      
 2612 
     | 
    
         
            +
                }
         
     | 
| 
      
 2613 
     | 
    
         
            +
                return false;
         
     | 
| 
      
 2614 
     | 
    
         
            +
            }
         
     | 
| 
      
 2615 
     | 
    
         
            +
             
     | 
| 
       2543 
2616 
     | 
    
         
             
            bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
         
     | 
| 
       2544 
2617 
     | 
    
         
             
                static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
         
     | 
| 
       2545 
2618 
     | 
    
         | 
| 
         @@ -2554,7 +2627,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor 
     | 
|
| 
       2554 
2627 
     | 
    
         
             
            static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
         
     | 
| 
       2555 
2628 
     | 
    
         
             
                static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
         
     | 
| 
       2556 
2629 
     | 
    
         | 
| 
       2557 
     | 
    
         
            -
                return
         
     | 
| 
      
 2630 
     | 
    
         
            +
                return ggml_is_empty(t0) ? ggml_is_empty(t1) :
         
     | 
| 
       2558 
2631 
     | 
    
         
             
                    (t1->ne[0]%t0->ne[0] == 0) &&
         
     | 
| 
       2559 
2632 
     | 
    
         
             
                    (t1->ne[1]%t0->ne[1] == 0) &&
         
     | 
| 
       2560 
2633 
     | 
    
         
             
                    (t1->ne[2]%t0->ne[2] == 0) &&
         
     | 
| 
         @@ -2640,10 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { 
     | 
|
| 
       2640 
2713 
     | 
    
         | 
| 
       2641 
2714 
     | 
    
         
             
            #if defined(GGML_USE_CLBLAST)
         
     | 
| 
       2642 
2715 
     | 
    
         
             
                    ggml_cl_init();
         
     | 
| 
       2643 
     | 
    
         
            -
            #elif defined(GGML_USE_VULKAN)
         
     | 
| 
       2644 
     | 
    
         
            -
                    ggml_vk_init_cpu_assist();
         
     | 
| 
       2645 
     | 
    
         
            -
            #elif defined(GGML_USE_SYCL)
         
     | 
| 
       2646 
     | 
    
         
            -
                    ggml_init_sycl();
         
     | 
| 
       2647 
2716 
     | 
    
         
             
            #endif
         
     | 
| 
       2648 
2717 
     | 
    
         | 
| 
       2649 
2718 
     | 
    
         
             
                    ggml_setup_op_has_task_pass();
         
     | 
| 
         @@ -2863,7 +2932,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( 
     | 
|
| 
       2863 
2932 
     | 
    
         
             
                    data_size *= ne[i];
         
     | 
| 
       2864 
2933 
     | 
    
         
             
                }
         
     | 
| 
       2865 
2934 
     | 
    
         | 
| 
       2866 
     | 
    
         
            -
                GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
         
     | 
| 
      
 2935 
     | 
    
         
            +
                GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
         
     | 
| 
       2867 
2936 
     | 
    
         | 
| 
       2868 
2937 
     | 
    
         
             
                void * data = view_src != NULL ? view_src->data : NULL;
         
     | 
| 
       2869 
2938 
     | 
    
         
             
                if (data != NULL) {
         
     | 
| 
         @@ -4504,45 +4573,38 @@ void ggml_mul_mat_set_prec( 
     | 
|
| 
       4504 
4573 
     | 
    
         | 
| 
       4505 
4574 
     | 
    
         
             
            // ggml_mul_mat_id
         
     | 
| 
       4506 
4575 
     | 
    
         | 
| 
      
 4576 
     | 
    
         
            +
            // NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
         
     | 
| 
      
 4577 
     | 
    
         
            +
            //       this will allow computing all the used experts in a single matrix multiplication
         
     | 
| 
       4507 
4578 
     | 
    
         
             
            struct ggml_tensor * ggml_mul_mat_id(
         
     | 
| 
       4508 
4579 
     | 
    
         
             
                    struct ggml_context * ctx,
         
     | 
| 
       4509 
     | 
    
         
            -
                    struct ggml_tensor  * const as[],
     | 
| 
       4510 
     | 
    
         
            -
                    int                   n_as,
         
     | 
| 
      
 4580 
     | 
    
         
            +
                    struct ggml_tensor  * as,
         
     | 
| 
       4511 
4581 
     | 
    
         
             
                    struct ggml_tensor  * ids,
         
     | 
| 
       4512 
4582 
     | 
    
         
             
                    int                   id,
         
     | 
| 
       4513 
4583 
     | 
    
         
             
                    struct ggml_tensor  * b) {
         
     | 
| 
       4514 
4584 
     | 
    
         | 
| 
       4515 
4585 
     | 
    
         
             
                GGML_ASSERT(ids->type == GGML_TYPE_I32);
         
     | 
| 
       4516 
     | 
    
         
            -
                GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
         
     | 
| 
       4517 
     | 
    
         
            -
                GGML_ASSERT(ids->ne[1] == b->ne[1]);
         
     | 
| 
      
 4586 
     | 
    
         
            +
                GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
         
     | 
| 
      
 4587 
     | 
    
         
            +
                GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
         
     | 
| 
       4518 
4588 
     | 
    
         
             
                GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
         
     | 
| 
       4519 
     | 
    
         
            -
                GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
     | 
| 
       4520 
     | 
    
         
            -
                GGML_ASSERT(id >= 0 && id < ids->ne[0]);
     | 
| 
      
 4589 
     | 
    
         
            +
                GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
         
     | 
| 
      
 4590 
     | 
    
         
            +
                GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
         
     | 
| 
       4521 
4591 
     | 
    
         | 
| 
       4522 
4592 
     | 
    
         
             
                bool is_node = false;
         
     | 
| 
       4523 
4593 
     | 
    
         | 
| 
       4524 
     | 
    
         
            -
                if (as[0]->grad || b->grad) {
     | 
| 
      
 4594 
     | 
    
         
            +
                if (as->grad || b->grad) {
         
     | 
| 
       4525 
4595 
     | 
    
         
             
                    is_node = true;
         
     | 
| 
       4526 
4596 
     | 
    
         
             
                }
         
     | 
| 
       4527 
4597 
     | 
    
         | 
| 
       4528 
     | 
    
         
            -
                const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
     | 
| 
      
 4598 
     | 
    
         
            +
                const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
         
     | 
| 
       4529 
4599 
     | 
    
         
             
                struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
         
     | 
| 
       4530 
4600 
     | 
    
         | 
| 
       4531 
4601 
     | 
    
         
             
                ggml_set_op_params_i32(result, 0, id);
         
     | 
| 
       4532 
     | 
    
         
            -
                ggml_set_op_params_i32(result, 1, n_as);
         
     | 
| 
       4533 
4602 
     | 
    
         | 
| 
       4534 
4603 
     | 
    
         
             
                result->op   = GGML_OP_MUL_MAT_ID;
         
     | 
| 
       4535 
4604 
     | 
    
         
             
                result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
         
     | 
| 
       4536 
     | 
    
         
            -
                result->src[0] = ids;
     | 
| 
      
 4605 
     | 
    
         
            +
                result->src[0] = as;
         
     | 
| 
       4537 
4606 
     | 
    
         
             
                result->src[1] = b;
         
     | 
| 
       4538 
     | 
    
         
            -
             
     | 
| 
       4539 
     | 
    
         
            -
                for (int i = 0; i < n_as; i++) {
         
     | 
| 
       4540 
     | 
    
         
            -
                    struct ggml_tensor * a = as[i];
         
     | 
| 
       4541 
     | 
    
         
            -
                    GGML_ASSERT(ggml_are_same_shape(as[0], a));
         
     | 
| 
       4542 
     | 
    
         
            -
                    GGML_ASSERT(ggml_can_mul_mat(a, b));
         
     | 
| 
       4543 
     | 
    
         
            -
                    GGML_ASSERT(!ggml_is_transposed(a));
         
     | 
| 
       4544 
     | 
    
         
            -
                    result->src[i + 2] = a;
         
     | 
| 
       4545 
     | 
    
         
            -
                }
         
     | 
| 
      
 4607 
     | 
    
         
            +
                result->src[2] = ids;
         
     | 
| 
       4546 
4608 
     | 
    
         | 
| 
       4547 
4609 
     | 
    
         
             
                return result;
         
     | 
| 
       4548 
4610 
     | 
    
         
             
            }
         
     | 
| 
         @@ -8083,6 +8145,7 @@ static void ggml_compute_forward_add( 
     | 
|
| 
       8083 
8145 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       8084 
8146 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       8085 
8147 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 8148 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       8086 
8149 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       8087 
8150 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       8088 
8151 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -8365,6 +8428,7 @@ static void ggml_compute_forward_add1( 
     | 
|
| 
       8365 
8428 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       8366 
8429 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       8367 
8430 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 8431 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       8368 
8432 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       8369 
8433 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       8370 
8434 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -8492,6 +8556,7 @@ static void ggml_compute_forward_acc( 
     | 
|
| 
       8492 
8556 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       8493 
8557 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       8494 
8558 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 8559 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       8495 
8560 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       8496 
8561 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       8497 
8562 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -10876,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id( 
     | 
|
| 
       10876 
10941 
     | 
    
         
             
                    const struct ggml_compute_params * params,
         
     | 
| 
       10877 
10942 
     | 
    
         
             
                          struct ggml_tensor * dst) {
         
     | 
| 
       10878 
10943 
     | 
    
         | 
| 
       10879 
     | 
    
         
            -
                const struct ggml_tensor *  
     | 
| 
      
 10944 
     | 
    
         
            +
                const struct ggml_tensor * src0 = dst->src[0];
         
     | 
| 
       10880 
10945 
     | 
    
         
             
                const struct ggml_tensor * src1 = dst->src[1];
         
     | 
| 
       10881 
     | 
    
         
            -
             
     | 
| 
       10882 
     | 
    
         
            -
                const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
         
     | 
| 
      
 10946 
     | 
    
         
            +
                const struct ggml_tensor * ids = dst->src[2];
         
     | 
| 
       10883 
10947 
     | 
    
         | 
| 
       10884 
10948 
     | 
    
         
             
                GGML_TENSOR_BINARY_OP_LOCALS
         
     | 
| 
       10885 
10949 
     | 
    
         | 
| 
         @@ -10909,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id( 
     | 
|
| 
       10909 
10973 
     | 
    
         
             
                GGML_ASSERT(nb1 <= nb2);
         
     | 
| 
       10910 
10974 
     | 
    
         
             
                GGML_ASSERT(nb2 <= nb3);
         
     | 
| 
       10911 
10975 
     | 
    
         | 
| 
       10912 
     | 
    
         
            -
                // broadcast  
     | 
| 
       10913 
     | 
    
         
            -
                 
     | 
| 
       10914 
     | 
    
         
            -
                 
     | 
| 
      
 10976 
     | 
    
         
            +
                // broadcast is not supported with mmid
         
     | 
| 
      
 10977 
     | 
    
         
            +
                assert(ne12 == 1);
         
     | 
| 
      
 10978 
     | 
    
         
            +
                assert(ne13 == 1);
         
     | 
| 
       10915 
10979 
     | 
    
         | 
| 
       10916 
10980 
     | 
    
         
             
                // row groups
         
     | 
| 
       10917 
10981 
     | 
    
         
             
                const int id   = ggml_get_op_params_i32(dst, 0);
         
     | 
| 
       10918 
     | 
    
         
            -
                const int n_as =  
     | 
| 
      
 10982 
     | 
    
         
            +
                const int n_as = src0->ne[2];
         
     | 
| 
       10919 
10983 
     | 
    
         | 
| 
       10920 
10984 
     | 
    
         
             
                char * wdata_src1_end = (src1->type == vec_dot_type) ?
         
     | 
| 
       10921 
10985 
     | 
    
         
             
                        (char *) params->wdata :
         
     | 
| 
         @@ -10975,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id( 
     | 
|
| 
       10975 
11039 
     | 
    
         
             
                        continue;
         
     | 
| 
       10976 
11040 
     | 
    
         
             
                    }
         
     | 
| 
       10977 
11041 
     | 
    
         | 
| 
       10978 
     | 
    
         
            -
                     
     | 
| 
      
 11042 
     | 
    
         
            +
                    size_t src0_offset = cur_a*src0->nb[2];
         
     | 
| 
       10979 
11043 
     | 
    
         | 
| 
       10980 
11044 
     | 
    
         
             
                    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         
     | 
| 
       10981 
11045 
     | 
    
         
             
                    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
         
     | 
| 
         @@ -11010,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id( 
     | 
|
| 
       11010 
11074 
     | 
    
         
             
                        continue;
         
     | 
| 
       11011 
11075 
     | 
    
         
             
                    }
         
     | 
| 
       11012 
11076 
     | 
    
         | 
| 
       11013 
     | 
    
         
            -
                    assert(ne12 % ne02 == 0);
         
     | 
| 
       11014 
     | 
    
         
            -
                    assert(ne13 % ne03 == 0);
         
     | 
| 
       11015 
     | 
    
         
            -
             
     | 
| 
       11016 
11077 
     | 
    
         
             
                    // block-tiling attempt
         
     | 
| 
       11017 
11078 
     | 
    
         
             
                    const int64_t blck_0 = 16;
         
     | 
| 
       11018 
11079 
     | 
    
         
             
                    const int64_t blck_1 = 16;
         
     | 
| 
         @@ -11029,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id( 
     | 
|
| 
       11029 
11090 
     | 
    
         
             
                                const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
         
     | 
| 
       11030 
11091 
     | 
    
         | 
| 
       11031 
11092 
     | 
    
         
             
                                // broadcast src0 into src1
         
     | 
| 
       11032 
     | 
    
         
            -
                                const int64_t i03 = i13/r3;
         
     | 
| 
       11033 
     | 
    
         
            -
                                const int64_t i02 = i12/r2;
         
     | 
| 
      
 11093 
     | 
    
         
            +
                                //const int64_t i03 = i13/r3;
         
     | 
| 
      
 11094 
     | 
    
         
            +
                                //const int64_t i02 = i12/r2;
         
     | 
| 
       11034 
11095 
     | 
    
         | 
| 
       11035 
11096 
     | 
    
         
             
                                const int64_t i1 = i11;
         
     | 
| 
       11036 
11097 
     | 
    
         
             
                                const int64_t i2 = i12;
         
     | 
| 
       11037 
11098 
     | 
    
         
             
                                const int64_t i3 = i13;
         
     | 
| 
       11038 
11099 
     | 
    
         | 
| 
       11039 
     | 
    
         
            -
                                const char * src0_row = (const char *)  
     | 
| 
      
 11100 
     | 
    
         
            +
                                const char * src0_row = (const char *) src0->data + src0_offset;
         
     | 
| 
       11040 
11101 
     | 
    
         | 
| 
       11041 
11102 
     | 
    
         
             
                                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
         
     | 
| 
       11042 
11103 
     | 
    
         
             
                                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
         
     | 
| 
         @@ -11395,6 +11456,7 @@ static void ggml_compute_forward_out_prod( 
     | 
|
| 
       11395 
11456 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       11396 
11457 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       11397 
11458 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 11459 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       11398 
11460 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       11399 
11461 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       11400 
11462 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -11586,6 +11648,7 @@ static void ggml_compute_forward_set( 
     | 
|
| 
       11586 
11648 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       11587 
11649 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       11588 
11650 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 11651 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       11589 
11652 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       11590 
11653 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       11591 
11654 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -11809,6 +11872,7 @@ static void ggml_compute_forward_get_rows( 
     | 
|
| 
       11809 
11872 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       11810 
11873 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       11811 
11874 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 11875 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       11812 
11876 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       11813 
11877 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       11814 
11878 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -12512,6 +12576,7 @@ static void ggml_compute_forward_alibi( 
     | 
|
| 
       12512 
12576 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       12513 
12577 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       12514 
12578 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 12579 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       12515 
12580 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       12516 
12581 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       12517 
12582 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -12600,6 +12665,7 @@ static void ggml_compute_forward_clamp( 
     | 
|
| 
       12600 
12665 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       12601 
12666 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS:
         
     | 
| 
       12602 
12667 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 12668 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:
         
     | 
| 
       12603 
12669 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:
         
     | 
| 
       12604 
12670 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:
         
     | 
| 
       12605 
12671 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:
         
     | 
| 
         @@ -16041,30 +16107,10 @@ static void ggml_compute_forward_cross_entropy_loss_back( 
     | 
|
| 
       16041 
16107 
     | 
    
         
             
            static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         
     | 
| 
       16042 
16108 
     | 
    
         
             
                GGML_ASSERT(params);
         
     | 
| 
       16043 
16109 
     | 
    
         | 
| 
       16044 
     | 
    
         
            -
                if (tensor->op == GGML_OP_NONE) {
         
     | 
| 
       16045 
     | 
    
         
            -
                    return;
         
     | 
| 
       16046 
     | 
    
         
            -
                }
         
     | 
| 
       16047 
     | 
    
         
            -
             
     | 
| 
       16048 
     | 
    
         
            -
            #if defined(GGML_USE_VULKAN)
         
     | 
| 
       16049 
     | 
    
         
            -
                const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
         
     | 
| 
       16050 
     | 
    
         
            -
            #ifdef GGML_VULKAN_CHECK_RESULTS
         
     | 
| 
       16051 
     | 
    
         
            -
                if (skip_cpu) {
         
     | 
| 
       16052 
     | 
    
         
            -
                    ggml_vk_check_results_1_cpu_assist(params, tensor);
         
     | 
| 
       16053 
     | 
    
         
            -
                }
         
     | 
| 
       16054 
     | 
    
         
            -
            #endif
         
     | 
| 
       16055 
     | 
    
         
            -
                if (skip_cpu) {
         
     | 
| 
      
 16110 
     | 
    
         
            +
                if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
         
     | 
| 
       16056 
16111 
     | 
    
         
             
                    return;
         
     | 
| 
       16057 
16112 
     | 
    
         
             
                }
         
     | 
| 
       16058 
     | 
    
         
            -
                GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
         
     | 
| 
       16059 
     | 
    
         
            -
                GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
         
     | 
| 
       16060 
     | 
    
         
            -
            #endif // GGML_USE_VULKAN
         
     | 
| 
       16061 
16113 
     | 
    
         | 
| 
       16062 
     | 
    
         
            -
            #ifdef GGML_USE_SYCL
         
     | 
| 
       16063 
     | 
    
         
            -
                bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
         
     | 
| 
       16064 
     | 
    
         
            -
                if (skip_cpu) {
         
     | 
| 
       16065 
     | 
    
         
            -
                    return;
         
     | 
| 
       16066 
     | 
    
         
            -
                }
         
     | 
| 
       16067 
     | 
    
         
            -
            #endif // GGML_USE_SYCL
         
     | 
| 
       16068 
16114 
     | 
    
         
             
                switch (tensor->op) {
         
     | 
| 
       16069 
16115 
     | 
    
         
             
                    case GGML_OP_DUP:
         
     | 
| 
       16070 
16116 
     | 
    
         
             
                        {
         
     | 
| 
         @@ -17916,6 +17962,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const 
     | 
|
| 
       17916 
17962 
     | 
    
         
             
            static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
         
     | 
| 
       17917 
17963 
     | 
    
         
             
                int n_tasks = 0;
         
     | 
| 
       17918 
17964 
     | 
    
         | 
| 
      
 17965 
     | 
    
         
            +
                if (ggml_is_empty(node)) {
         
     | 
| 
      
 17966 
     | 
    
         
            +
                    // no need to multi-thread a no-op
         
     | 
| 
      
 17967 
     | 
    
         
            +
                    n_tasks = 1;
         
     | 
| 
      
 17968 
     | 
    
         
            +
                    return n_tasks;
         
     | 
| 
      
 17969 
     | 
    
         
            +
                }
         
     | 
| 
      
 17970 
     | 
    
         
            +
             
     | 
| 
       17919 
17971 
     | 
    
         
             
                switch (node->op) {
         
     | 
| 
       17920 
17972 
     | 
    
         
             
                    case GGML_OP_CPY:
         
     | 
| 
       17921 
17973 
     | 
    
         
             
                    case GGML_OP_DUP:
         
     | 
| 
         @@ -18401,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa 
     | 
|
| 
       18401 
18453 
     | 
    
         
             
                        case GGML_OP_MUL_MAT_ID:
         
     | 
| 
       18402 
18454 
     | 
    
         
             
                            {
         
     | 
| 
       18403 
18455 
     | 
    
         
             
                                cur = 0;
         
     | 
| 
       18404 
     | 
    
         
            -
                                const struct ggml_tensor * src0 = node->src[ 
     | 
| 
      
 18456 
     | 
    
         
            +
                                const struct ggml_tensor * src0 = node->src[0];
         
     | 
| 
       18405 
18457 
     | 
    
         
             
                                const struct ggml_tensor * src1 = node->src[1];
         
     | 
| 
       18406 
18458 
     | 
    
         
             
                                const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
         
     | 
| 
       18407 
18459 
     | 
    
         
             
                                if (src1->type != vec_dot_type) {
         
     | 
| 
       18408 
18460 
     | 
    
         
             
                                    cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
         
     | 
| 
       18409 
18461 
     | 
    
         
             
                                }
         
     | 
| 
       18410 
     | 
    
         
            -
                                const int n_as =  
     | 
| 
      
 18462 
     | 
    
         
            +
                                const int n_as = src0->ne[2];
         
     | 
| 
       18411 
18463 
     | 
    
         
             
                                cur += GGML_PAD(cur, sizeof(int64_t));       // align
         
     | 
| 
       18412 
18464 
     | 
    
         
             
                                cur += n_as * sizeof(int64_t);               // matrix_row_counts
         
     | 
| 
       18413 
18465 
     | 
    
         
             
                                cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
         
     | 
| 
         @@ -18534,17 +18586,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl 
     | 
|
| 
       18534 
18586 
     | 
    
         
             
                    }
         
     | 
| 
       18535 
18587 
     | 
    
         
             
                }
         
     | 
| 
       18536 
18588 
     | 
    
         | 
| 
       18537 
     | 
    
         
            -
            #ifdef GGML_USE_VULKAN
         
     | 
| 
       18538 
     | 
    
         
            -
                for (int i = 0; i < cgraph->n_nodes; i++) {
         
     | 
| 
       18539 
     | 
    
         
            -
                    ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
         
     | 
| 
       18540 
     | 
    
         
            -
                }
         
     | 
| 
       18541 
     | 
    
         
            -
                ggml_vk_preallocate_buffers_cpu_assist();
         
     | 
| 
       18542 
     | 
    
         
            -
             
     | 
| 
       18543 
     | 
    
         
            -
                for (int i = 0; i < cgraph->n_nodes; i++) {
         
     | 
| 
       18544 
     | 
    
         
            -
                    ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
         
     | 
| 
       18545 
     | 
    
         
            -
                }
         
     | 
| 
       18546 
     | 
    
         
            -
            #endif
         
     | 
| 
       18547 
     | 
    
         
            -
             
     | 
| 
       18548 
18589 
     | 
    
         
             
                const int n_threads = cplan->n_threads;
         
     | 
| 
       18549 
18590 
     | 
    
         | 
| 
       18550 
18591 
     | 
    
         
             
                struct ggml_compute_state_shared state_shared = {
         
     | 
| 
         @@ -18601,10 +18642,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl 
     | 
|
| 
       18601 
18642 
     | 
    
         
             
                    }
         
     | 
| 
       18602 
18643 
     | 
    
         
             
                }
         
     | 
| 
       18603 
18644 
     | 
    
         | 
| 
       18604 
     | 
    
         
            -
            #ifdef GGML_USE_VULKAN
         
     | 
| 
       18605 
     | 
    
         
            -
                ggml_vk_graph_cleanup_cpu_assist();
         
     | 
| 
       18606 
     | 
    
         
            -
            #endif
         
     | 
| 
       18607 
     | 
    
         
            -
             
     | 
| 
       18608 
18645 
     | 
    
         
             
                // performance stats (graph)
         
     | 
| 
       18609 
18646 
     | 
    
         
             
                {
         
     | 
| 
       18610 
18647 
     | 
    
         
             
                    int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
         
     | 
| 
         @@ -18739,7 +18776,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { 
     | 
|
| 
       18739 
18776 
     | 
    
         | 
| 
       18740 
18777 
     | 
    
         
             
                // write binary data
         
     | 
| 
       18741 
18778 
     | 
    
         
             
                {
         
     | 
| 
       18742 
     | 
    
         
            -
                    FILE * fout =  
     | 
| 
      
 18779 
     | 
    
         
            +
                    FILE * fout = ggml_fopen(fname, "wb");
         
     | 
| 
       18743 
18780 
     | 
    
         | 
| 
       18744 
18781 
     | 
    
         
             
                    if (!fout) {
         
     | 
| 
       18745 
18782 
     | 
    
         
             
                        fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
         
     | 
| 
         @@ -18877,7 +18914,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * 
     | 
|
| 
       18877 
18914 
     | 
    
         | 
| 
       18878 
18915 
     | 
    
         
             
                // read file into data
         
     | 
| 
       18879 
18916 
     | 
    
         
             
                {
         
     | 
| 
       18880 
     | 
    
         
            -
                    FILE * fin =  
     | 
| 
      
 18917 
     | 
    
         
            +
                    FILE * fin = ggml_fopen(fname, "rb");
         
     | 
| 
       18881 
18918 
     | 
    
         
             
                    if (!fin) {
         
     | 
| 
       18882 
18919 
     | 
    
         
             
                        fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
         
     | 
| 
       18883 
18920 
     | 
    
         
             
                        return result;
         
     | 
| 
         @@ -19213,7 +19250,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, 
     | 
|
| 
       19213 
19250 
     | 
    
         
             
            void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
         
     | 
| 
       19214 
19251 
     | 
    
         
             
                char color[16];
         
     | 
| 
       19215 
19252 
     | 
    
         | 
| 
       19216 
     | 
    
         
            -
                FILE * fp =  
     | 
| 
      
 19253 
     | 
    
         
            +
                FILE * fp = ggml_fopen(filename, "w");
         
     | 
| 
       19217 
19254 
     | 
    
         
             
                GGML_ASSERT(fp);
         
     | 
| 
       19218 
19255 
     | 
    
         | 
| 
       19219 
19256 
     | 
    
         
             
                fprintf(fp, "digraph G {\n");
         
     | 
| 
         @@ -20260,7 +20297,8 @@ void ggml_quantize_init(enum ggml_type type) { 
     | 
|
| 
       20260 
20297 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XXS:
         
     | 
| 
       20261 
20298 
     | 
    
         
             
                    case GGML_TYPE_IQ2_XS:
         
     | 
| 
       20262 
20299 
     | 
    
         
             
                    case GGML_TYPE_IQ2_S:
         
     | 
| 
       20263 
     | 
    
         
            -
                    case GGML_TYPE_IQ1_S: 
     | 
| 
      
 20300 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_S:
         
     | 
| 
      
 20301 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:   iq2xs_init_impl(type); break;
         
     | 
| 
       20264 
20302 
     | 
    
         
             
                    case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
         
     | 
| 
       20265 
20303 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:   iq3xs_init_impl(512); break;
         
     | 
| 
       20266 
20304 
     | 
    
         
             
                    default: // nothing
         
     | 
| 
         @@ -20285,18 +20323,19 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) { 
     | 
|
| 
       20285 
20323 
     | 
    
         
             
                return
         
     | 
| 
       20286 
20324 
     | 
    
         
             
                    type == GGML_TYPE_IQ2_XXS ||
         
     | 
| 
       20287 
20325 
     | 
    
         
             
                    type == GGML_TYPE_IQ2_XS  ||
         
     | 
| 
       20288 
     | 
    
         
            -
                    type == GGML_TYPE_IQ1_S 
     | 
| 
      
 20326 
     | 
    
         
            +
                    type == GGML_TYPE_IQ1_S;//   ||
         
     | 
| 
      
 20327 
     | 
    
         
            +
                    //type == GGML_TYPE_IQ1_M;
         
     | 
| 
       20289 
20328 
     | 
    
         
             
            }
         
     | 
| 
       20290 
20329 
     | 
    
         | 
| 
       20291 
20330 
     | 
    
         
             
            size_t ggml_quantize_chunk(
         
     | 
| 
       20292 
20331 
     | 
    
         
             
                    enum ggml_type   type,
         
     | 
| 
       20293 
20332 
     | 
    
         
             
                       const float * src,
         
     | 
| 
       20294 
20333 
     | 
    
         
             
                              void * dst,
         
     | 
| 
       20295 
     | 
    
         
            -
             
     | 
| 
       20296 
     | 
    
         
            -
             
     | 
| 
       20297 
     | 
    
         
            -
             
     | 
| 
      
 20334 
     | 
    
         
            +
                           int64_t   start,
         
     | 
| 
      
 20335 
     | 
    
         
            +
                           int64_t   nrows,
         
     | 
| 
      
 20336 
     | 
    
         
            +
                           int64_t   n_per_row,
         
     | 
| 
       20298 
20337 
     | 
    
         
             
                       const float * imatrix) {
         
     | 
| 
       20299 
     | 
    
         
            -
                const  
     | 
| 
      
 20338 
     | 
    
         
            +
                const int64_t n = (int64_t) nrows * n_per_row;
         
     | 
| 
       20300 
20339 
     | 
    
         | 
| 
       20301 
20340 
     | 
    
         
             
                if (ggml_quantize_requires_imatrix(type)) {
         
     | 
| 
       20302 
20341 
     | 
    
         
             
                    GGML_ASSERT(imatrix != NULL);
         
     | 
| 
         @@ -20329,6 +20368,7 @@ size_t ggml_quantize_chunk( 
     | 
|
| 
       20329 
20368 
     | 
    
         
             
                    case GGML_TYPE_IQ3_S:   result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         
     | 
| 
       20330 
20369 
     | 
    
         
             
                    case GGML_TYPE_IQ2_S:   result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         
     | 
| 
       20331 
20370 
     | 
    
         
             
                    case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         
     | 
| 
      
 20371 
     | 
    
         
            +
                    case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         
     | 
| 
       20332 
20372 
     | 
    
         
             
                    case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         
     | 
| 
       20333 
20373 
     | 
    
         
             
            #if QK_K == 64
         
     | 
| 
       20334 
20374 
     | 
    
         
             
                    case GGML_TYPE_IQ4_XS:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         
     | 
| 
         @@ -20531,7 +20571,7 @@ struct gguf_context * gguf_init_empty(void) { 
     | 
|
| 
       20531 
20571 
     | 
    
         
             
            }
         
     | 
| 
       20532 
20572 
     | 
    
         | 
| 
       20533 
20573 
     | 
    
         
             
            struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
         
     | 
| 
       20534 
     | 
    
         
            -
                FILE * file =  
     | 
| 
      
 20574 
     | 
    
         
            +
                FILE * file = ggml_fopen(fname, "rb");
         
     | 
| 
       20535 
20575 
     | 
    
         
             
                if (!file) {
         
     | 
| 
       20536 
20576 
     | 
    
         
             
                    return NULL;
         
     | 
| 
       20537 
20577 
     | 
    
         
             
                }
         
     | 
| 
         @@ -21486,7 +21526,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * 
     | 
|
| 
       21486 
21526 
     | 
    
         
             
            }
         
     | 
| 
       21487 
21527 
     | 
    
         | 
| 
       21488 
21528 
     | 
    
         
             
            void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
         
     | 
| 
       21489 
     | 
    
         
            -
                FILE * file =  
     | 
| 
      
 21529 
     | 
    
         
            +
                FILE * file = ggml_fopen(fname, "wb");
         
     | 
| 
       21490 
21530 
     | 
    
         
             
                if (!file) {
         
     | 
| 
       21491 
21531 
     | 
    
         
             
                    GGML_ASSERT(false && "failed to open file for writing");
         
     | 
| 
       21492 
21532 
     | 
    
         
             
                }
         
     | 
| 
         @@ -21628,15 +21668,15 @@ int ggml_cpu_has_wasm_simd(void) { 
     | 
|
| 
       21628 
21668 
     | 
    
         
             
            }
         
     | 
| 
       21629 
21669 
     | 
    
         | 
| 
       21630 
21670 
     | 
    
         
             
            int ggml_cpu_has_blas(void) {
         
     | 
| 
       21631 
     | 
    
         
            -
            #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined( 
     | 
| 
      
 21671 
     | 
    
         
            +
            #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
         
     | 
| 
       21632 
21672 
     | 
    
         
             
                return 1;
         
     | 
| 
       21633 
21673 
     | 
    
         
             
            #else
         
     | 
| 
       21634 
21674 
     | 
    
         
             
                return 0;
         
     | 
| 
       21635 
21675 
     | 
    
         
             
            #endif
         
     | 
| 
       21636 
21676 
     | 
    
         
             
            }
         
     | 
| 
       21637 
21677 
     | 
    
         | 
| 
       21638 
     | 
    
         
            -
            int  
     | 
| 
       21639 
     | 
    
         
            -
            #if defined( 
     | 
| 
      
 21678 
     | 
    
         
            +
            int ggml_cpu_has_cuda(void) {
         
     | 
| 
      
 21679 
     | 
    
         
            +
            #if defined(GGML_USE_CUDA)
         
     | 
| 
       21640 
21680 
     | 
    
         
             
                return 1;
         
     | 
| 
       21641 
21681 
     | 
    
         
             
            #else
         
     | 
| 
       21642 
21682 
     | 
    
         
             
                return 0;
         
     | 
| 
         @@ -21676,7 +21716,7 @@ int ggml_cpu_has_sycl(void) { 
     | 
|
| 
       21676 
21716 
     | 
    
         
             
            }
         
     | 
| 
       21677 
21717 
     | 
    
         | 
| 
       21678 
21718 
     | 
    
         
             
            int ggml_cpu_has_gpublas(void) {
         
     | 
| 
       21679 
     | 
    
         
            -
                return  
     | 
| 
      
 21719 
     | 
    
         
            +
                return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
         
     | 
| 
       21680 
21720 
     | 
    
         
             
                       ggml_cpu_has_sycl();
         
     | 
| 
       21681 
21721 
     | 
    
         
             
            }
         
     | 
| 
       21682 
21722 
     | 
    
         | 
    
        data/vendor/tmp/llama.cpp/ggml.h
    CHANGED
    
    | 
         @@ -214,9 +214,10 @@ 
     | 
|
| 
       214 
214 
     | 
    
         
             
            #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
         
     | 
| 
       215 
215 
     | 
    
         
             
            #endif
         
     | 
| 
       216 
216 
     | 
    
         | 
| 
       217 
     | 
    
         
            -
            #include <stdint.h>
         
     | 
| 
       218 
     | 
    
         
            -
            #include <stddef.h>
         
     | 
| 
       219 
217 
     | 
    
         
             
            #include <stdbool.h>
         
     | 
| 
      
 218 
     | 
    
         
            +
            #include <stddef.h>
         
     | 
| 
      
 219 
     | 
    
         
            +
            #include <stdint.h>
         
     | 
| 
      
 220 
     | 
    
         
            +
            #include <stdio.h>
         
     | 
| 
       220 
221 
     | 
    
         | 
| 
       221 
222 
     | 
    
         
             
            #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
         
     | 
| 
       222 
223 
     | 
    
         
             
            #define GGML_FILE_VERSION 1
         
     | 
| 
         @@ -331,8 +332,8 @@ extern "C" { 
     | 
|
| 
       331 
332 
     | 
    
         
             
                GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
         
     | 
| 
       332 
333 
     | 
    
         
             
                GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
         
     | 
| 
       333 
334 
     | 
    
         | 
| 
       334 
     | 
    
         
            -
                GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,  
     | 
| 
       335 
     | 
    
         
            -
                GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,  
     | 
| 
      
 335 
     | 
    
         
            +
                GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
         
     | 
| 
      
 336 
     | 
    
         
            +
                GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
         
     | 
| 
       336 
337 
     | 
    
         | 
| 
       337 
338 
     | 
    
         
             
                struct ggml_object;
         
     | 
| 
       338 
339 
     | 
    
         
             
                struct ggml_context;
         
     | 
| 
         @@ -368,6 +369,7 @@ extern "C" { 
     | 
|
| 
       368 
369 
     | 
    
         
             
                    GGML_TYPE_I32     = 26,
         
     | 
| 
       369 
370 
     | 
    
         
             
                    GGML_TYPE_I64     = 27,
         
     | 
| 
       370 
371 
     | 
    
         
             
                    GGML_TYPE_F64     = 28,
         
     | 
| 
      
 372 
     | 
    
         
            +
                    GGML_TYPE_IQ1_M   = 29,
         
     | 
| 
       371 
373 
     | 
    
         
             
                    GGML_TYPE_COUNT,
         
     | 
| 
       372 
374 
     | 
    
         
             
                };
         
     | 
| 
       373 
375 
     | 
    
         | 
| 
         @@ -407,6 +409,7 @@ extern "C" { 
     | 
|
| 
       407 
409 
     | 
    
         
             
                    GGML_FTYPE_MOSTLY_IQ3_S   = 20, // except 1d tensors
         
     | 
| 
       408 
410 
     | 
    
         
             
                    GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
         
     | 
| 
       409 
411 
     | 
    
         
             
                    GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
         
     | 
| 
      
 412 
     | 
    
         
            +
                    GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
         
     | 
| 
       410 
413 
     | 
    
         
             
                };
         
     | 
| 
       411 
414 
     | 
    
         | 
| 
       412 
415 
     | 
    
         
             
                // available tensor operations:
         
     | 
| 
         @@ -708,6 +711,9 @@ extern "C" { 
     | 
|
| 
       708 
711 
     | 
    
         | 
| 
       709 
712 
     | 
    
         
             
                GGML_API void    ggml_print_backtrace(void);
         
     | 
| 
       710 
713 
     | 
    
         | 
| 
      
 714 
     | 
    
         
            +
                // accepts a UTF-8 path, even on Windows
         
     | 
| 
      
 715 
     | 
    
         
            +
                GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
         
     | 
| 
      
 716 
     | 
    
         
            +
             
     | 
| 
       711 
717 
     | 
    
         
             
                GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
         
     | 
| 
       712 
718 
     | 
    
         
             
                GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
         
     | 
| 
       713 
719 
     | 
    
         | 
| 
         @@ -744,6 +750,7 @@ extern "C" { 
     | 
|
| 
       744 
750 
     | 
    
         
             
                GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
         
     | 
| 
       745 
751 
     | 
    
         
             
                GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
         
     | 
| 
       746 
752 
     | 
    
         
             
                GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
         
     | 
| 
      
 753 
     | 
    
         
            +
                GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
         
     | 
| 
       747 
754 
     | 
    
         
             
                GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
         
     | 
| 
       748 
755 
     | 
    
         
             
                GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
         
     | 
| 
       749 
756 
     | 
    
         
             
                GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
         
     | 
| 
         @@ -1157,8 +1164,7 @@ extern "C" { 
     | 
|
| 
       1157 
1164 
     | 
    
         
             
                //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
         
     | 
| 
       1158 
1165 
     | 
    
         
             
                GGML_API struct ggml_tensor * ggml_mul_mat_id(
         
     | 
| 
       1159 
1166 
     | 
    
         
             
                        struct ggml_context * ctx,
         
     | 
| 
       1160 
     | 
    
         
            -
                        struct ggml_tensor  *  
     | 
| 
       1161 
     | 
    
         
            -
                        int                   n_as,
         
     | 
| 
      
 1167 
     | 
    
         
            +
                        struct ggml_tensor  * as,
         
     | 
| 
       1162 
1168 
     | 
    
         
             
                        struct ggml_tensor  * ids,
         
     | 
| 
       1163 
1169 
     | 
    
         
             
                        int                   id,
         
     | 
| 
       1164 
1170 
     | 
    
         
             
                        struct ggml_tensor  * b);
         
     | 
| 
         @@ -2204,9 +2210,9 @@ extern "C" { 
     | 
|
| 
       2204 
2210 
     | 
    
         
             
                        enum ggml_type   type,
         
     | 
| 
       2205 
2211 
     | 
    
         
             
                           const float * src,
         
     | 
| 
       2206 
2212 
     | 
    
         
             
                                  void * dst,
         
     | 
| 
       2207 
     | 
    
         
            -
             
     | 
| 
       2208 
     | 
    
         
            -
             
     | 
| 
       2209 
     | 
    
         
            -
             
     | 
| 
      
 2213 
     | 
    
         
            +
                               int64_t   start,
         
     | 
| 
      
 2214 
     | 
    
         
            +
                               int64_t   nrows,
         
     | 
| 
      
 2215 
     | 
    
         
            +
                               int64_t   n_per_row,
         
     | 
| 
       2210 
2216 
     | 
    
         
             
                           const float * imatrix);
         
     | 
| 
       2211 
2217 
     | 
    
         | 
| 
       2212 
2218 
     | 
    
         
             
                //
         
     | 
| 
         @@ -2350,7 +2356,7 @@ extern "C" { 
     | 
|
| 
       2350 
2356 
     | 
    
         
             
                GGML_API int ggml_cpu_has_fp16_va    (void);
         
     | 
| 
       2351 
2357 
     | 
    
         
             
                GGML_API int ggml_cpu_has_wasm_simd  (void);
         
     | 
| 
       2352 
2358 
     | 
    
         
             
                GGML_API int ggml_cpu_has_blas       (void);
         
     | 
| 
       2353 
     | 
    
         
            -
                GGML_API int  
     | 
| 
      
 2359 
     | 
    
         
            +
                GGML_API int ggml_cpu_has_cuda       (void);
         
     | 
| 
       2354 
2360 
     | 
    
         
             
                GGML_API int ggml_cpu_has_clblast    (void);
         
     | 
| 
       2355 
2361 
     | 
    
         
             
                GGML_API int ggml_cpu_has_vulkan     (void);
         
     | 
| 
       2356 
2362 
     | 
    
         
             
                GGML_API int ggml_cpu_has_kompute    (void);
         
     | 
| 
         @@ -2371,8 +2377,8 @@ extern "C" { 
     | 
|
| 
       2371 
2377 
     | 
    
         
             
            #else
         
     | 
| 
       2372 
2378 
     | 
    
         
             
            #define GGML_RESTRICT restrict
         
     | 
| 
       2373 
2379 
     | 
    
         
             
            #endif
         
     | 
| 
       2374 
     | 
    
         
            -
                typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y,  
     | 
| 
       2375 
     | 
    
         
            -
                typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y,  
     | 
| 
      
 2380 
     | 
    
         
            +
                typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
         
     | 
| 
      
 2381 
     | 
    
         
            +
                typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
         
     | 
| 
       2376 
2382 
     | 
    
         
             
                typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
         
     | 
| 
       2377 
2383 
     | 
    
         
             
                                                  const void * GGML_RESTRICT y, size_t by, int nrc);
         
     | 
| 
       2378 
2384 
     | 
    
         |